#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Support for working with :class:`zope.catalog.field` indexes.
All of the indexes we define are compatible with both
:mod:`zope.catalog` query syntax (and internal attributes) and :mod:`zc.catalog`
syntax (and public attributes).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# stdlib imports
try:
from collections.abc import Mapping
from collections.abc import Iterable
except ImportError: # pragma: no cover
from collections import Mapping
from collections import Iterable
import BTrees
import six
import zc.catalog.catalogindex
import zc.catalog.index
import zc.catalog.stemmer
from zope.interface import implementer
from zope.catalog.attribute import AttributeIndex
from zope.catalog.interfaces import ICatalogIndex
from zope.catalog.text import TextIndex
from zope.container.contained import Contained
import zope.index.field
import zope.index.keyword
from zope.index.text import lexicon
from nti.property.property import alias
from nti.zope_catalog.interfaces import IFieldIndex
from nti.zope_catalog.interfaces import IIntegerValueIndex
from nti.zope_catalog.interfaces import IKeywordIndex
from nti.zope_catalog.interfaces import ISetIndex
from nti.zope_catalog.interfaces import ITextIndex
from nti.zope_catalog.interfaces import IValueIndex
# Docstrings in this module are written in reStructuredText.
__docformat__ = "restructuredtext en"
# Module-level logger, named after this module.
logger = __import__('logging').getLogger(__name__)
def is_nonstr_iter(x):
    """
    Return whether *x* is iterable without being a string.

    Strings (``six.string_types``) are iterable, but are deliberately
    reported as ``False`` here.
    """
    if isinstance(x, six.string_types):
        return False
    return isinstance(x, Iterable)
def convertQuery(query):
    """
    Convert a zope.index style two-tuple (min/max) query to the
    new-style mapping form.

    A two-tuple whose elements compare equal becomes
    ``{'any_of': (value,)}`` (the common case of an exact match); any
    other two-tuple becomes ``{'between': query}``. Every other kind of
    query is returned untouched.
    """
    if not isinstance(query, tuple) or len(query) != 2:
        # Not an old-style min/max pair; nothing to convert.
        return query

    lower, upper = query[0], query[1]
    if lower == upper:
        # common case of exact match
        return {'any_of': (lower,)}
    return {'between': query}
class _ZCApplyMixin(object):
    """
    Mixin whose ``apply`` accepts zope.index style two-tuple queries
    by converting them to the new style first.
    """

    def apply(self, query):
        """Convert *query* with ``convertQuery`` and delegate to the next ``apply``."""
        return super(_ZCApplyMixin, self).apply(convertQuery(query))
class _ZCAbstractIndexMixin(object):
    """
    Helpers and compatibility mixins for zope.catalog and zc.catalog.

    Makes zc.catalog indexes look a bit more like zope.catalog indexes
    by exposing their public attributes under the underscore-prefixed
    names that the other mixins in this module read.
    """

    # Default to the 64-bit BTrees family.
    family = BTrees.family64

    # Underscore-prefixed aliases for the public attributes named in
    # each ``alias(...)`` call.
    _rev_index = alias('documents_to_values')
    _fwd_index = alias('values_to_documents')
    _num_docs = alias('documentCount')
class _ZipMixin(object):
def zip(self, doc_ids=()):
for doc_id in doc_ids or ():
value = self._rev_index.get(doc_id)
yield doc_id, value
class _SetZipMixin(_ZipMixin):
    """
    Variant of ``_ZipMixin`` that hands out each stored value as a
    plain :class:`set`.
    """

    def zip(self, doc_ids=()):
        """
        Yield ``(doc_id, values)`` pairs, where ``values`` is a new
        ``set`` copied from the stored value, or ``None`` when nothing
        is stored for that id.
        """
        for doc_id, stored in _ZipMixin.zip(self, doc_ids):
            # TODO: Should we really be doing this? The values
            # are usually [OO]TreeSet, which is more memory and persistence
            # friendly
            if stored is None:
                yield doc_id, None
            else:
                yield doc_id, set(stored)
@implementer(IFieldIndex)
class NormalizingFieldIndex(_ZipMixin,
                            zope.index.field.FieldIndex,
                            Contained):
    """
    A field index whose values pass through :meth:`normalize`
    before they are indexed or searched.

    .. note:: For more flexibility, use a :class:`~.NormalizationWrapper`.
    """

    #: Use 64-bit trees by default.
    family = BTrees.family64

    def normalize(self, value):
        """
        Return the normalized form of *value*.

        Subclasses must override this method.
        """
        raise NotImplementedError()

    def index_doc(self, docid, value):
        """Index the normalized form of *value* under *docid*."""
        normalized = self.normalize(value)
        super(NormalizingFieldIndex, self).index_doc(docid, normalized)

    def apply(self, query):
        """
        Normalize each element of *query* and pass the resulting
        tuple to the superclass ``apply``.
        """
        normalized = tuple([self.normalize(item) for item in query])
        return super(NormalizingFieldIndex, self).apply(normalized)

    def ids(self):
        """Return the keys of ``self._rev_index``."""
        return self._rev_index.keys()

    def doc_value(self, doc_id):
        """Return ``self._rev_index.get(doc_id)``: the stored value, or ``None``."""
        return self._rev_index.get(doc_id)
class CaseInsensitiveAttributeFieldIndex(AttributeIndex,
                                         NormalizingFieldIndex):
    """
    An attribute index that normalizes case. It is queried with a
    two-tuple giving the min and max values.
    """

    def normalize(self, value):
        """
        Return *value* lower-cased.

        False values (such as ``None`` or an empty string) are
        returned as they are.
        """
        if not value:
            return value
        return value.lower()
# Normalizing and wrappers:
# The normalizing code needs to get the actual values. Because AttributeIndex
# gets the attribute value in index_doc and then calls the same method on super
# with that returned value, the NormalizationWrapper has to extend AttributeIndex
# to get the right value to pass to the normalizer. That means it cannot be used
# to wrap another AttributeIndex, only a plain ValueIndex or SetIndex. Note
# that a NormalizationWrapper is somewhat painful to construct.
@implementer(IValueIndex)
class ValueIndex(_ZCApplyMixin,
                 _ZCAbstractIndexMixin,
                 _ZipMixin,
                 zc.catalog.index.ValueIndex):
    """
    An index of raw values.

    Through ``_ZCApplyMixin``, ``apply`` also accepts zope.index
    style two-tuple queries.
    """
class AttributeValueIndex(ValueIndex,
                          zc.catalog.catalogindex.ValueIndex):
    """
    An index of values stored in a particular attribute.
    """
@implementer(ISetIndex)
class SetIndex(_ZCAbstractIndexMixin,
               _SetZipMixin,
               zc.catalog.index.SetIndex):
    """
    An index of values that are multiple.
    """
class AttributeSetIndex(SetIndex,
                        zc.catalog.catalogindex.SetIndex):
    """
    An index of values that are multiple and stored in an attribute.
    """
@implementer(IIntegerValueIndex)
class IntegerValueIndex(_ZCApplyMixin,
                        _ZCAbstractIndexMixin,
                        _ZipMixin,
                        zc.catalog.index.ValueIndex):
    """
    A "raw" index that is optimized for, and only supports,
    storing integer values.

    To normalize, use a :class:`zc.catalog.index.NormalizationWrapper`;
    to store in a catalog and normalize, use a :class:`NormalizationWrapper`
    (which is an attribute index).
    """

    def clear(self):
        """
        Clear the index, then replace both mappings with
        integer-keyed trees taken from ``self.family``: ``II`` for
        documents-to-values and ``IO`` for values-to-documents.
        """
        super(IntegerValueIndex, self).clear()
        family = self.family
        self.documents_to_values = family.II.BTree()
        self.values_to_documents = family.IO.BTree()
class IntegerAttributeIndex(IntegerValueIndex,
                            zc.catalog.catalogindex.ValueIndex):
    """
    An attribute index that is optimized for, and only supports,
    storing integer values.

    To normalize, use a :class:`zc.catalog.index.NormalizationWrapper`.
    Note that because :class:`zc.catalog.catalogindex.NormalizationWrapper`
    is also an attribute index, it cannot be used to wrap this class, and
    your normalizer will have to return an object that has the right
    attribute.
    """
@implementer(IKeywordIndex)
class NormalizingKeywordIndex(_SetZipMixin,
                              zope.index.keyword.CaseInsensitiveKeywordIndex,
                              Contained):
    """
    A case-insensitive keyword index supporting traditional
    queries as well as extent-based queries.

    :meth:`apply` accepts any of the following:

    - a string, or a non-string iterable of strings: an ``and`` search;
    - a zope.index style two-tuple, converted by ``convertQuery`` to
      an ``any_of`` or ``between`` query;
    - a mapping with exactly one item, ``{query_type: value}``, where
      the type is one of ``any_of``, ``all``, ``or``, ``and``,
      ``between``, ``any`` or ``none``;
    - the legacy mapping ``{'query': value, 'operator': 'and' | 'or'}``;
    - an object providing ``zc.catalog.interfaces.IExtent``, which is
      treated as a ``none`` query.
    """

    family = BTrees.family64

    def _parseQuery(self, query): # pylint:disable=too-many-branches
        """
        Split *query* into a ``(query_type, query)`` pair.

        For every type other than ``any`` and ``none`` the returned
        query is a list holding only the string values that were
        given; if that list is empty, ``(None, None)`` is returned.
        The type names ``any_of`` and ``all`` are translated to ``or``
        and ``and`` respectively. Other type names are passed through
        unchanged for :meth:`apply` to accept or reject.

        :raises ValueError: If a mapping has more than one item (and
            is not the legacy ``'query'`` form), or if *query* is not a
            mapping, a string, an extent or some other iterable.
        """
        if isinstance(query, Mapping):
            if 'query' in query: # support legacy
                query_type = query.get('operator') or 'and'
                query = query['query']
            elif len(query) > 1:
                raise ValueError('may only pass one of key, value pair')
            elif not query:
                return None, None
            else:
                query_type, query = next(iter(query.items()))
                query_type = query_type.lower()
        elif isinstance(query, six.string_types):
            query_type = 'and'
        elif zc.catalog.interfaces.IExtent.providedBy(query):
            # This is iterable, so must go before that test.
            query_type = 'none'
        elif is_nonstr_iter(query):
            query_type = 'and'
        else:
            raise ValueError('Invalid query')

        if query_type not in ('any', 'none'):
            # Everything but the extent-based queries works on a list
            # of strings; silently drop anything that is not a string.
            query = list(query) if is_nonstr_iter(query) else [query]
            query = [x for x in query if isinstance(x, six.string_types)]
            if not query:
                query_type, query = None, None
            elif query_type == 'any_of':
                query_type = 'or'
            elif query_type == 'all':
                query_type = 'and'
        return query_type, query

    def apply(self, query): # any_of, any, between, none,
        """
        Run *query* against the index and return the matching
        document ids.

        - no usable query: an empty ``IF`` set;
        - ``or`` / ``and``: delegated to the superclass ``search``;
        - ``between``: an ``or`` search over every indexed word in the
          range given by the first two query values;
        - ``none``: the members of the given extent that are *not* in
          this index;
        - ``any``: every id in this index, restricted to the given
          extent when one is provided.

        :raises ValueError: If the query type is not recognized.
        """
        query = convertQuery(query)
        query_type, query = self._parseQuery(query)
        if query_type is None:
            res = self.family.IF.Set()
        elif query_type in ('or', 'and'):
            res = super(NormalizingKeywordIndex, self).search(
                query, operator=query_type)
        elif query_type in ('between',):
            query = list(self._fwd_index.iterkeys(query[0], query[1]))
            res = super(NormalizingKeywordIndex, self).search(
                query, operator='or')
        elif query_type == 'none':
            assert zc.catalog.interfaces.IExtent.providedBy(query)
            # 'none' is the complement of 'any': keep the extent members
            # that have no keywords indexed here. An intersection would
            # make this branch indistinguishable from 'any'.
            res = query - self.family.IF.Set(self.ids())
        elif query_type == 'any':
            if query is None:
                res = self.family.IF.Set(self.ids())
            else:
                assert zc.catalog.interfaces.IExtent.providedBy(query)
                res = query & self.family.IF.Set(self.ids())
        else:
            raise ValueError("unknown query type", query_type) # pragma: no cover (can't get here)
        return res

    def ids(self):
        """Return the keys of ``self._rev_index``."""
        return self._rev_index.keys()

    def words(self):
        """Return the keys of ``self._fwd_index``."""
        return self._fwd_index.keys()

    def remove_words(self, *seq):
        """
        Remove the given words from the index.

        Each normalized word is deleted from ``self._fwd_index`` and
        from the stored value of every document that was listed for
        it. A document whose stored value becomes empty is deleted from
        ``self._rev_index`` and the document count is decremented.
        Words that are not indexed are ignored.
        """
        # XXX: Why does this method exist?
        # Why don't we just unindex the docs?
        # NOTE(review): the varargs are unpacked again for ``normalize``,
        # so callers presumably pass one sequence of words rather than
        # several separate words -- confirm against callers.
        seq = self.normalize(*seq)
        for word in seq:
            try:
                docids = self._fwd_index[word]
            except KeyError:
                pass
            else:
                del self._fwd_index[word]
                for docid in docids:
                    try:
                        s = self._rev_index[docid]
                    except KeyError:
                        logger.exception("Your index is corrupted: %s", word)
                    else:
                        s.remove(word)
                        if not s:
                            del self._rev_index[docid]
                            self._num_docs.change(-1)

    # camelCase spelling of the same method.
    removeWords = remove_words
class AttributeKeywordIndex(AttributeIndex, NormalizingKeywordIndex):
    """
    An index for keywords stored in an attribute.
    """
@implementer(ICatalogIndex) # The superclass forgets this
class NormalizationWrapper(_ZCApplyMixin,
                           zc.catalog.catalogindex.NormalizationWrapper):
    """
    An attribute index that wraps a raw index and normalizes values.

    This class exists mainly to sort out the difficulty constructing
    instances by only accepting keyword arguments.
    """

    def __init__(self, field_name=None, interface=None, field_callable=False,
                 index=None, normalizer=None, is_collection=False):
        """
        You should only call this constructor with keyword arguments;
        due to inheritance, mixing and matching keyword and non-keyword
        is a bad idea.

        The first three arguments that are not keyword are taken as
        `field_name`, `interface` and `field_callable`.
        """
        # sadly we can't reuse any of the defaults from the super classes, and we
        # must rely on the order of parameters
        # pylint:disable=useless-super-delegation
        positional = (field_name, interface, field_callable,
                      index, normalizer, is_collection)
        super(NormalizationWrapper, self).__init__(*positional)
# text
def stemmer_lexicon(lang='english', stopwords=True):
    """
    A lexicon for text indexes using zc.catalog.

    The pipeline is a splitter, a case normalizer, a stop word remover
    (only when *stopwords* is true) and finally a
    ``zc.catalog.stemmer.Stemmer`` created for *lang*.
    """
    elements = [lexicon.Splitter(), lexicon.CaseNormalizer()]
    if stopwords:
        elements += [lexicon.StopWordRemover()]
    elements += [zc.catalog.stemmer.Stemmer(lang)]
    return lexicon.Lexicon(*elements)
@implementer(ITextIndex)
class AttributeTextIndex(TextIndex):
    """
    A 64-bit text index.

    Example::

        index = AttributeTextIndex('field',
                                   lexicon=stemmer_lexicon())
    """

    #: We default to 64-bit btrees.
    family = BTrees.family64