Source code for nti.zope_catalog.index

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Support for working with :class:`zope.catalog.field` indexes.

All of the indexes we define are compatible with both
:mod:`zope.catalog` query syntax (and internal attributes) and :mod:`zc.catalog`
syntax (and public attributes).
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# stdlib imports
try:
    from collections.abc import Mapping
    from collections.abc import Iterable
except ImportError: # pragma: no cover
    from collections import Mapping
    from collections import Iterable

import BTrees
import six
import zc.catalog.catalogindex
import zc.catalog.index
import zc.catalog.stemmer
from zope.interface import implementer
from zope.catalog.attribute import AttributeIndex
from zope.catalog.interfaces import ICatalogIndex
from zope.catalog.text import TextIndex
from zope.container.contained import Contained
import zope.index.field
import zope.index.keyword
from zope.index.text import lexicon

from nti.property.property import alias
from nti.zope_catalog.interfaces import IFieldIndex
from nti.zope_catalog.interfaces import IIntegerValueIndex
from nti.zope_catalog.interfaces import IKeywordIndex
from nti.zope_catalog.interfaces import ISetIndex
from nti.zope_catalog.interfaces import ITextIndex
from nti.zope_catalog.interfaces import IValueIndex

__docformat__ = "restructuredtext en"

logger = __import__('logging').getLogger(__name__)


def is_nonstr_iter(x):
    """
    Answer whether *x* is iterable without being a string.

    Strings (as defined by ``six.string_types``) are rejected first;
    anything else is accepted only if it is an ``Iterable``.
    """
    if isinstance(x, six.string_types):
        return False
    return isinstance(x, Iterable)


def convertQuery(query):
    """
    Translate a zope.index style two-tuple (min/max) query to new-style.

    A two-element tuple whose ends compare equal becomes an ``any_of``
    query for that single value; any other two-element tuple becomes a
    ``between`` query. Everything else is handed back untouched.
    """
    if not isinstance(query, tuple) or len(query) != 2:
        # Not an old-style range query; nothing to convert.
        return query

    lower, upper = query[0], query[1]
    if lower == upper:
        # common case of exact match
        return {'any_of': (lower,)}
    return {'between': query}


class _ZCApplyMixin(object):
    """
    Accept zope.index style two-tuple queries on a new-style index.

    The query is run through :func:`convertQuery` before being passed
    to the next ``apply`` in the method resolution order.
    """

    def apply(self, query):
        converted = convertQuery(query)
        return super(_ZCApplyMixin, self).apply(converted)


class _ZCAbstractIndexMixin(object):
    """
    Helpers and compatibility mixins for zope.catalog and zc.catalog.
    Makes zc.catalog indexes look a bit more like zope.catalog indexes.

    The underscore-prefixed names declared here are each bound, through
    ``alias``, to the public attribute named in the call.
    """

    # Use the 64-bit BTrees family.
    family = BTrees.family64
    # Private-style spellings of the public attributes named in each
    # ``alias`` call (presumably the zope.index internal names versus the
    # zc.catalog public ones -- confirm against those packages).
    _num_docs = alias('documentCount')
    _fwd_index = alias('values_to_documents')
    _rev_index = alias('documents_to_values')


class _ZipMixin(object):
    """
    Adds :meth:`zip`, pairing document ids with their indexed value.
    """

    def zip(self, doc_ids=()):
        """
        Yield ``(doc_id, value)`` for each id in *doc_ids*.

        The value is looked up in ``self._rev_index``; ids that are not
        present are paired with ``None``. A false *doc_ids* (including
        ``None``) yields nothing.
        """
        if not doc_ids:
            return
        for docid in doc_ids:
            yield docid, self._rev_index.get(docid)

class _SetZipMixin(_ZipMixin):
    """
    A :class:`_ZipMixin` whose values are copied into builtin sets.
    """

    def zip(self, doc_ids=()):
        """
        Yield ``(doc_id, values)`` where *values* is a ``set`` copy of
        the stored value, or ``None`` when nothing is stored.
        """
        for doc_id, stored in _ZipMixin.zip(self, doc_ids):
            # TODO: Should we really be doing this? The values
            # are usually [OO]TreeSet, which is more memory and persistence
            # friendly
            if stored is None:
                yield doc_id, None
            else:
                yield doc_id, set(stored)

@implementer(IFieldIndex)
class NormalizingFieldIndex(_ZipMixin,
                            zope.index.field.FieldIndex,
                            Contained):
    """
    A field index that normalizes before indexing or searching.

    Every value handed to :meth:`index_doc`, and every element of a
    query handed to :meth:`apply`, is first passed through
    :meth:`normalize`.

    .. note:: For more flexibility, use a :class:`~.NormalizationWrapper`.
    """

    # We default to 64-bit trees
    family = BTrees.family64

    def normalize(self, value):
        "Subclasses must override this method."
        raise NotImplementedError()

    def index_doc(self, docid, value):
        """Index the normalized form of *value* for *docid*."""
        normalized = self.normalize(value)
        super(NormalizingFieldIndex, self).index_doc(docid, normalized)

    def apply(self, query):
        """Normalize each element of *query*, then apply it as a tuple."""
        normalized = tuple([self.normalize(term) for term in query])
        return super(NormalizingFieldIndex, self).apply(normalized)

    def ids(self):
        """Return the keys of the reverse index."""
        return self._rev_index.keys()

    def doc_value(self, doc_id):
        """Return the value stored for *doc_id*, or ``None`` if absent."""
        return self._rev_index.get(doc_id)
class CaseInsensitiveAttributeFieldIndex(AttributeIndex,
                                         NormalizingFieldIndex):
    """
    An attribute index that normalizes case.

    It is queried with a two-tuple giving the min and max values.
    """

    def normalize(self, value):
        """Lower-case *value*; a false value is returned as-is."""
        if not value:
            return value
        return value.lower()
# Normalizing and wrappers:
# The normalizing code needs to get the actual values. Because AttributeIndex
# gets the attribute value in index_doc and then calls the same method on super
# with that returned value, the NormalizationWrapper has to extend AttributeIndex
# to get the right value to pass to the normalizer. That means it cannot be used
# to wrap another AttributeIndex, only a plain ValueIndex or SetIndex. Note
# that it is somewhat painful to construct.
@implementer(IValueIndex)
class ValueIndex(_ZCApplyMixin,
                 _ZCAbstractIndexMixin,
                 _ZipMixin,
                 zc.catalog.index.ValueIndex):
    """
    An index of raw values.

    Combines the zc.catalog value index with the two-tuple query
    conversion, attribute aliases and ``zip`` support defined in this
    module.
    """
class AttributeValueIndex(ValueIndex,
                          zc.catalog.catalogindex.ValueIndex):
    """
    An index of values stored in a particular attribute.
    """
@implementer(ISetIndex)
class SetIndex(_ZCAbstractIndexMixin,
               _SetZipMixin,
               zc.catalog.index.SetIndex):
    """
    An index of values that are multiple.
    """
class AttributeSetIndex(SetIndex,
                        zc.catalog.catalogindex.SetIndex):
    """
    An index of values that are multiple and stored in an attribute.
    """
@implementer(IIntegerValueIndex)
class IntegerValueIndex(_ZCApplyMixin,
                        _ZCAbstractIndexMixin,
                        _ZipMixin,
                        zc.catalog.index.ValueIndex):
    """
    A "raw" index that is optimized for, and only supports,
    storing integer values.

    To normalize, use a :class:`zc.catalog.index.NormalizationWrapper`;
    to store in a catalog and normalize, use a
    :class:`NormalizationWrapper` (which is an attribute index).
    """

    def clear(self):
        """
        Reset the index, then replace both mappings with trees taken
        from ``self.family``: ``II`` for documents-to-values and ``IO``
        for values-to-documents.
        """
        super(IntegerValueIndex, self).clear()
        trees = self.family
        self.documents_to_values = trees.II.BTree()
        self.values_to_documents = trees.IO.BTree()
class IntegerAttributeIndex(IntegerValueIndex,
                            zc.catalog.catalogindex.ValueIndex):
    """
    An attribute index that is optimized for, and only supports,
    storing integer values.

    To normalize, use a :class:`zc.catalog.index.NormalizationWrapper`.
    Note that because
    :class:`zc.catalog.catalogindex.NormalizationWrapper` is also an
    attribute index it cannot be used to wrap this class, and your
    normalizer will have to return an object that has the right
    attribute.
    """
@implementer(IKeywordIndex)
class NormalizingKeywordIndex(_SetZipMixin,
                              zope.index.keyword.CaseInsensitiveKeywordIndex,
                              Contained):
    """
    A case-insensitive keyword index supporting traditional queries
    as well as extent-based queries.
    """

    family = BTrees.family64

    def _parseQuery(self, query): # pylint:disable=too-many-branches
        """
        Reduce *query* to a ``(query_type, query)`` pair.

        Accepted forms are a mapping (either the legacy
        ``{'query': ..., 'operator': ...}`` shape or a single
        ``{type: value}`` item), a string, an extent, or a non-string
        iterable. ``(None, None)`` is returned for an empty mapping or
        when no string terms remain after filtering.

        :raises ValueError: for a multi-item non-legacy mapping or a
            query of an unsupported kind.
        """
        if isinstance(query, Mapping):
            if 'query' in query:
                # support legacy
                query_type = query.get('operator') or 'and'
                query = query['query']
            elif len(query) > 1:
                raise ValueError('may only pass one of key, value pair')
            elif not query:
                return None, None
            else:
                query_type, query = next(iter(query.items()))
                query_type = query_type.lower()
        elif isinstance(query, six.string_types):
            query_type = 'and'
        elif zc.catalog.interfaces.IExtent.providedBy(query):
            # This is iterable, so must go before that test.
            query_type = 'none'
        elif is_nonstr_iter(query):
            query_type = 'and'
        else:
            raise ValueError('Invalid query')

        if query_type in ('any', 'none'):
            # Extent-style queries carry their value through unchanged.
            return query_type, query

        candidates = list(query) if is_nonstr_iter(query) else [query]
        terms = [x for x in candidates if isinstance(x, six.string_types)]
        if not terms:
            return None, None

        # Map the zc.catalog spellings onto the keyword-search operators.
        if query_type == 'any_of':
            query_type = 'or'
        elif query_type == 'all':
            query_type = 'and'
        return query_type, terms

    def apply(self, query): # any_of, any, between, none,
        """
        Run *query* and return the matching document ids.

        Two-tuple queries are converted first; the parsed query type
        then selects a keyword search (``or``/``and``), a key-range
        search (``between``), or an intersection of an extent with the
        indexed ids (``none``/``any``).
        """
        query_type, query = self._parseQuery(convertQuery(query))

        if query_type is None:
            return self.family.IF.Set()

        if query_type in ('or', 'and'):
            return super(NormalizingKeywordIndex, self).search(
                query, operator=query_type)

        if query_type in ('between',):
            keys = list(self._fwd_index.iterkeys(query[0], query[1]))
            return super(NormalizingKeywordIndex, self).search(
                keys, operator='or')

        if query_type == 'none':
            assert zc.catalog.interfaces.IExtent.providedBy(query)
            return query & self.family.IF.Set(self.ids())

        if query_type == 'any':
            if query is None:
                return self.family.IF.Set(self.ids())
            assert zc.catalog.interfaces.IExtent.providedBy(query)
            return query & self.family.IF.Set(self.ids())

        raise ValueError("unknown query type", query_type) # pragma: no cover (can't get here)

    def ids(self):
        """Return the keys of the reverse index."""
        return self._rev_index.keys()

    def words(self):
        """Return the keys of the forward index."""
        return self._fwd_index.keys()

    def remove_words(self, *seq):
        """
        Drop each normalized word from the forward index and from every
        document that referenced it; documents left with no words are
        removed and the document count is decremented.
        """
        # XXX: Why does this method exist?
        # Why don't we just unindex the docs?
        for word in self.normalize(*seq):
            try:
                docids = self._fwd_index[word]
            except KeyError:
                continue

            del self._fwd_index[word]
            for docid in docids:
                try:
                    doc_words = self._rev_index[docid]
                except KeyError:
                    logger.exception("Your index is corrupted: %s", word)
                    continue

                doc_words.remove(word)
                if not doc_words:
                    del self._rev_index[docid]
                    self._num_docs.change(-1)

    removeWords = remove_words
class AttributeKeywordIndex(AttributeIndex,
                            NormalizingKeywordIndex):
    """
    An index for keywords stored in an attribute.
    """
@implementer(ICatalogIndex) # The superclass forgets this
class NormalizationWrapper(_ZCApplyMixin,
                           zc.catalog.catalogindex.NormalizationWrapper):
    """
    An attribute index that wraps a raw index and normalizes values.

    This class exists mainly to sort out the difficulty constructing
    instances by only accepting keyword arguments.
    """

    def __init__(self, field_name=None, interface=None, field_callable=False,
                 index=None, normalizer=None, is_collection=False):
        """
        You should only call this constructor with keyword arguments;
        due to inheritance, mixing and matching keyword and non-keyword
        is a bad idea. The first three arguments that are not keyword
        are taken as `field_name`, `interface` and `field_callable`.
        """
        # sadly we can't reuse any of the defaults from the super classes, and we
        # must rely on the order of parameters
        # pylint:disable=useless-super-delegation
        attribute_args = (field_name, interface, field_callable)
        wrapper_args = (index, normalizer, is_collection)
        super(NormalizationWrapper, self).__init__(
            *(attribute_args + wrapper_args))
# text
def stemmer_lexicon(lang='english', stopwords=True):
    """
    A lexicon for text indexes using zc.catalog.

    The pipeline is a splitter, a case normalizer, optionally a
    stop-word remover (when *stopwords* is true), and finally a stemmer
    constructed with *lang*.
    """
    stages = [lexicon.Splitter(), lexicon.CaseNormalizer()]
    if stopwords:
        stages += [lexicon.StopWordRemover()]
    stages += [zc.catalog.stemmer.Stemmer(lang)]
    return lexicon.Lexicon(*stages)
@implementer(ITextIndex)
class AttributeTextIndex(TextIndex):
    """
    A 64-bit text index.

    Example::

        index = AttributeTextIndex('field', lexicon=stemmer_lexicon())
    """

    #: We default to 64-bit btrees.
    family = BTrees.family64