| # -*- coding: UTF-8 -*- |
| |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| r"""Whoosh specific backend for Bloodhound Search plugin.""" |
| |
| import os |
| from datetime import datetime |
| |
| from trac.core import Component, implements, TracError |
| from trac.config import BoolOption, IntOption, Option |
| from trac.env import ISystemInfoProvider |
| from trac.util.datefmt import utc |
| |
| import whoosh |
| from whoosh import index, analysis, highlight, sorting |
| from whoosh.collectors import FilterCollector |
| from whoosh.fields import Schema, ID, DATETIME, KEYWORD, TEXT |
| from whoosh.writing import AsyncWriter |
| |
| from bhsearch import BHSEARCH_CONFIG_SECTION |
| from bhsearch.api import ISearchBackend, DESC, QueryResult, SCORE |
| from bhsearch.security import SecurityPreprocessor |
| from bhsearch.utils import get_global_env |
| |
| UNIQUE_ID = "unique_id" |
| |
| |
| class WhooshBackend(Component): |
| """ |
| Implements Whoosh SearchBackend interface |
| """ |
| implements(ISearchBackend, ISystemInfoProvider) |
| |
| index_dir_setting = Option( |
| BHSEARCH_CONFIG_SECTION, |
| 'whoosh_index_dir', |
| default='whoosh_index', |
| doc="""Relative path is resolved relatively to the |
| directory of the environment.""", doc_domain='bhsearch') |
| |
| advanced_security = BoolOption( |
| BHSEARCH_CONFIG_SECTION, |
| 'advanced_security', |
| default=False, |
| doc="Check view permission for each document when retrieving results.", |
| doc_domain='bhsearch' |
| ) |
| |
| max_fragment_size = IntOption( |
| BHSEARCH_CONFIG_SECTION, |
| 'max_fragment_size', |
| default=240, |
| doc="The maximum number of characters allowed in a fragment.", |
| doc_domain='bhsearch') |
| |
| fragment_surround = IntOption( |
| BHSEARCH_CONFIG_SECTION, |
| 'fragment_surround', |
| default=60, |
| doc="""The number of extra characters of context to add both before |
| the first matched term and after the last matched term.""", |
| doc_domain='bhsearch') |
| |
| #This is a schema prototype; it will be changed later. |
| #TODO: add support for other fields and for dynamic fields. |
| #The schema must be driven by index participants. |
| SCHEMA = Schema( |
| unique_id=ID(stored=True, unique=True), |
| id=ID(stored=True), |
| type=ID(stored=True), |
| product=ID(stored=True), |
| milestone=ID(stored=True), |
| time=DATETIME(stored=True), |
| due=DATETIME(stored=True), |
| completed=DATETIME(stored=True), |
| author=ID(stored=True), |
| component=ID(stored=True), |
| status=ID(stored=True), |
| resolution=ID(stored=True), |
| keywords=KEYWORD(scorable=True), |
| summary=TEXT(stored=True, |
| analyzer=analysis.StandardAnalyzer(stoplist=None)), |
| content=TEXT(stored=True, |
| analyzer=analysis.StandardAnalyzer(stoplist=None)), |
| changes=TEXT(analyzer=analysis.StandardAnalyzer(stoplist=None)), |
| owner=TEXT(stored=True, |
| analyzer=analysis.SimpleAnalyzer()), |
| repository=TEXT(stored=True, |
| analyzer=analysis.SimpleAnalyzer()), |
| revision=TEXT(stored=True, |
| analyzer=analysis.SimpleAnalyzer()), |
| message=TEXT(stored=True, |
| analyzer=analysis.SimpleAnalyzer()), |
| required_permission=ID(), |
| name=TEXT(stored=True, |
| analyzer=analysis.SimpleAnalyzer()), |
| query_suggestion_basket=TEXT(analyzer=analysis.SimpleAnalyzer(), |
| spelling=True), |
| relations=KEYWORD(lowercase=True, commas=True), |
| ) |
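| |
| # For illustration only (field values are assumed, not real data), a |
| # document matching this schema might look like: |
| # {"id": u"42", "type": u"ticket", "product": u"p1", |
| # "summary": u"Sample summary", "content": u"Sample body"} |
| # unique_id is computed in add_doc, so callers do not supply it. |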
| |
| def __init__(self): |
| self.index_dir = self.index_dir_setting |
| if not os.path.isabs(self.index_dir): |
| self.index_dir = os.path.join(get_global_env(self.env).path, |
| self.index_dir) |
| if index.exists_in(self.index_dir): |
| self.index = index.open_dir(self.index_dir) |
| else: |
| self.index = None |
| |
| # ISystemInfoProvider methods |
| |
| def get_system_info(self): |
| yield 'Whoosh', whoosh.versionstring() |
| |
| # ISearchBackend methods |
| |
| def start_operation(self): |
| return self._create_writer() |
| |
| def _create_writer(self): |
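| # AsyncWriter wraps the index writer so that, if the index is locked by |
| # another writer, updates are buffered and committed from a background |
| # thread once the lock becomes available (per Whoosh's documented |
| # behaviour; details may vary between Whoosh versions). |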
| return AsyncWriter(self.index) |
| |
| def add_doc(self, doc, operation_context=None): |
| """Add any type of document index. |
| |
| The contents should be a dict with fields matching the search schema. |
| The only required fields are type and id, everything else is optional. |
| """ |
| writer = operation_context |
| is_local_writer = False |
| if writer is None: |
| is_local_writer = True |
| writer = self._create_writer() |
| |
| self._reformat_doc(doc) |
| doc[UNIQUE_ID] = self._create_unique_id(doc.get("product", ''), |
| doc["type"], |
| doc["id"]) |
| self.log.debug("Doc to index: %s", doc) |
| try: |
| writer.update_document(**doc) |
| if is_local_writer: |
| writer.commit() |
| except:  # pylint: disable=bare-except |
| if is_local_writer: |
| writer.cancel() |
| raise |
| |
| def _reformat_doc(self, doc): |
| """ |
| Strings must be converted unicode format accepted by Whoosh. |
| """ |
| for key, value in doc.items(): |
| if key is None: |
| del doc[None] |
| elif value is None: |
| del doc[key] |
| elif isinstance(value, basestring) and value == "": |
| del doc[key] |
| else: |
| doc[key] = self._to_whoosh_format(value) |
| |
| def delete_doc(self, product, doc_type, doc_id, operation_context=None): |
| unique_id = self._create_unique_id(product, doc_type, doc_id) |
| self.log.debug('Removing document from the index: %s', unique_id) |
| writer = operation_context |
| is_local_writer = False |
| if writer is None: |
| is_local_writer = True |
| writer = self._create_writer() |
| try: |
| writer.delete_by_term(UNIQUE_ID, unique_id) |
| if is_local_writer: |
| writer.commit() |
| except:  # pylint: disable=bare-except |
| if is_local_writer: |
| writer.cancel() |
| raise |
| |
| def optimize(self): |
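| # Committing with optimize=True asks Whoosh to merge all index segments |
| # into one, trading a potentially expensive rewrite for faster |
| # subsequent searches. |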
| writer = AsyncWriter(self.index) |
| writer.commit(optimize=True) |
| |
| def is_index_outdated(self): |
| return self.index is None or self.index.schema != self.SCHEMA |
| |
| def recreate_index(self): |
| self.log.info('Creating Whoosh index in %s', self.index_dir) |
| self._make_dir_if_not_exists() |
| self.index = index.create_in(self.index_dir, schema=self.SCHEMA) |
| return self.index |
| |
| def query(self, |
| query, |
| query_string=None, |
| sort=None, |
| fields=None, |
| filter=None, |
| facets=None, |
| pagenum=1, |
| pagelen=20, |
| highlight=False, |
| highlight_fields=None, |
| context=None): |
| # pylint: disable=too-many-locals |
| with self.index.searcher() as searcher: |
| self._apply_advanced_security(searcher, context) |
| |
| highlight_fields = self._prepare_highlight_fields(highlight, |
| highlight_fields) |
| |
| sortedby = self._prepare_sortedby(sort) |
| |
| #TODO: investigate how faceting is applied to multi-value fields, |
| #e.g. keywords. For now, just pass the facets list to the Whoosh API. |
| #groupedby = self._prepare_groupedby(facets) |
| groupedby = facets |
| |
| query_parameters = dict( |
| query=query, |
| pagenum=pagenum, |
| pagelen=pagelen, |
| sortedby=sortedby, |
| groupedby=groupedby, |
| maptype=whoosh.sorting.Count, |
| filter=filter, |
| ) |
| self.env.log.debug("Whoosh query to execute: %s", |
| query_parameters) |
| raw_page = searcher.search_page(**query_parameters) |
| results = self._process_results(raw_page, |
| fields, |
| highlight_fields, |
| query_parameters) |
| if query_string is not None: |
| c = searcher.correct_query(query, query_string) |
| results.query_suggestion = c.string |
| try: |
| actual_query = unicode(query.simplify(searcher)) |
| results.debug['actual_query'] = actual_query |
| # pylint: disable=bare-except |
| except: |
| # Simplify has a bug that causes it to fail sometimes. |
| pass |
| return results |
| |
| def _apply_advanced_security(self, searcher, context=None): |
| if not self.advanced_security: |
| return |
| |
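| # Replace the searcher's collector factory with one that wraps the |
| # produced collector in an AdvancedFilterCollector, so every hit is |
| # checked against SecurityPreprocessor before being collected. This is |
| # a per-document permission check and can be slow (see the note on |
| # AdvancedFilterCollector below). |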
| old_collector = searcher.collector |
| security_processor = SecurityPreprocessor(self.env) |
| |
| def check_permission(doc): |
| return security_processor.check_permission(doc, context) |
| |
| def collector(*args, **kwargs): |
| c = old_collector(*args, **kwargs) |
| if isinstance(c, FilterCollector): |
| c = AdvancedFilterCollector( |
| c.child, c.allow, c.restrict, check_permission |
| ) |
| else: |
| c = AdvancedFilterCollector( |
| c, None, None, check_permission |
| ) |
| return c |
| searcher.collector = collector |
| |
| def _create_unique_id(self, product, doc_type, doc_id): |
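| # e.g. ("p1", "ticket", "123") -> u"p1:ticket:123", and with an empty |
| # product ("", "wiki", "Home") -> u"wiki:Home" |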
| if product: |
| return u"%s:%s:%s" % (product, doc_type, doc_id) |
| else: |
| return u"%s:%s" % (doc_type, doc_id) |
| |
| def _to_whoosh_format(self, value): |
| if isinstance(value, basestring): |
| value = unicode(value) |
| elif isinstance(value, datetime): |
| value = self._convert_date_to_tz_naive_utc(value) |
| return value |
| |
| def _convert_date_to_tz_naive_utc(self, value): |
| """Convert datetime to naive utc datetime |
| Whoosh can not read from index datetime values passed from Trac with |
| tzinfo=trac.util.datefmt.FixedOffset because of non-empty |
| constructor of FixedOffset""" |
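| # e.g. 2013-01-01 12:00:00+02:00 becomes the naive UTC value |
| # 2013-01-01 10:00:00 |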
| if value.tzinfo: |
| utc_time = value.astimezone(utc) |
| value = utc_time.replace(tzinfo=None) |
| return value |
| |
| def _from_whoosh_format(self, value): |
| if isinstance(value, datetime): |
| value = utc.localize(value) |
| return value |
| |
| def _prepare_groupedby(self, facets): |
| if not facets: |
| return None |
| groupedby = whoosh.sorting.Facets() |
| for facet_name in facets: |
| groupedby.add_field( |
| facet_name, |
| allow_overlap=True, |
| maptype=whoosh.sorting.Count) |
| return groupedby |
| |
| def _prepare_sortedby(self, sort): |
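| # Translates the plugin's sort instructions into Whoosh facets; e.g. a |
| # sort instruction with field="time" and order=DESC (field name assumed |
| # for illustration) becomes FieldFacet("time", reverse=True). |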
| if not sort: |
| return None |
| sortedby = [] |
| for sort_instruction in sort: |
| field = sort_instruction.field |
| order = sort_instruction.order |
| if field.lower() == SCORE: |
| if self._is_desc(order): |
| #We can implement this later with our own ScoreFacet supporting |
| # "score DESC" |
| raise TracError( |
| "Whoosh does not support DESC score ordering.") |
| sort_condition = whoosh.sorting.ScoreFacet() |
| else: |
| sort_condition = whoosh.sorting.FieldFacet( |
| field, |
| reverse=self._is_desc(order)) |
| sortedby.append(sort_condition) |
| return sortedby |
| |
| def _prepare_highlight_fields(self, highlight, highlight_fields): |
| if not highlight: |
| return () |
| |
| if not highlight_fields: |
| highlight_fields = self._all_highlightable_fields() |
| |
| return highlight_fields |
| |
| def _all_highlightable_fields(self): |
| return [name for name, field in self.SCHEMA.items() |
| if self._is_highlightable(field)] |
| |
| def _is_highlightable(self, field): |
| return not isinstance(field, whoosh.fields.DATETIME) and field.stored |
| |
| def _is_desc(self, order): |
| return order.lower() == DESC |
| |
| def _process_results(self, |
| page, |
| fields, |
| highlight_fields, |
| search_parameters=None): |
| """ |
| :type fields: iterator |
| :type page: ResultsPage |
| |
| It's important to grab the hits first, before slicing; otherwise |
| pagination can fail. |
| """ |
| results = QueryResult() |
| results.hits = page.total |
| results.total_page_count = page.pagecount |
| results.page_number = page.pagenum |
| results.offset = page.offset |
| results.facets = self._load_facets(page) |
| |
| docs = [] |
| highlighting = [] |
| for retrieved_record in page: |
| result_doc = self._process_record(fields, retrieved_record) |
| docs.append(result_doc) |
| |
| result_highlights = self._create_highlights(highlight_fields, |
| retrieved_record) |
| highlighting.append(result_highlights) |
| results.docs = docs |
| results.highlighting = highlighting |
| |
| results.debug["search_parameters"] = search_parameters |
| return results |
| |
| def _process_record(self, fields, retrieved_record): |
| result_doc = dict() |
| #add score field by default |
| if not fields or SCORE in fields: |
| score = retrieved_record.score |
| result_doc[SCORE] = score |
| |
| if fields: |
| for field in fields: |
| if field in retrieved_record: |
| result_doc[field] = retrieved_record[field] |
| else: |
| for key, value in retrieved_record.items(): |
| result_doc[key] = value |
| |
| for key, value in result_doc.iteritems(): |
| result_doc[key] = self._from_whoosh_format(value) |
| return result_doc |
| |
| def _load_facets(self, page): |
| """This method can be also used by unit-tests""" |
| non_paged_results = page.results |
| facet_names = non_paged_results.facet_names() |
| if not facet_names: |
| return None |
| facets_result = dict() |
| for name in facet_names: |
| facets_result[name] = non_paged_results.groups(name) |
| return facets_result |
| |
| def _make_dir_if_not_exists(self): |
| if not os.path.exists(self.index_dir): |
| os.mkdir(self.index_dir) |
| |
| if not os.access(self.index_dir, os.W_OK): |
| raise TracError( |
| "The path to Whoosh index '%s' is not writable for the " |
| "current user." % self.index_dir) |
| |
| def _create_highlights(self, fields, record): |
| result_highlights = dict() |
| fragmenter = whoosh.highlight.ContextFragmenter( |
| self.max_fragment_size, |
| self.fragment_surround, |
| ) |
| highlighter = whoosh.highlight.Highlighter( |
| formatter=WhooshEmFormatter(), |
| fragmenter=fragmenter) |
| |
| for field in fields: |
| if field in record: |
| highlighted = highlighter.highlight_hit(record, field) |
| else: |
| highlighted = '' |
| result_highlights[field] = highlighted |
| return result_highlights |
| |
| |
| class WhooshEmFormatter(whoosh.highlight.HtmlFormatter): |
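| # Renders each matched term wrapped in <em> tags, e.g. u"bug" becomes |
| # u"<em>bug</em>" in a highlighted fragment. |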
| template = '<em>%(t)s</em>' |
| |
| |
| class AdvancedFilterCollector(FilterCollector): |
| """An advanced filter collector, accepting a callback function that |
| will be called for each document to determine whether it should be |
| filtered out or not. |
| |
| Please note that it can be slow. Very slow. |
| """ |
| |
| def __init__(self, child, allow, restrict, filter_func=None): |
| FilterCollector.__init__(self, child, allow, restrict) |
| self.filter_func = filter_func |
| |
| def collect_matches(self): |
| child = self.child |
| _allow = self._allow |
| _restrict = self._restrict |
| |
| if _allow is not None or _restrict is not None or self.filter_func: |
| filtered_count = self.filtered_count |
| for sub_docnum in child.matches(): |
| global_docnum = self.offset + sub_docnum |
| if ((_allow is not None and global_docnum not in _allow) |
| or (_restrict is not None and global_docnum in _restrict)): |
| filtered_count += 1 |
| continue |
| |
| if self.filter_func: |
| doc = self.subsearcher.stored_fields(sub_docnum) |
| if not self.filter_func(doc): |
| filtered_count += 1 |
| continue |
| |
| child.collect(sub_docnum) |
| # pylint: disable=attribute-defined-outside-init |
| self.filtered_count = filtered_count |
| else: |
| # If there was no allow, restrict or filter set, don't do anything |
| # special; just forward the call to the child collector |
| child.collect_matches() |