blob: 1647c4b94f17c601436ce6db217d26402c5a1287 [file] [log] [blame]
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
r"""Whoosh specific backend for Bloodhound Search plugin."""
from bhsearch import BHSEARCH_CONFIG_SECTION
from bhsearch.api import ISearchBackend, DESC, QueryResult, SCORE, \
IDocIndexPreprocessor, IResultPostprocessor, IndexFields, \
IQueryPreprocessor
import os
from bhsearch.search_resources.ticket_search import TicketFields
from bhsearch.security import SecurityPreprocessor
from bhsearch.utils import get_global_env
from trac.core import Component, implements, TracError
from trac.config import Option, IntOption
from trac.util.text import empty
from trac.util.datefmt import utc
from whoosh.fields import Schema, ID, DATETIME, KEYWORD, TEXT
from whoosh import index, analysis
import whoosh
import whoosh.highlight
from whoosh.collectors import FilterCollector
from whoosh.writing import AsyncWriter
from datetime import datetime
from bhsearch.whoosh_fixes import fixes_for
# Apply the compatibility patches registered for the installed Whoosh
# version. Every fix is invoked without arguments; it is called directly
# instead of going through the long-deprecated ``apply`` builtin.
for fix in fixes_for(whoosh.__version__):
    fix()

# Name of the schema field that holds the unique key of a document.
UNIQUE_ID = "unique_id"
class WhooshBackend(Component):
    """
    Implements the ISearchBackend interface on top of a Whoosh index.

    The class-level options are read from the bhsearch configuration
    section; SCHEMA describes the fields stored in the index.
    """
    implements(ISearchBackend)

    # Directory holding the Whoosh index files.
    index_dir_setting = Option(
        BHSEARCH_CONFIG_SECTION,
        'whoosh_index_dir',
        default='whoosh_index',
        doc="""Relative path is resolved relatively to the
directory of the environment.""")

    # NOTE(review): this is declared as a plain Option while it is used as a
    # boolean switch; a configured value is presumably returned as a string
    # (so "false" would be truthy) -- confirm whether BoolOption is intended.
    advanced_security = Option(
        BHSEARCH_CONFIG_SECTION,
        'advanced_security',
        default=False,
        doc="Check view permission for each document when retrieving results."
    )

    # Highlighting fragment size limits, passed to the context fragmenter.
    max_fragment_size = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'max_fragment_size',
        default=240,
        doc="The maximum number of characters allowed in a fragment.")

    fragment_surround = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'fragment_surround',
        default=60,
        doc="""The number of extra characters of context to add both before
the first matched term and after the last matched term.""")

    # This is a schema prototype and is expected to change later.
    # TODO: add support for other fields and for dynamic fields.
    # The schema should eventually be driven by the index participants.
    SCHEMA = Schema(
        unique_id=ID(stored=True, unique=True),
        id=ID(stored=True),
        type=ID(stored=True),
        product=ID(stored=True),
        milestone=ID(stored=True),
        time=DATETIME(stored=True),
        due=DATETIME(stored=True),
        completed=DATETIME(stored=True),
        author=ID(stored=True),
        component=ID(stored=True),
        status=ID(stored=True),
        resolution=ID(stored=True),
        keywords=KEYWORD(scorable=True),
        summary=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        content=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        changes=TEXT(analyzer=analysis.StandardAnalyzer(stoplist=None)),
        owner=TEXT(stored=True,
                   analyzer=analysis.SimpleAnalyzer()),
        repository=TEXT(stored=True,
                        analyzer=analysis.SimpleAnalyzer()),
        revision=TEXT(stored=True,
                      analyzer=analysis.SimpleAnalyzer()),
        message=TEXT(stored=True,
                     analyzer=analysis.SimpleAnalyzer()),
        required_permission=ID(),
        name=TEXT(stored=True,
                  analyzer=analysis.SimpleAnalyzer()),
        query_suggestion_basket=TEXT(analyzer=analysis.SimpleAnalyzer(),
                                     spelling=True),
        relations=KEYWORD(lowercase=True, commas=True),
    )
def __init__(self):
    """Resolve the index directory and open the index if it exists.

    A relative ``index_dir_setting`` is joined onto the path of the global
    environment. ``self.index`` is left as ``None`` when no index is found
    in the resolved directory.
    """
    configured_dir = self.index_dir_setting
    if os.path.isabs(configured_dir):
        self.index_dir = configured_dir
    else:
        environment_path = get_global_env(self.env).path
        self.index_dir = os.path.join(environment_path, configured_dir)

    index_is_present = index.exists_in(self.index_dir)
    self.index = index.open_dir(self.index_dir) if index_is_present else None
# ISearchBackend methods
def start_operation(self):
    """Return a new index writer to be used as an operation context."""
    operation_writer = self._create_writer()
    return operation_writer
def _create_writer(self):
    """Build an AsyncWriter bound to the currently opened index."""
    async_writer = AsyncWriter(self.index)
    return async_writer
def add_doc(self, doc, operation_context=None):
    """Add any type of document to the index.

    The contents should be a dict with fields matching the search schema.
    The only required fields are type and id, everything else is optional.
    The dict is modified in place: values are reformatted for Whoosh and
    the unique id field is added.

    :param doc: dict describing the document to index
    :param operation_context: writer returned by ``start_operation``; when
        omitted a local writer is created and committed here (or cancelled
        if the update fails)
    """
    # Prepare the document before a writer is opened, so that a failure
    # here (e.g. a missing "type" or "id") cannot leave a locally created
    # writer that is neither committed nor cancelled.
    self._reformat_doc(doc)
    doc[UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                            doc["type"],
                                            doc["id"])
    self.log.debug("Doc to index: %s", doc)

    writer = operation_context
    is_local_writer = writer is None
    if is_local_writer:
        writer = self._create_writer()
    try:
        writer.update_document(**doc)
        if is_local_writer:
            writer.commit()
    # The exception is always re-raised; the handler only releases the
    # local writer.
    # pylint: disable=bare-except
    except:
        if is_local_writer:
            writer.cancel()
        raise
def _reformat_doc(self, doc):
"""
Strings must be converted unicode format accepted by Whoosh.
"""
for key, value in doc.items():
if key is None:
del doc[None]
elif value is None:
del doc[key]
elif isinstance(value, basestring) and value == "":
del doc[key]
else:
doc[key] = self._to_whoosh_format(value)
def delete_doc(self, product, doc_type, doc_id, operation_context=None):
    """Remove the document identified by product, type and id.

    When ``operation_context`` is omitted a local writer is created; it is
    committed on success and cancelled if the deletion raises. The
    exception is re-raised in either case.
    """
    doc_key = self._create_unique_id(product, doc_type, doc_id)
    self.log.debug('Removing document from the index: %s', doc_key)

    owns_writer = operation_context is None
    writer = self._create_writer() if owns_writer else operation_context
    try:
        writer.delete_by_term(UNIQUE_ID, doc_key)
        if owns_writer:
            writer.commit()
    except:
        if owns_writer:
            writer.cancel()
        raise
def optimize(self):
    """Commit an otherwise empty write with ``optimize=True``."""
    AsyncWriter(self.index).commit(optimize=True)
def is_index_outdated(self):
    """Tell whether the index is missing or its schema differs from SCHEMA."""
    if self.index is None:
        return True
    schema_matches = self.index.schema == self.SCHEMA
    return not schema_matches
def recreate_index(self):
    """Create a new index with SCHEMA in the index directory.

    The directory is created first if necessary. The new index replaces
    ``self.index`` and is also returned.
    """
    announcement = 'Creating Whoosh index in %s' % self.index_dir
    self.log.info(announcement)
    self._make_dir_if_not_exists()
    fresh_index = index.create_in(self.index_dir, schema=self.SCHEMA)
    self.index = fresh_index
    return self.index
def query(self,
          query,
          query_string=None,
          sort=None,
          fields=None,
          filter=None,
          facets=None,
          pagenum=1,
          pagelen=20,
          highlight=False,
          highlight_fields=None,
          context=None):
    """Run a query against the index and return a QueryResult.

    :param query: Whoosh query object to execute
    :param query_string: original query text; when given, a corrected
        query string is stored in ``query_suggestion`` of the result
    :param sort: sort instructions, see ``_prepare_sortedby``
    :param fields: names of the fields to copy into each result document;
        all stored fields are returned when empty
    :param filter: filter passed unchanged to ``search_page``
    :param facets: facet names, passed unchanged as ``groupedby``
    :param pagenum: number of the page to retrieve
    :param pagelen: number of results per page
    :param highlight: whether highlights should be generated
    :param highlight_fields: fields to highlight; all highlightable schema
        fields are used when empty and ``highlight`` is set
    :param context: passed to the permission check when advanced security
        is enabled
    """
    # pylint: disable=too-many-locals
    with self.index.searcher() as searcher:
        self._apply_advanced_security(searcher, context)

        highlight_fields = self._prepare_highlight_fields(highlight,
                                                          highlight_fields)

        sortedby = self._prepare_sortedby(sort)

        #TODO: investigate how faceting is applied to multi-value fields
        #e.g. keywords. For now, just pass the facets list to Whoosh API
        #groupedby = self._prepare_groupedby(facets)
        groupedby = facets

        query_parameters = dict(
            query=query,
            pagenum=pagenum,
            pagelen=pagelen,
            sortedby=sortedby,
            groupedby=groupedby,
            maptype=whoosh.sorting.Count,
            filter=filter,
        )
        self.env.log.debug("Whoosh query to execute: %s",
                           query_parameters)
        raw_page = searcher.search_page(**query_parameters)
        results = self._process_results(raw_page,
                                        fields,
                                        highlight_fields,
                                        query_parameters)
        if query_string is not None:
            c = searcher.correct_query(query, query_string)
            results.query_suggestion = c.string
        try:
            actual_query = unicode(query.simplify(searcher))
            results.debug['actual_query'] = actual_query
        # Simplify has a bug that causes it to fail sometimes. The value is
        # only debug information, so the failure is logged and ignored, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        # pylint: disable=broad-except
        except Exception:
            self.env.log.debug("Could not simplify the Whoosh query",
                               exc_info=True)
    return results
def _apply_advanced_security(self, searcher, context=None):
if not self.advanced_security:
return
old_collector = searcher.collector
security_processor = SecurityPreprocessor(self.env)
def check_permission(doc):
return security_processor.check_permission(doc, context)
def collector(*args, **kwargs):
c = old_collector(*args, **kwargs)
if isinstance(c, FilterCollector):
c = AdvancedFilterCollector(
c.child, c.allow, c.restrict, check_permission
)
else:
c = AdvancedFilterCollector(
c, None, None, check_permission
)
return c
searcher.collector = collector
def _create_unique_id(self, product, doc_type, doc_id):
product, doc_type, doc_id = \
self._apply_empty_facets_workaround(product, doc_type, doc_id)
if product:
return u"%s:%s:%s" % (product, doc_type, doc_id)
else:
return u"%s:%s" % (doc_type, doc_id)
def _apply_empty_facets_workaround(self, product, doc_type, doc_id):
    """Run the key parts through WhooshEmptyFacetErrorWorkaround.

    The same workaround is applied at insertion time, so the parts used
    for the unique id have to be processed the same way. Returns the
    ``(product, doc_type, doc_id)`` tuple after pre-processing.
    """
    key_fields = dict()
    key_fields[IndexFields.PRODUCT] = product
    key_fields[IndexFields.TYPE] = doc_type
    key_fields[IndexFields.ID] = doc_id

    workaround = WhooshEmptyFacetErrorWorkaround(self.env)
    workaround.pre_process(key_fields)

    fixed_product = key_fields[IndexFields.PRODUCT]
    fixed_type = key_fields[IndexFields.TYPE]
    fixed_id = key_fields[IndexFields.ID]
    return (fixed_product, fixed_type, fixed_id)
def _to_whoosh_format(self, value):
if isinstance(value, basestring):
value = unicode(value)
elif isinstance(value, datetime):
value = self._convert_date_to_tz_naive_utc(value)
return value
def _convert_date_to_tz_naive_utc(self, value):
"""Convert datetime to naive utc datetime
Whoosh can not read from index datetime values passed from Trac with
tzinfo=trac.util.datefmt.FixedOffset because of non-empty
constructor of FixedOffset"""
if value.tzinfo:
utc_time = value.astimezone(utc)
value = utc_time.replace(tzinfo=None)
return value
def _from_whoosh_format(self, value):
if isinstance(value, datetime):
value = utc.localize(value)
return value
def _prepare_groupedby(self, facets):
if not facets:
return None
groupedby = whoosh.sorting.Facets()
for facet_name in facets:
groupedby.add_field(
facet_name,
allow_overlap=True,
maptype=whoosh.sortingwhoosh.Count)
return groupedby
def _prepare_sortedby(self, sort):
if not sort:
return None
sortedby = []
for sort_instruction in sort:
field = sort_instruction.field
order = sort_instruction.order
if field.lower() == SCORE:
if self._is_desc(order):
#We can implement tis later by our own ScoreFacet with
# "score DESC" support
raise TracError(
"Whoosh does not support DESC score ordering.")
sort_condition = whoosh.sorting.ScoreFacet()
else:
sort_condition = whoosh.sorting.FieldFacet(
field,
reverse=self._is_desc(order))
sortedby.append(sort_condition)
return sortedby
def _prepare_highlight_fields(self, highlight, highlight_fields):
if not highlight:
return ()
if not highlight_fields:
highlight_fields = self._all_highlightable_fields()
return highlight_fields
def _all_highlightable_fields(self):
return [name for name, field in self.SCHEMA.items()
if self._is_highlightable(field)]
def _is_highlightable(self, field):
    """Tell whether a schema field is stored and is not a DATETIME field."""
    if isinstance(field, whoosh.fields.DATETIME):
        return False
    return field.stored
def _is_desc(self, order):
    """Tell whether ``order`` equals DESC, ignoring case."""
    normalised_order = order.lower()
    return normalised_order == DESC
def _process_results(self,
                     page,
                     fields,
                     highlight_fields,
                     search_parameters=None):
    """
    Convert a Whoosh results page into a QueryResult.

    :type fields: iterator
    :type page: ResultsPage
    """
    # It's important to grab the hits first before slicing. Otherwise, this
    # can cause pagination failures.
    outcome = QueryResult()
    outcome.hits = page.total
    outcome.total_page_count = page.pagecount
    outcome.page_number = page.pagenum
    outcome.offset = page.offset
    outcome.facets = self._load_facets(page)

    documents = []
    highlights = []
    for hit in page:
        documents.append(self._process_record(fields, hit))
        highlights.append(self._create_highlights(highlight_fields, hit))

    outcome.docs = documents
    outcome.highlighting = highlights
    outcome.debug["search_parameters"] = search_parameters
    return outcome
def _process_record(self, fields, retrieved_record):
    """Build the result dict for a single hit.

    The score is added when no fields are requested or when the score is
    among them. With ``fields`` given only those present in the record are
    copied, otherwise every item of the record. All values are then passed
    through ``_from_whoosh_format``.
    """
    converted = dict()
    # add score field by default
    if not fields or SCORE in fields:
        converted[SCORE] = retrieved_record.score

    if not fields:
        for name, stored_value in retrieved_record.items():
            converted[name] = stored_value
    else:
        for name in fields:
            if name in retrieved_record:
                converted[name] = retrieved_record[name]

    for name in list(converted):
        converted[name] = self._from_whoosh_format(converted[name])
    return converted
def _load_facets(self, page):
"""This method can be also used by unit-tests"""
non_paged_results = page.results
facet_names = non_paged_results.facet_names()
if not facet_names:
return None
facets_result = dict()
for name in facet_names:
facets_result[name] = non_paged_results.groups(name)
return facets_result
def _make_dir_if_not_exists(self):
if not os.path.exists(self.index_dir):
os.mkdir(self.index_dir)
if not os.access(self.index_dir, os.W_OK):
raise TracError(
"The path to Whoosh index '%s' is not writable for the\
current user."
% self.index_dir)
def _create_highlights(self, fields, record):
    """Produce highlighted fragments of ``record`` for each given field.

    Returns a dict keyed by field name; fields missing from the record map
    to an empty string. Fragments are sized by ``max_fragment_size`` and
    ``fragment_surround`` and formatted by WhooshEmFormatter.
    """
    snippets = dict()
    context_fragmenter = whoosh.highlight.ContextFragmenter(
        self.max_fragment_size,
        self.fragment_surround,
    )
    em_formatter = WhooshEmFormatter()
    highlighter = whoosh.highlight.Highlighter(
        formatter=em_formatter,
        fragmenter=context_fragmenter)
    for field in fields:
        if field not in record:
            snippets[field] = ''
            continue
        snippets[field] = highlighter.highlight_hit(record, field)
    return snippets
class WhooshEmFormatter(whoosh.highlight.HtmlFormatter):
    """HtmlFormatter variant whose template is a plain ``<em>`` element."""
    # presumably %(t)s is replaced with the matched text by the base
    # HtmlFormatter -- confirm against the Whoosh highlight module
    template = '<em>%(t)s</em>'
class WhooshEmptyFacetErrorWorkaround(Component):
    """
    Whoosh 2.4.1 raises "IndexError: list index out of range"
    when a search contains facets on a field that is missing in at least
    one document in the index. The error manifests only when the index
    contains more than one segment.

    The goal of this class is to temporarily solve the problem for the
    prototype phase. For the non-prototype phase, the problem should be
    solved by the next version of Whoosh.

    Remove this class when a fixed version of Whoosh is introduced.
    """
    implements(IDocIndexPreprocessor)
    implements(IResultPostprocessor)
    implements(IQueryPreprocessor)

    # Placeholder value written into fields that would otherwise be empty.
    NULL_MARKER = u"empty"

    # Fields that get NULL_MARKER when they are missing or empty.
    should_not_be_empty_fields = [
        IndexFields.STATUS,
        TicketFields.MILESTONE,
        TicketFields.COMPONENT,
        IndexFields.PRODUCT,
    ]
# IDocIndexPreprocessor methods
def pre_process(self, doc):
    """Fill missing or empty must-have fields of ``doc`` with NULL_MARKER.

    A field counts as empty when it is absent, ``None`` or equal to
    ``empty``. The dict is modified in place.
    """
    for field in self.should_not_be_empty_fields:
        current = doc.get(field)
        if current is None or current == empty:
            doc[field] = self.NULL_MARKER
# IResultPostprocessor methods
def post_process(self, query_result):
    """Undo the NULL_MARKER substitution in a query result.

    In every facet count dict the NULL_MARKER key is renamed to ``None``,
    and fields whose value is NULL_MARKER are removed from the result
    documents. ``query_result`` is modified in place.
    """
    # fix facets
    if query_result.facets:
        for count_dict in query_result.facets.values():
            # Rename the key without iterating the dict: adding and
            # deleting keys during iteration is not safe.
            if self.NULL_MARKER in count_dict:
                count_dict[None] = count_dict.pop(self.NULL_MARKER)

    # fix query_result.docs
    for doc in query_result.docs:
        # Iterate over a snapshot because entries are deleted in the loop.
        for field, value in list(doc.items()):
            if value == self.NULL_MARKER:
                del doc[field]
# IQueryPreprocessor methods
def query_pre_process(self, query_parameters, context=None):
    """
    Go through the filter and the query and replace "NOT (field_name:*)"
    conditions with "field_name:NULL_MARKER" conditions.

    This is a really quick fix to make the prototype work, with the hope
    that the next Whoosh version will be released soon.
    """
    # pylint: disable=unused-argument
    for parameter_name in ("filter", "query"):
        condition = query_parameters.get(parameter_name)
        if not condition:
            continue
        replacement = self._find_and_fix_condition(condition)
        if replacement:
            query_parameters[parameter_name] = replacement
def _find_and_fix_condition(self, filter_condition):
    """Find "NOT field:*" conditions on the must-have fields.

    Compound queries are walked recursively and their matching subqueries
    are replaced in place; ``None`` is returned for them. For a matching
    Not query the replacement ``Term(field, NULL_MARKER)`` is returned.
    Any other condition yields ``None``.
    """
    if isinstance(filter_condition, whoosh.query.CompoundQuery):
        # Walk a copy of the list, replacements go into the original.
        children = list(filter_condition.subqueries)
        for position, child in enumerate(children):
            replacement = self._find_and_fix_condition(child)
            if replacement:
                filter_condition.subqueries[position] = replacement
        return None

    if not isinstance(filter_condition, whoosh.query.Not):
        return None

    negated = filter_condition.query
    if isinstance(negated, whoosh.query.Every) and \
            negated.fieldname in self.should_not_be_empty_fields:
        return whoosh.query.Term(negated.fieldname, self.NULL_MARKER)
    return None
class AdvancedFilterCollector(FilterCollector):
    """An advanced filter collector, accepting a callback function that
    will be called for each document to determine whether it should be
    filtered out or not.

    The callback receives the stored fields of a document; a false result
    filters the document out.

    Please note that it can be slow. Very slow.
    """
    def __init__(self, child, allow, restrict, filter_func=None):
        # child, allow and restrict are handled by the base collector;
        # only the callback is kept here.
        FilterCollector.__init__(self, child, allow, restrict)
        self.filter_func = filter_func
def collect_matches(self):
    """Collect the child's matches that pass all configured filters.

    A match is dropped (and counted in ``filtered_count``) when it is not
    in the allow set, is in the restrict set, or when ``filter_func``
    returns a false value for its stored fields. Without any filter the
    call is simply forwarded to the child collector.
    """
    child = self.child
    _allow = self._allow
    _restrict = self._restrict
    filter_func = self.filter_func

    if _allow is None and _restrict is None and not filter_func:
        # If there is no allow set, restrict set or callback, don't do
        # anything special, just forward the call to the child collector
        child.collect_matches()
        return

    # The callback has to run even when no allow/restrict set is given,
    # otherwise it would be silently skipped for such collectors.
    filtered_count = self.filtered_count
    for sub_docnum in child.matches():
        global_docnum = self.offset + sub_docnum
        if ((_allow is not None and global_docnum not in _allow)
                or (_restrict is not None and global_docnum in _restrict)):
            filtered_count += 1
            continue

        if filter_func:
            doc = self.subsearcher.stored_fields(sub_docnum)
            if not filter_func(doc):
                filtered_count += 1
                continue

        child.collect(sub_docnum)
    # pylint: disable=attribute-defined-outside-init
    self.filtered_count = filtered_count