blob: e22f318247917b89b772ccb7313ab1fabd1b7d84 [file] [log] [blame]
# -*- coding: UTF-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
r"""Whoosh specific backend for Bloodhound Search plugin."""
import os
from datetime import datetime

from trac.config import BoolOption, IntOption, Option
from trac.core import Component, implements, TracError
from trac.env import ISystemInfoProvider
from trac.util.datefmt import utc

import whoosh
# highlight and sorting are accessed as whoosh.highlight / whoosh.sorting
# attributes below; import them explicitly instead of relying on other
# whoosh imports to pull them in as a side effect.
from whoosh import analysis, highlight, index, sorting
from whoosh.collectors import FilterCollector
from whoosh.fields import Schema, ID, DATETIME, KEYWORD, TEXT
from whoosh.writing import AsyncWriter

from bhsearch import BHSEARCH_CONFIG_SECTION
from bhsearch.api import ISearchBackend, DESC, QueryResult, SCORE
from bhsearch.security import SecurityPreprocessor
from bhsearch.utils import get_global_env
# Name of the schema field holding the globally unique document key,
# formatted as "[product:]type:id" (see WhooshBackend._create_unique_id).
# It is the term used for update_document()/delete_by_term() operations.
UNIQUE_ID = "unique_id"
class WhooshBackend(Component):
    """Whoosh implementation of the Bloodhound Search backend
    (ISearchBackend).

    Documents are kept in an on-disk Whoosh index whose location comes
    from the ``whoosh_index_dir`` option, resolved relative to the global
    environment directory when the configured path is not absolute.
    """
    implements(ISearchBackend, ISystemInfoProvider)

    index_dir_setting = Option(
        BHSEARCH_CONFIG_SECTION,
        'whoosh_index_dir',
        default='whoosh_index',
        doc="""Relative path is resolved relatively to the
        directory of the environment.""", doc_domain='bhsearch')

    # BoolOption (was plain Option): with Option, a textual value such as
    # "false" in trac.ini is a non-empty string and therefore truthy, so
    # the feature could not be switched off once set. BoolOption coerces
    # the configured value to a real boolean.
    advanced_security = BoolOption(
        BHSEARCH_CONFIG_SECTION,
        'advanced_security',
        default=False,
        doc="Check view permission for each document when retrieving results.",
        doc_domain='bhsearch'
    )

    max_fragment_size = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'max_fragment_size',
        default=240,
        doc="The maximum number of characters allowed in a fragment.",
        doc_domain='bhsearch')

    fragment_surround = IntOption(
        BHSEARCH_CONFIG_SECTION,
        'fragment_surround',
        default=60,
        doc="""The number of extra characters of context to add both before
        the first matched term and after the last matched term.""",
        doc_domain='bhsearch')

    #This is schema prototype. It will be changed later
    #TODO: add other fields support, add dynamic field support.
    #Schema must be driven by index participants
    SCHEMA = Schema(
        unique_id=ID(stored=True, unique=True),
        id=ID(stored=True),
        type=ID(stored=True),
        product=ID(stored=True),
        milestone=ID(stored=True),
        time=DATETIME(stored=True),
        due=DATETIME(stored=True),
        completed=DATETIME(stored=True),
        author=ID(stored=True),
        component=ID(stored=True),
        status=ID(stored=True),
        resolution=ID(stored=True),
        keywords=KEYWORD(scorable=True),
        summary=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        content=TEXT(stored=True,
                     analyzer=analysis.StandardAnalyzer(stoplist=None)),
        changes=TEXT(analyzer=analysis.StandardAnalyzer(stoplist=None)),
        owner=TEXT(stored=True,
                   analyzer=analysis.SimpleAnalyzer()),
        repository=TEXT(stored=True,
                        analyzer=analysis.SimpleAnalyzer()),
        revision=TEXT(stored=True,
                      analyzer=analysis.SimpleAnalyzer()),
        message=TEXT(stored=True,
                     analyzer=analysis.SimpleAnalyzer()),
        required_permission=ID(),
        name=TEXT(stored=True,
                  analyzer=analysis.SimpleAnalyzer()),
        query_suggestion_basket=TEXT(analyzer=analysis.SimpleAnalyzer(),
                                     spelling=True),
        relations=KEYWORD(lowercase=True, commas=True),
    )

    def __init__(self):
        self.index_dir = self.index_dir_setting
        if not os.path.isabs(self.index_dir):
            self.index_dir = os.path.join(get_global_env(self.env).path,
                                          self.index_dir)
        if index.exists_in(self.index_dir):
            self.index = index.open_dir(self.index_dir)
        else:
            # No index on disk yet; recreate_index() must be called before
            # any indexing or querying can happen.
            self.index = None

    # ISystemInfoProvider methods
    def get_system_info(self):
        yield 'Whoosh', whoosh.versionstring()

    # ISearchBackend methods
    def start_operation(self):
        """Return a writer batching several add/delete calls into one
        commit; pass it as ``operation_context`` to add_doc()/delete_doc().
        """
        return self._create_writer()

    def _create_writer(self):
        return AsyncWriter(self.index)

    def add_doc(self, doc, operation_context=None):
        """Add any type of document index.

        The contents should be a dict with fields matching the search schema.
        The only required fields are type and id, everything else is optional.

        :param doc: dict of field name -> value; mutated in place.
        :param operation_context: optional writer from start_operation();
            when None a local writer is created and committed here.
        """
        writer = operation_context
        is_local_writer = False
        if writer is None:
            is_local_writer = True
            writer = self._create_writer()

        self._reformat_doc(doc)
        doc[UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"],
                                                doc["id"])
        self.log.debug("Doc to index: %s", doc)
        try:
            writer.update_document(**doc)
            if is_local_writer:
                writer.commit()
        except:  # pylint: disable=bare-except
            # Only roll back a writer we created ourselves; a caller-owned
            # writer must remain usable for the rest of its batch.
            if is_local_writer:
                writer.cancel()
            raise

    def _reformat_doc(self, doc):
        """
        Strings must be converted unicode format accepted by Whoosh.

        Also drops None keys and None/empty-string values in place.
        Deleting while iterating is safe here because Python 2
        dict.items() returns a list snapshot.
        """
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        """Delete the document identified by (product, type, id),
        committing immediately unless a writer is supplied."""
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.log.debug('Removing document from the index: %s', unique_id)
        writer = operation_context
        is_local_writer = False
        if writer is None:
            is_local_writer = True
            writer = self._create_writer()
        try:
            writer.delete_by_term(UNIQUE_ID, unique_id)
            if is_local_writer:
                writer.commit()
        except:  # pylint: disable=bare-except
            if is_local_writer:
                writer.cancel()
            raise

    def optimize(self):
        """Merge the index segments for faster searching."""
        writer = AsyncWriter(self.index)
        writer.commit(optimize=True)

    def is_index_outdated(self):
        """Return True when the index is missing or its on-disk schema no
        longer matches SCHEMA, meaning a full rebuild is required."""
        return self.index is None or self.index.schema != self.SCHEMA

    def recreate_index(self):
        """Create a fresh empty index in index_dir (replacing any existing
        one) and return it."""
        self.log.info('Creating Whoosh index in %s', self.index_dir)
        self._make_dir_if_not_exists()
        self.index = index.create_in(self.index_dir, schema=self.SCHEMA)
        return self.index

    def query(self,
              query,
              query_string=None,
              sort=None,
              fields=None,
              filter=None,
              facets=None,
              pagenum=1,
              pagelen=20,
              highlight=False,
              highlight_fields=None,
              context=None):
        """Execute *query* against the index and return a QueryResult
        page, with optional sorting, faceting and hit highlighting.

        :param query_string: raw user query; when given, a spelling
            suggestion is attached to the result.
        """
        # pylint: disable=too-many-locals
        with self.index.searcher() as searcher:
            self._apply_advanced_security(searcher, context)

            highlight_fields = self._prepare_highlight_fields(
                highlight, highlight_fields)

            sortedby = self._prepare_sortedby(sort)

            #TODO: investigate how faceting is applied to multi-value fields
            #e.g. keywords. For now, just pass facets list to Whoosh API
            #groupedby = self._prepare_groupedby(facets)
            groupedby = facets

            query_parameters = dict(
                query=query,
                pagenum=pagenum,
                pagelen=pagelen,
                sortedby=sortedby,
                groupedby=groupedby,
                maptype=whoosh.sorting.Count,
                filter=filter,
            )
            self.env.log.debug("Whoosh query to execute: %s",
                               query_parameters)
            raw_page = searcher.search_page(**query_parameters)
            results = self._process_results(raw_page,
                                            fields,
                                            highlight_fields,
                                            query_parameters)
            if query_string is not None:
                c = searcher.correct_query(query, query_string)
                results.query_suggestion = c.string
            try:
                actual_query = unicode(query.simplify(searcher))
                results.debug['actual_query'] = actual_query
            # pylint: disable=bare-except
            except:
                # Simplify has a bug that causes it to fail sometimes.
                pass
            return results

    def _apply_advanced_security(self, searcher, context=None):
        """Wrap the searcher's collector factory so every matching
        document is checked for view permission in *context*.
        No-op unless the advanced_security option is enabled."""
        if not self.advanced_security:
            return

        old_collector = searcher.collector
        security_processor = SecurityPreprocessor(self.env)

        def check_permission(doc):
            return security_processor.check_permission(doc, context)

        def collector(*args, **kwargs):
            c = old_collector(*args, **kwargs)
            if isinstance(c, FilterCollector):
                # Preserve any existing allow/restrict sets of the
                # original filter collector.
                c = AdvancedFilterCollector(
                    c.child, c.allow, c.restrict, check_permission
                )
            else:
                c = AdvancedFilterCollector(
                    c, None, None, check_permission
                )
            return c
        searcher.collector = collector

    def _create_unique_id(self, product, doc_type, doc_id):
        """Return the "[product:]type:id" key stored in UNIQUE_ID."""
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)

    def _to_whoosh_format(self, value):
        """Convert *value* to a form Whoosh accepts: unicode strings and
        tz-naive UTC datetimes; other types pass through unchanged."""
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        """Convert datetime to naive utc datetime

        Whoosh can not read from index datetime values passed from Trac with
        tzinfo=trac.util.datefmt.FixedOffset because of non-empty
        constructor of FixedOffset"""
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _from_whoosh_format(self, value):
        """Re-attach the UTC timezone to datetimes read from the index."""
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_groupedby(self, facets):
        """Build a whoosh.sorting.Facets object for *facets*.

        Currently unused: query() passes the facet list straight to
        Whoosh (see the TODO there). Kept for the multi-value faceting
        work.
        """
        if not facets:
            return None
        groupedby = whoosh.sorting.Facets()
        for facet_name in facets:
            groupedby.add_field(
                facet_name,
                allow_overlap=True,
                # Fixed typo: was "whoosh.sortingwhoosh.Count", which
                # would raise AttributeError if this were ever called.
                maptype=whoosh.sorting.Count)
        return groupedby

    def _prepare_sortedby(self, sort):
        """Translate sort instructions (objects with .field and .order)
        into Whoosh facet objects, or None when no sorting is requested.

        :raises TracError: for "score DESC", which Whoosh cannot do.
        """
        if not sort:
            return None
        sortedby = []
        for sort_instruction in sort:
            field = sort_instruction.field
            order = sort_instruction.order
            if field.lower() == SCORE:
                if self._is_desc(order):
                    #We can implement this later by our own ScoreFacet with
                    # "score DESC" support
                    raise TracError(
                        "Whoosh does not support DESC score ordering.")
                sort_condition = whoosh.sorting.ScoreFacet()
            else:
                sort_condition = whoosh.sorting.FieldFacet(
                    field,
                    reverse=self._is_desc(order))
            sortedby.append(sort_condition)
        return sortedby

    def _prepare_highlight_fields(self, highlight, highlight_fields):
        """Return the fields to highlight: () when highlighting is off,
        all highlightable schema fields when no explicit list is given."""
        if not highlight:
            return ()
        if not highlight_fields:
            highlight_fields = self._all_highlightable_fields()
        return highlight_fields

    def _all_highlightable_fields(self):
        return [name for name, field in self.SCHEMA.items()
                if self._is_highlightable(field)]

    def _is_highlightable(self, field):
        # Only stored, non-datetime fields can produce text fragments.
        return not isinstance(field, whoosh.fields.DATETIME) and field.stored

    def _is_desc(self, order):
        return order.lower() == DESC

    def _process_results(self,
                         page,
                         fields,
                         highlight_fields,
                         search_parameters=None):
        """Convert a Whoosh ResultsPage into a QueryResult.

        :type fields: iterator
        :type page: ResultsPage
        """
        # It's important to grab the hits first before slicing. Otherwise,
        # this can cause pagination failures.
        results = QueryResult()
        results.hits = page.total
        results.total_page_count = page.pagecount
        results.page_number = page.pagenum
        results.offset = page.offset
        results.facets = self._load_facets(page)

        docs = []
        highlighting = []
        for retrieved_record in page:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = self._create_highlights(highlight_fields,
                                                        retrieved_record)
            highlighting.append(result_highlights)
        results.docs = docs
        results.highlighting = highlighting
        results.debug["search_parameters"] = search_parameters
        return results

    def _process_record(self, fields, retrieved_record):
        """Build a plain dict from one Whoosh hit, restricted to *fields*
        when given, with the score added and datetimes localized."""
        result_doc = dict()
        #add score field by default
        if not fields or SCORE in fields:
            score = retrieved_record.score
            result_doc[SCORE] = score

        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)
        return result_doc

    def _load_facets(self, page):
        """This method can be also used by unit-tests"""
        non_paged_results = page.results
        facet_names = non_paged_results.facet_names()
        if not facet_names:
            return None
        facets_result = dict()
        for name in facet_names:
            facets_result[name] = non_paged_results.groups(name)
        return facets_result

    def _make_dir_if_not_exists(self):
        """Ensure index_dir exists and is writable.

        :raises TracError: when the directory is not writable.
        """
        if not os.path.exists(self.index_dir):
            # makedirs (not mkdir) so nested configured paths also work.
            os.makedirs(self.index_dir)

        if not os.access(self.index_dir, os.W_OK):
            # Note: the original message embedded a run of indentation
            # spaces via a backslash line continuation; fixed here.
            raise TracError(
                "The path to Whoosh index '%s' is not writable for the "
                "current user." % self.index_dir)

    def _create_highlights(self, fields, record):
        """Return {field: highlighted HTML or ''} for *fields* of a hit."""
        result_highlights = dict()
        fragmenter = whoosh.highlight.ContextFragmenter(
            self.max_fragment_size,
            self.fragment_surround,
        )
        highlighter = whoosh.highlight.Highlighter(
            formatter=WhooshEmFormatter(),
            fragmenter=fragmenter)

        for field in fields:
            if field in record:
                highlighted = highlighter.highlight_hit(record, field)
            else:
                highlighted = ''
            result_highlights[field] = highlighted
        return result_highlights
class WhooshEmFormatter(whoosh.highlight.HtmlFormatter):
    """Highlight formatter that overrides the parent's substitution
    template so matched terms render as plain ``<em>...</em>`` tags."""
    template = '<em>%(t)s</em>'
class AdvancedFilterCollector(FilterCollector):
    """An advanced filter collector, accepting a callback function that
    will be called for each document to determine whether it should be
    filtered out or not.

    Please note that it can be slow. Very slow.
    """
    def __init__(self, child, allow, restrict, filter_func=None):
        """
        :param child: the wrapped (downstream) collector.
        :param allow: optional set of allowed global doc numbers, or None.
        :param restrict: optional set of excluded global doc numbers,
            or None.
        :param filter_func: optional callable taking a document's stored
            fields dict; a falsy return filters the document out.
        """
        FilterCollector.__init__(self, child, allow, restrict)
        self.filter_func = filter_func

    def collect_matches(self):
        child = self.child
        _allow = self._allow
        _restrict = self._restrict
        # Bug fix: the filtering loop must also run when only filter_func
        # is set. Previously, with allow/restrict both None (the common
        # case when _apply_advanced_security wraps a plain collector),
        # control fell through to the pass-through branch and the
        # permission callback was silently skipped.
        if (_allow is not None or _restrict is not None
                or self.filter_func is not None):
            filtered_count = self.filtered_count
            for sub_docnum in child.matches():
                global_docnum = self.offset + sub_docnum
                if ((_allow is not None and global_docnum not in _allow)
                    or (_restrict is not None
                        and global_docnum in _restrict)):
                    filtered_count += 1
                    continue

                if self.filter_func:
                    doc = self.subsearcher.stored_fields(sub_docnum)
                    if not self.filter_func(doc):
                        filtered_count += 1
                        continue
                child.collect(sub_docnum)
            # pylint: disable=attribute-defined-outside-init
            self.filtered_count = filtered_count
        else:
            # If there was no allow or restrict set and no callback, don't
            # do anything special, just forward the call to the child
            # collector
            child.collect_matches()