/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import java.io.Closeable;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
// TODO:
// - a PostingsFormat that stores super-high-freq terms as
// a bitset should be a win for the prefix terms?
// (LUCENE-5052)
// - we could offer a better integration with
// DocumentDictionary and NRT? so that your suggester
// "automatically" keeps in sync w/ your index
/** Analyzes the input text and then suggests matches based
* on prefix matches to any tokens in the indexed text.
* This also highlights the tokens that match.
*
* <p>This suggester supports payloads. Matches are sorted only
* by the suggest weight; it would be nice to support
* blended score + weight sort in the future. This means
* this suggester best applies when there is a strong
* a-priori ranking of all the suggestions.
*
* <p>This suggester supports contexts, including arbitrary binary
* terms.
*
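* <p>A minimal usage sketch (the path, analyzer choice and inputs below are illustrative
* assumptions, not requirements of this class):
*
* <pre>{@code
* Directory dir = FSDirectory.open(Paths.get("/tmp/suggest-index"));
* AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(dir, new StandardAnalyzer());
* suggester.add(new BytesRef("the quick brown fox"), null, 10, new BytesRef("payload"));
* suggester.refresh();
* List<LookupResult> results = suggester.lookup("qui", 5, true, true);
* suggester.close(); // also closes the Directory
* }</pre>
*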
* @lucene.experimental */
public class AnalyzingInfixSuggester extends Lookup implements Closeable {
/** Field name used to index edge ngrams, so that short prefixes can be searched without a PrefixQuery;
* this is controlled by {@linkplain #minPrefixChars}. */
protected final static String TEXTGRAMS_FIELD_NAME = "textgrams";
/** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text";
/** Field name used for the indexed text, as a
* StringField, for exact lookup. */
protected final static String EXACT_TEXT_FIELD_NAME = "exacttext";
/** Field name used for the indexed context, as a
* StringField and a SortedSetDocValuesField, for filtering. */
protected final static String CONTEXTS_FIELD_NAME = "contexts";
/** Analyzer used at search time */
protected final Analyzer queryAnalyzer;
/** Analyzer used at index time */
protected final Analyzer indexAnalyzer;
private final Directory dir;
final int minPrefixChars;
private final boolean allTermsRequired;
private final boolean highlight;
private final boolean commitOnBuild;
private final boolean closeIndexWriterOnBuild;
/** Used for ongoing NRT additions/updates. */
protected IndexWriter writer;
/** {@link IndexSearcher} used for lookups. */
protected SearcherManager searcherMgr;
/** Used to manage concurrent access to searcherMgr */
protected final Object searcherMgrLock = new Object();
/** Default minimum number of leading characters before
* PrefixQuery is used (4). */
public static final int DEFAULT_MIN_PREFIX_CHARS = 4;
/** Default boolean clause option for multiple terms matching (all terms required). */
public static final boolean DEFAULT_ALL_TERMS_REQUIRED = true;
/** Default highlighting option. */
public static final boolean DEFAULT_HIGHLIGHT = true;
/** Default option to close the IndexWriter once the index has been built. */
protected final static boolean DEFAULT_CLOSE_INDEXWRITER_ON_BUILD = true;
/** How we sort the postings and search results. */
private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));
/** Create a new instance, loading from a previously built
* AnalyzingInfixSuggester directory, if it exists. This directory must be
* private to the infix suggester (i.e., not an external
* Lucene index). Note that {@link #close}
* will also close the provided directory. */
public AnalyzingInfixSuggester(Directory dir, Analyzer analyzer) throws IOException {
this(dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS, false, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT);
}
/** Create a new instance, loading from a previously built
* AnalyzingInfixSuggester directory, if it exists. This directory must be
* private to the infix suggester (i.e., not an external
* Lucene index). Note that {@link #close}
* will also close the provided directory.
*
* @param minPrefixChars Minimum number of leading characters
* before PrefixQuery is used (default 4).
* Prefixes shorter than this are indexed as character
* ngrams (increasing index size but making lookups
* faster).
*
* @param commitOnBuild Call commit after the index has finished building. This persists the
* suggester index to disk so that future instances of this suggester can use the pre-built dictionary.
*/
public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars,
boolean commitOnBuild) throws IOException {
this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT);
}
/** Create a new instance, loading from a previously built
* AnalyzingInfixSuggester directory, if it exists. This directory must be
* private to the infix suggester (i.e., not an external
* Lucene index). Note that {@link #close}
* will also close the provided directory.
*
* @param minPrefixChars Minimum number of leading characters
* before PrefixQuery is used (default 4).
* Prefixes shorter than this are indexed as character
* ngrams (increasing index size but making lookups
* faster).
*
* @param commitOnBuild Call commit after the index has finished building. This persists the
* suggester index to disk so that future instances of this suggester can use the pre-built dictionary.
*
* @param allTermsRequired All terms in the suggest query must be matched.
* @param highlight Highlight suggest query in suggestions.
*
*/
public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars,
boolean commitOnBuild,
boolean allTermsRequired, boolean highlight) throws IOException {
this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, allTermsRequired, highlight,
DEFAULT_CLOSE_INDEXWRITER_ON_BUILD);
}
/** Create a new instance, loading from a previously built
* AnalyzingInfixSuggester directory, if it exists. This directory must be
* private to the infix suggester (i.e., not an external
* Lucene index). Note that {@link #close}
* will also close the provided directory.
*
* @param minPrefixChars Minimum number of leading characters
* before PrefixQuery is used (default 4).
* Prefixes shorter than this are indexed as character
* ngrams (increasing index size but making lookups
* faster).
*
* @param commitOnBuild Call commit after the index has finished building. This persists the
* suggester index to disk so that future instances of this suggester can use the pre-built dictionary.
*
* @param allTermsRequired All terms in the suggest query must be matched.
* @param highlight Highlight suggest query in suggestions.
* @param closeIndexWriterOnBuild If true, the IndexWriter will be closed after the index has finished building.
*/
public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars,
boolean commitOnBuild, boolean allTermsRequired,
boolean highlight, boolean closeIndexWriterOnBuild) throws IOException {
if (minPrefixChars < 0) {
throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
}
this.queryAnalyzer = queryAnalyzer;
this.indexAnalyzer = indexAnalyzer;
this.dir = dir;
this.minPrefixChars = minPrefixChars;
this.commitOnBuild = commitOnBuild;
this.allTermsRequired = allTermsRequired;
this.highlight = highlight;
this.closeIndexWriterOnBuild = closeIndexWriterOnBuild;
if (DirectoryReader.indexExists(dir)) {
// Already built; open it:
searcherMgr = new SearcherManager(dir, null);
}
}
/** Override this to customize index settings, e.g. which
* codec to use. */
protected IndexWriterConfig getIndexWriterConfig(Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
iwc.setOpenMode(openMode);
// This way all merged segments will be sorted at
// merge time, allowing for per-segment early termination
// when those segments are searched:
iwc.setIndexSort(SORT);
return iwc;
}
/** Subclass can override to choose a specific {@link
* Directory} implementation. */
protected Directory getDirectory(Path path) throws IOException {
return FSDirectory.open(path);
}
@Override
public void build(InputIterator iter) throws IOException {
synchronized (searcherMgrLock) {
if (searcherMgr != null) {
searcherMgr.close();
searcherMgr = null;
}
if (writer != null) {
writer.close();
writer = null;
}
boolean success = false;
try {
// First pass: build a temporary normal Lucene index,
// just indexing the suggestions as they iterate:
writer = new IndexWriter(dir,
getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
//long t0 = System.nanoTime();
// TODO: use threads?
BytesRef text;
while ((text = iter.next()) != null) {
BytesRef payload;
if (iter.hasPayloads()) {
payload = iter.payload();
} else {
payload = null;
}
add(text, iter.contexts(), iter.weight(), payload);
}
//System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
if (commitOnBuild || closeIndexWriterOnBuild) {
commit();
}
searcherMgr = new SearcherManager(writer, null);
success = true;
} finally {
if (success) {
if (closeIndexWriterOnBuild) {
writer.close();
writer = null;
}
} else { // failure
if (writer != null) {
writer.rollback();
writer = null;
}
}
}
}
}
/** Commits all pending changes made to this suggester to disk.
*
* @see IndexWriter#commit */
public void commit() throws IOException {
if (writer == null) {
if (searcherMgr == null || closeIndexWriterOnBuild == false) {
throw new IllegalStateException("Cannot commit on a closed writer. Add documents first");
}
// else no-op: writer was committed and closed after the index was built, so commit is unnecessary
} else {
writer.commit();
}
}
private Analyzer getGramAnalyzer() {
return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return indexAnalyzer;
}
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
assert !(fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars == 0)
: "no need for \"textgrams\" when minPrefixChars=" + minPrefixChars;
if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
// TODO: should use an EdgeNGramTokenFilterFactory here
TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
return new TokenStreamComponents(components.getSource(), filter);
} else {
return components;
}
}
};
}
private void ensureOpen() throws IOException {
synchronized (searcherMgrLock) {
if (writer == null) {
if (DirectoryReader.indexExists(dir)) {
// Already built; open it:
writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.APPEND));
} else {
writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
}
SearcherManager oldSearcherMgr = searcherMgr;
searcherMgr = new SearcherManager(writer, null);
if (oldSearcherMgr != null) {
oldSearcherMgr.close();
}
}
}
}
/** Adds a new suggestion. Be sure to use {@link #update}
* instead if you want to replace a previous suggestion.
* After adding or updating a batch of new suggestions,
* you must call {@link #refresh} at the end in order to
* see the suggestions in {@link #lookup}. */
public void add(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
ensureOpen();
writer.addDocument(buildDocument(text, contexts, weight, payload));
}
/** Updates a previous suggestion, matching the exact same
* text as before. Use this to change the weight or
* payload of an already added suggestion. If you know
* this text is not already present you can use {@link
* #add} instead. After adding or updating a batch of
* new suggestions, you must call {@link #refresh} at the
* end in order to see the suggestions in {@link #lookup}. */
public void update(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
ensureOpen();
writer.updateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()),
buildDocument(text, contexts, weight, payload));
}
private Document buildDocument(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
String textString = text.utf8ToString();
Document doc = new Document();
FieldType ft = getTextFieldType();
doc.add(new Field(TEXT_FIELD_NAME, textString, ft));
if (minPrefixChars>0) {
doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft));
}
doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO));
doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
doc.add(new NumericDocValuesField("weight", weight));
if (payload != null) {
doc.add(new BinaryDocValuesField("payloads", payload));
}
if (contexts != null) {
for(BytesRef context : contexts) {
doc.add(new StringField(CONTEXTS_FIELD_NAME, context, Field.Store.NO));
doc.add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context));
}
}
return doc;
}
/** Reopens the underlying searcher; it's best to "batch
* up" many additions/updates, and then call refresh
* once at the end. */
public void refresh() throws IOException {
if (searcherMgr == null) {
throw new IllegalStateException("suggester was not built");
}
if (writer != null) {
searcherMgr.maybeRefreshBlocking();
}
// else no-op: writer was committed and closed after the index was built
// and before searcherMgr was constructed, so refresh is unnecessary
}
/**
* Subclasses can override this method to change the field type of the text field,
* e.g. to change the index options.
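* <p>For example, a subclass that also wants term positions indexed (an illustrative
* sketch, not something this class requires) could return:
* <pre>{@code
* FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
* ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
* ft.setOmitNorms(true);
* return ft;
* }</pre>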
*/
protected FieldType getTextFieldType() {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setIndexOptions(IndexOptions.DOCS);
ft.setOmitNorms(true);
return ft;
}
@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) throws IOException {
return lookup(key, contexts, num, allTermsRequired, highlight);
}
/** Lookup, without any context. */
public List<LookupResult> lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
return lookup(key, (BooleanQuery)null, num, allTermsRequired, doHighlight);
}
/** Lookup, with context but without booleans. Context booleans default to SHOULD,
* so each suggestion must have at least one of the contexts. */
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
return lookup(key, toQuery(contexts), num, allTermsRequired, doHighlight);
}
/** This is called if the last token isn't finished
* (e.g. the user did not type a space after it). Return an
* appropriate Query clause to add to the BooleanQuery. */
protected Query getLastTokenQuery(String token) throws IOException {
if (token.length() < minPrefixChars) {
// The leading ngram was directly indexed:
return new TermQuery(new Term(TEXTGRAMS_FIELD_NAME, token));
}
return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
}
/** Retrieve suggestions, specifying whether all terms
* must match ({@code allTermsRequired}) and whether the hits
* should be highlighted ({@code doHighlight}). */
public List<LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight);
}
private BooleanQuery toQuery(Map<BytesRef,BooleanClause.Occur> contextInfo) {
if (contextInfo == null || contextInfo.isEmpty()) {
return null;
}
BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
for (Map.Entry<BytesRef,BooleanClause.Occur> entry : contextInfo.entrySet()) {
addContextToQuery(contextFilter, entry.getKey(), entry.getValue());
}
return contextFilter.build();
}
private BooleanQuery toQuery(Set<BytesRef> contextInfo) {
if (contextInfo == null || contextInfo.isEmpty()) {
return null;
}
BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
for (BytesRef context : contextInfo) {
addContextToQuery(contextFilter, context, BooleanClause.Occur.SHOULD);
}
return contextFilter.build();
}
/**
* This method is handy as we do not need access to internal fields such as CONTEXTS_FIELD_NAME
* in order to build queries. However, this may not be its best location.
*
* @param query an instance of {@link BooleanQuery.Builder} to add the context clause to
* @param context the context
* @param clause one of {@link Occur}
*/
public void addContextToQuery(BooleanQuery.Builder query, BytesRef context, BooleanClause.Occur clause) {
// NOTE: we "should" wrap this in
// ConstantScoreQuery, or maybe send this as a
// Filter instead to search.
// TODO: if we had a BinaryTermField we could fix
// this "must be valid utf8" limitation:
query.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context)), clause);
}
/**
* This is an advanced method providing the capability to send down to the suggester any
* arbitrary Lucene query to be used to filter the results of the suggester.
*
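* <p>A sketch of building a context filter with {@link #addContextToQuery} and passing it
* here (the suggester variable and the context values are illustrative assumptions):
* <pre>{@code
* BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
* suggester.addContextToQuery(contextFilter, new BytesRef("books"), BooleanClause.Occur.SHOULD);
* suggester.addContextToQuery(contextFilter, new BytesRef("movies"), BooleanClause.Occur.SHOULD);
* List<LookupResult> results = suggester.lookup("quic", contextFilter.build(), 5, true, true);
* }</pre>
*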
* @param key the keyword being looked for
* @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester. {@link #addContextToQuery} could be used to build this contextQuery.
* @param num number of items to return
* @param allTermsRequired whether all searched terms must match
* @param doHighlight if true, the matching term will be highlighted in the search result
* @return the result of the suggester
* @throws IOException if there is an IO exception while reading data from the index
*/
public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
if (searcherMgr == null) {
throw new IllegalStateException("suggester was not built");
}
final BooleanClause.Occur occur;
if (allTermsRequired) {
occur = BooleanClause.Occur.MUST;
} else {
occur = BooleanClause.Occur.SHOULD;
}
BooleanQuery.Builder query;
Set<String> matchedTokens;
String prefixToken = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
//long t0 = System.currentTimeMillis();
ts.reset();
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
String lastToken = null;
query = new BooleanQuery.Builder();
int maxEndOffset = -1;
matchedTokens = new HashSet<>();
while (ts.incrementToken()) {
if (lastToken != null) {
matchedTokens.add(lastToken);
query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
}
lastToken = termAtt.toString();
if (lastToken != null) {
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
}
}
ts.end();
if (lastToken != null) {
Query lastQuery;
if (maxEndOffset == offsetAtt.endOffset()) {
// Use PrefixQuery (or the ngram equivalent) when
// there were no trailing discarded chars in the
// string (e.g. whitespace), so that if the query does
// not end with a space we show prefix matches for
// that token:
lastQuery = getLastTokenQuery(lastToken);
prefixToken = lastToken;
} else {
// Use TermQuery for an exact match if there were
// trailing discarded chars (e.g. whitespace), so
// that if the query ends with a space we only show
// exact matches for that term:
matchedTokens.add(lastToken);
lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
}
if (lastQuery != null) {
query.add(lastQuery, occur);
}
}
if (contextQuery != null) {
boolean allMustNot = true;
for (BooleanClause clause : contextQuery.clauses()) {
if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
allMustNot = false;
break;
}
}
if (allMustNot) {
// All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query)
for (BooleanClause clause : contextQuery.clauses()) {
query.add(clause);
}
} else if (allTermsRequired == false) {
// We must carefully upgrade the query clauses to MUST:
BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
newQuery.add(query.build(), BooleanClause.Occur.MUST);
newQuery.add(contextQuery, BooleanClause.Occur.MUST);
query = newQuery;
} else {
// Add contextQuery as sub-query
query.add(contextQuery, BooleanClause.Occur.MUST);
}
}
}
// TODO: we could allow blended sort here, combining
// weight w/ score. Now we ignore score and sort only
// by weight:
Query finalQuery = finishQuery(query, allTermsRequired);
//System.out.println("finalQuery=" + finalQuery);
// Sort by weight, descending:
TopFieldCollector c = TopFieldCollector.create(SORT, num, 1);
List<LookupResult> results = null;
SearcherManager mgr;
IndexSearcher searcher;
synchronized (searcherMgrLock) {
mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
searcher = mgr.acquire();
}
try {
//System.out.println("got searcher=" + searcher);
searcher.search(finalQuery, c);
TopFieldDocs hits = c.topDocs();
// Slower way if postings are not pre-sorted by weight:
// hits = searcher.search(query, null, num, SORT);
results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
} finally {
mgr.release(searcher);
}
//System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
//System.out.println(results);
return results;
}
/**
* Create the results based on the search hits.
* Can be overridden by a subclass to add particular behavior (e.g. weight transformation).
* Note that there is no prefix token (the {@code prefixToken} argument will
* be null) whenever the final token in the incoming request was in fact finished
* (had trailing characters, such as white-space).
*
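* <p>A sketch of an override that rescales each weight (purely illustrative, not part of
* this class's contract): delegate to {@code super.createResults} and rebuild each result
* with a transformed value:
* <pre>{@code
* List<LookupResult> results = super.createResults(searcher, hits, num, charSequence,
*                                                  doHighlight, matchedTokens, prefixToken);
* List<LookupResult> rescaled = new ArrayList<>(results.size());
* for (LookupResult r : results) {
*   rescaled.add(new LookupResult(r.key, r.highlightKey, r.value * 2, r.payload, r.contexts));
* }
* return rescaled;
* }</pre>
*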
* @throws IOException If there are problems reading fields from the underlying Lucene index.
*/
protected List<LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num,
CharSequence charSequence,
boolean doHighlight, Set<String> matchedTokens, String prefixToken)
throws IOException {
List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
List<LookupResult> results = new ArrayList<>();
for (int i=0;i<hits.scoreDocs.length;i++) {
FieldDoc fd = (FieldDoc) hits.scoreDocs[i];
BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
textDV.advance(fd.doc);
BytesRef term = textDV.binaryValue();
String text = term.utf8ToString();
long score = (Long) fd.fields[0];
// This will just be null if app didn't pass payloads to build():
// TODO: maybe just stored fields? they compress...
BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");
BytesRef payload;
if (payloadsDV != null) {
if (payloadsDV.advance(fd.doc) == fd.doc) {
payload = BytesRef.deepCopyOf(payloadsDV.binaryValue());
} else {
payload = new BytesRef(BytesRef.EMPTY_BYTES);
}
} else {
payload = null;
}
// Must look up sorted-set by segment:
int segment = ReaderUtil.subIndex(fd.doc, leaves);
SortedSetDocValues contextsDV = leaves.get(segment).reader().getSortedSetDocValues(CONTEXTS_FIELD_NAME);
Set<BytesRef> contexts;
if (contextsDV != null) {
contexts = new HashSet<BytesRef>();
int targetDocID = fd.doc - leaves.get(segment).docBase;
if (contextsDV.advance(targetDocID) == targetDocID) {
long ord;
while ((ord = contextsDV.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
BytesRef context = BytesRef.deepCopyOf(contextsDV.lookupOrd(ord));
contexts.add(context);
}
}
} else {
contexts = null;
}
LookupResult result;
if (doHighlight) {
result = new LookupResult(text, highlight(text, matchedTokens, prefixToken), score, payload, contexts);
} else {
result = new LookupResult(text, score, payload, contexts);
}
results.add(result);
}
return results;
}
/** Subclass can override this to tweak the Query before
* searching. */
protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) {
return in.build();
}
/** Override this method to customize the Object
* representing a single highlighted suggestion; the
* result is set on each {@link
* org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
StringBuilder sb = new StringBuilder();
int upto = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
addNonMatch(sb, text.substring(upto, startOffset));
upto = startOffset;
} else if (upto > startOffset) {
continue;
}
if (matchedTokens.contains(token)) {
// Token matches.
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
upto = endOffset;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
upto = endOffset;
}
}
ts.end();
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
addNonMatch(sb, text.substring(upto));
}
return sb.toString();
}
}
/** Called while highlighting a single result, to append a
* non-matching chunk of text from the suggestion to the
* provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param text The text chunk to add
*/
protected void addNonMatch(StringBuilder sb, String text) {
sb.append(text);
}
/** Called while highlighting a single result, to append
* the whole matched token to the provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param surface The surface form (original) text
* @param analyzed The analyzed token corresponding to the surface form text
*/
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
sb.append("<b>");
sb.append(surface);
sb.append("</b>");
}
/** Called while highlighting a single result, to append a
* matched prefix token to the provided fragments list.
* @param sb The {@code StringBuilder} to append to
* @param surface The fragment of the surface form
* (indexed during {@link #build}), corresponding to
* this match
* @param analyzed The analyzed token that matched
* @param prefixToken The prefix of the token that matched
*/
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
// TODO: apps can try to invert their analysis logic
// here, e.g. downcase the two before checking prefix:
if (prefixToken.length() >= surface.length()) {
addWholeMatch(sb, surface, analyzed);
return;
}
sb.append("<b>");
sb.append(surface.substring(0, prefixToken.length()));
sb.append("</b>");
sb.append(surface.substring(prefixToken.length()));
}
@Override
public boolean store(DataOutput out) throws IOException {
return false;
}
@Override
public boolean load(DataInput in) throws IOException {
return false;
}
@Override
public void close() throws IOException {
if (searcherMgr != null) {
searcherMgr.close();
searcherMgr = null;
}
if (writer != null) {
writer.close();
writer = null;
}
if (dir != null) {
dir.close();
}
}
@Override
public long ramBytesUsed() {
long mem = RamUsageEstimator.shallowSizeOf(this);
try {
if (searcherMgr != null) {
SearcherManager mgr;
IndexSearcher searcher;
synchronized (searcherMgrLock) {
mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
searcher = mgr.acquire();
}
try {
for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
LeafReader reader = FilterLeafReader.unwrap(context.reader());
if (reader instanceof SegmentReader) {
mem += ((SegmentReader) reader).ramBytesUsed();
}
}
} finally {
mgr.release(searcher);
}
}
return mem;
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
try {
if (searcherMgr != null) {
SearcherManager mgr;
IndexSearcher searcher;
synchronized (searcherMgrLock) {
mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
searcher = mgr.acquire();
}
try {
for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
LeafReader reader = FilterLeafReader.unwrap(context.reader());
if (reader instanceof SegmentReader) {
resources.add(Accountables.namedAccountable("segment", (SegmentReader)reader));
}
}
} finally {
mgr.release(searcher);
}
}
return Collections.unmodifiableList(resources);
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
@Override
public long getCount() throws IOException {
if (searcherMgr == null) {
return 0;
}
SearcherManager mgr;
IndexSearcher searcher;
synchronized (searcherMgrLock) {
mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
searcher = mgr.acquire();
}
try {
return searcher.getIndexReader().numDocs();
} finally {
mgr.release(searcher);
}
}
}