| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.analyzing; |
| |
| import java.io.Closeable; |
| import java.io.IOException; |
| import java.io.StringReader; |
| import java.nio.file.Path; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.AnalyzerWrapper; |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.document.BinaryDocValuesField; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.NumericDocValuesField; |
| import org.apache.lucene.document.SortedSetDocValuesField; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.BinaryDocValues; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.FilterLeafReader; |
| import org.apache.lucene.index.IndexOptions; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.LeafReader; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.MultiDocValues; |
| import org.apache.lucene.index.ReaderUtil; |
| import org.apache.lucene.index.SegmentReader; |
| import org.apache.lucene.index.SortedSetDocValues; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.FieldDoc; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.PrefixQuery; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.SearcherManager; |
| import org.apache.lucene.search.Sort; |
| import org.apache.lucene.search.SortField; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.TopFieldCollector; |
| import org.apache.lucene.search.TopFieldDocs; |
| import org.apache.lucene.search.suggest.InputIterator; |
| import org.apache.lucene.search.suggest.Lookup; |
| import org.apache.lucene.store.DataInput; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.Accountables; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.RamUsageEstimator; |
| |
| // TODO: |
| // - a PostingsFormat that stores super-high-freq terms as |
| // a bitset should be a win for the prefix terms? |
| // (LUCENE-5052) |
| // - we could offer a better integration with |
| // DocumentDictionary and NRT? so that your suggester |
| // "automatically" keeps in sync w/ your index |
| |
| /** Analyzes the input text and then suggests matches based |
| * on prefix matches to any tokens in the indexed text. |
| * This also highlights the tokens that match. |
| * |
| * <p>This suggester supports payloads. Matches are sorted only |
| * by the suggest weight; it would be nice to support |
| * blended score + weight sort in the future. This means |
| * this suggester best applies when there is a strong |
| * a-priori ranking of all the suggestions. |
| * |
| * <p>This suggester supports contexts, including arbitrary binary |
| * terms. |
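 *
 * <p>A minimal usage sketch (the path, analyzer and input source are
 * illustrative, not prescriptive):
 *
 * <pre class="prettyprint">
 * Directory dir = FSDirectory.open(Paths.get("/path/to/suggestIndex"));
 * AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(dir, new StandardAnalyzer());
 * suggester.build(inputIterator);   // e.g. a DocumentDictionary's entry iterator
 * List&lt;LookupResult&gt; results = suggester.lookup("ne", 5, true, true);
 * suggester.close();                // note: this also closes the Directory
 * </pre>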
| * |
| * @lucene.experimental */ |
| |
| public class AnalyzingInfixSuggester extends Lookup implements Closeable { |
| |
  /** Field name used for the edge-ngrams that allow searching short
   * prefixes without a PrefixQuery; only indexed when
   * {@linkplain #minPrefixChars} is greater than zero. */
| protected final static String TEXTGRAMS_FIELD_NAME = "textgrams"; |
| |
| /** Field name used for the indexed text. */ |
| protected final static String TEXT_FIELD_NAME = "text"; |
| |
| /** Field name used for the indexed text, as a |
| * StringField, for exact lookup. */ |
| protected final static String EXACT_TEXT_FIELD_NAME = "exacttext"; |
| |
| /** Field name used for the indexed context, as a |
| * StringField and a SortedSetDVField, for filtering. */ |
| protected final static String CONTEXTS_FIELD_NAME = "contexts"; |
| |
| /** Analyzer used at search time */ |
| protected final Analyzer queryAnalyzer; |
| /** Analyzer used at index time */ |
| protected final Analyzer indexAnalyzer; |
| private final Directory dir; |
| final int minPrefixChars; |
| |
| private final boolean allTermsRequired; |
| private final boolean highlight; |
| |
| private final boolean commitOnBuild; |
| private final boolean closeIndexWriterOnBuild; |
| |
| /** Used for ongoing NRT additions/updates. */ |
| protected IndexWriter writer; |
| |
| /** {@link IndexSearcher} used for lookups. */ |
| protected SearcherManager searcherMgr; |
| |
| /** Used to manage concurrent access to searcherMgr */ |
| protected final Object searcherMgrLock = new Object(); |
| |
| /** Default minimum number of leading characters before |
| * PrefixQuery is used (4). */ |
| public static final int DEFAULT_MIN_PREFIX_CHARS = 4; |
| |
| /** Default boolean clause option for multiple terms matching (all terms required). */ |
| public static final boolean DEFAULT_ALL_TERMS_REQUIRED = true; |
| |
  /** Default highlighting option. */
| public static final boolean DEFAULT_HIGHLIGHT = true; |
| |
| /** Default option to close the IndexWriter once the index has been built. */ |
| protected final static boolean DEFAULT_CLOSE_INDEXWRITER_ON_BUILD = true; |
| |
| /** How we sort the postings and search results. */ |
| private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true)); |
| |
| /** Create a new instance, loading from a previously built |
| * AnalyzingInfixSuggester directory, if it exists. This directory must be |
| * private to the infix suggester (i.e., not an external |
| * Lucene index). Note that {@link #close} |
| * will also close the provided directory. */ |
| public AnalyzingInfixSuggester(Directory dir, Analyzer analyzer) throws IOException { |
| this(dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS, false, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT); |
| } |
| |
| /** Create a new instance, loading from a previously built |
| * AnalyzingInfixSuggester directory, if it exists. This directory must be |
| * private to the infix suggester (i.e., not an external |
| * Lucene index). Note that {@link #close} |
| * will also close the provided directory. |
| * |
| * @param minPrefixChars Minimum number of leading characters |
| * before PrefixQuery is used (default 4). |
| * Prefixes shorter than this are indexed as character |
| * ngrams (increasing index size but making lookups |
| * faster). |
| * |
   * @param commitOnBuild Call commit after the index has finished building. This persists the
   * suggester index to disk, so future instances of this suggester can use the pre-built dictionary.
| */ |
| public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, |
| boolean commitOnBuild) throws IOException { |
| this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT); |
| } |
| |
| /** Create a new instance, loading from a previously built |
| * AnalyzingInfixSuggester directory, if it exists. This directory must be |
| * private to the infix suggester (i.e., not an external |
| * Lucene index). Note that {@link #close} |
| * will also close the provided directory. |
| * |
| * @param minPrefixChars Minimum number of leading characters |
| * before PrefixQuery is used (default 4). |
| * Prefixes shorter than this are indexed as character |
| * ngrams (increasing index size but making lookups |
| * faster). |
| * |
   * @param commitOnBuild Call commit after the index has finished building. This persists the
   * suggester index to disk, so future instances of this suggester can use the pre-built dictionary.
| * |
| * @param allTermsRequired All terms in the suggest query must be matched. |
| * @param highlight Highlight suggest query in suggestions. |
| * |
| */ |
| public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, |
| boolean commitOnBuild, |
| boolean allTermsRequired, boolean highlight) throws IOException { |
| this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, allTermsRequired, highlight, |
| DEFAULT_CLOSE_INDEXWRITER_ON_BUILD); |
| } |
| |
| /** Create a new instance, loading from a previously built |
| * AnalyzingInfixSuggester directory, if it exists. This directory must be |
| * private to the infix suggester (i.e., not an external |
| * Lucene index). Note that {@link #close} |
| * will also close the provided directory. |
| * |
| * @param minPrefixChars Minimum number of leading characters |
| * before PrefixQuery is used (default 4). |
| * Prefixes shorter than this are indexed as character |
| * ngrams (increasing index size but making lookups |
| * faster). |
| * |
   * @param commitOnBuild Call commit after the index has finished building. This persists the
   * suggester index to disk, so future instances of this suggester can use the pre-built dictionary.
| * |
| * @param allTermsRequired All terms in the suggest query must be matched. |
| * @param highlight Highlight suggest query in suggestions. |
| * @param closeIndexWriterOnBuild If true, the IndexWriter will be closed after the index has finished building. |
| */ |
| public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars, |
| boolean commitOnBuild, boolean allTermsRequired, |
| boolean highlight, boolean closeIndexWriterOnBuild) throws IOException { |
| |
| if (minPrefixChars < 0) { |
| throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars); |
| } |
| |
| this.queryAnalyzer = queryAnalyzer; |
| this.indexAnalyzer = indexAnalyzer; |
| this.dir = dir; |
| this.minPrefixChars = minPrefixChars; |
| this.commitOnBuild = commitOnBuild; |
| this.allTermsRequired = allTermsRequired; |
| this.highlight = highlight; |
| this.closeIndexWriterOnBuild = closeIndexWriterOnBuild; |
| |
| if (DirectoryReader.indexExists(dir)) { |
| // Already built; open it: |
| searcherMgr = new SearcherManager(dir, null); |
| } |
| } |
| |
| /** Override this to customize index settings, e.g. which |
   * codec to use.
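   *
   * <p>A minimal sketch of such an override (the tuning shown is
   * illustrative, not a recommendation):
   * <pre class="prettyprint">
   * &#64;Override
   * protected IndexWriterConfig getIndexWriterConfig(Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
   *   IndexWriterConfig iwc = super.getIndexWriterConfig(indexAnalyzer, openMode);
   *   iwc.setRAMBufferSizeMB(64.0);  // hypothetical buffer size
   *   // iwc.setCodec(...) would go here to select a custom codec
   *   return iwc;
   * }
   * </pre> */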
| protected IndexWriterConfig getIndexWriterConfig(Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) { |
| IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer); |
| iwc.setOpenMode(openMode); |
| |
| // This way all merged segments will be sorted at |
| // merge time, allow for per-segment early termination |
| // when those segments are searched: |
| iwc.setIndexSort(SORT); |
| |
| return iwc; |
| } |
| |
| /** Subclass can override to choose a specific {@link |
| * Directory} implementation. */ |
| protected Directory getDirectory(Path path) throws IOException { |
| return FSDirectory.open(path); |
| } |
| |
| @Override |
| public void build(InputIterator iter) throws IOException { |
| |
| synchronized (searcherMgrLock) { |
| if (searcherMgr != null) { |
| searcherMgr.close(); |
| searcherMgr = null; |
| } |
| |
| if (writer != null) { |
| writer.close(); |
| writer = null; |
| } |
| |
| boolean success = false; |
| try { |
| // First pass: build a temporary normal Lucene index, |
| // just indexing the suggestions as they iterate: |
| writer = new IndexWriter(dir, |
| getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE)); |
| //long t0 = System.nanoTime(); |
| |
| // TODO: use threads? |
| BytesRef text; |
| while ((text = iter.next()) != null) { |
| BytesRef payload; |
| if (iter.hasPayloads()) { |
| payload = iter.payload(); |
| } else { |
| payload = null; |
| } |
| |
| add(text, iter.contexts(), iter.weight(), payload); |
| } |
| |
| //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec"); |
| if (commitOnBuild || closeIndexWriterOnBuild) { |
| commit(); |
| } |
| searcherMgr = new SearcherManager(writer, null); |
| success = true; |
| } finally { |
| if (success) { |
| if (closeIndexWriterOnBuild) { |
| writer.close(); |
| writer = null; |
| } |
| } else { // failure |
| if (writer != null) { |
| writer.rollback(); |
| writer = null; |
| } |
| } |
| } |
| } |
| } |
| |
| /** Commits all pending changes made to this suggester to disk. |
| * |
| * @see IndexWriter#commit */ |
| public void commit() throws IOException { |
| if (writer == null) { |
| if (searcherMgr == null || closeIndexWriterOnBuild == false) { |
| throw new IllegalStateException("Cannot commit on an closed writer. Add documents first"); |
| } |
| // else no-op: writer was committed and closed after the index was built, so commit is unnecessary |
| } else { |
| writer.commit(); |
| } |
| } |
| |
| private Analyzer getGramAnalyzer() { |
| return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) { |
| @Override |
| protected Analyzer getWrappedAnalyzer(String fieldName) { |
| return indexAnalyzer; |
| } |
| |
| @Override |
| protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { |
        assert !(fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars == 0)
            : "no need for \"textgrams\" when minPrefixChars=" + minPrefixChars;
| if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) { |
| // TODO: should use an EdgeNGramTokenFilterFactory here |
| TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false); |
| return new TokenStreamComponents(components.getSource(), filter); |
| } else { |
| return components; |
| } |
| } |
| }; |
| } |
| |
| private void ensureOpen() throws IOException { |
| synchronized (searcherMgrLock) { |
| if (writer == null) { |
| if (DirectoryReader.indexExists(dir)) { |
| // Already built; open it: |
| writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.APPEND)); |
| } else { |
| writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE)); |
| } |
| |
| SearcherManager oldSearcherMgr = searcherMgr; |
| searcherMgr = new SearcherManager(writer, null); |
| if (oldSearcherMgr != null) { |
| oldSearcherMgr.close(); |
| } |
| } |
| } |
| } |
| |
| /** Adds a new suggestion. Be sure to use {@link #update} |
| * instead if you want to replace a previous suggestion. |
| * After adding or updating a batch of new suggestions, |
   * you must call {@link #refresh} at the end in order to
| * see the suggestions in {@link #lookup} */ |
| public void add(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException { |
| ensureOpen(); |
| writer.addDocument(buildDocument(text, contexts, weight, payload)); |
| } |
| |
| /** Updates a previous suggestion, matching the exact same |
| * text as before. Use this to change the weight or |
| * payload of an already added suggestion. If you know |
| * this text is not already present you can use {@link |
| * #add} instead. After adding or updating a batch of |
   * new suggestions, you must call {@link #refresh} at the
   * end in order to see the suggestions in {@link #lookup} */
| public void update(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException { |
| ensureOpen(); |
| writer.updateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()), |
| buildDocument(text, contexts, weight, payload)); |
| } |
| |
| private Document buildDocument(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException { |
| String textString = text.utf8ToString(); |
| Document doc = new Document(); |
| FieldType ft = getTextFieldType(); |
| doc.add(new Field(TEXT_FIELD_NAME, textString, ft)); |
| if (minPrefixChars>0) { |
| doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft)); |
| } |
| doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO)); |
| doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text)); |
| doc.add(new NumericDocValuesField("weight", weight)); |
| if (payload != null) { |
| doc.add(new BinaryDocValuesField("payloads", payload)); |
| } |
| if (contexts != null) { |
| for(BytesRef context : contexts) { |
| doc.add(new StringField(CONTEXTS_FIELD_NAME, context, Field.Store.NO)); |
| doc.add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context)); |
| } |
| } |
| return doc; |
| } |
| |
  /** Reopens the underlying searcher; it's best to "batch
   * up" many additions/updates, and then call refresh
   * once at the end.
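   *
   * <p>A minimal sketch of the intended batching pattern ({@code entries}
   * and {@code MyEntry} are illustrative, not part of this API):
   * <pre class="prettyprint">
   * for (MyEntry e : entries) {   // hypothetical batch of suggestions
   *   suggester.update(e.text, e.contexts, e.weight, e.payload);
   * }
   * suggester.refresh();          // make the whole batch visible to lookup
   * </pre> */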
| public void refresh() throws IOException { |
| if (searcherMgr == null) { |
| throw new IllegalStateException("suggester was not built"); |
| } |
| if (writer != null) { |
| searcherMgr.maybeRefreshBlocking(); |
| } |
| // else no-op: writer was committed and closed after the index was built |
| // and before searchMgr was constructed, so refresh is unnecessary |
| } |
| |
| /** |
   * Subclass can override this method to change the field type of the text field,
   * e.g. to change the index options.
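   *
   * <p>A minimal override sketch (indexing positions here is illustrative):
   * <pre class="prettyprint">
   * &#64;Override
   * protected FieldType getTextFieldType() {
   *   FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
   *   ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
   *   ft.setOmitNorms(true);
   *   return ft;
   * }
   * </pre>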
| */ |
| protected FieldType getTextFieldType(){ |
| FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); |
| ft.setIndexOptions(IndexOptions.DOCS); |
| ft.setOmitNorms(true); |
| |
| return ft; |
| } |
| |
| @Override |
| public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) throws IOException { |
| return lookup(key, contexts, num, allTermsRequired, highlight); |
| } |
| |
| /** Lookup, without any context. */ |
| public List<LookupResult> lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| return lookup(key, (BooleanQuery)null, num, allTermsRequired, doHighlight); |
| } |
| |
  /** Lookup, with contexts but without explicit boolean clauses. Each
   * context defaults to SHOULD, so each suggestion must match at least
   * one of the provided contexts. */
| public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| return lookup(key, toQuery(contexts), num, allTermsRequired, doHighlight); |
| } |
| |
  /** This is called when the last token is not finished
   * (e.g. the user did not type a space after it). Return an
   * appropriate Query clause to add to the BooleanQuery. */
| protected Query getLastTokenQuery(String token) throws IOException { |
| if (token.length() < minPrefixChars) { |
| // The leading ngram was directly indexed: |
| return new TermQuery(new Term(TEXTGRAMS_FIELD_NAME, token)); |
| } |
| |
| return new PrefixQuery(new Term(TEXT_FIELD_NAME, token)); |
| } |
| |
| /** Retrieve suggestions, specifying whether all terms |
| * must match ({@code allTermsRequired}) and whether the hits |
| * should be highlighted ({@code doHighlight}). */ |
| public List<LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight); |
| } |
| |
| private BooleanQuery toQuery(Map<BytesRef,BooleanClause.Occur> contextInfo) { |
| if (contextInfo == null || contextInfo.isEmpty()) { |
| return null; |
| } |
| |
| BooleanQuery.Builder contextFilter = new BooleanQuery.Builder(); |
| for (Map.Entry<BytesRef,BooleanClause.Occur> entry : contextInfo.entrySet()) { |
| addContextToQuery(contextFilter, entry.getKey(), entry.getValue()); |
| } |
| |
| return contextFilter.build(); |
| } |
| |
| private BooleanQuery toQuery(Set<BytesRef> contextInfo) { |
| if (contextInfo == null || contextInfo.isEmpty()) { |
| return null; |
| } |
| |
| BooleanQuery.Builder contextFilter = new BooleanQuery.Builder(); |
| for (BytesRef context : contextInfo) { |
| addContextToQuery(contextFilter, context, BooleanClause.Occur.SHOULD); |
| } |
| return contextFilter.build(); |
| } |
| |
| |
| /** |
   * This method is handy because callers do not need access to internal fields such as
   * CONTEXTS_FIELD_NAME in order to build context queries; however, this may not be its best location.
| * |
   * @param query a {@link BooleanQuery.Builder} to add the context clause to
| * @param context the context |
| * @param clause one of {@link Occur} |
| */ |
| public void addContextToQuery(BooleanQuery.Builder query, BytesRef context, BooleanClause.Occur clause) { |
| // NOTE: we "should" wrap this in |
| // ConstantScoreQuery, or maybe send this as a |
| // Filter instead to search. |
| |
| // TODO: if we had a BinaryTermField we could fix |
| // this "must be valid ut8f" limitation: |
| query.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context)), clause); |
| } |
| |
| /** |
| * This is an advanced method providing the capability to send down to the suggester any |
   * arbitrary Lucene query to be used to filter the results of the suggester.
| * |
| * @param key the keyword being looked for |
| * @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester. {@link #addContextToQuery} could be used to build this contextQuery. |
| * @param num number of items to return |
| * @param allTermsRequired all searched terms must match or not |
| * @param doHighlight if true, the matching term will be highlighted in the search result |
| * @return the result of the suggester |
   * @throws IOException if there is an IO exception while reading data from the index
| */ |
| public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| |
| if (searcherMgr == null) { |
| throw new IllegalStateException("suggester was not built"); |
| } |
| |
| final BooleanClause.Occur occur; |
| if (allTermsRequired) { |
| occur = BooleanClause.Occur.MUST; |
| } else { |
| occur = BooleanClause.Occur.SHOULD; |
| } |
| |
| BooleanQuery.Builder query; |
| Set<String> matchedTokens; |
| String prefixToken = null; |
| |
| try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) { |
| //long t0 = System.currentTimeMillis(); |
| ts.reset(); |
| final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); |
| String lastToken = null; |
| query = new BooleanQuery.Builder(); |
| int maxEndOffset = -1; |
| matchedTokens = new HashSet<>(); |
| while (ts.incrementToken()) { |
| if (lastToken != null) { |
| matchedTokens.add(lastToken); |
| query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur); |
| } |
| lastToken = termAtt.toString(); |
| if (lastToken != null) { |
| maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset()); |
| } |
| } |
| ts.end(); |
| |
| if (lastToken != null) { |
| Query lastQuery; |
| if (maxEndOffset == offsetAtt.endOffset()) { |
| // Use PrefixQuery (or the ngram equivalent) when |
          // there were no trailing discarded chars in the
          // string (e.g. whitespace), so that if the query does
| // not end with a space we show prefix matches for |
| // that token: |
| lastQuery = getLastTokenQuery(lastToken); |
| prefixToken = lastToken; |
| } else { |
| // Use TermQuery for an exact match if there were |
| // trailing discarded chars (e.g. whitespace), so |
          // that if the query ends with a space we only show
| // exact matches for that term: |
| matchedTokens.add(lastToken); |
| lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)); |
| } |
| |
| if (lastQuery != null) { |
| query.add(lastQuery, occur); |
| } |
| } |
| |
| if (contextQuery != null) { |
| boolean allMustNot = true; |
| for (BooleanClause clause : contextQuery.clauses()) { |
| if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) { |
| allMustNot = false; |
| break; |
| } |
| } |
| |
| if (allMustNot) { |
| // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query) |
| for (BooleanClause clause : contextQuery.clauses()) { |
| query.add(clause); |
| } |
| } else if (allTermsRequired == false) { |
| // We must carefully upgrade the query clauses to MUST: |
| BooleanQuery.Builder newQuery = new BooleanQuery.Builder(); |
| newQuery.add(query.build(), BooleanClause.Occur.MUST); |
| newQuery.add(contextQuery, BooleanClause.Occur.MUST); |
| query = newQuery; |
| } else { |
| // Add contextQuery as sub-query |
| query.add(contextQuery, BooleanClause.Occur.MUST); |
| } |
| } |
| } |
| |
| // TODO: we could allow blended sort here, combining |
| // weight w/ score. Now we ignore score and sort only |
| // by weight: |
| |
| Query finalQuery = finishQuery(query, allTermsRequired); |
| |
| //System.out.println("finalQuery=" + finalQuery); |
| |
| // Sort by weight, descending: |
| TopFieldCollector c = TopFieldCollector.create(SORT, num, 1); |
| List<LookupResult> results = null; |
| SearcherManager mgr; |
| IndexSearcher searcher; |
| synchronized (searcherMgrLock) { |
| mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference |
| searcher = mgr.acquire(); |
| } |
| try { |
| //System.out.println("got searcher=" + searcher); |
| searcher.search(finalQuery, c); |
| |
| TopFieldDocs hits = c.topDocs(); |
| |
| // Slower way if postings are not pre-sorted by weight: |
| // hits = searcher.search(query, null, num, SORT); |
| results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken); |
| } finally { |
| mgr.release(searcher); |
| } |
| |
| //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest"); |
| //System.out.println(results); |
| |
| return results; |
| } |
| |
| /** |
| * Create the results based on the search hits. |
| * Can be overridden by subclass to add particular behavior (e.g. weight transformation). |
| * Note that there is no prefix token (the {@code prefixToken} argument will |
| * be null) whenever the final token in the incoming request was in fact finished |
| * (had trailing characters, such as white-space). |
| * |
| * @throws IOException If there are problems reading fields from the underlying Lucene index. |
| */ |
| protected List<LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num, |
| CharSequence charSequence, |
| boolean doHighlight, Set<String> matchedTokens, String prefixToken) |
| throws IOException { |
| |
| List<LeafReaderContext> leaves = searcher.getIndexReader().leaves(); |
| List<LookupResult> results = new ArrayList<>(); |
| for (int i=0;i<hits.scoreDocs.length;i++) { |
| FieldDoc fd = (FieldDoc) hits.scoreDocs[i]; |
| BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME); |
| textDV.advance(fd.doc); |
| BytesRef term = textDV.binaryValue(); |
| String text = term.utf8ToString(); |
| long score = (Long) fd.fields[0]; |
| |
| // This will just be null if app didn't pass payloads to build(): |
| // TODO: maybe just stored fields? they compress... |
| BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); |
| |
| BytesRef payload; |
| if (payloadsDV != null) { |
| if (payloadsDV.advance(fd.doc) == fd.doc) { |
| payload = BytesRef.deepCopyOf(payloadsDV.binaryValue()); |
| } else { |
| payload = new BytesRef(BytesRef.EMPTY_BYTES); |
| } |
| } else { |
| payload = null; |
| } |
| |
| // Must look up sorted-set by segment: |
| int segment = ReaderUtil.subIndex(fd.doc, leaves); |
| SortedSetDocValues contextsDV = leaves.get(segment).reader().getSortedSetDocValues(CONTEXTS_FIELD_NAME); |
| Set<BytesRef> contexts; |
| if (contextsDV != null) { |
| contexts = new HashSet<BytesRef>(); |
| int targetDocID = fd.doc - leaves.get(segment).docBase; |
| if (contextsDV.advance(targetDocID) == targetDocID) { |
| long ord; |
| while ((ord = contextsDV.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) { |
| BytesRef context = BytesRef.deepCopyOf(contextsDV.lookupOrd(ord)); |
| contexts.add(context); |
| } |
| } |
| } else { |
| contexts = null; |
| } |
| |
| LookupResult result; |
| |
| if (doHighlight) { |
| result = new LookupResult(text, highlight(text, matchedTokens, prefixToken), score, payload, contexts); |
| } else { |
| result = new LookupResult(text, score, payload, contexts); |
| } |
| |
| results.add(result); |
| } |
| |
| return results; |
| } |
| |
| /** Subclass can override this to tweak the Query before |
| * searching. */ |
| protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) { |
| return in.build(); |
| } |
| |
| /** Override this method to customize the Object |
   * representing a single highlighted suggestion; the
| * result is set on each {@link |
| * org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member. */ |
| protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException { |
| try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) { |
| CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); |
| OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); |
| ts.reset(); |
| StringBuilder sb = new StringBuilder(); |
| int upto = 0; |
| while (ts.incrementToken()) { |
| String token = termAtt.toString(); |
| int startOffset = offsetAtt.startOffset(); |
| int endOffset = offsetAtt.endOffset(); |
| if (upto < startOffset) { |
| addNonMatch(sb, text.substring(upto, startOffset)); |
| upto = startOffset; |
| } else if (upto > startOffset) { |
| continue; |
| } |
| |
| if (matchedTokens.contains(token)) { |
| // Token matches. |
| addWholeMatch(sb, text.substring(startOffset, endOffset), token); |
| upto = endOffset; |
| } else if (prefixToken != null && token.startsWith(prefixToken)) { |
| addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken); |
| upto = endOffset; |
| } |
| } |
| ts.end(); |
| int endOffset = offsetAtt.endOffset(); |
| if (upto < endOffset) { |
| addNonMatch(sb, text.substring(upto)); |
| } |
| return sb.toString(); |
| } |
| } |
| |
| /** Called while highlighting a single result, to append a |
| * non-matching chunk of text from the suggestion to the |
   * provided {@code StringBuilder}.
| * @param sb The {@code StringBuilder} to append to |
| * @param text The text chunk to add |
| */ |
| protected void addNonMatch(StringBuilder sb, String text) { |
| sb.append(text); |
| } |
| |
| /** Called while highlighting a single result, to append |
   * the whole matched token to the provided {@code StringBuilder}.
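   *
   * <p>For example, a subclass could emit {@code <em>} tags instead of the
   * default {@code <b>} tags (a sketch):
   * <pre class="prettyprint">
   * &#64;Override
   * protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
   *   sb.append("&lt;em&gt;").append(surface).append("&lt;/em&gt;");
   * }
   * </pre>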
| * @param sb The {@code StringBuilder} to append to |
| * @param surface The surface form (original) text |
| * @param analyzed The analyzed token corresponding to the surface form text |
| */ |
| protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) { |
| sb.append("<b>"); |
| sb.append(surface); |
| sb.append("</b>"); |
| } |
| |
  /** Called while highlighting a single result, to append a
   * matched prefix token to the provided {@code StringBuilder}.
   * @param sb The {@code StringBuilder} to append to
   * @param surface The fragment of the surface form
   * (indexed during {@link #build}), corresponding to
   * this match
| * @param analyzed The analyzed token that matched |
| * @param prefixToken The prefix of the token that matched |
| */ |
| protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) { |
| // TODO: apps can try to invert their analysis logic |
| // here, e.g. downcase the two before checking prefix: |
| if (prefixToken.length() >= surface.length()) { |
| addWholeMatch(sb, surface, analyzed); |
| return; |
| } |
| sb.append("<b>"); |
| sb.append(surface.substring(0, prefixToken.length())); |
| sb.append("</b>"); |
| sb.append(surface.substring(prefixToken.length())); |
| } |
| |
| @Override |
  public boolean store(DataOutput output) throws IOException {
    // Unsupported: this suggester persists through its own Lucene index
    // (see commit()), not through Lookup's store/load protocol.
    return false;
| } |
| |
| @Override |
  public boolean load(DataInput input) throws IOException {
    // Unsupported: see store(DataOutput).
    return false;
| } |
| |
| @Override |
| public void close() throws IOException { |
| if (searcherMgr != null) { |
| searcherMgr.close(); |
| searcherMgr = null; |
| } |
| if (writer != null) { |
| writer.close(); |
| writer = null; |
| } |
| if (dir != null) { |
| dir.close(); |
| } |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| long mem = RamUsageEstimator.shallowSizeOf(this); |
| try { |
| if (searcherMgr != null) { |
| SearcherManager mgr; |
| IndexSearcher searcher; |
| synchronized (searcherMgrLock) { |
| mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference |
| searcher = mgr.acquire(); |
| } |
| try { |
| for (LeafReaderContext context : searcher.getIndexReader().leaves()) { |
| LeafReader reader = FilterLeafReader.unwrap(context.reader()); |
| if (reader instanceof SegmentReader) { |
          mem += ((SegmentReader) reader).ramBytesUsed();
| } |
| } |
| } finally { |
| mgr.release(searcher); |
| } |
| } |
| return mem; |
| } catch (IOException ioe) { |
| throw new RuntimeException(ioe); |
| } |
| } |
| |
| @Override |
| public Collection<Accountable> getChildResources() { |
| List<Accountable> resources = new ArrayList<>(); |
| try { |
| if (searcherMgr != null) { |
| SearcherManager mgr; |
| IndexSearcher searcher; |
| synchronized (searcherMgrLock) { |
| mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference |
| searcher = mgr.acquire(); |
| } |
| try { |
| for (LeafReaderContext context : searcher.getIndexReader().leaves()) { |
| LeafReader reader = FilterLeafReader.unwrap(context.reader()); |
| if (reader instanceof SegmentReader) { |
| resources.add(Accountables.namedAccountable("segment", (SegmentReader)reader)); |
| } |
| } |
| } finally { |
| mgr.release(searcher); |
| } |
| } |
| return Collections.unmodifiableList(resources); |
| } catch (IOException ioe) { |
| throw new RuntimeException(ioe); |
| } |
| } |
| |
| @Override |
| public long getCount() throws IOException { |
| if (searcherMgr == null) { |
| return 0; |
| } |
| SearcherManager mgr; |
| IndexSearcher searcher; |
| synchronized (searcherMgrLock) { |
| mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference |
| searcher = mgr.acquire(); |
| } |
| try { |
| return searcher.getIndexReader().numDocs(); |
| } finally { |
| mgr.release(searcher); |
| } |
| } |
| } |