| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.analyzing; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.TreeSet; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.BinaryDocValues; |
| import org.apache.lucene.index.IndexOptions; |
| import org.apache.lucene.index.MultiDocValues; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.FieldDoc; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.TopFieldDocs; |
| import org.apache.lucene.search.suggest.Lookup; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| |
| // TODO: |
| // - allow to use the search score |
| |
| /** |
| * Extension of the AnalyzingInfixSuggester which transforms the weight |
| * after search to take into account the position of the searched term into |
| * the indexed text. |
| * Please note that it increases the number of elements searched and applies the |
| * ponderation after. It might be costly for long suggestions. |
| * |
| * @lucene.experimental |
| */ |
| public class BlendedInfixSuggester extends AnalyzingInfixSuggester { |
| |
| /** |
| * Coefficient used for linear blending |
| */ |
| protected static double LINEAR_COEF = 0.10; |
| |
| private Double exponent = 2.0; |
| |
| /** |
| * Default factor |
| */ |
| public static int DEFAULT_NUM_FACTOR = 10; |
| |
| /** |
| * Factor to multiply the number of searched elements |
| */ |
| private final int numFactor; |
| |
| /** |
| * Type of blender used by the suggester |
| */ |
| private final BlenderType blenderType; |
| |
| /** |
| * The different types of blender. |
| */ |
| public static enum BlenderType { |
| /** Application dependent; override {@link |
| * #calculateCoefficient} to compute it. */ |
| CUSTOM, |
| /** weight*(1 - 0.10*position) */ |
| POSITION_LINEAR, |
| /** weight/(1+position) */ |
| POSITION_RECIPROCAL, |
| /** weight/pow(1+position, exponent) */ |
| POSITION_EXPONENTIAL_RECIPROCAL |
| // TODO: |
| //SCORE |
| } |
| |
| /** |
| * Create a new instance, loading from a previously built |
| * directory, if it exists. |
| */ |
| public BlendedInfixSuggester(Directory dir, Analyzer analyzer) throws IOException { |
| super(dir, analyzer); |
| this.blenderType = BlenderType.POSITION_LINEAR; |
| this.numFactor = DEFAULT_NUM_FACTOR; |
| } |
| |
| /** |
| * Create a new instance, loading from a previously built |
| * directory, if it exists. |
| * |
| * @param blenderType Type of blending strategy, see BlenderType for more precisions |
| * @param numFactor Factor to multiply the number of searched elements before ponderate |
| * @param commitOnBuild Call commit after the index has finished building. This would persist the |
| * suggester index to disk and future instances of this suggester can use this pre-built dictionary. |
| * @throws IOException If there are problems opening the underlying Lucene index. |
| */ |
| public BlendedInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, |
| int minPrefixChars, BlenderType blenderType, int numFactor, boolean commitOnBuild) throws IOException { |
| super(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild); |
| this.blenderType = blenderType; |
| this.numFactor = numFactor; |
| } |
| |
| /** |
| * Create a new instance, loading from a previously built |
| * directory, if it exists. |
| * |
| * @param blenderType Type of blending strategy, see BlenderType for more precisions |
| * @param numFactor Factor to multiply the number of searched elements before ponderate |
| * @param exponent exponent used only when blenderType is BlenderType.POSITION_EXPONENTIAL_RECIPROCAL |
| * @param commitOnBuild Call commit after the index has finished building. This would persist the |
| * suggester index to disk and future instances of this suggester can use this pre-built dictionary. |
| * @param allTermsRequired All terms in the suggest query must be matched. |
| * @param highlight Highlight suggest query in suggestions. |
| * @throws IOException If there are problems opening the underlying Lucene index. |
| */ |
| public BlendedInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, |
| int minPrefixChars, BlenderType blenderType, int numFactor, Double exponent, |
| boolean commitOnBuild, boolean allTermsRequired, boolean highlight) throws IOException { |
| super(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, allTermsRequired, highlight); |
| this.blenderType = blenderType; |
| this.numFactor = numFactor; |
| if(exponent != null) { |
| this.exponent = exponent; |
| } |
| } |
| |
| @Override |
| public List<Lookup.LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) throws IOException { |
| // Don't * numFactor here since we do it down below, once, in the call chain: |
| return super.lookup(key, contexts, onlyMorePopular, num); |
| } |
| |
| @Override |
| public List<Lookup.LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| // Don't * numFactor here since we do it down below, once, in the call chain: |
| return super.lookup(key, contexts, num, allTermsRequired, doHighlight); |
| } |
| |
| @Override |
| public List<Lookup.LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| // Don't * numFactor here since we do it down below, once, in the call chain: |
| return super.lookup(key, contextInfo, num, allTermsRequired, doHighlight); |
| } |
| |
| @Override |
| public List<Lookup.LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { |
| /** We need to do num * numFactor here only because it is the last call in the lookup chain*/ |
| return super.lookup(key, contextQuery, num * numFactor, allTermsRequired, doHighlight); |
| } |
| |
| @Override |
| protected FieldType getTextFieldType() { |
| FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); |
| ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); |
| ft.setStoreTermVectors(true); |
| ft.setStoreTermVectorPositions(true); |
| ft.setOmitNorms(true); |
| |
| return ft; |
| } |
| |
| @Override |
| protected List<Lookup.LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num, CharSequence key, |
| boolean doHighlight, Set<String> matchedTokens, String prefixToken) |
| throws IOException { |
| |
| TreeSet<Lookup.LookupResult> results = new TreeSet<>(LOOKUP_COMP); |
| |
| // we reduce the num to the one initially requested |
| int actualNum = num / numFactor; |
| |
| for (int i = 0; i < hits.scoreDocs.length; i++) { |
| FieldDoc fd = (FieldDoc) hits.scoreDocs[i]; |
| |
| BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME); |
| assert textDV != null; |
| |
| textDV.advance(fd.doc); |
| |
| final String text = textDV.binaryValue().utf8ToString(); |
| long weight = (Long) fd.fields[0]; |
| |
| // This will just be null if app didn't pass payloads to build(): |
| // TODO: maybe just stored fields? they compress... |
| BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads"); |
| |
| BytesRef payload; |
| if (payloadsDV != null) { |
| if (payloadsDV.advance(fd.doc) == fd.doc) { |
| payload = BytesRef.deepCopyOf(payloadsDV.binaryValue()); |
| } else { |
| payload = new BytesRef(BytesRef.EMPTY_BYTES); |
| } |
| } else { |
| payload = null; |
| } |
| |
| double coefficient; |
| if (text.startsWith(key.toString())) { |
| // if hit starts with the key, we don't change the score |
| coefficient = 1; |
| } else { |
| coefficient = createCoefficient(searcher, fd.doc, matchedTokens, prefixToken); |
| } |
| if (weight == 0) { |
| weight = 1; |
| } |
| if (weight < 1 / LINEAR_COEF && weight > -1 / LINEAR_COEF) { |
| weight *= 1 / LINEAR_COEF; |
| } |
| long score = (long) (weight * coefficient); |
| |
| LookupResult result; |
| if (doHighlight) { |
| result = new LookupResult(text, highlight(text, matchedTokens, prefixToken), score, payload); |
| } else { |
| result = new LookupResult(text, score, payload); |
| } |
| |
| boundedTreeAdd(results, result, actualNum); |
| } |
| |
| return new ArrayList<>(results.descendingSet()); |
| } |
| |
| /** |
| * Add an element to the tree respecting a size limit |
| * |
| * @param results the tree to add in |
| * @param result the result we try to add |
| * @param num size limit |
| */ |
| private static void boundedTreeAdd(TreeSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num) { |
| |
| if (results.size() >= num) { |
| if (results.first().value < result.value) { |
| results.pollFirst(); |
| } else { |
| return; |
| } |
| } |
| |
| results.add(result); |
| } |
| |
| /** |
| * Create the coefficient to transform the weight. |
| * |
| * @param doc id of the document |
| * @param matchedTokens tokens found in the query |
| * @param prefixToken unfinished token in the query |
| * @return the coefficient |
| * @throws IOException If there are problems reading term vectors from the underlying Lucene index. |
| */ |
| private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException { |
| |
| Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME); |
| TermsEnum it = tv.iterator(); |
| |
| Integer position = Integer.MAX_VALUE; |
| BytesRef term; |
| // find the closest token position |
| while ((term = it.next()) != null) { |
| |
| String docTerm = term.utf8ToString(); |
| |
| if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) { |
| |
| PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS); |
| docPosEnum.nextDoc(); |
| |
| // use the first occurrence of the term |
| int p = docPosEnum.nextPosition(); |
| if (p < position) { |
| position = p; |
| } |
| } |
| } |
| |
| // create corresponding coefficient based on position |
| return calculateCoefficient(position); |
| } |
| |
| /** |
| * Calculate the weight coefficient based on the position of the first matching word. |
| * Subclass should override it to adapt it to particular needs |
| * @param position of the first matching word in text |
| * @return the coefficient |
| */ |
| protected double calculateCoefficient(int position) { |
| |
| double coefficient; |
| switch (blenderType) { |
| case POSITION_LINEAR: |
| coefficient = 1 - LINEAR_COEF * position; |
| break; |
| |
| case POSITION_RECIPROCAL: |
| coefficient = 1. / (position + 1); |
| break; |
| |
| case POSITION_EXPONENTIAL_RECIPROCAL: |
| coefficient = 1. / Math.pow((position + 1.0), exponent); |
| break; |
| |
| default: |
| coefficient = 1; |
| } |
| |
| return coefficient; |
| } |
| |
| private static Comparator<Lookup.LookupResult> LOOKUP_COMP = new LookUpComparator(); |
| |
| private static class LookUpComparator implements Comparator<Lookup.LookupResult> { |
| |
| @Override |
| public int compare(Lookup.LookupResult o1, Lookup.LookupResult o2) { |
| // order on weight |
| if (o1.value > o2.value) { |
| return 1; |
| } else if (o1.value < o2.value) { |
| return -1; |
| } |
| |
| // otherwise on alphabetic order |
| int keyCompare = CHARSEQUENCE_COMPARATOR.compare(o1.key, o2.key); |
| |
| if (keyCompare != 0) { |
| return keyCompare; |
| } |
| |
| // if same weight and title, use the payload if there is one |
| if (o1.payload != null) { |
| return o1.payload.compareTo(o2.payload); |
| } |
| |
| return 0; |
| } |
| } |
| } |
| |