blob: 63f432fb76a0e8523330c149661ec22fa1959b74 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.suggest.analyzing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
// TODO:
// - allow to use the search score
/**
 * Extension of the AnalyzingInfixSuggester which transforms the weight
 * after search to take into account the position of the searched term in
 * the indexed text.
 * Please note that it increases the number of elements searched and applies the
 * positional weighting afterwards. It might be costly for long suggestions.
 *
 * @lucene.experimental
 */
public class BlendedInfixSuggester extends AnalyzingInfixSuggester {

  /**
   * Coefficient used for linear blending.
   * NOTE: kept non-final (writable) for backward compatibility; treat it as a constant.
   */
  protected static double LINEAR_COEF = 0.10;

  /** Exponent used only when {@code blenderType} is {@link BlenderType#POSITION_EXPONENTIAL_RECIPROCAL}. */
  private Double exponent = 2.0;

  /**
   * Default factor to multiply the number of searched elements.
   * NOTE: kept non-final (writable) for backward compatibility; treat it as a constant.
   */
  public static int DEFAULT_NUM_FACTOR = 10;

  /** Factor to multiply the number of searched elements before re-weighting. */
  private final int numFactor;

  /** Type of blender used by the suggester. */
  private final BlenderType blenderType;

  /**
   * The different types of blender.
   */
  public static enum BlenderType {
    /** Application dependent; override {@link
     * #calculateCoefficient} to compute it. */
    CUSTOM,
    /** weight*(1 - 0.10*position) */
    POSITION_LINEAR,
    /** weight/(1+position) */
    POSITION_RECIPROCAL,
    /** weight/pow(1+position, exponent) */
    POSITION_EXPONENTIAL_RECIPROCAL
    // TODO:
    //SCORE
  }

  /**
   * Create a new instance, loading from a previously built
   * directory, if it exists.
   * Uses {@link BlenderType#POSITION_LINEAR} and {@link #DEFAULT_NUM_FACTOR}.
   *
   * @throws IOException If there are problems opening the underlying Lucene index.
   */
  public BlendedInfixSuggester(Directory dir, Analyzer analyzer) throws IOException {
    super(dir, analyzer);
    this.blenderType = BlenderType.POSITION_LINEAR;
    this.numFactor = DEFAULT_NUM_FACTOR;
  }

  /**
   * Create a new instance, loading from a previously built
   * directory, if it exists.
   *
   * @param blenderType Type of blending strategy, see BlenderType for more precisions
   * @param numFactor Factor to multiply the number of searched elements before re-weighting
   * @param commitOnBuild Call commit after the index has finished building. This would persist the
   *                      suggester index to disk and future instances of this suggester can use this pre-built dictionary.
   * @throws IOException If there are problems opening the underlying Lucene index.
   */
  public BlendedInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                               int minPrefixChars, BlenderType blenderType, int numFactor, boolean commitOnBuild) throws IOException {
    super(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild);
    this.blenderType = blenderType;
    this.numFactor = numFactor;
  }

  /**
   * Create a new instance, loading from a previously built
   * directory, if it exists.
   *
   * @param blenderType Type of blending strategy, see BlenderType for more precisions
   * @param numFactor Factor to multiply the number of searched elements before re-weighting
   * @param exponent exponent used only when blenderType is BlenderType.POSITION_EXPONENTIAL_RECIPROCAL
   * @param commitOnBuild Call commit after the index has finished building. This would persist the
   *                      suggester index to disk and future instances of this suggester can use this pre-built dictionary.
   * @param allTermsRequired All terms in the suggest query must be matched.
   * @param highlight Highlight suggest query in suggestions.
   * @throws IOException If there are problems opening the underlying Lucene index.
   */
  public BlendedInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                               int minPrefixChars, BlenderType blenderType, int numFactor, Double exponent,
                               boolean commitOnBuild, boolean allTermsRequired, boolean highlight) throws IOException {
    super(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, allTermsRequired, highlight);
    this.blenderType = blenderType;
    this.numFactor = numFactor;
    // null means "keep the default exponent of 2.0"
    if (exponent != null) {
      this.exponent = exponent;
    }
  }

  @Override
  public List<Lookup.LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) throws IOException {
    // Don't * numFactor here since we do it down below, once, in the call chain:
    return super.lookup(key, contexts, onlyMorePopular, num);
  }

  @Override
  public List<Lookup.LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
    // Don't * numFactor here since we do it down below, once, in the call chain:
    return super.lookup(key, contexts, num, allTermsRequired, doHighlight);
  }

  @Override
  public List<Lookup.LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
    // Don't * numFactor here since we do it down below, once, in the call chain:
    return super.lookup(key, contextInfo, num, allTermsRequired, doHighlight);
  }

  @Override
  public List<Lookup.LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
    /** We need to do num * numFactor here only because it is the last call in the lookup chain*/
    return super.lookup(key, contextQuery, num * numFactor, allTermsRequired, doHighlight);
  }

  @Override
  protected FieldType getTextFieldType() {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    // term vectors with positions are required by createCoefficient() to
    // locate the first occurrence of a matched token
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setOmitNorms(true);
    return ft;
  }

  /**
   * Re-weights the raw hits by the positional coefficient and keeps only the
   * top {@code num / numFactor} results (the lookup chain searched
   * {@code num * numFactor} elements).
   */
  @Override
  protected List<Lookup.LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num, CharSequence key,
                                                    boolean doHighlight, Set<String> matchedTokens, String prefixToken)
      throws IOException {

    TreeSet<Lookup.LookupResult> results = new TreeSet<>(LOOKUP_COMP);

    // we reduce the num to the one initially requested
    int actualNum = num / numFactor;

    // loop-invariant: materialize the key once instead of once per hit
    final String keyStr = key.toString();

    for (int i = 0; i < hits.scoreDocs.length; i++) {
      FieldDoc fd = (FieldDoc) hits.scoreDocs[i];

      // doc-values are iterator-based and scoreDocs are not ordered by docid,
      // so a fresh iterator is fetched per hit before advancing
      BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
      assert textDV != null;

      textDV.advance(fd.doc);
      final String text = textDV.binaryValue().utf8ToString();
      long weight = (Long) fd.fields[0];

      // This will just be null if app didn't pass payloads to build():
      // TODO: maybe just stored fields?  they compress...
      BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");

      BytesRef payload;
      if (payloadsDV != null) {
        if (payloadsDV.advance(fd.doc) == fd.doc) {
          payload = BytesRef.deepCopyOf(payloadsDV.binaryValue());
        } else {
          payload = new BytesRef(BytesRef.EMPTY_BYTES);
        }
      } else {
        payload = null;
      }

      double coefficient;
      if (text.startsWith(keyStr)) {
        // if hit starts with the key, we don't change the score
        coefficient = 1;
      } else {
        coefficient = createCoefficient(searcher, fd.doc, matchedTokens, prefixToken);
      }
      if (weight == 0) {
        weight = 1;
      }
      if (weight < 1 / LINEAR_COEF && weight > -1 / LINEAR_COEF) {
        // scale small weights up so the truncating (long) cast below does not
        // collapse distinct coefficient-adjusted scores to the same value
        weight *= 1 / LINEAR_COEF;
      }
      long score = (long) (weight * coefficient);

      LookupResult result;
      if (doHighlight) {
        result = new LookupResult(text, highlight(text, matchedTokens, prefixToken), score, payload);
      } else {
        result = new LookupResult(text, score, payload);
      }

      boundedTreeAdd(results, result, actualNum);
    }

    // descending order: best (highest) score first
    return new ArrayList<>(results.descendingSet());
  }

  /**
   * Add an element to the tree respecting a size limit
   *
   * @param results the tree to add in
   * @param result the result we try to add
   * @param num size limit
   */
  private static void boundedTreeAdd(TreeSet<Lookup.LookupResult> results, Lookup.LookupResult result, int num) {
    if (results.size() >= num) {
      if (results.first().value < result.value) {
        // evict the current worst result to make room
        results.pollFirst();
      } else {
        return;
      }
    }

    results.add(result);
  }

  /**
   * Create the coefficient to transform the weight.
   *
   * @param doc id of the document
   * @param matchedTokens tokens found in the query
   * @param prefixToken unfinished token in the query
   * @return the coefficient
   * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
   */
  private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {

    Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
    TermsEnum it = tv.iterator();

    // find the closest token position; plain int avoids the repeated
    // autoboxing the original boxed Integer accumulator incurred
    int position = Integer.MAX_VALUE;
    BytesRef term;
    while ((term = it.next()) != null) {

      String docTerm = term.utf8ToString();

      if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {

        PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
        docPosEnum.nextDoc();

        // use the first occurrence of the term
        int p = docPosEnum.nextPosition();
        if (p < position) {
          position = p;
        }
      }
    }

    // create corresponding coefficient based on position
    return calculateCoefficient(position);
  }

  /**
   * Calculate the weight coefficient based on the position of the first matching word.
   * Subclass should override it to adapt it to particular needs
   * @param position of the first matching word in text
   * @return the coefficient
   */
  protected double calculateCoefficient(int position) {

    double coefficient;
    switch (blenderType) {
      case POSITION_LINEAR:
        coefficient = 1 - LINEAR_COEF * position;
        break;

      case POSITION_RECIPROCAL:
        coefficient = 1. / (position + 1);
        break;

      case POSITION_EXPONENTIAL_RECIPROCAL:
        coefficient = 1. / Math.pow((position + 1.0), exponent);
        break;

      default:
        // CUSTOM (and any future type) defaults to a neutral coefficient
        coefficient = 1;
    }

    return coefficient;
  }

  /** Singleton comparator ranking results by weight, then key, then payload. */
  private static final Comparator<Lookup.LookupResult> LOOKUP_COMP = new LookUpComparator();

  private static class LookUpComparator implements Comparator<Lookup.LookupResult> {

    @Override
    public int compare(Lookup.LookupResult o1, Lookup.LookupResult o2) {
      // order on weight
      if (o1.value > o2.value) {
        return 1;
      } else if (o1.value < o2.value) {
        return -1;
      }

      // otherwise on alphabetic order
      int keyCompare = CHARSEQUENCE_COMPARATOR.compare(o1.key, o2.key);

      if (keyCompare != 0) {
        return keyCompare;
      }

      // if same weight and title, use the payload if there is one.  A null
      // payload sorts before a non-null one; the original compared only when
      // o1.payload was non-null, which could throw NPE (compareTo(null)) or
      // break the Comparator symmetry contract with mixed null payloads.
      if (o1.payload == null) {
        return o2.payload == null ? 0 : -1;
      }
      if (o2.payload == null) {
        return 1;
      }
      return o1.payload.compareTo(o2.payload);
    }
  }
}