blob: 896ca08807c3c7908c8665f6898322e12cb7df04 [file] [log] [blame]
package org.apache.lucene.server.handlers;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.expressions.Expression;
import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.facet.taxonomy.SearcherTaxonomyManager.SearcherAndTaxonomy;
import org.apache.lucene.search.suggest.DocumentDictionary;
import org.apache.lucene.search.suggest.DocumentValueSourceDictionary;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup.LookupResult; // javadocs
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.FuzzySuggester;
import org.apache.lucene.server.FinishRequest;
import org.apache.lucene.server.FromFileTermFreqIterator;
import org.apache.lucene.server.GlobalState;
import org.apache.lucene.server.IndexState;
import org.apache.lucene.server.params.*;
import org.apache.lucene.server.params.PolyType.PolyEntry;
import org.apache.lucene.util.BytesRef;
import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
import net.minidev.json.JSONValue;
import net.minidev.json.parser.ParseException;
import static org.apache.lucene.server.handlers.RegisterFieldHandler.ANALYZER_TYPE;
/** Handles {@code buildSuggest}. */
public class BuildSuggestHandler extends Handler {
/**
 * Request schema for {@code buildSuggest}: the target index, the suggester
 * implementation (with its per-implementation options), the source to pull
 * suggestions from, and the unique name of this suggest build.
 */
private static final StructType TYPE =
  new StructType(
      new Param("indexName", "Index Name", new StringType()),
      new Param("class", "Which suggester implementation to use",
          new PolyType(Lookup.class,
              new PolyEntry("AnalyzingSuggester", "Suggester that first analyzes the surface form, adds the analyzed form to a weighted FST, and then does the same thing at lookup time (see @lucene:suggest:org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester)",
                  new Param("analyzer", "Index and query analyzer", ANALYZER_TYPE),
                  new Param("indexAnalyzer", "Index analyzer", ANALYZER_TYPE),
                  new Param("queryAnalyzer", "Query analyzer", ANALYZER_TYPE),
                  new Param("maxSurfaceFormsPerAnalyzedForm", "Maximum number of surface forms to keep for a single analyzed form", new IntType(), 256),
                  new Param("maxGraphExpansions", "Maximum number of graph paths to expand from the analyzed form", new IntType(), -1),
                  new Param("preserveSep", "True if token separators should be preserved when matching", new BooleanType(), true),
                  new Param("exactFirst", "True if the exact match should always be returned first regardless of score", new BooleanType(), true)),
              new PolyEntry("FuzzySuggester", "Implements a fuzzy AnalyzingSuggester (see @lucene:suggest:org.apache.lucene.search.suggest.analyzing.FuzzySuggester)",
                  new Param("analyzer", "Index and query analyzer", ANALYZER_TYPE),
                  new Param("indexAnalyzer", "Index analyzer", ANALYZER_TYPE),
                  new Param("queryAnalyzer", "Query analyzer", ANALYZER_TYPE),
                  // Same default as the AnalyzingSuggester entry above (and as Lucene's own default):
                  new Param("maxSurfaceFormsPerAnalyzedForm", "Maximum number of surface forms to keep for a single analyzed form", new IntType(), 256),
                  new Param("maxGraphExpansions", "Maximum number of graph paths to expand from the analyzed form", new IntType(), -1),
                  new Param("preserveSep", "True if token separators should be preserved when matching", new BooleanType(), true),
                  new Param("exactFirst", "True if the exact match should always be returned first regardless of score", new BooleanType(), true),
                  new Param("minFuzzyLength", "Minimum key length before edits are allowed", new IntType(), FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH),
                  new Param("nonFuzzyPrefix", "Key prefix where edits are not allowed", new IntType(), FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX),
                  new Param("maxEdits", "Maximum number of edits for fuzzy suggestions", new IntType(), FuzzySuggester.DEFAULT_MAX_EDITS),
                  new Param("transpositions", "Whether transpositions are allowed", new BooleanType(), FuzzySuggester.DEFAULT_TRANSPOSITIONS),
                  new Param("unicodeAware", "True if all edits are measured in unicode characters, not UTF-8 bytes", new BooleanType(), FuzzySuggester.DEFAULT_UNICODE_AWARE)),
              new PolyEntry("InfixSuggester", "A suggester that matches terms anywhere in the input text, not just as a prefix. (see @lucene:org:server.InfixSuggester)",
                  new Param("analyzer", "Index and query analyzer", ANALYZER_TYPE),
                  new Param("indexAnalyzer", "Index analyzer", ANALYZER_TYPE),
                  new Param("queryAnalyzer", "Query analyzer", ANALYZER_TYPE))),
          "AnalyzingSuggester"),
      // nocommit option to stream suggestions in over the wire too
      new Param("source", "Where to get suggestions from",
          new StructType(
              new Param("localFile", "Local file (to the server) to read suggestions + weights from; format is weight U+001F suggestion U+001F payload, one per line, with suggestion UTF-8 encoded. If this option is used then searcher, suggestField, weightField/Expression, payloadField should not be specified.", new StringType()),
              new Param("searcher", "Specific searcher version to use for pull suggestions to build. There are three different ways to specify a searcher version.",
                  SearchHandler.SEARCHER_VERSION_TYPE),
              new Param("suggestField", "Field (from stored documents) containing the suggestion text", new StringType()),
              new Param("weightField", "Numeric field (from stored documents) containing the weight", new StringType()),
              new Param("weightExpression", "Alternative to weightField, an expression that's evaluated to the weight. Note that any fields referenced in the expression must have been indexed with sort=true.", new StringType()),
              new Param("payloadField", "Optional binary or string field (from stored documents) containing the payload", new StringType()))),
      new Param("suggestName", "Unique name for this suggest build.", new StringType())
  );
/**
 * Sole constructor.
 *
 * @param state global server state, handed straight to the {@link Handler} superclass
 */
public BuildSuggestHandler(GlobalState state) {
  super(state);
}
/** Returns the request schema ({@link #TYPE}) accepted by this handler. */
@Override
public StructType getType() {
  return BuildSuggestHandler.TYPE;
}
/**
 * Returns the one-line, user-facing summary of this handler.
 *
 * <p>The summary mentions both suggestion sources accepted by the
 * {@code source} parameter: a local file, or stored document fields read
 * through a searcher.
 */
@Override
public String getTopDoc() {
  return "Builds a new auto-suggester, loading suggestions from a local file or from the stored fields of an index searcher.";
}
/**
 * Load all previously built suggesters.
 *
 * <p>Each entry of {@code saveState} maps a suggest name to the JSON
 * parameters that were originally passed to {@code buildSuggest}. For every
 * entry this re-creates the suggester via {@link #getSuggester}, reads its
 * data back from the {@code suggest.<name>} file under the index root
 * directory (skipped for {@link AnalyzingInfixSuggester}), and registers the
 * original parameters again with {@link IndexState#addSuggest}.
 *
 * @param state index whose suggesters are being restored
 * @param saveState map of suggest name to the saved build parameters
 * @throws IOException if a saved suggester file cannot be opened or read
 */
public void load(IndexState state, JSONObject saveState) throws IOException {
  for(Map.Entry<String,Object> ent : saveState.entrySet()) {
    String suggestName = ent.getKey();
    JSONObject params = (JSONObject) ent.getValue();

    // Snapshot the JSON before wrapping it in a Request; it is re-parsed
    // below when the suggester is registered again.
    String jsonOrig = params.toString();

    Request r = new Request(null, null, params, TYPE);

    // Must consume these up front since getSuggester
    // won't:
    r.getString("suggestName");
    Request source = r.getStruct("source");
    if (source.hasParam("localFile")) {
      source.getString("localFile");
    } else {
      // "searcher" is optional at build time (handle() falls back to the
      // current searcher when it is absent), so it must be optional here
      // too; otherwise such a suggester could never be reloaded.
      if (source.hasParam("searcher")) {
        Request searcher = source.getStruct("searcher");
        // NOTE(review): the "searcher" param description mentions three ways
        // to specify a searcher version, but only indexGen and snapshot are
        // consumed here -- confirm whether the third form needs handling.
        if (searcher.hasParam("indexGen")) {
          searcher.getLong("indexGen");
        } else {
          searcher.getString("snapshot");
        }
      }
      source.getString("suggestField");
      if (source.hasParam("weightField")) {
        source.getString("weightField");
      } else {
        source.getString("weightExpression");
      }
      if (source.hasParam("payloadField")) {
        source.getString("payloadField");
      }
    }

    Lookup suggester = getSuggester(state, suggestName, r);
    assert !Request.anythingLeft(params);

    // The infix suggester is skipped here; every other implementation is
    // read back from the file written by buildSuggest.
    if ((suggester instanceof AnalyzingInfixSuggester) == false) {
      File path = new File(state.rootDir, "suggest." + suggestName);
      FileInputStream in = new FileInputStream(path);
      try {
        suggester.load(in);
      } finally {
        in.close();
      }
    }

    try {
      state.addSuggest(suggestName, (JSONObject) JSONValue.parseStrict(jsonOrig));
    } catch (ParseException pe) {
      // BUG: jsonOrig was produced by JSONObject.toString() just above
      throw new RuntimeException(pe);
    }
  }
}
/**
 * Creates the suggester described by the {@code class} parameter of
 * {@code r} and registers it in {@code state.suggesters} under
 * {@code suggestName}, closing and removing any {@link Closeable} suggester
 * previously registered under that name.
 *
 * <p>The returned suggester has not been built or loaded yet; callers do
 * that afterwards.
 *
 * @param state index the suggester belongs to
 * @param suggestName name to register the suggester under
 * @param r request holding the {@code class} parameter
 * @return the newly created suggester
 * @throws IOException if closing the old suggester or creating the new one fails
 */
private Lookup getSuggester(IndexState state, String suggestName, Request r) throws IOException {

  Request.PolyResult pr = r.getPoly("class");

  String impl = pr.name;

  // From here on r is the sub-request holding the per-implementation params:
  r = pr.r;

  Analyzer indexAnalyzer;
  Analyzer queryAnalyzer;
  // nocommit allow passing a field name, and we use that
  // field name's analyzer?
  if (r.hasParam("analyzer")) {
    indexAnalyzer = queryAnalyzer = RegisterFieldHandler.getAnalyzer(state, r, "analyzer");
  } else {
    indexAnalyzer = RegisterFieldHandler.getAnalyzer(state, r, "indexAnalyzer");
    queryAnalyzer = RegisterFieldHandler.getAnalyzer(state, r, "queryAnalyzer");
  }
  if (indexAnalyzer == null) {
    r.fail("analyzer", "analyzer or indexAnalyzer must be specified");
  }
  if (queryAnalyzer == null) {
    r.fail("analyzer", "analyzer or queryAnalyzer must be specified");
  }

  // Close the old suggester only after the analyzer parameters were
  // validated, so a request rejected above leaves the existing suggester
  // untouched. It must still happen before the new suggester is created.
  Lookup oldSuggester = state.suggesters.get(suggestName);
  if (oldSuggester instanceof Closeable) {
    ((Closeable) oldSuggester).close();
    state.suggesters.remove(suggestName);
  }

  final Lookup suggester;

  if (impl.equals("FuzzySuggester")) {
    int options = analyzingOptions(r);
    suggester = new FuzzySuggester(indexAnalyzer, queryAnalyzer,
                                   options,
                                   r.getInt("maxSurfaceFormsPerAnalyzedForm"),
                                   r.getInt("maxGraphExpansions"),
                                   true,
                                   r.getInt("maxEdits"),
                                   r.getBoolean("transpositions"),
                                   r.getInt("nonFuzzyPrefix"),
                                   r.getInt("minFuzzyLength"),
                                   r.getBoolean("unicodeAware"));
  } else if (impl.equals("AnalyzingSuggester")) {
    int options = analyzingOptions(r);
    suggester = new AnalyzingSuggester(indexAnalyzer, queryAnalyzer, options,
                                       r.getInt("maxSurfaceFormsPerAnalyzedForm"),
                                       r.getInt("maxGraphExpansions"), true);
  } else if (impl.equals("InfixSuggester")) {
    suggester = new AnalyzingInfixSuggester(state.matchVersion,
                                            new File(state.rootDir, "suggest." + suggestName + ".infix"),
                                            indexAnalyzer,
                                            queryAnalyzer,
                                            AnalyzingInfixSuggester.DEFAULT_MIN_PREFIX_CHARS) {
        @Override
        protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {

          // We override the entire highlight method, to
          // render directly to JSONArray instead of html
          // string:

          TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text));
          // Always close the stream, even if analysis throws part way through:
          try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            JSONArray fragments = new JSONArray();
            // End of the text already emitted as fragments:
            int upto = 0;
            while (ts.incrementToken()) {
              String token = termAtt.toString();
              int startOffset = offsetAtt.startOffset();
              int endOffset = offsetAtt.endOffset();
              if (upto < startOffset) {
                // Text between the previous fragment and this token:
                JSONObject o = new JSONObject();
                fragments.add(o);
                o.put("isHit", false);
                o.put("text", text.substring(upto, startOffset));
                upto = startOffset;
              } else if (upto > startOffset) {
                // Token starts inside text that was already emitted:
                continue;
              }

              if (matchedTokens.contains(token)) {
                // Token matches.
                JSONObject o = new JSONObject();
                fragments.add(o);
                o.put("isHit", true);
                o.put("text", text.substring(startOffset, endOffset));
                upto = endOffset;
              } else if (prefixToken != null && token.startsWith(prefixToken)) {
                // Split on the token's surface span [startOffset, endOffset)
                // rather than on the analyzed token's length: the two differ
                // when analysis changes the token length, and using the
                // analyzed length would drop text or index past the span.
                int prefixEnd = Math.min(endOffset, startOffset + prefixToken.length());
                JSONObject o = new JSONObject();
                fragments.add(o);
                o.put("isHit", true);
                o.put("text", text.substring(startOffset, prefixEnd));
                if (prefixEnd < endOffset) {
                  o = new JSONObject();
                  fragments.add(o);
                  o.put("isHit", false);
                  o.put("text", text.substring(prefixEnd, endOffset));
                }
                upto = endOffset;
              }
            }
            ts.end();
            int endOffset = offsetAtt.endOffset();
            if (upto < endOffset) {
              // Trailing text after the last emitted fragment:
              JSONObject o = new JSONObject();
              fragments.add(o);
              o.put("isHit", false);
              o.put("text", text.substring(upto));
            }

            return fragments;
          } finally {
            ts.close();
          }
        }
      };
  } else {
    // Fail fast even when assertions are disabled, instead of registering
    // a null suggester:
    throw new AssertionError("unrecognized suggester class: " + impl);
  }

  state.suggesters.put(suggestName, suggester);

  return suggester;
}

/** Folds the {@code preserveSep} and {@code exactFirst} params of {@code r} into {@link AnalyzingSuggester} option bits. */
private static int analyzingOptions(Request r) {
  int options = 0;
  if (r.getBoolean("preserveSep")) {
    options |= AnalyzingSuggester.PRESERVE_SEP;
  }
  if (r.getBoolean("exactFirst")) {
    options |= AnalyzingSuggester.EXACT_FIRST;
  }
  return options;
}
/** One piece of a highlighted suggestion; see {@link
 *  LookupResult#highlightKey}. */
public static final class LookupHighlightFragment {

  /** The slice of the suggestion text covered by this fragment. */
  public final String text;

  /** Whether this slice matched part of the user's query. */
  public final boolean isHit;

  /**
   * Sole constructor.
   *
   * @param text portion of text for this fragment
   * @param isHit true if {@code text} matched a part of the user's query
   */
  public LookupHighlightFragment(String text, boolean isHit) {
    this.isHit = isHit;
    this.text = text;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder("LookupHighlightFragment(text=");
    sb.append(text);
    sb.append(" isHit=");
    sb.append(isHit);
    sb.append(")");
    return sb.toString();
  }
}
/** Delegating {@link InputIterator} that tallies how many
 *  non-null suggestions {@link #next} has returned. */
private static class CountingInputIterator implements InputIterator {

  /** Iterator every call is forwarded to. */
  private final InputIterator delegate;

  /** Number of non-null entries returned by {@link #next} so far. */
  private int seen;

  public CountingInputIterator(InputIterator other) {
    this.delegate = other;
  }

  @Override
  public BytesRef next() throws IOException {
    final BytesRef term = delegate.next();
    if (term == null) {
      // Nothing was returned; leave the tally alone.
      return null;
    }
    seen++;
    return term;
  }

  @Override
  public long weight() {
    return delegate.weight();
  }

  @Override
  public boolean hasPayloads() {
    return delegate.hasPayloads();
  }

  @Override
  public BytesRef payload() {
    return delegate.payload();
  }

  /** Returns how many suggestions have been seen so far. */
  public int getCount() {
    return seen;
  }
}
/**
 * Handles a {@code buildSuggest} request.
 *
 * <p>Validates the suggest name, creates and registers the suggester, and
 * opens the suggestion source: either a local file, or stored document
 * fields read through a searcher. Building is deferred to the returned
 * {@link FinishRequest}, whose {@code finish()} builds the suggester, writes
 * it to the {@code suggest.<name>} file under the index root directory,
 * records the original request JSON with {@link IndexState#addSuggest}, and
 * returns a JSON string with {@code count} (plus {@code sizeInBytes} for
 * {@link AnalyzingSuggester}s).
 *
 * @param state index to build the suggester for
 * @param r parsed request (see {@link #getType})
 * @param params not used by this handler
 * @return the deferred build
 * @throws Exception if the request is invalid or the source cannot be opened
 */
@SuppressWarnings("unchecked")
@Override
public FinishRequest handle(final IndexState state, final Request r, Map<String,List<String>> params) throws Exception {

  // Captured before any parameter is read; re-parsed in finish():
  final String jsonOrig = r.toString();

  final String suggestName = r.getString("suggestName");
  if (!IndexState.isSimpleName(suggestName)) {
    r.fail("suggestName", "invalid suggestName \"" + suggestName + "\": must be [a-zA-Z_][a-zA-Z0-9]*");
  }

  final Lookup suggester = getSuggester(state, suggestName, r);

  Request source = r.getStruct("source");

  InputIterator iterator = null;
  final SearcherAndTaxonomy searcher;

  if (source.hasParam("localFile")) {
    // localFile is a parameter of "source", so failures are reported on
    // source (as is done for weightExpression below), not on r:
    final File localFile = new File(source.getString("localFile"));
    if (!localFile.exists()) {
      source.fail("localFile", "file does not exist");
    }
    if (!localFile.canRead()) {
      source.fail("localFile", "cannot read file");
    }
    searcher = null;
    // Pull suggestions from local file:
    try {
      iterator = new FromFileTermFreqIterator(localFile);
    } catch (IOException ioe) {
      source.fail("localFile", "cannot open file", ioe);
    }
  } else {

    state.verifyStarted(r);

    // Pull suggestions from stored docs:
    if (source.hasParam("searcher")) {
      // Specific searcher version:
      searcher = SearchHandler.getSearcherAndTaxonomy(source, state, null);
    } else {
      // Just use current searcher version:
      searcher = state.manager.acquire();
    }

    // The searcher is normally released by finish(); if anything below
    // throws, finish() never runs, so release it here instead.
    boolean success = false;
    try {
      String suggestField = source.getString("suggestField");
      String payloadField;
      if (source.hasParam("payloadField")) {
        payloadField = source.getString("payloadField");
      } else {
        payloadField = null;
      }

      DocumentDictionary dict;

      if (source.hasParam("weightField")) {
        // Weight is a field
        String weightField = source.getString("weightField");
        dict = new DocumentDictionary(searcher.searcher.getIndexReader(),
                                      suggestField,
                                      weightField,
                                      payloadField);
      } else {
        // Weight is an expression; add bindings for all
        // numeric DV fields:
        Expression expr;
        try {
          expr = JavascriptCompiler.compile(source.getString("weightExpression"));
        } catch (Exception e) {
          source.fail("weightExpression", "expression does not compile", e);
          // Dead code but compiler disagrees
          expr = null;
        }

        dict = new DocumentValueSourceDictionary(searcher.searcher.getIndexReader(),
                                                 suggestField,
                                                 expr.getValueSource(state.exprBindings),
                                                 payloadField);
      }

      iterator = dict.getEntryIterator();
      success = true;
    } finally {
      if (!success) {
        state.manager.release(searcher);
      }
    }
  }

  // nocommit return error if suggester already exists?

  // nocommit need a DeleteSuggestHandler

  final InputIterator iterator0 = iterator;

  return new FinishRequest() {

    @Override
    public String finish() throws IOException {

      try {
        suggester.build(iterator0);
      } finally {
        // Nested so that a failure to close the iterator cannot skip the
        // searcher release:
        try {
          if (iterator0 instanceof Closeable) {
            ((Closeable) iterator0).close();
          }
        } finally {
          if (searcher != null) {
            state.manager.release(searcher);
          }
        }
      }

      File outFile = new File(state.rootDir, "suggest." + suggestName);
      OutputStream out = new FileOutputStream(outFile);
      boolean success = false;
      try {
        // nocommit look @ return value
        suggester.store(out);
        success = true;
      } finally {
        out.close();
        if (!success) {
          // Remove the output file if store did not complete:
          outFile.delete();
        }
      }

      //System.out.println("buildSuggest: now add suggestName=" + suggestName + " to IndexState.name=" + state.name);
      try {
        state.addSuggest(suggestName, (JSONObject) JSONValue.parseStrict(jsonOrig));
      } catch (ParseException pe) {
        // BUG
        throw new RuntimeException(pe);
      }

      JSONObject ret = new JSONObject();
      if (suggester instanceof AnalyzingSuggester) {
        ret.put("sizeInBytes", ((AnalyzingSuggester) suggester).sizeInBytes());
      }
      if (suggester instanceof AnalyzingInfixSuggester) {
        ((AnalyzingInfixSuggester) suggester).commit();
      }

      ret.put("count", suggester.getCount());
      return ret.toString();
    }
  };
}
}