/*
* This software was produced for the U. S. Government
* under Contract No. W15P7T-11-C-F600, and is
* subject to the Rights in Noncommercial Computer Software
* and Noncommercial Computer Software Documentation
* Clause 252.227-7014 (JUN 1995)
*
* Copyright 2013 The MITRE Corporation. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.tagger;
import javax.xml.stream.XMLStreamException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import com.google.common.io.CharStreams;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.search.SyntaxError;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Scans posted text, looking for matching strings in the Solr index.
* The public static final String members are request parameters.
* This handler is also called the "SolrTextTagger".
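*
* <p>An illustrative setup (the handler path, field name, and collection name here are
* examples, not requirements): register the handler in solrconfig.xml with the required
* {@code field} param (the indexed dictionary field) fixed in the defaults, then POST
* plain text to it:
* <pre>
*   &lt;requestHandler name="/tag" class="solr.TaggerRequestHandler"&gt;
*     &lt;lst name="defaults"&gt;
*       &lt;str name="field"&gt;name_tag&lt;/str&gt;
*     &lt;/lst&gt;
*   &lt;/requestHandler&gt;
*
*   curl -X POST 'http://localhost:8983/solr/mycollection/tag?overlaps=NO_SUB&amp;tagsLimit=100&amp;fl=id,name&amp;matchText=true' \
*     -H 'Content-Type: text/plain' -d 'My city is London in England'
* </pre>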
*
* @since 7.4.0
*/
public class TaggerRequestHandler extends RequestHandlerBase {
/** Request parameter: tag overlap resolution mode; one of "NO_SUB" (default), "ALL", or "LONGEST_DOMINANT_RIGHT". */
public static final String OVERLAPS = "overlaps";
/** Request parameter: maximum number of tags to return (default 1000). */
public static final String TAGS_LIMIT = "tagsLimit";
/** Request parameter: whether to include the matched substring of the input in each tag (default false). */
public static final String MATCH_TEXT = "matchText";
/** Request parameter: whether to suppress errors from alternate (same-position) tokens, e.g. from query-time synonym expansion (default false). */
public static final String SKIP_ALT_TOKENS = "skipAltTokens";
/** Request parameter: whether stop words in the input should be ignored; defaults to whether the field's indexed analysis chain has a StopFilter. */
public static final String IGNORE_STOPWORDS = "ignoreStopwords";
/** Request parameter: whether tag offsets should be adjusted to be valid in the original XML input (default false). */
public static final String XML_OFFSET_ADJUST = "xmlOffsetAdjust";
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
public String getDescription() {
return "Processes input text to find matching tokens stored in the index.";
}
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
//--Read params
final String indexedField = req.getParams().get("field");
if (indexedField == null)
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "missing required param 'field'");
final TagClusterReducer tagClusterReducer =
chooseTagClusterReducer(req.getParams().get(OVERLAPS));
final int rows = req.getParams().getInt(CommonParams.ROWS, 10000);
final int tagsLimit = req.getParams().getInt(TAGS_LIMIT, 1000);
final boolean addMatchText = req.getParams().getBool(MATCH_TEXT, false);
final SchemaField idSchemaField = req.getSchema().getUniqueKeyField();
if (idSchemaField == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The tagger requires a " +
"uniqueKey in the schema.");//TODO this could be relaxed
}
final boolean skipAltTokens = req.getParams().getBool(SKIP_ALT_TOKENS, false);
final boolean ignoreStopWords = req.getParams().getBool(IGNORE_STOPWORDS,
fieldHasIndexedStopFilter(indexedField, req));
//--Get posted data
Reader inputReader = null;
Iterable<ContentStream> streams = req.getContentStreams();
if (streams != null) {
Iterator<ContentStream> iter = streams.iterator();
if (iter.hasNext()) {
inputReader = iter.next().getReader();
}
if (iter.hasNext()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass().getSimpleName()+" does not support multiple ContentStreams"); //TODO support bulk tagging?
}
}
if (inputReader == null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
getClass().getSimpleName()+" requires text to be POSTed to it");
}
// We may or may not need to read the input into a string
final InputStringLazy inputStringFuture = new InputStringLazy(inputReader);
final OffsetCorrector offsetCorrector = getOffsetCorrector(req.getParams(), inputStringFuture);
final String inputString;//only populated if needed
if (addMatchText || inputStringFuture.inputString != null) {
//Read the input fully into a String buffer that we'll need later,
// then replace the input with a reader wrapping the buffer.
inputString = inputStringFuture.call();
inputReader.close();
inputReader = new StringReader(inputString);
} else {
inputString = null;//not used
}
final SolrIndexSearcher searcher = req.getSearcher();
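//bits are set for every doc referenced by at least one tag; later converted to the "response" DocList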
final FixedBitSet matchDocIdsBS = new FixedBitSet(searcher.maxDoc());
@SuppressWarnings({"rawtypes"})
final List tags = new ArrayList(2000);
try {
Analyzer analyzer = req.getSchema().getField(indexedField).getType().getQueryAnalyzer();
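//analyze the posted text with the field's query analyzer; the field-name arg ("") is unused by it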
try (TokenStream tokenStream = analyzer.tokenStream("", inputReader)) {
Terms terms = searcher.getSlowAtomicReader().terms(indexedField);
if (terms != null) {
Tagger tagger = new Tagger(terms, computeDocCorpus(req), tokenStream, tagClusterReducer,
skipAltTokens, ignoreStopWords) {
@SuppressWarnings("unchecked")
@Override
protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
if (tags.size() >= tagsLimit)
return;
if (offsetCorrector != null) {
int[] offsetPair = offsetCorrector.correctPair(startOffset, endOffset);
if (offsetPair == null) {
log.debug("Discarded offsets [{}, {}] because couldn't balance XML.",
startOffset, endOffset);
return;
}
startOffset = offsetPair[0];
endOffset = offsetPair[1];
}
@SuppressWarnings({"rawtypes"})
NamedList tag = new NamedList();
tag.add("startOffset", startOffset);
tag.add("endOffset", endOffset);
if (addMatchText)
tag.add("matchText", inputString.substring(startOffset, endOffset));
//the lookup below caches its result, and also sets the matching doc bits in matchDocIdsBS
tag.add("ids", lookupSchemaDocIds(docIdsKey));
tags.add(tag);
}
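//cache docIdsKey -> schema ids so repeated matches of the same dictionary entry skip re-resolution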
@SuppressWarnings({"rawtypes"})
Map<Object, List> docIdsListCache = new HashMap<>(2000);
ValueSourceAccessor uniqueKeyCache = new ValueSourceAccessor(searcher,
idSchemaField.getType().getValueSource(idSchemaField, null));
@SuppressWarnings({"unchecked", "rawtypes"})
private List lookupSchemaDocIds(Object docIdsKey) {
List schemaDocIds = docIdsListCache.get(docIdsKey);
if (schemaDocIds != null)
return schemaDocIds;
IntsRef docIds = lookupDocIds(docIdsKey);
//translate lucene docIds to schema ids
schemaDocIds = new ArrayList<>(docIds.length);
for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
int docId = docIds.ints[i];
assert i == docIds.offset || docIds.ints[i - 1] < docId : "not sorted?";
matchDocIdsBS.set(docId);//also, flip docid in bitset
try {
schemaDocIds.add(uniqueKeyCache.objectVal(docId));//translates here
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
assert !schemaDocIds.isEmpty();
docIdsListCache.put(docIdsKey, schemaDocIds);//keyed consistently with the get() above
return schemaDocIds;
}
};
tagger.enableDocIdsCache(2000);//TODO configurable
tagger.process();
}
}
} finally {
inputReader.close();
}
rsp.add("tagsCount", tags.size());
rsp.add("tags", tags);
rsp.setReturnFields(new SolrReturnFields(req));
//Solr's standard name for matching docs in response
rsp.add("response", getDocList(rows, matchDocIdsBS));
}
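/** Reads the underlying Reader fully into a String on the first {@link #call()}, memoizing the result. */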
private static class InputStringLazy implements Callable<String> {
final Reader inputReader;
String inputString;
InputStringLazy(Reader inputReader) {
this.inputReader = inputReader;
}
@Override
public String call() throws IOException {
if (inputString == null) {
inputString = CharStreams.toString(inputReader);
}
return inputString;
}
}
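/**
* Returns an OffsetCorrector so tag offsets can be mapped back to the original XML input, or
* null if XML_OFFSET_ADJUST wasn't requested. Note: may consume the input via the provider.
*/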
protected OffsetCorrector getOffsetCorrector(SolrParams params, Callable<String> inputStringProvider) throws Exception {
final boolean xmlOffsetAdjust = params.getBool(XML_OFFSET_ADJUST, false);
if (!xmlOffsetAdjust) {
return null;
}
try {
return new XmlOffsetCorrector(inputStringProvider.call());
} catch (XMLStreamException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Expecting XML but couldn't parse it as such: " + e, e);
}
}
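/** Builds the response DocList: the first {@code rows} matching docs in ascending docId order, unscored. */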
private DocList getDocList(int rows, FixedBitSet matchDocIdsBS) throws IOException {
//Now we must supply a Solr DocList and add it to the response.
// Typically this is obtained via SolrIndexSearcher.search(), but in this case we
// know exactly which documents to return, and neither order nor scoring matters.
// Ideally DocList could be implemented directly on top of a BitSet, but there are
// far too many methods to implement for a minor payoff.
int matchDocs = matchDocIdsBS.cardinality();
int[] docIds = new int[ Math.min(rows, matchDocs) ];
DocIdSetIterator docIdIter = new BitSetIterator(matchDocIdsBS, 1);
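//BitSetIterator walks set bits in ascending docId order; take the first 'rows' of them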
for (int i = 0; i < docIds.length; i++) {
docIds[i] = docIdIter.nextDoc();
}
return new DocSlice(0, docIds.length, docIds, null, matchDocs, 1f, TotalHits.Relation.EQUAL_TO);
}
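/** Maps the OVERLAPS request parameter to a TagClusterReducer; null means the default, "NO_SUB". */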
private TagClusterReducer chooseTagClusterReducer(String overlaps) {
TagClusterReducer tagClusterReducer;
if (overlaps == null || overlaps.equals("NO_SUB")) {
tagClusterReducer = TagClusterReducer.NO_SUB;
} else if (overlaps.equals("ALL")) {
tagClusterReducer = TagClusterReducer.ALL;
} else if (overlaps.equals("LONGEST_DOMINANT_RIGHT")) {
tagClusterReducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
} else {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"unknown tag overlap mode: "+overlaps);
}
return tagClusterReducer;
}
/**
* The set of documents matching the provided 'fq' (filter queries), excluding deleted
* documents. If null is returned, then all docs are available.
*/
private Bits computeDocCorpus(SolrQueryRequest req) throws SyntaxError, IOException {
final String[] corpusFilterQueries = req.getParams().getParams("fq");
final SolrIndexSearcher searcher = req.getSearcher();
final Bits docBits;
if (corpusFilterQueries != null && corpusFilterQueries.length > 0) {
List<Query> filterQueries = new ArrayList<Query>(corpusFilterQueries.length);
for (String corpusFilterQuery : corpusFilterQueries) {
QParser qParser = QParser.getParser(corpusFilterQuery, null, req);
try {
filterQueries.add(qParser.parse());
} catch (SyntaxError e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
}
}
final DocSet docSet = searcher.getDocSet(filterQueries);//hopefully in the cache
//note: before Solr 4.7 we could call docSet.getBits() but no longer.
if (docSet instanceof BitDocSet) {
docBits = ((BitDocSet)docSet).getBits();
} else {
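//not bitset-backed; adapt the DocSet to the Bits interface via exists() lookups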
docBits = new Bits() {
@Override
public boolean get(int index) {
return docSet.exists(index);
}
@Override
public int length() {
return searcher.maxDoc();
}
};
}
} else {
docBits = searcher.getSlowAtomicReader().getLiveDocs();
}
return docBits;
}
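/** Whether the given field's index-time analysis chain contains a StopFilter. */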
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
FieldType fieldType = req.getSchema().getFieldType(field);
Analyzer analyzer = fieldType.getIndexAnalyzer();//the index-time chain is where stopwords would have been removed
if (analyzer instanceof TokenizerChain) {
TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
if (tokenFilterFactory instanceof StopFilterFactory)
return true;
}
}
return false;
}
/** See LUCENE-4541 or {@link org.apache.solr.response.transform.ValueSourceAugmenter}. */
static class ValueSourceAccessor {
private final List<LeafReaderContext> readerContexts;
private final ValueSource valueSource;
@SuppressWarnings({"rawtypes"})
private final Map fContext;
private final FunctionValues[] functionValuesPerSeg;
private final int[] functionValuesDocIdPerSeg;
ValueSourceAccessor(IndexSearcher searcher, ValueSource valueSource) {
readerContexts = searcher.getIndexReader().leaves();
this.valueSource = valueSource;
fContext = ValueSource.newContext(searcher);
functionValuesPerSeg = new FunctionValues[readerContexts.size()];
functionValuesDocIdPerSeg = new int[readerContexts.size()];
}
@SuppressWarnings({"unchecked"})
Object objectVal(int topDocId) throws IOException {
// lookup segment level stuff:
int segIdx = ReaderUtil.subIndex(topDocId, readerContexts);
LeafReaderContext rcontext = readerContexts.get(segIdx);
int segDocId = topDocId - rcontext.docBase;
// unfortunately Lucene (as of 7.0) requires forward-only traversal of FunctionValues, with no
// reset method. So we track our last docId per segment and re-fetch the FunctionValues
// whenever we'd have to move backward. :-(
FunctionValues functionValues = functionValuesPerSeg[segIdx];
if (functionValues == null || segDocId < functionValuesDocIdPerSeg[segIdx]) {
functionValues = functionValuesPerSeg[segIdx] = valueSource.getValues(fContext, rcontext);
}
functionValuesDocIdPerSeg[segIdx] = segDocId;
// get value:
return functionValues.objectVal(segDocId);
}
}
}