/*
* This software was produced for the U. S. Government
* under Contract No. W15P7T-11-C-F600, and is
* subject to the Rights in Noncommercial Computer Software
* and Noncommercial Computer Software Documentation
* Clause 252.227-7014 (JUN 1995)
*
* Copyright 2013 The MITRE Corporation. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Tags maximal strings of words in a corpus. This is a callback-style API
* in which you implement {@link #tagCallback(int, int, Object)}.
*
* This class should be independently usable outside Solr.
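*
* <p>A minimal usage sketch; the anonymous subclass and the
* {@code TagClusterReducer.LONGEST_DOMINANT_RIGHT} reducer are illustrative
* choices, and obtaining {@code terms}, {@code liveDocs} and the
* {@code tokenStream} is left to the caller:
* <pre>{@code
* Tagger tagger = new Tagger(terms, liveDocs, tokenStream,
*     TagClusterReducer.LONGEST_DOMINANT_RIGHT, false, false) {
*   protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
*     // handle a tag covering text[startOffset, endOffset)
*   }
* };
* tagger.process();
* }</pre>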
*/
public abstract class Tagger {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final TokenStream tokenStream;
private final TermToBytesRefAttribute byteRefAtt;
private final PositionIncrementAttribute posIncAtt;
private final OffsetAttribute offsetAtt;
private final TaggingAttribute taggingAtt;
private final TagClusterReducer tagClusterReducer;
private final Terms terms;
private final Bits liveDocs;
private final boolean skipAltTokens;
private final boolean ignoreStopWords;
private Map<BytesRef, IntsRef> docIdsCache;
/** Whether the WARNING about skipped tokens was already logged. */
private boolean loggedSkippedAltTokenWarning = false;
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
TagClusterReducer tagClusterReducer, boolean skipAltTokens,
boolean ignoreStopWords) throws IOException {
this.terms = terms;
this.liveDocs = liveDocs;
this.tokenStream = tokenStream;
this.skipAltTokens = skipAltTokens;
this.ignoreStopWords = ignoreStopWords;
byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
tokenStream.reset();
this.tagClusterReducer = tagClusterReducer;
}
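
/** Enables a cache of docIds, keyed by the term bytes, that is shared by the
* {@link TermPrefixCursor}s created in {@link #process()}; {@code initSize}
* sizes the underlying {@link HashMap}. */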
public void enableDocIdsCache(int initSize) {
if (initSize > 0)
docIdsCache = new HashMap<>(initSize);
}
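
/**
* Consumes the token stream, tagging matches against the terms dictionary and
* invoking {@link #tagCallback(int, int, Object)} as each cluster of
* overlapping tags completes. Ends, but does not close, the token stream.
*/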
public void process() throws IOException {
if (terms == null)
return;
//a shared pointer to the head used by this method and each Tag instance.
final TagLL[] head = new TagLL[1];
TermPrefixCursor cursor = null;//re-used
//boolean switch used to log a warning in case tokens were skipped during tagging.
boolean skippedTokens = false;
while (tokenStream.incrementToken()) {
if (log.isTraceEnabled()) {
log.trace("Token: {}, posInc: {}, offset: [{},{}]",
byteRefAtt, posIncAtt.getPositionIncrement(),
offsetAtt.startOffset(), offsetAtt.endOffset());
}
//check for posInc < 1: alternate tokens at the same position, such as
// query-time synonym expansion (e.g. a synonym filter emitting both "usa"
// and "united states" starting at the same position)
if (posIncAtt.getPositionIncrement() < 1) {
//(a) Deal with this as a configuration issue and throw an exception
if (!skipAltTokens) {
//TODO throw UnsupportedTokenException when PhraseBuilder is ported
throw new IllegalStateException("Query Analyzer generates alternate "
+ "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
+ "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
+ "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
+ "' might result in wrong tagging results if the index time analyzer "
+ "is not configured accordingly. For detailed information see "
+ "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
} else {
//(b) If the index-time analyzer indexed all variants (users need to
// ensure that), processing of alternate tokens can be skipped, as all
// alternatives will already be contained in the FST.
skippedTokens = true;
log.trace(" ... ignored token");
continue;
}
}
//-- If PositionIncrement > 1 (stopwords)
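// A position gap (e.g. where a StopFilter removed tokens) ends any cluster
// in progress, unless stopwords are ignored.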
if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
log.trace(" - posInc > 1 ... mark cluster as done");
advanceTagsAndProcessClusterIfDone(head, null);
}
final BytesRef term;
//NOTE: we need to look up tokens if
// * the TaggingAttribute is taggable, OR
// * there are still advancing tags (to find the longest possible match)
if (taggingAtt.isTaggable() || head[0] != null) {
//-- Get the term bytes for the current token
term = byteRefAtt.getBytesRef();
if (term.length == 0) {
throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
}
} else { //no current cluster AND not taggable ...
term = null; //skip this token
}
//-- Process tag
advanceTagsAndProcessClusterIfDone(head, term);
//-- only create new Tags for tokens we need to look up
if (taggingAtt.isTaggable() && term != null) {
//determine if the terms index has a term starting with the provided term
// TODO create a pool of these cursors to reuse them more? could be trivial impl
if (cursor == null)// (else the existing cursor will be re-used)
cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
if (cursor.advance(term)) {
TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
cursor = null;//because the new tag now "owns" this instance
//and add it to the end
if (head[0] == null) {
head[0] = newTail;
} else {
for (TagLL t = head[0]; true; t = t.nextTag) {
if (t.nextTag == null) {
t.addAfterLL(newTail);
break;
}
}
}
}
}//if isTaggable && term != null
}//end while(incrementToken())
//-- Finish all tags
advanceTagsAndProcessClusterIfDone(head, null);
assert head[0] == null;
if (!loggedSkippedAltTokenWarning && skippedTokens) {
loggedSkippedAltTokenWarning = true; //only log once
log.warn("{}{}{}{}"
, "The Tagger skipped some alternate tokens (tokens with posInc == 0) "
, "while processing text. This may cause problems with some Analyzer "
, "configurations (e.g. query time synonym expansion). For details see "
, "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
}
tokenStream.end();
//tokenStream.close(); caller closes because caller acquired it
}
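
/**
* Advances all live tags with the given term (null forces completion). If no
* tag advanced, the current cluster is done: it is reduced via the
* {@link TagClusterReducer} and {@link #tagCallback} is invoked for each
* remaining tag.
*/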
private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
//-- Advance tags
final int endOffset = term != null ? offsetAtt.endOffset() : -1;
boolean anyAdvance = false;
for (TagLL t = head[0]; t != null; t = t.nextTag) {
anyAdvance |= t.advance(term, endOffset);
}
//-- Process cluster if done
if (!anyAdvance && head[0] != null) {
tagClusterReducer.reduce(head);
for (TagLL t = head[0]; t != null; t = t.nextTag) {
assert t.value != null;
tagCallback(t.startOffset, t.endOffset, t.value);
}
head[0] = null;
}
}
/**
* Invoked by {@link #process()} for each tag found. endOffset is always &gt;= the endOffset
* given in the previous call.
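*
* <p>A sketch of an implementation (the {@code collect} helper is
* hypothetical):
* <pre>{@code
* IntsRef docIds = lookupDocIds(docIdsKey);
* for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
*   collect(docIds.ints[i], startOffset, endOffset);
* }
* }</pre>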
*
* @param startOffset The character offset of the original stream where the tag starts.
* @param endOffset One more than the character offset of the original stream where the tag ends.
* @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
*/
protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

/**
* Returns a sorted array of integer docIds given the corresponding key.
* @param docIdsKey The lookup key.
* @return The docIds; not null.
*/
protected IntsRef lookupDocIds(Object docIdsKey) {
return (IntsRef) docIdsKey;
}
}