/*
* This software was produced for the U. S. Government
* under Contract No. W15P7T-11-C-F600, and is
* subject to the Rights in Noncommercial Computer Software
* and Noncommercial Computer Software Documentation
* Clause 252.227-7014 (JUN 1995)
*
* Copyright 2013 The MITRE Corporation. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.tagger;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Tags maximal strings of words in a corpus. This is a callback-style API
* in which you implement {@link #tagCallback(int, int, Object)}.
*
* This class should be independently usable outside Solr.
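*
* <p>A minimal usage sketch; the anonymous subclass and the
* {@code TagClusterReducer.LONGEST_DOMINANT_RIGHT} reducer are illustrative
* choices, and obtaining {@code terms}, {@code liveDocs} and the
* {@code tokenStream} is left to the caller:
* <pre>{@code
* Tagger tagger = new Tagger(terms, liveDocs, tokenStream,
*     TagClusterReducer.LONGEST_DOMINANT_RIGHT, false, false) {
*   protected void tagCallback(int startOffset, int endOffset, Object docIdsKey) {
*     // handle a tag covering text[startOffset, endOffset)
*   }
* };
* tagger.process();
* }</pre>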
*/
public abstract class Tagger {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final TokenStream tokenStream;
private final TermToBytesRefAttribute byteRefAtt;
private final PositionIncrementAttribute posIncAtt;
private final OffsetAttribute offsetAtt;
private final TaggingAttribute taggingAtt;
private final TagClusterReducer tagClusterReducer;
private final Terms terms;
private final Bits liveDocs;
private final boolean skipAltTokens;
private final boolean ignoreStopWords;
private Map<BytesRef, IntsRef> docIdsCache;
/** Whether the WARNING about skipped tokens was already logged. */
private boolean loggedSkippedAltTokenWarning = false;
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream,
TagClusterReducer tagClusterReducer, boolean skipAltTokens,
boolean ignoreStopWords) throws IOException {
this.terms = terms;
this.liveDocs = liveDocs;
this.tokenStream = tokenStream;
this.skipAltTokens = skipAltTokens;
this.ignoreStopWords = ignoreStopWords;
byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
taggingAtt = tokenStream.addAttribute(TaggingAttribute.class);
tokenStream.reset();
this.tagClusterReducer = tagClusterReducer;
}
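
/** Enables a cache of docIds, keyed by the term bytes, that is shared by the
* {@link TermPrefixCursor}s created in {@link #process()}; {@code initSize}
* sizes the underlying {@link HashMap}. */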
public void enableDocIdsCache(int initSize) {
if (initSize > 0)
docIdsCache = new HashMap<>(initSize);
}
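
/**
* Consumes the token stream, tagging matches against the terms dictionary and
* invoking {@link #tagCallback(int, int, Object)} as each cluster of
* overlapping tags completes. Ends, but does not close, the token stream.
*/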
public void process() throws IOException {
if (terms == null)
return;
//a shared pointer to the head used by this method and each Tag instance.
final TagLL[] head = new TagLL[1];
TermPrefixCursor cursor = null;//re-used
//boolean switch used to log a warning in case tokens were skipped during tagging.
boolean skippedTokens = false;
while (tokenStream.incrementToken()) {
if (log.isTraceEnabled()) {
log.trace("Token: {}, posInc: {}, offset: [{},{}]",
byteRefAtt, posIncAtt.getPositionIncrement(),
offsetAtt.startOffset(), offsetAtt.endOffset());
}
//check for posInc < 1: alternate tokens at the same position, such as
// query-time synonym expansion (e.g. a synonym filter emitting both "usa"
// and "united states" starting at the same position)
if (posIncAtt.getPositionIncrement() < 1) {
//(a) Deal with this as a configuration issue and throw an exception
if (!skipAltTokens) {
//TODO throw UnsupportedTokenException when PhraseBuilder is ported
throw new IllegalStateException("Query Analyzer generates alternate "
+ "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
+ "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
+ "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
+ "' might result in wrong tagging results if the index time analyzer "
+ "is not configured accordingly. For detailed information see "
+ "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
} else {
//(b) If the index-time analyzer indexed all variants (users need to
// ensure that), processing of alternate tokens can be skipped, as all
// alternatives will already be contained in the FST.
skippedTokens = true;
log.trace(" ... ignored token");
continue;
}
}
//-- If PositionIncrement > 1 (stopwords)
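// A position gap (e.g. where a StopFilter removed tokens) ends any cluster
// in progress, unless stopwords are ignored.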
if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
log.trace(" - posInc > 1 ... mark cluster as done");
advanceTagsAndProcessClusterIfDone(head, null);
}
final BytesRef term;
//NOTE: we need to look up tokens if
// * the TaggingAttribute is taggable, OR
// * there are still advancing tags (to find the longest possible match)
if (taggingAtt.isTaggable() || head[0] != null) {
//-- Get the term bytes for the current token
term = byteRefAtt.getBytesRef();
if (term.length == 0) {
throw new IllegalArgumentException("term: " + term.utf8ToString() + " analyzed to a zero-length token");
}
} else { //no current cluster AND not taggable ...
term = null; //skip this token
}
//-- Process tag
advanceTagsAndProcessClusterIfDone(head, term);
//-- only create new Tags for tokens we need to look up
if (taggingAtt.isTaggable() && term != null) {
//determine if the terms index has a term starting with the provided term
// TODO create a pool of these cursors to reuse them more? could be trivial impl
if (cursor == null)// (else the existing cursor will be re-used)
cursor = new TermPrefixCursor(terms.iterator(), liveDocs, docIdsCache);
if (cursor.advance(term)) {
TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
cursor = null;//because the new tag now "owns" this instance
//and add it to the end
if (head[0] == null) {
head[0] = newTail;
} else {
for (TagLL t = head[0]; true; t = t.nextTag) {
if (t.nextTag == null) {
t.addAfterLL(newTail);
break;
}
}
}
}
}//if isTaggable && term != null
}//end while(incrementToken())
//-- Finish all tags
advanceTagsAndProcessClusterIfDone(head, null);
assert head[0] == null;
if (!loggedSkippedAltTokenWarning && skippedTokens) {
loggedSkippedAltTokenWarning = true; //only log once
log.warn("{}{}{}{}"
, "The Tagger skipped some alternate tokens (tokens with posInc == 0) "
, "while processing text. This may cause problems with some Analyzer "
, "configurations (e.g. query time synonym expansion). For details see "
, "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
}
tokenStream.end();
//tokenStream.close(); caller closes because caller acquired it
}
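
/**
* Advances all live tags with the given term (null forces completion). If no
* tag advanced, the current cluster is done: it is reduced via the
* {@link TagClusterReducer} and {@link #tagCallback} is invoked for each
* remaining tag.
*/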
private void advanceTagsAndProcessClusterIfDone(TagLL[] head, BytesRef term) throws IOException {
//-- Advance tags
final int endOffset = term != null ? offsetAtt.endOffset() : -1;
boolean anyAdvance = false;
for (TagLL t = head[0]; t != null; t = t.nextTag) {
anyAdvance |= t.advance(term, endOffset);
}
//-- Process cluster if done
if (!anyAdvance && head[0] != null) {
tagClusterReducer.reduce(head);
for (TagLL t = head[0]; t != null; t = t.nextTag) {
assert t.value != null;
tagCallback(t.startOffset, t.endOffset, t.value);
}
head[0] = null;
}
}
/**
* Invoked by {@link #process()} for each tag found. endOffset is always &gt;= the endOffset
* given in the previous call.
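*
* <p>A sketch of an implementation (the {@code collect} helper is
* hypothetical):
* <pre>{@code
* IntsRef docIds = lookupDocIds(docIdsKey);
* for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
*   collect(docIds.ints[i], startOffset, endOffset);
* }
* }</pre>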
*
* @param startOffset The character offset of the original stream where the tag starts.
* @param endOffset One more than the character offset of the original stream where the tag ends.
* @param docIdsKey A reference to the matching docIds that can be resolved via {@link #lookupDocIds(Object)}.
*/
protected abstract void tagCallback(int startOffset, int endOffset, Object docIdsKey);

/**
* Returns a sorted array of integer docIds given the corresponding key.
* @param docIdsKey The lookup key.
* @return The docIds; not null.
*/
protected IntsRef lookupDocIds(Object docIdsKey) {
return (IntsRef) docIdsKey;
}
}