Index: contrib/CHANGES.txt
===================================================================
--- contrib/CHANGES.txt (revision 799153)
+++ contrib/CHANGES.txt (working copy)
@@ -11,7 +11,12 @@
API Changes
- (None)
+ 1. LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards
+ compatibility with some public classes. If you have implemented custom Fragmenters or Scorers,
+ you will need to adjust them to work with the new TokenStream API. Rather than being passed one
+ Token at a time, your implementation is initialized with a TokenStream - store references to the
+ Attributes you are interested in locally and read them on each call to the method that previously
+ received a new Token. Look at the included updated implementations for examples. (Mark Miller)
Bug fixes
@@ -41,9 +46,6 @@
8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size.
(Todd Feak via Otis Gospodnetic)
-
- 9. LUCENE-1752: Missing highlights when terms were repeated in separate, nested, boolean or
- disjunction queries. (Koji Sekiguchi, Mark Miller)
New features
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (working copy)
@@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,24 +17,31 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
/**
- * Implements the policy for breaking text into multiple fragments for consideration
- * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
- * of detecting end of sentences in the text.
+ * Implements the policy for breaking text into multiple fragments for
+ * consideration by the {@link Highlighter} class. A sophisticated
+ * implementation may do this on the basis of detecting end of sentences in the
+ * text.
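+ * <p>
+ * With the new TokenStream API, an implementation stores references to the
+ * Attributes it needs in {@link #start(String, TokenStream)} and reads them in
+ * {@link #isNewFragment()}. A minimal sketch, modelled on
+ * {@link SimpleFragmenter} (the class name and the fixed size of 100 are
+ * illustrative):
+ * <pre>
+ * public class FixedSizeFragmenter implements Fragmenter {
+ *   private OffsetAttribute offsetAtt;
+ *   private int currentNumFrags;
+ *
+ *   public void start(String originalText, TokenStream stream) {
+ *     offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
+ *     currentNumFrags = 1;
+ *   }
+ *
+ *   public boolean isNewFragment() {
+ *     boolean isNewFrag = offsetAtt.endOffset() &gt;= 100 * currentNumFrags;
+ *     if (isNewFrag) {
+ *       currentNumFrags++; // current token crossed a fragment boundary
+ *     }
+ *     return isNewFrag;
+ *   }
+ * }
+ * </pre>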
*/
-public interface Fragmenter
-{
- /**
- * Initializes the Fragmenter
- * @param originalText
- */
- public void start(String originalText);
+public interface Fragmenter {
- /**
- * Test to see if this token from the stream should be held in a new TextFragment
- * @param nextToken
- */
- public boolean isNewFragment(Token nextToken);
+ /**
+ * Initializes the Fragmenter. You can grab references to the Attributes you are
+ * interested in from tokenStream and then access the values in isNewFragment.
+ *
+ * @param originalText
+ * @param tokenStream
+ */
+ public void start(String originalText, TokenStream tokenStream);
+
+
+ /**
+ * Test to see if this token from the stream should be held in a new
+ * TextFragment. Every time this is called, the TokenStream
+ * passed to start(String, TokenStream) will have been incremented.
+ *
+ */
+ public boolean isNewFragment();
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy)
@@ -22,8 +22,10 @@
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.PriorityQueue;
/**
@@ -214,8 +216,14 @@
{
ArrayList docFrags = new ArrayList();
StringBuffer newText=new StringBuffer();
-
+
+ TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);
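+ // added but not stored locally: scorers and fragmenters that need position
+ // increments retrieve this attribute themselves in init()/start()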
+ tokenStream.addAttribute(PositionIncrementAttribute.class);
+ tokenStream.reset();
+
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
+ fragmentScorer.init(tokenStream);
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
@@ -223,28 +231,27 @@
try
{
- final Token reusableToken = new Token();
+
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
- textFragmenter.start(text);
+ textFragmenter.start(text, tokenStream);
- TokenGroup tokenGroup=new TokenGroup();
-
- for (Token nextToken = tokenStream.next(reusableToken);
- (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
- nextToken = tokenStream.next(reusableToken))
+ TokenGroup tokenGroup=new TokenGroup(tokenStream);
+
+ for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
+ next = tokenStream.incrementToken())
{
- if( (nextToken.endOffset()>text.length())
+ if( (offsetAtt.endOffset()>text.length())
||
- (nextToken.startOffset()>text.length())
+ (offsetAtt.startOffset()>text.length())
)
{
- throw new InvalidTokenOffsetsException("Token "+nextToken.toString()
+ throw new InvalidTokenOffsetsException("Token "+ termAtt.term()
+" exceeds length of provided text sized "+text.length());
}
- if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
+ if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
@@ -260,7 +267,7 @@
tokenGroup.clear();
//check if current token marks the start of a new fragment
- if(textFragmenter.isNewFragment(nextToken))
+ if(textFragmenter.isNewFragment())
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
@@ -271,7 +278,7 @@
}
}
- tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken));
+ tokenGroup.addToken(fragmentScorer.getTokenScore());
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
@@ -332,7 +339,7 @@
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
- fragQueue.insert(currentFrag);
+ fragQueue.insertWithOverflow(currentFrag);
}
//return the most relevant fragments
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (working copy)
@@ -16,17 +16,18 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
/**
* {@link Fragmenter} implementation which does not fragment the text.
* This is useful for highlighting the entire content of a document or field.
*/
public class NullFragmenter implements Fragmenter {
- public void start(String s) {
+ public void start(String s, TokenStream tokenStream) {
}
- public boolean isNewFragment(Token token) {
+ public boolean isNewFragment() {
return false;
}
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy)
@@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -19,134 +20,142 @@
import java.util.HashMap;
import java.util.HashSet;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
/**
- * {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
- * This class uses the {@link QueryTermExtractor} class to process determine the query terms and
- * their boosts to be used.
+ * {@link Scorer} implementation which scores text fragments by the number of
+ * unique query terms found. This class uses the {@link QueryTermExtractor}
+ * class to determine the query terms and their boosts to be used.
*/
-//TODO: provide option to boost score of fragments near beginning of document
+// TODO: provide option to boost score of fragments near beginning of document
// based on fragment.getFragNum()
-public class QueryScorer implements Scorer
-{
- TextFragment currentTextFragment=null;
- HashSet uniqueTermsInFragment;
- float totalScore=0;
- float maxTermWeight=0;
- private HashMap termsToFind;
-
+public class QueryScorer implements Scorer {
+
+ TextFragment currentTextFragment = null;
+ HashSet uniqueTermsInFragment;
+
+ float totalScore = 0;
+ float maxTermWeight = 0;
+ private HashMap termsToFind;
+
+ private TermAttribute termAtt;
+
+ /**
+ *
+ * @param query a Lucene query (ideally rewritten using query.rewrite before
+ * being passed to this class and the searcher)
+ */
+ public QueryScorer(Query query) {
+ this(QueryTermExtractor.getTerms(query));
+ }
+
+ /**
+ *
+ * @param query a Lucene query (ideally rewritten using query.rewrite before
+ * being passed to this class and the searcher)
+ * @param fieldName the Field name which is used to match Query terms
+ */
+ public QueryScorer(Query query, String fieldName) {
+ this(QueryTermExtractor.getTerms(query, false, fieldName));
+ }
+
+ /**
+ *
+ * @param query a Lucene query (ideally rewritten using query.rewrite before
+ * being passed to this class and the searcher)
+ * @param reader used to compute IDF which can be used to a) score selected
+ * fragments better b) use graded highlights eg set font color
+ * intensity
+ * @param fieldName the field on which Inverse Document Frequency (IDF)
+ * calculations are based
+ */
+ public QueryScorer(Query query, IndexReader reader, String fieldName) {
+ this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
+ }
- /**
- *
- * @param query a Lucene query (ideally rewritten using query.rewrite
- * before being passed to this class and the searcher)
- */
- public QueryScorer(Query query)
- {
- this(QueryTermExtractor.getTerms(query));
- }
-
- /**
- *
- * @param query a Lucene query (ideally rewritten using query.rewrite
- * before being passed to this class and the searcher)
- * @param fieldName the Field name which is used to match Query terms
- */
- public QueryScorer(Query query, String fieldName)
- {
- this(QueryTermExtractor.getTerms(query, false,fieldName));
- }
+ public QueryScorer(WeightedTerm[] weightedTerms) {
+ termsToFind = new HashMap();
+ for (int i = 0; i < weightedTerms.length; i++) {
+ WeightedTerm existingTerm = (WeightedTerm) termsToFind
+ .get(weightedTerms[i].term);
+ if ((existingTerm == null)
+ || (existingTerm.weight < weightedTerms[i].weight)) {
+ // if a term is defined more than once, always use the highest scoring
+ // weight
+ termsToFind.put(weightedTerms[i].term, weightedTerms[i]);
+ maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
+ }
+ }
+ }
- /**
- *
- * @param query a Lucene query (ideally rewritten using query.rewrite
- * before being passed to this class and the searcher)
- * @param reader used to compute IDF which can be used to a) score selected fragments better
- * b) use graded highlights eg set font color intensity
- * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
- */
- public QueryScorer(Query query, IndexReader reader, String fieldName)
- {
- this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
- }
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
+ */
+ public void init(TokenStream tokenStream) {
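+ // the Highlighter adds a TermAttribute to the stream before calling init,
+ // so getAttribute will not fail here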
+ termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+ }
- public QueryScorer(WeightedTerm []weightedTerms )
- {
- termsToFind = new HashMap();
- for (int i = 0; i < weightedTerms.length; i++)
- {
- WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term);
- if( (existingTerm==null) ||(existingTerm.weight<weightedTerms[i].weight) )
- {
- //if a term is defined more than once, always use the highest scoring weight
- termsToFind.put(weightedTerms[i].term,weightedTerms[i]);
- maxTermWeight=Math.max(maxTermWeight,weightedTerms[i].getWeight());
- }
- }
- }
-
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
+ */
+ public void startFragment(TextFragment newFragment) {
+ uniqueTermsInFragment = new HashSet();
+ currentTextFragment = newFragment;
+ totalScore = 0;
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
- */
- public void startFragment(TextFragment newFragment)
- {
- uniqueTermsInFragment = new HashSet();
- currentTextFragment=newFragment;
- totalScore=0;
-
- }
-
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
- */
- public float getTokenScore(Token token)
- {
- String termText=token.term();
-
- WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);
- if(queryTerm==null)
- {
- //not a query term - return
- return 0;
- }
- //found a query term - is it unique in this doc?
- if(!uniqueTermsInFragment.contains(termText))
- {
- totalScore+=queryTerm.getWeight();
- uniqueTermsInFragment.add(termText);
- }
- return queryTerm.getWeight();
- }
-
-
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
- */
- public float getFragmentScore()
- {
- return totalScore;
- }
+ }
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
- */
- public void allFragmentsProcessed()
- {
- //this class has no special operations to perform at end of processing
- }
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
+ */
+ public float getTokenScore() {
+ String termText = termAtt.term();
- /**
- *
- * @return The highest weighted term (useful for passing to GradientFormatter to set
- * top end of coloring scale.
- */
- public float getMaxTermWeight()
- {
- return maxTermWeight;
+ WeightedTerm queryTerm = (WeightedTerm) termsToFind.get(termText);
+ if (queryTerm == null) {
+ // not a query term - return
+ return 0;
}
+ // found a query term - is it unique in this doc?
+ if (!uniqueTermsInFragment.contains(termText)) {
+ totalScore += queryTerm.getWeight();
+ uniqueTermsInFragment.add(termText);
+ }
+ return queryTerm.getWeight();
+ }
+
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
+ */
+ public float getFragmentScore() {
+ return totalScore;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
+ */
+ public void allFragmentsProcessed() {
+ // this class has no special operations to perform at end of processing
+ }
+
+ /**
+ *
+ * @return The highest weighted term (useful for passing to GradientFormatter
+ * to set the top end of the coloring scale).
+ */
+ public float getMaxTermWeight() {
+ return maxTermWeight;
+ }
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (working copy)
@@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,34 +17,45 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
/**
* Adds to the score for a fragment based on its tokens
*/
-public interface Scorer
-{
- /**
- * called when a new fragment is started for consideration
- * @param newFragment
- */
- public void startFragment(TextFragment newFragment);
+public interface Scorer {
- /**
- * Called for each token in the current fragment
- * @param token The token to be scored
- * @return a score which is passed to the Highlighter class to influence the mark-up of the text
- * (this return value is NOT used to score the fragment)
- */
- public float getTokenScore(Token token);
-
+ /**
+ * Called to init the Scorer with a TokenStream. You can grab references to
+ * the attributes you are interested in here and access them from
+ * getTokenScore().
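+ * <p>
+ * A minimal sketch (the termsOfInterest field is illustrative, not part of the
+ * API):
+ * <pre>
+ * private TermAttribute termAtt;
+ *
+ * public void init(TokenStream tokenStream) {
+ *   termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+ * }
+ *
+ * public float getTokenScore() {
+ *   // termAtt reflects the token the Highlighter has just advanced to
+ *   return termsOfInterest.contains(termAtt.term()) ? 1.0f : 0.0f;
+ * }
+ * </pre>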
+ *
+ * @param tokenStream
+ */
+ public void init(TokenStream tokenStream);
- /**
- * Called when the highlighter has no more tokens for the current fragment - the scorer returns
- * the weighting it has derived for the most recent fragment, typically based on the tokens
- * passed to getTokenScore().
- *
- */
- public float getFragmentScore();
+ /**
+ * called when a new fragment is started for consideration
+ *
+ * @param newFragment
+ */
+ public void startFragment(TextFragment newFragment);
+
+ /**
+ * Called for each token in the current fragment. The Highlighter will
+ * increment the TokenStream passed to init on every call.
+ *
+ * @return a score which is passed to the Highlighter class to influence the
+ * mark-up of the text (this return value is NOT used to score the
+ * fragment)
+ */
+ public float getTokenScore();
+
+ /**
+ * Called when the highlighter has no more tokens for the current fragment -
+ * the scorer returns the weighting it has derived for the most recent
+ * fragment, typically based on the tokens passed to getTokenScore().
+ *
+ */
+ public float getFragmentScore();
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (working copy)
@@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,69 +17,64 @@
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
/**
- * {@link Fragmenter} implementation which breaks text up into same-size
+ * {@link Fragmenter} implementation which breaks text up into same-size
* fragments with no concerns over spotting sentence boundaries.
*/
-public class SimpleFragmenter implements Fragmenter
-{
- private static final int DEFAULT_FRAGMENT_SIZE =100;
- private int currentNumFrags;
- private int fragmentSize;
+public class SimpleFragmenter implements Fragmenter {
+ private static final int DEFAULT_FRAGMENT_SIZE = 100;
+ private int currentNumFrags;
+ private int fragmentSize;
+ private OffsetAttribute offsetAtt;
+ public SimpleFragmenter() {
+ this(DEFAULT_FRAGMENT_SIZE);
+ }
- public SimpleFragmenter()
- {
- this(DEFAULT_FRAGMENT_SIZE);
- }
+ /**
+ *
+ * @param fragmentSize size in number of characters of each fragment
+ */
+ public SimpleFragmenter(int fragmentSize) {
+ this.fragmentSize = fragmentSize;
+ }
- /**
- *
- * @param fragmentSize size in number of characters of each fragment
- */
- public SimpleFragmenter(int fragmentSize)
- {
- this.fragmentSize=fragmentSize;
- }
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
+ */
+ public void start(String originalText, TokenStream stream) {
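+ // the Highlighter adds an OffsetAttribute before start() is called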
+ offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
+ currentNumFrags = 1;
+ }
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String)
- */
- public void start(String originalText)
- {
- currentNumFrags=1;
- }
- /* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token)
- */
- public boolean isNewFragment(Token token)
- {
- boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags);
- if(isNewFrag)
- {
- currentNumFrags++;
- }
- return isNewFrag;
- }
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
+ */
+ public boolean isNewFragment() {
+ boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags);
+ if (isNewFrag) {
+ currentNumFrags++;
+ }
+ return isNewFrag;
+ }
- /**
- * @return size in number of characters of each fragment
- */
- public int getFragmentSize()
- {
- return fragmentSize;
- }
+ /**
+ * @return size in number of characters of each fragment
+ */
+ public int getFragmentSize() {
+ return fragmentSize;
+ }
- /**
- * @param size size in characters of each fragment
- */
- public void setFragmentSize(int size)
- {
- fragmentSize = size;
- }
+ /**
+ * @param size size in characters of each fragment
+ */
+ public void setFragmentSize(int size) {
+ fragmentSize = size;
+ }
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (working copy)
@@ -17,10 +17,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-import org.apache.lucene.analysis.Token;
-
import java.util.List;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* {@link Fragmenter} implementation which breaks text up into same-size
@@ -34,6 +37,9 @@
private SpanScorer spanScorer;
private int waitForPos = -1;
private int textSize;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncAtt;
+ private OffsetAttribute offsetAtt;
/**
* @param spanscorer SpanScorer that was used to score hits
@@ -50,12 +56,12 @@
this.fragmentSize = fragmentSize;
this.spanScorer = spanscorer;
}
-
+
/* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token)
+ * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment()
*/
- public boolean isNewFragment(Token token) {
- position += token.getPositionIncrement();
+ public boolean isNewFragment() {
+ position += posIncAtt.getPositionIncrement();
if (waitForPos == position) {
waitForPos = -1;
@@ -63,7 +69,7 @@
return false;
}
- WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term());
+ WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(termAtt.term());
if (wSpanTerm != null) {
List positionSpans = wSpanTerm.getPositionSpans();
@@ -76,8 +82,8 @@
}
}
- boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags)
- && (textSize - token.endOffset()) >= (fragmentSize >>> 1);
+ boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags)
+ && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1);
if (isNewFrag) {
currentNumFrags++;
@@ -86,12 +92,16 @@
return isNewFrag;
}
+
/* (non-Javadoc)
- * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
+ * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream)
*/
- public void start(String originalText) {
+ public void start(String originalText, TokenStream tokenStream) {
position = -1;
currentNumFrags = 1;
textSize = originalText.length();
+ termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+ posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class);
+ offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
}
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (working copy)
@@ -7,9 +7,10 @@
import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.Query;
@@ -26,6 +27,8 @@
private float maxTermWeight;
private int position = -1;
private String defaultField;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncAtt;
private static boolean highlightCnstScrRngQuery;
/**
@@ -176,9 +179,9 @@
* @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
* int)
*/
- public float getTokenScore(Token token) {
- position += token.getPositionIncrement();
- String termText = token.term();
+ public float getTokenScore() {
+ position += posIncAtt.getPositionIncrement();
+ String termText = termAtt.term();
WeightedSpanTerm weightedSpanTerm;
@@ -203,6 +206,11 @@
return score;
}
+ public void init(TokenStream tokenStream) {
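+ // grab the attributes once; getTokenScore() reads them on each incrementToken()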
+ termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+ posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class);
+ }
+
/**
* Retrieve the WeightedSpanTerm for the specified token. Useful for passing
* Span information to a Fragmenter.
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy)
@@ -1,4 +1,5 @@
package org.apache.lucene.search.highlight;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -15,118 +16,117 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
- * One, or several overlapping tokens, along with the score(s) and the
- * scope of the original text
+ * One, or several overlapping tokens, along with the score(s) and the scope of
+ * the original text
*/
-public class TokenGroup
-{
-
- private static final int MAX_NUM_TOKENS_PER_GROUP=50;
- Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP];
- float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP];
- int numTokens=0;
- int startOffset=0;
- int endOffset=0;
- float tot;
+public class TokenGroup {
+ private static final int MAX_NUM_TOKENS_PER_GROUP = 50;
+ Token[] tokens = new Token[MAX_NUM_TOKENS_PER_GROUP];
+ float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP];
+ int numTokens = 0;
+ int startOffset = 0;
+ int endOffset = 0;
+ float tot;
int matchStartOffset, matchEndOffset;
+ private OffsetAttribute offsetAtt;
+ private TermAttribute termAtt;
- void addToken(Token token, float score)
- {
- if(numTokens < MAX_NUM_TOKENS_PER_GROUP)
- {
- if(numTokens==0)
- {
- startOffset=matchStartOffset=token.startOffset();
- endOffset=matchEndOffset=token.endOffset();
- tot += score;
- }
- else
- {
- startOffset=Math.min(startOffset,token.startOffset());
- endOffset=Math.max(endOffset,token.endOffset());
- if (score>0) {
- if (tot==0) {
- matchStartOffset=token.startOffset();
- matchEndOffset=token.endOffset();
+ public TokenGroup(TokenStream tokenStream) {
+ offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
+ termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+ }
+
+ void addToken(float score) {
+ if (numTokens < MAX_NUM_TOKENS_PER_GROUP) {
+ int termStartOffset = offsetAtt.startOffset();
+ int termEndOffset = offsetAtt.endOffset();
+ if (numTokens == 0) {
+ startOffset = matchStartOffset = termStartOffset;
+ endOffset = matchEndOffset = termEndOffset;
+ tot += score;
+ } else {
+ startOffset = Math.min(startOffset, termStartOffset);
+ endOffset = Math.max(endOffset, termEndOffset);
+ if (score > 0) {
+ if (tot == 0) {
+ matchStartOffset = termStartOffset;
+ matchEndOffset = termEndOffset;
} else {
- matchStartOffset=Math.min(matchStartOffset,token.startOffset());
- matchEndOffset=Math.max(matchEndOffset,token.endOffset());
+ matchStartOffset = Math.min(matchStartOffset, termStartOffset);
+ matchEndOffset = Math.max(matchEndOffset, termEndOffset);
}
- tot+=score;
+ tot += score;
}
}
- tokens[numTokens]= (Token) token.clone();
- scores[numTokens]=score;
- numTokens++;
- }
- }
+ Token token = new Token(termStartOffset, termEndOffset);
+ token.setTermBuffer(termAtt.term());
+ tokens[numTokens] = token;
+ scores[numTokens] = score;
+ numTokens++;
+ }
+ }
- boolean isDistinct(Token token)
- {
- return token.startOffset()>=endOffset;
- }
+ boolean isDistinct() {
+ return offsetAtt.startOffset() >= endOffset;
+ }
+ void clear() {
+ numTokens = 0;
+ tot = 0;
+ }
+
+ /*
+ * @param index a value between 0 and numTokens -1
+ * @return the "n"th token
+ */
+ public Token getToken(int index)
+ {
+ return tokens[index];
+ }
- void clear()
- {
- numTokens=0;
- tot=0;
- }
-
- /**
- *
- * @param index a value between 0 and numTokens -1
- * @return the "n"th token
- */
- public Token getToken(int index)
- {
- return tokens[index];
- }
+ /**
+ *
+ * @param index a value between 0 and numTokens -1
+ * @return the "n"th score
+ */
+ public float getScore(int index) {
+ return scores[index];
+ }
- /**
- *
- * @param index a value between 0 and numTokens -1
- * @return the "n"th score
- */
- public float getScore(int index)
- {
- return scores[index];
- }
+ /**
+ * @return the end position in the original text
+ */
+ public int getEndOffset() {
+ return endOffset;
+ }
- /**
- * @return the end position in the original text
- */
- public int getEndOffset()
- {
- return endOffset;
- }
+ /**
+ * @return the number of tokens in this group
+ */
+ public int getNumTokens() {
+ return numTokens;
+ }
- /**
- * @return the number of tokens in this group
- */
- public int getNumTokens()
- {
- return numTokens;
- }
+ /**
+ * @return the start position in the original text
+ */
+ public int getStartOffset() {
+ return startOffset;
+ }
- /**
- * @return the start position in the original text
- */
- public int getStartOffset()
- {
- return startOffset;
- }
-
- /**
- * @return all tokens' scores summed up
- */
- public float getTotalScore()
- {
- return tot;
- }
+ /**
+ * @return all tokens' scores summed up
+ */
+ public float getTotalScore() {
+ return tot;
+ }
}
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 797692)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy)
@@ -29,6 +29,8 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
@@ -135,32 +137,45 @@
* @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking
* to eek out the last drops of performance, set to true. If in doubt, set to false.
*/
- public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous)
- {
+ public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) {
//an object used to iterate across an array of tokens
- class StoredTokenStream extends TokenStream
- {
- Token tokens[];
- int currentToken=0;
- StoredTokenStream(Token tokens[])
- {
- this.tokens=tokens;
+ class StoredTokenStream extends TokenStream {
+ Token tokens[];
+ int currentToken = 0;
+ TermAttribute termAtt;
+ OffsetAttribute offsetAtt;
+
+ StoredTokenStream(Token tokens[]) {
+ this.tokens = tokens;
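+ // register the attributes that incrementToken() will populate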
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ }
+
+ public Token next(final Token reusableToken) {
+ System.out.println("next token");
+ assert reusableToken != null;
+ if (currentToken >= tokens.length) {
+ return null;
}
- public Token next(final Token reusableToken)
- {
- assert reusableToken != null;
- if(currentToken>=tokens.length)
- {
- return null;
- }
- return tokens[currentToken++];
- }
- }
+ return tokens[currentToken++];
+ }
+
+ public boolean incrementToken() throws IOException {
+ System.out.println("inc token");
+ if (currentToken >= tokens.length) {
+ return false;
+ }
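+ // copy the next cached Token's state into this stream's attributes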
+ Token token = tokens[currentToken++];
+ termAtt.setTermBuffer(token.term());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ return true;
+ }
+ }
//code to reconstruct the original sequence of Tokens
String[] terms=tpv.getTerms();
int[] freq=tpv.getTermFrequencies();
int totalTokens=0;
- Token newToken = new Token();
+
for (int t = 0; t < freq.length; t++)
{
totalTokens+=freq[t];
@@ -190,8 +205,9 @@
}
for (int tp = 0; tp < offsets.length; tp++)
{
- newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
- unsortedTokens.add(newToken.clone());
+ Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
+ token.setTermBuffer(terms[t]);
+ unsortedTokens.add(token);
}
}
else
@@ -204,8 +220,8 @@
//tokens stored with positions - can use this to index straight into sorted array
for (int tp = 0; tp < pos.length; tp++)
{
- newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
- tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone();
+ Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
+ tokensInOriginalOrder[pos[tp]] = token;
}
}
}
@@ -218,7 +234,7 @@
{
Token t1=(Token) o1;
Token t2=(Token) o2;
- if(t1.startOffset()>t2.startOffset())
+ if(t1.startOffset()>t2.endOffset())
return 1;
if(t1.startOffset()<t2.startOffset())
return -1;
Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
===================================================================
--- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 799153)
+++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (working copy)
@@ -42,8 +42,8 @@
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
@@ -98,7 +98,7 @@
private void extract(Query query, Map terms) throws IOException {
if (query instanceof BooleanQuery) {
BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses();
-
+
for (int i = 0; i < queryClauses.length; i++) {
if (!queryClauses[i].isProhibited()) {
extract(queryClauses[i].getQuery(), terms);
@@ -441,7 +441,7 @@
* This class makes sure that if both position sensitive and insensitive
* versions of the same term are added, the position insensitive one wins.
*/
- private class PositionCheckingMap extends HashMap {
+ private static class PositionCheckingMap extends HashMap {
public void putAll(Map m) {
Iterator it = m.keySet().iterator();
Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
===================================================================
--- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 799153)
+++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy)
@@ -38,10 +38,14 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
@@ -53,18 +57,17 @@
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.ConstantScoreRangeQuery;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiSearcher;
-import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeFilter;
-import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.BooleanClause.Occur;
@@ -75,6 +78,7 @@
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
@@ -87,7 +91,7 @@
static final String FIELD_NAME = "contents";
private Query query;
RAMDirectory ramDir;
- public Searcher searcher = null;
+ public IndexSearcher searcher = null;
public Hits hits = null;
int numHighlights = 0;
Analyzer analyzer = new StandardAnalyzer();
@@ -108,11 +112,40 @@
super(arg0);
}
+ public void testHits() throws Exception {
+ Analyzer analyzer = new SimpleAnalyzer();
+ QueryParser qp = new QueryParser(FIELD_NAME, analyzer);
+ query = qp.parse("\"very long\"");
+ searcher = new IndexSearcher(ramDir, false);
+ TopDocs hits = searcher.search(query, 10);
+
+ Highlighter highlighter = new Highlighter(null);
+
+
+ for (int i = 0; i < hits.scoreDocs.length; i++) {
+ Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ String storedField = doc.get(FIELD_NAME);
+
+ TokenStream stream = TokenSources.getAnyTokenStream(searcher
+ .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ CachingTokenFilter ctf = new CachingTokenFilter(stream);
+ SpanScorer scorer = new SpanScorer(query, FIELD_NAME, ctf);
+ // ctf.reset();
+ Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
+ highlighter.setFragmentScorer(scorer);
+ highlighter.setTextFragmenter(fragmenter);
+
+ String fragment = highlighter.getBestFragment(ctf, storedField);
+
+ System.out.println(fragment);
+ }
+ }
+
public void testHighlightingWithDefaultField() throws Exception {
String s1 = "I call our world Flatland, not because we call it so,";
- QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer());
+ QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer(Version.LUCENE_CURRENT));
// Verify that a query against the default field results in text being
// highlighted
@@ -144,7 +177,7 @@
*/
private static String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
- CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream(
+ CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer(Version.LUCENE_CURRENT).tokenStream(
fieldName, new StringReader(text)));
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
@@ -908,10 +941,12 @@
Query query = parser.parse(srchkey);
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s));
+
Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this);
// Get 3 best fragments and seperate with a "..."
tokenStream = analyzer.tokenStream(null, new StringReader(s));
+
String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult
@@ -1075,10 +1110,11 @@
}
public void testUnRewrittenQuery() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
+ final TestHighlightRunner helper = new TestHighlightRunner() {
public void run() throws Exception {
numHighlights = 0;
+ SpanScorer.setHighlightCnstScrRngQuery(false);
// test to show how rewritten query can still be used
searcher = new IndexSearcher(ramDir);
Analyzer analyzer = new StandardAnalyzer();
@@ -1154,13 +1190,17 @@
public void startFragment(TextFragment newFragment) {
}
- public float getTokenScore(Token token) {
+ public float getTokenScore() {
return 0;
}
public float getFragmentScore() {
return 1;
}
+
+ public void init(TokenStream tokenStream) {
+
+ }
});
highlighter.setTextFragmenter(new SimpleFragmenter(2000));
TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent));
@@ -1292,27 +1332,44 @@
return new TokenStream() {
Iterator iter;
List lst;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private OffsetAttribute offsetAtt;
{
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
lst = new ArrayList();
Token t;
t = createToken("hi", 0, 2);
+ t.setPositionIncrement(1);
lst.add(t);
t = createToken("hispeed", 0, 8);
+ t.setPositionIncrement(1);
lst.add(t);
t = createToken("speed", 3, 8);
t.setPositionIncrement(0);
lst.add(t);
t = createToken("10", 8, 10);
+ t.setPositionIncrement(1);
lst.add(t);
t = createToken("foo", 11, 14);
+ t.setPositionIncrement(1);
lst.add(t);
iter = lst.iterator();
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- return iter.hasNext() ? (Token) iter.next() : null;
+ public boolean incrementToken() throws IOException {
+ if(iter.hasNext()) {
+ Token token = (Token) iter.next();
+ termAtt.setTermBuffer(token.term());
+ posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ return true;
+ }
+ return false;
}
+
};
}
@@ -1322,26 +1379,42 @@
return new TokenStream() {
Iterator iter;
List lst;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private OffsetAttribute offsetAtt;
{
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
lst = new ArrayList();
Token t;
t = createToken("hispeed", 0, 8);
+ t.setPositionIncrement(1);
lst.add(t);
t = createToken("hi", 0, 2);
t.setPositionIncrement(0);
lst.add(t);
t = createToken("speed", 3, 8);
+ t.setPositionIncrement(1);
lst.add(t);
t = createToken("10", 8, 10);
+ t.setPositionIncrement(1);
lst.add(t);
t = createToken("foo", 11, 14);
+ t.setPositionIncrement(1);
lst.add(t);
iter = lst.iterator();
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
- return iter.hasNext() ? (Token) iter.next() : null;
+ public boolean incrementToken() throws IOException {
+ if(iter.hasNext()) {
+ Token token = (Token) iter.next();
+ termAtt.setTermBuffer(token.term());
+ posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ return true;
+ }
+ return false;
}
};
}
@@ -1611,7 +1684,11 @@
* java.io.Reader)
*/
public TokenStream tokenStream(String arg0, Reader arg1) {
- return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms);
+ LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1);
+ stream.addAttribute(TermAttribute.class);
+ stream.addAttribute(PositionIncrementAttribute.class);
+ stream.addAttribute(OffsetAttribute.class);
+ return new SynonymTokenizer(stream, synonyms);
}
}
@@ -1622,47 +1699,70 @@
class SynonymTokenizer extends TokenStream {
private TokenStream realStream;
private Token currentRealToken = null;
private Map synonyms;
StringTokenizer st = null;
+ private TermAttribute realTermAtt;
+ private PositionIncrementAttribute realPosIncrAtt;
+ private OffsetAttribute realOffsetAtt;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+ private OffsetAttribute offsetAtt;
public SynonymTokenizer(TokenStream realStream, Map synonyms) {
this.realStream = realStream;
this.synonyms = synonyms;
+ realTermAtt = (TermAttribute) realStream.getAttribute(TermAttribute.class);
+ realPosIncrAtt = (PositionIncrementAttribute) realStream.getAttribute(PositionIncrementAttribute.class);
+ realOffsetAtt = (OffsetAttribute) realStream.getAttribute(OffsetAttribute.class);
+
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
}
- public Token next(final Token reusableToken) throws IOException {
- assert reusableToken != null;
+ public boolean incrementToken() throws IOException {
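+ // two states: no pending synonyms - advance the real stream; otherwise emit
+ // the next synonym at the same position (position increment 0)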
+
if (currentRealToken == null) {
- Token nextRealToken = realStream.next(reusableToken);
- if (nextRealToken == null) {
- return null;
+ boolean next = realStream.incrementToken();
+ if (!next) {
+ return false;
}
- String expansions = (String) synonyms.get(nextRealToken.term());
+ termAtt.setTermBuffer(realTermAtt.term());
+ offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
+ posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement());
+ System.out.println("term:" + realTermAtt.term());
+ String expansions = (String) synonyms.get(realTermAtt.term());
if (expansions == null) {
- return nextRealToken;
+ return true;
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens()) {
- currentRealToken = (Token) nextRealToken.clone();
+ currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
+ currentRealToken.setTermBuffer(realTermAtt.term());
}
- return currentRealToken;
+
+ return true;
} else {
- reusableToken.reinit(st.nextToken(),
- currentRealToken.startOffset(),
- currentRealToken.endOffset());
- reusableToken.setPositionIncrement(0);
+ String tok = st.nextToken();
+ termAtt.setTermBuffer(tok);
+ offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset());
+ posIncrAtt.setPositionIncrement(0);
if (!st.hasMoreTokens()) {
currentRealToken = null;
st = null;
}
- return reusableToken;
+ return true;
}
+
}
static abstract class TestHighlightRunner {
static final int STANDARD = 0;
static final int SPAN = 1;
int mode = STANDARD;
+ Fragmenter frag = new SimpleFragmenter(20);
public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream,
Formatter formatter) {
@@ -1725,7 +1825,7 @@
if (mode == SPAN) {
((CachingTokenFilter) tokenStream).reset();
}
- highlighter.setTextFragmenter(new SimpleFragmenter(20));
+ highlighter.setTextFragmenter(frag);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
fragmentSeparator);