| Index: contrib/CHANGES.txt |
| =================================================================== |
| --- contrib/CHANGES.txt (revision 799153) |
| +++ contrib/CHANGES.txt (working copy) |
| @@ -11,7 +11,12 @@ |
| |
| API Changes |
| |
| - (None) |
| + 1. LUCENE-1695: Update the Highlighter to use the new TokenStream API. This issue breaks backwards |
| + compatibility with some public classes. If you have implemented custom Fragmenters or Scorers, |
| + you will need to adjust them to work with the new TokenStream API. Rather than getting passed a |
| + Token at a time, you will be given a TokenStream to init your impl with - store the Attributes |
| + you are interested in locally and access them on each call to the method that used to pass a new |
| + Token. Look at the included updated impls for examples. (Mark Miller) |
| |
| Bug fixes |
| |
| @@ -41,9 +46,6 @@ |
| |
| 8. LUCENE-1491: EdgeNGramTokenFilter no longer stops on tokens shorter than minimum n-gram size. |
| (Todd Teak via Otis Gospodnetic) |
| - |
| - 9. LUCENE-1752: Missing highlights when terms were repeated in separate, nested, boolean or |
| - disjunction queries. (Koji Sekiguchi, Mark Miller) |
| |
| New features |
| |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java (working copy) |
| @@ -1,4 +1,5 @@ |
| package org.apache.lucene.search.highlight; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -16,24 +17,31 @@ |
| * limitations under the License. |
| */ |
| |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| |
| /** |
| - * Implements the policy for breaking text into multiple fragments for consideration |
| - * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis |
| - * of detecting end of sentences in the text. |
| + * Implements the policy for breaking text into multiple fragments for |
| + * consideration by the {@link Highlighter} class. A sophisticated |
| + * implementation may do this on the basis of detecting end of sentences in the |
| + * text. |
| */ |
| -public interface Fragmenter |
| -{ |
| - /** |
| - * Initializes the Fragmenter |
| - * @param originalText |
| - */ |
| - public void start(String originalText); |
| +public interface Fragmenter { |
| |
| - /** |
| - * Test to see if this token from the stream should be held in a new TextFragment |
| - * @param nextToken |
| - */ |
| - public boolean isNewFragment(Token nextToken); |
| + /** |
| + * Initializes the Fragmenter. You can grab references to the Attributes you are |
| + * interested in from tokenStream and then access the values in isNewFragment. |
| + * |
| + * @param originalText |
| + * @param tokenStream |
| + */ |
| + public void start(String originalText, TokenStream tokenStream); |
| + |
| + |
| + /** |
| + * Test to see if this token from the stream should be held in a new |
| + * TextFragment. Every time this is called, the TokenStream |
| + * passed to start(String, TokenStream) will have been incremented. |
| + * |
| + */ |
| + public boolean isNewFragment(); |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) |
| @@ -22,8 +22,10 @@ |
| import java.util.Iterator; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.util.PriorityQueue; |
| |
| /** |
| @@ -214,8 +216,14 @@ |
| { |
| ArrayList docFrags = new ArrayList(); |
| StringBuffer newText=new StringBuffer(); |
| - |
| + |
| + TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class); |
| + OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class); |
| + tokenStream.addAttribute(PositionIncrementAttribute.class); |
| + tokenStream.reset(); |
| + |
| TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); |
| + fragmentScorer.init(tokenStream); |
| fragmentScorer.startFragment(currentFrag); |
| docFrags.add(currentFrag); |
| |
| @@ -223,28 +231,27 @@ |
| |
| try |
| { |
| - final Token reusableToken = new Token(); |
| + |
| String tokenText; |
| int startOffset; |
| int endOffset; |
| int lastEndOffset = 0; |
| - textFragmenter.start(text); |
| + textFragmenter.start(text, tokenStream); |
| |
| - TokenGroup tokenGroup=new TokenGroup(); |
| - |
| - for (Token nextToken = tokenStream.next(reusableToken); |
| - (nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze); |
| - nextToken = tokenStream.next(reusableToken)) |
| + TokenGroup tokenGroup=new TokenGroup(tokenStream); |
| + |
| + for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); |
| + next = tokenStream.incrementToken()) |
| { |
| - if( (nextToken.endOffset()>text.length()) |
| + if( (offsetAtt.endOffset()>text.length()) |
| || |
| - (nextToken.startOffset()>text.length()) |
| + (offsetAtt.startOffset()>text.length()) |
| ) |
| { |
| - throw new InvalidTokenOffsetsException("Token "+nextToken.toString() |
| + throw new InvalidTokenOffsetsException("Token "+ termAtt.term() |
| +" exceeds length of provided text sized "+text.length()); |
| } |
| - if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken))) |
| + if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) |
| { |
| //the current token is distinct from previous tokens - |
| // markup the cached token group info |
| @@ -260,7 +267,7 @@ |
| tokenGroup.clear(); |
| |
| //check if current token marks the start of a new fragment |
| - if(textFragmenter.isNewFragment(nextToken)) |
| + if(textFragmenter.isNewFragment()) |
| { |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| //record stats for a new fragment |
| @@ -271,7 +278,7 @@ |
| } |
| } |
| |
| - tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken)); |
| + tokenGroup.addToken(fragmentScorer.getTokenScore()); |
| |
| // if(lastEndOffset>maxDocBytesToAnalyze) |
| // { |
| @@ -332,7 +339,7 @@ |
| //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 |
| //fix to PriorityQueue. The correct method to use here is the new "insert" method |
| // USE ABOVE CODE IF THIS DOES NOT COMPILE! |
| - fragQueue.insert(currentFrag); |
| + fragQueue.insertWithOverflow(currentFrag); |
| } |
| |
| //return the most relevant fragments |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/NullFragmenter.java (working copy) |
| @@ -16,17 +16,18 @@ |
| * limitations under the License. |
| */ |
| |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| |
| /** |
| * {@link Fragmenter} implementation which does not fragment the text. |
| * This is useful for highlighting the entire content of a document or field. |
| */ |
| public class NullFragmenter implements Fragmenter { |
| - public void start(String s) { |
| + public void start(String s, TokenStream tokenStream) { |
| } |
| |
| - public boolean isNewFragment(Token token) { |
| + public boolean isNewFragment() { |
| return false; |
| } |
| + |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (working copy) |
| @@ -1,4 +1,5 @@ |
| package org.apache.lucene.search.highlight; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -19,134 +20,142 @@ |
| import java.util.HashMap; |
| import java.util.HashSet; |
| |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.search.Query; |
| |
| /** |
| - * {@link Scorer} implementation which scores text fragments by the number of unique query terms found. |
| - * This class uses the {@link QueryTermExtractor} class to process determine the query terms and |
| - * their boosts to be used. |
| + * {@link Scorer} implementation which scores text fragments by the number of |
| + * unique query terms found. This class uses the {@link QueryTermExtractor} |
| + * class to determine the query terms and their boosts to be used. |
| */ |
| -//TODO: provide option to boost score of fragments near beginning of document |
| +// TODO: provide option to boost score of fragments near beginning of document |
| // based on fragment.getFragNum() |
| -public class QueryScorer implements Scorer |
| -{ |
| - TextFragment currentTextFragment=null; |
| - HashSet uniqueTermsInFragment; |
| - float totalScore=0; |
| - float maxTermWeight=0; |
| - private HashMap termsToFind; |
| - |
| +public class QueryScorer implements Scorer { |
| + |
| + TextFragment currentTextFragment = null; |
| + HashSet uniqueTermsInFragment; |
| + |
| + float totalScore = 0; |
| + float maxTermWeight = 0; |
| + private HashMap termsToFind; |
| + |
| + private TermAttribute termAtt; |
| + |
| + /** |
| + * |
| + * @param query a Lucene query (ideally rewritten using query.rewrite before |
| + * being passed to this class and the searcher) |
| + */ |
| + public QueryScorer(Query query) { |
| + this(QueryTermExtractor.getTerms(query)); |
| + } |
| + |
| + /** |
| + * |
| + * @param query a Lucene query (ideally rewritten using query.rewrite before |
| + * being passed to this class and the searcher) |
| + * @param fieldName the Field name which is used to match Query terms |
| + */ |
| + public QueryScorer(Query query, String fieldName) { |
| + this(QueryTermExtractor.getTerms(query, false, fieldName)); |
| + } |
| + |
| + /** |
| + * |
| + * @param query a Lucene query (ideally rewritten using query.rewrite before |
| + * being passed to this class and the searcher) |
| + * @param reader used to compute IDF which can be used to a) score selected |
| + * fragments better b) use graded highlights eg set font color |
| + * intensity |
| + * @param fieldName the field on which Inverse Document Frequency (IDF) |
| + * calculations are based |
| + */ |
| + public QueryScorer(Query query, IndexReader reader, String fieldName) { |
| + this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); |
| + } |
| |
| - /** |
| - * |
| - * @param query a Lucene query (ideally rewritten using query.rewrite |
| - * before being passed to this class and the searcher) |
| - */ |
| - public QueryScorer(Query query) |
| - { |
| - this(QueryTermExtractor.getTerms(query)); |
| - } |
| - |
| - /** |
| - * |
| - * @param query a Lucene query (ideally rewritten using query.rewrite |
| - * before being passed to this class and the searcher) |
| - * @param fieldName the Field name which is used to match Query terms |
| - */ |
| - public QueryScorer(Query query, String fieldName) |
| - { |
| - this(QueryTermExtractor.getTerms(query, false,fieldName)); |
| - } |
| + public QueryScorer(WeightedTerm[] weightedTerms) { |
| + termsToFind = new HashMap(); |
| + for (int i = 0; i < weightedTerms.length; i++) { |
| + WeightedTerm existingTerm = (WeightedTerm) termsToFind |
| + .get(weightedTerms[i].term); |
| + if ((existingTerm == null) |
| + || (existingTerm.weight < weightedTerms[i].weight)) { |
| + // if a term is defined more than once, always use the highest scoring |
| + // weight |
| + termsToFind.put(weightedTerms[i].term, weightedTerms[i]); |
| + maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); |
| + } |
| + } |
| + } |
| |
| - /** |
| - * |
| - * @param query a Lucene query (ideally rewritten using query.rewrite |
| - * before being passed to this class and the searcher) |
| - * @param reader used to compute IDF which can be used to a) score selected fragments better |
| - * b) use graded highlights eg set font color intensity |
| - * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based |
| - */ |
| - public QueryScorer(Query query, IndexReader reader, String fieldName) |
| - { |
| - this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); |
| - } |
| + /* (non-Javadoc) |
| + * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) |
| + */ |
| + public void init(TokenStream tokenStream) { |
| + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); |
| + } |
| |
| - public QueryScorer(WeightedTerm []weightedTerms ) |
| - { |
| - termsToFind = new HashMap(); |
| - for (int i = 0; i < weightedTerms.length; i++) |
| - { |
| - WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term); |
| - if( (existingTerm==null) ||(existingTerm.weight<weightedTerms[i].weight) ) |
| - { |
| - //if a term is defined more than once, always use the highest scoring weight |
| - termsToFind.put(weightedTerms[i].term,weightedTerms[i]); |
| - maxTermWeight=Math.max(maxTermWeight,weightedTerms[i].getWeight()); |
| - } |
| - } |
| - } |
| - |
| + /* |
| + * (non-Javadoc) |
| + * |
| + * @see |
| + * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache |
| + * .lucene.search.highlight.TextFragment) |
| + */ |
| + public void startFragment(TextFragment newFragment) { |
| + uniqueTermsInFragment = new HashSet(); |
| + currentTextFragment = newFragment; |
| + totalScore = 0; |
| |
| - /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment) |
| - */ |
| - public void startFragment(TextFragment newFragment) |
| - { |
| - uniqueTermsInFragment = new HashSet(); |
| - currentTextFragment=newFragment; |
| - totalScore=0; |
| - |
| - } |
| - |
| - /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token) |
| - */ |
| - public float getTokenScore(Token token) |
| - { |
| - String termText=token.term(); |
| - |
| - WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText); |
| - if(queryTerm==null) |
| - { |
| - //not a query term - return |
| - return 0; |
| - } |
| - //found a query term - is it unique in this doc? |
| - if(!uniqueTermsInFragment.contains(termText)) |
| - { |
| - totalScore+=queryTerm.getWeight(); |
| - uniqueTermsInFragment.add(termText); |
| - } |
| - return queryTerm.getWeight(); |
| - } |
| - |
| - |
| - /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment) |
| - */ |
| - public float getFragmentScore() |
| - { |
| - return totalScore; |
| - } |
| + } |
| |
| |
| - /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() |
| - */ |
| - public void allFragmentsProcessed() |
| - { |
| - //this class has no special operations to perform at end of processing |
| - } |
| + /* (non-Javadoc) |
| + * @see org.apache.lucene.search.highlight.Scorer#getTokenScore() |
| + */ |
| + public float getTokenScore() { |
| + String termText = termAtt.term(); |
| |
| - /** |
| - * |
| - * @return The highest weighted term (useful for passing to GradientFormatter to set |
| - * top end of coloring scale. |
| - */ |
| - public float getMaxTermWeight() |
| - { |
| - return maxTermWeight; |
| + WeightedTerm queryTerm = (WeightedTerm) termsToFind.get(termText); |
| + if (queryTerm == null) { |
| + // not a query term - return |
| + return 0; |
| } |
| + // found a query term - is it unique in this doc? |
| + if (!uniqueTermsInFragment.contains(termText)) { |
| + totalScore += queryTerm.getWeight(); |
| + uniqueTermsInFragment.add(termText); |
| + } |
| + return queryTerm.getWeight(); |
| + } |
| + |
| + |
| + /* (non-Javadoc) |
| + * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() |
| + */ |
| + public float getFragmentScore() { |
| + return totalScore; |
| + } |
| + |
| + /* |
| + * (non-Javadoc) |
| + * |
| + * @see |
| + * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() |
| + */ |
| + public void allFragmentsProcessed() { |
| + // this class has no special operations to perform at end of processing |
| + } |
| + |
| + /** |
| + * |
| + * @return The highest weighted term (useful for passing to GradientFormatter |
| + * to set top end of coloring scale). |
| + */ |
| + public float getMaxTermWeight() { |
| + return maxTermWeight; |
| + } |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java (working copy) |
| @@ -1,4 +1,5 @@ |
| package org.apache.lucene.search.highlight; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -16,34 +17,45 @@ |
| * limitations under the License. |
| */ |
| |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| |
| /** |
| * Adds to the score for a fragment based on its tokens |
| */ |
| -public interface Scorer |
| -{ |
| - /** |
| - * called when a new fragment is started for consideration |
| - * @param newFragment |
| - */ |
| - public void startFragment(TextFragment newFragment); |
| +public interface Scorer { |
| |
| - /** |
| - * Called for each token in the current fragment |
| - * @param token The token to be scored |
| - * @return a score which is passed to the Highlighter class to influence the mark-up of the text |
| - * (this return value is NOT used to score the fragment) |
| - */ |
| - public float getTokenScore(Token token); |
| - |
| + /** |
| + * Called to init the Scorer with a TokenStream. You can grab references to |
| + * the attributes you are interested in here and access them from |
| + * getTokenScore(). |
| + * |
| + * @param tokenStream |
| + */ |
| + public void init(TokenStream tokenStream); |
| |
| - /** |
| - * Called when the highlighter has no more tokens for the current fragment - the scorer returns |
| - * the weighting it has derived for the most recent fragment, typically based on the tokens |
| - * passed to getTokenScore(). |
| - * |
| - */ |
| - public float getFragmentScore(); |
| + /** |
| + * called when a new fragment is started for consideration |
| + * |
| + * @param newFragment |
| + */ |
| + public void startFragment(TextFragment newFragment); |
| + |
| + /** |
| + * Called for each token in the current fragment. The Highlighter will |
| + * increment the TokenStream passed to init on every call. |
| + * |
| + * @return a score which is passed to the Highlighter class to influence the |
| + * mark-up of the text (this return value is NOT used to score the |
| + * fragment) |
| + */ |
| + public float getTokenScore(); |
| + |
| + /** |
| + * Called when the highlighter has no more tokens for the current fragment - |
| + * the scorer returns the weighting it has derived for the most recent |
| + * fragment, typically based on the tokens passed to getTokenScore(). |
| + * |
| + */ |
| + public float getFragmentScore(); |
| |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java (working copy) |
| @@ -1,4 +1,5 @@ |
| package org.apache.lucene.search.highlight; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -16,69 +17,64 @@ |
| * limitations under the License. |
| */ |
| |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| |
| /** |
| - * {@link Fragmenter} implementation which breaks text up into same-size |
| + * {@link Fragmenter} implementation which breaks text up into same-size |
| * fragments with no concerns over spotting sentence boundaries. |
| */ |
| -public class SimpleFragmenter implements Fragmenter |
| -{ |
| - private static final int DEFAULT_FRAGMENT_SIZE =100; |
| - private int currentNumFrags; |
| - private int fragmentSize; |
| +public class SimpleFragmenter implements Fragmenter { |
| + private static final int DEFAULT_FRAGMENT_SIZE = 100; |
| + private int currentNumFrags; |
| + private int fragmentSize; |
| + private OffsetAttribute offsetAtt; |
| |
| + public SimpleFragmenter() { |
| + this(DEFAULT_FRAGMENT_SIZE); |
| + } |
| |
| - public SimpleFragmenter() |
| - { |
| - this(DEFAULT_FRAGMENT_SIZE); |
| - } |
| + /** |
| + * |
| + * @param fragmentSize size in number of characters of each fragment |
| + */ |
| + public SimpleFragmenter(int fragmentSize) { |
| + this.fragmentSize = fragmentSize; |
| + } |
| |
| |
| - /** |
| - * |
| - * @param fragmentSize size in number of characters of each fragment |
| - */ |
| - public SimpleFragmenter(int fragmentSize) |
| - { |
| - this.fragmentSize=fragmentSize; |
| - } |
| + /* (non-Javadoc) |
| + * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) |
| + */ |
| + public void start(String originalText, TokenStream stream) { |
| + offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class); |
| + currentNumFrags = 1; |
| + } |
| |
| - /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) |
| - */ |
| - public void start(String originalText) |
| - { |
| - currentNumFrags=1; |
| - } |
| |
| - /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) |
| - */ |
| - public boolean isNewFragment(Token token) |
| - { |
| - boolean isNewFrag= token.endOffset()>=(fragmentSize*currentNumFrags); |
| - if(isNewFrag) |
| - { |
| - currentNumFrags++; |
| - } |
| - return isNewFrag; |
| - } |
| + /* (non-Javadoc) |
| + * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() |
| + */ |
| + public boolean isNewFragment() { |
| + boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags); |
| + if (isNewFrag) { |
| + currentNumFrags++; |
| + } |
| + return isNewFrag; |
| + } |
| |
| - /** |
| - * @return size in number of characters of each fragment |
| - */ |
| - public int getFragmentSize() |
| - { |
| - return fragmentSize; |
| - } |
| + /** |
| + * @return size in number of characters of each fragment |
| + */ |
| + public int getFragmentSize() { |
| + return fragmentSize; |
| + } |
| |
| - /** |
| - * @param size size in characters of each fragment |
| - */ |
| - public void setFragmentSize(int size) |
| - { |
| - fragmentSize = size; |
| - } |
| + /** |
| + * @param size size in characters of each fragment |
| + */ |
| + public void setFragmentSize(int size) { |
| + fragmentSize = size; |
| + } |
| |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleSpanFragmenter.java (working copy) |
| @@ -17,10 +17,13 @@ |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| -import org.apache.lucene.analysis.Token; |
| - |
| import java.util.List; |
| |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| + |
| |
| /** |
| * {@link Fragmenter} implementation which breaks text up into same-size |
| @@ -34,6 +37,9 @@ |
| private SpanScorer spanScorer; |
| private int waitForPos = -1; |
| private int textSize; |
| + private TermAttribute termAtt; |
| + private PositionIncrementAttribute posIncAtt; |
| + private OffsetAttribute offsetAtt; |
| |
| /** |
| * @param spanscorer SpanScorer that was used to score hits |
| @@ -50,12 +56,12 @@ |
| this.fragmentSize = fragmentSize; |
| this.spanScorer = spanscorer; |
| } |
| - |
| + |
| /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token) |
| + * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() |
| */ |
| - public boolean isNewFragment(Token token) { |
| - position += token.getPositionIncrement(); |
| + public boolean isNewFragment() { |
| + position += posIncAtt.getPositionIncrement(); |
| |
| if (waitForPos == position) { |
| waitForPos = -1; |
| @@ -63,7 +69,7 @@ |
| return false; |
| } |
| |
| - WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term()); |
| + WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(termAtt.term()); |
| |
| if (wSpanTerm != null) { |
| List positionSpans = wSpanTerm.getPositionSpans(); |
| @@ -76,8 +82,8 @@ |
| } |
| } |
| |
| - boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags) |
| - && (textSize - token.endOffset()) >= (fragmentSize >>> 1); |
| + boolean isNewFrag = offsetAtt.endOffset() >= (fragmentSize * currentNumFrags) |
| + && (textSize - offsetAtt.endOffset()) >= (fragmentSize >>> 1); |
| |
| if (isNewFrag) { |
| currentNumFrags++; |
| @@ -86,12 +92,16 @@ |
| return isNewFrag; |
| } |
| |
| + |
| /* (non-Javadoc) |
| - * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String) |
| + * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) |
| */ |
| - public void start(String originalText) { |
| + public void start(String originalText, TokenStream tokenStream) { |
| position = -1; |
| currentNumFrags = 1; |
| textSize = originalText.length(); |
| + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); |
| + posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class); |
| + offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class); |
| } |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/SpanScorer.java (working copy) |
| @@ -7,9 +7,10 @@ |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.CachingTokenFilter; |
| -import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.index.IndexReader; |
| -import org.apache.lucene.search.ConstantScoreRangeQuery; |
| import org.apache.lucene.search.Query; |
| |
| |
| @@ -26,6 +27,8 @@ |
| private float maxTermWeight; |
| private int position = -1; |
| private String defaultField; |
| + private TermAttribute termAtt; |
| + private PositionIncrementAttribute posIncAtt; |
| private static boolean highlightCnstScrRngQuery; |
| |
| /** |
| @@ -176,9 +179,9 @@ |
| * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, |
| * int) |
| */ |
| - public float getTokenScore(Token token) { |
| - position += token.getPositionIncrement(); |
| - String termText = token.term(); |
| + public float getTokenScore() { |
| + position += posIncAtt.getPositionIncrement(); |
| + String termText = termAtt.term(); |
| |
| WeightedSpanTerm weightedSpanTerm; |
| |
| @@ -203,6 +206,11 @@ |
| return score; |
| } |
| |
| + public void init(TokenStream tokenStream) { |
| + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); |
| + posIncAtt = (PositionIncrementAttribute) tokenStream.getAttribute(PositionIncrementAttribute.class); |
| + } |
| + |
| /** |
| * Retrieve the WeightedSpanTerm for the specified token. Useful for passing |
| * Span information to a Fragmenter. |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy) |
| @@ -1,4 +1,5 @@ |
| package org.apache.lucene.search.highlight; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -15,118 +16,117 @@ |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| + |
| import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| |
| /** |
| - * One, or several overlapping tokens, along with the score(s) and the |
| - * scope of the original text |
| + * One, or several overlapping tokens, along with the score(s) and the scope of |
| + * the original text |
| */ |
| -public class TokenGroup |
| -{ |
| - |
| - private static final int MAX_NUM_TOKENS_PER_GROUP=50; |
| - Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; |
| - float [] scores=new float[MAX_NUM_TOKENS_PER_GROUP]; |
| - int numTokens=0; |
| - int startOffset=0; |
| - int endOffset=0; |
| - float tot; |
| +public class TokenGroup { |
| |
| + private static final int MAX_NUM_TOKENS_PER_GROUP = 50; |
| + Token [] tokens=new Token[MAX_NUM_TOKENS_PER_GROUP]; |
| + float[] scores = new float[MAX_NUM_TOKENS_PER_GROUP]; |
| + int numTokens = 0; |
| + int startOffset = 0; |
| + int endOffset = 0; |
| + float tot; |
| int matchStartOffset, matchEndOffset; |
| |
| + private OffsetAttribute offsetAtt; |
| + private TermAttribute termAtt; |
| |
| - void addToken(Token token, float score) |
| - { |
| - if(numTokens < MAX_NUM_TOKENS_PER_GROUP) |
| - { |
| - if(numTokens==0) |
| - { |
| - startOffset=matchStartOffset=token.startOffset(); |
| - endOffset=matchEndOffset=token.endOffset(); |
| - tot += score; |
| - } |
| - else |
| - { |
| - startOffset=Math.min(startOffset,token.startOffset()); |
| - endOffset=Math.max(endOffset,token.endOffset()); |
| - if (score>0) { |
| - if (tot==0) { |
| - matchStartOffset=token.startOffset(); |
| - matchEndOffset=token.endOffset(); |
| + public TokenGroup(TokenStream tokenStream) { |
| + offsetAtt = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class); |
| + termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); |
| + } |
| + |
| + void addToken(float score) { |
| + if (numTokens < MAX_NUM_TOKENS_PER_GROUP) { |
| + int termStartOffset = offsetAtt.startOffset(); |
| + int termEndOffset = offsetAtt.endOffset(); |
| + if (numTokens == 0) { |
| + startOffset = matchStartOffset = termStartOffset; |
| + endOffset = matchEndOffset = termEndOffset; |
| + tot += score; |
| + } else { |
| + startOffset = Math.min(startOffset, termStartOffset); |
| + endOffset = Math.max(endOffset, termEndOffset); |
| + if (score > 0) { |
| + if (tot == 0) { |
| + matchStartOffset = offsetAtt.startOffset(); |
| + matchEndOffset = offsetAtt.endOffset(); |
| } else { |
| - matchStartOffset=Math.min(matchStartOffset,token.startOffset()); |
| - matchEndOffset=Math.max(matchEndOffset,token.endOffset()); |
| + matchStartOffset = Math.min(matchStartOffset, termStartOffset); |
| + matchEndOffset = Math.max(matchEndOffset, termEndOffset); |
| } |
| - tot+=score; |
| + tot += score; |
| } |
| } |
| - tokens[numTokens]= (Token) token.clone(); |
| - scores[numTokens]=score; |
| - numTokens++; |
| - } |
| - } |
| + Token token = new Token(termStartOffset, termEndOffset); |
| + token.setTermBuffer(termAtt.term()); |
| + tokens[numTokens] = token; |
| + scores[numTokens] = score; |
| + numTokens++; |
| + } |
| + } |
| |
| - boolean isDistinct(Token token) |
| - { |
| - return token.startOffset()>=endOffset; |
| - } |
| + boolean isDistinct() { |
| + return offsetAtt.startOffset() >= endOffset; |
| + } |
| |
| + void clear() { |
| + numTokens = 0; |
| + tot = 0; |
| + } |
| + |
| + /** |
| + * @param index a value between 0 and numTokens -1 |
| + * @return the "n"th token |
| + */ |
| + public Token getToken(int index) |
| + { |
| + return tokens[index]; |
| + } |
| |
| - void clear() |
| - { |
| - numTokens=0; |
| - tot=0; |
| - } |
| - |
| - /** |
| - * |
| - * @param index a value between 0 and numTokens -1 |
| - * @return the "n"th token |
| - */ |
| - public Token getToken(int index) |
| - { |
| - return tokens[index]; |
| - } |
| + /** |
| + * |
| + * @param index a value between 0 and numTokens -1 |
| + * @return the "n"th score |
| + */ |
| + public float getScore(int index) { |
| + return scores[index]; |
| + } |
| |
| - /** |
| - * |
| - * @param index a value between 0 and numTokens -1 |
| - * @return the "n"th score |
| - */ |
| - public float getScore(int index) |
| - { |
| - return scores[index]; |
| - } |
| + /** |
| + * @return the end position in the original text |
| + */ |
| + public int getEndOffset() { |
| + return endOffset; |
| + } |
| |
| - /** |
| - * @return the end position in the original text |
| - */ |
| - public int getEndOffset() |
| - { |
| - return endOffset; |
| - } |
| + /** |
| + * @return the number of tokens in this group |
| + */ |
| + public int getNumTokens() { |
| + return numTokens; |
| + } |
| |
| - /** |
| - * @return the number of tokens in this group |
| - */ |
| - public int getNumTokens() |
| - { |
| - return numTokens; |
| - } |
| + /** |
| + * @return the start position in the original text |
| + */ |
| + public int getStartOffset() { |
| + return startOffset; |
| + } |
| |
| - /** |
| - * @return the start position in the original text |
| - */ |
| - public int getStartOffset() |
| - { |
| - return startOffset; |
| - } |
| - |
| - /** |
| - * @return all tokens' scores summed up |
| - */ |
| - public float getTotalScore() |
| - { |
| - return tot; |
| - } |
| + /** |
| + * @return all tokens' scores summed up |
| + */ |
| + public float getTotalScore() { |
| + return tot; |
| + } |
| } |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (revision 797692) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (working copy) |
| @@ -29,6 +29,8 @@ |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.TermFreqVector; |
| @@ -135,32 +137,45 @@ |
| * @param tokenPositionsGuaranteedContiguous true if the token position numbers have no overlaps or gaps. If looking |
| * to eek out the last drops of performance, set to true. If in doubt, set to false. |
| */ |
| - public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) |
| - { |
| + public static TokenStream getTokenStream(TermPositionVector tpv, boolean tokenPositionsGuaranteedContiguous) { |
| //an object used to iterate across an array of tokens |
| - class StoredTokenStream extends TokenStream |
| - { |
| - Token tokens[]; |
| - int currentToken=0; |
| - StoredTokenStream(Token tokens[]) |
| - { |
| - this.tokens=tokens; |
| + class StoredTokenStream extends TokenStream { |
| + Token tokens[]; |
| + int currentToken = 0; |
| + TermAttribute termAtt; |
| + OffsetAttribute offsetAtt; |
| + |
| + StoredTokenStream(Token tokens[]) { |
| + this.tokens = tokens; |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| + } |
| + |
| + public Token next(final Token reusableToken) { |
| + // deprecated next(Token) path, kept for back-compat with old consumers |
| + assert reusableToken != null; |
| + if (currentToken >= tokens.length) { |
| + return null; |
| } |
| - public Token next(final Token reusableToken) |
| - { |
| - assert reusableToken != null; |
| - if(currentToken>=tokens.length) |
| - { |
| - return null; |
| - } |
| - return tokens[currentToken++]; |
| - } |
| - } |
| + return tokens[currentToken++]; |
| + } |
| + |
| + public boolean incrementToken() throws IOException { |
| + // new TokenStream API: copy the next stored token into this stream's attributes |
| + if (currentToken >= tokens.length) { |
| + return false; |
| + } |
| + Token token = tokens[currentToken++]; |
| + termAtt.setTermBuffer(token.term()); |
| + offsetAtt.setOffset(token.startOffset(), token.endOffset()); |
| + return true; |
| + } |
| + } |
| //code to reconstruct the original sequence of Tokens |
| String[] terms=tpv.getTerms(); |
| int[] freq=tpv.getTermFrequencies(); |
| int totalTokens=0; |
| - Token newToken = new Token(); |
| + |
| for (int t = 0; t < freq.length; t++) |
| { |
| totalTokens+=freq[t]; |
| @@ -190,8 +205,9 @@ |
| } |
| for (int tp = 0; tp < offsets.length; tp++) |
| { |
| - newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); |
| - unsortedTokens.add(newToken.clone()); |
| + Token token = new Token(offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); |
| + token.setTermBuffer(terms[t]); |
| + unsortedTokens.add(token); |
| } |
| } |
| else |
| @@ -204,8 +220,8 @@ |
| //tokens stored with positions - can use this to index straight into sorted array |
| for (int tp = 0; tp < pos.length; tp++) |
| { |
| - newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); |
| - tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone(); |
| + Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset()); |
| + tokensInOriginalOrder[pos[tp]] = token; |
| } |
| } |
| } |
| @@ -218,7 +234,7 @@ |
| { |
| Token t1=(Token) o1; |
| Token t2=(Token) o2; |
| - if(t1.startOffset()>t2.startOffset()) |
| + if(t1.startOffset()>t2.startOffset()) |
| return 1; |
| if(t1.startOffset()<t2.startOffset()) |
| return -1; |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (revision 799153) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (working copy) |
| @@ -42,8 +42,8 @@ |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.PrefixQuery; |
| import org.apache.lucene.search.Query; |
| -import org.apache.lucene.search.TermRangeQuery; |
| import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.search.TermRangeQuery; |
| import org.apache.lucene.search.WildcardQuery; |
| import org.apache.lucene.search.spans.SpanNearQuery; |
| import org.apache.lucene.search.spans.SpanOrQuery; |
| @@ -98,7 +98,7 @@ |
| private void extract(Query query, Map terms) throws IOException { |
| if (query instanceof BooleanQuery) { |
| BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); |
| - |
| + |
| for (int i = 0; i < queryClauses.length; i++) { |
| if (!queryClauses[i].isProhibited()) { |
| extract(queryClauses[i].getQuery(), terms); |
| @@ -441,7 +441,7 @@ |
| * This class makes sure that if both position sensitive and insensitive |
| * versions of the same term are added, the position insensitive one wins. |
| */ |
| - private class PositionCheckingMap extends HashMap { |
| + static private class PositionCheckingMap extends HashMap { |
| |
| public void putAll(Map m) { |
| Iterator it = m.keySet().iterator(); |
| Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java |
| =================================================================== |
| --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 799153) |
| +++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy) |
| @@ -38,10 +38,14 @@ |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.CachingTokenFilter; |
| import org.apache.lucene.analysis.LowerCaseTokenizer; |
| +import org.apache.lucene.analysis.SimpleAnalyzer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| +import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.Field.Index; |
| @@ -53,18 +57,17 @@ |
| import org.apache.lucene.queryParser.ParseException; |
| import org.apache.lucene.queryParser.QueryParser; |
| import org.apache.lucene.search.BooleanQuery; |
| +import org.apache.lucene.search.MultiTermQuery; |
| import org.apache.lucene.search.ConstantScoreRangeQuery; |
| import org.apache.lucene.search.FilteredQuery; |
| import org.apache.lucene.search.Hits; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.MultiPhraseQuery; |
| import org.apache.lucene.search.MultiSearcher; |
| -import org.apache.lucene.search.MultiTermQuery; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.Query; |
| -import org.apache.lucene.search.TermRangeFilter; |
| -import org.apache.lucene.search.Searcher; |
| import org.apache.lucene.search.TermQuery; |
| +import org.apache.lucene.search.TermRangeFilter; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.search.WildcardQuery; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| @@ -75,6 +78,7 @@ |
| import org.apache.lucene.search.spans.SpanTermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.RAMDirectory; |
| +import org.apache.lucene.util.Version; |
| import org.w3c.dom.Element; |
| import org.w3c.dom.NodeList; |
| |
| @@ -87,7 +91,7 @@ |
| static final String FIELD_NAME = "contents"; |
| private Query query; |
| RAMDirectory ramDir; |
| - public Searcher searcher = null; |
| + public IndexSearcher searcher = null; |
| public Hits hits = null; |
| int numHighlights = 0; |
| Analyzer analyzer = new StandardAnalyzer(); |
| @@ -108,11 +112,40 @@ |
| super(arg0); |
| } |
| |
| + public void testHits() throws Exception { |
| + Analyzer analyzer = new SimpleAnalyzer(); |
| + QueryParser qp = new QueryParser(FIELD_NAME, analyzer); |
| + query = qp.parse("\"very long\""); |
| + searcher = new IndexSearcher(ramDir, false); |
| + TopDocs hits = searcher.search(query, 10); |
| + |
| + Highlighter highlighter = new Highlighter(null); |
| + |
| + |
| + for (int i = 0; i < hits.scoreDocs.length; i++) { |
| + Document doc = searcher.doc(hits.scoreDocs[i].doc); |
| + String storedField = doc.get(FIELD_NAME); |
| + |
| + TokenStream stream = TokenSources.getAnyTokenStream(searcher |
| + .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); |
| + CachingTokenFilter ctf = new CachingTokenFilter(stream); |
| + SpanScorer scorer = new SpanScorer(query, FIELD_NAME, ctf); |
| + // ctf.reset(); |
| + Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); |
| + highlighter.setFragmentScorer(scorer); |
| + highlighter.setTextFragmenter(fragmenter); |
| + |
| + String fragment = highlighter.getBestFragment(ctf, storedField); |
| + |
| + System.out.println(fragment); |
| + } |
| + } |
| + |
| public void testHighlightingWithDefaultField() throws Exception { |
| |
| String s1 = "I call our world Flatland, not because we call it so,"; |
| |
| - QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer()); |
| + QueryParser parser = new QueryParser(FIELD_NAME, new StandardAnalyzer(Version.LUCENE_CURRENT)); |
| |
| // Verify that a query against the default field results in text being |
| // highlighted |
| @@ -144,7 +177,7 @@ |
| */ |
| private static String highlightField(Query query, String fieldName, String text) |
| throws IOException, InvalidTokenOffsetsException { |
| - CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer().tokenStream( |
| + CachingTokenFilter tokenStream = new CachingTokenFilter(new StandardAnalyzer(Version.LUCENE_CURRENT).tokenStream( |
| fieldName, new StringReader(text))); |
| // Assuming "<B>", "</B>" used to highlight |
| SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(); |
| @@ -908,10 +941,12 @@ |
| Query query = parser.parse(srchkey); |
| |
| TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s)); |
| + |
| Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this); |
| |
| // Get 3 best fragments and seperate with a "..." |
| tokenStream = analyzer.tokenStream(null, new StringReader(s)); |
| + |
| String result = highlighter.getBestFragments(tokenStream, s, 3, "..."); |
| String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition"; |
| assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult |
| @@ -1075,10 +1110,11 @@ |
| } |
| |
| public void testUnRewrittenQuery() throws Exception { |
| - TestHighlightRunner helper = new TestHighlightRunner() { |
| + final TestHighlightRunner helper = new TestHighlightRunner() { |
| |
| public void run() throws Exception { |
| numHighlights = 0; |
| + SpanScorer.setHighlightCnstScrRngQuery(false); |
| // test to show how rewritten query can still be used |
| searcher = new IndexSearcher(ramDir); |
| Analyzer analyzer = new StandardAnalyzer(); |
| @@ -1154,13 +1190,17 @@ |
| public void startFragment(TextFragment newFragment) { |
| } |
| |
| - public float getTokenScore(Token token) { |
| + public float getTokenScore() { |
| return 0; |
| } |
| |
| public float getFragmentScore() { |
| return 1; |
| } |
| + |
| + public void init(TokenStream tokenStream) { |
| + |
| + } |
| }); |
| highlighter.setTextFragmenter(new SimpleFragmenter(2000)); |
| TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent)); |
| @@ -1292,27 +1332,44 @@ |
| return new TokenStream() { |
| Iterator iter; |
| List lst; |
| + private TermAttribute termAtt; |
| + private PositionIncrementAttribute posIncrAtt; |
| + private OffsetAttribute offsetAtt; |
| { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| lst = new ArrayList(); |
| Token t; |
| t = createToken("hi", 0, 2); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| t = createToken("hispeed", 0, 8); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| t = createToken("speed", 3, 8); |
| t.setPositionIncrement(0); |
| lst.add(t); |
| t = createToken("10", 8, 10); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| t = createToken("foo", 11, 14); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| iter = lst.iterator(); |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| - return iter.hasNext() ? (Token) iter.next() : null; |
| + public boolean incrementToken() throws IOException { |
| + if(iter.hasNext()) { |
| + Token token = (Token) iter.next(); |
| + termAtt.setTermBuffer(token.term()); |
| + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); |
| + offsetAtt.setOffset(token.startOffset(), token.endOffset()); |
| + return true; |
| + } |
| + return false; |
| } |
| + |
| }; |
| } |
| |
| @@ -1322,26 +1379,42 @@ |
| return new TokenStream() { |
| Iterator iter; |
| List lst; |
| + private TermAttribute termAtt; |
| + private PositionIncrementAttribute posIncrAtt; |
| + private OffsetAttribute offsetAtt; |
| { |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| lst = new ArrayList(); |
| Token t; |
| t = createToken("hispeed", 0, 8); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| t = createToken("hi", 0, 2); |
| t.setPositionIncrement(0); |
| lst.add(t); |
| t = createToken("speed", 3, 8); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| t = createToken("10", 8, 10); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| t = createToken("foo", 11, 14); |
| + t.setPositionIncrement(1); |
| lst.add(t); |
| iter = lst.iterator(); |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| - return iter.hasNext() ? (Token) iter.next() : null; |
| + public boolean incrementToken() throws IOException { |
| + if(iter.hasNext()) { |
| + Token token = (Token) iter.next(); |
| + termAtt.setTermBuffer(token.term()); |
| + posIncrAtt.setPositionIncrement(token.getPositionIncrement()); |
| + offsetAtt.setOffset(token.startOffset(), token.endOffset()); |
| + return true; |
| + } |
| + return false; |
| } |
| }; |
| } |
| @@ -1611,7 +1684,11 @@ |
| * java.io.Reader) |
| */ |
| public TokenStream tokenStream(String arg0, Reader arg1) { |
| - return new SynonymTokenizer(new LowerCaseTokenizer(arg1), synonyms); |
| + LowerCaseTokenizer stream = new LowerCaseTokenizer(arg1); |
| + stream.addAttribute(TermAttribute.class); |
| + stream.addAttribute(PositionIncrementAttribute.class); |
| + stream.addAttribute(OffsetAttribute.class); |
| + return new SynonymTokenizer(stream, synonyms); |
| } |
| } |
| |
| @@ -1622,47 +1699,70 @@ |
| class SynonymTokenizer extends TokenStream { |
| private TokenStream realStream; |
| private Token currentRealToken = null; |
| + private org.apache.lucene.analysis.Token cRealToken = null; |
| private Map synonyms; |
| StringTokenizer st = null; |
| + private TermAttribute realTermAtt; |
| + private PositionIncrementAttribute realPosIncrAtt; |
| + private OffsetAttribute realOffsetAtt; |
| + private TermAttribute termAtt; |
| + private PositionIncrementAttribute posIncrAtt; |
| + private OffsetAttribute offsetAtt; |
| |
| public SynonymTokenizer(TokenStream realStream, Map synonyms) { |
| this.realStream = realStream; |
| this.synonyms = synonyms; |
| + realTermAtt = (TermAttribute) realStream.getAttribute(TermAttribute.class); |
| + realPosIncrAtt = (PositionIncrementAttribute) realStream.getAttribute(PositionIncrementAttribute.class); |
| + realOffsetAtt = (OffsetAttribute) realStream.getAttribute(OffsetAttribute.class); |
| + |
| + termAtt = (TermAttribute) addAttribute(TermAttribute.class); |
| + posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); |
| + offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); |
| } |
| |
| - public Token next(final Token reusableToken) throws IOException { |
| - assert reusableToken != null; |
| + public boolean incrementToken() throws IOException { |
| + |
| if (currentRealToken == null) { |
| - Token nextRealToken = realStream.next(reusableToken); |
| - if (nextRealToken == null) { |
| - return null; |
| + boolean next = realStream.incrementToken(); |
| + if (!next) { |
| + return false; |
| } |
| - String expansions = (String) synonyms.get(nextRealToken.term()); |
| + // mirror the underlying stream's attribute values into this stream's attributes |
| + termAtt.setTermBuffer(realTermAtt.term()); |
| + offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset()); |
| + posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement()); |
| + |
| + String expansions = (String) synonyms.get(realTermAtt.term()); |
| if (expansions == null) { |
| - return nextRealToken; |
| + return true; |
| } |
| st = new StringTokenizer(expansions, ","); |
| if (st.hasMoreTokens()) { |
| - currentRealToken = (Token) nextRealToken.clone(); |
| + currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset()); |
| + currentRealToken.setTermBuffer(realTermAtt.term()); |
| } |
| - return currentRealToken; |
| + |
| + return true; |
| } else { |
| - reusableToken.reinit(st.nextToken(), |
| - currentRealToken.startOffset(), |
| - currentRealToken.endOffset()); |
| - reusableToken.setPositionIncrement(0); |
| + String tok = st.nextToken(); |
| + termAtt.setTermBuffer(tok); |
| + offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset()); |
| + posIncrAtt.setPositionIncrement(0); |
| if (!st.hasMoreTokens()) { |
| currentRealToken = null; |
| st = null; |
| } |
| - return reusableToken; |
| + return true; |
| } |
| + |
| } |
| |
| static abstract class TestHighlightRunner { |
| static final int STANDARD = 0; |
| static final int SPAN = 1; |
| int mode = STANDARD; |
| + Fragmenter frag = new SimpleFragmenter(20); |
| |
| public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, |
| Formatter formatter) { |
| @@ -1725,7 +1825,7 @@ |
| if (mode == SPAN) { |
| ((CachingTokenFilter) tokenStream).reset(); |
| } |
| - highlighter.setTextFragmenter(new SimpleFragmenter(20)); |
| + highlighter.setTextFragmenter(frag); |
| |
| String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, |
| fragmentSeparator); |
| |