| Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java |
| =================================================================== |
| --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 421357) |
| +++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy) |
| @@ -20,19 +20,14 @@ |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| -import java.util.HashMap; |
| -import java.util.Map; |
| -import java.util.StringTokenizer; |
| +import java.util.*; |
| |
| import javax.xml.parsers.DocumentBuilder; |
| import javax.xml.parsers.DocumentBuilderFactory; |
| |
| import junit.framework.TestCase; |
| |
| -import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.LowerCaseTokenizer; |
| -import org.apache.lucene.analysis.Token; |
| -import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.*; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| @@ -535,9 +530,132 @@ |
| reader.close(); |
| |
| } |
| - |
| - |
| |
| + protected TokenStream getTS2() { |
| + //String s = "Hi-Speed10 foo"; |
| + return new TokenStream() { |
| + Iterator iter; |
| + List lst; |
| + { |
| + lst = new ArrayList(); |
| + Token t; |
| + t = new Token("hi",0,2); |
| + lst.add(t); |
| + t = new Token("hispeed",0,8); |
| + lst.add(t); |
| + t = new Token("speed",3,8); |
| + t.setPositionIncrement(0); |
| + lst.add(t); |
| + t = new Token("10",8,10); |
| + lst.add(t); |
| + t = new Token("foo",11,14); |
| + lst.add(t); |
| + iter = lst.iterator(); |
| + } |
| + public Token next() throws IOException { |
| + return iter.hasNext() ? (Token)iter.next() : null; |
| + } |
| + }; |
| + } |
| + |
| + // same token-stream as above, but the bigger token comes first this time |
| + protected TokenStream getTS2a() { |
| + //String s = "Hi-Speed10 foo"; |
| + return new TokenStream() { |
| + Iterator iter; |
| + List lst; |
| + { |
| + lst = new ArrayList(); |
| + Token t; |
| + t = new Token("hispeed",0,8); |
| + lst.add(t); |
| + t = new Token("hi",0,2); |
| + t.setPositionIncrement(0); |
| + lst.add(t); |
| + t = new Token("speed",3,8); |
| + lst.add(t); |
| + t = new Token("10",8,10); |
| + lst.add(t); |
| + t = new Token("foo",11,14); |
| + lst.add(t); |
| + iter = lst.iterator(); |
| + } |
| + public Token next() throws IOException { |
| + return iter.hasNext() ? (Token)iter.next() : null; |
| + } |
| + }; |
| + } |
| + |
| + public void testOverlapAnalyzer2() throws Exception |
| + { |
| + |
| + String s = "Hi-Speed10 foo"; |
| + |
| + Query query; Highlighter highlighter; String result; |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("Hi-Speed10 <B>foo</B>",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("Hi-Speed<B>10</B> foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("<B>Hi</B>-Speed10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("Hi-<B>Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + |
| + /////////////////// same tests, just put the bigger overlapping token first |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("Hi-Speed10 <B>foo</B>",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("Hi-Speed<B>10</B> foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("<B>Hi</B>-Speed10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("Hi-<B>Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + } |
| + |
| + |
| /* |
| |
| public void testBigramAnalyzer() throws IOException, ParseException |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 421357) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy) |
| @@ -30,34 +30,48 @@ |
| int numTokens=0; |
| int startOffset=0; |
| int endOffset=0; |
| - |
| + float tot; |
| |
| - void addToken(Token token, float score) |
| + int matchStartOffset, matchEndOffset; |
| + |
| + |
| + void addToken(Token token, float score) |
| { |
| if(numTokens < MAX_NUM_TOKENS_PER_GROUP) |
| { |
| if(numTokens==0) |
| { |
| - startOffset=token.startOffset(); |
| - endOffset=token.endOffset(); |
| + startOffset=matchStartOffset=token.startOffset(); |
| + endOffset=matchEndOffset=token.endOffset(); |
| + tot += score; |
| } |
| else |
| { |
| - startOffset=Math.min(startOffset,token.startOffset()); |
| - endOffset=Math.max(endOffset,token.endOffset()); |
| - } |
| + startOffset=Math.min(startOffset,token.startOffset()); |
| + endOffset=Math.max(endOffset,token.endOffset()); |
| + if (score>0) { |
| + if (tot==0) { |
| + matchStartOffset=token.startOffset(); |
| + matchEndOffset=token.endOffset(); |
| + } else { |
| + matchStartOffset=Math.min(matchStartOffset,token.startOffset()); |
| + matchEndOffset=Math.max(matchEndOffset,token.endOffset()); |
| + } |
| + tot+=score; |
| + } |
| + } |
| tokens[numTokens]=token; |
| scores[numTokens]=score; |
| numTokens++; |
| } |
| } |
| - |
| + |
| boolean isDistinct(Token token) |
| { |
| return token.startOffset()>=endOffset; |
| } |
| - |
| - |
| + |
| + |
| void clear() |
| { |
| numTokens=0; |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 421357) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) |
| @@ -25,8 +25,8 @@ |
| import org.apache.lucene.util.PriorityQueue; |
| |
| /** |
| - * Class used to markup highlighted terms found in the best sections of a |
| - * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, |
| + * Class used to mark up highlighted terms found in the best sections of a |
| + * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, |
| * {@link Encoder} and tokenizers. |
| * @author mark@searcharea.co.uk |
| */ |
| @@ -36,7 +36,7 @@ |
| public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; |
| private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; |
| private Formatter formatter; |
| - private Encoder encoder; |
| + private Encoder encoder; |
| private Fragmenter textFragmenter=new SimpleFragmenter(); |
| private Scorer fragmentScorer=null; |
| |
| @@ -44,14 +44,14 @@ |
| { |
| this(new SimpleHTMLFormatter(),fragmentScorer); |
| } |
| - |
| - |
| + |
| + |
| public Highlighter(Formatter formatter, Scorer fragmentScorer) |
| { |
| this(formatter,new DefaultEncoder(),fragmentScorer); |
| } |
| - |
| - |
| + |
| + |
| public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) |
| { |
| this.formatter = formatter; |
| @@ -65,9 +65,9 @@ |
| * {@link #getBestFragment(TokenStream, String)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| - * into chunks |
| + * into chunks |
| * @param text text to highlight terms in |
| - * @param fieldName Name of field used to influence analyzer's tokenization policy |
| + * @param fieldName Name of field used to influence analyzer's tokenization policy |
| * |
| * @return highlighted text fragment or null if no terms found |
| */ |
| @@ -77,18 +77,18 @@ |
| TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); |
| return getBestFragment(tokenStream, text); |
| } |
| - |
| + |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant section. |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragment with the highest score |
| * is returned |
| * |
| - * @param tokenStream a stream of tokens identified in the text parameter, including offset information. |
| - * This is typically produced by an analyzer re-parsing a document's |
| - * text. Some work may be done on retrieving TokenStreams more efficently |
| + * @param tokenStream a stream of tokens identified in the text parameter, including offset information. |
| + * This is typically produced by an analyzer re-parsing a document's |
| + *                    text. Some work may be done on retrieving TokenStreams more efficiently |
| * by adding support for storing original text position data in the Lucene |
| - * index but this support is not currently available (as of Lucene 1.4 rc2). |
| + * index but this support is not currently available (as of Lucene 1.4 rc2). |
| * @param text text to highlight terms in |
| * |
| * @return highlighted text fragment or null if no terms found |
| @@ -110,7 +110,7 @@ |
| * {@link #getBestFragments(TokenStream, String, int)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| - * into chunks |
| + * into chunks |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the |
| @@ -118,7 +118,7 @@ |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| */ |
| public final String[] getBestFragments( |
| - Analyzer analyzer, |
| + Analyzer analyzer, |
| String text, |
| int maxNumFragments) |
| throws IOException |
| @@ -132,7 +132,7 @@ |
| * {@link #getBestFragments(TokenStream, String, int)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| - * into chunks |
| + * into chunks |
| * @param fieldName the name of the field being highlighted (used by analyzer) |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| @@ -140,7 +140,7 @@ |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| */ |
| public final String[] getBestFragments( |
| - Analyzer analyzer, |
| + Analyzer analyzer, |
| String fieldName, |
| String text, |
| int maxNumFragments) |
| @@ -149,12 +149,12 @@ |
| TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); |
| return getBestFragments(tokenStream, text, maxNumFragments); |
| } |
| - |
| + |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant sections. |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragments with the highest scores |
| - * are returned as an array of strings in order of score (contiguous fragments are merged into |
| + * are returned as an array of strings in order of score (contiguous fragments are merged into |
| * one in their original order to improve readability) |
| * |
| * @param text text to highlight terms in |
| @@ -163,13 +163,13 @@ |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| */ |
| public final String[] getBestFragments( |
| - TokenStream tokenStream, |
| + TokenStream tokenStream, |
| String text, |
| int maxNumFragments) |
| throws IOException |
| { |
| maxNumFragments = Math.max(1, maxNumFragments); //sanity check |
| - |
| + |
| TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); |
| |
| //Get text |
| @@ -183,12 +183,12 @@ |
| } |
| return (String[]) fragTexts.toArray(new String[0]); |
| } |
| - |
| |
| + |
| /** |
| * Low level api to get the most relevant (formatted) sections of the document. |
| * This method has been made public to allow visibility of score information held in TextFragment objects. |
| - * Thanks to Jason Calabrese for help in redefining the interface. |
| + * Thanks to Jason Calabrese for help in redefining the interface. |
| * @param tokenStream |
| * @param text |
| * @param maxNumFragments |
| @@ -196,7 +196,7 @@ |
| * @throws IOException |
| */ |
| public final TextFragment[] getBestTextFragments( |
| - TokenStream tokenStream, |
| + TokenStream tokenStream, |
| String text, |
| boolean mergeContiguousFragments, |
| int maxNumFragments) |
| @@ -208,7 +208,7 @@ |
| TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); |
| fragmentScorer.startFragment(currentFrag); |
| docFrags.add(currentFrag); |
| - |
| + |
| FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); |
| |
| try |
| @@ -219,27 +219,27 @@ |
| int endOffset; |
| int lastEndOffset = 0; |
| textFragmenter.start(text); |
| - |
| + |
| TokenGroup tokenGroup=new TokenGroup(); |
| |
| while ((token = tokenStream.next()) != null) |
| { |
| if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) |
| { |
| - //the current token is distinct from previous tokens - |
| + //the current token is distinct from previous tokens - |
| // markup the cached token group info |
| - startOffset = tokenGroup.startOffset; |
| - endOffset = tokenGroup.endOffset; |
| + startOffset = tokenGroup.matchStartOffset; |
| + endOffset = tokenGroup.matchEndOffset; |
| tokenText = text.substring(startOffset, endOffset); |
| String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); |
| //store any whitespace etc from between this and last group |
| if (startOffset > lastEndOffset) |
| newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); |
| newText.append(markedUpText); |
| - lastEndOffset=endOffset; |
| + lastEndOffset=Math.max(endOffset, lastEndOffset); |
| tokenGroup.clear(); |
| |
| - //check if current token marks the start of a new fragment |
| + //check if current token marks the start of a new fragment |
| if(textFragmenter.isNewFragment(token)) |
| { |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| @@ -250,28 +250,28 @@ |
| docFrags.add(currentFrag); |
| } |
| } |
| - |
| - tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); |
| - |
| + |
| + tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); |
| + |
| if(lastEndOffset>maxDocBytesToAnalyze) |
| { |
| break; |
| } |
| } |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| - |
| + |
| if(tokenGroup.numTokens>0) |
| { |
| //flush the accumulated text (same code as in above loop) |
| - startOffset = tokenGroup.startOffset; |
| - endOffset = tokenGroup.endOffset; |
| + startOffset = tokenGroup.matchStartOffset; |
| + endOffset = tokenGroup.matchEndOffset; |
| tokenText = text.substring(startOffset, endOffset); |
| String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); |
| //store any whitespace etc from between this and last group |
| if (startOffset > lastEndOffset) |
| newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); |
| newText.append(markedUpText); |
| - lastEndOffset=endOffset; |
| + lastEndOffset=Math.max(lastEndOffset,endOffset); |
| } |
| |
| // append text after end of last token |
| @@ -286,7 +286,7 @@ |
| currentFrag = (TextFragment) i.next(); |
| |
| //If you are running with a version of Lucene before 11th Sept 03 |
| - // you do not have PriorityQueue.insert() - so uncomment the code below |
| + // you do not have PriorityQueue.insert() - so uncomment the code below |
| /* |
| if (currentFrag.getScore() >= minScore) |
| { |
| @@ -296,8 +296,8 @@ |
| fragQueue.pop(); // remove lowest in hit queue |
| minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore |
| } |
| - |
| - |
| + |
| + |
| } |
| */ |
| //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 |
| @@ -312,7 +312,7 @@ |
| { |
| frag[i] = (TextFragment) fragQueue.pop(); |
| } |
| - |
| + |
| //merge any contiguous fragments to improve readability |
| if(mergeContiguousFragments) |
| { |
| @@ -325,9 +325,9 @@ |
| fragTexts.add(frag[i]); |
| } |
| } |
| - frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]); |
| + frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]); |
| } |
| - |
| + |
| return frag; |
| |
| } |
| @@ -347,7 +347,7 @@ |
| } |
| |
| |
| - /** Improves readability of a score-sorted list of TextFragments by merging any fragments |
| + /** Improves readability of a score-sorted list of TextFragments by merging any fragments |
| * that were contiguous in the original text into one larger fragment with the correct order. |
| * This will leave a "null" in the array entry for the lesser scored fragment. |
| * |