| Index: contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java |
| =================================================================== |
| --- contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (revision 421357) |
| +++ contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (working copy) |
| @@ -20,19 +20,14 @@ |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| -import java.util.HashMap; |
| -import java.util.Map; |
| -import java.util.StringTokenizer; |
| +import java.util.*; |
| |
| import javax.xml.parsers.DocumentBuilder; |
| import javax.xml.parsers.DocumentBuilderFactory; |
| |
| import junit.framework.TestCase; |
| |
| -import org.apache.lucene.analysis.Analyzer; |
| -import org.apache.lucene.analysis.LowerCaseTokenizer; |
| -import org.apache.lucene.analysis.Token; |
| -import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.*; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| @@ -535,9 +530,132 @@ |
| reader.close(); |
| |
| } |
| - |
| - |
| |
| + protected TokenStream getTS2() { |
| + //String s = "Hi-Speed10 foo"; |
| + return new TokenStream() { |
| + Iterator iter; |
| + List lst; |
| + { |
| + lst = new ArrayList(); |
| + Token t; |
| + t = new Token("hi",0,2); |
| + lst.add(t); |
| + t = new Token("hispeed",0,8); |
| + lst.add(t); |
| + t = new Token("speed",3,8); |
| + t.setPositionIncrement(0); |
| + lst.add(t); |
| + t = new Token("10",8,10); |
| + lst.add(t); |
| + t = new Token("foo",11,14); |
| + lst.add(t); |
| + iter = lst.iterator(); |
| + } |
| + public Token next() throws IOException { |
| + return iter.hasNext() ? (Token)iter.next() : null; |
| + } |
| + }; |
| + } |
| + |
| + // same token-stream as above, but the bigger token comes first this time |
| + protected TokenStream getTS2a() { |
| + //String s = "Hi-Speed10 foo"; |
| + return new TokenStream() { |
| + Iterator iter; |
| + List lst; |
| + { |
| + lst = new ArrayList(); |
| + Token t; |
| + t = new Token("hispeed",0,8); |
| + lst.add(t); |
| + t = new Token("hi",0,2); |
| + t.setPositionIncrement(0); |
| + lst.add(t); |
| + t = new Token("speed",3,8); |
| + lst.add(t); |
| + t = new Token("10",8,10); |
| + lst.add(t); |
| + t = new Token("foo",11,14); |
| + lst.add(t); |
| + iter = lst.iterator(); |
| + } |
| + public Token next() throws IOException { |
| + return iter.hasNext() ? (Token)iter.next() : null; |
| + } |
| + }; |
| + } |
| + |
| + public void testOverlapAnalyzer2() throws Exception |
| + { |
| + |
| + String s = "Hi-Speed10 foo"; |
| + |
| + Query query; Highlighter highlighter; String result; |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("Hi-Speed10 <B>foo</B>",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("Hi-Speed<B>10</B> foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("<B>Hi</B>-Speed10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("Hi-<B>Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + |
| + /////////////////// same tests, just put the bigger overlapping token first |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("foo"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("Hi-Speed10 <B>foo</B>",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("10"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("Hi-Speed<B>10</B> foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("<B>Hi</B>-Speed10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("Hi-<B>Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hispeed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + |
| + query = new QueryParser("text",new WhitespaceAnalyzer()).parse("hi speed"); |
| + highlighter = new Highlighter(new QueryScorer(query)); |
| + result = highlighter.getBestFragments(getTS2a(), s, 3, "..."); |
| + assertEquals("<B>Hi-Speed</B>10 foo",result); |
| + } |
| + |
| + |
| /* |
| |
| public void testBigramAnalyzer() throws IOException, ParseException |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (revision 421357) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenGroup.java (working copy) |
| @@ -30,34 +30,48 @@ |
| int numTokens=0; |
| int startOffset=0; |
| int endOffset=0; |
| - |
| + float tot; |
| |
| - void addToken(Token token, float score) |
| + int matchStartOffset, matchEndOffset; |
| + |
| + |
| + void addToken(Token token, float score) |
| { |
| if(numTokens < MAX_NUM_TOKENS_PER_GROUP) |
| { |
| if(numTokens==0) |
| { |
| - startOffset=token.startOffset(); |
| - endOffset=token.endOffset(); |
| + startOffset=matchStartOffset=token.startOffset(); |
| + endOffset=matchEndOffset=token.endOffset(); |
| + tot += score; |
| } |
| else |
| { |
| - startOffset=Math.min(startOffset,token.startOffset()); |
| - endOffset=Math.max(endOffset,token.endOffset()); |
| - } |
| + startOffset=Math.min(startOffset,token.startOffset()); |
| + endOffset=Math.max(endOffset,token.endOffset()); |
| + if (score>0) { |
| + if (tot==0) { |
| + matchStartOffset=token.startOffset(); |
| + matchEndOffset=token.endOffset(); |
| + } else { |
| + matchStartOffset=Math.min(matchStartOffset,token.startOffset()); |
| + matchEndOffset=Math.max(matchEndOffset,token.endOffset()); |
| + } |
| + tot+=score; |
| + } |
| + } |
| tokens[numTokens]=token; |
| scores[numTokens]=score; |
| numTokens++; |
| } |
| } |
| - |
| + |
| boolean isDistinct(Token token) |
| { |
| return token.startOffset()>=endOffset; |
| } |
| - |
| - |
| + |
| + |
| void clear() |
| { |
| numTokens=0; |
| Index: contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java |
| =================================================================== |
| --- contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (revision 421357) |
| +++ contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (working copy) |
| @@ -25,8 +25,8 @@ |
| import org.apache.lucene.util.PriorityQueue; |
| |
| /** |
| - * Class used to markup highlighted terms found in the best sections of a |
| - * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, |
| + * Class used to mark up highlighted terms found in the best sections of a |
| + * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, |
| * {@link Encoder} and tokenizers. |
| * @author mark@searcharea.co.uk |
| */ |
| @@ -36,7 +36,7 @@ |
| public static final int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; |
| private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE; |
| private Formatter formatter; |
| - private Encoder encoder; |
| + private Encoder encoder; |
| private Fragmenter textFragmenter=new SimpleFragmenter(); |
| private Scorer fragmentScorer=null; |
| |
| @@ -44,14 +44,14 @@ |
| { |
| this(new SimpleHTMLFormatter(),fragmentScorer); |
| } |
| - |
| - |
| + |
| + |
| public Highlighter(Formatter formatter, Scorer fragmentScorer) |
| { |
| this(formatter,new DefaultEncoder(),fragmentScorer); |
| } |
| - |
| - |
| + |
| + |
| public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) |
| { |
| this.formatter = formatter; |
| @@ -65,9 +65,9 @@ |
| * {@link #getBestFragment(TokenStream, String)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| - * into chunks |
| + * into chunks |
| * @param text text to highlight terms in |
| - * @param fieldName Name of field used to influence analyzer's tokenization policy |
| + * @param fieldName Name of field used to influence analyzer's tokenization policy |
| * |
| * @return highlighted text fragment or null if no terms found |
| */ |
| @@ -77,18 +77,18 @@ |
| TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); |
| return getBestFragment(tokenStream, text); |
| } |
| - |
| + |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant section. |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragment with the highest score |
| * is returned |
| * |
| - * @param tokenStream a stream of tokens identified in the text parameter, including offset information. |
| - * This is typically produced by an analyzer re-parsing a document's |
| - * text. Some work may be done on retrieving TokenStreams more efficently |
| + * @param tokenStream a stream of tokens identified in the text parameter, including offset information. |
| + * This is typically produced by an analyzer re-parsing a document's |
| + *                    text. Some work may be done on retrieving TokenStreams more efficiently |
| * by adding support for storing original text position data in the Lucene |
| - * index but this support is not currently available (as of Lucene 1.4 rc2). |
| + * index but this support is not currently available (as of Lucene 1.4 rc2). |
| * @param text text to highlight terms in |
| * |
| * @return highlighted text fragment or null if no terms found |
| @@ -110,7 +110,7 @@ |
| * {@link #getBestFragments(TokenStream, String, int)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| - * into chunks |
| + * into chunks |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| * @deprecated This method incorrectly hardcodes the choice of fieldname. Use the |
| @@ -118,7 +118,7 @@ |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| */ |
| public final String[] getBestFragments( |
| - Analyzer analyzer, |
| + Analyzer analyzer, |
| String text, |
| int maxNumFragments) |
| throws IOException |
| @@ -132,7 +132,7 @@ |
| * {@link #getBestFragments(TokenStream, String, int)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| - * into chunks |
| + * into chunks |
| * @param fieldName the name of the field being highlighted (used by analyzer) |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| @@ -140,7 +140,7 @@ |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| */ |
| public final String[] getBestFragments( |
| - Analyzer analyzer, |
| + Analyzer analyzer, |
| String fieldName, |
| String text, |
| int maxNumFragments) |
| @@ -149,12 +149,12 @@ |
| TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(text)); |
| return getBestFragments(tokenStream, text, maxNumFragments); |
| } |
| - |
| + |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant sections. |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragments with the highest scores |
| - * are returned as an array of strings in order of score (contiguous fragments are merged into |
| + * are returned as an array of strings in order of score (contiguous fragments are merged into |
| * one in their original order to improve readability) |
| * |
| * @param text text to highlight terms in |
| @@ -163,13 +163,13 @@ |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| */ |
| public final String[] getBestFragments( |
| - TokenStream tokenStream, |
| + TokenStream tokenStream, |
| String text, |
| int maxNumFragments) |
| throws IOException |
| { |
| maxNumFragments = Math.max(1, maxNumFragments); //sanity check |
| - |
| + |
| TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); |
| |
| //Get text |
| @@ -183,12 +183,12 @@ |
| } |
| return (String[]) fragTexts.toArray(new String[0]); |
| } |
| - |
| |
| + |
| /** |
| * Low level api to get the most relevant (formatted) sections of the document. |
| * This method has been made public to allow visibility of score information held in TextFragment objects. |
| - * Thanks to Jason Calabrese for help in redefining the interface. |
| + * Thanks to Jason Calabrese for help in redefining the interface. |
| * @param tokenStream |
| * @param text |
| * @param maxNumFragments |
| @@ -196,7 +196,7 @@ |
| * @throws IOException |
| */ |
| public final TextFragment[] getBestTextFragments( |
| - TokenStream tokenStream, |
| + TokenStream tokenStream, |
| String text, |
| boolean mergeContiguousFragments, |
| int maxNumFragments) |
| @@ -208,7 +208,7 @@ |
| TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); |
| fragmentScorer.startFragment(currentFrag); |
| docFrags.add(currentFrag); |
| - |
| + |
| FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); |
| |
| try |
| @@ -219,27 +219,27 @@ |
| int endOffset; |
| int lastEndOffset = 0; |
| textFragmenter.start(text); |
| - |
| + |
| TokenGroup tokenGroup=new TokenGroup(); |
| |
| while ((token = tokenStream.next()) != null) |
| { |
| if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token))) |
| { |
| - //the current token is distinct from previous tokens - |
| + //the current token is distinct from previous tokens - |
| // markup the cached token group info |
| - startOffset = tokenGroup.startOffset; |
| - endOffset = tokenGroup.endOffset; |
| + startOffset = tokenGroup.matchStartOffset; |
| + endOffset = tokenGroup.matchEndOffset; |
| tokenText = text.substring(startOffset, endOffset); |
| String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); |
| //store any whitespace etc from between this and last group |
| if (startOffset > lastEndOffset) |
| newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); |
| newText.append(markedUpText); |
| - lastEndOffset=endOffset; |
| + lastEndOffset=Math.max(endOffset, lastEndOffset); |
| tokenGroup.clear(); |
| |
| - //check if current token marks the start of a new fragment |
| + //check if current token marks the start of a new fragment |
| if(textFragmenter.isNewFragment(token)) |
| { |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| @@ -250,28 +250,28 @@ |
| docFrags.add(currentFrag); |
| } |
| } |
| - |
| - tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); |
| - |
| + |
| + tokenGroup.addToken(token,fragmentScorer.getTokenScore(token)); |
| + |
| if(lastEndOffset>maxDocBytesToAnalyze) |
| { |
| break; |
| } |
| } |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| - |
| + |
| if(tokenGroup.numTokens>0) |
| { |
| //flush the accumulated text (same code as in above loop) |
| - startOffset = tokenGroup.startOffset; |
| - endOffset = tokenGroup.endOffset; |
| + startOffset = tokenGroup.matchStartOffset; |
| + endOffset = tokenGroup.matchEndOffset; |
| tokenText = text.substring(startOffset, endOffset); |
| String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); |
| //store any whitespace etc from between this and last group |
| if (startOffset > lastEndOffset) |
| newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); |
| newText.append(markedUpText); |
| - lastEndOffset=endOffset; |
| + lastEndOffset=Math.max(lastEndOffset,endOffset); |
| } |
| |
| // append text after end of last token |
| @@ -286,7 +286,7 @@ |
| currentFrag = (TextFragment) i.next(); |
| |
| //If you are running with a version of Lucene before 11th Sept 03 |
| - // you do not have PriorityQueue.insert() - so uncomment the code below |
| + // you do not have PriorityQueue.insert() - so uncomment the code below |
| /* |
| if (currentFrag.getScore() >= minScore) |
| { |
| @@ -296,8 +296,8 @@ |
| fragQueue.pop(); // remove lowest in hit queue |
| minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore |
| } |
| - |
| - |
| + |
| + |
| } |
| */ |
| //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 |
| @@ -312,7 +312,7 @@ |
| { |
| frag[i] = (TextFragment) fragQueue.pop(); |
| } |
| - |
| + |
| //merge any contiguous fragments to improve readability |
| if(mergeContiguousFragments) |
| { |
| @@ -325,9 +325,9 @@ |
| fragTexts.add(frag[i]); |
| } |
| } |
| - frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]); |
| + frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]); |
| } |
| - |
| + |
| return frag; |
| |
| } |
| @@ -347,7 +347,7 @@ |
| } |
| |
| |
| - /** Improves readability of a score-sorted list of TextFragments by merging any fragments |
| + /** Improves readability of a score-sorted list of TextFragments by merging any fragments |
| * that were contiguous in the original text into one larger fragment with the correct order. |
| * This will leave a "null" in the array entry for the lesser scored fragment. |
| * |