/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.highlight;

import java.io.IOException;
import java.util.Arrays;

import com.carrotsearch.randomizedtesting.annotations.Repeat;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;

// LUCENE-2874
/** Tests {@link org.apache.lucene.search.highlight.TokenSources} and,
 * indirectly through it, {@link org.apache.lucene.search.highlight.TokenStreamFromTermVector}.
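 *
 * <p>The reconstruction call these tests exercise (a sketch; {@code -1} disables
 * the {@code maxStartOffset} limit):
 * <pre>
 * TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull(
 *     FIELD, reader.getTermVectors(0), -1);
 * </pre>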
*/
public class TokenSourcesTest extends BaseTokenStreamTestCase {
private static final String FIELD = "text";
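
  // Emits "the fox did not jump" plus an extra "{fox}" token that shares a
  // position with "the" (position increment 0) and spans offsets 0-7 ("the fox").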
private static final class OverlappingTokenStream extends TokenStream {
private Token[] tokens;
private int i = -1;
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
@Override
public boolean incrementToken() {
this.i++;
if (this.i >= this.tokens.length) {
return false;
}
clearAttributes();
termAttribute.setEmpty().append(this.tokens[i]);
offsetAttribute.setOffset(this.tokens[i].startOffset(),
this.tokens[i].endOffset());
positionIncrementAttribute.setPositionIncrement(this.tokens[i]
.getPositionIncrement());
return true;
}
@Override
public void reset() {
this.i = -1;
this.tokens = new Token[] {
new Token("the", 0, 3),
new Token("{fox}", 0, 7),
new Token("fox", 4, 7),
new Token("did", 8, 11),
new Token("not", 12, 15),
new Token("jump", 16, 20)};
this.tokens[1].setPositionIncrement(0);
}
}
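
  // Term vector with offsets but no positions; the DisjunctionMax query over
  // the overlapping "{fox}"/"fox" tokens must still highlight "the fox".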
public void testOverlapWithOffset() throws IOException, InvalidTokenOffsetsException {
final String TEXT = "the fox did not jump";
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(null));
try {
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
// no positions!
customType.setStoreTermVectorOffsets(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
} finally {
indexWriter.close();
}
final IndexReader indexReader = DirectoryReader.open(directory);
assertEquals(1, indexReader.numDocs());
final IndexSearcher indexSearcher = newSearcher(indexReader);
try {
final DisjunctionMaxQuery query = new DisjunctionMaxQuery(
Arrays.asList(
new SpanTermQuery(new Term(FIELD, "{fox}")),
new SpanTermQuery(new Term(FIELD, "fox"))),
1);
// final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
// new SpanTermQuery(new Term(FIELD, "{fox}")),
// new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
TopDocs hits = indexSearcher.search(query, 1);
assertEquals(1, hits.totalHits.value);
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(query));
final TokenStream tokenStream =
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
indexReader.close();
directory.close();
}
}
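
  // Same scenario, but the term vector stores positions as well as offsets.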
public void testOverlapWithPositionsAndOffset()
throws IOException, InvalidTokenOffsetsException {
final String TEXT = "the fox did not jump";
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(null));
try {
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorOffsets(true);
customType.setStoreTermVectorPositions(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
} finally {
indexWriter.close();
}
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
final IndexSearcher indexSearcher = newSearcher(indexReader);
final DisjunctionMaxQuery query = new DisjunctionMaxQuery(
Arrays.asList(
new SpanTermQuery(new Term(FIELD, "{fox}")),
new SpanTermQuery(new Term(FIELD, "fox"))),
1);
// final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
// new SpanTermQuery(new Term(FIELD, "{fox}")),
// new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
TopDocs hits = indexSearcher.search(query, 1);
assertEquals(1, hits.totalHits.value);
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(query));
final TokenStream tokenStream =
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
indexReader.close();
directory.close();
}
}
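
  // Offsets but no positions again, this time with an exact phrase query
  // ("the fox") via SpanNearQuery.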
public void testOverlapWithOffsetExactPhrase()
throws IOException, InvalidTokenOffsetsException {
final String TEXT = "the fox did not jump";
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(null));
try {
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
// no positions!
customType.setStoreTermVectorOffsets(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
} finally {
indexWriter.close();
}
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
final IndexSearcher indexSearcher = newSearcher(indexReader);
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
// query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term(FIELD, "the")),
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
TopDocs hits = indexSearcher.search(phraseQuery, 1);
assertEquals(1, hits.totalHits.value);
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
final TokenStream tokenStream =
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
indexReader.close();
directory.close();
}
}
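
  // Positions and offsets stored, with the same exact phrase query.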
public void testOverlapWithPositionsAndOffsetExactPhrase()
throws IOException, InvalidTokenOffsetsException {
final String TEXT = "the fox did not jump";
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(null));
try {
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorOffsets(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
} finally {
indexWriter.close();
}
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
final IndexSearcher indexSearcher = newSearcher(indexReader);
// final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
// query.add(new SpanTermQuery(new Term(FIELD, "the")));
// query.add(new SpanTermQuery(new Term(FIELD, "fox")));
final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
new SpanTermQuery(new Term(FIELD, "the")),
new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
TopDocs hits = indexSearcher.search(phraseQuery, 1);
assertEquals(1, hits.totalHits.value);
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
final TokenStream tokenStream =
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
indexReader.close();
directory.close();
}
}
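
  // Without offsets in the term vector there is nothing to highlight against,
  // so TokenSources must return null.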
public void testTermVectorWithoutOffsetsDoesntWork()
throws IOException, InvalidTokenOffsetsException {
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
newIndexWriterConfig(null));
try {
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
customType.setStoreTermVectorOffsets(false);
customType.setStoreTermVectorPositions(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
} finally {
indexWriter.close();
}
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
final TokenStream tokenStream =
TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertNull(tokenStream);
    } finally {
indexReader.close();
directory.close();
}
}
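
  // Running start offset used by getToken() below.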
int curOffset;
  /** Makes a token with the given text and sets the payload to that same text.
   * Start offsets advance by just one character per token (via curOffset),
   * which keeps them strictly increasing rather than "natural". */
private Token getToken(String text) {
Token t = new Token(text, curOffset, curOffset+text.length());
t.setPayload(new BytesRef(text));
curOffset++;
return t;
}
// LUCENE-5294
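  // Payloads stored in the term vector must survive reconstruction through
  // TokenSources.getTermVectorTokenStreamOrNull.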
public void testPayloads() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
myFieldType.setStoreTermVectors(true);
myFieldType.setStoreTermVectorOffsets(true);
myFieldType.setStoreTermVectorPositions(true);
myFieldType.setStoreTermVectorPayloads(true);
curOffset = 0;
Token[] tokens = new Token[] {
getToken("foxes"),
getToken("can"),
getToken("jump"),
getToken("high")
};
Document doc = new Document();
doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
assertEquals(1, reader.numDocs());
TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
ts.reset();
for(Token token : tokens) {
assertTrue(ts.incrementToken());
assertEquals(token.toString(), termAtt.toString());
assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement());
assertEquals(token.getPayload(), payloadAtt.getPayload());
assertEquals(token.startOffset(), offsetAtt.startOffset());
assertEquals(token.endOffset(), offsetAtt.endOffset());
}
assertFalse(ts.incrementToken());
reader.close();
dir.close();
}
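
  // Round-trips a random token stream through a term vector and back,
  // comparing terms, offsets, position increments and (sometimes) payloads.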
@Repeat(iterations = 10)
//@Seed("947083AB20AB2D4F")
public void testRandomizedRoundTrip() throws Exception {
final int distinct = TestUtil.nextInt(random(), 1, 10);
String[] terms = new String[distinct];
BytesRef[] termBytes = new BytesRef[distinct];
for (int i = 0; i < distinct; ++i) {
terms[i] = TestUtil.randomRealisticUnicodeString(random());
termBytes[i] = new BytesRef(terms[i]);
}
final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
    // Check whether the reconstructed token order would be non-deterministic:
    // tokens at the same position (or with no positions stored) that also share
    // a start offset can come back in either order, so such streams are skipped.
final boolean storeTermVectorPositions = random().nextBoolean();
final int[] startOffsets = rTokenStream.getStartOffsets();
final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
for (int i = 1; i < positionsIncrements.length; i++) {
if (storeTermVectorPositions && positionsIncrements[i] != 0) {
continue;
}
//TODO should RandomTokenStream ensure endOffsets for tokens at same position and same startOffset are greater
// than previous token's endOffset? That would increase the testable possibilities.
if (startOffsets[i] == startOffsets[i-1]) {
        if (VERBOSE) {
          System.out.println("Skipping test because we can't easily validate that the random token stream is correct.");
        }
rTokenStream.close();
return;
}
}
    // sanity-check the random token stream itself before indexing it
assertTokenStreamContents(rTokenStream,
rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
rTokenStream.getPositionsIncrements());
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
myFieldType.setStoreTermVectors(true);
myFieldType.setStoreTermVectorOffsets(true);
myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
    // payloads require positions; indexing would throw an exception otherwise
myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());
Document doc = new Document();
doc.add(new Field("field", rTokenStream, myFieldType));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
writer.close();
assertEquals(1, reader.numDocs());
TokenStream vectorTokenStream =
TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
//sometimes check payloads
PayloadAttribute payloadAttribute = null;
if (myFieldType.storeTermVectorPayloads() && usually()) {
payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
}
assertTokenStreamContents(vectorTokenStream,
rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
//test payloads
if (payloadAttribute != null) {
vectorTokenStream.reset();
for (int i = 0; vectorTokenStream.incrementToken(); i++) {
assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
}
}
reader.close();
dir.close();
rTokenStream.close();
}
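
  // For every maxStartOffset value, the stream reconstructed from the term
  // vector and the freshly analyzed stream must yield identical tokens, none
  // starting past maxStartOffset (when it is >= 0).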
public void testMaxStartOffsetConsistency() throws IOException {
FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
tvFieldType.setStoreTermVectors(true);
tvFieldType.setStoreTermVectorOffsets(true);
tvFieldType.setStoreTermVectorPositions(true);
Directory dir = newDirectory();
MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setEnableChecks(false); // we don't necessarily consume the whole stream, because we limit tokens by startOffset
Document doc = new Document();
final String TEXT = " f gg h";
doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
IndexReader reader;
try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
writer.addDocument(doc);
reader = writer.getReader();
}
try {
Fields tvFields = reader.getTermVectors(0);
for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
        // assert both streams yield the same tokens, none with a start offset > maxStartOffset
final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
tvStream.reset();
anaStream.reset();
while (tvStream.incrementToken()) {
assertTrue(anaStream.incrementToken());
assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
        if (maxStartOffset >= 0) {
          assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
        }
}
      assertFalse(anaStream.incrementToken());
tvStream.end();
anaStream.end();
tvStream.close();
anaStream.close();
}
} finally {
reader.close();
}
dir.close();
}
}