| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.highlight; |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| |
| import com.carrotsearch.randomizedtesting.annotations.Repeat; |
| |
| import org.apache.lucene.analysis.BaseTokenStreamTestCase; |
| import org.apache.lucene.analysis.CannedTokenStream; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.Token; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.BaseTermVectorsFormatTestCase; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.DisjunctionMaxQuery; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.search.spans.SpanNearQuery; |
| import org.apache.lucene.search.spans.SpanQuery; |
| import org.apache.lucene.search.spans.SpanTermQuery; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.TestUtil; |
| |
// LUCENE-2874: highlighting of overlapping tokens re-created from term vectors
| |
/** Tests {@link org.apache.lucene.search.highlight.TokenSources} and, indirectly,
 * {@link org.apache.lucene.search.highlight.TokenStreamFromTermVector}.
 */
| public class TokenSourcesTest extends BaseTokenStreamTestCase { |
| private static final String FIELD = "text"; |
| |
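  /** Emits a fixed token sequence in which "{fox}" is stacked on "the": a position
   * increment of 0 and offsets spanning "the fox", like an injected synonym. */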
| private static final class OverlappingTokenStream extends TokenStream { |
| private Token[] tokens; |
| |
| private int i = -1; |
| |
| private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); |
| private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class); |
| |
| @Override |
| public boolean incrementToken() { |
| this.i++; |
| if (this.i >= this.tokens.length) { |
| return false; |
| } |
| clearAttributes(); |
| termAttribute.setEmpty().append(this.tokens[i]); |
| offsetAttribute.setOffset(this.tokens[i].startOffset(), |
| this.tokens[i].endOffset()); |
| positionIncrementAttribute.setPositionIncrement(this.tokens[i] |
| .getPositionIncrement()); |
| return true; |
| } |
| |
| @Override |
| public void reset() { |
| this.i = -1; |
| this.tokens = new Token[] { |
| new Token("the", 0, 3), |
| new Token("{fox}", 0, 7), |
| new Token("fox", 4, 7), |
| new Token("did", 8, 11), |
| new Token("not", 12, 15), |
| new Token("jump", 16, 20)}; |
| this.tokens[1].setPositionIncrement(0); |
| } |
| } |
| |
| public void testOverlapWithOffset() throws IOException, InvalidTokenOffsetsException { |
| final String TEXT = "the fox did not jump"; |
| final Directory directory = newDirectory(); |
| final IndexWriter indexWriter = new IndexWriter(directory, |
| newIndexWriterConfig(null)); |
| try { |
| final Document document = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setStoreTermVectors(true); |
| // no positions! |
| customType.setStoreTermVectorOffsets(true); |
| document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); |
| indexWriter.addDocument(document); |
| } finally { |
| indexWriter.close(); |
| } |
    final IndexReader indexReader = DirectoryReader.open(directory);
    try {
      assertEquals(1, indexReader.numDocs());
      final IndexSearcher indexSearcher = newSearcher(indexReader);
| final DisjunctionMaxQuery query = new DisjunctionMaxQuery( |
| Arrays.asList( |
| new SpanTermQuery(new Term(FIELD, "{fox}")), |
| new SpanTermQuery(new Term(FIELD, "fox"))), |
| 1); |
| |
| TopDocs hits = indexSearcher.search(query, 1); |
| assertEquals(1, hits.totalHits.value); |
| final Highlighter highlighter = new Highlighter( |
| new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), |
| new QueryScorer(query)); |
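      // maxStartOffset of -1 means no limit on token start offsets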
| final TokenStream tokenStream = |
| TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1); |
| assertEquals("<B>the fox</B> did not jump", |
| highlighter.getBestFragment(tokenStream, TEXT)); |
| } finally { |
| indexReader.close(); |
| directory.close(); |
| } |
| } |
| |
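  // Like testOverlapWithOffset, but the term vector also stores positions.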
| public void testOverlapWithPositionsAndOffset() |
| throws IOException, InvalidTokenOffsetsException { |
| final String TEXT = "the fox did not jump"; |
| final Directory directory = newDirectory(); |
| final IndexWriter indexWriter = new IndexWriter(directory, |
| newIndexWriterConfig(null)); |
| try { |
| final Document document = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setStoreTermVectors(true); |
| customType.setStoreTermVectorOffsets(true); |
| customType.setStoreTermVectorPositions(true); |
| document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); |
| indexWriter.addDocument(document); |
| } finally { |
| indexWriter.close(); |
| } |
| final IndexReader indexReader = DirectoryReader.open(directory); |
| try { |
| assertEquals(1, indexReader.numDocs()); |
| final IndexSearcher indexSearcher = newSearcher(indexReader); |
| final DisjunctionMaxQuery query = new DisjunctionMaxQuery( |
| Arrays.asList( |
| new SpanTermQuery(new Term(FIELD, "{fox}")), |
| new SpanTermQuery(new Term(FIELD, "fox"))), |
| 1); |
| |
| TopDocs hits = indexSearcher.search(query, 1); |
| assertEquals(1, hits.totalHits.value); |
| final Highlighter highlighter = new Highlighter( |
| new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), |
| new QueryScorer(query)); |
| final TokenStream tokenStream = |
| TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1); |
| assertEquals("<B>the fox</B> did not jump", |
| highlighter.getBestFragment(tokenStream, TEXT)); |
| } finally { |
| indexReader.close(); |
| directory.close(); |
| } |
| } |
| |
| public void testOverlapWithOffsetExactPhrase() |
| throws IOException, InvalidTokenOffsetsException { |
| final String TEXT = "the fox did not jump"; |
| final Directory directory = newDirectory(); |
| final IndexWriter indexWriter = new IndexWriter(directory, |
| newIndexWriterConfig(null)); |
| try { |
| final Document document = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setStoreTermVectors(true); |
| // no positions! |
| customType.setStoreTermVectorOffsets(true); |
| document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); |
| indexWriter.addDocument(document); |
| } finally { |
| indexWriter.close(); |
| } |
| final IndexReader indexReader = DirectoryReader.open(directory); |
| try { |
| assertEquals(1, indexReader.numDocs()); |
| final IndexSearcher indexSearcher = newSearcher(indexReader); |
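      // No positions were stored, yet the exact phrase must still highlight:
      // TokenStreamFromTermVector reconstructs token order from start offsets.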
| final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { |
| new SpanTermQuery(new Term(FIELD, "the")), |
| new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true); |
| |
| TopDocs hits = indexSearcher.search(phraseQuery, 1); |
| assertEquals(1, hits.totalHits.value); |
| final Highlighter highlighter = new Highlighter( |
| new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), |
| new QueryScorer(phraseQuery)); |
| final TokenStream tokenStream = |
| TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1); |
| assertEquals("<B>the fox</B> did not jump", |
| highlighter.getBestFragment(tokenStream, TEXT)); |
| } finally { |
| indexReader.close(); |
| directory.close(); |
| } |
| } |
| |
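  // Like testOverlapWithOffsetExactPhrase, but the term vector also stores positions.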
| public void testOverlapWithPositionsAndOffsetExactPhrase() |
| throws IOException, InvalidTokenOffsetsException { |
| final String TEXT = "the fox did not jump"; |
| final Directory directory = newDirectory(); |
| final IndexWriter indexWriter = new IndexWriter(directory, |
| newIndexWriterConfig(null)); |
| try { |
| final Document document = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setStoreTermVectors(true); |
| customType.setStoreTermVectorPositions(true); |
| customType.setStoreTermVectorOffsets(true); |
| document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); |
| indexWriter.addDocument(document); |
| } finally { |
| indexWriter.close(); |
| } |
| final IndexReader indexReader = DirectoryReader.open(directory); |
| try { |
| assertEquals(1, indexReader.numDocs()); |
| final IndexSearcher indexSearcher = newSearcher(indexReader); |
| final Query phraseQuery = new SpanNearQuery(new SpanQuery[] { |
| new SpanTermQuery(new Term(FIELD, "the")), |
| new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true); |
| |
| TopDocs hits = indexSearcher.search(phraseQuery, 1); |
| assertEquals(1, hits.totalHits.value); |
| final Highlighter highlighter = new Highlighter( |
| new SimpleHTMLFormatter(), new SimpleHTMLEncoder(), |
| new QueryScorer(phraseQuery)); |
| final TokenStream tokenStream = |
| TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1); |
| assertEquals("<B>the fox</B> did not jump", |
| highlighter.getBestFragment(tokenStream, TEXT)); |
| } finally { |
| indexReader.close(); |
| directory.close(); |
| } |
| } |
| |
| public void testTermVectorWithoutOffsetsDoesntWork() |
| throws IOException, InvalidTokenOffsetsException { |
| final Directory directory = newDirectory(); |
| final IndexWriter indexWriter = new IndexWriter(directory, |
| newIndexWriterConfig(null)); |
| try { |
| final Document document = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setStoreTermVectors(true); |
| customType.setStoreTermVectorOffsets(false); |
| customType.setStoreTermVectorPositions(true); |
| document.add(new Field(FIELD, new OverlappingTokenStream(), customType)); |
| indexWriter.addDocument(document); |
| } finally { |
| indexWriter.close(); |
| } |
| final IndexReader indexReader = DirectoryReader.open(directory); |
| try { |
| assertEquals(1, indexReader.numDocs()); |
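      // without stored offsets the term vector cannot be rebuilt into a TokenStream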
| final TokenStream tokenStream = |
| TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1); |
| assertNull(tokenStream); |
| } |
| finally { |
| indexReader.close(); |
| directory.close(); |
| } |
| } |
| |
| int curOffset; |
| |
  /** Makes a token with the given text and a payload set to that same text.
   * Each call advances the start offset by one, enough to keep start offsets
   * strictly increasing without tracking real word boundaries. */
| private Token getToken(String text) { |
| Token t = new Token(text, curOffset, curOffset+text.length()); |
| t.setPayload(new BytesRef(text)); |
| curOffset++; |
| return t; |
| } |
| |
  // LUCENE-5294: payloads stored in term vectors must survive the round trip back into a TokenStream
| public void testPayloads() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir); |
| FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED); |
| myFieldType.setStoreTermVectors(true); |
| myFieldType.setStoreTermVectorOffsets(true); |
| myFieldType.setStoreTermVectorPositions(true); |
| myFieldType.setStoreTermVectorPayloads(true); |
| |
| curOffset = 0; |
| |
| Token[] tokens = new Token[] { |
| getToken("foxes"), |
| getToken("can"), |
| getToken("jump"), |
| getToken("high") |
| }; |
| |
| Document doc = new Document(); |
| doc.add(new Field("field", new CannedTokenStream(tokens), myFieldType)); |
| writer.addDocument(doc); |
| |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| assertEquals(1, reader.numDocs()); |
| |
| TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1); |
| |
| CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); |
| PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class); |
| OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class); |
| PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); |
| |
| ts.reset(); |
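    // the rebuilt stream must preserve terms, position increments, payloads, and offsets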
| for(Token token : tokens) { |
| assertTrue(ts.incrementToken()); |
| assertEquals(token.toString(), termAtt.toString()); |
| assertEquals(token.getPositionIncrement(), posIncAtt.getPositionIncrement()); |
| assertEquals(token.getPayload(), payloadAtt.getPayload()); |
| assertEquals(token.startOffset(), offsetAtt.startOffset()); |
| assertEquals(token.endOffset(), offsetAtt.endOffset()); |
| } |
| |
| assertFalse(ts.incrementToken()); |
| |
| reader.close(); |
| dir.close(); |
| } |
| |
| @Repeat(iterations = 10) |
| //@Seed("947083AB20AB2D4F") |
| public void testRandomizedRoundTrip() throws Exception { |
| final int distinct = TestUtil.nextInt(random(), 1, 10); |
| |
| String[] terms = new String[distinct]; |
| BytesRef[] termBytes = new BytesRef[distinct]; |
| for (int i = 0; i < distinct; ++i) { |
| terms[i] = TestUtil.randomRealisticUnicodeString(random()); |
| termBytes[i] = new BytesRef(terms[i]); |
| } |
| |
| final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream = |
| new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes); |
    //the reconstruction order of some random token streams is ambiguous; detect those cases below and skip them
| final boolean storeTermVectorPositions = random().nextBoolean(); |
| final int[] startOffsets = rTokenStream.getStartOffsets(); |
| final int[] positionsIncrements = rTokenStream.getPositionsIncrements(); |
| for (int i = 1; i < positionsIncrements.length; i++) { |
| if (storeTermVectorPositions && positionsIncrements[i] != 0) { |
| continue; |
| } |
| //TODO should RandomTokenStream ensure endOffsets for tokens at same position and same startOffset are greater |
| // than previous token's endOffset? That would increase the testable possibilities. |
| if (startOffsets[i] == startOffsets[i-1]) { |
| if (VERBOSE) |
| System.out.println("Skipping test because can't easily validate random token-stream is correct."); |
| rTokenStream.close(); |
| return; |
| } |
| } |
| |
| //sanity check itself |
| assertTokenStreamContents(rTokenStream, |
| rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(), |
| rTokenStream.getPositionsIncrements()); |
| |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir); |
| FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED); |
| myFieldType.setStoreTermVectors(true); |
| myFieldType.setStoreTermVectorOffsets(true); |
| myFieldType.setStoreTermVectorPositions(storeTermVectorPositions); |
    //payloads require positions; indexing would fail otherwise
| myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean()); |
| |
| Document doc = new Document(); |
| doc.add(new Field("field", rTokenStream, myFieldType)); |
| writer.addDocument(doc); |
| |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| assertEquals(1, reader.numDocs()); |
| |
| TokenStream vectorTokenStream = |
| TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1); |
| |
| //sometimes check payloads |
| PayloadAttribute payloadAttribute = null; |
| if (myFieldType.storeTermVectorPayloads() && usually()) { |
| payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class); |
| } |
| assertTokenStreamContents(vectorTokenStream, |
| rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(), |
| myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null); |
| //test payloads |
| if (payloadAttribute != null) { |
| vectorTokenStream.reset(); |
| for (int i = 0; vectorTokenStream.incrementToken(); i++) { |
| assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload()); |
| } |
| } |
| |
| reader.close(); |
| dir.close(); |
| rTokenStream.close(); |
| } |
| |
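  /** Limiting tokens by maxStartOffset should behave the same whether the tokens
   * come from a stored term vector or from re-analyzing the text. */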
| public void testMaxStartOffsetConsistency() throws IOException { |
| FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED); |
| tvFieldType.setStoreTermVectors(true); |
| tvFieldType.setStoreTermVectorOffsets(true); |
| tvFieldType.setStoreTermVectorPositions(true); |
| |
| Directory dir = newDirectory(); |
| |
| MockAnalyzer analyzer = new MockAnalyzer(random()); |
| analyzer.setEnableChecks(false);//we don't necessarily consume the whole stream because of limiting by startOffset |
| Document doc = new Document(); |
| final String TEXT = " f gg h"; |
| doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType)); |
| doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT))); |
| |
| IndexReader reader; |
| try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) { |
| writer.addDocument(doc); |
| reader = writer.getReader(); |
| } |
| try { |
| Fields tvFields = reader.getTermVectors(0); |
| for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) { |
| TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset); |
| TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset); |
| |
| //assert have same tokens, none of which has a start offset > maxStartOffset |
| final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class); |
| final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class); |
| tvStream.reset(); |
| anaStream.reset(); |
| while (tvStream.incrementToken()) { |
| assertTrue(anaStream.incrementToken()); |
| assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset()); |
| if (maxStartOffset >= 0) |
| assertTrue(tvOffAtt.startOffset() <= maxStartOffset); |
| } |
        assertFalse(anaStream.incrementToken());
| tvStream.end(); |
| anaStream.end(); |
| tvStream.close(); |
| anaStream.close(); |
| } |
| |
    } finally {
      reader.close();
      dir.close();
    }
  }
| } |