| package org.apache.lucene.search.postingshighlight; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.text.BreakIterator; |
| import java.util.Map; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.StoredDocument; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.ScoreDoc; |
| import org.apache.lucene.search.Sort; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| |
| @SuppressCodecs({"MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom"}) |
| public class TestPostingsHighlighter extends LuceneTestCase { |
| |
| public void testBasics() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| iw.addDocument(doc); |
| body.setStringValue("Highlighting the first term. Hope it works."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs); |
| assertEquals(2, snippets.length); |
| assertEquals("Just a test <b>highlighting</b> from postings. ", snippets[0]); |
| assertEquals("<b>Highlighting</b> the first term. ", snippets[1]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| // simple test with one sentence documents. |
| public void testOneSentence() throws Exception { |
| Directory dir = newDirectory(); |
| // use simpleanalyzer for more natural tokenization (else "test." is a token) |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test."); |
| iw.addDocument(doc); |
| body.setStringValue("Test a one sentence document."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs); |
| assertEquals(2, snippets.length); |
| assertEquals("This is a <b>test</b>.", snippets[0]); |
| assertEquals("<b>Test</b> a one sentence document.", snippets[1]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| // simple test with multiple values that make a result longer than maxLength. |
| public void testMaxLengthWithMultivalue() throws Exception { |
| Directory dir = newDirectory(); |
| // use simpleanalyzer for more natural tokenization (else "test." is a token) |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Document doc = new Document(); |
| |
| for(int i = 0; i < 3 ; i++) { |
| Field body = new Field("body", "", offsetsType); |
| body.setStringValue("This is a multivalued field"); |
| doc.add(body); |
| } |
| |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(40); |
| Query query = new TermQuery(new Term("body", "field")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs); |
| assertEquals(1, snippets.length); |
| assertTrue("Snippet should have maximum 40 characters plus the pre and post tags", |
| snippets[0].length() == (40 + "<b></b>".length())); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testMultipleFields() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Field title = new Field("title", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| doc.add(title); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| title.setStringValue("I am hoping for the best."); |
| iw.addDocument(doc); |
| body.setStringValue("Highlighting the first term. Hope it works."); |
| title.setStringValue("But best may not be good enough."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| BooleanQuery query = new BooleanQuery(); |
| query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD); |
| query.add(new TermQuery(new Term("title", "best")), BooleanClause.Occur.SHOULD); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| Map<String,String[]> snippets = highlighter.highlightFields(new String [] { "body", "title" }, query, searcher, topDocs); |
| assertEquals(2, snippets.size()); |
| assertEquals("Just a test <b>highlighting</b> from postings. ", snippets.get("body")[0]); |
| assertEquals("<b>Highlighting</b> the first term. ", snippets.get("body")[1]); |
| assertEquals("I am hoping for the <b>best</b>.", snippets.get("title")[0]); |
| assertEquals("But <b>best</b> may not be good enough.", snippets.get("title")[1]); |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testMultipleTerms() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| iw.addDocument(doc); |
| body.setStringValue("Highlighting the first term. Hope it works."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| BooleanQuery query = new BooleanQuery(); |
| query.add(new TermQuery(new Term("body", "highlighting")), BooleanClause.Occur.SHOULD); |
| query.add(new TermQuery(new Term("body", "just")), BooleanClause.Occur.SHOULD); |
| query.add(new TermQuery(new Term("body", "first")), BooleanClause.Occur.SHOULD); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs); |
| assertEquals(2, snippets.length); |
| assertEquals("<b>Just</b> a test <b>highlighting</b> from postings. ", snippets[0]); |
| assertEquals("<b>Highlighting</b> the <b>first</b> term. ", snippets[1]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testMultiplePassages() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| iw.addDocument(doc); |
| body.setStringValue("This test is another test. Not a good sentence. Test test test test."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(2, snippets.length); |
| assertEquals("This is a <b>test</b>. Just a <b>test</b> highlighting from postings. ", snippets[0]); |
| assertEquals("This <b>test</b> is another <b>test</b>. ... <b>Test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[1]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testUserFailedToIndexOffsets() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType positionsType = new FieldType(TextField.TYPE_STORED); |
| positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); |
| Field body = new Field("body", "", positionsType); |
| Field title = new StringField("title", "", Field.Store.YES); |
| Document doc = new Document(); |
| doc.add(body); |
| doc.add(title); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| title.setStringValue("test"); |
| iw.addDocument(doc); |
| body.setStringValue("This test is another test. Not a good sentence. Test test test test."); |
| title.setStringValue("test"); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| try { |
| highlighter.highlight("body", query, searcher, topDocs, 2); |
| fail("did not hit expected exception"); |
| } catch (IllegalArgumentException iae) { |
| // expected |
| } |
| |
| try { |
| highlighter.highlight("title", new TermQuery(new Term("title", "test")), searcher, topDocs, 2); |
| fail("did not hit expected exception"); |
| } catch (IllegalArgumentException iae) { |
| // expected |
| } |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testBuddhism() throws Exception { |
| String text = "This eight-volume set brings together seminal papers in Buddhist studies from a vast " + |
| "range of academic disciplines published over the last forty years. With a new introduction " + |
| "by the editor, this collection is a unique and unrivalled research resource for both " + |
| "student and scholar. Coverage includes: - Buddhist origins; early history of Buddhism in " + |
| "South and Southeast Asia - early Buddhist Schools and Doctrinal History; Theravada Doctrine " + |
| "- the Origins and nature of Mahayana Buddhism; some Mahayana religious topics - Abhidharma " + |
| "and Madhyamaka - Yogacara, the Epistemological tradition, and Tathagatagarbha - Tantric " + |
| "Buddhism (Including China and Japan); Buddhism in Nepal and Tibet - Buddhism in South and " + |
| "Southeast Asia, and - Buddhism in China, East Asia, and Japan."; |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); |
| |
| FieldType positionsType = new FieldType(TextField.TYPE_STORED); |
| positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", text, positionsType); |
| Document document = new Document(); |
| document.add(body); |
| iw.addDocument(document); |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| IndexSearcher searcher = newSearcher(ir); |
| PhraseQuery query = new PhraseQuery(); |
| query.add(new Term("body", "buddhist")); |
| query.add(new Term("body", "origins")); |
| TopDocs topDocs = searcher.search(query, 10); |
| assertEquals(1, topDocs.totalHits); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertTrue(snippets[0].contains("<b>Buddhist</b> <b>origins</b>")); |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testCuriousGeorge() throws Exception { |
| String text = "It’s the formula for success for preschoolers—Curious George and fire trucks! " + |
| "Curious George and the Firefighters is a story based on H. A. and Margret Rey’s " + |
| "popular primate and painted in the original watercolor and charcoal style. " + |
| "Firefighters are a famously brave lot, but can they withstand a visit from one curious monkey?"; |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); |
| FieldType positionsType = new FieldType(TextField.TYPE_STORED); |
| positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", text, positionsType); |
| Document document = new Document(); |
| document.add(body); |
| iw.addDocument(document); |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| IndexSearcher searcher = newSearcher(ir); |
| PhraseQuery query = new PhraseQuery(); |
| query.add(new Term("body", "curious")); |
| query.add(new Term("body", "george")); |
| TopDocs topDocs = searcher.search(query, 10); |
| assertEquals(1, topDocs.totalHits); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertFalse(snippets[0].contains("<b>Curious</b>Curious")); |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testCambridgeMA() throws Exception { |
| BufferedReader r = new BufferedReader(new InputStreamReader( |
| this.getClass().getResourceAsStream("CambridgeMA.utf8"), "UTF-8")); |
| String text = r.readLine(); |
| r.close(); |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); |
| FieldType positionsType = new FieldType(TextField.TYPE_STORED); |
| positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", text, positionsType); |
| Document document = new Document(); |
| document.add(body); |
| iw.addDocument(document); |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| IndexSearcher searcher = newSearcher(ir); |
| BooleanQuery query = new BooleanQuery(); |
| query.add(new TermQuery(new Term("body", "porter")), BooleanClause.Occur.SHOULD); |
| query.add(new TermQuery(new Term("body", "square")), BooleanClause.Occur.SHOULD); |
| query.add(new TermQuery(new Term("body", "massachusetts")), BooleanClause.Occur.SHOULD); |
| TopDocs topDocs = searcher.search(query, 10); |
| assertEquals(1, topDocs.totalHits); |
| PostingsHighlighter highlighter = new PostingsHighlighter(Integer.MAX_VALUE-1); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertTrue(snippets[0].contains("<b>Square</b>")); |
| assertTrue(snippets[0].contains("<b>Porter</b>")); |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testPassageRanking() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertEquals("This is a <b>test</b>. ... Feel free to <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testBooleanMustNot() throws Exception { |
| Directory dir = newDirectory(); |
| Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, analyzer); |
| FieldType positionsType = new FieldType(TextField.TYPE_STORED); |
| positionsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "This sentence has both terms. This sentence has only terms.", positionsType); |
| Document document = new Document(); |
| document.add(body); |
| iw.addDocument(document); |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| IndexSearcher searcher = newSearcher(ir); |
| BooleanQuery query = new BooleanQuery(); |
| query.add(new TermQuery(new Term("body", "terms")), BooleanClause.Occur.SHOULD); |
| BooleanQuery query2 = new BooleanQuery(); |
| query.add(query2, BooleanClause.Occur.SHOULD); |
| query2.add(new TermQuery(new Term("body", "both")), BooleanClause.Occur.MUST_NOT); |
| TopDocs topDocs = searcher.search(query, 10); |
| assertEquals(1, topDocs.totalHits); |
| PostingsHighlighter highlighter = new PostingsHighlighter(Integer.MAX_VALUE-1); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertFalse(snippets[0].contains("<b>both</b>")); |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testHighlightAllText() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(10000) { |
| @Override |
| protected BreakIterator getBreakIterator(String field) { |
| return new WholeBreakIterator(); |
| } |
| }; |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertEquals("This is a <b>test</b>. Just highlighting from postings. This is also a much sillier <b>test</b>. Feel free to <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testSpecificDocIDs() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| iw.addDocument(doc); |
| body.setStringValue("Highlighting the first term. Hope it works."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(2, topDocs.totalHits); |
| ScoreDoc[] hits = topDocs.scoreDocs; |
| int[] docIDs = new int[2]; |
| docIDs[0] = hits[0].doc; |
| docIDs[1] = hits[1].doc; |
| String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 1 }).get("body"); |
| assertEquals(2, snippets.length); |
| assertEquals("Just a test <b>highlighting</b> from postings. ", snippets[0]); |
| assertEquals("<b>Highlighting</b> the first term. ", snippets[1]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testCustomFieldValueSource() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| Document doc = new Document(); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_NOT_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| final String text = "This is a test. Just highlighting from postings. This is also a much sillier test. Feel free to test test test test test test test."; |
| Field body = new Field("body", text, offsetsType); |
| doc.add(body); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| |
| PostingsHighlighter highlighter = new PostingsHighlighter(10000) { |
| @Override |
| protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException { |
| assert fields.length == 1; |
| assert docids.length == 1; |
| String[][] contents = new String[1][1]; |
| contents[0][0] = text; |
| return contents; |
| } |
| |
| @Override |
| protected BreakIterator getBreakIterator(String field) { |
| return new WholeBreakIterator(); |
| } |
| }; |
| |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs, 2); |
| assertEquals(1, snippets.length); |
| assertEquals("This is a <b>test</b>. Just highlighting from postings. This is also a much sillier <b>test</b>. Feel free to <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b> <b>test</b>.", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| /** Make sure highlighter returns first N sentences if |
| * there were no hits. */ |
| public void testEmptyHighlights() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Document doc = new Document(); |
| |
| Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); |
| doc.add(body); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| int[] docIDs = new int[] {0}; |
| String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); |
| assertEquals(1, snippets.length); |
| assertEquals("test this is. another sentence this test has. ", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| /** Make sure highlighter we can customize how emtpy |
| * highlight is returned. */ |
| public void testCustomEmptyHighlights() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Document doc = new Document(); |
| |
| Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); |
| doc.add(body); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter() { |
| @Override |
| public Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) { |
| return new Passage[0]; |
| } |
| }; |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| int[] docIDs = new int[] {0}; |
| String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); |
| assertEquals(1, snippets.length); |
| assertNull(snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| /** Make sure highlighter returns whole text when there |
| * are no hits and BreakIterator is null. */ |
| public void testEmptyHighlightsWhole() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Document doc = new Document(); |
| |
| Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); |
| doc.add(body); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(10000) { |
| @Override |
| protected BreakIterator getBreakIterator(String field) { |
| return new WholeBreakIterator(); |
| } |
| }; |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| int[] docIDs = new int[] {0}; |
| String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); |
| assertEquals(1, snippets.length); |
| assertEquals("test this is. another sentence this test has. far away is that planet.", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| /** Make sure highlighter is OK with entirely missing |
| * field. */ |
| public void testFieldIsMissing() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Document doc = new Document(); |
| |
| Field body = new Field("body", "test this is. another sentence this test has. far away is that planet.", offsetsType); |
| doc.add(body); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("bogus", "highlighting")); |
| int[] docIDs = new int[] {0}; |
| String snippets[] = highlighter.highlightFields(new String[] {"bogus"}, query, searcher, docIDs, new int[] { 2 }).get("bogus"); |
| assertEquals(1, snippets.length); |
| assertNull(snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testFieldIsJustSpace() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| |
| Document doc = new Document(); |
| doc.add(new Field("body", " ", offsetsType)); |
| doc.add(new Field("id", "id", offsetsType)); |
| iw.addDocument(doc); |
| |
| doc = new Document(); |
| doc.add(new Field("body", "something", offsetsType)); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; |
| |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| int[] docIDs = new int[1]; |
| docIDs[0] = docID; |
| String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); |
| assertEquals(1, snippets.length); |
| assertEquals(" ", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testFieldIsEmptyString() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| |
| Document doc = new Document(); |
| doc.add(new Field("body", "", offsetsType)); |
| doc.add(new Field("id", "id", offsetsType)); |
| iw.addDocument(doc); |
| |
| doc = new Document(); |
| doc.add(new Field("body", "something", offsetsType)); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| int docID = searcher.search(new TermQuery(new Term("id", "id")), 1).scoreDocs[0].doc; |
| |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| int[] docIDs = new int[1]; |
| docIDs[0] = docID; |
| String snippets[] = highlighter.highlightFields(new String[] {"body"}, query, searcher, docIDs, new int[] { 2 }).get("body"); |
| assertEquals(1, snippets.length); |
| assertNull(snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testMultipleDocs() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| |
| int numDocs = atLeast(100); |
| for(int i=0;i<numDocs;i++) { |
| Document doc = new Document(); |
| String content = "the answer is " + i; |
| if ((i & 1) == 0) { |
| content += " some more terms"; |
| } |
| doc.add(new Field("body", content, offsetsType)); |
| doc.add(newStringField("id", ""+i, Field.Store.YES)); |
| iw.addDocument(doc); |
| |
| if (random().nextInt(10) == 2) { |
| iw.commit(); |
| } |
| } |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| Query query = new TermQuery(new Term("body", "answer")); |
| TopDocs hits = searcher.search(query, numDocs); |
| assertEquals(numDocs, hits.totalHits); |
| |
| String snippets[] = highlighter.highlight("body", query, searcher, hits); |
| assertEquals(numDocs, snippets.length); |
| for(int hit=0;hit<numDocs;hit++) { |
| StoredDocument doc = searcher.doc(hits.scoreDocs[hit].doc); |
| int id = Integer.parseInt(doc.get("id")); |
| String expected = "the <b>answer</b> is " + id; |
| if ((id & 1) == 0) { |
| expected += " some more terms"; |
| } |
| assertEquals(expected, snippets[hit]); |
| } |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testMultipleSnippetSizes() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Field title = new Field("title", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| doc.add(title); |
| |
| body.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| title.setStringValue("This is a test. Just a test highlighting from postings. Feel free to ignore."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter(); |
| BooleanQuery query = new BooleanQuery(); |
| query.add(new TermQuery(new Term("body", "test")), BooleanClause.Occur.SHOULD); |
| query.add(new TermQuery(new Term("title", "test")), BooleanClause.Occur.SHOULD); |
| Map<String,String[]> snippets = highlighter.highlightFields(new String[] { "title", "body" }, query, searcher, new int[] { 0 }, new int[] { 1, 2 }); |
| String titleHighlight = snippets.get("title")[0]; |
| String bodyHighlight = snippets.get("body")[0]; |
| assertEquals("This is a <b>test</b>. ", titleHighlight); |
| assertEquals("This is a <b>test</b>. Just a <b>test</b> highlighting from postings. ", bodyHighlight); |
| ir.close(); |
| dir.close(); |
| } |
| |
| public void testEncode() throws Exception { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Field body = new Field("body", "", offsetsType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. Just a test highlighting from <i>postings</i>. Feel free to ignore."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter() { |
| @Override |
| protected PassageFormatter getFormatter(String field) { |
| return new DefaultPassageFormatter("<b>", "</b>", "... ", true); |
| } |
| }; |
| Query query = new TermQuery(new Term("body", "highlighting")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs); |
| assertEquals(1, snippets.length); |
| assertEquals("Just a test <b>highlighting</b> from <i>postings</i>. ", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| /** customizing the gap separator to force a sentence break */ |
| public void testGapSeparator() throws Exception { |
| Directory dir = newDirectory(); |
| // use simpleanalyzer for more natural tokenization (else "test." is a token) |
| IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType offsetsType = new FieldType(TextField.TYPE_STORED); |
| offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); |
| Document doc = new Document(); |
| |
| Field body1 = new Field("body", "", offsetsType); |
| body1.setStringValue("This is a multivalued field"); |
| doc.add(body1); |
| |
| Field body2 = new Field("body", "", offsetsType); |
| body2.setStringValue("This is something different"); |
| doc.add(body2); |
| |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| PostingsHighlighter highlighter = new PostingsHighlighter() { |
| @Override |
| protected char getMultiValuedSeparator(String field) { |
| assert field.equals("body"); |
| return '\u2029'; |
| } |
| }; |
| Query query = new TermQuery(new Term("body", "field")); |
| TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits); |
| String snippets[] = highlighter.highlight("body", query, searcher, topDocs); |
| assertEquals(1, snippets.length); |
| assertEquals("This is a multivalued <b>field</b>\u2029", snippets[0]); |
| |
| ir.close(); |
| dir.close(); |
| } |
| } |