| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.uhighlight; |
| |
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Objects;
import java.util.Random;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
| |
| public class TestUnifiedHighlighterRanking extends LuceneTestCase { |
| |
| Analyzer indexAnalyzer; |
| |
| // note: all offset sources, by default, use term freq, so it shouldn't matter which we choose. |
| final FieldType fieldType = UHTestHelper.randomFieldType(random()); |
| |
| /** |
| * indexes a bunch of gibberish, and then highlights top(n). |
| * asserts that top(n) highlights is a subset of top(n+1) up to some max N |
| */ |
| // TODO: this only tests single-valued fields. we should also index multiple values per field! |
| public void testRanking() throws Exception { |
| // number of documents: we will check each one |
| final int numDocs = atLeast(20); |
| // number of top-N snippets, we will check 1 .. N |
| final int maxTopN = 3; |
| // maximum number of elements to put in a sentence. |
| final int maxSentenceLength = 10; |
| // maximum number of sentences in a document |
| final int maxNumSentences = 20; |
| |
| Directory dir = newDirectory(); |
| indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); |
| Document document = new Document(); |
| Field id = new StringField("id", "", Field.Store.NO); |
| Field body = new Field("body", "", fieldType); |
| document.add(id); |
| document.add(body); |
| |
| for (int i = 0; i < numDocs; i++) { |
| StringBuilder bodyText = new StringBuilder(); |
| int numSentences = TestUtil.nextInt(random(), 1, maxNumSentences); |
| for (int j = 0; j < numSentences; j++) { |
| bodyText.append(newSentence(random(), maxSentenceLength)); |
| } |
| body.setStringValue(bodyText.toString()); |
| id.setStringValue(Integer.toString(i)); |
| iw.addDocument(document); |
| } |
| |
| IndexReader ir = iw.getReader(); |
| IndexSearcher searcher = newSearcher(ir); |
| for (int i = 0; i < numDocs; i++) { |
| checkDocument(searcher, i, maxTopN); |
| } |
| iw.close(); |
| ir.close(); |
| dir.close(); |
| } |
| |
| private void checkDocument(IndexSearcher is, int doc, int maxTopN) throws IOException { |
| for (int ch = 'a'; ch <= 'z'; ch++) { |
| Term term = new Term("body", "" + (char) ch); |
| // check a simple term query |
| checkQuery(is, new TermQuery(term), doc, maxTopN); |
| // check a boolean query |
| Term nextTerm = new Term("body", "" + (char) (ch + 1)); |
| BooleanQuery bq = new BooleanQuery.Builder() |
| .add(new TermQuery(term), BooleanClause.Occur.SHOULD) |
| .add(new TermQuery(nextTerm), BooleanClause.Occur.SHOULD) |
| .build(); |
| checkQuery(is, bq, doc, maxTopN); |
| } |
| } |
| |
| private void checkQuery(IndexSearcher is, Query query, int doc, int maxTopN) throws IOException { |
| for (int n = 1; n < maxTopN; n++) { |
| final FakePassageFormatter f1 = new FakePassageFormatter(); |
| UnifiedHighlighter p1 = new UnifiedHighlighter(is, indexAnalyzer) { |
| @Override |
| protected PassageFormatter getFormatter(String field) { |
| assertEquals("body", field); |
| return f1; |
| } |
| }; |
| p1.setMaxLength(Integer.MAX_VALUE - 1); |
| |
| final FakePassageFormatter f2 = new FakePassageFormatter(); |
| UnifiedHighlighter p2 = new UnifiedHighlighter(is, indexAnalyzer) { |
| @Override |
| protected PassageFormatter getFormatter(String field) { |
| assertEquals("body", field); |
| return f2; |
| } |
| }; |
| p2.setMaxLength(Integer.MAX_VALUE - 1); |
| |
| BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); |
| queryBuilder.add(query, BooleanClause.Occur.MUST); |
| queryBuilder.add(new TermQuery(new Term("id", Integer.toString(doc))), BooleanClause.Occur.MUST); |
| BooleanQuery bq = queryBuilder.build(); |
| TopDocs td = is.search(bq, 1); |
| p1.highlight("body", bq, td, n); |
| p2.highlight("body", bq, td, n + 1); |
| assertTrue(f2.seen.containsAll(f1.seen)); |
| } |
| } |
| |
| /** |
| * returns a new random sentence, up to maxSentenceLength "words" in length. |
| * each word is a single character (a-z). The first one is capitalized. |
| */ |
| private String newSentence(Random r, int maxSentenceLength) { |
| StringBuilder sb = new StringBuilder(); |
| int numElements = TestUtil.nextInt(r, 1, maxSentenceLength); |
| for (int i = 0; i < numElements; i++) { |
| if (sb.length() > 0) { |
| sb.append(' '); |
| sb.append((char) TestUtil.nextInt(r, 'a', 'z')); |
| } else { |
| // capitalize the first word to help breakiterator |
| sb.append((char) TestUtil.nextInt(r, 'A', 'Z')); |
| } |
| } |
| sb.append(". "); // finalize sentence |
| return sb.toString(); |
| } |
| |
| /** |
| * a fake formatter that doesn't actually format passages. |
| * instead it just collects them for asserts! |
| */ |
| static class FakePassageFormatter extends PassageFormatter { |
| HashSet<Pair> seen = new HashSet<>(); |
| |
| @Override |
| public String format(Passage passages[], String content) { |
| for (Passage p : passages) { |
| // verify some basics about the passage |
| assertTrue(p.getScore() >= 0); |
| assertTrue(p.getNumMatches() > 0); |
| assertTrue(p.getStartOffset() >= 0); |
| assertTrue(p.getStartOffset() <= content.length()); |
| assertTrue(p.getEndOffset() >= p.getStartOffset()); |
| assertTrue(p.getEndOffset() <= content.length()); |
| // we use a very simple analyzer. so we can assert the matches are correct |
| int lastMatchStart = -1; |
| for (int i = 0; i < p.getNumMatches(); i++) { |
| BytesRef term = p.getMatchTerms()[i]; |
| int matchStart = p.getMatchStarts()[i]; |
| assertTrue(matchStart >= 0); |
| // must at least start within the passage |
| assertTrue(matchStart < p.getEndOffset()); |
| int matchEnd = p.getMatchEnds()[i]; |
| assertTrue(matchEnd >= 0); |
| // always moving forward |
| assertTrue(matchStart >= lastMatchStart); |
| lastMatchStart = matchStart; |
| // single character terms |
| assertEquals(matchStart + 1, matchEnd); |
| // and the offsets must be correct... |
| assertEquals(1, term.length); |
| assertEquals((char) term.bytes[term.offset], Character.toLowerCase(content.charAt(matchStart))); |
| } |
| // record just the start/end offset for simplicity |
| seen.add(new Pair(p.getStartOffset(), p.getEndOffset())); |
| } |
| return "bogus!!!!!!"; |
| } |
| } |
| |
| static class Pair { |
| final int start; |
| final int end; |
| |
| Pair(int start, int end) { |
| this.start = start; |
| this.end = end; |
| } |
| |
| @Override |
| public int hashCode() { |
| final int prime = 31; |
| int result = 1; |
| result = prime * result + end; |
| result = prime * result + start; |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) { |
| return true; |
| } |
| if (obj == null) { |
| return false; |
| } |
| if (getClass() != obj.getClass()) { |
| return false; |
| } |
| Pair other = (Pair) obj; |
| if (end != other.end) { |
| return false; |
| } |
| if (start != other.start) { |
| return false; |
| } |
| return true; |
| } |
| |
| @Override |
| public String toString() { |
| return "Pair [start=" + start + ", end=" + end + "]"; |
| } |
| } |
| |
| /** |
| * sets b=0 to disable passage length normalization |
| */ |
| public void testCustomB() throws Exception { |
| Directory dir = newDirectory(); |
| indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| IndexWriterConfig iwc = newIndexWriterConfig(indexAnalyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| Field body = new Field("body", "", fieldType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This is a test. This test is a better test but the sentence is excruiatingly long, " + |
| "you have no idea how painful it was for me to type this long sentence into my IDE."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { |
| @Override |
| protected Set<HighlightFlag> getFlags(String field) { |
| if (random().nextBoolean()) { |
| return EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES, HighlightFlag.WEIGHT_MATCHES); |
| } else { |
| return super.getFlags(field); |
| } |
| } |
| |
| @Override |
| protected PassageScorer getScorer(String field) { |
| return new PassageScorer(1.2f, 0, 87); |
| } |
| }; |
| Query query = new TermQuery(new Term("body", "test")); |
| TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits.value); |
| String snippets[] = highlighter.highlight("body", query, topDocs, 1); |
| assertEquals(1, snippets.length); |
| assertTrue(snippets[0].startsWith("This <b>test</b> is a better <b>test</b>")); |
| |
| ir.close(); |
| dir.close(); |
| } |
| |
| /** |
| * sets k1=0 for simple coordinate-level match (# of query terms present) |
| */ |
| public void testCustomK1() throws Exception { |
| Directory dir = newDirectory(); |
| indexAnalyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true); |
| IndexWriterConfig iwc = newIndexWriterConfig(indexAnalyzer); |
| iwc.setMergePolicy(newLogMergePolicy()); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| |
| Field body = new Field("body", "", fieldType); |
| Document doc = new Document(); |
| doc.add(body); |
| |
| body.setStringValue("This has only foo foo. " + |
| "On the other hand this sentence contains both foo and bar. " + |
| "This has only bar bar bar bar bar bar bar bar bar bar bar bar."); |
| iw.addDocument(doc); |
| |
| IndexReader ir = iw.getReader(); |
| iw.close(); |
| |
| IndexSearcher searcher = newSearcher(ir); |
| UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) { |
| @Override |
| protected Set<HighlightFlag> getFlags(String field) { |
| if (random().nextBoolean()) { |
| return EnumSet.of(HighlightFlag.MULTI_TERM_QUERY, HighlightFlag.PHRASES, HighlightFlag.WEIGHT_MATCHES); |
| } else { |
| return super.getFlags(field); |
| } |
| } |
| |
| @Override |
| protected PassageScorer getScorer(String field) { |
| return new PassageScorer(0, 0.75f, 87); |
| } |
| }; |
| BooleanQuery query = new BooleanQuery.Builder() |
| .add(new TermQuery(new Term("body", "foo")), BooleanClause.Occur.SHOULD) |
| .add(new TermQuery(new Term("body", "bar")), BooleanClause.Occur.SHOULD) |
| .build(); |
| TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER); |
| assertEquals(1, topDocs.totalHits.value); |
| String snippets[] = highlighter.highlight("body", query, topDocs, 1); |
| assertEquals(1, snippets.length); |
| assertTrue(snippets[0].startsWith("On the other hand")); |
| |
| ir.close(); |
| dir.close(); |
| } |
| } |