#if FEATURE_BREAKITERATOR
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Search.Spans;
using Lucene.Net.Store;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Text;
using IndexOptions = Lucene.Net.Index.IndexOptions;
namespace Lucene.Net.Search.PostingsHighlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Some tests that override <see cref="PostingsHighlighter.GetIndexAnalyzer(string)"/> to
/// highlight wildcard, fuzzy, etc. queries.
/// </summary>
/// <remarks>
/// LUCENENET specific - These are the original tests from Lucene. They are only here as proof that we
/// can customize the <see cref="ICUPostingsHighlighter"/> to act like the PostingsHighlighter in Lucene,
/// which has slightly different default behavior than that of ICU because Lucene uses
/// the RuleBasedBreakIterator from the JDK, not that of ICU4J.
/// <para/>
/// These tests use a mock <see cref="PostingsHighlighter"/>, which is backed by an ICU
/// <see cref="ICU4N.Text.RuleBasedBreakIterator"/> that is customized a bit to act (sort of)
/// like the one in the JDK. However, this customized implementation is not a logical default for
/// the <see cref="ICUPostingsHighlighter"/>.
/// </remarks>
[SuppressCodecs("MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene3x")]
public class TestMultiTermHighlighting : LuceneTestCase
{
internal class PostingsHighlighterAnalyzerHelper : PostingsHighlighter
{
private readonly Analyzer analyzer;
public PostingsHighlighterAnalyzerHelper(Analyzer analyzer)
{
this.analyzer = analyzer;
}
protected override Analyzer GetIndexAnalyzer(string field)
{
return analyzer;
}
}
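// LUCENENET specific - not part of the original test, just an illustrative sketch:
// in addition to the analyzer, the highlighter's break iterator could also be
// overridden to control passage boundaries. The GetBreakIterator override below and
// the ICU4N BreakIterator.GetSentenceInstance() call are assumptions about the
// available API, not behavior these tests rely on.
internal class PostingsHighlighterBreakIteratorHelper : PostingsHighlighterAnalyzerHelper
{
public PostingsHighlighterBreakIteratorHelper(Analyzer analyzer)
: base(analyzer)
{
}
protected override ICU4N.Text.BreakIterator GetBreakIterator(string field)
{
// Sentence-based breaking, similar to the highlighter's default behavior:
return ICU4N.Text.BreakIterator.GetSentenceInstance();
}
}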
[Test]
public void TestWildcards()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
Query query = new WildcardQuery(new Term("body", "te*"));
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(new WildcardQuery(new Term("bogus", "te*")), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestOnePrefix()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
Query query = new PrefixQuery(new Term("body", "te"));
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(new PrefixQuery(new Term("bogus", "te")), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestOneRegexp()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
Query query = new RegexpQuery(new Term("body", "te.*"));
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(new RegexpQuery(new Term("bogus", "te.*")), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestOneFuzzy()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
Query query = new FuzzyQuery(new Term("body", "tets"), 1);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// with prefix
query = new FuzzyQuery(new Term("body", "tets"), 1, 2);
topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// wrong field
BooleanQuery bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(new FuzzyQuery(new Term("bogus", "tets"), 1), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestRanges()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
Query query = TermRangeQuery.NewStringRange("body", "ta", "tf", true, true);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// null start
query = TermRangeQuery.NewStringRange("body", null, "tf", true, true);
topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This <b>is</b> <b>a</b> <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> <b>a</b> <b>one</b> <b>sentence</b> <b>document</b>.", snippets[1]);
// null end
query = TermRangeQuery.NewStringRange("body", "ta", null, true, true);
topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("<b>This</b> is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// exact start inclusive
query = TermRangeQuery.NewStringRange("body", "test", "tf", true, true);
topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// exact end inclusive
query = TermRangeQuery.NewStringRange("body", "ta", "test", true, true);
topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// exact start exclusive
BooleanQuery bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(TermRangeQuery.NewStringRange("body", "test", "tf", false, true), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
// exact end exclusive
bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(TermRangeQuery.NewStringRange("body", "ta", "test", true, false), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
// wrong field
bq = new BooleanQuery();
bq.Add(new MatchAllDocsQuery(), Occur.SHOULD);
bq.Add(TermRangeQuery.NewStringRange("bogus", "ta", "tf", true, true), Occur.SHOULD);
topDocs = searcher.Search(bq, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", bq, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestWildcardInBoolean()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
BooleanQuery query = new BooleanQuery();
query.Add(new WildcardQuery(new Term("body", "te*")), Occur.SHOULD);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
// must not
query = new BooleanQuery();
query.Add(new MatchAllDocsQuery(), Occur.SHOULD);
query.Add(new WildcardQuery(new Term("bogus", "te*")), Occur.MUST_NOT);
topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a test.", snippets[0]);
assertEquals("Test a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestWildcardInDisjunctionMax()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
DisjunctionMaxQuery query = new DisjunctionMaxQuery(0);
query.Add(new WildcardQuery(new Term("body", "te*")));
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestSpanWildcard()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
Query query = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestSpanOr()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
SpanQuery childQuery = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanOrQuery(new SpanQuery[] { childQuery });
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestSpanNear()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
SpanQuery childQuery = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanNearQuery(new SpanQuery[] { childQuery }, 0, true);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestSpanNot()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
SpanQuery include = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
SpanQuery exclude = new SpanTermQuery(new Term("body", "bogus"));
Query query = new SpanNotQuery(include, exclude);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
[Test]
public void TestSpanPositionCheck()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("This is a test.");
iw.AddDocument(doc);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
SpanQuery childQuery = new SpanMultiTermQueryWrapper<WildcardQuery>(new WildcardQuery(new Term("body", "te*")));
Query query = new SpanFirstQuery(childQuery, 1000000);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(2, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(2, snippets.Length);
assertEquals("This is a <b>test</b>.", snippets[0]);
assertEquals("<b>Test</b> a one sentence document.", snippets[1]);
ir.Dispose();
dir.Dispose();
}
internal class PostingsHighlighterAnalyzerAndFormatterHelper : PostingsHighlighterAnalyzerHelper
{
private readonly PassageFormatter formatter;
public PostingsHighlighterAnalyzerAndFormatterHelper(Analyzer analyzer, PassageFormatter formatter)
: base(analyzer)
{
this.formatter = formatter;
}
protected override PassageFormatter GetFormatter(string field)
{
return formatter;
}
}
internal class PassageFormatterHelper : PassageFormatter
{
public override object Format(Passage[] passages, string content)
{
// Copied from DefaultPassageFormatter, but
// tweaked to include the matched term:
StringBuilder sb = new StringBuilder();
int pos = 0;
foreach (Passage passage in passages)
{
// don't add an ellipsis if it's the first one, or if it's connected.
if (passage.StartOffset > pos && pos > 0)
{
sb.Append("... ");
}
pos = passage.StartOffset;
for (int i = 0; i < passage.NumMatches; i++)
{
int start = passage.MatchStarts[i];
int end = passage.MatchEnds[i];
// it's possible to have overlapping terms
if (start > pos)
{
sb.Append(content, pos, start - pos);
}
if (end > pos)
{
sb.Append("<b>");
int startPos = Math.Max(pos, start);
sb.Append(content, startPos, end - startPos);
sb.Append('(');
sb.Append(passage.MatchTerms[i].Utf8ToString());
sb.Append(')');
sb.Append("</b>");
pos = end;
}
}
// it's possible a "term" from the analyzer could span a sentence boundary.
sb.Append(content, pos, Math.Max(pos, passage.EndOffset) - pos);
pos = passage.EndOffset;
}
return sb.ToString();
}
}
/// <summary>
/// Runs a query with two MTQs and confirms the formatter
/// can tell which query matched which hit.
/// </summary>
[Test]
public void TestWhichMTQMatched()
{
Directory dir = NewDirectory();
// use simpleanalyzer for more natural tokenization (else "test." is a token)
Analyzer analyzer = new MockAnalyzer(Random, MockTokenizer.SIMPLE, true);
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.SetMergePolicy(NewLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.Add(body);
body.SetStringValue("Test a one sentence document.");
iw.AddDocument(doc);
IndexReader ir = iw.GetReader();
iw.Dispose();
IndexSearcher searcher = NewSearcher(ir);
PostingsHighlighter highlighter = new PostingsHighlighterAnalyzerHelper(analyzer);
BooleanQuery query = new BooleanQuery();
query.Add(new WildcardQuery(new Term("body", "te*")), Occur.SHOULD);
query.Add(new WildcardQuery(new Term("body", "one")), Occur.SHOULD);
query.Add(new WildcardQuery(new Term("body", "se*")), Occur.SHOULD);
TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
assertEquals(1, topDocs.TotalHits);
String[] snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(1, snippets.Length);
// Default formatter just bolds each hit:
assertEquals("<b>Test</b> a <b>one</b> <b>sentence</b> document.", snippets[0]);
// Now use our own formatter, that also stuffs the
// matching term's text into the result:
highlighter = new PostingsHighlighterAnalyzerAndFormatterHelper(analyzer, new PassageFormatterHelper());
assertEquals(1, topDocs.TotalHits);
snippets = highlighter.Highlight("body", query, searcher, topDocs);
assertEquals(1, snippets.Length);
// Custom formatter stuffs the matching term's text into each hit:
assertEquals("<b>Test(body:te*)</b> a <b>one(body:one)</b> <b>sentence(body:se*)</b> document.", snippets[0]);
ir.Dispose();
dir.Dispose();
}
}
}
#endif