blob: 654d2f9178c176628aef1cd511441b4095d2673a [file] [log] [blame]
#if FEATURE_BREAKITERATOR
using J2N;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Store;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Search.PostingsHighlight
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// LUCENENET specific - These are the original tests from Lucene. They are only here as proof that we
/// can customize the <see cref="ICUPostingsHighlighter"/> to act like the PostingsHighlighter in Lucene,
/// which has slightly different default behavior than that of ICU because Lucene uses
/// the RuleBasedBreakIterator from the JDK, not that of ICU4J.
/// <para/>
/// These tests use a mock <see cref="PostingsHighlighter"/>, which is backed by an ICU
/// <see cref="ICU4N.Text.RuleBasedBreakIterator"/> that is customized a bit to act (sort of)
/// like the one in the JDK. However, this customized implementation is not a logical default for
/// the <see cref="ICUPostingsHighlighter"/>.
/// </summary>
[SuppressCodecs("MockFixedIntBlock", "MockVariableIntBlock", "MockSep", "MockRandom", "Lucene3x")]
public class TestPostingsHighlighterRanking : LuceneTestCase
{
    /// <summary>
    /// indexes a bunch of gibberish, and then highlights top(n).
    /// asserts that top(n) highlights is a subset of top(n+1) up to some max N
    /// </summary>
    // TODO: this only tests single-valued fields. we should also index multiple values per field!
    [Test]
    [Slow]
    public void TestRanking()
    {
        // number of documents: we will check each one
        int numDocs = AtLeast(100);
        // number of top-N snippets, we will check 1 .. N
        int maxTopN = 5;
        // maximum number of elements to put in a sentence.
        int maxSentenceLength = 10;
        // maximum number of sentences in a document
        int maxNumSentences = 20;

        Directory dir = NewDirectory();
        RandomIndexWriter iw = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
            this,
#endif
            Random, dir, new MockAnalyzer(Random, MockTokenizer.SIMPLE, true));

        // single reusable Document instance: the two Field objects below are
        // mutated per iteration before each AddDocument call
        Document document = new Document();
        Field id = new StringField("id", "", Field.Store.NO);
        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
        // the postings highlighter requires offsets to be indexed in the postings
        offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field body = new Field("body", "", offsetsType);
        document.Add(id);
        document.Add(body);

        // index numDocs documents, each built from 1..maxNumSentences random sentences
        for (int i = 0; i < numDocs; i++)
        {
            StringBuilder bodyText = new StringBuilder();
            int numSentences = TestUtil.NextInt32(Random, 1, maxNumSentences);
            for (int j = 0; j < numSentences; j++)
            {
                bodyText.Append(newSentence(Random, maxSentenceLength));
            }
            body.SetStringValue(bodyText.ToString());
            id.SetStringValue(i.ToString(CultureInfo.InvariantCulture));
            iw.AddDocument(document);
        }

        IndexReader ir = iw.GetReader();
        IndexSearcher searcher = NewSearcher(ir);
        // verify the top-n subset property for every indexed document
        for (int i = 0; i < numDocs; i++)
        {
            checkDocument(searcher, i, maxTopN);
        }
        iw.Dispose();
        ir.Dispose();
        dir.Dispose();
    }

    /// <summary>
    /// For each single-letter term a..z, checks one document with both a plain
    /// term query and a two-clause boolean query (this letter OR the next one).
    /// </summary>
    private void checkDocument(IndexSearcher @is, int doc, int maxTopN)
    {
        for (int ch = 'a'; ch <= 'z'; ch++)
        {
            Term term = new Term("body", "" + (char)ch);
            // check a simple term query
            checkQuery(@is, new TermQuery(term), doc, maxTopN);
            // check a boolean query
            BooleanQuery bq = new BooleanQuery();
            bq.Add(new TermQuery(term), Occur.SHOULD);
            // NOTE: for ch == 'z' this produces the term "{" (z + 1), which simply
            // matches nothing in the a..z corpus
            Term nextTerm = new Term("body", "" + (char)(ch + 1));
            bq.Add(new TermQuery(nextTerm), Occur.SHOULD);
            checkQuery(@is, bq, doc, maxTopN);
        }
    }

    /// <summary>
    /// Highlighter whose formatter merely records the passages it is handed,
    /// so tests can inspect raw passage offsets instead of formatted snippets.
    /// </summary>
    internal class CheckQueryPostingsHighlighter : PostingsHighlighter
    {
        internal FakePassageFormatter f = new FakePassageFormatter();

        public CheckQueryPostingsHighlighter(int maxLength)
            : base(maxLength)
        {
        }

        protected override PassageFormatter GetFormatter(string field)
        {
            // these tests only ever highlight the "body" field
            assertEquals("body", field);
            return f;
        }
    }

    /// <summary>
    /// Asserts that for each 1 &lt;= n &lt; maxTopN, the passages produced for a
    /// top-n highlight request are a subset of those produced for top-(n+1).
    /// </summary>
    private void checkQuery(IndexSearcher @is, Query query, int doc, int maxTopN)
    {
        for (int n = 1; n < maxTopN; n++)
        {
            CheckQueryPostingsHighlighter p1 = new CheckQueryPostingsHighlighter(int.MaxValue - 1);
            CheckQueryPostingsHighlighter p2 = new CheckQueryPostingsHighlighter(int.MaxValue - 1);

            // pin the search to the single document under test by AND-ing with its id
            BooleanQuery bq = new BooleanQuery(false);
            bq.Add(query, Occur.MUST);
            bq.Add(new TermQuery(new Term("id", doc.ToString(CultureInfo.InvariantCulture))), Occur.MUST);
            TopDocs td = @is.Search(bq, 1);
            p1.Highlight("body", bq, @is, td, n);
            p2.Highlight("body", bq, @is, td, n + 1);

            // the passages seen for top-n must be contained in those seen for top-(n+1)
            // (containsAll appears to be a J2N/test-framework set extension method)
            assertTrue(p2.f.seen.containsAll(p1.f.seen));
        }
    }

    /// <summary>
    /// returns a new random sentence, up to maxSentenceLength "words" in length.
    /// each word is a single character (a-z). The first one is capitalized.
    /// </summary>
    private String newSentence(Random r, int maxSentenceLength)
    {
        // NOTE(review): the lowercase append/toString calls look like Java-style
        // extension methods supplied by J2N/the test framework - confirm before
        // "fixing" the casing.
        StringBuilder sb = new StringBuilder();
        int numElements = TestUtil.NextInt32(r, 1, maxSentenceLength);
        for (int i = 0; i < numElements; i++)
        {
            if (sb.Length > 0)
            {
                // subsequent words: space-separated lowercase letters
                sb.append(' ');
                sb.append((char)TestUtil.NextInt32(r, 'a', 'z'));
            }
            else
            {
                // capitalize the first word to help breakiterator
                sb.append((char)TestUtil.NextInt32(r, 'A', 'Z'));
            }
        }
        sb.append(". "); // finalize sentence
        return sb.toString();
    }

    /// <summary>
    /// a fake formatter that doesn't actually format passages.
    /// instead it just collects them for asserts!
    /// </summary>
    internal class FakePassageFormatter : PassageFormatter
    {
        // every (start, end) passage offset pair this formatter has been handed
        internal ISet<Pair> seen = new JCG.HashSet<Pair>();

        public override object Format(Passage[] passages, String content)
        {
            foreach (Passage p in passages)
            {
                // verify some basics about the passage
                assertTrue(p.Score >= 0);
                assertTrue(p.NumMatches > 0);
                assertTrue(p.StartOffset >= 0);
                assertTrue(p.StartOffset <= content.Length);
                assertTrue(p.EndOffset >= p.StartOffset);
                assertTrue(p.EndOffset <= content.Length);
                // we use a very simple analyzer. so we can assert the matches are correct
                int lastMatchStart = -1;
                for (int i = 0; i < p.NumMatches; i++)
                {
                    BytesRef term = p.MatchTerms[i];
                    int matchStart = p.MatchStarts[i];
                    assertTrue(matchStart >= 0);
                    // must at least start within the passage
                    assertTrue(matchStart < p.EndOffset);
                    int matchEnd = p.MatchEnds[i];
                    assertTrue(matchEnd >= 0);
                    // always moving forward
                    assertTrue(matchStart >= lastMatchStart);
                    lastMatchStart = matchStart;
                    // single character terms
                    assertEquals(matchStart + 1, matchEnd);
                    // and the offsets must be correct...
                    assertEquals(1, term.Length);
                    assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture));
                }
                // record just the start/end offset for simplicity
                seen.Add(new Pair(p.StartOffset, p.EndOffset));
            }
            // return value is irrelevant to these tests; a sentinel keeps failures obvious
            return "bogus!!!!!!";
        }
    }

    /// <summary>
    /// Immutable (start, end) offset pair with value equality and a matching
    /// hash code, so passages can be collected in a set and compared.
    /// </summary>
    internal class Pair
    {
        internal readonly int start;
        internal readonly int end;

        internal Pair(int start, int end)
        {
            this.start = start;
            this.end = end;
        }

        public override int GetHashCode()
        {
            // classic Java-style 31-based hash over both fields,
            // consistent with Equals below
            int prime = 31;
            int result = 1;
            result = prime * result + end;
            result = prime * result + start;
            return result;
        }

        public override bool Equals(object obj)
        {
            if (this == obj)
            {
                return true;
            }
            if (obj == null)
            {
                return false;
            }
            // exact-type comparison (no subtype equality)
            if (GetType() != obj.GetType())
            {
                return false;
            }
            Pair other = (Pair)obj;
            if (end != other.end)
            {
                return false;
            }
            if (start != other.start)
            {
                return false;
            }
            return true;
        }

        public override string ToString()
        {
            return "Pair [start=" + start + ", end=" + end + "]";
        }
    }

    /// <summary>sets b=0 to disable passage length normalization</summary>
    [Test]
    public void TestCustomB()
    {
        Directory dir = NewDirectory();
        IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.SIMPLE, true));
        iwc.SetMergePolicy(NewLogMergePolicy());
        RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
        // offsets must be indexed for the highlighter
        offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field body = new Field("body", "", offsetsType);
        Document doc = new Document();
        doc.Add(body);
        body.SetStringValue("This is a test. This test is a better test but the sentence is excruiatingly long, " +
            "you have no idea how painful it was for me to type this long sentence into my IDE.");
        iw.AddDocument(doc);
        IndexReader ir = iw.GetReader();
        iw.Dispose();

        IndexSearcher searcher = NewSearcher(ir);
        PostingsHighlighter highlighter = new CustomBPostingsHighlighter();
        Query query = new TermQuery(new Term("body", "test"));
        TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
        assertEquals(1, topDocs.TotalHits);
        String[] snippets = highlighter.Highlight("body", query, searcher, topDocs, 1);
        assertEquals(1, snippets.Length);
        // with b=0 the long second sentence is not penalized for its length,
        // so it is selected as the top snippet
        assertTrue(snippets[0].StartsWith("This <b>test</b> is a better <b>test</b>", StringComparison.Ordinal));
        ir.Dispose();
        dir.Dispose();
    }

    /// <summary>
    /// Highlighter with a custom scorer whose second argument (b) is 0,
    /// disabling passage length normalization (see <see cref="TestCustomB"/>).
    /// </summary>
    internal class CustomBPostingsHighlighter : PostingsHighlighter
    {
        protected override PassageScorer GetScorer(string field)
        {
            return new PassageScorer(1.2f, 0, 87);
        }
    }

    /// <summary>sets k1=0 for simple coordinate-level match (# of query terms present)</summary>
    [Test]
    public void TestCustomK1()
    {
        Directory dir = NewDirectory();
        IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.SIMPLE, true));
        iwc.SetMergePolicy(NewLogMergePolicy());
        RandomIndexWriter iw = new RandomIndexWriter(Random, dir, iwc);
        FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
        // offsets must be indexed for the highlighter
        offsetsType.IndexOptions = (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field body = new Field("body", "", offsetsType);
        Document doc = new Document();
        doc.Add(body);
        // three sentences: foo-only, foo+bar, bar-heavy
        body.SetStringValue("This has only foo foo. " +
            "On the other hand this sentence contains both foo and bar. " +
            "This has only bar bar bar bar bar bar bar bar bar bar bar bar.");
        iw.AddDocument(doc);
        IndexReader ir = iw.GetReader();
        iw.Dispose();

        IndexSearcher searcher = NewSearcher(ir);
        PostingsHighlighter highlighter = new CustomK1PostingsHighlighter();
        BooleanQuery query = new BooleanQuery();
        query.Add(new TermQuery(new Term("body", "foo")), Occur.SHOULD);
        query.Add(new TermQuery(new Term("body", "bar")), Occur.SHOULD);
        TopDocs topDocs = searcher.Search(query, null, 10, Sort.INDEXORDER);
        assertEquals(1, topDocs.TotalHits);
        String[] snippets = highlighter.Highlight("body", query, searcher, topDocs, 1);
        assertEquals(1, snippets.Length);
        // with k1=0, term frequency doesn't matter: the sentence containing BOTH
        // query terms beats the bar-heavy sentence
        assertTrue(snippets[0].StartsWith("On the other hand", StringComparison.Ordinal));
        ir.Dispose();
        dir.Dispose();
    }

    /// <summary>
    /// Highlighter with a custom scorer whose first argument (k1) is 0, making
    /// scoring depend only on which query terms are present (see <see cref="TestCustomK1"/>).
    /// </summary>
    internal class CustomK1PostingsHighlighter : PostingsHighlighter
    {
        public CustomK1PostingsHighlighter()
            : base(10000)
        { }

        protected override PassageScorer GetScorer(string field)
        {
            return new PassageScorer(0, 0.75f, 87);
        }
    }
}
}
#endif