using Lucene.Net.Diagnostics;
using Lucene.Net.Documents;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Search;
using Lucene.Net.Util.Automaton;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using Assert = Lucene.Net.TestFramework.Assert;
using Console = Lucene.Net.Util.SystemConsole;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using Automaton = Lucene.Net.Util.Automaton.Automaton;
using BasicAutomata = Lucene.Net.Util.Automaton.BasicAutomata;
using BytesRef = Lucene.Net.Util.BytesRef;
using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton;
using Directory = Lucene.Net.Store.Directory;
using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
using Document = Documents.Document;
    using Field = Documents.Field;
    using Int32Field = Documents.Int32Field;
using LineFileDocs = Lucene.Net.Util.LineFileDocs;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer;
using RegExp = Lucene.Net.Util.Automaton.RegExp;
using TestUtil = Lucene.Net.Util.TestUtil;
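    // Tests TermsEnum: sequential iteration (MoveNext), SeekCeil/SeekExact (with
    // and without cached TermStates), floor blocks in the terms dictionary, and
    // Terms.Intersect against compiled automata.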
[SuppressCodecs("SimpleText", "Memory", "Direct")]
[TestFixture]
public class TestTermsEnum : LuceneTestCase
{
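        // Indexes random line-file documents, snapshots all terms of the "body"
        // field, then randomly interleaves MoveNext, SeekCeil and SeekExact calls,
        // checking each result against a binary search over the in-memory term list.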
[Test]
public virtual void Test()
{
Random random = new Random(Random.Next());
LineFileDocs docs = new LineFileDocs(random, DefaultCodecSupportsDocValues);
Directory d = NewDirectory();
MockAnalyzer analyzer = new MockAnalyzer(LuceneTestCase.Random);
analyzer.MaxTokenLength = TestUtil.NextInt32(LuceneTestCase.Random, 1, IndexWriter.MAX_TERM_LENGTH);
RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
LuceneTestCase.Random, d, analyzer);
int numDocs = AtLeast(10);
for (int docCount = 0; docCount < numDocs; docCount++)
{
w.AddDocument(docs.NextDoc());
}
IndexReader r = w.GetReader();
w.Dispose();
List<BytesRef> terms = new List<BytesRef>();
TermsEnum termsEnum = MultiFields.GetTerms(r, "body").GetEnumerator();
while (termsEnum.MoveNext())
{
terms.Add(BytesRef.DeepCopyOf(termsEnum.Term));
}
if (Verbose)
{
Console.WriteLine("TEST: " + terms.Count + " terms");
}
int upto = -1;
int iters = AtLeast(200);
for (int iter = 0; iter < iters; iter++)
{
bool isEnd;
if (upto != -1 && LuceneTestCase.Random.NextBoolean())
{
// next
if (Verbose)
{
Console.WriteLine("TEST: iter next");
}
isEnd = termsEnum.MoveNext() == false;
upto++;
if (isEnd)
{
if (Verbose)
{
Console.WriteLine(" end");
}
Assert.AreEqual(upto, terms.Count);
upto = -1;
}
else
{
if (Verbose)
{
Console.WriteLine(" got term=" + termsEnum.Term.Utf8ToString() + " expected=" + terms[upto].Utf8ToString());
}
Assert.IsTrue(upto < terms.Count);
Assert.AreEqual(terms[upto], termsEnum.Term);
}
}
else
{
BytesRef target;
string exists;
if (LuceneTestCase.Random.NextBoolean())
{
// likely fake term
if (LuceneTestCase.Random.NextBoolean())
{
target = new BytesRef(TestUtil.RandomSimpleString(LuceneTestCase.Random));
}
else
{
target = new BytesRef(TestUtil.RandomRealisticUnicodeString(LuceneTestCase.Random));
}
exists = "likely not";
}
else
{
// real term
target = terms[LuceneTestCase.Random.Next(terms.Count)];
exists = "yes";
}
upto = terms.BinarySearch(target);
if (LuceneTestCase.Random.NextBoolean())
{
if (Verbose)
{
Console.WriteLine("TEST: iter seekCeil target=" + target.Utf8ToString() + " exists=" + exists);
}
// seekCeil
TermsEnum.SeekStatus status = termsEnum.SeekCeil(target);
if (Verbose)
{
Console.WriteLine(" got " + status);
}
if (upto < 0)
{
upto = -(upto + 1);
if (upto >= terms.Count)
{
Assert.AreEqual(TermsEnum.SeekStatus.END, status);
upto = -1;
}
else
{
Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status);
Assert.AreEqual(terms[upto], termsEnum.Term);
}
}
else
{
Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status);
Assert.AreEqual(terms[upto], termsEnum.Term);
}
}
else
{
if (Verbose)
{
Console.WriteLine("TEST: iter seekExact target=" + target.Utf8ToString() + " exists=" + exists);
}
// seekExact
bool result = termsEnum.SeekExact(target);
if (Verbose)
{
Console.WriteLine(" got " + result);
}
if (upto < 0)
{
Assert.IsFalse(result);
upto = -1;
}
else
{
Assert.IsTrue(result);
Assert.AreEqual(target, termsEnum.Term);
}
}
}
}
r.Dispose();
d.Dispose();
docs.Dispose();
}
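        // Flushes the pending terms into a single document tagged with the given id,
        // recording term -> id so TestIntersectRandom can verify postings later.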
private void AddDoc(RandomIndexWriter w, ICollection<string> terms, IDictionary<BytesRef, int?> termToID, int id)
{
Document doc = new Document();
doc.Add(new Int32Field("id", id, Field.Store.NO));
if (Verbose)
{
Console.WriteLine("TEST: addDoc id:" + id + " terms=" + terms);
}
foreach (string s2 in terms)
{
doc.Add(NewStringField("f", s2, Field.Store.NO));
termToID[new BytesRef(s2)] = id;
}
w.AddDocument(doc);
terms.Clear();
}
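        // Steps the compiled (byte-level) run automaton over the term and returns
        // whether it ends in an accept state.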
private bool Accepts(CompiledAutomaton c, BytesRef b)
{
int state = c.RunAutomaton.InitialState;
for (int idx = 0; idx < b.Length; idx++)
{
Assert.IsTrue(state != -1);
state = c.RunAutomaton.Step(state, b.Bytes[b.Offset + idx] & 0xff);
}
return c.RunAutomaton.IsAccept(state);
}
        // Tests Terms.Intersect: compiles an automaton from a random subset of the
        // indexed terms and verifies the intersected enum against a brute-force scan
        // of the sorted term array.
[Test]
public virtual void TestIntersectRandom()
{
Directory dir = NewDirectory();
RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
Random, dir);
int numTerms = AtLeast(300);
            //int numTerms = 50;
ISet<string> terms = new JCG.HashSet<string>();
ICollection<string> pendingTerms = new List<string>();
IDictionary<BytesRef, int?> termToID = new Dictionary<BytesRef, int?>();
int id = 0;
while (terms.Count != numTerms)
{
string s = RandomString;
if (!terms.Contains(s))
{
terms.Add(s);
pendingTerms.Add(s);
if (Random.Next(20) == 7)
{
AddDoc(w, pendingTerms, termToID, id++);
}
}
}
AddDoc(w, pendingTerms, termToID, id++);
BytesRef[] termsArray = new BytesRef[terms.Count];
ISet<BytesRef> termsSet = new JCG.HashSet<BytesRef>();
{
int upto = 0;
foreach (string s in terms)
{
BytesRef b = new BytesRef(s);
termsArray[upto++] = b;
termsSet.Add(b);
}
Array.Sort(termsArray);
}
if (Verbose)
{
Console.WriteLine("\nTEST: indexed terms (unicode order):");
foreach (BytesRef t in termsArray)
{
Console.WriteLine(" " + t.Utf8ToString() + " -> id:" + termToID[t]);
}
}
IndexReader r = w.GetReader();
w.Dispose();
// NOTE: intentional insanity!!
FieldCache.Int32s docIDToID = FieldCache.DEFAULT.GetInt32s(SlowCompositeReaderWrapper.Wrap(r), "id", false);
for (int iter = 0; iter < 10 * RandomMultiplier; iter++)
{
// TODO: can we also test infinite As here...?
// From the random terms, pick some ratio and compile an
// automaton:
ISet<string> acceptTerms = new JCG.HashSet<string>();
JCG.SortedSet<BytesRef> sortedAcceptTerms = new JCG.SortedSet<BytesRef>();
double keepPct = Random.NextDouble();
Automaton a;
if (iter == 0)
{
if (Verbose)
{
Console.WriteLine("\nTEST: empty automaton");
}
a = BasicAutomata.MakeEmpty();
}
else
{
if (Verbose)
{
Console.WriteLine("\nTEST: keepPct=" + keepPct);
}
foreach (string s in terms)
{
string s2;
if (Random.NextDouble() <= keepPct)
{
s2 = s;
}
else
{
s2 = RandomString;
}
acceptTerms.Add(s2);
sortedAcceptTerms.Add(new BytesRef(s2));
}
a = BasicAutomata.MakeStringUnion(sortedAcceptTerms);
}
if (Random.NextBoolean())
{
if (Verbose)
{
Console.WriteLine("TEST: reduce the automaton");
}
a.Reduce();
}
CompiledAutomaton c = new CompiledAutomaton(a, true, false);
BytesRef[] acceptTermsArray = new BytesRef[acceptTerms.Count];
ISet<BytesRef> acceptTermsSet = new JCG.HashSet<BytesRef>();
int upto = 0;
foreach (string s in acceptTerms)
{
BytesRef b = new BytesRef(s);
acceptTermsArray[upto++] = b;
acceptTermsSet.Add(b);
Assert.IsTrue(Accepts(c, b));
}
Array.Sort(acceptTermsArray);
if (Verbose)
{
Console.WriteLine("\nTEST: accept terms (unicode order):");
foreach (BytesRef t in acceptTermsArray)
{
Console.WriteLine(" " + t.Utf8ToString() + (termsSet.Contains(t) ? " (exists)" : ""));
}
Console.WriteLine(a.ToDot());
}
for (int iter2 = 0; iter2 < 100; iter2++)
{
BytesRef startTerm = acceptTermsArray.Length == 0 || Random.NextBoolean() ? null : acceptTermsArray[Random.Next(acceptTermsArray.Length)];
if (Verbose)
{
Console.WriteLine("\nTEST: iter2=" + iter2 + " startTerm=" + (startTerm == null ? "<null>" : startTerm.Utf8ToString()));
if (startTerm != null)
{
int state = c.RunAutomaton.InitialState;
for (int idx = 0; idx < startTerm.Length; idx++)
{
int label = startTerm.Bytes[startTerm.Offset + idx] & 0xff;
Console.WriteLine(" state=" + state + " label=" + label);
state = c.RunAutomaton.Step(state, label);
Assert.IsTrue(state != -1);
}
Console.WriteLine(" state=" + state);
}
}
TermsEnum te = MultiFields.GetTerms(r, "f").Intersect(c, startTerm);
int loc;
if (startTerm == null)
{
loc = 0;
}
else
{
loc = Array.BinarySearch(termsArray, BytesRef.DeepCopyOf(startTerm));
if (loc < 0)
{
loc = -(loc + 1);
}
else
{
// startTerm exists in index
loc++;
}
}
while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc]))
{
loc++;
}
DocsEnum docsEnum = null;
while (loc < termsArray.Length)
{
BytesRef expected = termsArray[loc];
Assert.IsTrue(te.MoveNext());
BytesRef actual = te.Term;
if (Verbose)
{
Console.WriteLine("TEST: next() expected=" + expected.Utf8ToString() + " actual=" + (actual == null ? "null" : actual.Utf8ToString()));
}
Assert.AreEqual(expected, actual);
Assert.AreEqual(1, te.DocFreq);
docsEnum = TestUtil.Docs(Random, te, null, docsEnum, DocsFlags.NONE);
int docID = docsEnum.NextDoc();
Assert.IsTrue(docID != DocIdSetIterator.NO_MORE_DOCS);
Assert.AreEqual(docIDToID.Get(docID), (int)termToID[expected]);
do
{
loc++;
} while (loc < termsArray.Length && !acceptTermsSet.Contains(termsArray[loc]));
}
Assert.IsFalse(te.MoveNext());
}
}
r.Dispose();
dir.Dispose();
}
        private const string FIELD = "field";
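        // Builds an index with a single string field, one term per document.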
private IndexReader MakeIndex(Directory d, params string[] terms)
{
var iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
/*
iwc.SetCodec(new StandardCodec(minTermsInBlock, maxTermsInBlock));
*/
using var w = new RandomIndexWriter(Random, d, iwc);
foreach (string term in terms)
{
var doc = new Document();
var f = NewStringField(FIELD, term, Field.Store.NO);
doc.Add(f);
w.AddDocument(doc);
}
return w.GetReader();
}
private int DocFreq(IndexReader r, string term)
{
return r.DocFreq(new Term(FIELD, term));
}
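        // Seeks forwards, backwards and to missing terms within two small term
        // blocks ("aa*" and "bb*") that need no floor arcs.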
[Test]
public virtual void TestEasy()
{
// No floor arcs:
using var d = NewDirectory();
using var r = MakeIndex(d, "aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");
// First term in block:
Assert.AreEqual(1, DocFreq(r, "aa0"));
// Scan forward to another term in same block
Assert.AreEqual(1, DocFreq(r, "aa2"));
Assert.AreEqual(1, DocFreq(r, "aa"));
// Reset same block then scan forwards
Assert.AreEqual(1, DocFreq(r, "aa1"));
// Not found, in same block
Assert.AreEqual(0, DocFreq(r, "aa5"));
// Found, in same block
Assert.AreEqual(1, DocFreq(r, "aa2"));
// Not found in index:
Assert.AreEqual(0, DocFreq(r, "b0"));
// Found:
Assert.AreEqual(1, DocFreq(r, "aa2"));
// Found, rewind:
Assert.AreEqual(1, DocFreq(r, "aa0"));
// First term in block:
Assert.AreEqual(1, DocFreq(r, "bb0"));
// Scan forward to another term in same block
Assert.AreEqual(1, DocFreq(r, "bb2"));
// Reset same block then scan forwards
Assert.AreEqual(1, DocFreq(r, "bb1"));
// Not found, in same block
Assert.AreEqual(0, DocFreq(r, "bb5"));
// Found, in same block
Assert.AreEqual(1, DocFreq(r, "bb2"));
// Not found in index:
Assert.AreEqual(0, DocFreq(r, "b0"));
// Found:
Assert.AreEqual(1, DocFreq(r, "bb2"));
// Found, rewind:
Assert.AreEqual(1, DocFreq(r, "bb0"));
}
// tests:
// - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix)
// - term that's entirely in the index
[Test]
public virtual void TestFloorBlocks()
{
var terms = new[] { "aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx" };
using var d = NewDirectory();
using var r = MakeIndex(d, terms);
// First term in first block:
Assert.AreEqual(1, DocFreq(r, "aa0"));
Assert.AreEqual(1, DocFreq(r, "aa4"));
// No block
Assert.AreEqual(0, DocFreq(r, "bb0"));
// Second block
Assert.AreEqual(1, DocFreq(r, "aa4"));
// Backwards to prior floor block:
Assert.AreEqual(1, DocFreq(r, "aa0"));
// Forwards to last floor block:
Assert.AreEqual(1, DocFreq(r, "aa9"));
Assert.AreEqual(0, DocFreq(r, "a"));
Assert.AreEqual(1, DocFreq(r, "aa"));
Assert.AreEqual(0, DocFreq(r, "a"));
Assert.AreEqual(1, DocFreq(r, "aa"));
// Forwards to last floor block:
Assert.AreEqual(1, DocFreq(r, "xx"));
Assert.AreEqual(1, DocFreq(r, "aa1"));
Assert.AreEqual(0, DocFreq(r, "yy"));
Assert.AreEqual(1, DocFreq(r, "xx"));
Assert.AreEqual(1, DocFreq(r, "aa9"));
Assert.AreEqual(1, DocFreq(r, "xx"));
Assert.AreEqual(1, DocFreq(r, "aa4"));
TermsEnum te = MultiFields.GetTerms(r, FIELD).GetEnumerator();
            while (te.MoveNext())
            {
                // drain the enumerator; originally this printed every term:
                //Console.WriteLine("TEST: next term=" + te.Term.Utf8ToString());
            }
Assert.IsTrue(SeekExact(te, "aa1"));
Assert.AreEqual("aa2", Next(te));
Assert.IsTrue(SeekExact(te, "aa8"));
Assert.AreEqual("aa9", Next(te));
Assert.AreEqual("xx", Next(te));
TestRandomSeeks(r, terms);
}
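        // After every document containing "field" is deleted and segments are
        // merged, the field should expose either no Terms or an empty enumerator.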
[Test]
public virtual void TestZeroTerms()
{
var d = NewDirectory();
RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
Random, d);
Document doc = new Document();
doc.Add(NewTextField("field", "one two three", Field.Store.NO));
doc = new Document();
doc.Add(NewTextField("field2", "one two three", Field.Store.NO));
w.AddDocument(doc);
w.Commit();
w.DeleteDocuments(new Term("field", "one"));
w.ForceMerge(1);
IndexReader r = w.GetReader();
w.Dispose();
Assert.AreEqual(1, r.NumDocs);
Assert.AreEqual(1, r.MaxDoc);
Terms terms = MultiFields.GetTerms(r, "field");
if (terms != null)
{
Assert.IsFalse(terms.GetEnumerator().MoveNext());
}
r.Dispose();
d.Dispose();
}
        // Alternative: TestUtil.RandomSimpleString(Random)
        private string RandomString => TestUtil.RandomRealisticUnicodeString(Random);
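        // Builds a random term set (sometimes sharing a longish common prefix) and
        // runs the random-seek test over it.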
[Test]
public virtual void TestRandomTerms()
{
var terms = new string[TestUtil.NextInt32(Random, 1, AtLeast(1000))];
var seen = new JCG.HashSet<string>();
var allowEmptyString = Random.NextBoolean();
if (Random.Next(10) == 7 && terms.Length > 2)
{
// Sometimes add a bunch of terms sharing a longish common prefix:
int numTermsSamePrefix = Random.Next(terms.Length / 2);
if (numTermsSamePrefix > 0)
{
                    // Pick a prefix of at least 5 chars:
                    string prefix;
                    do
                    {
                        prefix = RandomString;
                    } while (prefix.Length < 5);
while (seen.Count < numTermsSamePrefix)
{
string t = prefix + RandomString;
if (!seen.Contains(t))
{
terms[seen.Count] = t;
seen.Add(t);
}
}
}
}
while (seen.Count < terms.Length)
{
string t = RandomString;
if (!seen.Contains(t) && (allowEmptyString || t.Length != 0))
{
terms[seen.Count] = t;
seen.Add(t);
}
}
using var d = NewDirectory();
using var r = MakeIndex(d, terms);
TestRandomSeeks(r, terms);
}
// sugar
private bool SeekExact(TermsEnum te, string term)
{
return te.SeekExact(new BytesRef(term));
}
// sugar
private string Next(TermsEnum te)
{
if (!te.MoveNext())
{
return null;
}
else
{
return te.Term.Utf8ToString();
}
}
private BytesRef GetNonExistTerm(BytesRef[] terms)
{
            while (true)
            {
                BytesRef t = new BytesRef(RandomString);
                if (Array.BinarySearch(terms, t) < 0)
                {
                    return t;
                }
            }
}
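        // Pairs a term with its cached TermState so a later SeekExact(term, state)
        // can jump straight back to it.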
private class TermAndState
{
public BytesRef Term { get; }
public TermState State { get; }
public TermAndState(BytesRef term, TermState state)
{
this.Term = term;
this.State = state;
}
}
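        // Randomly seeks over the given terms (by term, by cached TermState, or to
        // non-existent terms), then scans forward with MoveNext, verifying every
        // position against the sorted in-memory term array.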
private void TestRandomSeeks(IndexReader r, params string[] validTermStrings)
{
BytesRef[] validTerms = new BytesRef[validTermStrings.Length];
for (int termIDX = 0; termIDX < validTermStrings.Length; termIDX++)
{
validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
}
Array.Sort(validTerms);
if (Verbose)
{
Console.WriteLine("TEST: " + validTerms.Length + " terms:");
foreach (BytesRef t in validTerms)
{
Console.WriteLine(" " + t.Utf8ToString() + " " + t);
}
}
TermsEnum te = MultiFields.GetTerms(r, FIELD).GetEnumerator();
int END_LOC = -validTerms.Length - 1;
IList<TermAndState> termStates = new List<TermAndState>();
for (int iter = 0; iter < 100 * RandomMultiplier; iter++)
{
BytesRef t;
int loc;
TermState termState;
if (Random.Next(6) == 4)
{
                    // pick a term that doesn't exist:
t = GetNonExistTerm(validTerms);
termState = null;
if (Verbose)
{
Console.WriteLine("\nTEST: invalid term=" + t.Utf8ToString());
}
loc = Array.BinarySearch(validTerms, t);
}
else if (termStates.Count != 0 && Random.Next(4) == 1)
{
TermAndState ts = termStates[Random.Next(termStates.Count)];
t = ts.Term;
loc = Array.BinarySearch(validTerms, t);
Assert.IsTrue(loc >= 0);
termState = ts.State;
if (Verbose)
{
Console.WriteLine("\nTEST: valid termState term=" + t.Utf8ToString());
}
}
else
{
// pick valid term
loc = Random.Next(validTerms.Length);
t = BytesRef.DeepCopyOf(validTerms[loc]);
termState = null;
if (Verbose)
{
Console.WriteLine("\nTEST: valid term=" + t.Utf8ToString());
}
}
// seekCeil or seekExact:
bool doSeekExact = Random.NextBoolean();
if (termState != null)
{
if (Verbose)
{
Console.WriteLine(" seekExact termState");
}
te.SeekExact(t, termState);
}
else if (doSeekExact)
{
if (Verbose)
{
Console.WriteLine(" seekExact");
}
Assert.AreEqual(loc >= 0, te.SeekExact(t));
}
else
{
if (Verbose)
{
Console.WriteLine(" seekCeil");
}
TermsEnum.SeekStatus result = te.SeekCeil(t);
if (Verbose)
{
Console.WriteLine(" got " + result);
}
if (loc >= 0)
{
Assert.AreEqual(TermsEnum.SeekStatus.FOUND, result);
}
else if (loc == END_LOC)
{
Assert.AreEqual(TermsEnum.SeekStatus.END, result);
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(loc >= -validTerms.Length);
Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, result);
}
}
if (loc >= 0)
{
Assert.AreEqual(t, te.Term);
}
else if (doSeekExact)
{
// TermsEnum is unpositioned if seekExact returns false
continue;
}
else if (loc == END_LOC)
{
continue;
}
else
{
loc = -loc - 1;
Assert.AreEqual(validTerms[loc], te.Term);
}
// Do a bunch of next's after the seek
int numNext = Random.Next(validTerms.Length);
for (int nextCount = 0; nextCount < numNext; nextCount++)
{
if (Verbose)
{
Console.WriteLine("\nTEST: next loc=" + loc + " of " + validTerms.Length);
}
bool moved = te.MoveNext();
//BytesRef t2 = te.Term;
loc++;
if (loc == validTerms.Length)
{
//Assert.IsNull(t2); // LUCENENET specific - accessing the Term after MoveNext() returns false results in an assertion failure
Assert.IsFalse(moved);
break;
}
else
{
Assert.AreEqual(validTerms[loc], te.Term);
if (Random.Next(40) == 17 && termStates.Count < 100)
{
termStates.Add(new TermAndState(validTerms[loc], te.GetTermState()));
}
}
}
}
}
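        // Intersects a match-all (".*") automaton with a three-term segment, with
        // and without a startTerm, checking terms and doc IDs in order.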
[Test]
public virtual void TestIntersectBasic()
{
Directory dir = NewDirectory();
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
iwc.SetMergePolicy(new LogDocMergePolicy());
RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);
Document doc = new Document();
doc.Add(NewTextField("field", "aaa", Field.Store.NO));
w.AddDocument(doc);
doc = new Document();
doc.Add(NewStringField("field", "bbb", Field.Store.NO));
w.AddDocument(doc);
doc = new Document();
doc.Add(NewTextField("field", "ccc", Field.Store.NO));
w.AddDocument(doc);
w.ForceMerge(1);
DirectoryReader r = w.GetReader();
w.Dispose();
AtomicReader sub = GetOnlySegmentReader(r);
Terms terms = sub.Fields.GetTerms("field");
Automaton automaton = (new RegExp(".*", RegExpSyntax.NONE)).ToAutomaton();
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
TermsEnum te = terms.Intersect(ca, null);
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("aaa", te.Term.Utf8ToString());
Assert.AreEqual(0, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("bbb", te.Term.Utf8ToString());
Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("ccc", te.Term.Utf8ToString());
Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsFalse(te.MoveNext());
te = terms.Intersect(ca, new BytesRef("abc"));
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("bbb", te.Term.Utf8ToString());
Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("ccc", te.Term.Utf8ToString());
Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsFalse(te.MoveNext());
te = terms.Intersect(ca, new BytesRef("aaa"));
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("bbb", te.Term.Utf8ToString());
Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("ccc", te.Term.Utf8ToString());
Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsFalse(te.MoveNext());
r.Dispose();
dir.Dispose();
}
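        // Exercises the startTerm parameter of Terms.Intersect with a ".*d"
        // automaton: seeking between terms, past a missing ceil label on the second
        // arc, and beyond the last matching term.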
[Test]
public virtual void TestIntersectStartTerm()
{
Directory dir = NewDirectory();
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
iwc.SetMergePolicy(new LogDocMergePolicy());
RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);
Document doc = new Document();
doc.Add(NewStringField("field", "abc", Field.Store.NO));
w.AddDocument(doc);
doc = new Document();
doc.Add(NewStringField("field", "abd", Field.Store.NO));
w.AddDocument(doc);
doc = new Document();
doc.Add(NewStringField("field", "acd", Field.Store.NO));
w.AddDocument(doc);
doc = new Document();
doc.Add(NewStringField("field", "bcd", Field.Store.NO));
w.AddDocument(doc);
w.ForceMerge(1);
DirectoryReader r = w.GetReader();
w.Dispose();
AtomicReader sub = GetOnlySegmentReader(r);
Terms terms = sub.Fields.GetTerms("field");
Automaton automaton = (new RegExp(".*d", RegExpSyntax.NONE)).ToAutomaton();
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
TermsEnum te;
// should seek to startTerm
te = terms.Intersect(ca, new BytesRef("aad"));
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("abd", te.Term.Utf8ToString());
Assert.AreEqual(1, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("acd", te.Term.Utf8ToString());
Assert.AreEqual(2, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("bcd", te.Term.Utf8ToString());
Assert.AreEqual(3, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsFalse(te.MoveNext());
// should fail to find ceil label on second arc, rewind
te = terms.Intersect(ca, new BytesRef("add"));
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("bcd", te.Term.Utf8ToString());
Assert.AreEqual(3, te.Docs(null, null, DocsFlags.NONE).NextDoc());
Assert.IsFalse(te.MoveNext());
// should reach end
te = terms.Intersect(ca, new BytesRef("bcd"));
Assert.IsFalse(te.MoveNext());
te = terms.Intersect(ca, new BytesRef("ddd"));
Assert.IsFalse(te.MoveNext());
r.Dispose();
dir.Dispose();
}
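        // Verifies Terms.Intersect when the empty string itself is an indexed term,
        // both with a null startTerm and with startTerm = "".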
[Test]
public virtual void TestIntersectEmptyString()
{
Directory dir = NewDirectory();
IndexWriterConfig iwc = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random));
iwc.SetMergePolicy(new LogDocMergePolicy());
RandomIndexWriter w = new RandomIndexWriter(Random, dir, iwc);
Document doc = new Document();
doc.Add(NewStringField("field", "", Field.Store.NO));
doc.Add(NewStringField("field", "abc", Field.Store.NO));
w.AddDocument(doc);
doc = new Document();
            // add the empty string to both documents, so that singletonDocID == -1.
            // For an FST-based term dict, we expect the first arc to be
            // flagged with HAS_FINAL_OUTPUT
doc.Add(NewStringField("field", "abc", Field.Store.NO));
doc.Add(NewStringField("field", "", Field.Store.NO));
w.AddDocument(doc);
w.ForceMerge(1);
DirectoryReader r = w.GetReader();
w.Dispose();
AtomicReader sub = GetOnlySegmentReader(r);
Terms terms = sub.Fields.GetTerms("field");
Automaton automaton = (new RegExp(".*", RegExpSyntax.NONE)).ToAutomaton(); // accept ALL
CompiledAutomaton ca = new CompiledAutomaton(automaton, false, false);
TermsEnum te = terms.Intersect(ca, null);
DocsEnum de;
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("", te.Term.Utf8ToString());
de = te.Docs(null, null, DocsFlags.NONE);
Assert.AreEqual(0, de.NextDoc());
Assert.AreEqual(1, de.NextDoc());
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("abc", te.Term.Utf8ToString());
de = te.Docs(null, null, DocsFlags.NONE);
Assert.AreEqual(0, de.NextDoc());
Assert.AreEqual(1, de.NextDoc());
Assert.IsFalse(te.MoveNext());
// pass empty string
te = terms.Intersect(ca, new BytesRef(""));
Assert.IsTrue(te.MoveNext());
Assert.AreEqual("abc", te.Term.Utf8ToString());
de = te.Docs(null, null, DocsFlags.NONE);
Assert.AreEqual(0, de.NextDoc());
Assert.AreEqual(1, de.NextDoc());
Assert.IsFalse(te.MoveNext());
r.Dispose();
dir.Dispose();
}
}
}