// blob: 08a4e9e6958816dd377ab354bf1f4db1417b90a7 [file] [log] [blame]
using Lucene.Net.Documents;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Util.Automaton;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using Console = Lucene.Net.Util.SystemConsole;
namespace Lucene.Net.Search
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Automaton = Lucene.Net.Util.Automaton.Automaton;
using AutomatonTestUtil = Lucene.Net.Util.Automaton.AutomatonTestUtil;
using BytesRef = Lucene.Net.Util.BytesRef;
using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton;
using CharsRef = Lucene.Net.Util.CharsRef;
using Codec = Lucene.Net.Codecs.Codec;
using Directory = Lucene.Net.Store.Directory;
using Document = Documents.Document;
using Field = Field;
using FilteredTermsEnum = Lucene.Net.Index.FilteredTermsEnum;
using IndexReader = Lucene.Net.Index.IndexReader;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer;
using MockTokenizer = Lucene.Net.Analysis.MockTokenizer;
using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter;
using RegExp = Lucene.Net.Util.Automaton.RegExp;
using Term = Lucene.Net.Index.Term;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
using TestUtil = Lucene.Net.Util.TestUtil;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
/// <summary>
/// Creates an index with random unicode terms, generates random regexps,
/// and validates the results against a simple implementation.
/// </summary>
[TestFixture]
public class TestRegexpRandom2 : LuceneTestCase
{
// Two searchers opened over the same reader in SetUp; AssertSame runs the
// RegexpQuery on searcher1 and the DumbRegexpQuery on searcher2.
protected internal IndexSearcher searcher1;
protected internal IndexSearcher searcher2;
// Reader obtained from the index writer in SetUp and disposed in TearDown.
private IndexReader reader;
// Directory the test index is written to; created in SetUp and disposed in TearDown.
private Directory dir;
// Name of the indexed field; SetUp randomly picks either "field" or the empty string.
protected internal string fieldName;
/// <summary>
/// Builds the index used by the tests in this fixture: <c>AtLeast(200)</c> documents,
/// each carrying one random unicode string in a single string field, written through a
/// <see cref="RandomIndexWriter"/>. Two searchers are then opened over the resulting reader.
/// </summary>
[SetUp]
public override void SetUp()
{
    base.SetUp();
    dir = NewDirectory();
    fieldName = Random.NextBoolean() ? "field" : ""; // sometimes use an empty string as field name
    RandomIndexWriter writer = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.KEYWORD, false)).SetMaxBufferedDocs(TestUtil.NextInt32(Random, 50, 1000)));

    // One Document/Field pair is reused for every added document; only the field value changes.
    Document doc = new Document();
    Field field = NewStringField(fieldName, "", Field.Store.NO);
    doc.Add(field);

    // The generated strings are remembered only so they can be printed in verbose mode.
    List<string> terms = new List<string>();
    int num = AtLeast(200);
    for (int i = 0; i < num; i++)
    {
        string s = TestUtil.RandomUnicodeString(Random);
        field.SetStringValue(s);
        terms.Add(s);
        writer.AddDocument(doc);
    }

    if (Verbose)
    {
        // utf16 order: List<string>.Sort() without a comparer falls back to the
        // culture-sensitive default string comparison, which is NOT UTF-16 code unit
        // order, so the ordinal comparer has to be requested explicitly.
        terms.Sort(StringComparer.Ordinal);
        Console.WriteLine("UTF16 order:");
        foreach (string s in terms)
        {
            Console.WriteLine(" " + UnicodeUtil.ToHexString(s));
        }
    }

    reader = writer.GetReader();
    searcher1 = NewSearcher(reader);
    searcher2 = NewSearcher(reader);
    writer.Dispose();
}
/// <summary>
/// Releases the per-test index: disposes the reader, then the directory,
/// and finally runs the base class tear-down.
/// </summary>
[TearDown]
public override void TearDown()
{
    // Order matters: the reader goes first, then the directory, then the base fixture.
    this.reader.Dispose();
    this.dir.Dispose();
    base.TearDown();
}
/// <summary>
/// A deliberately naive regexp query used as the reference implementation: the pattern is
/// compiled to an automaton once, and each term handed to the filter is decoded to UTF-16
/// and run through that automaton. The filter only ever answers YES or NO.
/// </summary>
private class DumbRegexpQuery : MultiTermQuery
{
    private readonly Automaton automaton;

    /// <summary>
    /// Parses the text of <paramref name="term"/> as a regular expression with the given
    /// <paramref name="flags"/> and keeps the resulting automaton; the term's field becomes
    /// the query field.
    /// </summary>
    internal DumbRegexpQuery(Term term, RegExpSyntax flags)
        : base(term.Field)
    {
        automaton = new RegExp(term.Text(), flags).ToAutomaton();
    }

    /// <summary>
    /// Wraps the enumerator of <paramref name="terms"/> in the brute-force filter.
    /// </summary>
    protected override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
    {
        TermsEnum wrapped = terms.GetEnumerator();
        return new SimpleAutomatonTermsEnum(this, wrapped);
    }

    /// <summary>
    /// Filtering terms enum that accepts a term exactly when the query's automaton matches it.
    /// </summary>
    private sealed class SimpleAutomatonTermsEnum : FilteredTermsEnum
    {
        private readonly TestRegexpRandom2.DumbRegexpQuery outerInstance;
        private CharacterRunAutomaton matcher;
        // Scratch buffer reused for every term to hold its UTF-16 form.
        private readonly CharsRef decoded = new CharsRef(10);

        internal SimpleAutomatonTermsEnum(TestRegexpRandom2.DumbRegexpQuery outerInstance, TermsEnum tenum)
            : base(tenum)
        {
            this.outerInstance = outerInstance;
            this.matcher = new CharacterRunAutomaton(outerInstance.automaton);
            // Start from the empty term.
            SetInitialSeekTerm(new BytesRef(""));
        }

        protected override AcceptStatus Accept(BytesRef term)
        {
            // The automaton runs over chars, so convert the UTF-8 term bytes first.
            UnicodeUtil.UTF8toUTF16(term.Bytes, term.Offset, term.Length, decoded);
            if (matcher.Run(decoded.Chars, 0, decoded.Length))
            {
                return AcceptStatus.YES;
            }
            return AcceptStatus.NO;
        }
    }

    /// <summary>
    /// Returns <paramref name="field"/> immediately followed by the automaton's string form.
    /// </summary>
    public override string ToString(string field)
    {
        string prefix = field.ToString();
        string automatonText = automaton.ToString();
        return prefix + automatonText;
    }
}
/// <summary>
/// Feeds a batch of randomly generated regular expressions through <c>AssertSame</c>.
/// </summary>
[Test]
public virtual void TestRegexps()
{
    // We generate awful regexps: good for testing.
    // But with the preflex ("Lucene3x") codec the test can be very slow, so it gets fewer iterations.
    int iterations;
    if (Codec.Default.Name.Equals("Lucene3x", StringComparison.Ordinal))
    {
        iterations = 100 * RandomMultiplier;
    }
    else
    {
        iterations = AtLeast(1000);
    }

    for (int remaining = iterations; remaining > 0; remaining--)
    {
        string pattern = AutomatonTestUtil.RandomRegexp(Random);
        if (Verbose)
        {
            Console.WriteLine("TEST: regexp=" + pattern);
        }
        AssertSame(pattern);
    }
}
/// <summary>
/// Runs <paramref name="regexp"/> both as a real <see cref="RegexpQuery"/> and as the
/// brute-force <see cref="DumbRegexpQuery"/>, asks each searcher for its top 25 results,
/// and hands both result arrays to <c>CheckHits.CheckEqual</c>.
/// </summary>
protected internal virtual void AssertSame(string regexp)
{
    // Both queries are built from the same field/pattern pair with no optional syntax enabled.
    Term pattern = new Term(fieldName, regexp);
    RegexpQuery smart = new RegexpQuery(pattern, RegExpSyntax.NONE);
    DumbRegexpQuery dumb = new DumbRegexpQuery(pattern, RegExpSyntax.NONE);

    ScoreDoc[] smartHits = searcher1.Search(smart, 25).ScoreDocs;
    ScoreDoc[] dumbHits = searcher2.Search(dumb, 25).ScoreDocs;

    CheckHits.CheckEqual(smart, smartHits, dumbHits);
}
}
}