using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Attributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System.IO;
namespace Lucene.Net.Analysis.NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
    /// Tests <see cref="NGramTokenizer"/> for correctness.
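    /// <para/>
    /// For example, with <c>minGram=1</c> and <c>maxGram=2</c>, the input "abcde"
    /// yields the grams "a", "ab", "b", "bc", "c", "cd", "d", "de", "e",
    /// ordered by start offset and then by length.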
/// </summary>
public class NGramTokenizerTest : BaseTokenStreamTestCase
{
private StringReader input;
public override void SetUp()
{
base.SetUp();
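            // a fresh "abcde" reader is created before each test; the offsets
            // asserted below are all relative to this 5-char input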
input = new StringReader("abcde");
}
[Test]
public virtual void TestInvalidInput()
{
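            // minGram (2) must not be greater than maxGram (1), so the
            // constructor is expected to throw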
bool gotException = false;
try
{
new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
}
catch (System.ArgumentException)
{
gotException = true;
}
assertTrue(gotException);
}
[Test]
public virtual void TestInvalidInput2()
{
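            // a minGram of 0 is invalid, so the constructor is expected to throw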
bool gotException = false;
try
{
new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1);
}
catch (System.ArgumentException)
{
gotException = true;
}
assertTrue(gotException);
}
[Test]
public virtual void TestUnigrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
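            // expected terms, start offsets, end offsets, and the final offset (5) for "abcde"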
AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
}
[Test]
public virtual void TestBigrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2);
AssertTokenStreamContents(tokenizer, new string[] { "ab", "bc", "cd", "de" }, new int[] { 0, 1, 2, 3 }, new int[] { 2, 3, 4, 5 }, 5); // abcde
}
[Test]
public virtual void TestNgrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
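            // types, position increments, and position lengths are not checked (the
            // three nulls); the trailing false relaxes offset checking, since end
            // offsets move backwards here (e.g. from "abc" back to "b")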
AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4 }, new int[] { 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 5 }, null, null, null, 5, false); // abcde
}
[Test]
public virtual void TestOversizedNgrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7);
AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0], 5); // abcde
}
[Test]
public virtual void TestReset()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
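            // after SetReader, the same tokenizer instance must produce the
            // same tokens again from the start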
tokenizer.SetReader(new StringReader("abcde"));
AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
}
/// <summary>
/// blast some random strings through the analyzer </summary>
[Test, LongRunningTest]
public virtual void TestRandomStrings()
{
for (int i = 0; i < 10; i++)
{
int min = TestUtil.NextInt32(Random, 2, 10);
int max = TestUtil.NextInt32(Random, min, 20);
Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, min, max);
CheckRandomData(Random, a, 200 * RANDOM_MULTIPLIER, 20);
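                // also check strings longer than 1024 chars to exercise reads across
                // the tokenizer's internal buffer boundary (compare TestLargeMaxGram below)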
CheckRandomData(Random, a, 10 * RANDOM_MULTIPLIER, 1027);
}
}
private class AnalyzerAnonymousInnerClassHelper : Analyzer
{
private readonly NGramTokenizerTest outerInstance;
            private readonly int min;
            private readonly int max;
public AnalyzerAnonymousInnerClassHelper(NGramTokenizerTest outerInstance, int min, int max)
{
this.outerInstance = outerInstance;
this.min = min;
this.max = max;
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
return new TokenStreamComponents(tokenizer, tokenizer);
}
}
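        // Generates a random analysis string of the given length and verifies that
        // the tokenizer emits exactly the expected grams for it.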
private static void TestNGrams(int minGram, int maxGram, int length, string nonTokenChars)
{
string s = TestUtil.RandomAnalysisString(Random, length, true);
TestNGrams(minGram, maxGram, s, nonTokenChars);
}
private static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars)
{
TestNGrams(minGram, maxGram, s, nonTokenChars, false);
}
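        // Expands a string into its Unicode code points, counting each surrogate
        // pair as a single element.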
internal static int[] toCodePoints(string s)
{
int[] codePoints = new int[Character.CodePointCount(s, 0, s.Length)];
for (int i = 0, j = 0; i < s.Length; ++j)
{
codePoints[j] = Character.CodePointAt(s, i);
i += Character.CharCount(codePoints[j]);
}
return codePoints;
}
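        // Returns true if codePoint does not occur in nonTokenChars, comparing by
        // code point so that supplementary characters are matched correctly.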
internal static bool isTokenChar(string nonTokenChars, int codePoint)
{
for (int i = 0; i < nonTokenChars.Length;)
{
int cp = char.ConvertToUtf32(nonTokenChars, i);
if (cp == codePoint)
{
return false;
}
i += Character.CharCount(cp);
}
return true;
}
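        // Brute-force oracle: enumerates every window of minGram to maxGram code
        // points that contains only token chars (and, if edgesOnly, starts at an
        // edge), and checks that the tokenizer emits exactly those grams with the
        // expected attributes. The goto stands in for Java's labeled continue.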
internal static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars, bool edgesOnly)
{
// convert the string to code points
int[] codePoints = toCodePoints(s);
int[] offsets = new int[codePoints.Length + 1];
for (int i = 0; i < codePoints.Length; ++i)
{
offsets[i + 1] = offsets[i] + Character.CharCount(codePoints[i]);
}
TokenStream grams = new NGramTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly, nonTokenChars);
ICharTermAttribute termAtt = grams.AddAttribute<ICharTermAttribute>();
IPositionIncrementAttribute posIncAtt = grams.AddAttribute<IPositionIncrementAttribute>();
IPositionLengthAttribute posLenAtt = grams.AddAttribute<IPositionLengthAttribute>();
IOffsetAttribute offsetAtt = grams.AddAttribute<IOffsetAttribute>();
grams.Reset();
for (int start = 0; start < codePoints.Length; ++start)
{
for (int end = start + minGram; end <= start + maxGram && end <= codePoints.Length; ++end)
{
if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1]))
{
// not on an edge
goto nextGramContinue;
}
for (int j = start; j < end; ++j)
{
if (!isTokenChar(nonTokenChars, codePoints[j]))
{
goto nextGramContinue;
}
}
assertTrue(grams.IncrementToken());
assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
assertEquals(1, posIncAtt.PositionIncrement);
assertEquals(1, posLenAtt.PositionLength);
assertEquals(offsets[start], offsetAtt.StartOffset);
assertEquals(offsets[end], offsetAtt.EndOffset);
nextGramContinue:;
}
}
assertFalse(grams.IncrementToken());
grams.End();
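            // after End(), the offset attribute must expose the final offset,
            // i.e. both offsets point at the end of the input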
assertEquals(s.Length, offsetAtt.StartOffset);
assertEquals(s.Length, offsetAtt.EndOffset);
}
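        // Port of the Java test's anonymous NGramTokenizer subclass: every
        // character in nonTokenChars is treated as a non-token char, which lets
        // the tests simulate pre-tokenized input.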
private class NGramTokenizerAnonymousInnerClassHelper : NGramTokenizer
{
            private readonly string nonTokenChars;
            public NGramTokenizerAnonymousInnerClassHelper(LuceneVersion version, StringReader reader, int minGram, int maxGram, bool edgesOnly, string nonTokenChars)
                : base(version, reader, minGram, maxGram, edgesOnly)
            {
                this.nonTokenChars = nonTokenChars;
            }
            protected override bool IsTokenChar(int chr)
            {
                // compare by code point via the isTokenChar helper above; casting
                // to char would truncate supplementary (non-BMP) code points and
                // could spuriously match a character in nonTokenChars
                return isTokenChar(nonTokenChars, chr);
            }
}
[Test]
public virtual void TestLargeInput()
{
// test sliding
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 3 * 1024, 4 * 1024), "");
}
[Test]
public virtual void TestLargeMaxGram()
{
// test sliding with maxGram > 1024
int minGram = TestUtil.NextInt32(Random, 1290, 1300);
int maxGram = TestUtil.NextInt32(Random, minGram, 1300);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 3 * 1024, 4 * 1024), "");
}
[Test]
public virtual void TestPreTokenization()
{
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 0, 4 * 1024), "a");
}
[Test]
public virtual void TestHeavyPreTokenization()
{
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 0, 4 * 1024), "abcdef");
}
[Test]
public virtual void TestFewTokenChars()
{
char[] chrs = new char[TestUtil.NextInt32(Random, 4000, 5000)];
Arrays.Fill(chrs, ' ');
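            // replace roughly 10% of the spaces with a token char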
for (int i = 0; i < chrs.Length; ++i)
{
if (Random.NextDouble() < 0.1)
{
chrs[i] = 'a';
}
}
int minGram = TestUtil.NextInt32(Random, 1, 2);
int maxGram = TestUtil.NextInt32(Random, minGram, 2);
TestNGrams(minGram, maxGram, new string(chrs), " ");
}
[Test, LongRunningTest]
public virtual void TestFullUTF8Range()
{
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
string s = TestUtil.RandomUnicodeString(Random, 4 * 1024);
TestNGrams(minGram, maxGram, s, "");
TestNGrams(minGram, maxGram, s, "abcdef");
}
}
}