using J2N;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Attributes;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System.IO;
namespace Lucene.Net.Analysis.NGram
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
    /// Tests <see cref="NGramTokenizer"/> for correctness.
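    /// <para/>
    /// For example, with <c>minGram=1</c> and <c>maxGram=2</c>, the input "abcde"
    /// yields the grams "a", "ab", "b", "bc", "c", "cd", "d", "de", "e",
    /// ordered by start offset and then by length.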
/// </summary>
public class NGramTokenizerTest : BaseTokenStreamTestCase
{
private StringReader input;
public override void SetUp()
{
base.SetUp();
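            // a fresh "abcde" reader is created before each test; the offsets
            // asserted below are all relative to this 5-char input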
input = new StringReader("abcde");
}
[Test]
public virtual void TestInvalidInput()
{
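            // minGram (2) must not be greater than maxGram (1), so the
            // constructor is expected to throw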
bool gotException = false;
try
{
new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 1);
}
catch (System.ArgumentException)
{
gotException = true;
}
assertTrue(gotException);
}
[Test]
public virtual void TestInvalidInput2()
{
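            // a minGram of 0 is invalid, so the constructor is expected to throw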
bool gotException = false;
try
{
new NGramTokenizer(TEST_VERSION_CURRENT, input, 0, 1);
}
catch (System.ArgumentException)
{
gotException = true;
}
assertTrue(gotException);
}
[Test]
public virtual void TestUnigrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
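            // expected terms, start offsets, end offsets, and the final offset (5) for "abcde"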
AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
}
[Test]
public virtual void TestBigrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 2, 2);
AssertTokenStreamContents(tokenizer, new string[] { "ab", "bc", "cd", "de" }, new int[] { 0, 1, 2, 3 }, new int[] { 2, 3, 4, 5 }, 5); // abcde
}
[Test]
public virtual void TestNgrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 3);
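            // types, position increments, and position lengths are not checked (the
            // three nulls); the trailing false relaxes offset checking, since end
            // offsets move backwards here (e.g. from "abc" back to "b")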
AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "b", "bc", "bcd", "c", "cd", "cde", "d", "de", "e" }, new int[] { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4 }, new int[] { 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 5 }, null, null, null, 5, false); // abcde
}
[Test]
public virtual void TestOversizedNgrams()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 6, 7);
AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0], 5); // abcde
}
[Test]
public virtual void TestReset()
{
NGramTokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, input, 1, 1);
AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
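            // after SetReader, the same tokenizer instance must produce the
            // same tokens again from the start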
tokenizer.SetReader(new StringReader("abcde"));
AssertTokenStreamContents(tokenizer, new string[] { "a", "b", "c", "d", "e" }, new int[] { 0, 1, 2, 3, 4 }, new int[] { 1, 2, 3, 4, 5 }, 5); // abcde
}
/// <summary>
/// blast some random strings through the analyzer </summary>
[Test, LongRunningTest]
public virtual void TestRandomStrings()
{
for (int i = 0; i < 10; i++)
{
int min = TestUtil.NextInt32(Random, 2, 10);
int max = TestUtil.NextInt32(Random, min, 20);
Analyzer a = new AnalyzerAnonymousInnerClassHelper(this, min, max);
CheckRandomData(Random, a, 200 * RANDOM_MULTIPLIER, 20);
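                // also check strings longer than 1024 chars to exercise reads across
                // the tokenizer's internal buffer boundary (compare TestLargeMaxGram below)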
CheckRandomData(Random, a, 10 * RANDOM_MULTIPLIER, 1027);
}
}
private class AnalyzerAnonymousInnerClassHelper : Analyzer
{
private readonly NGramTokenizerTest outerInstance;
            private readonly int min;
            private readonly int max;
public AnalyzerAnonymousInnerClassHelper(NGramTokenizerTest outerInstance, int min, int max)
{
this.outerInstance = outerInstance;
this.min = min;
this.max = max;
}
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
return new TokenStreamComponents(tokenizer, tokenizer);
}
}
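        // Generates a random analysis string of the given length and verifies that
        // the tokenizer emits exactly the expected grams for it.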
private static void TestNGrams(int minGram, int maxGram, int length, string nonTokenChars)
{
string s = TestUtil.RandomAnalysisString(Random, length, true);
TestNGrams(minGram, maxGram, s, nonTokenChars);
}
private static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars)
{
TestNGrams(minGram, maxGram, s, nonTokenChars, false);
}
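        // Expands a string into its Unicode code points, counting each surrogate
        // pair as a single element.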
internal static int[] toCodePoints(string s)
{
int[] codePoints = new int[Character.CodePointCount(s, 0, s.Length)];
for (int i = 0, j = 0; i < s.Length; ++j)
{
codePoints[j] = Character.CodePointAt(s, i);
i += Character.CharCount(codePoints[j]);
}
return codePoints;
}
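        // Returns true if codePoint does not occur in nonTokenChars, comparing by
        // code point so that supplementary characters are matched correctly.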
internal static bool isTokenChar(string nonTokenChars, int codePoint)
{
for (int i = 0; i < nonTokenChars.Length;)
{
int cp = char.ConvertToUtf32(nonTokenChars, i);
if (cp == codePoint)
{
return false;
}
i += Character.CharCount(cp);
}
return true;
}
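        // Brute-force oracle: enumerates every window of minGram to maxGram code
        // points that contains only token chars (and, if edgesOnly, starts at an
        // edge), and checks that the tokenizer emits exactly those grams with the
        // expected attributes. The goto stands in for Java's labeled continue.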
internal static void TestNGrams(int minGram, int maxGram, string s, string nonTokenChars, bool edgesOnly)
{
// convert the string to code points
int[] codePoints = toCodePoints(s);
int[] offsets = new int[codePoints.Length + 1];
for (int i = 0; i < codePoints.Length; ++i)
{
offsets[i + 1] = offsets[i] + Character.CharCount(codePoints[i]);
}
TokenStream grams = new NGramTokenizerAnonymousInnerClassHelper(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly, nonTokenChars);
ICharTermAttribute termAtt = grams.AddAttribute<ICharTermAttribute>();
IPositionIncrementAttribute posIncAtt = grams.AddAttribute<IPositionIncrementAttribute>();
IPositionLengthAttribute posLenAtt = grams.AddAttribute<IPositionLengthAttribute>();
IOffsetAttribute offsetAtt = grams.AddAttribute<IOffsetAttribute>();
grams.Reset();
for (int start = 0; start < codePoints.Length; ++start)
{
for (int end = start + minGram; end <= start + maxGram && end <= codePoints.Length; ++end)
{
if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1]))
{
// not on an edge
goto nextGramContinue;
}
for (int j = start; j < end; ++j)
{
if (!isTokenChar(nonTokenChars, codePoints[j]))
{
goto nextGramContinue;
}
}
assertTrue(grams.IncrementToken());
assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
assertEquals(1, posIncAtt.PositionIncrement);
assertEquals(1, posLenAtt.PositionLength);
assertEquals(offsets[start], offsetAtt.StartOffset);
assertEquals(offsets[end], offsetAtt.EndOffset);
nextGramContinue:;
}
}
assertFalse(grams.IncrementToken());
grams.End();
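            // after End(), the offset attribute must expose the final offset,
            // i.e. both offsets point at the end of the input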
assertEquals(s.Length, offsetAtt.StartOffset);
assertEquals(s.Length, offsetAtt.EndOffset);
}
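        // Port of the Java test's anonymous NGramTokenizer subclass: every
        // character in nonTokenChars is treated as a non-token char, which lets
        // the tests simulate pre-tokenized input.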
private class NGramTokenizerAnonymousInnerClassHelper : NGramTokenizer
{
            private readonly string nonTokenChars;
            public NGramTokenizerAnonymousInnerClassHelper(LuceneVersion version, StringReader reader, int minGram, int maxGram, bool edgesOnly, string nonTokenChars)
                : base(version, reader, minGram, maxGram, edgesOnly)
            {
                this.nonTokenChars = nonTokenChars;
            }
            protected override bool IsTokenChar(int chr)
            {
                // compare by code point via the isTokenChar helper above; casting
                // to char would truncate supplementary (non-BMP) code points and
                // could spuriously match a character in nonTokenChars
                return isTokenChar(nonTokenChars, chr);
            }
}
[Test]
public virtual void TestLargeInput()
{
// test sliding
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 3 * 1024, 4 * 1024), "");
}
[Test]
public virtual void TestLargeMaxGram()
{
// test sliding with maxGram > 1024
int minGram = TestUtil.NextInt32(Random, 1290, 1300);
int maxGram = TestUtil.NextInt32(Random, minGram, 1300);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 3 * 1024, 4 * 1024), "");
}
[Test]
public virtual void TestPreTokenization()
{
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 0, 4 * 1024), "a");
}
[Test]
public virtual void TestHeavyPreTokenization()
{
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
TestNGrams(minGram, maxGram, TestUtil.NextInt32(Random, 0, 4 * 1024), "abcdef");
}
[Test]
public virtual void TestFewTokenChars()
{
char[] chrs = new char[TestUtil.NextInt32(Random, 4000, 5000)];
Arrays.Fill(chrs, ' ');
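            // replace roughly 10% of the spaces with a token char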
for (int i = 0; i < chrs.Length; ++i)
{
if (Random.NextDouble() < 0.1)
{
chrs[i] = 'a';
}
}
int minGram = TestUtil.NextInt32(Random, 1, 2);
int maxGram = TestUtil.NextInt32(Random, minGram, 2);
TestNGrams(minGram, maxGram, new string(chrs), " ");
}
[Test, LongRunningTest]
public virtual void TestFullUTF8Range()
{
int minGram = TestUtil.NextInt32(Random, 1, 100);
int maxGram = TestUtil.NextInt32(Random, minGram, 100);
string s = TestUtil.RandomUnicodeString(Random, 4 * 1024);
TestNGrams(minGram, maxGram, s, "");
TestNGrams(minGram, maxGram, s, "abcdef");
}
}
}