using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Documents;
using Lucene.Net.Support;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Text;
using Console = Lucene.Net.Support.SystemConsole;
namespace Lucene.Net.Search.Suggest.Analyzing
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
[SuppressCodecs("Lucene3x")]
public class TestFreeTextSuggester : LuceneTestCase
{
[Test]
public void TestBasic()
{
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("foo bar baz blah", 50),
new Input("boo foo bar foo bee", 20)
);
Analyzer a = new MockAnalyzer(Random);
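// grams=2 selects a bigram model; passing 0x20 (space) as the separator
// (instead of the default 0x1e) keeps the suggested ngrams human-readable: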
FreeTextSuggester sug = new FreeTextSuggester(a, a, 2, (byte)0x20);
sug.Build(new InputArrayIterator(keys));
assertEquals(2, sug.Count);
for (int i = 0; i < 2; i++)
{
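// Expected scores, worked out by hand (they track raw token counts, not
// the input weights): "foo" occurs 3 times, "foo bar" twice and "foo bee"
// once, so the bigram model yields 2/3 = 0.67 and 1/3 = 0.33, while the
// unigram backoff scores are ALPHA (0.4) times a 1-in-9 prior, i.e. ~0.04.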
// Uses bigram model and unigram backoff:
assertEquals("foo bar/0.67 foo bee/0.33 baz/0.04 blah/0.04 boo/0.04",
ToString(sug.DoLookup("foo b", 10)));
// Uses only bigram model:
assertEquals("foo bar/0.67 foo bee/0.33",
ToString(sug.DoLookup("foo ", 10)));
// Uses only unigram model:
assertEquals("foo/0.33",
ToString(sug.DoLookup("foo", 10)));
// Uses only unigram model:
assertEquals("bar/0.22 baz/0.11 bee/0.11 blah/0.11 boo/0.11",
ToString(sug.DoLookup("b", 10)));
// Try again after save/load:
DirectoryInfo tmpDir = CreateTempDir("FreeTextSuggesterTest");
//tmpDir.Create();
FileInfo path = new FileInfo(Path.Combine(tmpDir.FullName, "suggester"));
using (Stream os = new FileStream(path.FullName, FileMode.Create, FileAccess.Write))
sug.Store(os);
using (Stream @is = new FileStream(path.FullName, FileMode.Open, FileAccess.Read))
{
sug = new FreeTextSuggester(a, a, 2, (byte)0x20);
sug.Load(@is);
}
assertEquals(2, sug.Count);
}
}
[Test]
public void TestIllegalByteDuringBuild()
{
// Default separator is INFORMATION SEPARATOR TWO
// (0x1e), so no input token is allowed to contain it
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("foo\u001ebar baz", 50)
);
FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(Random));
try
{
sug.Build(new InputArrayIterator(keys));
fail("did not hit expected exception");
}
catch (ArgumentException /*iae*/)
{
// expected
}
}
[Test]
public void TestIllegalByteDuringQuery()
{
// Default separator is INFORMATION SEPARATOR TWO
// (0x1e), so no input token is allowed to contain it
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("foo bar baz", 50)
);
FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(Random));
sug.Build(new InputArrayIterator(keys));
try
{
sug.DoLookup("foo\u001eb", 10);
fail("did not hit expected exception");
}
catch (ArgumentException /*iae*/)
{
// expected
}
}
internal class TestWikiInputIterator : IInputIterator
{
private readonly LineFileDocs lfd;
private readonly TestFreeTextSuggester outerInstance;
private int count;
public TestWikiInputIterator(TestFreeTextSuggester outerInstance, LineFileDocs lfd)
{
this.outerInstance = outerInstance;
this.lfd = lfd;
}
public long Weight
{
get
{
return 1;
}
}
public IComparer<BytesRef> Comparer
{
get
{
return null;
}
}
public BytesRef Next()
{
Document doc;
try
{
doc = lfd.NextDoc();
}
catch (IOException ioe)
{
throw new Exception(ioe.ToString(), ioe);
}
if (doc == null)
{
return null;
}
if (count++ == 10000)
{
return null;
}
return new BytesRef(doc.Get("body"));
}
public BytesRef Payload
{
get
{
return null;
}
}
public bool HasPayloads
{
get
{
return false;
}
}
public IEnumerable<BytesRef> Contexts
{
get
{
return null;
}
}
public bool HasContexts
{
get
{
return false;
}
}
}
[Ignore("Ignored in Lucene")]
public void TestWiki()
{
LineFileDocs lfd = new LineFileDocs(null, "/lucenedata/enwiki/enwiki-20120502-lines-1k.txt", false);
// Skip header:
lfd.NextDoc();
FreeTextSuggester sug = new FreeTextSuggester(new MockAnalyzer(Random));
sug.Build(new TestWikiInputIterator(this, lfd));
if (VERBOSE)
{
Console.WriteLine(sug.GetSizeInBytes() + " bytes");
IList<Lookup.LookupResult> results = sug.DoLookup("general r", 10);
Console.WriteLine("results:");
foreach (Lookup.LookupResult result in results)
{
Console.WriteLine(" " + result);
}
}
}
// Make sure you can suggest based only on unigram model:
[Test]
public void TestUnigrams()
{
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("foo bar baz blah boo foo bar foo bee", 50)
);
Analyzer a = new MockAnalyzer(Random);
FreeTextSuggester sug = new FreeTextSuggester(a, a, 1, (byte)0x20);
sug.Build(new InputArrayIterator(keys));
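// 9 tokens total: "bar" appears twice (2/9 = 0.22), the others once (1/9 = 0.11).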
// Sorts first by count, descending, second by term, ascending
assertEquals("bar/0.22 baz/0.11 bee/0.11 blah/0.11 boo/0.11",
ToString(sug.DoLookup("b", 10)));
}
// Make sure the last token is not duplicated
[Test]
public void TestNoDupsAcrossGrams()
{
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("foo bar bar bar bar", 50)
);
Analyzer a = new MockAnalyzer(Random);
FreeTextSuggester sug = new FreeTextSuggester(a, a, 2, (byte)0x20);
sug.Build(new InputArrayIterator(keys));
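// "foo bar" is the only bigram for context "foo" (1/1 = 1.00); the unigram
// fallback must not then repeat "bar" as a second suggestion.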
assertEquals("foo bar/1.00",
ToString(sug.DoLookup("foo b", 10)));
}
// Lookup of just the empty string is not allowed and must throw:
[Test]
public void TestEmptyString()
{
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("foo bar bar bar bar", 50)
);
Analyzer a = new MockAnalyzer(Random);
FreeTextSuggester sug = new FreeTextSuggester(a, a, 2, (byte)0x20);
sug.Build(new InputArrayIterator(keys));
try
{
sug.DoLookup("", 10);
fail("did not hit exception");
}
catch (ArgumentException /*iae*/)
{
// expected
}
}
internal class TestEndingHoleAnalyzer : Analyzer
{
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
Tokenizer tokenizer = new MockTokenizer(reader);
CharArraySet stopSet = StopFilter.MakeStopSet(TEST_VERSION_CURRENT, "of");
return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
}
}
// With one ending hole, ShingleFilter produces "of _" and
// we should properly predict from that:
[Test]
public void TestEndingHole()
{
// Just deletes "of"
Analyzer a = new TestEndingHoleAnalyzer();
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("wizard of oz", 50)
);
FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte)0x20);
sug.Build(new InputArrayIterator(keys));
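// StopFilter deletes "of" but leaves a position hole, which the shingle
// model renders as "_", hence the expected trigram "wizard _ oz":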
assertEquals("wizard _ oz/1.00",
ToString(sug.DoLookup("wizard of", 10)));
// Falls back to unigram model, with backoff 0.4 times
// prob 0.5:
assertEquals("oz/0.20",
ToString(sug.DoLookup("wizard o", 10)));
}
// If the number of ending holes exceeds the ngram window
// then there are no predictions, because ShingleFilter
// does not produce e.g. a hole-only "_ _" token:
[Test]
public void TestTwoEndingHoles()
{
// Just deletes "of"
Analyzer a = new TestEndingHoleAnalyzer();
IEnumerable<Input> keys = AnalyzingSuggesterTest.Shuffle(
new Input("wizard of of oz", 50)
);
FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte)0x20);
sug.Build(new InputArrayIterator(keys));
assertEquals("",
ToString(sug.DoLookup("wizard of of", 10)));
}
internal class ByScoreThenKeyComparer : IComparer<Lookup.LookupResult>
{
public int Compare(Lookup.LookupResult a, Lookup.LookupResult b)
{
if (a.Value > b.Value)
{
return -1;
}
else if (a.Value < b.Value)
{
return 1;
}
else
{
// Tie break by UTF16 sort order:
return ((string)a.Key).CompareToOrdinal((string)b.Key);
}
}
}
private static IComparer<Lookup.LookupResult> byScoreThenKey = new ByScoreThenKeyComparer();
internal class TestRandomInputIterator : IInputIterator
{
internal int upto;
private readonly TestFreeTextSuggester outerInstance;
private readonly string[][] docs;
public TestRandomInputIterator(TestFreeTextSuggester outerInstance, string[][] docs)
{
this.outerInstance = outerInstance;
this.docs = docs;
}
public IComparer<BytesRef> Comparer
{
get
{
return null;
}
}
public BytesRef Next()
{
if (upto == docs.Length)
{
return null;
}
else
{
StringBuilder b = new StringBuilder();
foreach (string token in docs[upto])
{
b.append(' ');
b.append(token);
}
upto++;
return new BytesRef(b.toString());
}
}
public long Weight
{
get
{
return Random.Next();
}
}
public BytesRef Payload
{
get
{
return null;
}
}
public bool HasPayloads
{
get
{
return false;
}
}
public IEnumerable<BytesRef> Contexts
{
get
{
return null;
}
}
public bool HasContexts
{
get
{
return false;
}
}
}
[Test]
public void TestRandom()
{
string[] terms = new string[TestUtil.NextInt32(Random, 2, 10)];
ISet<string> seen = new HashSet<string>();
while (seen.size() < terms.Length)
{
string token = TestUtil.RandomSimpleString(Random, 1, 5);
if (!seen.contains(token))
{
terms[seen.size()] = token;
seen.add(token);
}
}
Analyzer a = new MockAnalyzer(Random);
int numDocs = AtLeast(10);
long totTokens = 0;
string[][] docs = new string[numDocs][];
for (int i = 0; i < numDocs; i++)
{
docs[i] = new string[AtLeast(100)];
if (VERBOSE)
{
Console.Write(" doc " + i + ":");
}
for (int j = 0; j < docs[i].Length; j++)
{
docs[i][j] = GetZipfToken(terms);
if (VERBOSE)
{
Console.Write(" " + docs[i][j]);
}
}
if (VERBOSE)
{
Console.WriteLine();
}
totTokens += docs[i].Length;
}
int grams = TestUtil.NextInt32(Random, 1, 4);
if (VERBOSE)
{
Console.WriteLine("TEST: " + terms.Length + " terms; " + numDocs + " docs; " + grams + " grams");
}
// Build suggester model:
FreeTextSuggester sug = new FreeTextSuggester(a, a, grams, (byte)0x20);
sug.Build(new TestRandomInputIterator(this, docs));
// Build inefficient but hopefully correct model:
List<IDictionary<string, int?>> gramCounts = new List<IDictionary<string, int?>>(grams);
for (int gram = 0; gram < grams; gram++)
{
if (VERBOSE)
{
Console.WriteLine("TEST: build model for gram=" + gram);
}
IDictionary<string, int?> model = new HashMap<string, int?>();
gramCounts.Add(model);
foreach (string[] doc in docs)
{
for (int i = 0; i < doc.Length - gram; i++)
{
StringBuilder b = new StringBuilder();
for (int j = i; j <= i + gram; j++)
{
if (j > i)
{
b.append(' ');
}
b.append(doc[j]);
}
string token = b.toString();
if (!model.TryGetValue(token, out int? curCount) || curCount == null)
{
model.Put(token, 1);
}
else
{
model.Put(token, 1 + curCount);
}
if (VERBOSE)
{
Console.WriteLine(" add '" + token + "' -> count=" + (model.TryGetValue(token, out int? count) ? (count.HasValue ? count.ToString() : "null") : ""));
}
}
}
}
int lookups = AtLeast(100);
for (int iter = 0; iter < lookups; iter++)
{
string[] tokens = new string[TestUtil.NextInt32(Random, 1, 5)];
for (int i = 0; i < tokens.Length; i++)
{
tokens[i] = GetZipfToken(terms);
}
// Maybe trim last token; be sure not to create the
// empty string:
int trimStart;
if (tokens.Length == 1)
{
trimStart = 1;
}
else
{
trimStart = 0;
}
int trimAt = TestUtil.NextInt32(Random, trimStart, tokens[tokens.Length - 1].Length);
tokens[tokens.Length - 1] = tokens[tokens.Length - 1].Substring(0, trimAt);
int num = TestUtil.NextInt32(Random, 1, 100);
StringBuilder b = new StringBuilder();
foreach (string token in tokens)
{
b.append(' ');
b.append(token);
}
string query = b.toString();
query = query.Substring(1);
if (VERBOSE)
{
Console.WriteLine("\nTEST: iter=" + iter + " query='" + query + "' num=" + num);
}
// Expected:
List<Lookup.LookupResult> expected = new List<Lookup.LookupResult>();
double backoff = 1.0;
seen = new HashSet<string>();
if (VERBOSE)
{
Console.WriteLine(" compute expected");
}
for (int i = grams - 1; i >= 0; i--)
{
if (VERBOSE)
{
Console.WriteLine(" grams=" + i);
}
if (tokens.Length < i + 1)
{
// Don't have enough tokens to use this model
if (VERBOSE)
{
Console.WriteLine(" skip");
}
continue;
}
if (i == 0 && tokens[tokens.Length - 1].Length == 0)
{
// Never suggest unigrams from empty string:
if (VERBOSE)
{
Console.WriteLine(" skip unigram priors only");
}
continue;
}
// Build up "context" ngram:
b = new StringBuilder();
for (int j = tokens.Length - i - 1; j < tokens.Length - 1; j++)
{
b.append(' ');
b.append(tokens[j]);
}
string context = b.toString();
if (context.Length > 0)
{
context = context.Substring(1);
}
if (VERBOSE)
{
Console.WriteLine(" context='" + context + "'");
}
long contextCount;
if (context.Length == 0)
{
contextCount = totTokens;
}
else
{
//int? count = gramCounts.get(i - 1).get(context);
var gramCount = gramCounts[i - 1];
if (!gramCount.TryGetValue(context, out int? count) || count == null)
{
// We never saw this context:
backoff *= FreeTextSuggester.ALPHA;
if (VERBOSE)
{
Console.WriteLine(" skip: never saw context");
}
continue;
}
contextCount = count.GetValueOrDefault();
}
if (VERBOSE)
{
Console.WriteLine(" contextCount=" + contextCount);
}
IDictionary<string, int?> model = gramCounts[i];
// First pass, gather all predictions for this model:
if (VERBOSE)
{
Console.WriteLine(" find terms w/ prefix=" + tokens[tokens.Length - 1]);
}
List<Lookup.LookupResult> tmp = new List<Lookup.LookupResult>();
foreach (string term in terms)
{
if (term.StartsWith(tokens[tokens.Length - 1], StringComparison.Ordinal))
{
if (VERBOSE)
{
Console.WriteLine(" term=" + term);
}
if (seen.contains(term))
{
if (VERBOSE)
{
Console.WriteLine(" skip seen");
}
continue;
}
string ngram = (context + " " + term).Trim();
//Integer count = model.get(ngram);
if (model.TryGetValue(ngram, out int? count) && count != null)
{
// LUCENENET NOTE: We need to calculate this as decimal because when using double it can sometimes
// return numbers that are greater than long.MaxValue, which results in a negative long number.
// This is also the way it is being done in the FreeTextSuggester to work around the issue.
Lookup.LookupResult lr = new Lookup.LookupResult(ngram, (long)(long.MaxValue * ((decimal)backoff * (decimal)count / contextCount)));
tmp.Add(lr);
if (VERBOSE)
{
Console.WriteLine(" add tmp key='" + lr.Key + "' score=" + lr.Value);
}
}
}
}
// Second pass, trim to only top N, and fold those
// into overall suggestions:
tmp.Sort(byScoreThenKey);
if (tmp.size() > num)
{
//tmp.subList(num, tmp.size()).clear();
tmp.RemoveRange(num, tmp.size() - num);
}
foreach (Lookup.LookupResult result in tmp)
{
string key = result.Key.toString();
int idx = key.LastIndexOf(' ');
string lastToken;
if (idx != -1)
{
lastToken = key.Substring(idx + 1);
}
else
{
lastToken = key;
}
if (!seen.contains(lastToken))
{
seen.add(lastToken);
expected.Add(result);
if (VERBOSE)
{
Console.WriteLine(" keep key='" + result.Key + "' score=" + result.Value);
}
}
}
backoff *= FreeTextSuggester.ALPHA;
}
expected.Sort(byScoreThenKey);
if (expected.size() > num)
{
expected.RemoveRange(num, expected.size() - num);
}
// Actual:
IList<Lookup.LookupResult> actual = sug.DoLookup(query, num);
if (VERBOSE)
{
Console.WriteLine(" expected: " + expected);
Console.WriteLine(" actual: " + actual);
}
assertEquals(expected.ToString(), actual.ToString());
}
}
private static string GetZipfToken(string[] tokens)
{
// Zipf-like distribution:
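// (token k wins with probability 2^-(k+1); the final token takes whatever
// probability mass is left)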
for (int k = 0; k < tokens.Length; k++)
{
if (Random.nextBoolean() || k == tokens.Length - 1)
{
return tokens[k];
}
}
Debug.Assert(false);
return null;
}
private static string ToString(IEnumerable<Lookup.LookupResult> results)
{
StringBuilder b = new StringBuilder();
foreach (Lookup.LookupResult result in results)
{
b.Append(' ');
b.Append(result.Key);
b.Append('/');
b.AppendFormat(CultureInfo.InvariantCulture, "{0:0.00}", ((double)result.Value) / long.MaxValue);
}
return b.toString().Trim();
}
}
}