| using J2N.Collections.Generic.Extensions; |
| using J2N.Threading.Atomic; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Index.Extensions; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util.Automaton; |
| using NUnit.Framework; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.Linq; |
| using System.Text; |
| using Assert = Lucene.Net.TestFramework.Assert; |
| using Console = Lucene.Net.Util.SystemConsole; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Util.Fst |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using Automaton = Lucene.Net.Util.Automaton.Automaton; |
| using BytesReader = Lucene.Net.Util.Fst.FST.BytesReader; |
| using CompiledAutomaton = Lucene.Net.Util.Automaton.CompiledAutomaton; |
| using Directory = Lucene.Net.Store.Directory; |
| using DirectoryReader = Lucene.Net.Index.DirectoryReader; |
| using Document = Lucene.Net.Documents.Document; |
| using Field = Lucene.Net.Documents.Field; |
| using FSDirectory = Lucene.Net.Store.FSDirectory; |
| using IndexInput = Lucene.Net.Store.IndexInput; |
| using IndexOutput = Lucene.Net.Store.IndexOutput; |
| using IndexReader = Lucene.Net.Index.IndexReader; |
| using IndexSearcher = Lucene.Net.Search.IndexSearcher; |
| using IndexWriter = Lucene.Net.Index.IndexWriter; |
| using IndexWriterConfig = Lucene.Net.Index.IndexWriterConfig; |
| using IOContext = Lucene.Net.Store.IOContext; |
| using MockAnalyzer = Lucene.Net.Analysis.MockAnalyzer; |
| using MockDirectoryWrapper = Lucene.Net.Store.MockDirectoryWrapper; |
| using MultiFields = Lucene.Net.Index.MultiFields; |
| using OpenMode = Lucene.Net.Index.OpenMode; |
| using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s; |
| using Pair = Lucene.Net.Util.Fst.PairOutputs<long?, long?>.Pair; |
| using RandomIndexWriter = Lucene.Net.Index.RandomIndexWriter; |
| using RegExp = Lucene.Net.Util.Automaton.RegExp; |
| using Term = Lucene.Net.Index.Term; |
| using TermQuery = Lucene.Net.Search.TermQuery; |
| using Terms = Lucene.Net.Index.Terms; |
| using TermsEnum = Lucene.Net.Index.TermsEnum; |
| |
| [SuppressCodecs("SimpleText", "Memory", "Direct")] |
| [Slow] |
| [TestFixture] |
| public class TestFSTs : LuceneTestCase |
| { |
| |
| private MockDirectoryWrapper dir; |
| |
| [SetUp] |
| public override void SetUp() |
| { |
| base.SetUp(); |
| dir = NewMockDirectory(); |
| dir.PreventDoubleWrite = false; |
| } |
| |
| [TearDown] |
| public override void TearDown() |
| { |
| // can be null if we force simpletext (funky, some kind of bug in test runner maybe) |
| if (dir != null) |
| { |
| dir.Dispose(); |
| } |
| base.TearDown(); |
| } |
| |
| [Test] |
| public virtual void TestBasicFSA() |
| { |
| string[] strings = new string[] { "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat" }; |
| string[] strings2 = new string[] { "station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation" }; |
| Int32sRef[] terms = new Int32sRef[strings.Length]; |
| Int32sRef[] terms2 = new Int32sRef[strings2.Length]; |
| for (int inputMode = 0; inputMode < 2; inputMode++) |
| { |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: inputMode=" + InputModeToString(inputMode)); |
| } |
| |
| for (int idx = 0; idx < strings.Length; idx++) |
| { |
| terms[idx] = FSTTester<object>.ToInt32sRef(strings[idx], inputMode); |
| } |
| for (int idx = 0; idx < strings2.Length; idx++) |
| { |
| terms2[idx] = FSTTester<object>.ToInt32sRef(strings2[idx], inputMode); |
| } |
| Array.Sort(terms2); |
| |
| DoTest(inputMode, terms); |
| |
| // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms): |
| |
| // FSA |
| { |
| Outputs<object> outputs = NoOutputs.Singleton; |
| object NO_OUTPUT = outputs.NoOutput; |
| List<InputOutput<object>> pairs = new List<InputOutput<object>>(terms2.Length); |
| foreach (Int32sRef term in terms2) |
| { |
| pairs.Add(new InputOutput<object>(term, NO_OUTPUT)); |
| } |
| FST<object> fst = (new FSTTester<object>(Random, dir, inputMode, pairs, outputs, false)).DoTest(0, 0, false); |
| Assert.IsNotNull(fst); |
| Assert.AreEqual(22, fst.NodeCount); |
| Assert.AreEqual(27, fst.ArcCount); |
| } |
| |
| // FST ord pos int |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| List<InputOutput<long?>> pairs = new List<InputOutput<long?>>(terms2.Length); |
| for (int idx = 0; idx < terms2.Length; idx++) |
| { |
| pairs.Add(new InputOutput<long?>(terms2[idx], (long?)idx)); |
| } |
| FST<long?> fst = (new FSTTester<long?>(Random, dir, inputMode, pairs, outputs, true)).DoTest(0, 0, false); |
| Assert.IsNotNull(fst); |
| Assert.AreEqual(22, fst.NodeCount); |
| Assert.AreEqual(27, fst.ArcCount); |
| } |
| |
| // FST byte sequence ord |
| { |
| ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; |
| BytesRef NO_OUTPUT = outputs.NoOutput; |
| List<InputOutput<BytesRef>> pairs = new List<InputOutput<BytesRef>>(terms2.Length); |
| for (int idx = 0; idx < terms2.Length; idx++) |
| { |
| BytesRef output = Random.Next(30) == 17 ? NO_OUTPUT : new BytesRef(Convert.ToString(idx)); |
| pairs.Add(new InputOutput<BytesRef>(terms2[idx], output)); |
| } |
| FST<BytesRef> fst = (new FSTTester<BytesRef>(Random, dir, inputMode, pairs, outputs, false)).DoTest(0, 0, false); |
| Assert.IsNotNull(fst); |
| Assert.AreEqual(24, fst.NodeCount); |
| Assert.AreEqual(30, fst.ArcCount); |
| } |
| } |
| } |
| |
| // given set of terms, test the different outputs for them |
| private void DoTest(int inputMode, Int32sRef[] terms) |
| { |
| Array.Sort(terms); |
| |
| // NoOutputs (simple FSA) |
| { |
| Outputs<object> outputs = NoOutputs.Singleton; |
| object NO_OUTPUT = outputs.NoOutput; |
| List<InputOutput<object>> pairs = new List<InputOutput<object>>(terms.Length); |
| foreach (Int32sRef term in terms) |
| { |
| pairs.Add(new InputOutput<object>(term, NO_OUTPUT)); |
| } |
| (new FSTTester<object>(Random, dir, inputMode, pairs, outputs, false)).DoTest(true); |
| } |
| |
| // PositiveIntOutput (ord) |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| List<InputOutput<long?>> pairs = new List<InputOutput<long?>>(terms.Length); |
| for (int idx = 0; idx < terms.Length; idx++) |
| { |
| pairs.Add(new InputOutput<long?>(terms[idx], (long?)idx)); |
| } |
| (new FSTTester<long?>(Random, dir, inputMode, pairs, outputs, true)).DoTest(true); |
| } |
| |
| // PositiveIntOutput (random monotonically increasing positive number) |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| List<InputOutput<long?>> pairs = new List<InputOutput<long?>>(terms.Length); |
| long lastOutput = 0; |
| for (int idx = 0; idx < terms.Length; idx++) |
| { |
| long value = lastOutput + TestUtil.NextInt32(Random, 1, 1000); |
| lastOutput = value; |
| pairs.Add(new InputOutput<long?>(terms[idx], value)); |
| } |
| (new FSTTester<long?>(Random, dir, inputMode, pairs, outputs, true)).DoTest(true); |
| } |
| |
| // PositiveIntOutput (random positive number) |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| List<InputOutput<long?>> pairs = new List<InputOutput<long?>>(terms.Length); |
| for (int idx = 0; idx < terms.Length; idx++) |
| { |
| pairs.Add(new InputOutput<long?>(terms[idx], TestUtil.NextInt64(Random, 0, long.MaxValue))); |
| } |
| (new FSTTester<long?>(Random, dir, inputMode, pairs, outputs, false)).DoTest(true); |
| } |
| |
| // Pair<ord, (random monotonically increasing positive number> |
| { |
| PositiveInt32Outputs o1 = PositiveInt32Outputs.Singleton; |
| PositiveInt32Outputs o2 = PositiveInt32Outputs.Singleton; |
| PairOutputs<long?, long?> outputs = new PairOutputs<long?, long?>(o1, o2); |
| List<InputOutput<Pair>> pairs = new List<InputOutput<Pair>>(terms.Length); |
| long lastOutput = 0; |
| for (int idx = 0; idx < terms.Length; idx++) |
| { |
| long value = lastOutput + TestUtil.NextInt32(Random, 1, 1000); |
| lastOutput = value; |
| pairs.Add(new InputOutput<Pair>(terms[idx], outputs.NewPair((long?)idx, value))); |
| } |
| (new FSTTester<Pair>(Random, dir, inputMode, pairs, outputs, false)).DoTest(true); |
| } |
| |
| // Sequence-of-bytes |
| { |
| ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; |
| BytesRef NO_OUTPUT = outputs.NoOutput; |
| List<InputOutput<BytesRef>> pairs = new List<InputOutput<BytesRef>>(terms.Length); |
| for (int idx = 0; idx < terms.Length; idx++) |
| { |
| BytesRef output = Random.Next(30) == 17 ? NO_OUTPUT : new BytesRef(Convert.ToString(idx)); |
| pairs.Add(new InputOutput<BytesRef>(terms[idx], output)); |
| } |
| (new FSTTester<BytesRef>(Random, dir, inputMode, pairs, outputs, false)).DoTest(true); |
| } |
| |
| // Sequence-of-ints |
| { |
| Int32SequenceOutputs outputs = Int32SequenceOutputs.Singleton; |
| List<InputOutput<Int32sRef>> pairs = new List<InputOutput<Int32sRef>>(terms.Length); |
| for (int idx = 0; idx < terms.Length; idx++) |
| { |
| string s = Convert.ToString(idx); |
| Int32sRef output = new Int32sRef(s.Length) |
| { |
| Length = s.Length |
| }; |
| for (int idx2 = 0; idx2 < output.Length; idx2++) |
| { |
| output.Int32s[idx2] = s[idx2]; |
| } |
| pairs.Add(new InputOutput<Int32sRef>(terms[idx], output)); |
| } |
| (new FSTTester<Int32sRef>(Random, dir, inputMode, pairs, outputs, false)).DoTest(true); |
| } |
| |
| } |
| |
| |
| [Test] |
| public virtual void TestRandomWords() |
| { |
| TestRandomWords(1000, AtLeast(2)); |
| //testRandomWords(100, 1); |
| } |
| |
| internal virtual string InputModeToString(int mode) |
| { |
| if (mode == 0) |
| { |
| return "utf8"; |
| } |
| else |
| { |
| return "utf32"; |
| } |
| } |
| |
| private void TestRandomWords(int maxNumWords, int numIter) |
| { |
| Random random = new Random(Random.Next()); |
| for (int iter = 0; iter < numIter; iter++) |
| { |
| if (Verbose) |
| { |
| Console.WriteLine("\nTEST: iter " + iter); |
| } |
| for (int inputMode = 0; inputMode < 2; inputMode++) |
| { |
| int numWords = random.Next(maxNumWords + 1); |
| ISet<Int32sRef> termsSet = new JCG.HashSet<Int32sRef>(); |
| //Int32sRef[] terms = new Int32sRef[numWords]; // LUCENENET: Not Used |
| while (termsSet.Count < numWords) |
| { |
| string term = FSTTester<object>.GetRandomString(random); |
| termsSet.Add(FSTTester<object>.ToInt32sRef(term, inputMode)); |
| } |
| DoTest(inputMode, termsSet.ToArray(/*new IntsRef[termsSet.Count]*/)); |
| } |
| } |
| } |
| |
| [Test] |
| [Nightly] |
| public virtual void TestBigSet() |
| { |
| TestRandomWords(TestUtil.NextInt32(Random, 50000, 60000), 1); |
| } |
| |
| // Build FST for all unique terms in the test line docs |
| // file, up until a time limit |
| [Test] |
| public virtual void TestRealTerms() |
| { |
| |
| LineFileDocs docs = new LineFileDocs(Random, DefaultCodecSupportsDocValues); |
| int RUN_TIME_MSEC = AtLeast(500); |
| MockAnalyzer analyzer = new MockAnalyzer(Random); |
| analyzer.MaxTokenLength = TestUtil.NextInt32(Random, 1, IndexWriter.MAX_TERM_LENGTH); |
| |
| IndexWriterConfig conf = NewIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).SetMaxBufferedDocs(-1).SetRAMBufferSizeMB(64); |
| DirectoryInfo tempDir = CreateTempDir("fstlines"); |
| Directory dir = NewFSDirectory(tempDir); |
| IndexWriter writer = new IndexWriter(dir, conf); |
| long stopTime = Environment.TickCount + RUN_TIME_MSEC; |
| Document doc; |
| int docCount = 0; |
| while ((doc = docs.NextDoc()) != null && Environment.TickCount < stopTime) |
| { |
| writer.AddDocument(doc); |
| docCount++; |
| } |
| IndexReader r = DirectoryReader.Open(writer, true); |
| writer.Dispose(); |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| |
| bool doRewrite = Random.NextBoolean(); |
| |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doRewrite, PackedInt32s.DEFAULT, true, 15); |
| |
| bool storeOrd = Random.NextBoolean(); |
| if (Verbose) |
| { |
| if (storeOrd) |
| { |
| Console.WriteLine("FST stores ord"); |
| } |
| else |
| { |
| Console.WriteLine("FST stores docFreq"); |
| } |
| } |
| Terms terms = MultiFields.GetTerms(r, "body"); |
| if (terms != null) |
| { |
| Int32sRef scratchIntsRef = new Int32sRef(); |
| TermsEnum termsEnum = terms.GetEnumerator(); |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: got termsEnum=" + termsEnum); |
| } |
| BytesRef term; |
| int ord = 0; |
| |
| Automaton automaton = (new RegExp(".*", RegExpSyntax.NONE)).ToAutomaton(); |
| TermsEnum termsEnum2 = terms.Intersect(new CompiledAutomaton(automaton, false, false), null); |
| |
| while (termsEnum.MoveNext()) |
| { |
| term = termsEnum.Term; |
| Assert.IsTrue(termsEnum2.MoveNext()); |
| BytesRef term2 = termsEnum2.Term; |
| Assert.AreEqual(term, term2); |
| Assert.AreEqual(termsEnum.DocFreq, termsEnum2.DocFreq); |
| Assert.AreEqual(termsEnum.TotalTermFreq, termsEnum2.TotalTermFreq); |
| |
| if (ord == 0) |
| { |
| try |
| { |
| var _ = termsEnum.Ord; |
| } |
| #pragma warning disable 168, IDE0059 |
| catch (NotSupportedException uoe) |
| #pragma warning restore 168, IDE0059 |
| { |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: codec doesn't support ord; FST stores docFreq"); |
| } |
| storeOrd = false; |
| } |
| } |
| int output; |
| if (storeOrd) |
| { |
| output = ord; |
| } |
| else |
| { |
| output = termsEnum.DocFreq; |
| } |
| builder.Add(Util.ToInt32sRef(term, scratchIntsRef), (long)output); |
| ord++; |
| if (Verbose && ord % 100000 == 0 && LuceneTestCase.TestNightly) |
| { |
| Console.WriteLine(ord + " terms..."); |
| } |
| } |
| FST<long?> fst = builder.Finish(); |
| if (Verbose) |
| { |
| Console.WriteLine("FST: " + docCount + " docs; " + ord + " terms; " + fst.NodeCount + " nodes; " + fst.ArcCount + " arcs;" + " " + fst.GetSizeInBytes() + " bytes"); |
| } |
| |
| if (ord > 0) |
| { |
| Random random = new Random(Random.Next()); |
| // Now confirm BytesRefFSTEnum and TermsEnum act the |
| // same: |
| BytesRefFSTEnum<long?> fstEnum = new BytesRefFSTEnum<long?>(fst); |
| int num = AtLeast(1000); |
| for (int iter = 0; iter < num; iter++) |
| { |
| BytesRef randomTerm = new BytesRef(FSTTester<object>.GetRandomString(random)); |
| |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: seek non-exist " + randomTerm.Utf8ToString() + " " + randomTerm); |
| } |
| |
| TermsEnum.SeekStatus seekResult = termsEnum.SeekCeil(randomTerm); |
| BytesRefFSTEnum.InputOutput<long?> fstSeekResult = fstEnum.SeekCeil(randomTerm); |
| |
| if (seekResult == TermsEnum.SeekStatus.END) |
| { |
| Assert.IsNull(fstSeekResult, "got " + (fstSeekResult == null ? "null" : fstSeekResult.Input.Utf8ToString()) + " but expected null"); |
| } |
| else |
| { |
| AssertSame(termsEnum, fstEnum, storeOrd); |
| for (int nextIter = 0; nextIter < 10; nextIter++) |
| { |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: next"); |
| if (storeOrd) |
| { |
| Console.WriteLine(" ord=" + termsEnum.Ord); |
| } |
| } |
| if (termsEnum.MoveNext()) |
| { |
| if (Verbose) |
| { |
| Console.WriteLine(" term=" + termsEnum.Term.Utf8ToString()); |
| } |
| Assert.IsTrue(fstEnum.MoveNext()); |
| AssertSame(termsEnum, fstEnum, storeOrd); |
| } |
| else |
| { |
| if (Verbose) |
| { |
| Console.WriteLine(" end!"); |
| } |
| BytesRefFSTEnum.InputOutput<long?> nextResult; |
| if (fstEnum.MoveNext()) |
| { |
| nextResult = fstEnum.Current; |
| Console.WriteLine("expected null but got: input=" + nextResult.Input.Utf8ToString() + " output=" + outputs.OutputToString(nextResult.Output)); |
| Assert.Fail(); |
| } |
| break; |
| } |
| } |
| } |
| } |
| |
| } |
| } |
| |
| r.Dispose(); |
| dir.Dispose(); |
| } |
| |
| private void AssertSame(TermsEnum termsEnum, BytesRefFSTEnum<long?> fstEnum, bool storeOrd) // LUCENENET specific - changed to long? so we don't need a cast |
| { |
| if (termsEnum.Term == null) |
| { |
| Assert.IsNull(fstEnum.Current); |
| } |
| else |
| { |
| Assert.IsNotNull(fstEnum.Current); |
| Assert.AreEqual(termsEnum.Term, fstEnum.Current.Input, termsEnum.Term.Utf8ToString() + " != " + fstEnum.Current.Input.Utf8ToString()); |
| if (storeOrd) |
| { |
| // fst stored the ord |
| Assert.AreEqual(termsEnum.Ord, fstEnum.Current.Output, "term=" + termsEnum.Term.Utf8ToString() + " " + termsEnum.Term); |
| } |
| else |
| { |
| // fst stored the docFreq |
| Assert.AreEqual(termsEnum.DocFreq, fstEnum.Current.Output, "term=" + termsEnum.Term.Utf8ToString() + " " + termsEnum.Term); |
| } |
| } |
| } |
| |
| private abstract class VisitTerms<T> |
| { |
| private readonly string dirOut; |
| private readonly string wordsFileIn; |
| private readonly int inputMode; |
| private readonly Outputs<T> outputs; |
| private readonly Builder<T> builder; |
| //private readonly bool doPack; // LUCENENET: Not used |
| |
| public VisitTerms(string dirOut, string wordsFileIn, int inputMode, int prune, Outputs<T> outputs, bool doPack, bool noArcArrays) |
| { |
| this.dirOut = dirOut; |
| this.wordsFileIn = wordsFileIn; |
| this.inputMode = inputMode; |
| this.outputs = outputs; |
| //this.doPack = doPack; // LUCENENET: Not used |
| |
| builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, int.MaxValue, outputs, null, doPack, PackedInt32s.DEFAULT, !noArcArrays, 15); |
| } |
| |
| protected internal abstract T GetOutput(Int32sRef input, int ord); |
| |
| public virtual void Run(int limit, bool verify, bool verifyByOutput) |
| { |
| TextReader @is = new StreamReader(new FileStream(wordsFileIn, FileMode.Open), Encoding.UTF8); |
| try |
| { |
| Int32sRef intsRef = new Int32sRef(10); |
| long tStart = Environment.TickCount; |
| int ord = 0; |
| while (true) |
| { |
| string w = @is.ReadLine(); |
| if (w == null) |
| { |
| break; |
| } |
| FSTTester<object>.ToInt32sRef(w, inputMode, intsRef); |
| builder.Add(intsRef, GetOutput(intsRef, ord)); |
| |
| ord++; |
| if (ord % 500000 == 0) |
| { |
| Console.WriteLine(string.Format(CultureInfo.InvariantCulture, "{0:000000.000}s: {1:000000000}...", ((Environment.TickCount - tStart) / 1000.0), ord)); |
| } |
| if (ord >= limit) |
| { |
| break; |
| } |
| } |
| |
| long tMid = Environment.TickCount; |
| Console.WriteLine(((tMid - tStart) / 1000.0) + " sec to add all terms"); |
| |
| if (Debugging.AssertsEnabled) Debugging.Assert(builder.TermCount == ord); |
| FST<T> fst = builder.Finish(); |
| long tEnd = Environment.TickCount; |
| Console.WriteLine(((tEnd - tMid) / 1000.0) + " sec to finish/pack"); |
| if (fst == null) |
| { |
| Console.WriteLine("FST was fully pruned!"); |
| Environment.Exit(0); |
| } |
| |
| if (dirOut == null) |
| { |
| return; |
| } |
| |
| Console.WriteLine(ord + " terms; " + fst.NodeCount + " nodes; " + fst.ArcCount + " arcs; " + fst.ArcWithOutputCount + " arcs w/ output; tot size " + fst.GetSizeInBytes()); |
| if (fst.NodeCount < 100) |
| { |
| TextWriter w = new StreamWriter(new FileStream("out.dot", FileMode.Create), Encoding.UTF8); |
| Util.ToDot(fst, w, false, false); |
| w.Dispose(); |
| Console.WriteLine("Wrote FST to out.dot"); |
| } |
| |
| Directory dir = FSDirectory.Open(new DirectoryInfo(dirOut)); |
| IndexOutput @out = dir.CreateOutput("fst.bin", IOContext.DEFAULT); |
| fst.Save(@out); |
| @out.Dispose(); |
| Console.WriteLine("Saved FST to fst.bin."); |
| |
| if (!verify) |
| { |
| return; |
| } |
| |
| /* |
| IndexInput in = dir.OpenInput("fst.bin", IOContext.DEFAULT); |
| fst = new FST<T>(in, outputs); |
| in.Dispose(); |
| */ |
| |
| Console.WriteLine("\nNow verify..."); |
| |
| while (true) |
| { |
| for (int iter = 0; iter < 2; iter++) |
| { |
| @is.Dispose(); |
| @is = new StreamReader(new FileStream(wordsFileIn, FileMode.Open), Encoding.UTF8); |
| |
| ord = 0; |
| tStart = Environment.TickCount; |
| while (true) |
| { |
| string w = @is.ReadLine(); |
| if (w == null) |
| { |
| break; |
| } |
| FSTTester<object>.ToInt32sRef(w, inputMode, intsRef); |
| if (iter == 0) |
| { |
| T expected = GetOutput(intsRef, ord); |
| T actual = Util.Get(fst, intsRef); |
| if (actual == null) |
| { |
| throw new Exception("unexpected null output on input=" + w); |
| } |
| if (!actual.Equals(expected)) |
| { |
| throw new Exception("wrong output (got " + outputs.OutputToString(actual) + " but expected " + outputs.OutputToString(expected) + ") on input=" + w); |
| } |
| } |
| else |
| { |
| // Get by output |
| long? output = GetOutput(intsRef, ord) as long?; |
| Int32sRef actual = Util.GetByOutput(fst as FST<long?>, output.GetValueOrDefault()); |
| if (actual == null) |
| { |
| throw new Exception("unexpected null input from output=" + output); |
| } |
| if (!actual.Equals(intsRef)) |
| { |
| throw new Exception("wrong input (got " + actual + " but expected " + intsRef + " from output=" + output); |
| } |
| } |
| |
| ord++; |
| if (ord % 500000 == 0) |
| { |
| Console.WriteLine(((Environment.TickCount - tStart) / 1000.0) + "s: " + ord + "..."); |
| } |
| if (ord >= limit) |
| { |
| break; |
| } |
| } |
| |
| double totSec = ((Environment.TickCount - tStart) / 1000.0); |
| Console.WriteLine("Verify " + (iter == 1 ? "(by output) " : "") + "took " + totSec + " sec + (" + (int)((totSec * 1000000000 / ord)) + " nsec per lookup)"); |
| |
| if (!verifyByOutput) |
| { |
| break; |
| } |
| } |
| |
| // NOTE: comment out to profile lookup... |
| break; |
| } |
| |
| } |
| finally |
| { |
| @is.Dispose(); |
| } |
| } |
| } |
| |
| // TODO: try experiment: reverse terms before |
| // compressing -- how much smaller? |
| |
| // TODO: can FST be used to index all internal substrings, |
| // mapping to term? |
| |
| // java -cp ../build/codecs/classes/java:../test-framework/lib/randomizedtesting-runner-*.jar:../build/core/classes/test:../build/core/classes/test-framework:../build/core/classes/java:../build/test-framework/classes/java:../test-framework/lib/junit-4.10.jar Lucene.Net.Util.Fst.TestFSTs /xold/tmp/allTerms3.txt out |
| /*public static void Main(string[] args) |
| { |
| int prune = 0; |
| int limit = int.MaxValue; |
| int inputMode = 0; // utf8 |
| bool storeOrds = false; |
| bool storeDocFreqs = false; |
| bool verify = true; |
| bool doPack = false; |
| bool noArcArrays = false; |
| string wordsFileIn = null; |
| string dirOut = null; |
| |
| int idx = 0; |
| while (idx < args.Length) |
| { |
| if (args[idx].Equals("-prune", StringComparison.Ordinal)) |
| { |
| prune = Convert.ToInt32(args[1 + idx]); |
| idx++; |
| } |
| else if (args[idx].Equals("-limit", StringComparison.Ordinal)) |
| { |
| limit = Convert.ToInt32(args[1 + idx]); |
| idx++; |
| } |
| else if (args[idx].Equals("-utf8", StringComparison.Ordinal)) |
| { |
| inputMode = 0; |
| } |
| else if (args[idx].Equals("-utf32", StringComparison.Ordinal)) |
| { |
| inputMode = 1; |
| } |
| else if (args[idx].Equals("-docFreq", StringComparison.Ordinal)) |
| { |
| storeDocFreqs = true; |
| } |
| else if (args[idx].Equals("-noArcArrays", StringComparison.Ordinal)) |
| { |
| noArcArrays = true; |
| } |
| else if (args[idx].Equals("-ords", StringComparison.Ordinal)) |
| { |
| storeOrds = true; |
| } |
| else if (args[idx].Equals("-noverify", StringComparison.Ordinal)) |
| { |
| verify = false; |
| } |
| else if (args[idx].Equals("-pack", StringComparison.Ordinal)) |
| { |
| doPack = true; |
| } |
| else if (args[idx].StartsWith("-", StringComparison.Ordinal)) |
| { |
| Console.Error.WriteLine("Unrecognized option: " + args[idx]); |
| Environment.Exit(-1); |
| } |
| else |
| { |
| if (wordsFileIn == null) |
| { |
| wordsFileIn = args[idx]; |
| } |
| else if (dirOut == null) |
| { |
| dirOut = args[idx]; |
| } |
| else |
| { |
| Console.Error.WriteLine("Too many arguments, expected: input [output]"); |
| Environment.Exit(-1); |
| } |
| } |
| idx++; |
| } |
| |
| if (wordsFileIn == null) |
| { |
| Console.Error.WriteLine("No input file."); |
| Environment.Exit(-1); |
| } |
| |
| // ord benefits from share, docFreqs don't: |
| |
| if (storeOrds && storeDocFreqs) |
| { |
| // Store both ord & docFreq: |
| PositiveIntOutputs o1 = PositiveIntOutputs.Singleton; |
| PositiveIntOutputs o2 = PositiveIntOutputs.Singleton; |
| PairOutputs<long, long> outputs = new PairOutputs<long, long>(o1, o2); |
| new VisitTermsAnonymousClass(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays).Run(limit, verify, false); |
| } |
| else if (storeOrds) |
| { |
| // Store only ords |
| PositiveIntOutputs outputs = PositiveIntOutputs.Singleton; |
| new VisitTermsAnonymousClass2(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays).Run(limit, verify, true); |
| } |
| else if (storeDocFreqs) |
| { |
| // Store only docFreq |
| PositiveIntOutputs outputs = PositiveIntOutputs.Singleton; |
| new VisitTermsAnonymousClass3(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays).Run(limit, verify, false); |
| } |
| else |
| { |
| // Store nothing |
| NoOutputs outputs = NoOutputs.Singleton; |
| object NO_OUTPUT = outputs.NoOutput; |
| new VisitTermsAnonymousClass4(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays, NO_OUTPUT).Run(limit, verify, false); |
| } |
| }*/ |
| |
| private class VisitTermsAnonymousClass : VisitTerms<Pair> |
| { |
| private readonly PairOutputs<long?, long?> outputs; |
| |
| public VisitTermsAnonymousClass(string dirOut, string wordsFileIn, int inputMode, int prune, PairOutputs<long?, long?> outputs, bool doPack, bool noArcArrays) |
| : base(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) |
| { |
| this.outputs = outputs; |
| } |
| |
| internal Random rand; |
| protected internal override Pair GetOutput(Int32sRef input, int ord) |
| { |
| if (ord == 0) |
| { |
| rand = new Random(17); |
| } |
| return outputs.NewPair(ord, TestUtil.NextInt32(rand, 1, 5000)); |
| } |
| } |
| |
| private class VisitTermsAnonymousClass2 : VisitTerms<long?> |
| { |
| public VisitTermsAnonymousClass2(string dirOut, string wordsFileIn, int inputMode, int prune, PositiveInt32Outputs outputs, bool doPack, bool noArcArrays) |
| : base(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) |
| { |
| } |
| |
| protected internal override long? GetOutput(Int32sRef input, int ord) |
| { |
| return ord; |
| } |
| } |
| |
| private class VisitTermsAnonymousClass3 : VisitTerms<long?> |
| { |
| public VisitTermsAnonymousClass3(string dirOut, string wordsFileIn, int inputMode, int prune, PositiveInt32Outputs outputs, bool doPack, bool noArcArrays) |
| : base(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) |
| { |
| } |
| |
| internal Random rand; |
| protected internal override long? GetOutput(Int32sRef input, int ord) |
| { |
| if (ord == 0) |
| { |
| rand = new Random(17); |
| } |
| return (long)TestUtil.NextInt32(rand, 1, 5000); |
| } |
| } |
| |
| private class VisitTermsAnonymousClass4 : VisitTerms<object> |
| { |
| private readonly object NO_OUTPUT; |
| |
| public VisitTermsAnonymousClass4(string dirOut, string wordsFileIn, int inputMode, int prune, NoOutputs outputs, bool doPack, bool noArcArrays, object NO_OUTPUT) |
| : base(dirOut, wordsFileIn, inputMode, prune, outputs, doPack, noArcArrays) |
| { |
| this.NO_OUTPUT = NO_OUTPUT; |
| } |
| |
| protected internal override object GetOutput(Int32sRef input, int ord) |
| { |
| return NO_OUTPUT; |
| } |
| } |
| |
| [Test] |
| public virtual void TestSingleString() |
| { |
| Outputs<object> outputs = NoOutputs.Singleton; |
| Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, outputs); |
| b.Add(Util.ToInt32sRef(new BytesRef("foobar"), new Int32sRef()), outputs.NoOutput); |
| BytesRefFSTEnum<object> fstEnum = new BytesRefFSTEnum<object>(b.Finish()); |
| Assert.IsNull(fstEnum.SeekFloor(new BytesRef("foo"))); |
| Assert.IsNull(fstEnum.SeekCeil(new BytesRef("foobaz"))); |
| } |
| |
| |
| [Test] |
| public virtual void TestDuplicateFSAString() |
| { |
| string str = "foobar"; |
| Outputs<object> outputs = NoOutputs.Singleton; |
| Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, outputs); |
| Int32sRef ints = new Int32sRef(); |
| for (int i = 0; i < 10; i++) |
| { |
| b.Add(Util.ToInt32sRef(new BytesRef(str), ints), outputs.NoOutput); |
| } |
| FST<object> fst = b.Finish(); |
| |
| // count the input paths |
| int count = 0; |
| BytesRefFSTEnum<object> fstEnum = new BytesRefFSTEnum<object>(fst); |
| while (fstEnum.MoveNext()) |
| { |
| count++; |
| } |
| Assert.AreEqual(1, count); |
| |
| Assert.IsNotNull(Util.Get(fst, new BytesRef(str))); |
| Assert.IsNull(Util.Get(fst, new BytesRef("foobaz"))); |
| } |
| |
| /* |
| public void testTrivial() throws Exception { |
| |
| // Get outputs -- passing true means FST will share |
| // (delta code) the outputs. this should result in |
| // smaller FST if the outputs grow monotonically. But |
| // if numbers are "random", false should give smaller |
| // final size: |
| final NoOutputs outputs = NoOutputs.getSingleton(); |
| |
| String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"}; |
| |
| final Builder<Object> builder = new Builder<Object>(FST.INPUT_TYPE.BYTE1, |
| 0, 0, |
| true, |
| true, |
| Integer.MAX_VALUE, |
| outputs, |
| null, |
| true); |
| Arrays.sort(strings); |
| final IntsRef scratch = new IntsRef(); |
| for(String s : strings) { |
| builder.Add(Util.ToIntsRef(new BytesRef(s), scratch), outputs.getNoOutput()); |
| } |
| final FST<Object> fst = builder.Finish(); |
| System.out.println("DOT before rewrite"); |
| Writer w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/before.dot")); |
| Util.toDot(fst, w, false, false); |
| w.Dispose(); |
| |
| final FST<Object> rewrite = new FST<Object>(fst, 1, 100); |
| |
| System.out.println("DOT after rewrite"); |
| w = new OutputStreamWriter(new FileOutputStream("/mnt/scratch/after.dot")); |
| Util.toDot(rewrite, w, false, false); |
| w.Dispose(); |
| } |
| */ |
| |
| [Test] |
| public virtual void TestSimple() |
| { |
| |
| // Get outputs -- passing true means FST will share |
| // (delta code) the outputs. this should result in |
| // smaller FST if the outputs grow monotonically. But |
| // if numbers are "random", false should give smaller |
| // final size: |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| |
| // Build an FST mapping BytesRef -> Long |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| BytesRef a = new BytesRef("a"); |
| BytesRef b = new BytesRef("b"); |
| BytesRef c = new BytesRef("c"); |
| |
| builder.Add(Util.ToInt32sRef(a, new Int32sRef()), 17L); |
| builder.Add(Util.ToInt32sRef(b, new Int32sRef()), 42L); |
| builder.Add(Util.ToInt32sRef(c, new Int32sRef()), 13824324872317238L); |
| |
| FST<long?> fst = builder.Finish(); |
| |
| Assert.AreEqual(13824324872317238L, Util.Get(fst, c)); |
| Assert.AreEqual(42, Util.Get(fst, b)); |
| Assert.AreEqual(17, Util.Get(fst, a)); |
| |
| BytesRefFSTEnum<long?> fstEnum = new BytesRefFSTEnum<long?>(fst); |
| BytesRefFSTEnum.InputOutput<long?> seekResult; |
| seekResult = fstEnum.SeekFloor(a); |
| Assert.IsNotNull(seekResult); |
| Assert.AreEqual(17, seekResult.Output); |
| |
| // goes to a |
| seekResult = fstEnum.SeekFloor(new BytesRef("aa")); |
| Assert.IsNotNull(seekResult); |
| Assert.AreEqual(17, seekResult.Output); |
| |
| // goes to b |
| seekResult = fstEnum.SeekCeil(new BytesRef("aa")); |
| Assert.IsNotNull(seekResult); |
| Assert.AreEqual(b, seekResult.Input); |
| Assert.AreEqual(42, seekResult.Output); |
| |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("c"), new Int32sRef()), Util.GetByOutput(fst, 13824324872317238L)); |
| Assert.IsNull(Util.GetByOutput(fst, 47)); |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("b"), new Int32sRef()), Util.GetByOutput(fst, 42)); |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("a"), new Int32sRef()), Util.GetByOutput(fst, 17)); |
| } |
| |
| [Test] |
| public virtual void TestPrimaryKeys() |
| { |
| Directory dir = NewDirectory(); |
| |
| for (int cycle = 0; cycle < 2; cycle++) |
| { |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: cycle=" + cycle); |
| } |
| RandomIndexWriter w = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)).SetOpenMode(OpenMode.CREATE)); |
| Document doc = new Document(); |
| Field idField = NewStringField("id", "", Field.Store.NO); |
| doc.Add(idField); |
| |
| int NUM_IDS = AtLeast(200); |
| //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble())); |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: NUM_IDS=" + NUM_IDS); |
| } |
| ISet<string> allIDs = new JCG.HashSet<string>(); |
| for (int id = 0; id < NUM_IDS; id++) |
| { |
| string idString; |
| if (cycle == 0) |
| { |
| // PKs are assigned sequentially |
| idString = string.Format(CultureInfo.InvariantCulture, "{0:0000000}", id); |
| } |
| else |
| { |
| while (true) |
| { |
| string s = Convert.ToString(Random.NextInt64(), CultureInfo.InvariantCulture); |
| if (!allIDs.Contains(s)) |
| { |
| idString = s; |
| break; |
| } |
| } |
| } |
| allIDs.Add(idString); |
| idField.SetStringValue(idString); |
| w.AddDocument(doc); |
| } |
| |
| //w.forceMerge(1); |
| |
| // turn writer into reader: |
| IndexReader r = w.GetReader(); |
| IndexSearcher idxS = NewSearcher(r); |
| w.Dispose(); |
| |
| IList<string> allIDsList = new List<string>(allIDs); |
| IList<string> sortedAllIDsList = new List<string>(allIDsList); |
| CollectionUtil.TimSort(sortedAllIDsList); |
| |
| // Sprinkle in some non-existent PKs: |
| ISet<string> outOfBounds = new JCG.HashSet<string>(); |
| for (int idx = 0; idx < NUM_IDS / 10; idx++) |
| { |
| string idString; |
| if (cycle == 0) |
| { |
| idString = string.Format(CultureInfo.InvariantCulture, "{0:0000000}", (NUM_IDS + idx)); |
| } |
| else |
| { |
| while (true) |
| { |
| idString = Convert.ToString(Random.NextInt64(), CultureInfo.InvariantCulture); |
| if (!allIDs.Contains(idString)) |
| { |
| break; |
| } |
| } |
| } |
| outOfBounds.Add(idString); |
| allIDsList.Add(idString); |
| } |
| |
| // Verify w/ TermQuery |
| for (int iter = 0; iter < 2 * NUM_IDS; iter++) |
| { |
| string id = allIDsList[Random.Next(allIDsList.Count)]; |
| bool exists = !outOfBounds.Contains(id); |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id); |
| } |
| Assert.AreEqual(exists ? 1 : 0, idxS.Search(new TermQuery(new Term("id", id)), 1).TotalHits, (exists ? "" : "non-exist ") + "id=" + id); |
| } |
| |
| // Verify w/ MultiTermsEnum |
| TermsEnum termsEnum = MultiFields.GetTerms(r, "id").GetEnumerator(); |
| for (int iter = 0; iter < 2 * NUM_IDS; iter++) |
| { |
| string id; |
| string nextID; |
| bool exists; |
| |
| if (Random.NextBoolean()) |
| { |
| id = allIDsList[Random.Next(allIDsList.Count)]; |
| exists = !outOfBounds.Contains(id); |
| nextID = null; |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id); |
| } |
| } |
| else |
| { |
| // Pick ID between two IDs: |
| exists = false; |
| int idv = Random.Next(NUM_IDS - 1); |
| if (cycle == 0) |
| { |
| id = string.Format(CultureInfo.InvariantCulture, "{0:0000000}a", idv); |
| nextID = string.Format(CultureInfo.InvariantCulture, "{0:0000000}", idv + 1); |
| } |
| else |
| { |
| id = sortedAllIDsList[idv] + "a"; |
| nextID = sortedAllIDsList[idv + 1]; |
| } |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: not exactOnly id=" + id + " nextID=" + nextID); |
| } |
| } |
| |
| TermsEnum.SeekStatus status; |
| if (nextID == null) |
| { |
| if (termsEnum.SeekExact(new BytesRef(id))) |
| { |
| status = TermsEnum.SeekStatus.FOUND; |
| } |
| else |
| { |
| status = TermsEnum.SeekStatus.NOT_FOUND; |
| } |
| } |
| else |
| { |
| status = termsEnum.SeekCeil(new BytesRef(id)); |
| } |
| |
| if (nextID != null) |
| { |
| Assert.AreEqual(TermsEnum.SeekStatus.NOT_FOUND, status); |
| Assert.AreEqual(new BytesRef(nextID), termsEnum.Term, "expected=" + nextID + " actual=" + termsEnum.Term.Utf8ToString()); |
| } |
| else if (!exists) |
| { |
| Assert.IsTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END); |
| } |
| else |
| { |
| Assert.AreEqual(TermsEnum.SeekStatus.FOUND, status); |
| } |
| } |
| |
| r.Dispose(); |
| } |
| dir.Dispose(); |
| } |
| |
| [Test] |
| public virtual void TestRandomTermLookup() |
| { |
| Directory dir = NewDirectory(); |
| |
| RandomIndexWriter w = new RandomIndexWriter(Random, dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random)) |
| .SetOpenMode(OpenMode.CREATE)); |
| Document doc = new Document(); |
| Field f = NewStringField("field", "", Field.Store.NO); |
| doc.Add(f); |
| |
| int NUM_TERMS = (int)(1000 * RandomMultiplier * (1 + Random.NextDouble())); |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: NUM_TERMS=" + NUM_TERMS); |
| } |
| |
| ISet<string> allTerms = new JCG.HashSet<string>(); |
| while (allTerms.Count < NUM_TERMS) |
| { |
| allTerms.Add(FSTTester<object>.SimpleRandomString(Random)); |
| } |
| |
| foreach (string term in allTerms) |
| { |
| f.SetStringValue(term); |
| w.AddDocument(doc); |
| } |
| |
| // turn writer into reader: |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: get reader"); |
| } |
| IndexReader r = w.GetReader(); |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: got reader=" + r); |
| } |
| IndexSearcher s = NewSearcher(r); |
| w.Dispose(); |
| |
| IList<string> allTermsList = new List<string>(allTerms); |
| allTermsList.Shuffle(Random); |
| |
| // verify exact lookup |
| foreach (string term in allTermsList) |
| { |
| if (Verbose) |
| { |
| Console.WriteLine("TEST: term=" + term); |
| } |
| Assert.AreEqual(s.Search(new TermQuery(new Term("field", term)), 1).TotalHits, 1, "term=" + term); |
| } |
| |
| r.Dispose(); |
| dir.Dispose(); |
| } |
| |
| |
| /// <summary> |
| /// Test state expansion (array format) on close-to-root states. Creates |
| /// synthetic input that has one expanded state on each level. |
| /// </summary> |
| /// <seealso cref= "https://issues.apache.org/jira/browse/LUCENE-2933" </seealso> |
| [Test] |
| public virtual void TestExpandedCloseToRoot() |
| { |
| // Sanity check. |
| Assert.IsTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); |
| Assert.IsTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); |
| |
| SyntheticData s = new SyntheticData(); |
| |
| List<string> @out = new List<string>(); |
| StringBuilder b = new StringBuilder(); |
| s.Generate(@out, b, 'a', 'i', 10); |
| string[] input = @out.ToArray(); |
| Array.Sort(input); |
| FST<object> fst = s.Compile(input); |
| FST.Arc<object> arc = fst.GetFirstArc(new FST.Arc<object>()); |
| s.VerifyStateAndBelow(fst, arc, 1); |
| } |
| |
| private class SyntheticData |
| { |
| public FST<object> Compile(string[] lines) |
| { |
| NoOutputs outputs = Fst.NoOutputs.Singleton; |
| object nothing = outputs.NoOutput; |
| Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| int line = 0; |
| BytesRef term = new BytesRef(); |
| Int32sRef scratchIntsRef = new Int32sRef(); |
| |
| while (line < lines.Length) |
| { |
| string w = lines[line++]; |
| if (w == null) |
| { |
| break; |
| } |
| term.CopyChars(w); |
| b.Add(Util.ToInt32sRef(term, scratchIntsRef), nothing); |
| } |
| |
| return b.Finish(); |
| } |
| |
| public void Generate(IList<string> @out, StringBuilder b, char from, char to, int depth) |
| { |
| if (depth == 0 || from == to) |
| { |
| string seq = b.ToString() + "_" + @out.Count + "_end"; |
| @out.Add(seq); |
| } |
| else |
| { |
| for (char c = from; c <= to; c++) |
| { |
| b.Append(c); |
| Generate(@out, b, from, c == to ? to : from, depth - 1); |
| b.Remove(b.Length - 1, 1);//remove last char |
| } |
| } |
| } |
| |
| public int VerifyStateAndBelow(FST<object> fst, FST.Arc<object> arc, int depth) |
| { |
| if (FST<object>.TargetHasArcs(arc)) |
| { |
| int childCount = 0; |
| BytesReader fstReader = fst.GetBytesReader(); |
| for (arc = fst.ReadFirstTargetArc(arc, arc, fstReader); ; |
| arc = fst.ReadNextArc(arc, fstReader), childCount++) |
| { |
| bool expanded = fst.IsExpandedTarget(arc, fstReader); |
| int children = VerifyStateAndBelow(fst, new FST.Arc<object>().CopyFrom(arc), depth + 1); |
| |
| Assert.AreEqual(expanded, (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); |
| if (arc.IsLast) |
| { |
| break; |
| } |
| } |
| return childCount; |
| } |
| return 0; |
| } |
| } |
| |
| [Test] |
| public virtual void TestFinalOutputOnEndState() |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, int.MaxValue, outputs, null, Random.NextBoolean(), PackedInt32s.DEFAULT, true, 15); |
| builder.Add(Util.ToUTF32("stat", new Int32sRef()), 17L); |
| builder.Add(Util.ToUTF32("station", new Int32sRef()), 10L); |
| FST<long?> fst = builder.Finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); |
| StringWriter w = new StringWriter(); |
| Util.ToDot(fst, w, false, false); |
| w.Dispose(); |
| //System.out.println(w.toString()); |
| Assert.IsTrue(w.ToString().IndexOf("label=\"t/[7]\"", StringComparison.Ordinal) != -1); |
| } |
| |
| [Test] |
| public virtual void TestInternalFinalState() |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| bool willRewrite = Random.NextBoolean(); |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, willRewrite, PackedInt32s.DEFAULT, true, 15); |
| builder.Add(Util.ToInt32sRef(new BytesRef("stat"), new Int32sRef()), outputs.NoOutput); |
| builder.Add(Util.ToInt32sRef(new BytesRef("station"), new Int32sRef()), outputs.NoOutput); |
| FST<long?> fst = builder.Finish(); |
| StringWriter w = new StringWriter(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); |
| Util.ToDot(fst, w, false, false); |
| w.Dispose(); |
| //System.out.println(w.toString()); |
| |
| // check for accept state at label t |
| Assert.IsTrue(w.ToString().IndexOf("[label=\"t\" style=\"bold\"", StringComparison.Ordinal) != -1); |
| // check for accept state at label n |
| Assert.IsTrue(w.ToString().IndexOf("[label=\"n\" style=\"bold\"", StringComparison.Ordinal) != -1); |
| } |
| |
| // Make sure raw FST can differentiate between final vs |
| // non-final end nodes |
| [Test] |
| public virtual void TestNonFinalStopNode() |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| long? nothing = outputs.NoOutput; |
| Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| FST<long?> fst = new FST<long?>(FST.INPUT_TYPE.BYTE1, outputs, false, PackedInt32s.COMPACT, true, 15); |
| |
| Builder.UnCompiledNode<long?> rootNode = new Builder.UnCompiledNode<long?>(b, 0); |
| |
| // Add final stop node |
| { |
| Builder.UnCompiledNode<long?> node = new Builder.UnCompiledNode<long?>(b, 0); |
| node.IsFinal = true; |
| rootNode.AddArc('a', node); |
| Builder.CompiledNode frozen = new Builder.CompiledNode(); |
| frozen.Node = fst.AddNode(node); |
| rootNode.Arcs[0].NextFinalOutput = 17L; |
| rootNode.Arcs[0].IsFinal = true; |
| rootNode.Arcs[0].Output = nothing; |
| rootNode.Arcs[0].Target = frozen; |
| } |
| |
| // Add non-final stop node |
| { |
| Builder.UnCompiledNode<long?> node = new Builder.UnCompiledNode<long?>(b, 0); |
| rootNode.AddArc('b', node); |
| Builder.CompiledNode frozen = new Builder.CompiledNode(); |
| frozen.Node = fst.AddNode(node); |
| rootNode.Arcs[1].NextFinalOutput = nothing; |
| rootNode.Arcs[1].Output = 42L; |
| rootNode.Arcs[1].Target = frozen; |
| } |
| |
| fst.Finish(fst.AddNode(rootNode)); |
| |
| StringWriter w = new StringWriter(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp3/out.dot")); |
| Util.ToDot(fst, w, false, false); |
| w.Dispose(); |
| |
| CheckStopNodes(fst, outputs); |
| |
| // Make sure it still works after save/load: |
| Directory dir = NewDirectory(); |
| IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); |
| fst.Save(@out); |
| @out.Dispose(); |
| |
| IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); |
| FST<long?> fst2 = new FST<long?>(@in, outputs); |
| CheckStopNodes(fst2, outputs); |
| @in.Dispose(); |
| dir.Dispose(); |
| } |
| |
| private void CheckStopNodes(FST<long?> fst, PositiveInt32Outputs outputs) |
| { |
| long? nothing = outputs.NoOutput; |
| FST.Arc<long?> startArc = fst.GetFirstArc(new FST.Arc<long?>()); |
| Assert.AreEqual(nothing, startArc.Output); |
| Assert.AreEqual(nothing, startArc.NextFinalOutput); |
| |
| FST.Arc<long?> arc = fst.ReadFirstTargetArc(startArc, new FST.Arc<long?>(), fst.GetBytesReader()); |
| Assert.AreEqual('a', arc.Label); |
| Assert.AreEqual(17, arc.NextFinalOutput); |
| Assert.IsTrue(arc.IsFinal); |
| |
| arc = fst.ReadNextArc(arc, fst.GetBytesReader()); |
| Assert.AreEqual('b', arc.Label); |
| Assert.IsFalse(arc.IsFinal); |
| Assert.AreEqual(42, arc.Output); |
| } |
| |
| internal static readonly IComparer<long?> minLongComparer = Comparer<long?>.Create((left, right)=> left.Value.CompareTo(right.Value)); |
| |
| [Test] |
| public virtual void TestShortestPaths() |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| Int32sRef scratch = new Int32sRef(); |
| builder.Add(Util.ToInt32sRef(new BytesRef("aab"), scratch), 22L); |
| builder.Add(Util.ToInt32sRef(new BytesRef("aac"), scratch), 7L); |
| builder.Add(Util.ToInt32sRef(new BytesRef("ax"), scratch), 17L); |
| FST<long?> fst = builder.Finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.Dispose(); |
| |
| Util.TopResults<long?> res = Util.ShortestPaths(fst, fst.GetFirstArc(new FST.Arc<long?>()), outputs.NoOutput, minLongComparer, 3, true); |
| Assert.IsTrue(res.IsComplete); |
| Assert.AreEqual(3, res.TopN.Count); |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("aac"), scratch), res.TopN[0].Input); |
| Assert.AreEqual(7L, res.TopN[0].Output); |
| |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("ax"), scratch), res.TopN[1].Input); |
| Assert.AreEqual(17L, res.TopN[1].Output); |
| |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("aab"), scratch), res.TopN[2].Input); |
| Assert.AreEqual(22L, res.TopN[2].Output); |
| } |
| |
| [Test] |
| public virtual void TestRejectNoLimits() |
| { |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| Int32sRef scratch = new Int32sRef(); |
| builder.Add(Util.ToInt32sRef(new BytesRef("aab"), scratch), 22L); |
| builder.Add(Util.ToInt32sRef(new BytesRef("aac"), scratch), 7L); |
| builder.Add(Util.ToInt32sRef(new BytesRef("adcd"), scratch), 17L); |
| builder.Add(Util.ToInt32sRef(new BytesRef("adcde"), scratch), 17L); |
| |
| builder.Add(Util.ToInt32sRef(new BytesRef("ax"), scratch), 17L); |
| FST<long?> fst = builder.Finish(); |
| AtomicInt32 rejectCount = new AtomicInt32(); |
| Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousClass(fst, minLongComparer, rejectCount); |
| |
| searcher.AddStartPaths(fst.GetFirstArc(new FST.Arc<long?>()), outputs.NoOutput, true, new Int32sRef()); |
| Util.TopResults<long?> res = searcher.Search(); |
| Assert.AreEqual(rejectCount, 4); |
| Assert.IsTrue(res.IsComplete); // rejected(4) + topN(2) <= maxQueueSize(6) |
| |
| Assert.AreEqual(1, res.TopN.Count); |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("aac"), scratch), res.TopN[0].Input); |
| Assert.AreEqual(7L, res.TopN[0].Output); |
| rejectCount.Value = (0); |
| searcher = new TopNSearcherAnonymousClass2(fst, minLongComparer, rejectCount); |
| |
| searcher.AddStartPaths(fst.GetFirstArc(new FST.Arc<long?>()), outputs.NoOutput, true, new Int32sRef()); |
| res = searcher.Search(); |
| Assert.AreEqual(rejectCount, 4); |
| Assert.IsFalse(res.IsComplete); // rejected(4) + topN(2) > maxQueueSize(5) |
| } |
| |
| private class TopNSearcherAnonymousClass : Util.TopNSearcher<long?> |
| { |
| private readonly AtomicInt32 rejectCount; |
| |
| public TopNSearcherAnonymousClass(FST<long?> fst, IComparer<long?> minLongComparer, AtomicInt32 rejectCount) |
| : base(fst, 2, 6, minLongComparer) |
| { |
| this.rejectCount = rejectCount; |
| } |
| |
| protected override bool AcceptResult(Int32sRef input, long? output) |
| { |
| bool accept = (int)output == 7; |
| if (!accept) |
| { |
| rejectCount.IncrementAndGet(); |
| } |
| return accept; |
| } |
| } |
| |
| private class TopNSearcherAnonymousClass2 : Util.TopNSearcher<long?> |
| { |
| private readonly AtomicInt32 rejectCount; |
| |
| public TopNSearcherAnonymousClass2(FST<long?> fst, IComparer<long?> minLongComparer, AtomicInt32 rejectCount) |
| : base(fst, 2, 5, minLongComparer) |
| { |
| this.rejectCount = rejectCount; |
| } |
| |
| protected override bool AcceptResult(Int32sRef input, long? output) |
| { |
| bool accept = (int)output == 7; |
| if (!accept) |
| { |
| rejectCount.IncrementAndGet(); |
| } |
| return accept; |
| } |
| } |
| |
| // compares just the weight side of the pair |
| internal static readonly IComparer<Pair> minPairWeightComparer = Comparer<Pair>.Create((left, right)=> left.Output1.GetValueOrDefault().CompareTo(right.Output1.GetValueOrDefault())); |
| |
| /// <summary> |
| /// like testShortestPaths, but uses pairoutputs so we have both a weight and an output </summary> |
| [Test] |
| public virtual void TestShortestPathsWFST() |
| { |
| |
| PairOutputs<long?, long?> outputs = new PairOutputs<long?, long?>(PositiveInt32Outputs.Singleton, PositiveInt32Outputs.Singleton); // output - weight |
| |
| Builder<Pair> builder = new Builder<Pair>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| Int32sRef scratch = new Int32sRef(); |
| builder.Add(Util.ToInt32sRef(new BytesRef("aab"), scratch), outputs.NewPair(22L, 57L)); |
| builder.Add(Util.ToInt32sRef(new BytesRef("aac"), scratch), outputs.NewPair(7L, 36L)); |
| builder.Add(Util.ToInt32sRef(new BytesRef("ax"), scratch), outputs.NewPair(17L, 85L)); |
| FST<Pair> fst = builder.Finish(); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.Dispose(); |
| |
| Util.TopResults<Pair> res = Util.ShortestPaths(fst, fst.GetFirstArc(new FST.Arc<Pair>()), outputs.NoOutput, minPairWeightComparer, 3, true); |
| Assert.IsTrue(res.IsComplete); |
| Assert.AreEqual(3, res.TopN.Count); |
| |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("aac"), scratch), res.TopN[0].Input); |
| Assert.AreEqual(7L, res.TopN[0].Output.Output1); // weight |
| Assert.AreEqual(36L, res.TopN[0].Output.Output2); // output |
| |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("ax"), scratch), res.TopN[1].Input); |
| Assert.AreEqual(17L, res.TopN[1].Output.Output1); // weight |
| Assert.AreEqual(85L, res.TopN[1].Output.Output2); // output |
| |
| Assert.AreEqual(Util.ToInt32sRef(new BytesRef("aab"), scratch), res.TopN[2].Input); |
| Assert.AreEqual(22L, res.TopN[2].Output.Output1); // weight |
| Assert.AreEqual(57L, res.TopN[2].Output.Output2); // output |
| } |
| |
| [Test] |
| public virtual void TestShortestPathsRandom() |
| { |
| Random random = Random; |
| int numWords = AtLeast(1000); |
| |
| JCG.SortedDictionary<string, long> slowCompletor = new JCG.SortedDictionary<string, long>(StringComparer.Ordinal); |
| JCG.SortedSet<string> allPrefixes = new JCG.SortedSet<string>(StringComparer.Ordinal); |
| |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); |
| Int32sRef scratch = new Int32sRef(); |
| |
| for (int i = 0; i < numWords; i++) |
| { |
| string s; |
| while (true) |
| { |
| s = TestUtil.RandomSimpleString(random); |
| if (!slowCompletor.ContainsKey(s)) |
| { |
| break; |
| } |
| } |
| |
| for (int j = 1; j < s.Length; j++) |
| { |
| allPrefixes.Add(s.Substring(0, j)); |
| } |
| int weight = TestUtil.NextInt32(random, 1, 100); // weights 1..100 |
| slowCompletor[s] = (long)weight; |
| } |
| |
| foreach (KeyValuePair<string, long> e in slowCompletor) |
| { |
| //System.out.println("add: " + e); |
| builder.Add(Util.ToInt32sRef(new BytesRef(e.Key), scratch), e.Value); |
| } |
| |
| FST<long?> fst = builder.Finish(); |
| //System.out.println("SAVE out.dot"); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.Dispose(); |
| |
| BytesReader reader = fst.GetBytesReader(); |
| |
| //System.out.println("testing: " + allPrefixes.Size() + " prefixes"); |
| foreach (string prefix in allPrefixes) |
| { |
| // 1. run prefix against fst, then complete by value |
| //System.out.println("TEST: " + prefix); |
| |
| long? prefixOutput = 0; |
| FST.Arc<long?> arc = fst.GetFirstArc(new FST.Arc<long?>()); |
| for (int idx = 0; idx < prefix.Length; idx++) |
| { |
| if (fst.FindTargetArc((int)prefix[idx], arc, arc, reader) == null) |
| { |
| Assert.Fail(); |
| } |
| prefixOutput += arc.Output; |
| } |
| |
| int topN = TestUtil.NextInt32(random, 1, 10); |
| |
| Util.TopResults<long?> r = Util.ShortestPaths(fst, arc, fst.Outputs.NoOutput, minLongComparer, topN, true); |
| Assert.IsTrue(r.IsComplete); |
| |
| // 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion |
| List<Util.Result<long?>> matches = new List<Util.Result<long?>>(); |
| |
| // TODO: could be faster... but its slowCompletor for a reason |
| foreach (KeyValuePair<string, long> e in slowCompletor) |
| { |
| if (e.Key.StartsWith(prefix, StringComparison.Ordinal)) |
| { |
| //System.out.println(" consider " + e.getKey()); |
| matches.Add(new Util.Result<long?>(Util.ToInt32sRef(new BytesRef(e.Key.Substring(prefix.Length)), new Int32sRef()), e.Value - prefixOutput)); |
| } |
| } |
| |
| Assert.IsTrue(matches.Count > 0); |
| matches.Sort(new TieBreakByInputComparer<long?>(minLongComparer)); |
| if (matches.Count > topN) |
| { |
| matches.SubList(topN, matches.Count).Clear(); |
| } |
| |
| Assert.AreEqual(matches.Count, r.TopN.Count); |
| |
| for (int hit = 0; hit < r.TopN.Count; hit++) |
| { |
| //System.out.println(" check hit " + hit); |
| Assert.AreEqual(matches[hit].Input, r.TopN[hit].Input); |
| Assert.AreEqual(matches[hit].Output, r.TopN[hit].Output); |
| } |
| } |
| } |
| |
| private class TieBreakByInputComparer<T> : IComparer<Util.Result<T>> |
| { |
| private readonly IComparer<T> comparer; |
| public TieBreakByInputComparer(IComparer<T> comparer) |
| { |
| this.comparer = comparer; |
| } |
| |
| public virtual int Compare(Util.Result<T> a, Util.Result<T> b) |
| { |
| int cmp = comparer.Compare(a.Output, b.Output); |
| if (cmp == 0) |
| { |
| return a.Input.CompareTo(b.Input); |
| } |
| else |
| { |
| return cmp; |
| } |
| } |
| } |
| |
| // used by slowcompletor |
| internal class TwoLongs |
| { |
| internal long a; |
| internal long b; |
| |
| internal TwoLongs(long a, long b) |
| { |
| this.a = a; |
| this.b = b; |
| } |
| } |
| |
| /// <summary> |
| /// like testShortestPathsRandom, but uses pairoutputs so we have both a weight and an output </summary> |
| [Test] |
| public virtual void TestShortestPathsWFSTRandom() |
| { |
| int numWords = AtLeast(1000); |
| |
| JCG.SortedDictionary<string, TwoLongs> slowCompletor = new JCG.SortedDictionary<string, TwoLongs>(StringComparer.Ordinal); |
| JCG.SortedSet<string> allPrefixes = new JCG.SortedSet<string>(StringComparer.Ordinal); |
| |
| PairOutputs<long?, long?> outputs = new PairOutputs<long?, long?>(PositiveInt32Outputs.Singleton, PositiveInt32Outputs.Singleton); // output - weight |
| Builder<Pair> builder = new Builder<Pair>(FST.INPUT_TYPE.BYTE1, outputs); |
| Int32sRef scratch = new Int32sRef(); |
| |
| Random random = Random; |
| for (int i = 0; i < numWords; i++) |
| { |
| string s; |
| while (true) |
| { |
| s = TestUtil.RandomSimpleString(random); |
| if (!slowCompletor.ContainsKey(s)) |
| { |
| break; |
| } |
| } |
| |
| for (int j = 1; j < s.Length; j++) |
| { |
| allPrefixes.Add(s.Substring(0, j)); |
| } |
| int weight = TestUtil.NextInt32(random, 1, 100); // weights 1..100 |
| int output = TestUtil.NextInt32(random, 0, 500); // outputs 0..500 |
| slowCompletor[s] = new TwoLongs(weight, output); |
| } |
| |
| foreach (KeyValuePair<string, TwoLongs> e in slowCompletor) |
| { |
| //System.out.println("add: " + e); |
| long weight = e.Value.a; |
| long output = e.Value.b; |
| builder.Add(Util.ToInt32sRef(new BytesRef(e.Key), scratch), outputs.NewPair(weight, output)); |
| } |
| |
| FST<Pair> fst = builder.Finish(); |
| //System.out.println("SAVE out.dot"); |
| //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); |
| //Util.toDot(fst, w, false, false); |
| //w.Dispose(); |
| |
| BytesReader reader = fst.GetBytesReader(); |
| |
| //System.out.println("testing: " + allPrefixes.Size() + " prefixes"); |
| foreach (string prefix in allPrefixes) |
| { |
| // 1. run prefix against fst, then complete by value |
| //System.out.println("TEST: " + prefix); |
| |
| Pair prefixOutput = outputs.NoOutput; |
| FST.Arc<Pair> arc = fst.GetFirstArc(new FST.Arc<Pair>()); |
| for (int idx = 0; idx < prefix.Length; idx++) |
| { |
| if (fst.FindTargetArc((int)prefix[idx], arc, arc, reader) == null) |
| { |
| Assert.Fail(); |
| } |
| prefixOutput = outputs.Add(prefixOutput, arc.Output); |
| } |
| |
| int topN = TestUtil.NextInt32(random, 1, 10); |
| |
| Util.TopResults<Pair> r = Util.ShortestPaths(fst, arc, fst.Outputs.NoOutput, minPairWeightComparer, topN, true); |
| Assert.IsTrue(r.IsComplete); |
| // 2. go thru whole treemap (slowCompletor) and check its actually the best suggestion |
| List<Util.Result<Pair>> matches = new List<Util.Result<Pair>>(); |
| |
| // TODO: could be faster... but its slowCompletor for a reason |
| foreach (KeyValuePair<string, TwoLongs> e in slowCompletor) |
| { |
| if (e.Key.StartsWith(prefix, StringComparison.Ordinal)) |
| { |
| //System.out.println(" consider " + e.getKey()); |
| matches.Add(new Util.Result<Pair>(Util.ToInt32sRef(new BytesRef(e.Key.Substring(prefix.Length)), new Int32sRef()), |
| outputs.NewPair(e.Value.a - prefixOutput.Output1, e.Value.b - prefixOutput.Output2))); |
| } |
| } |
| |
| Assert.IsTrue(matches.Count > 0); |
| matches.Sort(new TieBreakByInputComparer<Pair>(minPairWeightComparer)); |
| if (matches.Count > topN) |
| { |
| matches.SubList(topN, matches.Count).Clear(); |
| } |
| |
| Assert.AreEqual(matches.Count, r.TopN.Count); |
| |
| for (int hit = 0; hit < r.TopN.Count; hit++) |
| { |
| //System.out.println(" check hit " + hit); |
| Assert.AreEqual(matches[hit].Input, r.TopN[hit].Input); |
| Assert.AreEqual(matches[hit].Output, r.TopN[hit].Output); |
| } |
| } |
| } |
| |
| [Test] |
| public virtual void TestLargeOutputsOnArrayArcs() |
| { |
| ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton; |
| Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs); |
| |
| byte[] bytes = new byte[300]; |
| Int32sRef input = new Int32sRef(); |
| input.Grow(1); |
| input.Length = 1; |
| BytesRef output = new BytesRef(bytes); |
| for (int arc = 0; arc < 6; arc++) |
| { |
| input.Int32s[0] = arc; |
| output.Bytes[0] = (byte)arc; |
| builder.Add(input, BytesRef.DeepCopyOf(output)); |
| } |
| |
| FST<BytesRef> fst = builder.Finish(); |
| for (int arc = 0; arc < 6; arc++) |
| { |
| input.Int32s[0] = arc; |
| BytesRef result = Util.Get(fst, input); |
| Assert.IsNotNull(result); |
| Assert.AreEqual(300, result.Length); |
| Assert.AreEqual(result.Bytes[result.Offset], arc); |
| for (int byteIDX = 1; byteIDX < result.Length; byteIDX++) |
| { |
| Assert.AreEqual((byte)0, result.Bytes[result.Offset + byteIDX]); |
| } |
| } |
| } |
| } |
| } |