| using Lucene.Net.Store; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util.Packed; |
| using System; |
| using System.Collections; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Linq; |
| using System.Text; |
| using Console = Lucene.Net.Support.SystemConsole; |
| using Debug = Lucene.Net.Diagnostics.Debug; // LUCENENET NOTE: We cannot use System.Diagnostics.Debug because those calls will be optimized out of the release! |
| using Assert = Lucene.Net.TestFramework.Assert; |
| using Directory = Lucene.Net.Store.Directory; |
| |
| namespace Lucene.Net.Util.Fst |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Holds one input/output pair. </summary> |
| public class InputOutput<T> : IComparable<InputOutput<T>> |
| { |
| public Int32sRef Input { get; private set; } |
| public T Output { get; private set; } |
| |
| public InputOutput(Int32sRef input, T output) |
| { |
| this.Input = input; |
| this.Output = output; |
| } |
| |
| public virtual int CompareTo(InputOutput<T> other) |
| { |
| return this.Input.CompareTo(other.Input); |
| } |
| } |
| |
| /// <summary> |
| /// Helper class to test FSTs. </summary> |
| public class FSTTester<T> |
| { |
| internal readonly Random random; |
| internal readonly IList<InputOutput<T>> pairs; |
| internal readonly int inputMode; |
| internal readonly Outputs<T> outputs; |
| internal readonly Directory dir; |
| internal readonly bool doReverseLookup; |
| |
| public FSTTester(Random random, Directory dir, int inputMode, IList<InputOutput<T>> pairs, Outputs<T> outputs, bool doReverseLookup) |
| { |
| this.random = random; |
| this.dir = dir; |
| this.inputMode = inputMode; |
| this.pairs = pairs; |
| this.outputs = outputs; |
| this.doReverseLookup = doReverseLookup; |
| } |
| |
| internal static string InputToString(int inputMode, Int32sRef term) |
| { |
| return InputToString(inputMode, term, true); |
| } |
| |
| internal static string InputToString(int inputMode, Int32sRef term, bool isValidUnicode) |
| { |
| if (!isValidUnicode) |
| { |
| return term.ToString(); |
| } |
| else if (inputMode == 0) |
| { |
| // utf8 |
| return ToBytesRef(term).Utf8ToString() + " " + term; |
| } |
| else |
| { |
| // utf32 |
| return UnicodeUtil.NewString(term.Int32s, term.Offset, term.Length) + " " + term; |
| } |
| } |
| |
| private static BytesRef ToBytesRef(Int32sRef ir) |
| { |
| BytesRef br = new BytesRef(ir.Length); |
| for (int i = 0; i < ir.Length; i++) |
| { |
| int x = ir.Int32s[ir.Offset + i]; |
| Debug.Assert(x >= 0 && x <= 255); |
| br.Bytes[i] = (byte)x; |
| } |
| br.Length = ir.Length; |
| return br; |
| } |
| |
| internal static string GetRandomString(Random random) |
| { |
| string term; |
| if (random.NextBoolean()) |
| { |
| term = TestUtil.RandomRealisticUnicodeString(random); |
| } |
| else |
| { |
| // we want to mix in limited-alphabet symbols so |
| // we get more sharing of the nodes given how few |
| // terms we are testing... |
| term = SimpleRandomString(random); |
| } |
| return term; |
| } |
| |
| internal static string SimpleRandomString(Random r) |
| { |
| int end = r.Next(10); |
| if (end == 0) |
| { |
| // allow 0 length |
| return ""; |
| } |
| char[] buffer = new char[end]; |
| for (int i = 0; i < end; i++) |
| { |
| buffer[i] = (char)TestUtil.NextInt32(r, 97, 102); |
| } |
| return new string(buffer, 0, end); |
| } |
| |
| internal static Int32sRef ToInt32sRef(string s, int inputMode) |
| { |
| return ToInt32sRef(s, inputMode, new Int32sRef(10)); |
| } |
| |
| internal static Int32sRef ToInt32sRef(string s, int inputMode, Int32sRef ir) |
| { |
| if (inputMode == 0) |
| { |
| // utf8 |
| return ToInt32sRef(new BytesRef(s), ir); |
| } |
| else |
| { |
| // utf32 |
| return ToInt32sRefUTF32(s, ir); |
| } |
| } |
| |
| internal static Int32sRef ToInt32sRefUTF32(string s, Int32sRef ir) |
| { |
| int charLength = s.Length; |
| int charIdx = 0; |
| int intIdx = 0; |
| while (charIdx < charLength) |
| { |
| if (intIdx == ir.Int32s.Length) |
| { |
| ir.Grow(intIdx + 1); |
| } |
| int utf32 = Character.CodePointAt(s, charIdx); |
| ir.Int32s[intIdx] = utf32; |
| charIdx += Character.CharCount(utf32); |
| intIdx++; |
| } |
| ir.Length = intIdx; |
| return ir; |
| } |
| |
| internal static Int32sRef ToInt32sRef(BytesRef br, Int32sRef ir) |
| { |
| if (br.Length > ir.Int32s.Length) |
| { |
| ir.Grow(br.Length); |
| } |
| for (int i = 0; i < br.Length; i++) |
| { |
| ir.Int32s[i] = br.Bytes[br.Offset + i] & 0xFF; |
| } |
| ir.Length = br.Length; |
| return ir; |
| } |
| |
| // LUCENENET specific - de-nested InputOutput<T> |
| |
| public virtual void DoTest(bool testPruning) |
| { |
| // no pruning |
| DoTest(0, 0, true); |
| |
| if (testPruning) |
| { |
| // simple pruning |
| DoTest(TestUtil.NextInt32(random, 1, 1 + pairs.Count), 0, true); |
| |
| // leafy pruning |
| DoTest(0, TestUtil.NextInt32(random, 1, 1 + pairs.Count), true); |
| } |
| } |
| |
| // runs the term, returning the output, or null if term |
| // isn't accepted. if prefixLength is non-null it must be |
| // length 1 int array; prefixLength[0] is set to the length |
| // of the term prefix that matches |
| private T Run(FST<T> fst, Int32sRef term, int[] prefixLength) |
| { |
| Debug.Assert(prefixLength == null || prefixLength.Length == 1); |
| FST.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>()); |
| T NO_OUTPUT = fst.Outputs.NoOutput; |
| T output = NO_OUTPUT; |
| FST.BytesReader fstReader = fst.GetBytesReader(); |
| |
| for (int i = 0; i <= term.Length; i++) |
| { |
| int label; |
| if (i == term.Length) |
| { |
| label = FST.END_LABEL; |
| } |
| else |
| { |
| label = term.Int32s[term.Offset + i]; |
| } |
| // System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.Outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); |
| if (fst.FindTargetArc(label, arc, arc, fstReader) == null) |
| { |
| // System.out.println(" not found"); |
| if (prefixLength != null) |
| { |
| prefixLength[0] = i; |
| return output; |
| } |
| else |
| { |
| return default(T); |
| } |
| } |
| output = fst.Outputs.Add(output, arc.Output); |
| } |
| |
| if (prefixLength != null) |
| { |
| prefixLength[0] = term.Length; |
| } |
| |
| return output; |
| } |
| |
| private T RandomAcceptedWord(FST<T> fst, Int32sRef @in) |
| { |
| FST.Arc<T> arc = fst.GetFirstArc(new FST.Arc<T>()); |
| |
| IList<FST.Arc<T>> arcs = new List<FST.Arc<T>>(); |
| @in.Length = 0; |
| @in.Offset = 0; |
| T NO_OUTPUT = fst.Outputs.NoOutput; |
| T output = NO_OUTPUT; |
| FST.BytesReader fstReader = fst.GetBytesReader(); |
| |
| while (true) |
| { |
| // read all arcs: |
| fst.ReadFirstTargetArc(arc, arc, fstReader); |
| arcs.Add((new FST.Arc<T>()).CopyFrom(arc)); |
| while (!arc.IsLast) |
| { |
| fst.ReadNextArc(arc, fstReader); |
| arcs.Add((new FST.Arc<T>()).CopyFrom(arc)); |
| } |
| |
| // pick one |
| arc = arcs[random.Next(arcs.Count)]; |
| arcs.Clear(); |
| |
| // accumulate output |
| output = fst.Outputs.Add(output, arc.Output); |
| |
| // append label |
| if (arc.Label == FST.END_LABEL) |
| { |
| break; |
| } |
| |
| if (@in.Int32s.Length == @in.Length) |
| { |
| @in.Grow(1 + @in.Length); |
| } |
| @in.Int32s[@in.Length++] = arc.Label; |
| } |
| |
| return output; |
| } |
| |
| internal virtual FST<T> DoTest(int prune1, int prune2, bool allowRandomSuffixSharing) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("\nTEST: prune1=" + prune1 + " prune2=" + prune2); |
| } |
| |
| bool willRewrite = random.NextBoolean(); |
| |
| Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, |
| prune1, prune2, |
| prune1 == 0 && prune2 == 0, |
| allowRandomSuffixSharing ? random.NextBoolean() : true, |
| allowRandomSuffixSharing ? TestUtil.NextInt32(random, 1, 10) : int.MaxValue, |
| outputs, |
| null, |
| willRewrite, |
| PackedInt32s.DEFAULT, |
| true, |
| 15); |
| if (LuceneTestCase.VERBOSE) |
| { |
| if (willRewrite) |
| { |
| Console.WriteLine("TEST: packed FST"); |
| } |
| else |
| { |
| Console.WriteLine("TEST: non-packed FST"); |
| } |
| } |
| |
| foreach (InputOutput<T> pair in pairs) |
| { |
| if (pair.Output is IEnumerable) |
| { |
| Builder<object> builderObject = builder as Builder<object>; |
| var values = pair.Output as IEnumerable; |
| foreach (object value in values) |
| { |
| builderObject.Add(pair.Input, value); |
| } |
| } |
| else |
| { |
| builder.Add(pair.Input, pair.Output); |
| } |
| } |
| FST<T> fst = builder.Finish(); |
| |
| if (random.NextBoolean() && fst != null && !willRewrite) |
| { |
| IOContext context = LuceneTestCase.NewIOContext(random); |
| using (IndexOutput @out = dir.CreateOutput("fst.bin", context)) |
| { |
| fst.Save(@out); |
| } |
| IndexInput @in = dir.OpenInput("fst.bin", context); |
| try |
| { |
| fst = new FST<T>(@in, outputs); |
| } |
| finally |
| { |
| @in.Dispose(); |
| dir.DeleteFile("fst.bin"); |
| } |
| } |
| |
| if (LuceneTestCase.VERBOSE && pairs.Count <= 20 && fst != null) |
| { |
| using (TextWriter w = new StreamWriter(new FileStream("out.dot", FileMode.OpenOrCreate), Encoding.UTF8)) |
| { |
| Util.ToDot(fst, w, false, false); |
| } |
| Console.WriteLine("SAVED out.dot"); |
| } |
| |
| if (LuceneTestCase.VERBOSE) |
| { |
| if (fst == null) |
| { |
| Console.WriteLine(" fst has 0 nodes (fully pruned)"); |
| } |
| else |
| { |
| Console.WriteLine(" fst has " + fst.NodeCount + " nodes and " + fst.ArcCount + " arcs"); |
| } |
| } |
| |
| if (prune1 == 0 && prune2 == 0) |
| { |
| VerifyUnPruned(inputMode, fst); |
| } |
| else |
| { |
| VerifyPruned(inputMode, fst, prune1, prune2); |
| } |
| |
| return fst; |
| } |
| |
| protected virtual bool OutputsEqual(T a, T b) |
| { |
| // LUCENENET: In .NET, IEnumerables do not automatically test to ensure |
| // their values are equal, so we need to do that manually. |
| // Note that we are testing the values without regard to whether |
| // the enumerable type is nullable. |
| return Collections.Equals(a, b); |
| } |
| |
| // FST is complete |
| private void VerifyUnPruned(int inputMode, FST<T> fst) |
| { |
| FST<long?> fstLong; |
| ISet<long?> validOutputs; |
| long minLong = long.MaxValue; |
| long maxLong = long.MinValue; |
| |
| if (doReverseLookup) |
| { |
| FST<long?> fstLong0 = fst as FST<long?>; |
| fstLong = fstLong0; |
| validOutputs = new HashSet<long?>(); |
| foreach (InputOutput<T> pair in pairs) |
| { |
| long? output = pair.Output as long?; |
| maxLong = Math.Max(maxLong, output.Value); |
| minLong = Math.Min(minLong, output.Value); |
| validOutputs.Add(output.Value); |
| } |
| } |
| else |
| { |
| fstLong = null; |
| validOutputs = null; |
| } |
| |
| if (pairs.Count == 0) |
| { |
| Assert.IsNull(fst); |
| return; |
| } |
| |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: now verify " + pairs.Count + " terms"); |
| foreach (InputOutput<T> pair in pairs) |
| { |
| Assert.IsNotNull(pair); |
| Assert.IsNotNull(pair.Input); |
| Assert.IsNotNull(pair.Output); |
| Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + outputs.OutputToString(pair.Output)); |
| } |
| } |
| |
| Assert.IsNotNull(fst); |
| |
| // visit valid pairs in order -- make sure all words |
| // are accepted, and FSTEnum's next() steps through |
| // them correctly |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: check valid terms/next()"); |
| } |
| { |
| Int32sRefFSTEnum<T> fstEnum = new Int32sRefFSTEnum<T>(fst); |
| foreach (InputOutput<T> pair in pairs) |
| { |
| Int32sRef term = pair.Input; |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: check term=" + InputToString(inputMode, term) + " output=" + fst.Outputs.OutputToString(pair.Output)); |
| } |
| T output = Run(fst, term, null); |
| Assert.IsNotNull(output, "term " + InputToString(inputMode, term) + " is not accepted"); |
| Assert.IsTrue(OutputsEqual(pair.Output, output)); |
| |
| // verify enum's next |
| Int32sRefFSTEnum.InputOutput<T> t = fstEnum.Next(); |
| Assert.IsNotNull(t); |
| Assert.AreEqual(term, t.Input, "expected input=" + InputToString(inputMode, term) + " but fstEnum returned " + InputToString(inputMode, t.Input)); |
| Assert.IsTrue(OutputsEqual(pair.Output, t.Output)); |
| } |
| Assert.IsNull(fstEnum.Next()); |
| } |
| |
| IDictionary<Int32sRef, T> termsMap = new Dictionary<Int32sRef, T>(); |
| foreach (InputOutput<T> pair in pairs) |
| { |
| termsMap[pair.Input] = pair.Output; |
| } |
| |
| if (doReverseLookup && maxLong > minLong) |
| { |
| // Do random lookups so we test null (output doesn't |
| // exist) case: |
| Assert.IsNull(Util.GetByOutput(fstLong, minLong - 7)); |
| Assert.IsNull(Util.GetByOutput(fstLong, maxLong + 7)); |
| |
| int num = LuceneTestCase.AtLeast(random, 100); |
| for (int iter = 0; iter < num; iter++) |
| { |
| long v = TestUtil.NextInt64(random, minLong, maxLong); |
| Int32sRef input = Util.GetByOutput(fstLong, v); |
| Assert.IsTrue(validOutputs.Contains(v) || input == null); |
| } |
| } |
| |
| // find random matching word and make sure it's valid |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: verify random accepted terms"); |
| } |
| Int32sRef scratch = new Int32sRef(10); |
| int num_ = LuceneTestCase.AtLeast(random, 500); |
| for (int iter = 0; iter < num_; iter++) |
| { |
| T output = RandomAcceptedWord(fst, scratch); |
| Assert.IsTrue(termsMap.ContainsKey(scratch), "accepted word " + InputToString(inputMode, scratch) + " is not valid"); |
| Assert.IsTrue(OutputsEqual(termsMap[scratch], output)); |
| |
| if (doReverseLookup) |
| { |
| //System.out.println("lookup output=" + output + " outs=" + fst.Outputs); |
| Int32sRef input = Util.GetByOutput(fstLong, (output as long?).Value); |
| Assert.IsNotNull(input); |
| //System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString()); |
| Assert.AreEqual(scratch, input); |
| } |
| } |
| |
| // test IntsRefFSTEnum.Seek: |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: verify seek"); |
| } |
| Int32sRefFSTEnum<T> fstEnum_ = new Int32sRefFSTEnum<T>(fst); |
| num_ = LuceneTestCase.AtLeast(random, 100); |
| for (int iter = 0; iter < num_; iter++) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" iter=" + iter); |
| } |
| if (random.NextBoolean()) |
| { |
| // seek to term that doesn't exist: |
| while (true) |
| { |
| Int32sRef term = ToInt32sRef(GetRandomString(random), inputMode); |
| int pos = pairs.BinarySearch(new InputOutput<T>(term, default(T))); |
| if (pos < 0) |
| { |
| pos = -(pos + 1); |
| // ok doesn't exist |
| //System.out.println(" seek " + inputToString(inputMode, term)); |
| Int32sRefFSTEnum.InputOutput<T> seekResult; |
| if (random.Next(3) == 0) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do non-exist seekExact term=" + InputToString(inputMode, term)); |
| } |
| seekResult = fstEnum_.SeekExact(term); |
| pos = -1; |
| } |
| else if (random.NextBoolean()) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do non-exist seekFloor term=" + InputToString(inputMode, term)); |
| } |
| seekResult = fstEnum_.SeekFloor(term); |
| pos--; |
| } |
| else |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do non-exist seekCeil term=" + InputToString(inputMode, term)); |
| } |
| seekResult = fstEnum_.SeekCeil(term); |
| } |
| |
| if (pos != -1 && pos < pairs.Count) |
| { |
| //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.Outputs.outputToString(seekResult.Output)); |
| Assert.IsNotNull(seekResult, "got null but expected term=" + InputToString(inputMode, pairs[pos].Input)); |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" got " + InputToString(inputMode, seekResult.Input)); |
| } |
| Assert.AreEqual(pairs[pos].Input, seekResult.Input, "expected " + InputToString(inputMode, pairs[pos].Input) + " but got " + InputToString(inputMode, seekResult.Input)); |
| Assert.IsTrue(OutputsEqual(pairs[pos].Output, seekResult.Output)); |
| } |
| else |
| { |
| // seeked before start or beyond end |
| //System.out.println("seek=" + seekTerm); |
| Assert.IsNull(seekResult, "expected null but got " + (seekResult == null ? "null" : InputToString(inputMode, seekResult.Input))); |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" got null"); |
| } |
| } |
| |
| break; |
| } |
| } |
| } |
| else |
| { |
| // seek to term that does exist: |
| InputOutput<T> pair = pairs[random.Next(pairs.Count)]; |
| Int32sRefFSTEnum.InputOutput<T> seekResult; |
| if (random.Next(3) == 2) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do exists seekExact term=" + InputToString(inputMode, pair.Input)); |
| } |
| seekResult = fstEnum_.SeekExact(pair.Input); |
| } |
| else if (random.NextBoolean()) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do exists seekFloor " + InputToString(inputMode, pair.Input)); |
| } |
| seekResult = fstEnum_.SeekFloor(pair.Input); |
| } |
| else |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do exists seekCeil " + InputToString(inputMode, pair.Input)); |
| } |
| seekResult = fstEnum_.SeekCeil(pair.Input); |
| } |
| Assert.IsNotNull(seekResult); |
| Assert.AreEqual(pair.Input, seekResult.Input, "got " + InputToString(inputMode, seekResult.Input) + " but expected " + InputToString(inputMode, pair.Input)); |
| Assert.IsTrue(OutputsEqual(pair.Output, seekResult.Output)); |
| } |
| } |
| |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: mixed next/seek"); |
| } |
| |
| // test mixed next/seek |
| num_ = LuceneTestCase.AtLeast(random, 100); |
| for (int iter = 0; iter < num_; iter++) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: iter " + iter); |
| } |
| // reset: |
| fstEnum_ = new Int32sRefFSTEnum<T>(fst); |
| int upto = -1; |
| while (true) |
| { |
| bool isDone = false; |
| if (upto == pairs.Count - 1 || random.NextBoolean()) |
| { |
| // next |
| upto++; |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do next"); |
| } |
| isDone = fstEnum_.Next() == null; |
| } |
| else if (upto != -1 && upto < 0.75 * pairs.Count && random.NextBoolean()) |
| { |
| int attempt = 0; |
| for (; attempt < 10; attempt++) |
| { |
| Int32sRef term = ToInt32sRef(GetRandomString(random), inputMode); |
| if (!termsMap.ContainsKey(term) && term.CompareTo(pairs[upto].Input) > 0) |
| { |
| int pos = pairs.BinarySearch(new InputOutput<T>(term, default(T))); |
| Debug.Assert(pos < 0); |
| upto = -(pos + 1); |
| |
| if (random.NextBoolean()) |
| { |
| upto--; |
| Assert.IsTrue(upto != -1); |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do non-exist seekFloor(" + InputToString(inputMode, term) + ")"); |
| } |
| isDone = fstEnum_.SeekFloor(term) == null; |
| } |
| else |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do non-exist seekCeil(" + InputToString(inputMode, term) + ")"); |
| } |
| isDone = fstEnum_.SeekCeil(term) == null; |
| } |
| |
| break; |
| } |
| } |
| if (attempt == 10) |
| { |
| continue; |
| } |
| } |
| else |
| { |
| int inc = random.Next(pairs.Count - upto - 1); |
| upto += inc; |
| if (upto == -1) |
| { |
| upto = 0; |
| } |
| |
| if (random.NextBoolean()) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do seekCeil(" + InputToString(inputMode, pairs[upto].Input) + ")"); |
| } |
| isDone = fstEnum_.SeekCeil(pairs[upto].Input) == null; |
| } |
| else |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" do seekFloor(" + InputToString(inputMode, pairs[upto].Input) + ")"); |
| } |
| isDone = fstEnum_.SeekFloor(pairs[upto].Input) == null; |
| } |
| } |
| if (LuceneTestCase.VERBOSE) |
| { |
| if (!isDone) |
| { |
| Console.WriteLine(" got " + InputToString(inputMode, fstEnum_.Current.Input)); |
| } |
| else |
| { |
| Console.WriteLine(" got null"); |
| } |
| } |
| |
| if (upto == pairs.Count) |
| { |
| Assert.IsTrue(isDone); |
| break; |
| } |
| else |
| { |
| Assert.IsFalse(isDone); |
| Assert.AreEqual(pairs[upto].Input, fstEnum_.Current.Input); |
| Assert.IsTrue(OutputsEqual(pairs[upto].Output, fstEnum_.Current.Output)); |
| |
| /* |
| if (upto < pairs.size()-1) { |
| int tryCount = 0; |
| while(tryCount < 10) { |
| final IntsRef t = toIntsRef(getRandomString(), inputMode); |
| if (pairs.get(upto).input.compareTo(t) < 0) { |
| final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0; |
| if (LuceneTestCase.VERBOSE) { |
| System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected); |
| } |
| Assert.AreEqual(expected, fstEnum.beforeNext(t)); |
| break; |
| } |
| tryCount++; |
| } |
| } |
| */ |
| } |
| } |
| } |
| } |
| |
| private class CountMinOutput<S> |
| { |
| internal int Count { get; set; } |
| internal S Output { get; set; } |
| internal S FinalOutput { get; set; } |
| internal bool IsLeaf { get; set; } = true; |
| internal bool IsFinal { get; set; } |
| } |
| |
| // FST is pruned |
| private void VerifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: now verify pruned " + pairs.Count + " terms; outputs=" + outputs); |
| foreach (InputOutput<T> pair in pairs) |
| { |
| Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + outputs.OutputToString(pair.Output)); |
| } |
| } |
| |
| // To validate the FST, we brute-force compute all prefixes |
| // in the terms, matched to their "common" outputs, prune that |
| // set according to the prune thresholds, then assert the FST |
| // matches that same set. |
| |
| // NOTE: Crazy RAM intensive!! |
| |
| //System.out.println("TEST: tally prefixes"); |
| |
| // build all prefixes |
| IDictionary<Int32sRef, CountMinOutput<T>> prefixes = new HashMap<Int32sRef, CountMinOutput<T>>(); |
| Int32sRef scratch = new Int32sRef(10); |
| foreach (InputOutput<T> pair in pairs) |
| { |
| scratch.CopyInt32s(pair.Input); |
| for (int idx = 0; idx <= pair.Input.Length; idx++) |
| { |
| scratch.Length = idx; |
| if (!prefixes.TryGetValue(scratch, out CountMinOutput<T> cmo) || cmo == null) |
| { |
| cmo = new CountMinOutput<T>(); |
| cmo.Count = 1; |
| cmo.Output = pair.Output; |
| prefixes[Int32sRef.DeepCopyOf(scratch)] = cmo; |
| } |
| else |
| { |
| cmo.Count++; |
| T output1 = cmo.Output; |
| if (output1.Equals(outputs.NoOutput)) |
| { |
| output1 = outputs.NoOutput; |
| } |
| T output2 = pair.Output; |
| if (output2.Equals(outputs.NoOutput)) |
| { |
| output2 = outputs.NoOutput; |
| } |
| cmo.Output = outputs.Common(output1, output2); |
| } |
| if (idx == pair.Input.Length) |
| { |
| cmo.IsFinal = true; |
| cmo.FinalOutput = cmo.Output; |
| } |
| } |
| } |
| |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: now prune"); |
| } |
| |
| |
| // prune 'em |
| // LUCENENET NOTE: Altered this a bit to go in reverse rather than use an enumerator since |
| // in .NET you cannot delete records while enumerating forward through a dictionary. |
| for (int i = prefixes.Count - 1; i >= 0; i--) |
| { |
| KeyValuePair<Int32sRef, CountMinOutput<T>> ent = prefixes.ElementAt(i); |
| Int32sRef prefix = ent.Key; |
| CountMinOutput<T> cmo = ent.Value; |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal); |
| } |
| bool keep; |
| if (prune1 > 0) |
| { |
| keep = cmo.Count >= prune1; |
| } |
| else |
| { |
| Debug.Assert(prune2 > 0); |
| if (prune2 > 1 && cmo.Count >= prune2) |
| { |
| keep = true; |
| } |
| else if (prefix.Length > 0) |
| { |
| // consult our parent |
| scratch.Length = prefix.Length - 1; |
| Array.Copy(prefix.Int32s, prefix.Offset, scratch.Int32s, 0, scratch.Length); |
| prefixes.TryGetValue(scratch, out CountMinOutput<T> cmo2); |
| //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); |
| keep = cmo2 != null && ((prune2 > 1 && cmo2.Count >= prune2) || (prune2 == 1 && (cmo2.Count >= 2 || prefix.Length <= 1))); |
| } |
| else if (cmo.Count >= prune2) |
| { |
| keep = true; |
| } |
| else |
| { |
| keep = false; |
| } |
| } |
| |
| if (!keep) |
| { |
| prefixes.Remove(prefix); |
| //System.out.println(" remove"); |
| } |
| else |
| { |
| // clear isLeaf for all ancestors |
| //System.out.println(" keep"); |
| scratch.CopyInt32s(prefix); |
| scratch.Length--; |
| while (scratch.Length >= 0) |
| { |
| if (prefixes.TryGetValue(scratch, out CountMinOutput<T> cmo2) && cmo2 != null) |
| { |
| //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); |
| cmo2.IsLeaf = false; |
| } |
| scratch.Length--; |
| } |
| } |
| } |
| |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: after prune"); |
| foreach (KeyValuePair<Int32sRef, CountMinOutput<T>> ent in prefixes) |
| { |
| Console.WriteLine(" " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal); |
| if (ent.Value.IsFinal) |
| { |
| Console.WriteLine(" finalOutput=" + outputs.OutputToString(ent.Value.FinalOutput)); |
| } |
| } |
| } |
| |
| if (prefixes.Count <= 1) |
| { |
| Assert.IsNull(fst); |
| return; |
| } |
| |
| Assert.IsNotNull(fst); |
| |
| // make sure FST only enums valid prefixes |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: check pruned enum"); |
| } |
| Int32sRefFSTEnum<T> fstEnum = new Int32sRefFSTEnum<T>(fst); |
| Int32sRefFSTEnum.InputOutput<T> current; |
| while ((current = fstEnum.Next()) != null) |
| { |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine(" fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + outputs.OutputToString(current.Output)); |
| } |
| prefixes.TryGetValue(current.Input, out CountMinOutput<T> cmo); |
| Assert.IsNotNull(cmo); |
| Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal); |
| //if (cmo.isFinal && !cmo.isLeaf) { |
| if (cmo.IsFinal) |
| { |
| Assert.AreEqual(cmo.FinalOutput, current.Output); |
| } |
| else |
| { |
| Assert.AreEqual(cmo.Output, current.Output); |
| } |
| } |
| |
| // make sure all non-pruned prefixes are present in the FST |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: verify all prefixes"); |
| } |
| int[] stopNode = new int[1]; |
| foreach (KeyValuePair<Int32sRef, CountMinOutput<T>> ent in prefixes) |
| { |
| if (ent.Key.Length > 0) |
| { |
| CountMinOutput<T> cmo = ent.Value; |
| T output = Run(fst, ent.Key, stopNode); |
| if (LuceneTestCase.VERBOSE) |
| { |
| Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + outputs.OutputToString(cmo.Output)); |
| } |
| // if (cmo.isFinal && !cmo.isLeaf) { |
| if (cmo.IsFinal) |
| { |
| Assert.AreEqual(cmo.FinalOutput, output); |
| } |
| else |
| { |
| Assert.AreEqual(cmo.Output, output); |
| } |
| Assert.AreEqual(ent.Key.Length, stopNode[0]); |
| } |
| } |
| } |
| } |
| } |