| using J2N; |
| using J2N.Numerics; |
| using J2N.Text; |
| using Lucene.Net.Diagnostics; |
| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using System.IO; |
| using System.Runtime.CompilerServices; |
| using BitSet = Lucene.Net.Util.OpenBitSet; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Util.Fst |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Static helper methods. |
| /// <para/> |
| /// @lucene.experimental |
| /// </summary> |
| public static class Util // LUCENENET specific - made static // LUCENENET TODO: Fix naming conflict with containing namespace |
| { |
| /// <summary> |
| /// Looks up the output for this input, or <c>null</c> if the |
| /// input is not accepted. |
| /// </summary> |
| public static T Get<T>(FST<T> fst, Int32sRef input) |
| { |
| // TODO: would be nice not to alloc this on every lookup |
| var arc = fst.GetFirstArc(new FST.Arc<T>()); |
| |
| var fstReader = fst.GetBytesReader(); |
| |
| // Accumulate output as we go |
| T output = fst.Outputs.NoOutput; |
| for (int i = 0; i < input.Length; i++) |
| { |
| if (fst.FindTargetArc(input.Int32s[input.Offset + i], arc, arc, fstReader) == null) |
| { |
| return default; |
| } |
| output = fst.Outputs.Add(output, arc.Output); |
| } |
| |
| if (arc.IsFinal) |
| { |
| return fst.Outputs.Add(output, arc.NextFinalOutput); |
| } |
| else |
| { |
| return default; |
| } |
| } |
| |
| // TODO: maybe a CharsRef version for BYTE2 |
| |
| /// <summary> |
| /// Looks up the output for this input, or <c>null</c> if the |
| /// input is not accepted |
| /// </summary> |
| public static T Get<T>(FST<T> fst, BytesRef input) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(fst.InputType == FST.INPUT_TYPE.BYTE1); |
| |
| var fstReader = fst.GetBytesReader(); |
| |
| // TODO: would be nice not to alloc this on every lookup |
| var arc = fst.GetFirstArc(new FST.Arc<T>()); |
| |
| // Accumulate output as we go |
| T output = fst.Outputs.NoOutput; |
| for (int i = 0; i < input.Length; i++) |
| { |
| if (fst.FindTargetArc(input.Bytes[i + input.Offset] & 0xFF, arc, arc, fstReader) == null) |
| { |
| return default; |
| } |
| output = fst.Outputs.Add(output, arc.Output); |
| } |
| |
| if (arc.IsFinal) |
| { |
| return fst.Outputs.Add(output, arc.NextFinalOutput); |
| } |
| else |
| { |
| return default; |
| } |
| } |
| |
| /// <summary> |
| /// Reverse lookup (lookup by output instead of by input), |
| /// in the special case when your FSTs outputs are |
| /// strictly ascending. This locates the input/output |
| /// pair where the output is equal to the target, and will |
| /// return <c>null</c> if that output does not exist. |
| /// |
| /// <para/>NOTE: this only works with <see cref="T:FST{long?}"/>, only |
| /// works when the outputs are ascending in order with |
| /// the inputs. |
| /// For example, simple ordinals (0, 1, |
| /// 2, ...), or file offets (when appending to a file) |
| /// fit this. |
| /// </summary> |
| public static Int32sRef GetByOutput(FST<long?> fst, long targetOutput) |
| { |
| var @in = fst.GetBytesReader(); |
| |
| // TODO: would be nice not to alloc this on every lookup |
| FST.Arc<long?> arc = fst.GetFirstArc(new FST.Arc<long?>()); |
| |
| FST.Arc<long?> scratchArc = new FST.Arc<long?>(); |
| |
| Int32sRef result = new Int32sRef(); |
| |
| return GetByOutput(fst, targetOutput, @in, arc, scratchArc, result); |
| } |
| |
| /// <summary> |
| /// Expert: like <see cref="Util.GetByOutput(FST{long?}, long)"/> except reusing |
| /// <see cref="FST.BytesReader"/>, initial and scratch Arc, and result. |
| /// </summary> |
| public static Int32sRef GetByOutput(FST<long?> fst, long targetOutput, FST.BytesReader @in, FST.Arc<long?> arc, FST.Arc<long?> scratchArc, Int32sRef result) |
| { |
| long output = arc.Output.Value; |
| int upto = 0; |
| |
| //System.out.println("reverseLookup output=" + targetOutput); |
| |
| while (true) |
| { |
| //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc); |
| if (arc.IsFinal) |
| { |
| long finalOutput = output + arc.NextFinalOutput.Value; |
| //System.out.println(" isFinal finalOutput=" + finalOutput); |
| if (finalOutput == targetOutput) |
| { |
| result.Length = upto; |
| //System.out.println(" found!"); |
| return result; |
| } |
| else if (finalOutput > targetOutput) |
| { |
| //System.out.println(" not found!"); |
| return null; |
| } |
| } |
| |
| if (FST<long?>.TargetHasArcs(arc)) |
| { |
| //System.out.println(" targetHasArcs"); |
| if (result.Int32s.Length == upto) |
| { |
| result.Grow(1 + upto); |
| } |
| |
| fst.ReadFirstRealTargetArc(arc.Target, arc, @in); |
| |
| if (arc.BytesPerArc != 0) |
| { |
| int low = 0; |
| int high = arc.NumArcs - 1; |
| int mid = 0; |
| //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output); |
| bool exact = false; |
| while (low <= high) |
| { |
| mid = (low + high).TripleShift(1); |
| @in.Position = arc.PosArcsStart; |
| @in.SkipBytes(arc.BytesPerArc * mid); |
| var flags = (sbyte)@in.ReadByte(); |
| fst.ReadLabel(@in); |
| long minArcOutput; |
| if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0) |
| { |
| long arcOutput = fst.Outputs.Read(@in).Value; |
| minArcOutput = output + arcOutput; |
| } |
| else |
| { |
| minArcOutput = output; |
| } |
| if (minArcOutput == targetOutput) |
| { |
| exact = true; |
| break; |
| } |
| else if (minArcOutput < targetOutput) |
| { |
| low = mid + 1; |
| } |
| else |
| { |
| high = mid - 1; |
| } |
| } |
| |
| if (high == -1) |
| { |
| return null; |
| } |
| else if (exact) |
| { |
| arc.ArcIdx = mid - 1; |
| } |
| else |
| { |
| arc.ArcIdx = low - 2; |
| } |
| |
| fst.ReadNextRealArc(arc, @in); |
| result.Int32s[upto++] = arc.Label; |
| output += arc.Output.Value; |
| } |
| else |
| { |
| FST.Arc<long?> prevArc = null; |
| |
| while (true) |
| { |
| //System.out.println(" cycle label=" + arc.label + " output=" + arc.output); |
| |
| // this is the min output we'd hit if we follow |
| // this arc: |
| long minArcOutput = output + arc.Output.Value; |
| |
| if (minArcOutput == targetOutput) |
| { |
| // Recurse on this arc: |
| //System.out.println(" match! break"); |
| output = minArcOutput; |
| result.Int32s[upto++] = arc.Label; |
| break; |
| } |
| else if (minArcOutput > targetOutput) |
| { |
| if (prevArc == null) |
| { |
| // Output doesn't exist |
| return null; |
| } |
| else |
| { |
| // Recurse on previous arc: |
| arc.CopyFrom(prevArc); |
| result.Int32s[upto++] = arc.Label; |
| output += arc.Output.Value; |
| //System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output); |
| break; |
| } |
| } |
| else if (arc.IsLast) |
| { |
| // Recurse on this arc: |
| output = minArcOutput; |
| //System.out.println(" recurse last label=" + (char) arc.label + " output=" + output); |
| result.Int32s[upto++] = arc.Label; |
| break; |
| } |
| else |
| { |
| // Read next arc in this node: |
| prevArc = scratchArc; |
| prevArc.CopyFrom(arc); |
| //System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label); |
| fst.ReadNextRealArc(arc, @in); |
| } |
| } |
| } |
| } |
| else |
| { |
| //System.out.println(" no target arcs; not found!"); |
| return null; |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Represents a path in TopNSearcher. |
| /// <para/> |
| /// @lucene.experimental |
| /// </summary> |
| public class FSTPath<T> |
| { |
| public FST.Arc<T> Arc { get; set; } |
| public T Cost { get; set; } |
| public Int32sRef Input { get; private set; } |
| |
| /// <summary> |
| /// Sole constructor </summary> |
| public FSTPath(T cost, FST.Arc<T> arc, Int32sRef input) |
| { |
| this.Arc = (new FST.Arc<T>()).CopyFrom(arc); |
| this.Cost = cost; |
| this.Input = input; |
| } |
| |
| public override string ToString() |
| { |
| return "input=" + Input + " cost=" + Cost; |
| } |
| } |
| |
| /// <summary> |
| /// Compares first by the provided comparer, and then |
| /// tie breaks by <see cref="FSTPath{T}.Input"/>. |
| /// </summary> |
| private class TieBreakByInputComparer<T> : IComparer<FSTPath<T>> |
| { |
| internal readonly IComparer<T> comparer; |
| |
| public TieBreakByInputComparer(IComparer<T> comparer) |
| { |
| this.comparer = comparer; |
| } |
| |
| public virtual int Compare(FSTPath<T> a, FSTPath<T> b) |
| { |
| int cmp = comparer.Compare(a.Cost, b.Cost); |
| if (cmp == 0) |
| { |
| return a.Input.CompareTo(b.Input); |
| } |
| else |
| { |
| return cmp; |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Utility class to find top N shortest paths from start |
| /// point(s). |
| /// </summary> |
| public class TopNSearcher<T> |
| { |
| private readonly FST<T> fst; |
| private readonly FST.BytesReader bytesReader; |
| private readonly int topN; |
| private readonly int maxQueueDepth; |
| |
| private readonly FST.Arc<T> scratchArc = new FST.Arc<T>(); |
| |
| internal readonly IComparer<T> comparer; |
| |
| internal JCG.SortedSet<FSTPath<T>> queue = null; |
| |
| private readonly object syncLock = new object(); |
| |
| /// <summary> |
| /// Creates an unbounded TopNSearcher </summary> |
| /// <param name="fst"> the <see cref="Lucene.Net.Util.Fst.FST{T}"/> to search on </param> |
| /// <param name="topN"> the number of top scoring entries to retrieve </param> |
| /// <param name="maxQueueDepth"> the maximum size of the queue of possible top entries </param> |
| /// <param name="comparer"> the comparer to select the top N </param> |
| public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, IComparer<T> comparer) |
| { |
| this.fst = fst; |
| this.bytesReader = fst.GetBytesReader(); |
| this.topN = topN; |
| this.maxQueueDepth = maxQueueDepth; |
| this.comparer = comparer; |
| |
| queue = new JCG.SortedSet<FSTPath<T>>(new TieBreakByInputComparer<T>(comparer)); |
| } |
| |
| /// <summary> |
| /// If back plus this arc is competitive then add to queue: |
| /// </summary> |
| protected virtual void AddIfCompetitive(FSTPath<T> path) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(queue != null); |
| |
| T cost = fst.Outputs.Add(path.Cost, path.Arc.Output); |
| //System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label); |
| |
| if (queue.Count == maxQueueDepth) |
| { |
| FSTPath<T> bottom = queue.Max; |
| int comp = comparer.Compare(cost, bottom.Cost); |
| if (comp > 0) |
| { |
| // Doesn't compete |
| return; |
| } |
| else if (comp == 0) |
| { |
| // Tie break by alpha sort on the input: |
| path.Input.Grow(path.Input.Length + 1); |
| path.Input.Int32s[path.Input.Length++] = path.Arc.Label; |
| int cmp = bottom.Input.CompareTo(path.Input); |
| path.Input.Length--; |
| |
| // We should never see dups: |
| if (Debugging.AssertsEnabled) Debugging.Assert(cmp != 0); |
| |
| if (cmp < 0) |
| { |
| // Doesn't compete |
| return; |
| } |
| } |
| // Competes |
| } |
| else |
| { |
| // Queue isn't full yet, so any path we hit competes: |
| } |
| |
| // copy over the current input to the new input |
| // and add the arc.label to the end |
| Int32sRef newInput = new Int32sRef(path.Input.Length + 1); |
| Array.Copy(path.Input.Int32s, 0, newInput.Int32s, 0, path.Input.Length); |
| newInput.Int32s[path.Input.Length] = path.Arc.Label; |
| newInput.Length = path.Input.Length + 1; |
| FSTPath<T> newPath = new FSTPath<T>(cost, path.Arc, newInput); |
| |
| queue.Add(newPath); |
| |
| if (queue.Count == maxQueueDepth + 1) |
| { |
| // LUCENENET NOTE: SortedSet doesn't have atomic operations, |
| // so we need to add some thread safety just in case. |
| // Perhaps it might make sense to wrap SortedSet into a type |
| // that provides thread safety. |
| lock (syncLock) |
| { |
| queue.Remove(queue.Max); |
| } |
| } |
| } |
| |
| /// <summary> |
| /// Adds all leaving arcs, including 'finished' arc, if |
| /// the node is final, from this node into the queue. |
| /// </summary> |
| public virtual void AddStartPaths(FST.Arc<T> node, T startOutput, bool allowEmptyString, Int32sRef input) |
| { |
| // De-dup NO_OUTPUT since it must be a singleton: |
| if (startOutput.Equals(fst.Outputs.NoOutput)) |
| { |
| startOutput = fst.Outputs.NoOutput; |
| } |
| |
| FSTPath<T> path = new FSTPath<T>(startOutput, node, input); |
| fst.ReadFirstTargetArc(node, path.Arc, bytesReader); |
| |
| //System.out.println("add start paths"); |
| |
| // Bootstrap: find the min starting arc |
| while (true) |
| { |
| if (allowEmptyString || path.Arc.Label != FST.END_LABEL) |
| { |
| AddIfCompetitive(path); |
| } |
| if (path.Arc.IsLast) |
| { |
| break; |
| } |
| fst.ReadNextArc(path.Arc, bytesReader); |
| } |
| } |
| |
| public virtual TopResults<T> Search() |
| { |
| IList<Result<T>> results = new JCG.List<Result<T>>(); |
| |
| //System.out.println("search topN=" + topN); |
| |
| var fstReader = fst.GetBytesReader(); |
| T NO_OUTPUT = fst.Outputs.NoOutput; |
| |
| // TODO: we could enable FST to sorting arcs by weight |
| // as it freezes... can easily do this on first pass |
| // (w/o requiring rewrite) |
| |
| // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!? |
| // (nibbles) |
| int rejectCount = 0; |
| |
| // For each top N path: |
| while (results.Count < topN) |
| { |
| //System.out.println("\nfind next path: queue.size=" + queue.size()); |
| |
| FSTPath<T> path; |
| |
| if (queue == null) |
| { |
| // Ran out of paths |
| //System.out.println(" break queue=null"); |
| break; |
| } |
| |
| // Remove top path since we are now going to |
| // pursue it: |
| |
| // LUCENENET NOTE: SortedSet doesn't have atomic operations, |
| // so we need to add some thread safety just in case. |
| // Perhaps it might make sense to wrap SortedSet into a type |
| // that provides thread safety. |
| lock (syncLock) |
| { |
| path = queue.Min; |
| if (path != null) |
| { |
| queue.Remove(path); |
| } |
| } |
| |
| if (path == null) |
| { |
| // There were less than topN paths available: |
| //System.out.println(" break no more paths"); |
| break; |
| } |
| |
| if (path.Arc.Label == FST.END_LABEL) |
| { |
| //System.out.println(" empty string! cost=" + path.cost); |
| // Empty string! |
| path.Input.Length--; |
| results.Add(new Result<T>(path.Input, path.Cost)); |
| continue; |
| } |
| |
| if (results.Count == topN - 1 && maxQueueDepth == topN) |
| { |
| // Last path -- don't bother w/ queue anymore: |
| queue = null; |
| } |
| |
| //System.out.println(" path: " + path); |
| |
| // We take path and find its "0 output completion", |
| // ie, just keep traversing the first arc with |
| // NO_OUTPUT that we can find, since this must lead |
| // to the minimum path that completes from |
| // path.arc. |
| |
| // For each input letter: |
| while (true) |
| { |
| //System.out.println("\n cycle path: " + path); |
| fst.ReadFirstTargetArc(path.Arc, path.Arc, fstReader); |
| |
| // For each arc leaving this node: |
| bool foundZero = false; |
| while (true) |
| { |
| //System.out.println(" arc=" + (char) path.arc.label + " cost=" + path.arc.output); |
| // tricky: instead of comparing output == 0, we must |
| // express it via the comparer compare(output, 0) == 0 |
| if (comparer.Compare(NO_OUTPUT, path.Arc.Output) == 0) |
| { |
| if (queue == null) |
| { |
| foundZero = true; |
| break; |
| } |
| else if (!foundZero) |
| { |
| scratchArc.CopyFrom(path.Arc); |
| foundZero = true; |
| } |
| else |
| { |
| AddIfCompetitive(path); |
| } |
| } |
| else if (queue != null) |
| { |
| AddIfCompetitive(path); |
| } |
| if (path.Arc.IsLast) |
| { |
| break; |
| } |
| fst.ReadNextArc(path.Arc, fstReader); |
| } |
| |
| if (Debugging.AssertsEnabled) Debugging.Assert(foundZero); |
| |
| if (queue != null) |
| { |
| // TODO: maybe we can save this copyFrom if we |
| // are more clever above... eg on finding the |
| // first NO_OUTPUT arc we'd switch to using |
| // scratchArc |
| path.Arc.CopyFrom(scratchArc); |
| } |
| |
| if (path.Arc.Label == FST.END_LABEL) |
| { |
| // Add final output: |
| //Debug.WriteLine(" done!: " + path); |
| T finalOutput = fst.Outputs.Add(path.Cost, path.Arc.Output); |
| if (AcceptResult(path.Input, finalOutput)) |
| { |
| //Debug.WriteLine(" add result: " + path); |
| results.Add(new Result<T>(path.Input, finalOutput)); |
| } |
| else |
| { |
| rejectCount++; |
| } |
| break; |
| } |
| else |
| { |
| path.Input.Grow(1 + path.Input.Length); |
| path.Input.Int32s[path.Input.Length] = path.Arc.Label; |
| path.Input.Length++; |
| path.Cost = fst.Outputs.Add(path.Cost, path.Arc.Output); |
| } |
| } |
| } |
| return new TopResults<T>(rejectCount + topN <= maxQueueDepth, results); |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| protected virtual bool AcceptResult(Int32sRef input, T output) |
| { |
| return true; |
| } |
| } |
| |
| /// <summary> |
| /// Holds a single input (<see cref="Int32sRef"/>) + output, returned by |
| /// <see cref="ShortestPaths"/>. |
| /// </summary> |
| public sealed class Result<T> |
| { |
| public Int32sRef Input { get; private set; } |
| public T Output { get; private set; } |
| |
| public Result(Int32sRef input, T output) |
| { |
| this.Input = input; |
| this.Output = output; |
| } |
| } |
| |
| /// <summary> |
| /// Holds the results for a top N search using <see cref="TopNSearcher{T}"/> |
| /// </summary> |
| public sealed class TopResults<T> : IEnumerable<Result<T>> |
| { |
| /// <summary> |
| /// <c>true</c> iff this is a complete result ie. if |
| /// the specified queue size was large enough to find the complete list of results. this might |
| /// be <c>false</c> if the <see cref="TopNSearcher{T}"/> rejected too many results. |
| /// </summary> |
| public bool IsComplete { get; private set; } |
| |
| /// <summary> |
| /// The top results |
| /// </summary> |
| public IList<Result<T>> TopN { get; private set; } |
| |
| internal TopResults(bool isComplete, IList<Result<T>> topN) |
| { |
| this.TopN = topN; |
| this.IsComplete = isComplete; |
| } |
| |
| public IEnumerator<Result<T>> GetEnumerator() |
| { |
| return TopN.GetEnumerator(); |
| } |
| |
| System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() |
| { |
| return GetEnumerator(); |
| } |
| } |
| |
| /// <summary> |
| /// Starting from node, find the top N min cost |
| /// completions to a final node. |
| /// </summary> |
| public static TopResults<T> ShortestPaths<T>(FST<T> fst, FST.Arc<T> fromNode, T startOutput, IComparer<T> comparer, int topN, bool allowEmptyString) |
| { |
| // All paths are kept, so we can pass topN for |
| // maxQueueDepth and the pruning is admissible: |
| TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparer); |
| |
| // since this search is initialized with a single start node |
| // it is okay to start with an empty input path here |
| searcher.AddStartPaths(fromNode, startOutput, allowEmptyString, new Int32sRef()); |
| return searcher.Search(); |
| } |
| |
| /// <summary> |
| /// Dumps an <see cref="FST{T}"/> to a GraphViz's <c>dot</c> language description |
| /// for visualization. Example of use: |
| /// |
| /// <code> |
| /// using (TextWriter sw = new StreamWriter("out.dot")) |
| /// { |
| /// Util.ToDot(fst, sw, true, true); |
| /// } |
| /// </code> |
| /// |
| /// and then, from command line: |
| /// |
| /// <code> |
| /// dot -Tpng -o out.png out.dot |
| /// </code> |
| /// |
| /// <para/> |
| /// Note: larger FSTs (a few thousand nodes) won't even |
| /// render, don't bother. If the FST is > 2.1 GB in size |
| /// then this method will throw strange exceptions. |
| /// <para/> |
| /// See also <a href="http://www.graphviz.org/">http://www.graphviz.org/</a>. |
| /// </summary> |
| /// <param name="sameRank"> |
| /// If <c>true</c>, the resulting <c>dot</c> file will try |
| /// to order states in layers of breadth-first traversal. This may |
| /// mess up arcs, but makes the output FST's structure a bit clearer. |
| /// </param> |
| /// <param name="labelStates"> |
| /// If <c>true</c> states will have labels equal to their offsets in their |
| /// binary format. Expands the graph considerably. |
| /// </param> |
| public static void ToDot<T>(FST<T> fst, TextWriter @out, bool sameRank, bool labelStates) |
| { |
| const string expandedNodeColor = "blue"; |
| |
| // this is the start arc in the automaton (from the epsilon state to the first state |
| // with outgoing transitions. |
| FST.Arc<T> startArc = fst.GetFirstArc(new FST.Arc<T>()); |
| |
| // A queue of transitions to consider for the next level. |
| IList<FST.Arc<T>> thisLevelQueue = new JCG.List<FST.Arc<T>>(); |
| |
| // A queue of transitions to consider when processing the next level. |
| IList<FST.Arc<T>> nextLevelQueue = new JCG.List<FST.Arc<T>> |
| { |
| startArc |
| }; |
| //System.out.println("toDot: startArc: " + startArc); |
| |
| // A list of states on the same level (for ranking). |
| IList<int?> sameLevelStates = new JCG.List<int?>(); |
| |
| // A bitset of already seen states (target offset). |
| BitSet seen = new BitSet(); |
| seen.Set((int)startArc.Target); |
| |
| // Shape for states. |
| const string stateShape = "circle"; |
| const string finalStateShape = "doublecircle"; |
| |
| // Emit DOT prologue. |
| @out.Write("digraph FST {\n"); |
| @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); |
| |
| if (!labelStates) |
| { |
| @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); |
| } |
| |
| EmitDotState(@out, "initial", "point", "white", ""); |
| |
| T NO_OUTPUT = fst.Outputs.NoOutput; |
| var r = fst.GetBytesReader(); |
| |
| // final FST.Arc<T> scratchArc = new FST.Arc<>(); |
| |
| { |
| string stateColor; |
| if (fst.IsExpandedTarget(startArc, r)) |
| { |
| stateColor = expandedNodeColor; |
| } |
| else |
| { |
| stateColor = null; |
| } |
| |
| bool isFinal; |
| T finalOutput; |
| if (startArc.IsFinal) |
| { |
| isFinal = true; |
| finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default : startArc.NextFinalOutput; |
| } |
| else |
| { |
| isFinal = false; |
| finalOutput = default; |
| } |
| |
| EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput)); |
| } |
| |
| @out.Write(" initial -> " + startArc.Target + "\n"); |
| |
| int level = 0; |
| |
| while (nextLevelQueue.Count > 0) |
| { |
| // we could double buffer here, but it doesn't matter probably. |
| //System.out.println("next level=" + level); |
| thisLevelQueue.AddRange(nextLevelQueue); |
| nextLevelQueue.Clear(); |
| |
| level++; |
| @out.Write("\n // Transitions and states at level: " + level + "\n"); |
| while (thisLevelQueue.Count > 0) |
| { |
| FST.Arc<T> arc = thisLevelQueue[thisLevelQueue.Count - 1]; |
| thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1); |
| //System.out.println(" pop: " + arc); |
| if (FST<T>.TargetHasArcs(arc)) |
| { |
| // scan all target arcs |
| //System.out.println(" readFirstTarget..."); |
| |
| long node = arc.Target; |
| |
| fst.ReadFirstRealTargetArc(arc.Target, arc, r); |
| |
| //System.out.println(" firstTarget: " + arc); |
| |
| while (true) |
| { |
| //System.out.println(" cycle arc=" + arc); |
| // Emit the unseen state and add it to the queue for the next level. |
| if (arc.Target >= 0 && !seen.Get((int)arc.Target)) |
| { |
| /* |
| boolean isFinal = false; |
| T finalOutput = null; |
| fst.readFirstTargetArc(arc, scratchArc); |
| if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { |
| // target is final |
| isFinal = true; |
| finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output; |
| System.out.println("dot hit final label=" + (char) scratchArc.label); |
| } |
| */ |
| string stateColor; |
| if (fst.IsExpandedTarget(arc, r)) |
| { |
| stateColor = expandedNodeColor; |
| } |
| else |
| { |
| stateColor = null; |
| } |
| |
| string finalOutput; |
| if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT)) |
| { |
| finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput); |
| } |
| else |
| { |
| finalOutput = ""; |
| } |
| |
| EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput); |
| // To see the node address, use this instead: |
| //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); |
| seen.Set((int)arc.Target); |
| nextLevelQueue.Add((new FST.Arc<T>()).CopyFrom(arc)); |
| sameLevelStates.Add((int)arc.Target); |
| } |
| |
| string outs; |
| if (!arc.Output.Equals(NO_OUTPUT)) |
| { |
| outs = "/" + fst.Outputs.OutputToString(arc.Output); |
| } |
| else |
| { |
| outs = ""; |
| } |
| |
| if (!FST<T>.TargetHasArcs(arc) && arc.IsFinal && !arc.NextFinalOutput.Equals(NO_OUTPUT)) |
| { |
| // Tricky special case: sometimes, due to |
| // pruning, the builder can [sillily] produce |
| // an FST with an arc into the final end state |
| // (-1) but also with a next final output; in |
| // this case we pull that output up onto this |
| // arc |
| outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]"; |
| } |
| |
| string arcColor; |
| if (arc.Flag(FST.BIT_TARGET_NEXT)) |
| { |
| arcColor = "red"; |
| } |
| else |
| { |
| arcColor = "black"; |
| } |
| |
| if (Debugging.AssertsEnabled) Debugging.Assert(arc.Label != FST.END_LABEL); |
| @out.Write(" " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.IsFinal ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n"); |
| |
| // Break the loop if we're on the last arc of this state. |
| if (arc.IsLast) |
| { |
| //System.out.println(" break"); |
| break; |
| } |
| fst.ReadNextRealArc(arc, r); |
| } |
| } |
| } |
| |
| // Emit state ranking information. |
| if (sameRank && sameLevelStates.Count > 1) |
| { |
| @out.Write(" {rank=same; "); |
| foreach (int state in sameLevelStates) |
| { |
| @out.Write(state + "; "); |
| } |
| @out.Write(" }\n"); |
| } |
| sameLevelStates.Clear(); |
| } |
| |
| // Emit terminating state (always there anyway). |
| @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n"); |
| @out.Write(" {rank=sink; -1 }\n"); |
| |
| @out.Write("}\n"); |
| @out.Flush(); |
| } |
| |
| /// <summary> |
| /// Emit a single state in the <c>dot</c> language. |
| /// </summary> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| private static void EmitDotState(TextWriter @out, string name, string shape, string color, string label) |
| { |
| @out.Write(" " + name |
| + " [" |
| + (shape != null ? "shape=" + shape : "") + " " |
| + (color != null ? "color=" + color : "") + " " |
| + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " " |
| + "]\n"); |
| } |
| |
| /// <summary> |
| /// Ensures an arc's label is indeed printable (dot uses US-ASCII). |
| /// </summary> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| private static string PrintableLabel(int label) |
| { |
| // Any ordinary ascii character, except for " or \, are |
| // printed as the character; else, as a hex string: |
| if (label >= 0x20 && label <= 0x7d && label != 0x22 && label != 0x5c) // " OR \ |
| { |
| return char.ToString((char)label); |
| } |
| return "0x" + label.ToString("x", CultureInfo.InvariantCulture); |
| } |
| |
| /// <summary> |
| /// Just maps each UTF16 unit (char) to the <see cref="int"/>s in an |
| /// <see cref="Int32sRef"/>. |
| /// </summary> |
| public static Int32sRef ToUTF16(string s, Int32sRef scratch) |
| { |
| int charLimit = s.Length; |
| scratch.Offset = 0; |
| scratch.Length = charLimit; |
| scratch.Grow(charLimit); |
| for (int idx = 0; idx < charLimit; idx++) |
| { |
| scratch.Int32s[idx] = (int)s[idx]; |
| } |
| return scratch; |
| } |
| |
| /// <summary> |
| /// Decodes the Unicode codepoints from the provided |
| /// <see cref="ICharSequence"/> and places them in the provided scratch |
| /// <see cref="Int32sRef"/>, which must not be <c>null</c>, returning it. |
| /// </summary> |
| public static Int32sRef ToUTF32(string s, Int32sRef scratch) |
| { |
| int charIdx = 0; |
| int intIdx = 0; |
| int charLimit = s.Length; |
| while (charIdx < charLimit) |
| { |
| scratch.Grow(intIdx + 1); |
| int utf32 = Character.CodePointAt(s, charIdx); |
| scratch.Int32s[intIdx] = utf32; |
| charIdx += Character.CharCount(utf32); |
| intIdx++; |
| } |
| scratch.Length = intIdx; |
| return scratch; |
| } |
| |
| /// <summary> |
| /// Decodes the Unicode codepoints from the provided |
| /// <see cref="T:char[]"/> and places them in the provided scratch |
| /// <see cref="Int32sRef"/>, which must not be <c>null</c>, returning it. |
| /// </summary> |
| public static Int32sRef ToUTF32(char[] s, int offset, int length, Int32sRef scratch) |
| { |
| int charIdx = offset; |
| int intIdx = 0; |
| int charLimit = offset + length; |
| while (charIdx < charLimit) |
| { |
| scratch.Grow(intIdx + 1); |
| int utf32 = Character.CodePointAt(s, charIdx, charLimit); |
| scratch.Int32s[intIdx] = utf32; |
| charIdx += Character.CharCount(utf32); |
| intIdx++; |
| } |
| scratch.Length = intIdx; |
| return scratch; |
| } |
| |
| /// <summary> |
| /// Just takes unsigned byte values from the <see cref="BytesRef"/> and |
| /// converts into an <see cref="Int32sRef"/>. |
| /// <para/> |
| /// NOTE: This was toIntsRef() in Lucene |
| /// </summary> |
| public static Int32sRef ToInt32sRef(BytesRef input, Int32sRef scratch) |
| { |
| scratch.Grow(input.Length); |
| for (int i = 0; i < input.Length; i++) |
| { |
| scratch.Int32s[i] = input.Bytes[i + input.Offset] & 0xFF; |
| } |
| scratch.Length = input.Length; |
| return scratch; |
| } |
| |
| /// <summary> |
| /// Just converts <see cref="Int32sRef"/> to <see cref="BytesRef"/>; you must ensure the |
| /// <see cref="int"/> values fit into a <see cref="byte"/>. |
| /// </summary> |
| public static BytesRef ToBytesRef(Int32sRef input, BytesRef scratch) |
| { |
| scratch.Grow(input.Length); |
| for (int i = 0; i < input.Length; i++) |
| { |
| int value = input.Int32s[i + input.Offset]; |
| // NOTE: we allow -128 to 255 |
| if (Debugging.AssertsEnabled) Debugging.Assert(value >= sbyte.MinValue && value <= 255, "value {0} doesn't fit into byte", value); |
| scratch.Bytes[i] = (byte)value; |
| } |
| scratch.Length = input.Length; |
| return scratch; |
| } |
| |
| // Uncomment for debugging: |
| |
| /* |
| public static <T> void dotToFile(FST<T> fst, String filePath) throws IOException { |
| Writer w = new OutputStreamWriter(new FileOutputStream(filePath)); |
| toDot(fst, w, true, true); |
| w.Dispose(); |
| } |
| */ |
| |
| /// <summary> |
| /// Reads the first arc greater or equal that the given label into the provided |
| /// arc in place and returns it iff found, otherwise return <c>null</c>. |
| /// </summary> |
| /// <param name="label"> the label to ceil on </param> |
| /// <param name="fst"> the fst to operate on </param> |
| /// <param name="follow"> the arc to follow reading the label from </param> |
| /// <param name="arc"> the arc to read into in place </param> |
| /// <param name="in"> the fst's <see cref="FST.BytesReader"/> </param> |
| public static FST.Arc<T> ReadCeilArc<T>(int label, FST<T> fst, FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader @in) |
| { |
| // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum? |
| if (label == FST.END_LABEL) |
| { |
| if (follow.IsFinal) |
| { |
| if (follow.Target <= 0) |
| { |
| arc.Flags = (sbyte)FST.BIT_LAST_ARC; |
| } |
| else |
| { |
| arc.Flags = 0; |
| // NOTE: nextArc is a node (not an address!) in this case: |
| arc.NextArc = follow.Target; |
| arc.Node = follow.Target; |
| } |
| arc.Output = follow.NextFinalOutput; |
| arc.Label = FST.END_LABEL; |
| return arc; |
| } |
| else |
| { |
| return null; |
| } |
| } |
| |
| if (!FST<T>.TargetHasArcs(follow)) |
| { |
| return null; |
| } |
| fst.ReadFirstTargetArc(follow, arc, @in); |
| if (arc.BytesPerArc != 0 && arc.Label != FST.END_LABEL) |
| { |
| // Arcs are fixed array -- use binary search to find |
| // the target. |
| |
| int low = arc.ArcIdx; |
| int high = arc.NumArcs - 1; |
| int mid /*= 0*/; // LUCENENET: Removed unnecessary assignment |
| // System.out.println("do arc array low=" + low + " high=" + high + |
| // " targetLabel=" + targetLabel); |
| while (low <= high) |
| { |
| mid = (low + high).TripleShift(1); |
| @in.Position = arc.PosArcsStart; |
| @in.SkipBytes(arc.BytesPerArc * mid + 1); |
| int midLabel = fst.ReadLabel(@in); |
| int cmp = midLabel - label; |
| // System.out.println(" cycle low=" + low + " high=" + high + " mid=" + |
| // mid + " midLabel=" + midLabel + " cmp=" + cmp); |
| if (cmp < 0) |
| { |
| low = mid + 1; |
| } |
| else if (cmp > 0) |
| { |
| high = mid - 1; |
| } |
| else |
| { |
| arc.ArcIdx = mid - 1; |
| return fst.ReadNextRealArc(arc, @in); |
| } |
| } |
| if (low == arc.NumArcs) |
| { |
| // DEAD END! |
| return null; |
| } |
| |
| arc.ArcIdx = (low > high ? high : low); |
| return fst.ReadNextRealArc(arc, @in); |
| } |
| |
| // Linear scan |
| fst.ReadFirstRealTargetArc(follow.Target, arc, @in); |
| |
| while (true) |
| { |
| // System.out.println(" non-bs cycle"); |
| // TODO: we should fix this code to not have to create |
| // object for the output of every arc we scan... only |
| // for the matching arc, if found |
| if (arc.Label >= label) |
| { |
| // System.out.println(" found!"); |
| return arc; |
| } |
| else if (arc.IsLast) |
| { |
| return null; |
| } |
| else |
| { |
| fst.ReadNextRealArc(arc, @in); |
| } |
| } |
| } |
| } |
| } |