blob: 8c7d8a189ae2789ae3cb3d029d41ec65851c44e3 [file] [log] [blame]
using J2N.Collections;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.CompilerServices;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Util.Fst
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using ByteArrayDataOutput = Lucene.Net.Store.ByteArrayDataOutput;
using CodecUtil = Lucene.Net.Codecs.CodecUtil;
using DataInput = Lucene.Net.Store.DataInput;
using DataOutput = Lucene.Net.Store.DataOutput;
using GrowableWriter = Lucene.Net.Util.Packed.GrowableWriter;
using InputStreamDataInput = Lucene.Net.Store.InputStreamDataInput;
using OutputStreamDataOutput = Lucene.Net.Store.OutputStreamDataOutput;
using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
// different byte[] level encodings (packed or
// not)... and things like nodeCount, arcCount are read only
// TODO: if FST is pure prefix trie we can do a more compact
// job, ie, once we are at a 'suffix only', just store the
// completion labels as a string not as a series of arcs.
// NOTE: while the FST is able to represent a non-final
// dead-end state (NON_FINAL_END_NODE=0), the layers above
// (FSTEnum, Util) have problems with this!!
/// <summary>
/// Represents an finite state machine (FST), using a
/// compact <see cref="T:byte[]"/> format.
/// <para/> The format is similar to what's used by Morfologik
/// (http://sourceforge.net/projects/morfologik).
///
/// <para/> See the <a href="https://lucene.apache.org/core/4_8_0/core/org/apache/lucene/util/fst/package-summary.html">
/// FST package documentation</a> for some simple examples.
/// <para/>
/// @lucene.experimental
/// </summary>
public sealed class FST<T>
{
/*/// <summary>
/// Specifies allowed range of each int input label for
/// this FST.
/// </summary>
public enum INPUT_TYPE
{
BYTE1,
BYTE2,
BYTE4
}*/
private readonly FST.INPUT_TYPE inputType;
/*internal static readonly int BIT_FINAL_ARC = 1 << 0;
internal static readonly int BIT_LAST_ARC = 1 << 1;
internal static readonly int BIT_TARGET_NEXT = 1 << 2;
// TODO: we can free up a bit if we can nuke this:
internal static readonly int BIT_STOP_NODE = 1 << 3;
internal static readonly int BIT_ARC_HAS_OUTPUT = 1 << 4;
internal static readonly int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
// Arcs are stored as fixed-size (per entry) array, so
// that we can find an arc using binary search. We do
// this when number of arcs is > NUM_ARCS_ARRAY:
// If set, the target node is delta coded vs current
// position:
private static readonly int BIT_TARGET_DELTA = 1 << 6;
// We use this as a marker (because this one flag is
// illegal by itself ...):
private static readonly sbyte ARCS_AS_FIXED_ARRAY = (sbyte)BIT_ARC_HAS_FINAL_OUTPUT;
/// <seealso cref= #shouldExpand(UnCompiledNode) </seealso>
internal const int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
/// <seealso cref= #shouldExpand(UnCompiledNode) </seealso>
internal const int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
/// <seealso cref= #shouldExpand(UnCompiledNode) </seealso>
internal const int FIXED_ARRAY_NUM_ARCS_DEEP = 10;*/
private int[] bytesPerArc = Arrays.Empty<int>();
/*// Increment version to change it
private const string FILE_FORMAT_NAME = "FST";
private const int VERSION_START = 0;
/// <summary>
/// Changed numBytesPerArc for array'd case from byte to int. </summary>
private const int VERSION_INT_NUM_BYTES_PER_ARC = 1;
/// <summary>
/// Write BYTE2 labels as 2-byte short, not vInt. </summary>
private const int VERSION_SHORT_BYTE2_LABELS = 2;
/// <summary>
/// Added optional packed format. </summary>
private const int VERSION_PACKED = 3;
/// <summary>
/// Changed from int to vInt for encoding arc targets.
/// Also changed maxBytesPerArc from int to vInt in the array case.
/// </summary>
private const int VERSION_VINT_TARGET = 4;
private const int VERSION_CURRENT = VERSION_VINT_TARGET;
// Never serialized; just used to represent the virtual
// final node w/ no arcs:
private const long FINAL_END_NODE = -1;
// Never serialized; just used to represent the virtual
// non-final node w/ no arcs:
private const long NON_FINAL_END_NODE = 0;*/
// if non-null, this FST accepts the empty string and
// produces this output
internal T emptyOutput;
internal readonly BytesStore bytes;
private long startNode = -1;
public Outputs<T> Outputs { get; private set; }
// Used for the BIT_TARGET_NEXT optimization (whereby
// instead of storing the address of the target node for
// a given arc, we mark a single bit noting that the next
// node in the byte[] is the target node):
private long lastFrozenNode;
private readonly T NO_OUTPUT;
// LUCENENET NOTE: changed accessibility of the following 3 fields
// because we already have public properties that can read them.
// This class is sealed, so it is unclear what the benefit of setting
// them from outside is (if any).
internal long nodeCount;
private long arcCount;
private long arcWithOutputCount;
private readonly bool packed;
private PackedInt32s.Reader nodeRefToAddress;
///// <summary>
///// If arc has this label then that arc is final/accepted </summary>
//public static readonly int END_LABEL = -1;
private readonly bool allowArrayArcs;
private FST.Arc<T>[] cachedRootArcs;
private FST.Arc<T>[] assertingCachedRootArcs; // only set wit assert
// LUCENENET NOTE: Arc<T> moved into FST class
internal static bool Flag(int flags, int bit)
{
return (flags & bit) != 0;
}
private GrowableWriter nodeAddress;
// TODO: we could be smarter here, and prune periodically
// as we go; high in-count nodes will "usually" become
// clear early on:
private GrowableWriter inCounts;
private readonly int version;
// make a new empty FST, for building; Builder invokes
// this ctor
internal FST(FST.INPUT_TYPE inputType, Outputs<T> outputs, bool willPackFST, float acceptableOverheadRatio, bool allowArrayArcs, int bytesPageBits)
{
this.inputType = inputType;
this.Outputs = outputs;
this.allowArrayArcs = allowArrayArcs;
version = FST.VERSION_CURRENT;
bytes = new BytesStore(bytesPageBits);
// pad: ensure no node gets address 0 which is reserved to mean
// the stop state w/ no arcs
bytes.WriteByte(0);
NO_OUTPUT = outputs.NoOutput;
if (willPackFST)
{
nodeAddress = new GrowableWriter(15, 8, acceptableOverheadRatio);
inCounts = new GrowableWriter(1, 8, acceptableOverheadRatio);
}
else
{
nodeAddress = null;
inCounts = null;
}
emptyOutput = default;
packed = false;
nodeRefToAddress = null;
}
/// <summary>
/// Load a previously saved FST. </summary>
public FST(DataInput @in, Outputs<T> outputs)
: this(@in, outputs, FST.DEFAULT_MAX_BLOCK_BITS)
{
}
/// <summary>
/// Load a previously saved FST; <paramref name="maxBlockBits"/> allows you to
/// control the size of the <see cref="T:byte[]"/> pages used to hold the FST bytes.
/// </summary>
public FST(DataInput @in, Outputs<T> outputs, int maxBlockBits)
{
this.Outputs = outputs;
if (maxBlockBits < 1 || maxBlockBits > 30)
{
throw new ArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
}
// NOTE: only reads most recent format; we don't have
// back-compat promise for FSTs (they are experimental):
version = CodecUtil.CheckHeader(@in, FST.FILE_FORMAT_NAME, FST.VERSION_PACKED, FST.VERSION_VINT32_TARGET);
packed = @in.ReadByte() == 1;
if (@in.ReadByte() == 1)
{
// accepts empty string
// 1 KB blocks:
BytesStore emptyBytes = new BytesStore(10);
int numBytes = @in.ReadVInt32();
emptyBytes.CopyBytes(@in, numBytes);
// De-serialize empty-string output:
FST.BytesReader reader;
if (packed)
{
reader = emptyBytes.GetForwardReader();
}
else
{
reader = emptyBytes.GetReverseReader();
// NoOutputs uses 0 bytes when writing its output,
// so we have to check here else BytesStore gets
// angry:
if (numBytes > 0)
{
reader.Position = numBytes - 1;
}
}
emptyOutput = outputs.ReadFinalOutput(reader);
}
else
{
emptyOutput = default;
}
var t = @in.ReadByte();
inputType = t switch
{
0 => FST.INPUT_TYPE.BYTE1,
1 => FST.INPUT_TYPE.BYTE2,
2 => FST.INPUT_TYPE.BYTE4,
_ => throw new InvalidOperationException("invalid input type " + t),
};
if (packed)
{
nodeRefToAddress = PackedInt32s.GetReader(@in);
}
else
{
nodeRefToAddress = null;
}
startNode = @in.ReadVInt64();
nodeCount = @in.ReadVInt64();
arcCount = @in.ReadVInt64();
arcWithOutputCount = @in.ReadVInt64();
long numBytes_ = @in.ReadVInt64();
bytes = new BytesStore(@in, numBytes_, 1 << maxBlockBits);
NO_OUTPUT = outputs.NoOutput;
CacheRootArcs();
// NOTE: bogus because this is only used during
// building; we need to break out mutable FST from
// immutable
allowArrayArcs = false;
/*
if (bytes.length == 665) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
Util.toDot(this, w, false, false);
w.Dispose();
System.out.println("Wrote FST to out.dot");
}
*/
}
public FST.INPUT_TYPE InputType => inputType;
/// <summary>
/// Returns bytes used to represent the FST </summary>
public long GetSizeInBytes()
{
long size = bytes.Position;
if (packed)
{
size += nodeRefToAddress.RamBytesUsed();
}
else if (nodeAddress != null)
{
size += nodeAddress.RamBytesUsed();
size += inCounts.RamBytesUsed();
}
return size;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void Finish(long newStartNode)
{
if (startNode != -1)
{
throw new InvalidOperationException("already finished");
}
if (newStartNode == FST.FINAL_END_NODE && !EqualityComparer<T>.Default.Equals(emptyOutput, default))
{
newStartNode = 0;
}
startNode = newStartNode;
bytes.Finish();
CacheRootArcs();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private long GetNodeAddress(long node)
{
if (nodeAddress != null)
{
// Deref
return nodeAddress.Get((int)node);
}
else
{
// Straight
return node;
}
}
// Caches first 128 labels
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void CacheRootArcs()
{
cachedRootArcs = (FST.Arc<T>[])new FST.Arc<T>[0x80];
ReadRootArcs(cachedRootArcs);
if (Debugging.AssertsEnabled)
{
Debugging.Assert(SetAssertingRootArcs(cachedRootArcs));
Debugging.Assert(AssertRootArcs());
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void ReadRootArcs(FST.Arc<T>[] arcs)
{
FST.Arc<T> arc = new FST.Arc<T>();
GetFirstArc(arc);
FST.BytesReader @in = GetBytesReader();
if (TargetHasArcs(arc))
{
ReadFirstRealTargetArc(arc.Target, arc, @in);
while (true)
{
if (Debugging.AssertsEnabled) Debugging.Assert(arc.Label != FST.END_LABEL);
if (arc.Label < cachedRootArcs.Length)
{
arcs[arc.Label] = (new FST.Arc<T>()).CopyFrom(arc);
}
else
{
break;
}
if (arc.IsLast)
{
break;
}
ReadNextRealArc(arc, @in);
}
}
}
private bool SetAssertingRootArcs(FST.Arc<T>[] arcs) // Only called from assert
{
assertingCachedRootArcs = (FST.Arc<T>[])new FST.Arc<T>[arcs.Length];
ReadRootArcs(assertingCachedRootArcs);
return true;
}
private bool AssertRootArcs()
{
Debugging.Assert(cachedRootArcs != null);
Debugging.Assert(assertingCachedRootArcs != null);
for (int i = 0; i < cachedRootArcs.Length; i++)
{
FST.Arc<T> root = cachedRootArcs[i];
FST.Arc<T> asserting = assertingCachedRootArcs[i];
if (root != null)
{
Debugging.Assert(root.ArcIdx == asserting.ArcIdx);
Debugging.Assert(root.BytesPerArc == asserting.BytesPerArc);
Debugging.Assert(root.Flags == asserting.Flags);
Debugging.Assert(root.Label == asserting.Label);
Debugging.Assert(root.NextArc == asserting.NextArc);
// LUCENENET NOTE: In .NET, IEnumerable will not equal another identical IEnumerable
// because it checks for reference equality, not that the list contents
// are the same. StructuralEqualityComparer.Default.Equals() will make that check.
Debugging.Assert(typeof(T).IsValueType
? JCG.EqualityComparer<T>.Default.Equals(root.NextFinalOutput, asserting.NextFinalOutput)
: StructuralEqualityComparer.Default.Equals(root.NextFinalOutput, asserting.NextFinalOutput));
Debugging.Assert(root.Node == asserting.Node);
Debugging.Assert(root.NumArcs == asserting.NumArcs);
Debugging.Assert(typeof(T).IsValueType
? JCG.EqualityComparer<T>.Default.Equals(root.Output, asserting.Output)
: StructuralEqualityComparer.Default.Equals(root.Output, asserting.Output));
Debugging.Assert(root.PosArcsStart == asserting.PosArcsStart);
Debugging.Assert(root.Target == asserting.Target);
}
else
{
Debugging.Assert(root == null && asserting == null);
}
}
return true;
}
public T EmptyOutput
{
get => emptyOutput;
set
{
if (emptyOutput != null)
{
emptyOutput = Outputs.Merge(emptyOutput, value);
}
else
{
emptyOutput = value;
}
}
}
public void Save(DataOutput @out)
{
if (startNode == -1)
{
throw new InvalidOperationException("call finish first");
}
if (nodeAddress != null)
{
throw new InvalidOperationException("cannot save an FST pre-packed FST; it must first be packed");
}
if (packed && !(nodeRefToAddress is PackedInt32s.Mutable))
{
throw new InvalidOperationException("cannot save a FST which has been loaded from disk ");
}
CodecUtil.WriteHeader(@out, FST.FILE_FORMAT_NAME, FST.VERSION_CURRENT);
if (packed)
{
@out.WriteByte(1);
}
else
{
@out.WriteByte(0);
}
// TODO: really we should encode this as an arc, arriving
// to the root node, instead of special casing here:
if (!EqualityComparer<T>.Default.Equals(emptyOutput, default))
{
// Accepts empty string
@out.WriteByte(1);
// Serialize empty-string output:
var ros = new RAMOutputStream();
Outputs.WriteFinalOutput(emptyOutput, ros);
var emptyOutputBytes = new byte[(int)ros.GetFilePointer()];
ros.WriteTo(emptyOutputBytes, 0);
if (!packed)
{
// reverse
int stopAt = emptyOutputBytes.Length / 2;
int upto = 0;
while (upto < stopAt)
{
var b = emptyOutputBytes[upto];
emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.Length - upto - 1];
emptyOutputBytes[emptyOutputBytes.Length - upto - 1] = b;
upto++;
}
}
@out.WriteVInt32(emptyOutputBytes.Length);
@out.WriteBytes(emptyOutputBytes, 0, emptyOutputBytes.Length);
}
else
{
@out.WriteByte(0);
}
sbyte t;
if (inputType == FST.INPUT_TYPE.BYTE1)
{
t = 0;
}
else if (inputType == FST.INPUT_TYPE.BYTE2)
{
t = 1;
}
else
{
t = 2;
}
@out.WriteByte((byte)t);
if (packed)
{
((PackedInt32s.Mutable)nodeRefToAddress).Save(@out);
}
@out.WriteVInt64(startNode);
@out.WriteVInt64(nodeCount);
@out.WriteVInt64(arcCount);
@out.WriteVInt64(arcWithOutputCount);
long numBytes = bytes.Position;
@out.WriteVInt64(numBytes);
bytes.WriteTo(@out);
}
/// <summary>
/// Writes an automaton to a file.
/// </summary>
public void Save(FileInfo file)
{
bool success = false;
var bs = file.OpenWrite();
try
{
Save(new OutputStreamDataOutput(bs));
success = true;
}
finally
{
if (success)
{
IOUtils.Dispose(bs);
}
else
{
IOUtils.DisposeWhileHandlingException(bs);
}
}
}
// LUCENENET NOTE: static Read<T>() was moved into the FST class
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void WriteLabel(DataOutput @out, int v)
{
if (Debugging.AssertsEnabled) Debugging.Assert(v >= 0,"v={0}", v);
if (inputType == FST.INPUT_TYPE.BYTE1)
{
if (Debugging.AssertsEnabled) Debugging.Assert(v <= 255,"v={0}", v);
@out.WriteByte((byte)v);
}
else if (inputType == FST.INPUT_TYPE.BYTE2)
{
if (Debugging.AssertsEnabled) Debugging.Assert(v <= 65535,"v={0}", v);
@out.WriteInt16((short)v);
}
else
{
@out.WriteVInt32(v);
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal int ReadLabel(DataInput @in)
{
int v;
if (inputType == FST.INPUT_TYPE.BYTE1)
{
// Unsigned byte:
v = @in.ReadByte() & 0xFF;
}
else if (inputType == FST.INPUT_TYPE.BYTE2)
{
// Unsigned short:
v = @in.ReadInt16() & 0xFFFF;
}
else
{
v = @in.ReadVInt32();
}
return v;
}
/// <summary>
/// returns <c>true</c> if the node at this address has any
/// outgoing arcs
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TargetHasArcs(FST.Arc<T> arc)
{
return arc.Target > 0;
}
// serializes new node by appending its bytes to the end
// of the current byte[]
internal long AddNode(Builder.UnCompiledNode<T> nodeIn)
{
//System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs);
if (nodeIn.NumArcs == 0)
{
if (nodeIn.IsFinal)
{
return FST.FINAL_END_NODE;
}
else
{
return FST.NON_FINAL_END_NODE;
}
}
long startAddress = bytes.Position;
//System.out.println(" startAddr=" + startAddress);
bool doFixedArray = ShouldExpand(nodeIn);
if (doFixedArray)
{
//System.out.println(" fixedArray");
if (bytesPerArc.Length < nodeIn.NumArcs)
{
bytesPerArc = new int[ArrayUtil.Oversize(nodeIn.NumArcs, 1)];
}
}
arcCount += nodeIn.NumArcs;
int lastArc = nodeIn.NumArcs - 1;
long lastArcStart = bytes.Position;
int maxBytesPerArc = 0;
for (int arcIdx = 0; arcIdx < nodeIn.NumArcs; arcIdx++)
{
Builder.Arc<T> arc = nodeIn.Arcs[arcIdx];
var target = (Builder.CompiledNode)arc.Target;
int flags = 0;
//System.out.println(" arc " + arcIdx + " label=" + arc.Label + " -> target=" + target.Node);
if (arcIdx == lastArc)
{
flags += FST.BIT_LAST_ARC;
}
if (lastFrozenNode == target.Node && !doFixedArray)
{
// TODO: for better perf (but more RAM used) we
// could avoid this except when arc is "near" the
// last arc:
flags += FST.BIT_TARGET_NEXT;
}
if (arc.IsFinal)
{
flags += FST.BIT_FINAL_ARC;
if (!arc.NextFinalOutput.Equals(NO_OUTPUT))
{
flags += FST.BIT_ARC_HAS_FINAL_OUTPUT;
}
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(arc.NextFinalOutput.Equals(NO_OUTPUT));
}
bool targetHasArcs = target.Node > 0;
if (!targetHasArcs)
{
flags += FST.BIT_STOP_NODE;
}
else if (inCounts != null)
{
inCounts.Set((int)target.Node, inCounts.Get((int)target.Node) + 1);
}
if (!arc.Output.Equals(NO_OUTPUT))
{
flags += FST.BIT_ARC_HAS_OUTPUT;
}
bytes.WriteByte((byte)flags);
WriteLabel(bytes, arc.Label);
// System.out.println(" write arc: label=" + (char) arc.Label + " flags=" + flags + " target=" + target.Node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.Output));
if (!arc.Output.Equals(NO_OUTPUT))
{
Outputs.Write(arc.Output, bytes);
//System.out.println(" write output");
arcWithOutputCount++;
}
if (!arc.NextFinalOutput.Equals(NO_OUTPUT))
{
//System.out.println(" write final output");
Outputs.WriteFinalOutput(arc.NextFinalOutput, bytes);
}
if (targetHasArcs && (flags & FST.BIT_TARGET_NEXT) == 0)
{
if (Debugging.AssertsEnabled) Debugging.Assert(target.Node > 0);
//System.out.println(" write target");
bytes.WriteVInt64(target.Node);
}
// just write the arcs "like normal" on first pass,
// but record how many bytes each one took, and max
// byte size:
if (doFixedArray)
{
bytesPerArc[arcIdx] = (int)(bytes.Position - lastArcStart);
lastArcStart = bytes.Position;
maxBytesPerArc = Math.Max(maxBytesPerArc, bytesPerArc[arcIdx]);
//System.out.println(" bytes=" + bytesPerArc[arcIdx]);
}
}
// TODO: try to avoid wasteful cases: disable doFixedArray in that case
/*
*
* LUCENE-4682: what is a fair heuristic here?
* It could involve some of these:
* 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount?
* 2. how much binSearch saves over scan: nodeIn.numArcs
* 3. waste: numBytes vs numBytesExpanded
*
* the one below just looks at #3
if (doFixedArray) {
// rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor????
int numBytes = lastArcStart - startAddress;
int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs;
if (numBytesExpanded > numBytes*1.25) {
doFixedArray = false;
}
}
*/
if (doFixedArray)
{
const int MAX_HEADER_SIZE = 11; // header(byte) + numArcs(vint) + numBytes(vint)
if (Debugging.AssertsEnabled) Debugging.Assert(maxBytesPerArc > 0);
// 2nd pass just "expands" all arcs to take up a fixed
// byte size
//System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs);
// create the header
// TODO: clean this up: or just rewind+reuse and deal with it
byte[] header = new byte[MAX_HEADER_SIZE];
var bad = new ByteArrayDataOutput(header);
// write a "false" first arc:
bad.WriteByte((byte)FST.ARCS_AS_FIXED_ARRAY);
bad.WriteVInt32(nodeIn.NumArcs);
bad.WriteVInt32(maxBytesPerArc);
int headerLen = bad.Position;
long fixedArrayStart = startAddress + headerLen;
// expand the arcs in place, backwards
long srcPos = bytes.Position;
long destPos = fixedArrayStart + nodeIn.NumArcs * maxBytesPerArc;
if (Debugging.AssertsEnabled) Debugging.Assert(destPos >= srcPos);
if (destPos > srcPos)
{
bytes.SkipBytes((int)(destPos - srcPos));
for (int arcIdx = nodeIn.NumArcs - 1; arcIdx >= 0; arcIdx--)
{
destPos -= maxBytesPerArc;
srcPos -= bytesPerArc[arcIdx];
//System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos);
if (srcPos != destPos)
{
//System.out.println(" copy len=" + bytesPerArc[arcIdx]);
if (Debugging.AssertsEnabled) Debugging.Assert(destPos > srcPos, "destPos={0} srcPos={1} arcIdx={2} maxBytesPerArc={3} bytesPerArc[arcIdx]={4} nodeIn.numArcs={5}", destPos, srcPos, arcIdx, maxBytesPerArc, bytesPerArc[arcIdx], nodeIn.NumArcs);
bytes.CopyBytes(srcPos, destPos, bytesPerArc[arcIdx]);
}
}
}
// now write the header
bytes.WriteBytes(startAddress, header, 0, headerLen);
}
long thisNodeAddress = bytes.Position - 1;
bytes.Reverse(startAddress, thisNodeAddress);
// PackedInts uses int as the index, so we cannot handle
// > 2.1B nodes when packing:
if (nodeAddress != null && nodeCount == int.MaxValue)
{
throw new InvalidOperationException("cannot create a packed FST with more than 2.1 billion nodes");
}
nodeCount++;
long node;
if (nodeAddress != null)
{
// Nodes are addressed by 1+ord:
if ((int)nodeCount == nodeAddress.Count)
{
nodeAddress = nodeAddress.Resize(ArrayUtil.Oversize(nodeAddress.Count + 1, nodeAddress.BitsPerValue));
inCounts = inCounts.Resize(ArrayUtil.Oversize(inCounts.Count + 1, inCounts.BitsPerValue));
}
nodeAddress.Set((int)nodeCount, thisNodeAddress);
// System.out.println(" write nodeAddress[" + nodeCount + "] = " + endAddress);
node = nodeCount;
}
else
{
node = thisNodeAddress;
}
lastFrozenNode = node;
//System.out.println(" ret node=" + node + " address=" + thisNodeAddress + " nodeAddress=" + nodeAddress);
return node;
}
/// <summary>
/// Fills virtual 'start' arc, ie, an empty incoming arc to
/// the FST's start node
/// </summary>
public FST.Arc<T> GetFirstArc(FST.Arc<T> arc)
{
if (!EqualityComparer<T>.Default.Equals(emptyOutput, default))
{
arc.Flags = FST.BIT_FINAL_ARC | FST.BIT_LAST_ARC;
arc.NextFinalOutput = emptyOutput;
if (!emptyOutput.Equals(NO_OUTPUT))
{
arc.Flags |= FST.BIT_ARC_HAS_FINAL_OUTPUT;
}
}
else
{
arc.Flags = FST.BIT_LAST_ARC;
arc.NextFinalOutput = NO_OUTPUT;
}
arc.Output = NO_OUTPUT;
// If there are no nodes, ie, the FST only accepts the
// empty string, then startNode is 0
arc.Target = startNode;
return arc;
}
/// <summary>
/// Follows the <paramref name="follow"/> arc and reads the last
/// arc of its target; this changes the provided
/// <paramref name="arc"/> (2nd arg) in-place and returns it.
/// </summary>
/// <returns> Returns the second argument
/// (<paramref name="arc"/>). </returns>
public FST.Arc<T> ReadLastTargetArc(FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader @in)
{
//System.out.println("readLast");
if (!TargetHasArcs(follow))
{
//System.out.println(" end node");
if (Debugging.AssertsEnabled) Debugging.Assert(follow.IsFinal);
arc.Label = FST.END_LABEL;
arc.Target = FST.FINAL_END_NODE;
arc.Output = follow.NextFinalOutput;
arc.Flags = (sbyte)FST.BIT_LAST_ARC;
return arc;
}
else
{
@in.Position = GetNodeAddress(follow.Target);
arc.Node = follow.Target;
var b = (sbyte)@in.ReadByte();
if (b == FST.ARCS_AS_FIXED_ARRAY)
{
// array: jump straight to end
arc.NumArcs = @in.ReadVInt32();
if (packed || version >= FST.VERSION_VINT32_TARGET)
{
arc.BytesPerArc = @in.ReadVInt32();
}
else
{
arc.BytesPerArc = @in.ReadInt32();
}
//System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
arc.PosArcsStart = @in.Position;
arc.ArcIdx = arc.NumArcs - 2;
}
else
{
arc.Flags = b;
// non-array: linear scan
arc.BytesPerArc = 0;
//System.out.println(" scan");
while (!arc.IsLast)
{
// skip this arc:
ReadLabel(@in);
if (arc.Flag(FST.BIT_ARC_HAS_OUTPUT))
{
Outputs.Read(@in);
}
if (arc.Flag(FST.BIT_ARC_HAS_FINAL_OUTPUT))
{
Outputs.ReadFinalOutput(@in);
}
if (arc.Flag(FST.BIT_STOP_NODE))
{
}
else if (arc.Flag(FST.BIT_TARGET_NEXT))
{
}
else if (packed)
{
@in.ReadVInt64();
}
else
{
ReadUnpackedNodeTarget(@in);
}
arc.Flags = (sbyte)@in.ReadByte();
}
// Undo the byte flags we read:
@in.SkipBytes(-1);
arc.NextArc = @in.Position;
}
ReadNextRealArc(arc, @in);
if (Debugging.AssertsEnabled) Debugging.Assert(arc.IsLast);
return arc;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private long ReadUnpackedNodeTarget(FST.BytesReader @in)
{
long target;
if (version < FST.VERSION_VINT32_TARGET)
{
target = @in.ReadInt32();
}
else
{
target = @in.ReadVInt64();
}
return target;
}
/// <summary>
/// Follow the <paramref name="follow"/> arc and read the first arc of its target;
/// this changes the provided <paramref name="arc"/> (2nd arg) in-place and returns
/// it.
/// </summary>
/// <returns> Returns the second argument (<paramref name="arc"/>). </returns>
public FST.Arc<T> ReadFirstTargetArc(FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader @in)
{
//int pos = address;
//Debug.WriteLine(" readFirstTarget follow.target=" + follow.Target + " isFinal=" + follow.IsFinal);
if (follow.IsFinal)
{
// Insert "fake" final first arc:
arc.Label = FST.END_LABEL;
arc.Output = follow.NextFinalOutput;
arc.Flags = (sbyte)FST.BIT_FINAL_ARC;
if (follow.Target <= 0)
{
arc.Flags |= (sbyte)FST.BIT_LAST_ARC;
}
else
{
arc.Node = follow.Target;
// NOTE: nextArc is a node (not an address!) in this case:
arc.NextArc = follow.Target;
}
arc.Target = FST.FINAL_END_NODE;
//Debug.WriteLine(" insert isFinal; nextArc=" + follow.Target + " isLast=" + arc.IsLast + " output=" + Outputs.OutputToString(arc.Output));
return arc;
}
else
{
return ReadFirstRealTargetArc(follow.Target, arc, @in);
}
}
public FST.Arc<T> ReadFirstRealTargetArc(long node, FST.Arc<T> arc, FST.BytesReader @in)
{
long address = GetNodeAddress(node);
@in.Position = address;
//System.out.println(" readFirstRealTargtArc address="
//+ address);
//System.out.println(" flags=" + arc.flags);
arc.Node = node;
if (@in.ReadByte() == FST.ARCS_AS_FIXED_ARRAY)
{
//System.out.println(" fixedArray");
// this is first arc in a fixed-array
arc.NumArcs = @in.ReadVInt32();
if (packed || version >= FST.VERSION_VINT32_TARGET)
{
arc.BytesPerArc = @in.ReadVInt32();
}
else
{
arc.BytesPerArc = @in.ReadInt32();
}
arc.ArcIdx = -1;
arc.NextArc = arc.PosArcsStart = @in.Position;
//System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
}
else
{
//arc.flags = b;
arc.NextArc = address;
arc.BytesPerArc = 0;
}
return ReadNextRealArc(arc, @in);
}
/// <summary>
/// Checks if arc's target state is in expanded (or vector) format.
/// </summary>
/// <returns> Returns <c>true</c> if arc points to a state in an
/// expanded array format. </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool IsExpandedTarget(FST.Arc<T> follow, FST.BytesReader @in)
{
if (!TargetHasArcs(follow))
{
return false;
}
else
{
@in.Position = GetNodeAddress(follow.Target);
return @in.ReadByte() == FST.ARCS_AS_FIXED_ARRAY;
}
}
/// <summary>
/// In-place read; returns the arc. </summary>
public FST.Arc<T> ReadNextArc(FST.Arc<T> arc, FST.BytesReader @in)
{
if (arc.Label == FST.END_LABEL)
{
// this was a fake inserted "final" arc
if (arc.NextArc <= 0)
{
throw new ArgumentException("cannot readNextArc when arc.isLast()=true");
}
return ReadFirstRealTargetArc(arc.NextArc, arc, @in);
}
else
{
return ReadNextRealArc(arc, @in);
}
}
/// <summary>
/// Peeks at next arc's label; does not alter <paramref name="arc"/>. Do
/// not call this if arc.IsLast!
/// </summary>
public int ReadNextArcLabel(FST.Arc<T> arc, FST.BytesReader @in)
{
if (Debugging.AssertsEnabled) Debugging.Assert(!arc.IsLast);
if (arc.Label == FST.END_LABEL)
{
//Debug.WriteLine(" nextArc fake " + arc.NextArc);
long pos = GetNodeAddress(arc.NextArc);
@in.Position = pos;
var b = (sbyte)@in.ReadByte();
if (b == FST.ARCS_AS_FIXED_ARRAY)
{
//System.out.println(" nextArc fixed array");
@in.ReadVInt32();
// Skip bytesPerArc:
if (packed || version >= FST.VERSION_VINT32_TARGET)
{
@in.ReadVInt32();
}
else
{
@in.ReadInt32();
}
}
else
{
@in.Position = pos;
}
}
else
{
if (arc.BytesPerArc != 0)
{
//System.out.println(" nextArc real array");
// arcs are at fixed entries
@in.Position = arc.PosArcsStart;
@in.SkipBytes((1 + arc.ArcIdx) * arc.BytesPerArc);
}
else
{
// arcs are packed
//System.out.println(" nextArc real packed");
@in.Position = arc.NextArc;
}
}
// skip flags
@in.ReadByte();
return ReadLabel(@in);
}
/// <summary>
/// Never returns <c>null</c>, but you should never call this if
/// arc.IsLast is <c>true</c>.
/// </summary>
public FST.Arc<T> ReadNextRealArc(FST.Arc<T> arc, FST.BytesReader @in)
{
// TODO: can't assert this because we call from readFirstArc
// assert !flag(arc.flags, BIT_LAST_ARC);
// this is a continuing arc in a fixed array
if (arc.BytesPerArc != 0)
{
// arcs are at fixed entries
arc.ArcIdx++;
if (Debugging.AssertsEnabled) Debugging.Assert(arc.ArcIdx < arc.NumArcs);
@in.Position = arc.PosArcsStart;
@in.SkipBytes(arc.ArcIdx * arc.BytesPerArc);
}
else
{
// arcs are packed
@in.Position = arc.NextArc;
}
arc.Flags = (sbyte)@in.ReadByte();
arc.Label = ReadLabel(@in);
if (arc.Flag(FST.BIT_ARC_HAS_OUTPUT))
{
arc.Output = Outputs.Read(@in);
}
else
{
arc.Output = Outputs.NoOutput;
}
if (arc.Flag(FST.BIT_ARC_HAS_FINAL_OUTPUT))
{
arc.NextFinalOutput = Outputs.ReadFinalOutput(@in);
}
else
{
arc.NextFinalOutput = Outputs.NoOutput;
}
if (arc.Flag(FST.BIT_STOP_NODE))
{
if (arc.Flag(FST.BIT_FINAL_ARC))
{
arc.Target = FST.FINAL_END_NODE;
}
else
{
arc.Target = FST.NON_FINAL_END_NODE;
}
arc.NextArc = @in.Position;
}
else if (arc.Flag(FST.BIT_TARGET_NEXT))
{
arc.NextArc = @in.Position;
// TODO: would be nice to make this lazy -- maybe
// caller doesn't need the target and is scanning arcs...
if (nodeAddress == null)
{
if (!arc.Flag(FST.BIT_LAST_ARC))
{
if (arc.BytesPerArc == 0)
{
// must scan
SeekToNextNode(@in);
}
else
{
@in.Position = arc.PosArcsStart;
@in.SkipBytes(arc.BytesPerArc * arc.NumArcs);
}
}
arc.Target = @in.Position;
}
else
{
arc.Target = arc.Node - 1;
if (Debugging.AssertsEnabled) Debugging.Assert(arc.Target > 0);
}
}
else
{
if (packed)
{
long pos = @in.Position;
long code = @in.ReadVInt64();
if (arc.Flag(FST.BIT_TARGET_DELTA))
{
// Address is delta-coded from current address:
arc.Target = pos + code;
//System.out.println(" delta pos=" + pos + " delta=" + code + " target=" + arc.target);
}
else if (code < nodeRefToAddress.Count)
{
// Deref
arc.Target = nodeRefToAddress.Get((int)code);
//System.out.println(" deref code=" + code + " target=" + arc.target);
}
else
{
// Absolute
arc.Target = code;
//System.out.println(" abs code=" + code);
}
}
else
{
arc.Target = ReadUnpackedNodeTarget(@in);
}
arc.NextArc = @in.Position;
}
return arc;
}
// TODO: could we somehow [partially] tableize arc lookups
// look automaton?
/// <summary>
/// Finds an arc leaving the incoming <paramref name="arc"/>, replacing the arc in place.
/// this returns <c>null</c> if the arc was not found, else the incoming <paramref name="arc"/>.
/// </summary>
public FST.Arc<T> FindTargetArc(int labelToMatch, FST.Arc<T> follow, FST.Arc<T> arc, FST.BytesReader @in)
{
if (labelToMatch == FST.END_LABEL)
{
if (follow.IsFinal)
{
if (follow.Target <= 0)
{
arc.Flags = (sbyte)FST.BIT_LAST_ARC;
}
else
{
arc.Flags = 0;
// NOTE: nextArc is a node (not an address!) in this case:
arc.NextArc = follow.Target;
arc.Node = follow.Target;
}
arc.Output = follow.NextFinalOutput;
arc.Label = FST.END_LABEL;
return arc;
}
else
{
return null;
}
}
// Short-circuit if this arc is in the root arc cache:
if (follow.Target == startNode && labelToMatch < cachedRootArcs.Length)
{
// LUCENE-5152: detect tricky cases where caller
// modified previously returned cached root-arcs:
if (Debugging.AssertsEnabled) Debugging.Assert(AssertRootArcs());
FST.Arc<T> result = cachedRootArcs[labelToMatch];
if (result == null)
{
return null;
}
else
{
arc.CopyFrom(result);
return arc;
}
}
if (!TargetHasArcs(follow))
{
return null;
}
@in.Position = GetNodeAddress(follow.Target);
arc.Node = follow.Target;
// System.out.println("fta label=" + (char) labelToMatch);
if (@in.ReadByte() == FST.ARCS_AS_FIXED_ARRAY)
{
// Arcs are full array; do binary search:
arc.NumArcs = @in.ReadVInt32();
if (packed || version >= FST.VERSION_VINT32_TARGET)
{
arc.BytesPerArc = @in.ReadVInt32();
}
else
{
arc.BytesPerArc = @in.ReadInt32();
}
arc.PosArcsStart = @in.Position;
int low = 0;
int high = arc.NumArcs - 1;
while (low <= high)
{
//System.out.println(" cycle");
int mid = (int)((uint)(low + high) >> 1);
@in.Position = arc.PosArcsStart;
@in.SkipBytes(arc.BytesPerArc * mid + 1);
int midLabel = ReadLabel(@in);
int cmp = midLabel - labelToMatch;
if (cmp < 0)
{
low = mid + 1;
}
else if (cmp > 0)
{
high = mid - 1;
}
else
{
arc.ArcIdx = mid - 1;
//System.out.println(" found!");
return ReadNextRealArc(arc, @in);
}
}
return null;
}
// Linear scan
ReadFirstRealTargetArc(follow.Target, arc, @in);
while (true)
{
//System.out.println(" non-bs cycle");
// TODO: we should fix this code to not have to create
// object for the output of every arc we scan... only
// for the matching arc, if found
if (arc.Label == labelToMatch)
{
//System.out.println(" found!");
return arc;
}
else if (arc.Label > labelToMatch)
{
return null;
}
else if (arc.IsLast)
{
return null;
}
else
{
ReadNextRealArc(arc, @in);
}
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void SeekToNextNode(FST.BytesReader @in)
{
while (true)
{
int flags = @in.ReadByte();
ReadLabel(@in);
if (Flag(flags, FST.BIT_ARC_HAS_OUTPUT))
{
Outputs.Read(@in);
}
if (Flag(flags, FST.BIT_ARC_HAS_FINAL_OUTPUT))
{
Outputs.ReadFinalOutput(@in);
}
if (!Flag(flags, FST.BIT_STOP_NODE) && !Flag(flags, FST.BIT_TARGET_NEXT))
{
if (packed)
{
@in.ReadVInt64();
}
else
{
ReadUnpackedNodeTarget(@in);
}
}
if (Flag(flags, FST.BIT_LAST_ARC))
{
return;
}
}
}
public long NodeCount =>
// 1+ in order to count the -1 implicit final node
1 + nodeCount;
public long ArcCount => arcCount;
public long ArcWithOutputCount => arcWithOutputCount;
/// <summary>
/// Nodes will be expanded if their depth (distance from the root node) is
/// &lt;= this value and their number of arcs is &gt;=
/// <see cref="FST.FIXED_ARRAY_NUM_ARCS_SHALLOW"/>.
///
/// <para/>
/// Fixed array consumes more RAM but enables binary search on the arcs
/// (instead of a linear scan) on lookup by arc label.
/// </summary>
/// <returns> <c>true</c> if <paramref name="node"/> should be stored in an
/// expanded (array) form.
/// </returns>
/// <seealso cref="FST.FIXED_ARRAY_NUM_ARCS_DEEP"/>
/// <seealso cref="Builder.UnCompiledNode{S}.Depth"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool ShouldExpand(Builder.UnCompiledNode<T> node)
{
return allowArrayArcs && ((node.Depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && node.NumArcs >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || node.NumArcs >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
}
/// <summary>
/// Returns a <see cref="FST.BytesReader"/> for this FST, positioned at
/// position 0.
/// </summary>
public FST.BytesReader GetBytesReader()
{
FST.BytesReader @in;
if (packed)
{
@in = bytes.GetForwardReader();
}
else
{
@in = bytes.GetReverseReader();
}
return @in;
}
/*
/// <summary>
/// Reads bytes stored in an FST. </summary>
public abstract class BytesReader : DataInput
{
/// <summary>
/// Get current read position. </summary>
public abstract long Position {get;set;}
/// <summary>
/// Returns true if this reader uses reversed bytes
/// under-the-hood.
/// </summary>
public abstract bool IsReversed { get; }
/// <summary>
/// Skips bytes. </summary>
public abstract void SkipBytes(int count);
}*/
/*
public void countSingleChains() throws IOException {
// TODO: must assert this FST was built with
// "willRewrite"
final List<ArcAndState<T>> queue = new ArrayList<>();
// TODO: use bitset to not revisit nodes already
// visited
FixedBitSet seen = new FixedBitSet(1+nodeCount);
int saved = 0;
queue.add(new ArcAndState<T>(getFirstArc(new Arc<T>()), new IntsRef()));
Arc<T> scratchArc = new Arc<>();
while(queue.size() > 0) {
//System.out.println("cycle size=" + queue.size());
//for(ArcAndState<T> ent : queue) {
// System.out.println(" " + Util.toBytesRef(ent.chain, new BytesRef()));
// }
final ArcAndState<T> arcAndState = queue.get(queue.size()-1);
seen.set(arcAndState.arc.Node);
final BytesRef br = Util.toBytesRef(arcAndState.chain, new BytesRef());
if (br.length > 0 && br.bytes[br.length-1] == -1) {
br.length--;
}
//System.out.println(" top node=" + arcAndState.arc.Target + " chain=" + br.utf8ToString());
if (targetHasArcs(arcAndState.arc) && !seen.get(arcAndState.arc.target)) {
// push
readFirstTargetArc(arcAndState.arc, scratchArc);
//System.out.println(" push label=" + (char) scratchArc.Label);
//System.out.println(" tonode=" + scratchArc.Target + " last?=" + scratchArc.isLast());
final IntsRef chain = IntsRef.deepCopyOf(arcAndState.chain);
chain.grow(1+chain.length);
// TODO
//assert scratchArc.Label != END_LABEL;
chain.ints[chain.length] = scratchArc.Label;
chain.length++;
if (scratchArc.isLast()) {
if (scratchArc.Target != -1 && inCounts[scratchArc.target] == 1) {
//System.out.println(" append");
} else {
if (arcAndState.chain.length > 1) {
saved += chain.length-2;
try {
System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef()).utf8ToString());
} catch (AssertionError ae) {
System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef()));
}
}
chain.length = 0;
}
} else {
//System.out.println(" reset");
if (arcAndState.chain.length > 1) {
saved += arcAndState.chain.length-2;
try {
System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString());
} catch (AssertionError ae) {
System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()));
}
}
if (scratchArc.Target != -1 && inCounts[scratchArc.target] != 1) {
chain.length = 0;
} else {
chain.ints[0] = scratchArc.Label;
chain.length = 1;
}
}
// TODO: instead of new Arc() we can re-use from
// a by-depth array
queue.add(new ArcAndState<T>(new Arc<T>().copyFrom(scratchArc), chain));
} else if (!arcAndState.arc.isLast()) {
// next
readNextArc(arcAndState.arc);
//System.out.println(" next label=" + (char) arcAndState.arc.Label + " len=" + arcAndState.chain.length);
if (arcAndState.chain.length != 0) {
arcAndState.chain.ints[arcAndState.chain.length-1] = arcAndState.arc.Label;
}
} else {
if (arcAndState.chain.length > 1) {
saved += arcAndState.chain.length-2;
System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString());
}
// pop
//System.out.println(" pop");
queue.remove(queue.size()-1);
while(queue.size() > 0 && queue.get(queue.size()-1).arc.isLast()) {
queue.remove(queue.size()-1);
}
if (queue.size() > 0) {
final ArcAndState<T> arcAndState2 = queue.get(queue.size()-1);
readNextArc(arcAndState2.arc);
//System.out.println(" read next=" + (char) arcAndState2.arc.Label + " queue=" + queue.size());
assert arcAndState2.arc.Label != END_LABEL;
if (arcAndState2.chain.length != 0) {
arcAndState2.chain.ints[arcAndState2.chain.length-1] = arcAndState2.arc.Label;
}
}
}
}
System.out.println("TOT saved " + saved);
}
*/
/// <summary>
/// Creates a packed FST
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private FST(FST.INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits)
{
version = FST.VERSION_CURRENT;
packed = true;
this.inputType = inputType;
bytes = new BytesStore(bytesPageBits);
this.Outputs = outputs;
NO_OUTPUT = outputs.NoOutput;
// NOTE: bogus because this is only used during
// building; we need to break out mutable FST from
// immutable
allowArrayArcs = false;
}
/// <summary>
/// Expert: creates an FST by packing this one. This
/// process requires substantial additional RAM (currently
/// up to ~8 bytes per node depending on
/// <c>acceptableOverheadRatio</c>), but then should
/// produce a smaller FST.
///
/// <para/>The implementation of this method uses ideas from
/// <a target="_blank" href="http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf">Smaller Representation of Finite State Automata</a>,
/// which describes techniques to reduce the size of a FST.
/// However, this is not a strict implementation of the
/// algorithms described in this paper.
/// </summary>
internal FST<T> Pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio)
{
// NOTE: maxDerefNodes is intentionally int: we cannot
// support > 2.1B deref nodes
// TODO: other things to try
// - renumber the nodes to get more next / better locality?
// - allow multiple input labels on an arc, so
// singular chain of inputs can take one arc (on
// wikipedia terms this could save another ~6%)
// - in the ord case, the output '1' is presumably
// very common (after NO_OUTPUT)... maybe use a bit
// for it..?
// - use spare bits in flags.... for top few labels /
// outputs / targets
if (nodeAddress == null)
{
throw new ArgumentException("this FST was not built with willPackFST=true");
}
FST.Arc<T> arc = new FST.Arc<T>();
FST.BytesReader r = GetBytesReader();
int topN = Math.Min(maxDerefNodes, inCounts.Count);
// Find top nodes with highest number of incoming arcs:
FST.NodeQueue q = new FST.NodeQueue(topN);
// TODO: we could use more RAM efficient selection algo here...
FST.NodeAndInCount bottom = null;
for (int node = 0; node < inCounts.Count; node++)
{
if (inCounts.Get(node) >= minInCountDeref)
{
if (bottom == null)
{
q.Add(new FST.NodeAndInCount(node, (int)inCounts.Get(node)));
if (q.Count == topN)
{
bottom = q.Top;
}
}
else if (inCounts.Get(node) > bottom.Count)
{
q.InsertWithOverflow(new FST.NodeAndInCount(node, (int)inCounts.Get(node)));
}
}
}
// Free up RAM:
inCounts = null;
IDictionary<int, int> topNodeMap = new Dictionary<int, int>();
for (int downTo = q.Count - 1; downTo >= 0; downTo--)
{
FST.NodeAndInCount n = q.Pop();
topNodeMap[n.Node] = downTo;
//System.out.println("map node=" + n.Node + " inCount=" + n.count + " to newID=" + downTo);
}
// +1 because node ords start at 1 (0 is reserved as stop node):
GrowableWriter newNodeAddress = new GrowableWriter(PackedInt32s.BitsRequired(this.bytes.Position), (int)(1 + nodeCount), acceptableOverheadRatio);
// Fill initial coarse guess:
for (int node = 1; node <= nodeCount; node++)
{
newNodeAddress.Set(node, 1 + this.bytes.Position - nodeAddress.Get(node));
}
int absCount;
int deltaCount;
int topCount;
int nextCount;
FST<T> fst;
// Iterate until we converge:
while (true)
{
//System.out.println("\nITER");
bool changed = false;
// for assert:
bool negDelta = false;
fst = new FST<T>(inputType, Outputs, bytes.BlockBits);
BytesStore writer = fst.bytes;
// Skip 0 byte since 0 is reserved target:
writer.WriteByte(0);
fst.arcWithOutputCount = 0;
fst.nodeCount = 0;
fst.arcCount = 0;
absCount = deltaCount = topCount = nextCount = 0;
int changedCount = 0;
long addressError = 0;
//int totWasted = 0;
// Since we re-reverse the bytes, we now write the
// nodes backwards, so that BIT_TARGET_NEXT is
// unchanged:
for (int node = (int)nodeCount; node >= 1; node--)
{
fst.nodeCount++;
long address = writer.Position;
//System.out.println(" node: " + node + " address=" + address);
if (address != newNodeAddress.Get(node))
{
addressError = address - newNodeAddress.Get(node);
//System.out.println(" change: " + (address - newNodeAddress[node]));
changed = true;
newNodeAddress.Set(node, address);
changedCount++;
}
int nodeArcCount = 0;
int bytesPerArc = 0;
bool retry = false;
// for assert:
bool anyNegDelta = false;
// Retry loop: possibly iterate more than once, if
// this is an array'd node and bytesPerArc changes:
while (true) // retry writing this node
{
//System.out.println(" cycle: retry");
ReadFirstRealTargetArc(node, arc, r);
bool useArcArray = arc.BytesPerArc != 0;
if (useArcArray)
{
// Write false first arc:
if (bytesPerArc == 0)
{
bytesPerArc = arc.BytesPerArc;
}
writer.WriteByte((byte)FST.ARCS_AS_FIXED_ARRAY);
writer.WriteVInt32(arc.NumArcs);
writer.WriteVInt32(bytesPerArc);
//System.out.println("node " + node + ": " + arc.numArcs + " arcs");
}
int maxBytesPerArc = 0;
//int wasted = 0;
while (true) // iterate over all arcs for this node
{
//System.out.println(" cycle next arc");
long arcStartPos = writer.Position;
nodeArcCount++;
sbyte flags = 0;
if (arc.IsLast)
{
flags += (sbyte)FST.BIT_LAST_ARC;
}
/*
if (!useArcArray && nodeUpto < nodes.length-1 && arc.Target == nodes[nodeUpto+1]) {
flags += BIT_TARGET_NEXT;
}
*/
if (!useArcArray && node != 1 && arc.Target == node - 1)
{
flags += (sbyte)FST.BIT_TARGET_NEXT;
if (!retry)
{
nextCount++;
}
}
if (arc.IsFinal)
{
flags += (sbyte)FST.BIT_FINAL_ARC;
if (!arc.NextFinalOutput.Equals(NO_OUTPUT))
{
flags += (sbyte)FST.BIT_ARC_HAS_FINAL_OUTPUT;
}
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(arc.NextFinalOutput.Equals(NO_OUTPUT));
}
if (!TargetHasArcs(arc))
{
flags += (sbyte)FST.BIT_STOP_NODE;
}
if (!arc.Output.Equals(NO_OUTPUT))
{
flags += (sbyte)FST.BIT_ARC_HAS_OUTPUT;
}
long absPtr;
bool doWriteTarget = TargetHasArcs(arc) && (flags & FST.BIT_TARGET_NEXT) == 0;
if (doWriteTarget)
{
if (topNodeMap.TryGetValue((int)arc.Target, out int ptr))
{
absPtr = ptr;
}
else
{
absPtr = topNodeMap.Count + newNodeAddress.Get((int)arc.Target) + addressError;
}
long delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.Position - 2;
if (delta < 0)
{
//System.out.println("neg: " + delta);
anyNegDelta = true;
delta = 0;
}
if (delta < absPtr)
{
flags |= FST.BIT_TARGET_DELTA;
}
}
else
{
absPtr = 0;
}
if (Debugging.AssertsEnabled) Debugging.Assert(flags != FST.ARCS_AS_FIXED_ARRAY);
writer.WriteByte((byte)flags);
fst.WriteLabel(writer, arc.Label);
if (!arc.Output.Equals(NO_OUTPUT))
{
Outputs.Write(arc.Output, writer);
if (!retry)
{
fst.arcWithOutputCount++;
}
}
if (!arc.NextFinalOutput.Equals(NO_OUTPUT))
{
Outputs.WriteFinalOutput(arc.NextFinalOutput, writer);
}
if (doWriteTarget)
{
long delta = newNodeAddress.Get((int)arc.Target) + addressError - writer.Position;
if (delta < 0)
{
anyNegDelta = true;
//System.out.println("neg: " + delta);
delta = 0;
}
if (Flag(flags, FST.BIT_TARGET_DELTA))
{
//System.out.println(" delta");
writer.WriteVInt64(delta);
if (!retry)
{
deltaCount++;
}
}
else
{
/*
if (ptr != null) {
System.out.println(" deref");
} else {
System.out.println(" abs");
}
*/
writer.WriteVInt64(absPtr);
if (!retry)
{
if (absPtr >= topNodeMap.Count)
{
absCount++;
}
else
{
topCount++;
}
}
}
}
if (useArcArray)
{
int arcBytes = (int)(writer.Position - arcStartPos);
//System.out.println(" " + arcBytes + " bytes");
maxBytesPerArc = Math.Max(maxBytesPerArc, arcBytes);
// NOTE: this may in fact go "backwards", if
// somehow (rarely, possibly never) we use
// more bytesPerArc in this rewrite than the
// incoming FST did... but in this case we
// will retry (below) so it's OK to ovewrite
// bytes:
//wasted += bytesPerArc - arcBytes;
writer.SkipBytes((int)(arcStartPos + bytesPerArc - writer.Position));
}
if (arc.IsLast)
{
break;
}
ReadNextRealArc(arc, r);
}
if (useArcArray)
{
if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc <= bytesPerArc))
{
// converged
//System.out.println(" bba=" + bytesPerArc + " wasted=" + wasted);
//totWasted += wasted;
break;
}
}
else
{
break;
}
//System.out.println(" retry this node maxBytesPerArc=" + maxBytesPerArc + " vs " + bytesPerArc);
// Retry:
bytesPerArc = maxBytesPerArc;
writer.Truncate(address);
nodeArcCount = 0;
retry = true;
anyNegDelta = false;
//writeNodeContinue:;
}
//writeNodeBreak:
negDelta |= anyNegDelta;
fst.arcCount += nodeArcCount;
}
if (!changed)
{
// We don't renumber the nodes (just reverse their
// order) so nodes should only point forward to
// other nodes because we only produce acyclic FSTs
// w/ nodes only pointing "forwards":
if (Debugging.AssertsEnabled) Debugging.Assert(!negDelta);
//System.out.println("TOT wasted=" + totWasted);
// Converged!
break;
}
//System.out.println(" " + changedCount + " of " + fst.nodeCount + " changed; retry");
}
long maxAddress = 0;
foreach (long key in topNodeMap.Keys)
{
maxAddress = Math.Max(maxAddress, newNodeAddress.Get((int)key));
}
PackedInt32s.Mutable nodeRefToAddressIn = PackedInt32s.GetMutable(topNodeMap.Count, PackedInt32s.BitsRequired(maxAddress), acceptableOverheadRatio);
foreach (KeyValuePair<int, int> ent in topNodeMap)
{
nodeRefToAddressIn.Set(ent.Value, newNodeAddress.Get(ent.Key));
}
fst.nodeRefToAddress = nodeRefToAddressIn;
fst.startNode = newNodeAddress.Get((int)startNode);
//System.out.println("new startNode=" + fst.startNode + " old startNode=" + startNode);
if (!JCG.EqualityComparer<T>.Default.Equals(emptyOutput, default))
{
fst.EmptyOutput = emptyOutput;
}
if (Debugging.AssertsEnabled)
{
Debugging.Assert(fst.nodeCount == nodeCount,"fst.nodeCount={0} nodeCount={1}", fst.nodeCount, nodeCount);
Debugging.Assert(fst.arcCount == arcCount);
Debugging.Assert(fst.arcWithOutputCount == arcWithOutputCount,"fst.arcWithOutputCount={0} arcWithOutputCount={1}", fst.arcWithOutputCount, arcWithOutputCount);
}
fst.bytes.Finish();
fst.CacheRootArcs();
//final int size = fst.sizeInBytes();
//System.out.println("nextCount=" + nextCount + " topCount=" + topCount + " deltaCount=" + deltaCount + " absCount=" + absCount);
return fst;
}
}
/// <summary>
/// LUCENENET specific: This new base class is to mimic Java's ability to use nested types without specifying
/// a type parameter. i.e. FST.BytesReader instead of FST&lt;BytesRef&gt;.BytesReader
/// </summary>
public sealed class FST
{
public static readonly int DEFAULT_MAX_BLOCK_BITS = Constants.RUNTIME_IS_64BIT ? 30 : 28;
public FST()
{ }
internal const int BIT_FINAL_ARC = 1 << 0;
internal const int BIT_LAST_ARC = 1 << 1;
internal const int BIT_TARGET_NEXT = 1 << 2;
// TODO: we can free up a bit if we can nuke this:
internal const int BIT_STOP_NODE = 1 << 3;
internal const int BIT_ARC_HAS_OUTPUT = 1 << 4;
internal const int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
// Arcs are stored as fixed-size (per entry) array, so
// that we can find an arc using binary search. We do
// this when number of arcs is > NUM_ARCS_ARRAY:
// If set, thie target node is delta coded vs current position:
internal const int BIT_TARGET_DELTA = 1 << 6;
// We use this as a marker (because this one flag is
// illegal by itself ...):
internal const sbyte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
/// <summary>
/// <see cref="Builder.UnCompiledNode{S}"/>
/// </summary>
public const int FIXED_ARRAY_SHALLOW_DISTANCE = 3;
/// <summary>
/// <see cref="Builder.UnCompiledNode{S}"/>
/// </summary>
public const int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
/// <summary>
/// <see cref="Builder.UnCompiledNode{S}"/>
/// </summary>
public const int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
// Increment version to change it
internal const string FILE_FORMAT_NAME = "FST";
internal const int VERSION_START = 0;
/// <summary>
/// Changed numBytesPerArc for array'd case from byte to <see cref="int"/>.
/// <para/>
/// NOTE: This was VERSION_INT_NUM_BYTES_PER_ARC in Lucene
/// </summary>
internal const int VERSION_INT32_NUM_BYTES_PER_ARC = 1;
/// <summary>
/// Write BYTE2 labels as 2-byte <see cref="short"/>, not v<see cref="int"/>.
/// <para/>
/// NOTE: This was VERSION_SHORT_BYTE2_LABELS in Lucene
/// </summary>
internal const int VERSION_INT16_BYTE2_LABELS = 2;
/// <summary>
/// Added optional packed format.
/// </summary>
internal const int VERSION_PACKED = 3;
/// <summary>
/// Changed from <see cref="int"/> to v<see cref="int"/> for encoding arc targets.
/// Also changed maxBytesPerArc from int to v<see cref="int"/> in the array case.
/// <para/>
/// NOTE: This was VERSION_VINT_TARGET in Lucene
/// </summary>
internal const int VERSION_VINT32_TARGET = 4;
internal const int VERSION_CURRENT = VERSION_VINT32_TARGET;
/// <summary>
/// Never serialized; just used to represent the virtual
/// final node w/ no arcs:
/// </summary>
internal const long FINAL_END_NODE = -1;
/// <summary>
/// Never serialized; just used to represent the virtual
/// non-final node w/ no arcs:
/// </summary>
internal const long NON_FINAL_END_NODE = 0;
/// <summary>
/// If arc has this label then that arc is final/accepted </summary>
public static readonly int END_LABEL = -1;
/// <summary>
/// returns <c>true</c> if the node at this address has any
/// outgoing arcs
/// </summary>
public static bool TargetHasArcs<T>(Arc<T> arc)
{
return arc.Target > 0;
}
/// <summary>
/// Reads an automaton from a file.
/// </summary>
public static FST<T> Read<T>(FileInfo file, Outputs<T> outputs)
{
var bs = new BufferedStream(file.OpenRead());
bool success = false;
try
{
FST<T> fst = new FST<T>(new InputStreamDataInput(bs), outputs);
success = true;
return fst;
}
finally
{
if (success)
{
IOUtils.Dispose(bs);
}
else
{
IOUtils.DisposeWhileHandlingException(bs);
}
}
}
/// <summary>
/// Reads bytes stored in an FST.
/// </summary>
public abstract class BytesReader : DataInput
{
/// <summary>
/// Current read position
/// </summary>
public abstract long Position { get; set; }
/// <summary>
/// Returns <c>true</c> if this reader uses reversed bytes
/// under-the-hood.
/// </summary>
/// <returns></returns>
public abstract bool IsReversed { get; }
/// <summary>
/// Skips bytes.
/// </summary>
/// <param name="count"></param>
public abstract void SkipBytes(int count);
}
/// <summary>
/// Specifies allowed range of each int input label for this FST.
/// </summary>
public enum INPUT_TYPE { BYTE1, BYTE2, BYTE4 }
/// <summary>
/// Represents a single arc.
/// </summary>
/// <typeparam name="T"></typeparam>
public class Arc<T>
{
public int Label { get; set; }
public T Output { get; set; }
/// <summary>
/// From node (ord or address); currently only used when
/// building an FST w/ willPackFST=true:
/// </summary>
internal long Node { get; set; }
/// <summary>
/// To node (ord or address)
/// </summary>
public long Target { get; set; }
internal sbyte Flags { get; set; }
public T NextFinalOutput { get; set; }
/// <summary>
/// address (into the byte[]), or ord/address if label == END_LABEL
/// </summary>
internal long NextArc { get; set; }
/// <summary>
/// This is non-zero if current arcs are fixed array:
/// </summary>
internal long PosArcsStart { get; set; }
internal int BytesPerArc { get; set; }
internal int ArcIdx { get; set; }
internal int NumArcs { get; set; }
/// <summary>
/// Return this
/// </summary>
/// <param name="other"></param>
/// <returns></returns>
public Arc<T> CopyFrom(Arc<T> other)
{
Node = other.Node;
Label = other.Label;
Target = other.Target;
Flags = other.Flags;
Output = other.Output;
NextFinalOutput = other.Output;
NextFinalOutput = other.NextFinalOutput;
NextArc = other.NextArc;
BytesPerArc = other.BytesPerArc;
if (BytesPerArc != 0)
{
PosArcsStart = other.PosArcsStart;
ArcIdx = other.ArcIdx;
NumArcs = other.NumArcs;
}
return this;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal virtual bool Flag(int flag)
{
return FST<T>.Flag(Flags, flag);
}
public virtual bool IsLast => Flag(BIT_LAST_ARC);
public virtual bool IsFinal => Flag(BIT_FINAL_ARC);
public override string ToString()
{
var b = new StringBuilder();
b.Append("node=" + Node);
b.Append(" target=" + Target);
b.Append(" label=" + Label);
if (Flag(BIT_LAST_ARC)) b.Append(" last");
if (Flag(BIT_FINAL_ARC)) b.Append(" final");
if (Flag(BIT_TARGET_NEXT)) b.Append(" targetNext");
if (Flag(BIT_ARC_HAS_OUTPUT)) b.Append(" output=" + Output);
if (Flag(BIT_ARC_HAS_FINAL_OUTPUT)) b.Append(" nextFinalOutput=" + NextFinalOutput);
if (BytesPerArc != 0) b.Append(" arcArray(idx=" + ArcIdx + " of " + NumArcs + ")");
return b.ToString();
}
}
internal class ArcAndState<T>
{
internal Arc<T> Arc { get; private set; }
internal Int32sRef Chain { get; private set; }
public ArcAndState(Arc<T> arc, Int32sRef chain)
{
this.Arc = arc;
this.Chain = chain;
}
}
internal class NodeAndInCount : IComparable<NodeAndInCount>
{
internal int Node { get; private set; }
internal int Count { get; private set; }
public NodeAndInCount(int node, int count)
{
this.Node = node;
this.Count = count;
}
public virtual int CompareTo(NodeAndInCount other)
{
if (Count > other.Count)
{
return 1;
}
else if (Count < other.Count)
{
return -1;
}
else
{
// Tie-break: smaller node compares as greater than
return other.Node - Node;
}
}
}
internal class NodeQueue : PriorityQueue<NodeAndInCount>
{
public NodeQueue(int topN)
: base(topN, false)
{
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected internal override bool LessThan(NodeAndInCount a, NodeAndInCount b)
{
int cmp = a.CompareTo(b);
if (Debugging.AssertsEnabled) Debugging.Assert(cmp != 0);
return cmp < 0;
}
}
}
}