| using Lucene.Net.Util.Fst; |
| using System; |
| using System.Collections; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| using System.Linq; |
| using JCG = J2N.Collections.Generic; |
| |
| namespace Lucene.Net.Codecs.Memory |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using ArrayUtil = Util.ArrayUtil; |
| using BlockPackedWriter = Util.Packed.BlockPackedWriter; |
| using ByteArrayDataOutput = Store.ByteArrayDataOutput; |
| using BytesRef = Util.BytesRef; |
| using FieldInfo = Index.FieldInfo; |
| using FormatAndBits = Util.Packed.PackedInt32s.FormatAndBits; |
| using IndexFileNames = Index.IndexFileNames; |
| using IndexOutput = Store.IndexOutput; |
| using INPUT_TYPE = Util.Fst.FST.INPUT_TYPE; |
| using Int32sRef = Util.Int32sRef; |
| using IOUtils = Util.IOUtils; |
| using MathUtil = Util.MathUtil; |
| using MonotonicBlockPackedWriter = Util.Packed.MonotonicBlockPackedWriter; |
| using PackedInt32s = Util.Packed.PackedInt32s; |
| using PositiveInt32Outputs = Util.Fst.PositiveInt32Outputs; |
| using SegmentWriteState = Index.SegmentWriteState; |
| using Util = Util.Fst.Util; |
| |
| /// <summary> |
| /// Writer for <see cref="MemoryDocValuesFormat"/>. |
| /// </summary> |
| internal class MemoryDocValuesConsumer : DocValuesConsumer |
| { |
| private IndexOutput data, meta; |
| private readonly int maxDoc; |
| private readonly float acceptableOverheadRatio; |
| |
| internal MemoryDocValuesConsumer(SegmentWriteState state, string dataCodec, string dataExtension, |
| string metaCodec, |
| string metaExtension, float acceptableOverheadRatio) |
| { |
| this.acceptableOverheadRatio = acceptableOverheadRatio; |
| maxDoc = state.SegmentInfo.DocCount; |
| var success = false; |
| try |
| { |
| var dataName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension); |
| data = state.Directory.CreateOutput(dataName, state.Context); |
| CodecUtil.WriteHeader(data, dataCodec, MemoryDocValuesProducer.VERSION_CURRENT); |
| var metaName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension); |
| meta = state.Directory.CreateOutput(metaName, state.Context); |
| CodecUtil.WriteHeader(meta, metaCodec, MemoryDocValuesProducer.VERSION_CURRENT); |
| success = true; |
| } |
| finally |
| { |
| if (!success) |
| { |
| IOUtils.DisposeWhileHandlingException(this); |
| } |
| } |
| } |
| |
| public override void AddNumericField(FieldInfo field, IEnumerable<long?> values) |
| { |
| AddNumericField(field, values, true); |
| } |
| |
| internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage) |
| { |
| meta.WriteVInt32(field.Number); |
| meta.WriteByte(MemoryDocValuesProducer.NUMBER); |
| meta.WriteInt64(data.GetFilePointer()); |
| long minValue = long.MaxValue; |
| long maxValue = long.MinValue; |
| long gcd = 0; |
| bool missing = false; |
| // TODO: more efficient? |
| ISet<long?> uniqueValues = null; |
| if (optimizeStorage) |
| { |
| uniqueValues = new JCG.HashSet<long?>(); |
| |
| long count = 0; |
| foreach (var nv in values) |
| { |
| long v; |
| if (nv == null) |
| { |
| v = 0; |
| missing = true; |
| } |
| else |
| { |
| v = nv.Value; |
| } |
| |
| if (gcd != 1) |
| { |
| if (v < long.MinValue / 2 || v > long.MaxValue / 2) |
| { |
| // in that case v - minValue might overflow and make the GCD computation return |
| // wrong results. Since these extreme values are unlikely, we just discard |
| // GCD computation for them |
| gcd = 1; |
| } // minValue needs to be set first |
| else if (count != 0) |
| { |
| gcd = MathUtil.Gcd(gcd, v - minValue); |
| } |
| } |
| |
| minValue = Math.Min(minValue, v); |
| maxValue = Math.Max(maxValue, v); |
| |
| if (uniqueValues != null) |
| { |
| if (uniqueValues.Add(v)) |
| { |
| if (uniqueValues.Count > 256) |
| { |
| uniqueValues = null; |
| } |
| } |
| } |
| |
| ++count; |
| } |
| Debug.Assert(count == maxDoc); |
| } |
| |
| if (missing) |
| { |
| long start = data.GetFilePointer(); |
| WriteMissingBitset(values); |
| meta.WriteInt64(start); |
| meta.WriteInt64(data.GetFilePointer() - start); |
| } |
| else |
| { |
| meta.WriteInt64(-1L); |
| } |
| |
| if (uniqueValues != null) |
| { |
| // small number of unique values |
| |
| int bitsPerValue = PackedInt32s.BitsRequired(uniqueValues.Count - 1); |
| FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, |
| acceptableOverheadRatio); |
| if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue) |
| { |
| meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed |
| foreach (var nv in values) |
| { |
| data.WriteByte((byte)nv.GetValueOrDefault()); |
| } |
| } |
| else |
| { |
| meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed |
| long?[] decode = uniqueValues.ToArray(); |
| |
| var encode = new Dictionary<long?, int?>(); |
| data.WriteVInt32(decode.Length); |
| for (int i = 0; i < decode.Length; i++) |
| { |
| data.WriteInt64(decode[i].Value); |
| encode[decode[i]] = i; |
| } |
| |
| meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); |
| data.WriteVInt32(formatAndBits.Format.Id); |
| data.WriteVInt32(formatAndBits.BitsPerValue); |
| |
| PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, |
| formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE); |
| foreach (var nv in values) |
| { |
| var v = encode[nv.GetValueOrDefault()]; |
| |
| writer.Add((long)v); |
| } |
| writer.Finish(); |
| } |
| } |
| else if (gcd != 0 && gcd != 1) |
| { |
| meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED); |
| meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); |
| data.WriteInt64(minValue); |
| data.WriteInt64(gcd); |
| data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE); |
| |
| var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE); |
| foreach (var nv in values) |
| { |
| writer.Add((nv.GetValueOrDefault() - minValue) / gcd); |
| } |
| writer.Finish(); |
| } |
| else |
| { |
| meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed |
| |
| meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); |
| data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE); |
| |
| var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE); |
| foreach (var nv in values) |
| { |
| writer.Add(nv.GetValueOrDefault()); |
| } |
| writer.Finish(); |
| } |
| } |
| |
| protected override void Dispose(bool disposing) |
| { |
| if (!disposing) return; |
| |
| var success = false; |
| try |
| { |
| if (meta != null) |
| { |
| meta.WriteVInt32(-1); // write EOF marker |
| CodecUtil.WriteFooter(meta); // write checksum |
| } |
| if (data != null) |
| { |
| CodecUtil.WriteFooter(data); |
| } |
| success = true; |
| } |
| finally |
| { |
| if (success) |
| { |
| IOUtils.Dispose(data, meta); |
| } |
| else |
| { |
| IOUtils.DisposeWhileHandlingException(data, meta); |
| } |
| data = meta = null; |
| } |
| } |
| |
| public override void AddBinaryField(FieldInfo field, IEnumerable<BytesRef> values) |
| { |
| // write the byte[] data |
| meta.WriteVInt32(field.Number); |
| meta.WriteByte(MemoryDocValuesProducer.BYTES); |
| var minLength = int.MaxValue; |
| var maxLength = int.MinValue; |
| |
| var startFP = data.GetFilePointer(); |
| var missing = false; |
| foreach (var v in values) |
| { |
| int length; |
| if (v == null) |
| { |
| length = 0; |
| missing = true; |
| } |
| else |
| { |
| length = v.Length; |
| } |
| if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH) |
| { |
| throw new ArgumentException("DocValuesField \"" + field.Name + "\" is too large, must be <= " + |
| MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH); |
| } |
| minLength = Math.Min(minLength, length); |
| maxLength = Math.Max(maxLength, length); |
| if (v != null) |
| { |
| data.WriteBytes(v.Bytes, v.Offset, v.Length); |
| } |
| } |
| meta.WriteInt64(startFP); |
| meta.WriteInt64(data.GetFilePointer() - startFP); |
| if (missing) |
| { |
| long start = data.GetFilePointer(); |
| WriteMissingBitset(values); |
| meta.WriteInt64(start); |
| meta.WriteInt64(data.GetFilePointer() - start); |
| } |
| else |
| { |
| meta.WriteInt64(-1L); |
| } |
| meta.WriteVInt32(minLength); |
| meta.WriteVInt32(maxLength); |
| |
| // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit) |
| // otherwise, we need to record the length fields... |
| if (minLength != maxLength) |
| { |
| meta.WriteVInt32(PackedInt32s.VERSION_CURRENT); |
| meta.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE); |
| |
| |
| var writer = new MonotonicBlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE); |
| long addr = 0; |
| foreach (BytesRef v in values) |
| { |
| if (v != null) |
| { |
| addr += v.Length; |
| } |
| writer.Add(addr); |
| } |
| writer.Finish(); |
| } |
| } |
| |
| private void WriteFST(FieldInfo field, IEnumerable<BytesRef> values) |
| { |
| meta.WriteVInt32(field.Number); |
| meta.WriteByte(MemoryDocValuesProducer.FST); |
| meta.WriteInt64(data.GetFilePointer()); |
| PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton; |
| var builder = new Builder<long?>(INPUT_TYPE.BYTE1, outputs); |
| var scratch = new Int32sRef(); |
| long ord = 0; |
| foreach (BytesRef v in values) |
| { |
| builder.Add(Util.ToInt32sRef(v, scratch), ord); |
| ord++; |
| } |
| FST<long?> fst = builder.Finish(); |
| if (fst != null) |
| { |
| fst.Save(data); |
| } |
| meta.WriteVInt64(ord); |
| } |
| |
| // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on, |
| // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode) |
| |
| internal virtual void WriteMissingBitset<T1>(IEnumerable<T1> values) |
| { |
| long bits = 0; |
| int count = 0; |
| foreach (object v in values) |
| { |
| if (count == 64) |
| { |
| data.WriteInt64(bits); |
| count = 0; |
| bits = 0; |
| } |
| if (v != null) |
| { |
| bits |= 1L << (count & 0x3f); |
| } |
| count++; |
| } |
| if (count > 0) |
| { |
| data.WriteInt64(bits); |
| } |
| } |
| |
| public override void AddSortedField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrd) |
| { |
| // write the ordinals as numerics |
| AddNumericField(field, docToOrd, false); |
| |
| // write the values as FST |
| WriteFST(field, values); |
| } |
| |
| // note: this might not be the most efficient... but its fairly simple |
| public override void AddSortedSetField(FieldInfo field, IEnumerable<BytesRef> values, |
| IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords) |
| { |
| // write the ordinals as a binary field |
| AddBinaryField(field, new IterableAnonymousInnerClassHelper(this, docToOrdCount, ords)); |
| |
| // write the values as FST |
| WriteFST(field, values); |
| } |
| |
| private class IterableAnonymousInnerClassHelper : IEnumerable<BytesRef> |
| { |
| private readonly IEnumerable<long?> _docToOrdCount; |
| private readonly IEnumerable<long?> _ords; |
| |
| public IterableAnonymousInnerClassHelper(MemoryDocValuesConsumer outerInstance, |
| IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords) |
| { |
| _docToOrdCount = docToOrdCount; |
| _ords = ords; |
| } |
| |
| public IEnumerator<BytesRef> GetEnumerator() |
| { |
| return new SortedSetIterator(_docToOrdCount.GetEnumerator(), _ords.GetEnumerator()); |
| } |
| |
| IEnumerator IEnumerable.GetEnumerator() |
| { |
| return this.GetEnumerator(); |
| } |
| } |
| |
| // per-document vint-encoded byte[] |
| internal class SortedSetIterator : IEnumerator<BytesRef> |
| { |
| private byte[] buffer = new byte[10]; |
| private ByteArrayDataOutput @out = new ByteArrayDataOutput(); |
| private BytesRef _current = new BytesRef(); |
| |
| private readonly IEnumerator<long?> counts; |
| private readonly IEnumerator<long?> ords; |
| |
| public BytesRef Current |
| { |
| get |
| { |
| return _current; |
| } |
| } |
| |
| object IEnumerator.Current |
| { |
| get |
| { |
| return this.Current; |
| } |
| } |
| |
| internal SortedSetIterator(IEnumerator<long?> counts, IEnumerator<long?> ords) |
| { |
| this.counts = counts; |
| this.ords = ords; |
| } |
| |
| public bool MoveNext() |
| { |
| |
| if (!counts.MoveNext()) |
| return false; |
| |
| int count = (int)counts.Current; |
| int maxSize = count * 9; // worst case |
| if (maxSize > buffer.Length) |
| { |
| buffer = ArrayUtil.Grow(buffer, maxSize); |
| } |
| |
| try |
| { |
| EncodeValues(count); |
| } |
| catch (System.IO.IOException bogus) |
| { |
| throw new Exception(bogus.ToString(), bogus); |
| } |
| |
| _current.Bytes = buffer; |
| _current.Offset = 0; |
| _current.Length = @out.Position; |
| |
| return true; |
| } |
| |
| // encodes count values to buffer |
| private void EncodeValues(int count) |
| { |
| @out.Reset(buffer); |
| long lastOrd = 0; |
| for (int i = 0; i < count; i++) |
| { |
| if (!ords.MoveNext()) |
| break; |
| |
| long ord = ords.Current.Value; |
| @out.WriteVInt64(ord - lastOrd); |
| lastOrd = ord; |
| } |
| } |
| |
| public void Dispose() |
| { |
| this.counts.Dispose(); |
| this.ords.Dispose(); |
| } |
| |
| public void Reset() |
| { |
| throw new NotSupportedException(); |
| } |
| } |
| } |
| } |