| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Index; |
| using Lucene.Net.Store; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Fst; |
| using Lucene.Net.Util.Packed; |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using JCG = J2N.Collections.Generic; |
| using static Lucene.Net.Util.Fst.FST; |
| using static Lucene.Net.Util.Packed.PackedInt32s; |
| |
| namespace Lucene.Net.Codecs.Lucene42 |
| { |
| using Util = Lucene.Net.Util.Fst.Util; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
// Constants used by this writer are taken from Lucene42DocValuesProducer.
| |
| /// <summary> |
| /// Writer for <see cref="Lucene42DocValuesFormat"/>. |
| /// </summary> |
| #pragma warning disable 612, 618 |
| internal class Lucene42DocValuesConsumer : DocValuesConsumer |
| { |
| #pragma warning disable CA2213 // Disposable fields should be disposed |
| internal readonly IndexOutput data, meta; |
| #pragma warning restore CA2213 // Disposable fields should be disposed |
| internal readonly int maxDoc; |
| internal readonly float acceptableOverheadRatio; |
| |
/// <summary>
/// Creates the data and metadata outputs for the segment described by <paramref name="state"/>
/// and writes a codec header to each of them.
/// </summary>
/// <param name="state">Supplies the segment name, segment suffix, directory, I/O context and document count.</param>
/// <param name="dataCodec">Codec name written into the header of the data output.</param>
/// <param name="dataExtension">Extension used to build the data file name.</param>
/// <param name="metaCodec">Codec name written into the header of the metadata output.</param>
/// <param name="metaExtension">Extension used to build the metadata file name.</param>
/// <param name="acceptableOverheadRatio">Stored and later passed to the packed-ints format selection in the numeric writer.</param>
internal Lucene42DocValuesConsumer(SegmentWriteState state, string dataCodec, string dataExtension, string metaCodec, string metaExtension, float acceptableOverheadRatio)
{
    maxDoc = state.SegmentInfo.DocCount;
    this.acceptableOverheadRatio = acceptableOverheadRatio;

    bool initialized = false;
    try
    {
        // Data output first: create the file, then stamp its header.
        // this writer writes the format 4.2 did!
        data = state.Directory.CreateOutput(
            IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension),
            state.Context);
        CodecUtil.WriteHeader(data, dataCodec, Lucene42DocValuesProducer.VERSION_GCD_COMPRESSION);

        // Metadata output second, stamped with the same version.
        meta = state.Directory.CreateOutput(
            IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension),
            state.Context);
        CodecUtil.WriteHeader(meta, metaCodec, Lucene42DocValuesProducer.VERSION_GCD_COMPRESSION);

        initialized = true;
    }
    finally
    {
        // On any failure, release whatever was opened so far; the original exception keeps propagating.
        if (!initialized)
        {
            IOUtils.DisposeWhileHandlingException(this);
        }
    }
}
| |
/// <summary>
/// Writes a numeric field by delegating to the three-argument overload with
/// storage optimization switched on.
/// </summary>
/// <param name="field">Field whose values are written.</param>
/// <param name="values">The values to write; a <c>null</c> entry is written as 0 by the overload.</param>
public override void AddNumericField(FieldInfo field, IEnumerable<long?> values)
    => AddNumericField(field, values, optimizeStorage: true);
| |
/// <summary>
/// Writes one numeric field. The metadata output receives the field number, the NUMBER type tag,
/// the current data file pointer and an encoding tag; the data output receives the encoded values.
/// </summary>
/// <remarks>
/// <para>
/// When <paramref name="optimizeStorage"/> is <c>true</c>, the values are scanned once to collect the
/// minimum, the maximum, a GCD of the differences from the running minimum, and the set of distinct
/// values (abandoned as soon as it holds more than 256 entries). The encoding is then chosen as follows:
/// </para>
/// <list type="bullet">
/// <item><description>Distinct set still tracked: one raw byte per value when the selected packed format uses
/// 8 bits per value and every value fits in an <see cref="sbyte"/>; otherwise a table of the distinct
/// values followed by packed table indices.</description></item>
/// <item><description>GCD neither 0 nor 1: block-packed <c>(value - min) / gcd</c>, preceded by min and gcd.</description></item>
/// <item><description>Anything else: block-packed raw values.</description></item>
/// </list>
/// <para>
/// When <paramref name="optimizeStorage"/> is <c>false</c> no scan happens and the block-packed raw
/// branch is always taken. <paramref name="values"/> is enumerated more than once.
/// </para>
/// </remarks>
/// <param name="field">Field whose number is recorded in the metadata.</param>
/// <param name="values">Values to write; a <c>null</c> entry is treated as 0.</param>
/// <param name="optimizeStorage">Whether to analyse the values before picking an encoding.</param>
internal virtual void AddNumericField(FieldInfo field, IEnumerable<long?> values, bool optimizeStorage)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene42DocValuesProducer.NUMBER);
    meta.WriteInt64(data.GetFilePointer());

    long lowest = long.MaxValue;
    long highest = long.MinValue;
    long commonDivisor = 0;
    // TODO: more efficient?
    ISet<long> distinct = null;

    if (optimizeStorage)
    {
        distinct = new JCG.HashSet<long>();

        long seen = 0;
        foreach (long? boxed in values)
        {
            // TODO: support this as MemoryDVFormat (and be smart about missing maybe)
            long current = boxed.GetValueOrDefault();

            if (commonDivisor != 1)
            {
                // For such extreme values, current - lowest might overflow and make the GCD
                // computation return wrong results. They are unlikely, so GCD compression is
                // simply given up for the whole field.
                bool extreme = current < long.MinValue / 2 || current > long.MaxValue / 2;
                if (extreme)
                {
                    commonDivisor = 1;
                }
                else if (seen != 0)
                {
                    // Skipped for the very first value: lowest must be set first.
                    commonDivisor = MathUtil.Gcd(commonDivisor, current - lowest);
                }
            }

            // Must happen after the GCD step, which uses the minimum of the previous values.
            lowest = Math.Min(lowest, current);
            highest = Math.Max(highest, current);

            // Stop tracking distinct values once a newly added one pushes the set past 256 entries.
            if (distinct != null && distinct.Add(current) && distinct.Count > 256)
            {
                distinct = null;
            }

            seen++;
        }
        if (Debugging.AssertsEnabled) Debugging.Assert(seen == maxDoc);
    }

    if (distinct != null)
    {
        // small number of unique values
        int bitsPerValue = PackedInt32s.BitsRequired(distinct.Count - 1);
        FormatAndBits chosen = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
        bool fitsInSByte = lowest >= sbyte.MinValue && highest <= sbyte.MaxValue;

        if (chosen.BitsPerValue == 8 && fitsInSByte)
        {
            // uncompressed: one byte per document
            meta.WriteByte((byte)Lucene42DocValuesProducer.UNCOMPRESSED);
            foreach (long? boxed in values)
            {
                data.WriteByte((byte)boxed.GetValueOrDefault());
            }
        }
        else
        {
            // table-compressed: the distinct values, then one packed table index per document
            meta.WriteByte((byte)Lucene42DocValuesProducer.TABLE_COMPRESSED);

            long[] table = new long[distinct.Count];
            distinct.CopyTo(table, 0);
            data.WriteVInt32(table.Length);

            Dictionary<long, int> slotOf = new Dictionary<long, int>();
            for (int slot = 0; slot < table.Length; slot++)
            {
                long entry = table[slot];
                data.WriteInt64(entry);
                slotOf[entry] = slot;
            }

            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            data.WriteVInt32(chosen.Format.Id);
            data.WriteVInt32(chosen.BitsPerValue);

            PackedInt32s.Writer packed = PackedInt32s.GetWriterNoHeader(data, chosen.Format, maxDoc, chosen.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
            foreach (long? boxed in values)
            {
                packed.Add(slotOf[boxed.GetValueOrDefault()]);
            }
            packed.Finish();
        }
    }
    else if (commonDivisor != 1 && commonDivisor != 0)
    {
        // GCD-compressed: store min and gcd, then the block-packed quotients
        meta.WriteByte((byte)Lucene42DocValuesProducer.GCD_COMPRESSED);
        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteInt64(lowest);
        data.WriteInt64(commonDivisor);
        data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

        BlockPackedWriter quotients = new BlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE);
        foreach (long? boxed in values)
        {
            long offsetFromMin = boxed.GetValueOrDefault() - lowest;
            quotients.Add(offsetFromMin / commonDivisor);
        }
        quotients.Finish();
    }
    else
    {
        // delta-compressed
        meta.WriteByte((byte)Lucene42DocValuesProducer.DELTA_COMPRESSED);

        meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
        data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

        BlockPackedWriter blocks = new BlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE);
        foreach (long? boxed in values)
        {
            blocks.Add(boxed.GetValueOrDefault());
        }
        blocks.Finish();
    }
}
| |
/// <summary>
/// When <paramref name="disposing"/> is <c>true</c>, writes the end-of-file marker (-1) to the
/// metadata output if one exists, then disposes the data and metadata outputs.
/// </summary>
/// <remarks>
/// If writing the marker throws, the outputs are released through
/// <c>IOUtils.DisposeWhileHandlingException</c> instead of <c>IOUtils.Dispose</c>.
/// </remarks>
/// <param name="disposing"><c>false</c> makes this method a no-op.</param>
protected override void Dispose(bool disposing)
{
    if (!disposing)
    {
        return;
    }

    bool markerWritten = false;
    try
    {
        // write EOF marker (meta may be null when construction failed early)
        meta?.WriteVInt32(-1);
        markerWritten = true;
    }
    finally
    {
        if (!markerWritten)
        {
            IOUtils.DisposeWhileHandlingException(data, meta);
        }
        else
        {
            IOUtils.Dispose(data, meta);
        }
    }
}
| |
/// <summary>
/// Writes a binary field: all values are concatenated into the data output, and the metadata output
/// records the field number, the BYTES type tag, the start offset, the total byte count and the
/// minimum and maximum value lengths.
/// </summary>
/// <remarks>
/// When all values have the same length nothing else is written (the addresses are implicit).
/// Otherwise the running end offset of every value is appended to the data output with a
/// <see cref="MonotonicBlockPackedWriter"/>, which requires enumerating <paramref name="values"/> a second time.
/// A <c>null</c> value counts as length 0 and contributes no bytes.
/// </remarks>
/// <param name="field">Field whose number is recorded and whose name appears in the error message.</param>
/// <param name="values">The per-document values.</param>
/// <exception cref="ArgumentException">A value is longer than <c>Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH</c>.</exception>
public override void AddBinaryField(FieldInfo field, IEnumerable<BytesRef> values)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene42DocValuesProducer.BYTES);

    long dataStart = data.GetFilePointer();
    int shortest = int.MaxValue;
    int longest = int.MinValue;

    // First pass: write the raw bytes back to back while tracking the length range.
    foreach (BytesRef bytes in values)
    {
        int length = bytes?.Length ?? 0;
        if (length > Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH)
        {
            throw new ArgumentException("DocValuesField \"" + field.Name + "\" is too large, must be <= " + Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
        }

        shortest = Math.Min(shortest, length);
        longest = Math.Max(longest, length);

        if (bytes != null)
        {
            data.WriteBytes(bytes.Bytes, bytes.Offset, bytes.Length);
        }
    }

    meta.WriteInt64(dataStart);
    meta.WriteInt64(data.GetFilePointer() - dataStart);
    meta.WriteVInt32(shortest);
    meta.WriteVInt32(longest);

    // Fixed-length byte[]: we are done, the addresses are implicit.
    if (shortest == longest)
    {
        return;
    }

    // Variable length: record the end address of every value.
    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
    meta.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

    MonotonicBlockPackedWriter addresses = new MonotonicBlockPackedWriter(data, Lucene42DocValuesProducer.BLOCK_SIZE);
    long endOffset = 0;
    foreach (BytesRef bytes in values)
    {
        if (bytes != null)
        {
            endOffset += bytes.Length;
        }
        addresses.Add(endOffset);
    }
    addresses.Finish();
}
| |
/// <summary>
/// Builds an FST that maps each value to its position in <paramref name="values"/> (0, 1, 2, ...)
/// and saves it to the data output. The metadata output records the field number, the FST type tag,
/// the data file pointer at which the FST starts, and the number of values.
/// </summary>
/// <remarks>
/// Nothing is saved to the data output when the builder returns <c>null</c> from <c>Finish()</c>.
/// </remarks>
/// <param name="field">Field whose number is recorded in the metadata.</param>
/// <param name="values">Values to add to the FST, in enumeration order.</param>
private void WriteFST(FieldInfo field, IEnumerable<BytesRef> values)
{
    meta.WriteVInt32(field.Number);
    meta.WriteByte((byte)Lucene42DocValuesProducer.FST);
    meta.WriteInt64(data.GetFilePointer());

    Builder<long?> fstBuilder = new Builder<long?>(INPUT_TYPE.BYTE1, PositiveInt32Outputs.Singleton);
    Int32sRef reusableInts = new Int32sRef();

    long valueCount = 0;
    foreach (BytesRef term in values)
    {
        // The output stored for each term is its ordinal, i.e. its index in the sequence.
        fstBuilder.Add(Util.ToInt32sRef(term, reusableInts), valueCount);
        valueCount += 1;
    }

    var built = fstBuilder.Finish();
    built?.Save(data);

    meta.WriteVInt64(valueCount);
}
| |
/// <summary>
/// Writes a sorted field as a numeric field of ordinals (without storage optimization)
/// followed by an FST of the values.
/// </summary>
/// <remarks>
/// Three cases for simulating the old writer, which had no notion of a missing ordinal (-1):
/// <list type="number">
/// <item><description>No -1 in <paramref name="docToOrd"/>: both inputs are written unchanged.</description></item>
/// <item><description>Some -1 and the first value is empty: -1 is remapped to ordinal 0.</description></item>
/// <item><description>Some -1 and the first value is not empty: every ordinal is shifted by one and an
/// empty value is inserted into the values.</description></item>
/// </list>
/// Only the first entry of <paramref name="values"/> is inspected for emptiness
/// (presumably the values arrive sorted, so an empty one can only be first; confirm against callers).
/// </remarks>
/// <param name="field">Field being written.</param>
/// <param name="values">The distinct values.</param>
/// <param name="docToOrd">Per-document ordinals; -1 marks a missing value.</param>
public override void AddSortedField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrd)
{
    // Is there at least one document without a value?
    bool sawMissing = false;
    foreach (long? ordinal in docToOrd)
    {
        if (ordinal.Value == -1)
        {
            sawMissing = true;
            break;
        }
    }

    // Look at the first value only.
    bool firstIsEmpty = false;
    using (IEnumerator<BytesRef> cursor = values.GetEnumerator())
    {
        if (cursor.MoveNext())
        {
            firstIsEmpty = cursor.Current.Length == 0;
        }
    }

    if (sawMissing)
    {
        if (firstIsEmpty)
        {
            // empty string already in use: remap ord=-1 -> ord=0
            docToOrd = MissingOrdRemapper.MapMissingToOrd0(docToOrd);
        }
        else
        {
            // empty string not in use: remap all ords +1, insert empty string into values
            docToOrd = MissingOrdRemapper.MapAllOrds(docToOrd);
            values = MissingOrdRemapper.InsertEmptyValue(values);
        }
    }

    // write the ordinals as numerics
    AddNumericField(field, docToOrd, false);

    // write the values as FST
    WriteFST(field, values);
}
| |
| // note: this might not be the most efficient... but its fairly simple |
/// <summary>
/// Writes a sorted-set field: the per-document ordinals are encoded into one byte[] per document and
/// written as a binary field, then the values are written as an FST.
/// </summary>
/// <param name="field">Field being written.</param>
/// <param name="values">The distinct values.</param>
/// <param name="docToOrdCount">Number of ordinals belonging to each document.</param>
/// <param name="ords">All ordinals, consumed in order according to <paramref name="docToOrdCount"/>.</param>
public override void AddSortedSetField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords)
{
    // write the ordinals as a binary field
    IEnumerable<BytesRef> encodedOrds = new IterableAnonymousClass(docToOrdCount, ords);
    AddBinaryField(field, encodedOrds);

    // write the values as FST
    WriteFST(field, values);
}
| |
/// <summary>
/// Re-enumerable sequence of per-document encoded ordinals. Every call to
/// <see cref="GetEnumerator()"/> obtains new enumerators from both underlying sequences and wraps
/// them in a new <see cref="SortedSetIterator"/>.
/// </summary>
private class IterableAnonymousClass : IEnumerable<BytesRef>
{
    private readonly IEnumerable<long?> perDocCounts;
    private readonly IEnumerable<long?> allOrds;

    /// <summary>Captures the two source sequences; nothing is enumerated here.</summary>
    public IterableAnonymousClass(IEnumerable<long?> docToOrdCount, IEnumerable<long?> ords)
    {
        perDocCounts = docToOrdCount;
        allOrds = ords;
    }

    /// <summary>Starts a new pass over the counts and ordinals.</summary>
    public IEnumerator<BytesRef> GetEnumerator()
    {
        IEnumerator<long?> countCursor = perDocCounts.GetEnumerator();
        IEnumerator<long?> ordCursor = allOrds.GetEnumerator();
        return new SortedSetIterator(countCursor, ordCursor);
    }

    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => GetEnumerator();
}
| |
| // per-document vint-encoded byte[] |
/// <summary>
/// Enumerator that yields, for each document, a <see cref="BytesRef"/> holding that document's
/// ordinals encoded as VInt64 deltas (each ordinal minus the previous one, starting from 0).
/// </summary>
/// <remarks>
/// The same <see cref="BytesRef"/> instance and the same backing buffer are reused for every
/// element, so the content of <see cref="Current"/> is only valid until the next
/// <see cref="MoveNext"/> call.
/// </remarks>
internal class SortedSetIterator : IEnumerator<BytesRef>
{
    // Scratch buffer the ordinals are encoded into; grown on demand in MoveNext().
    internal byte[] buffer = new byte[10];
    internal ByteArrayDataOutput @out = new ByteArrayDataOutput();
    // Reused result object; points at buffer after each successful MoveNext().
    internal BytesRef @ref = new BytesRef();

    internal readonly IEnumerator<long?> counts;
    internal readonly IEnumerator<long?> ords;

    /// <summary>
    /// Wraps the two enumerators. They are disposed when this iterator is disposed.
    /// </summary>
    /// <param name="counts">Yields the number of ordinals for each document.</param>
    /// <param name="ords">Yields the ordinals of all documents, one after another.</param>
    internal SortedSetIterator(IEnumerator<long?> counts, IEnumerator<long?> ords)
    {
        this.counts = counts;
        this.ords = ords;
    }

    /// <summary>
    /// Advances to the next document and encodes its ordinals into <see cref="Current"/>.
    /// </summary>
    /// <returns><c>false</c> once the counts enumerator is exhausted; otherwise <c>true</c>.</returns>
    /// <exception cref="Exception">Wraps an <see cref="IOException"/> thrown while encoding.</exception>
    public bool MoveNext()
    {
        if (!counts.MoveNext())
        {
            return false;
        }

        int count = (int)counts.Current;
        int maxSize = count * 9; //worst case
        if (maxSize > buffer.Length)
        {
            buffer = ArrayUtil.Grow(buffer, maxSize);
        }

        try
        {
            EncodeValues(count);
        }
        catch (IOException bogus)
        {
            throw new Exception(bogus.ToString(), bogus);
        }

        @ref.Bytes = buffer;
        @ref.Offset = 0;
        @ref.Length = @out.Position;

        return true;
    }

    /// <summary>The encoded ordinals of the current document (a reused instance).</summary>
    public BytesRef Current => @ref;

    object System.Collections.IEnumerator.Current => Current;

    /// <summary>
    /// Encodes the next <paramref name="count"/> ordinals into <see cref="buffer"/> as VInt64 deltas.
    /// </summary>
    /// <param name="count">Number of ordinals to pull from the ordinals enumerator.</param>
    internal virtual void EncodeValues(int count)
    {
        @out.Reset(buffer);
        long lastOrd = 0;
        for (int i = 0; i < count; i++)
        {
            ords.MoveNext();
            long ord = ords.Current.Value;
            @out.WriteVInt64(ord - lastOrd);
            lastOrd = ord;
        }
    }

    /// <summary>Not supported by this enumerator.</summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public void Reset()
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Disposes the wrapped count and ordinal enumerators.
    /// </summary>
    /// <remarks>
    /// IterableAnonymousClass.GetEnumerator() creates both enumerators solely for this iterator and
    /// keeps no reference to them, so nothing else can release them. The ordinal enumerator is
    /// disposed even if disposing the count enumerator throws.
    /// </remarks>
    public void Dispose()
    {
        try
        {
            counts?.Dispose();
        }
        finally
        {
            ords?.Dispose();
        }
    }
}
| } |
| #pragma warning restore 612, 618 |
| } |