| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using System; |
| using System.IO; |
| using Lucene.Net.Support; |
| using Lucene.Net.Util; |
| using TokenStream = Lucene.Net.Analysis.TokenStream; |
| using Lucene.Net.Documents; |
| using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException; |
| using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput; |
| using Directory = Lucene.Net.Store.Directory; |
| using IndexInput = Lucene.Net.Store.IndexInput; |
| |
| namespace Lucene.Net.Index |
| { |
| |
| /// <summary> Class responsible for access to stored document fields. |
| /// <p/> |
    /// It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
| /// |
| /// </summary> |
| public sealed class FieldsReader : ICloneable, IDisposable |
| { |
| private readonly FieldInfos fieldInfos; |
| |
| // The main fieldStream, used only for cloning. |
| private readonly IndexInput cloneableFieldsStream; |
| |
| // This is a clone of cloneableFieldsStream used for reading documents. |
| // It should not be cloned outside of a synchronized context. |
| private readonly IndexInput fieldsStream; |
| |
| private readonly IndexInput cloneableIndexStream; |
| private readonly IndexInput indexStream; |
| private readonly int numTotalDocs; |
| private readonly int size; |
| private bool closed; |
| private readonly int format; |
| private readonly int formatSize; |
| |
| // The docID offset where our docs begin in the index |
| // file. This will be 0 if we have our own private file. |
| private readonly int docStoreOffset; |
| |
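        // Per-thread clones of cloneableFieldsStream, created on demand by
        // LazyField.GetFieldStream() and closed via fieldsStreamTL in Dispose().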
| private readonly CloseableThreadLocal<IndexInput> fieldsStreamTL = new CloseableThreadLocal<IndexInput>(); |
| private readonly bool isOriginal = false; |
| |
| /// <summary>Returns a cloned FieldsReader that shares open |
| /// IndexInputs with the original one. It is the caller's |
| /// job not to close the original FieldsReader until all |
        /// clones are closed (eg, currently SegmentReader manages
| /// this logic). |
| /// </summary> |
| public System.Object Clone() |
| { |
| EnsureOpen(); |
| return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream); |
| } |
| |
| // Used only by clone |
| private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize, int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) |
| { |
| this.fieldInfos = fieldInfos; |
| this.numTotalDocs = numTotalDocs; |
| this.size = size; |
| this.format = format; |
| this.formatSize = formatSize; |
| this.docStoreOffset = docStoreOffset; |
| this.cloneableFieldsStream = cloneableFieldsStream; |
| this.cloneableIndexStream = cloneableIndexStream; |
| fieldsStream = (IndexInput) cloneableFieldsStream.Clone(); |
| indexStream = (IndexInput) cloneableIndexStream.Clone(); |
| } |
| |
        public /*internal*/ FieldsReader(Directory d, String segment, FieldInfos fn):this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0)
| { |
| } |
| |
        internal FieldsReader(Directory d, System.String segment, FieldInfos fn, int readBufferSize):this(d, segment, fn, readBufferSize, -1, 0)
| { |
| } |
| |
| internal FieldsReader(Directory d, System.String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) |
| { |
| bool success = false; |
| isOriginal = true; |
| try |
| { |
| fieldInfos = fn; |
| |
| cloneableFieldsStream = d.OpenInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize); |
| cloneableIndexStream = d.OpenInput(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize); |
| |
| // First version of fdx did not include a format |
| // header, but, the first int will always be 0 in that |
| // case |
                int firstInt = cloneableIndexStream.ReadInt();
                format = firstInt;
| |
| if (format > FieldsWriter.FORMAT_CURRENT) |
| throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FieldsWriter.FORMAT_CURRENT + " or lower"); |
| |
| formatSize = format > FieldsWriter.FORMAT ? 4 : 0; |
| |
| if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) |
| cloneableFieldsStream.SetModifiedUTF8StringsMode(); |
| |
| fieldsStream = (IndexInput) cloneableFieldsStream.Clone(); |
| |
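                // The remainder of the .fdx file holds one 8-byte pointer per document,
                // so (file length - header size) / 8 gives the number of documents covered.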
| long indexSize = cloneableIndexStream.Length() - formatSize; |
| |
                if (docStoreOffset != -1)
| { |
| // We read only a slice out of this shared fields file |
| this.docStoreOffset = docStoreOffset; |
| this.size = size; |
| |
| // Verify the file is long enough to hold all of our |
| // docs |
| System.Diagnostics.Debug.Assert(((int)(indexSize / 8)) >= size + this.docStoreOffset, "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset); |
| } |
| else |
| { |
| this.docStoreOffset = 0; |
| this.size = (int) (indexSize >> 3); |
| } |
| |
| indexStream = (IndexInput) cloneableIndexStream.Clone(); |
| numTotalDocs = (int) (indexSize >> 3); |
| success = true; |
| } |
| finally |
| { |
| // With lock-less commits, it's entirely possible (and |
| // fine) to hit a FileNotFound exception above. In |
| // this case, we want to explicitly close any subset |
| // of things that were opened so that we don't have to |
| // wait for a GC to do so. |
| if (!success) |
| { |
| Dispose(); |
| } |
| } |
| } |
| |
| /// <throws> AlreadyClosedException if this FieldsReader is closed </throws> |
| internal void EnsureOpen() |
| { |
| if (closed) |
| { |
| throw new AlreadyClosedException("this FieldsReader is closed"); |
| } |
| } |
| |
        /// <summary> Closes the underlying <see cref="Lucene.Net.Store.IndexInput" /> streams, including any associated with a
        /// lazy implementation of a Field. This means that lazy Field values will no longer be accessible.
| /// |
| /// </summary> |
| /// <throws> IOException </throws> |
| public void Dispose() |
| { |
| // Move to protected method if class becomes unsealed |
| if (!closed) |
| { |
| if (fieldsStream != null) |
| { |
| fieldsStream.Close(); |
| } |
| if (isOriginal) |
| { |
| if (cloneableFieldsStream != null) |
| { |
| cloneableFieldsStream.Close(); |
| } |
| if (cloneableIndexStream != null) |
| { |
| cloneableIndexStream.Close(); |
| } |
| } |
| if (indexStream != null) |
| { |
| indexStream.Close(); |
| } |
| fieldsStreamTL.Close(); |
| closed = true; |
| } |
| } |
| |
| public /*internal*/ int Size() |
| { |
| return size; |
| } |
| |
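        // Positions indexStream at the 8-byte .fdt pointer for docID, skipping the
        // optional format header and offsetting into a shared doc store if needed.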
| private void SeekIndex(int docID) |
| { |
| indexStream.Seek(formatSize + (docID + docStoreOffset) * 8L); |
| } |
| |
| internal bool CanReadRawDocs() |
| { |
| // Disable reading raw docs in 2.x format, because of the removal of compressed |
| // fields in 3.0. We don't want rawDocs() to decode field bits to figure out |
| // if a field was compressed, hence we enforce ordinary (non-raw) stored field merges |
| // for <3.0 indexes. |
| return format >= FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; |
| } |
| |
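        /// <summary> Loads the stored fields for document <c>n</c>. The
        /// <see cref="FieldSelector" />, when non-null, decides per field whether it is
        /// loaded eagerly, lazily, as a size-only stub, or skipped entirely.
        /// </summary>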
| public /*internal*/ Document Doc(int n, FieldSelector fieldSelector) |
| { |
| SeekIndex(n); |
| long position = indexStream.ReadLong(); |
| fieldsStream.Seek(position); |
| |
| var doc = new Document(); |
| int numFields = fieldsStream.ReadVInt(); |
| for (int i = 0; i < numFields; i++) |
| { |
| int fieldNumber = fieldsStream.ReadVInt(); |
| FieldInfo fi = fieldInfos.FieldInfo(fieldNumber); |
                FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.Accept(fi.name);
| |
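                // The status byte packs three flags: compressed (pre-3.0 indexes only),
                // tokenized, and binary; see the FieldsWriter.FIELD_IS_* constants.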
| byte bits = fieldsStream.ReadByte(); |
| System.Diagnostics.Debug.Assert(bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY); |
| |
| bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; |
| System.Diagnostics.Debug.Assert( |
| (!compressed || (format < FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS)), |
| "compressed fields are only allowed in indexes of version <= 2.9"); |
| bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; |
| bool binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; |
                //TODO: Find an alternative approach here if this list continues to grow beyond the
                //list of 5 or 6 currently here. See LUCENE-762 for discussion.
| if (acceptField.Equals(FieldSelectorResult.LOAD)) |
| { |
| AddField(doc, fi, binary, compressed, tokenize); |
| } |
| else if (acceptField.Equals(FieldSelectorResult.LOAD_AND_BREAK)) |
| { |
| AddField(doc, fi, binary, compressed, tokenize); |
| break; //Get out of this loop |
| } |
| else if (acceptField.Equals(FieldSelectorResult.LAZY_LOAD)) |
| { |
| AddFieldLazy(doc, fi, binary, compressed, tokenize); |
| } |
| else if (acceptField.Equals(FieldSelectorResult.SIZE)) |
| { |
| SkipField(binary, compressed, AddFieldSize(doc, fi, binary, compressed)); |
| } |
| else if (acceptField.Equals(FieldSelectorResult.SIZE_AND_BREAK)) |
| { |
| AddFieldSize(doc, fi, binary, compressed); |
| break; |
| } |
| else |
| { |
| SkipField(binary, compressed); |
| } |
| } |
| |
| return doc; |
| } |
| |
| /// <summary>Returns the length in bytes of each raw document in a |
| /// contiguous range of length numDocs starting with |
| /// startDocID. Returns the IndexInput (the fieldStream), |
        /// already positioned at the starting point for startDocID.
| /// </summary> |
| internal IndexInput RawDocs(int[] lengths, int startDocID, int numDocs) |
| { |
| SeekIndex(startDocID); |
| long startOffset = indexStream.ReadLong(); |
| long lastOffset = startOffset; |
| int count = 0; |
| while (count < numDocs) |
| { |
| long offset; |
| int docID = docStoreOffset + startDocID + count + 1; |
| System.Diagnostics.Debug.Assert(docID <= numTotalDocs); |
| if (docID < numTotalDocs) |
| offset = indexStream.ReadLong(); |
| else |
| offset = fieldsStream.Length(); |
| lengths[count++] = (int) (offset - lastOffset); |
| lastOffset = offset; |
| } |
| |
| fieldsStream.Seek(startOffset); |
| |
| return fieldsStream; |
| } |
| |
| /// <summary> Skip the field. We still have to read some of the information about the field, but can skip past the actual content. |
| /// This will have the most payoff on large fields. |
| /// </summary> |
| private void SkipField(bool binary, bool compressed) |
| { |
| SkipField(binary, compressed, fieldsStream.ReadVInt()); |
| } |
| |
| private void SkipField(bool binary, bool compressed, int toRead) |
| { |
| if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) |
| { |
| fieldsStream.Seek(fieldsStream.FilePointer + toRead); |
| } |
| else |
| { |
                // The old format stored string lengths in chars, not bytes, so we cannot
                // seek past the data; skipping chars is slower, but still better than reading it.
                fieldsStream.SkipChars(toRead);
| } |
| } |
| |
| private void AddFieldLazy(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize) |
| { |
| if (binary) |
| { |
| int toRead = fieldsStream.ReadVInt(); |
| long pointer = fieldsStream.FilePointer; |
| //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); |
| doc.Add(new LazyField(this, fi.name, Field.Store.YES, toRead, pointer, binary, compressed)); |
| |
                //Need to move the pointer ahead by toRead bytes
| fieldsStream.Seek(pointer + toRead); |
| } |
| else |
| { |
| const Field.Store store = Field.Store.YES; |
| Field.Index index = FieldExtensions.ToIndex(fi.isIndexed, tokenize); |
| Field.TermVector termVector = FieldExtensions.ToTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); |
| |
| AbstractField f; |
| if (compressed) |
| { |
| int toRead = fieldsStream.ReadVInt(); |
| long pointer = fieldsStream.FilePointer; |
| f = new LazyField(this, fi.name, store, toRead, pointer, binary, compressed); |
| //skip over the part that we aren't loading |
| fieldsStream.Seek(pointer + toRead); |
| f.OmitNorms = fi.omitNorms; |
| f.OmitTermFreqAndPositions = fi.omitTermFreqAndPositions; |
| } |
| else |
| { |
| int length = fieldsStream.ReadVInt(); |
| long pointer = fieldsStream.FilePointer; |
| //Skip ahead of where we are by the length of what is stored |
| if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) |
| { |
| fieldsStream.Seek(pointer + length); |
| } |
| else |
| { |
| fieldsStream.SkipChars(length); |
| } |
| f = new LazyField(this, fi.name, store, index, termVector, length, pointer, binary, compressed) |
| {OmitNorms = fi.omitNorms, OmitTermFreqAndPositions = fi.omitTermFreqAndPositions}; |
| } |
| |
| doc.Add(f); |
| } |
| } |
| |
| private void AddField(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize) |
| { |
| //we have a binary stored field, and it may be compressed |
| if (binary) |
| { |
| int toRead = fieldsStream.ReadVInt(); |
| var b = new byte[toRead]; |
| fieldsStream.ReadBytes(b, 0, b.Length); |
| doc.Add(compressed ? new Field(fi.name, Uncompress(b), Field.Store.YES) : new Field(fi.name, b, Field.Store.YES)); |
| } |
| else |
| { |
| const Field.Store store = Field.Store.YES; |
| Field.Index index = FieldExtensions.ToIndex(fi.isIndexed, tokenize); |
| Field.TermVector termVector = FieldExtensions.ToTermVector(fi.storeTermVector, fi.storeOffsetWithTermVector, fi.storePositionWithTermVector); |
| |
| AbstractField f; |
| if (compressed) |
| { |
| int toRead = fieldsStream.ReadVInt(); |
| |
| var b = new byte[toRead]; |
| fieldsStream.ReadBytes(b, 0, b.Length); |
| f = new Field(fi.name, false, System.Text.Encoding.GetEncoding("UTF-8").GetString(Uncompress(b)), store, index, |
| termVector) {OmitTermFreqAndPositions = fi.omitTermFreqAndPositions, OmitNorms = fi.omitNorms}; |
| } |
| else |
| { |
| f = new Field(fi.name, false, fieldsStream.ReadString(), store, index, termVector) |
| {OmitTermFreqAndPositions = fi.omitTermFreqAndPositions, OmitNorms = fi.omitNorms}; |
| } |
| |
| doc.Add(f); |
| } |
| } |
| |
        // Adds the size of the field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes).
        // Reads just the size -- the caller must skip the field content to continue reading fields.
        // Returns the size in bytes or chars, depending on the field type.
| private int AddFieldSize(Document doc, FieldInfo fi, bool binary, bool compressed) |
| { |
            int size = fieldsStream.ReadVInt();
            int bytesize = binary || compressed ? size : 2 * size;
| var sizebytes = new byte[4]; |
| sizebytes[0] = (byte) (Number.URShift(bytesize, 24)); |
| sizebytes[1] = (byte) (Number.URShift(bytesize, 16)); |
| sizebytes[2] = (byte) (Number.URShift(bytesize, 8)); |
| sizebytes[3] = (byte) bytesize; |
| doc.Add(new Field(fi.name, sizebytes, Field.Store.YES)); |
| return size; |
| } |
| |
        /// <summary> A Lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
| /// loaded. |
| /// </summary> |
| [Serializable] |
| private sealed class LazyField : AbstractField |
| { |
| private void InitBlock(FieldsReader enclosingInstance) |
| { |
| this.Enclosing_Instance = enclosingInstance; |
| } |
| |
| private FieldsReader Enclosing_Instance { get; set; } |
| |
| private int toRead; |
| private long pointer; |
| [Obsolete("Only kept for backward-compatbility with <3.0 indexes. Will be removed in 4.0.")] |
| private readonly Boolean isCompressed; |
| |
| public LazyField(FieldsReader enclosingInstance, System.String name, Field.Store store, int toRead, long pointer, bool isBinary, bool isCompressed):base(name, store, Field.Index.NO, Field.TermVector.NO) |
| { |
| InitBlock(enclosingInstance); |
| this.toRead = toRead; |
| this.pointer = pointer; |
| this.internalIsBinary = isBinary; |
| if (isBinary) |
| internalBinaryLength = toRead; |
| lazy = true; |
| this.isCompressed = isCompressed; |
| } |
| |
| public LazyField(FieldsReader enclosingInstance, System.String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, bool isBinary, bool isCompressed):base(name, store, index, termVector) |
| { |
| InitBlock(enclosingInstance); |
| this.toRead = toRead; |
| this.pointer = pointer; |
| this.internalIsBinary = isBinary; |
| if (isBinary) |
| internalBinaryLength = toRead; |
| lazy = true; |
| this.isCompressed = isCompressed; |
| } |
| |
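            // IndexInput is not thread-safe, so each thread lazily clones its own
            // fields stream; the clones are tracked in fieldsStreamTL and closed
            // when the enclosing FieldsReader is disposed.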
| private IndexInput GetFieldStream() |
| { |
| IndexInput localFieldsStream = Enclosing_Instance.fieldsStreamTL.Get(); |
| if (localFieldsStream == null) |
| { |
| localFieldsStream = (IndexInput) Enclosing_Instance.cloneableFieldsStream.Clone(); |
| Enclosing_Instance.fieldsStreamTL.Set(localFieldsStream); |
| } |
| return localFieldsStream; |
| } |
| |
| /// <summary>The value of the field as a Reader, or null. If null, the String value, |
| /// binary value, or TokenStream value is used. Exactly one of StringValue(), |
| /// ReaderValue(), GetBinaryValue(), and TokenStreamValue() must be set. |
| /// </summary> |
| public override TextReader ReaderValue |
| { |
| get |
| { |
| Enclosing_Instance.EnsureOpen(); |
| return null; |
| } |
| } |
| |
| /// <summary>The value of the field as a TokenStream, or null. If null, the Reader value, |
| /// String value, or binary value is used. Exactly one of StringValue(), |
| /// ReaderValue(), GetBinaryValue(), and TokenStreamValue() must be set. |
| /// </summary> |
| public override TokenStream TokenStreamValue |
| { |
| get |
| { |
| Enclosing_Instance.EnsureOpen(); |
| return null; |
| } |
| } |
| |
| /// <summary>The value of the field as a String, or null. If null, the Reader value, |
| /// binary value, or TokenStream value is used. Exactly one of StringValue(), |
| /// ReaderValue(), GetBinaryValue(), and TokenStreamValue() must be set. |
| /// </summary> |
| public override string StringValue |
| { |
| get |
| { |
| Enclosing_Instance.EnsureOpen(); |
| if (internalIsBinary) |
| return null; |
| |
| if (fieldsData == null) |
| { |
| IndexInput localFieldsStream = GetFieldStream(); |
| try |
| { |
| localFieldsStream.Seek(pointer); |
| if (isCompressed) |
| { |
| var b = new byte[toRead]; |
| localFieldsStream.ReadBytes(b, 0, b.Length); |
| fieldsData = |
| System.Text.Encoding.GetEncoding("UTF-8").GetString(Enclosing_Instance.Uncompress(b)); |
| } |
| else |
| { |
| if (Enclosing_Instance.format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) |
| { |
| var bytes = new byte[toRead]; |
| localFieldsStream.ReadBytes(bytes, 0, toRead); |
| fieldsData = System.Text.Encoding.GetEncoding("UTF-8").GetString(bytes); |
| } |
| else |
| { |
| //read in chars b/c we already know the length we need to read |
| var chars = new char[toRead]; |
| localFieldsStream.ReadChars(chars, 0, toRead); |
| fieldsData = new System.String(chars); |
| } |
| } |
| } |
| catch (System.IO.IOException e) |
| { |
| throw new FieldReaderException(e); |
| } |
| } |
| return (System.String) fieldsData; |
| } |
| } |
| |
| public long Pointer |
| { |
| get |
| { |
| Enclosing_Instance.EnsureOpen(); |
| return pointer; |
| } |
| set |
| { |
| Enclosing_Instance.EnsureOpen(); |
| this.pointer = value; |
| } |
| } |
| |
| public int ToRead |
| { |
| get |
| { |
| Enclosing_Instance.EnsureOpen(); |
| return toRead; |
| } |
| set |
| { |
| Enclosing_Instance.EnsureOpen(); |
| this.toRead = value; |
| } |
| } |
| |
| public override byte[] GetBinaryValue(byte[] result) |
| { |
| Enclosing_Instance.EnsureOpen(); |
| |
| if (internalIsBinary) |
| { |
| if (fieldsData == null) |
| { |
| // Allocate new buffer if result is null or too small |
| byte[] b; |
| if (result == null || result.Length < toRead) |
| b = new byte[toRead]; |
| else |
| b = result; |
| |
| IndexInput localFieldsStream = GetFieldStream(); |
| |
                    // Throwing here is acceptable: IndexReader.document already throws
                    // IOException, so callers handle failures when loading a document anyway.
| try |
| { |
| localFieldsStream.Seek(pointer); |
| localFieldsStream.ReadBytes(b, 0, toRead); |
| fieldsData = isCompressed ? Enclosing_Instance.Uncompress(b) : b; |
| } |
| catch (IOException e) |
| { |
| throw new FieldReaderException(e); |
| } |
| |
| internalbinaryOffset = 0; |
| internalBinaryLength = toRead; |
| } |
| |
| return (byte[]) fieldsData; |
| } |
| return null; |
| } |
| } |
| |
| private byte[] Uncompress(byte[] b) |
| { |
| try |
| { |
| return CompressionTools.Decompress(b); |
| } |
| catch (Exception e) |
| { |
| // this will happen if the field is not compressed |
                throw new CorruptIndexException("field data is in the wrong format: " + e, e);
| } |
| } |
| } |
| } |