| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using System; |
| |
| using Directory = Lucene.Net.Store.Directory; |
| using IndexOutput = Lucene.Net.Store.IndexOutput; |
| using StringHelper = Lucene.Net.Util.StringHelper; |
| using UnicodeUtil = Lucene.Net.Util.UnicodeUtil; |
| |
| namespace Lucene.Net.Index |
| { |
/// <summary> Writes term vectors for one segment into three outputs created in the
/// supplied directory: an index output (tvx), a documents output (tvd) and a
/// fields output (tvf). Each output starts with <c>TermVectorsReader.FORMAT_CURRENT</c>.
/// </summary>
sealed class TermVectorsWriter : IDisposable
{
    private readonly IndexOutput tvx;
    private readonly IndexOutput tvd;
    private readonly IndexOutput tvf;
    private readonly FieldInfos fieldInfos;

    // Two scratch buffers used alternately so the current term's UTF-8 bytes can be
    // compared against the previous term's bytes for prefix sharing.
    internal UnicodeUtil.UTF8Result[] utf8Results = new[]{new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()};

    /// <summary> Creates the three term vector outputs for <paramref name="segment"/> and
    /// writes the format header to each. If any step fails, the outputs that were
    /// already opened are closed before the exception propagates.
    /// </summary>
    /// <param name="directory">directory in which the outputs are created</param>
    /// <param name="segment">segment name used as the file name prefix</param>
    /// <param name="fieldInfos">used to map field names to field numbers</param>
    public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
    {
        bool success = false;
        try
        {
            // Open files for TermVector storage
            tvx = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
            tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            tvd = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
            tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            tvf = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
            tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            success = true;
        }
        finally
        {
            if (!success)
            {
                // Best effort: release whatever was opened. IOExceptions raised while
                // closing are intentionally dropped so the original failure is the one
                // the caller sees.
                CloseKeepingFirstError(tvx, null);
                CloseKeepingFirstError(tvd, null);
                CloseKeepingFirstError(tvf, null);
            }
        }

        this.fieldInfos = fieldInfos;
    }

    /// <summary> Add a complete document specified by all its term vectors. If document has no
    /// term vectors, add value for tvx.
    /// </summary>
    /// <param name="vectors">term vectors of the document, one per field; may be null, in
    /// which case a field count of zero is written to tvd</param>
    /// <throws> IOException </throws>
    public void AddAllDocVectors(ITermFreqVector[] vectors)
    {
        // Every document gets an index entry pointing at its tvd and tvf start positions.
        tvx.WriteLong(tvd.FilePointer);
        tvx.WriteLong(tvf.FilePointer);

        if (vectors == null)
        {
            tvd.WriteVInt(0);
            return;
        }

        int numFields = vectors.Length;
        tvd.WriteVInt(numFields);

        var fieldPointers = new long[numFields];

        for (int i = 0; i < numFields; i++)
        {
            fieldPointers[i] = tvf.FilePointer;

            // 1st pass: write field numbers to tvd
            tvd.WriteVInt(fieldInfos.FieldNumber(vectors[i].Field));

            WriteFieldVector(vectors[i]);
        }

        // 2nd pass: write field pointers to tvd, each as a delta from the previous
        // field's pointer (nothing is written for the first field).
        for (int i = 1; i < numFields; i++)
        {
            tvd.WriteVLong(fieldPointers[i] - fieldPointers[i - 1]);
        }
    }

    /// <summary> Writes one field's term vector to tvf: term count, flag bits, then for each
    /// term its prefix-compressed UTF-8 bytes, frequency and optional positions/offsets.
    /// </summary>
    private void WriteFieldVector(ITermFreqVector vector)
    {
        int numTerms = vector.Size;
        tvf.WriteVInt(numTerms);

        // Only a TermPositionVector may have positions & offsets; whether they are
        // stored is decided from the first term.
        var tpVector = vector as TermPositionVector;
        bool storePositions = false;
        bool storeOffsets = false;
        if (tpVector != null)
        {
            storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
            storeOffsets = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
        }

        byte bits = (byte) ((storePositions?TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR: (byte) 0) + (storeOffsets?TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR: (byte) 0));
        tvf.WriteVInt(bits);

        System.String[] terms = vector.GetTerms();
        int[] freqs = vector.GetTermFrequencies();

        int utf8Upto = 0;
        // The first term has no predecessor, so the "previous" buffer starts out empty.
        utf8Results[1].length = 0;

        for (int j = 0; j < numTerms; j++)
        {
            UnicodeUtil.UTF8Result current = utf8Results[utf8Upto];
            UnicodeUtil.UTF8Result previous = utf8Results[1 - utf8Upto];

            UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, current);

            int start = StringHelper.BytesDifference(previous.result, previous.length, current.result, current.length);
            int length = current.length - start;
            tvf.WriteVInt(start); // write shared prefix length
            tvf.WriteVInt(length); // write delta length
            tvf.WriteBytes(current.result, start, length); // write delta bytes

            // Swap buffers: the term just written becomes "previous" for the next one.
            utf8Upto = 1 - utf8Upto;

            int termFreq = freqs[j];
            tvf.WriteVInt(termFreq);

            if (storePositions)
            {
                WritePositions(tpVector.GetTermPositions(j), termFreq);
            }

            if (storeOffsets)
            {
                WriteOffsets(tpVector.GetOffsets(j), termFreq);
            }
        }
    }

    /// <summary> Writes a term's positions to tvf, each as a delta from the previous position. </summary>
    private void WritePositions(int[] positions, int termFreq)
    {
        if (positions == null)
        {
            throw new System.SystemException("Trying to write positions that are null!");
        }
        System.Diagnostics.Debug.Assert(positions.Length == termFreq);

        // use delta encoding for positions
        int lastPosition = 0;
        foreach (int position in positions)
        {
            tvf.WriteVInt(position - lastPosition);
            lastPosition = position;
        }
    }

    /// <summary> Writes a term's offsets to tvf: for each occurrence the start offset relative
    /// to the previous end offset, followed by the occurrence length (end - start).
    /// </summary>
    private void WriteOffsets(TermVectorOffsetInfo[] offsets, int termFreq)
    {
        if (offsets == null)
        {
            throw new System.SystemException("Trying to write offsets that are null!");
        }
        System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

        // use delta encoding for offsets
        int lastEndOffset = 0;
        foreach (TermVectorOffsetInfo t in offsets)
        {
            int startOffset = t.StartOffset;
            int endOffset = t.EndOffset;
            tvf.WriteVInt(startOffset - lastEndOffset);
            tvf.WriteVInt(endOffset - startOffset);
            lastEndOffset = endOffset;
        }
    }

    /// <summary> Do a bulk copy of numDocs documents from reader to our
    /// streams. This is used to expedite merging, if the
    /// field numbers are congruent.
    /// </summary>
    /// <param name="reader">source whose tvd and tvf streams are copied from</param>
    /// <param name="tvdLengths">number of tvd bytes per document</param>
    /// <param name="tvfLengths">number of tvf bytes per document</param>
    /// <param name="numDocs">number of documents to copy</param>
    internal void AddRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs)
    {
        long tvdPosition = tvd.FilePointer;
        long tvfPosition = tvf.FilePointer;
        long tvdStart = tvdPosition;
        long tvfStart = tvfPosition;

        // Write the index entries first, computing where each document will land
        // once the raw bytes have been appended.
        for (int i = 0; i < numDocs; i++)
        {
            tvx.WriteLong(tvdPosition);
            tvdPosition += tvdLengths[i];
            tvx.WriteLong(tvfPosition);
            tvfPosition += tvfLengths[i];
        }

        tvd.CopyBytes(reader.GetTvdStream(), tvdPosition - tvdStart);
        tvf.CopyBytes(reader.GetTvfStream(), tvfPosition - tvfStart);
        System.Diagnostics.Debug.Assert(tvd.FilePointer == tvdPosition);
        System.Diagnostics.Debug.Assert(tvf.FilePointer == tvfPosition);
    }

    /// <summary>Close all streams. </summary>
    /// <remarks> Every stream is closed even if closing an earlier one fails. The first
    /// IOException encountered is then re-thrown wrapped in a new IOException that
    /// carries the same message and keeps the original as its inner exception.
    /// </remarks>
    public void Dispose()
    {
        // Move to a protected method if class becomes unsealed

        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        System.IO.IOException keep = null;
        keep = CloseKeepingFirstError(tvx, keep);
        keep = CloseKeepingFirstError(tvd, keep);
        keep = CloseKeepingFirstError(tvf, keep);

        if (keep != null)
        {
            // Preserve the original message and chain the original exception so its
            // stack trace stays available to the caller.
            throw new System.IO.IOException(keep.Message, keep);
        }
    }

    /// <summary> Closes <paramref name="output"/> if it is not null. Returns
    /// <paramref name="first"/> when it is already set; otherwise returns the IOException
    /// thrown by the close, or null if the close succeeded.
    /// </summary>
    private static System.IO.IOException CloseKeepingFirstError(IndexOutput output, System.IO.IOException first)
    {
        if (output == null)
        {
            return first;
        }

        try
        {
            output.Close();
        }
        catch (System.IO.IOException e)
        {
            if (first == null)
            {
                first = e;
            }
        }
        return first;
    }
}
| } |