| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using System; |
| using Lucene.Net.Analysis.Tokenattributes; |
| using Lucene.Net.Documents; |
| using IndexOutput = Lucene.Net.Store.IndexOutput; |
| using UnicodeUtil = Lucene.Net.Util.UnicodeUtil; |
| |
| namespace Lucene.Net.Index |
| { |
| |
/// <summary>
/// Per-field consumer of the terms hash that collects term frequency,
/// position and offset data while a field is inverted and, in
/// <see cref="Finish"/>, writes that data for the current document to
/// the per-document term vectors output.
/// </summary>
sealed class TermVectorsTermsWriterPerField : TermsHashConsumerPerField
{
    // Byte stream (within the terms hash) that receives position deltas.
    private const int PositionStream = 0;

    // Byte stream (within the terms hash) that receives offset deltas and lengths.
    private const int OffsetStream = 1;

    // Number of byte streams this consumer writes per term.
    private const int StreamCount = 2;

    internal TermVectorsTermsWriterPerThread perThread;
    internal TermsHashPerField termsHashPerField;
    internal TermVectorsTermsWriter termsWriter;
    internal FieldInfo fieldInfo;
    internal DocumentsWriter.DocState docState;
    internal FieldInvertState fieldState;

    // Flags recomputed for every document by Start(IFieldable[], int).
    internal bool doVectors;
    internal bool doVectorPositions;
    internal bool doVectorOffsets;

    // Largest posting count seen by Finish() since the last ShrinkHash().
    internal int maxNumPostings;

    // Set by Start(IFieldable); null whenever offsets are not being recorded.
    internal IOffsetAttribute offsetAttribute = null;

    /// <summary>
    /// Wires this consumer to its terms hash, its owning per-thread writer
    /// and the field it handles. The doc state and field state are shared
    /// with <paramref name="termsHashPerField"/>.
    /// </summary>
    /// <param name="termsHashPerField">Terms hash that feeds this consumer.</param>
    /// <param name="perThread">Owning per-thread term vectors writer.</param>
    /// <param name="fieldInfo">Description of the field being processed.</param>
    public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo)
    {
        this.termsHashPerField = termsHashPerField;
        this.perThread = perThread;
        this.termsWriter = perThread.termsWriter;
        this.fieldInfo = fieldInfo;
        this.docState = termsHashPerField.docState;
        this.fieldState = termsHashPerField.fieldState;
    }

    /// <summary>Returns the number of byte streams used per term (positions and offsets).</summary>
    internal override int GetStreamCount()
    {
        return StreamCount;
    }

    /// <summary>
    /// Inspects the first <paramref name="count"/> instances of this field in
    /// the current document and decides whether term vectors (and, if so,
    /// positions and/or offsets) must be recorded.
    /// </summary>
    /// <param name="fields">Field instances of the current document.</param>
    /// <param name="count">Number of leading entries of <paramref name="fields"/> to examine.</param>
    /// <returns><c>true</c> if at least one examined instance is indexed and stores term vectors.</returns>
    internal override bool Start(IFieldable[] fields, int count)
    {
        doVectors = false;
        doVectorPositions = false;
        doVectorOffsets = false;

        for (int index = 0; index < count; index++)
        {
            IFieldable candidate = fields[index];
            if (!candidate.IsIndexed || !candidate.IsTermVectorStored)
            {
                continue;
            }

            // One qualifying instance is enough to enable vectors; the
            // position/offset flags accumulate across all instances.
            doVectors = true;
            doVectorPositions |= candidate.IsStorePositionWithTermVector;
            doVectorOffsets |= candidate.IsStoreOffsetWithTermVector;
        }

        if (doVectors)
        {
            PrepareDocumentForVectors();
        }

        // TODO: only if needed for performance
        //perThread.postingsCount = 0;

        return doVectors;
    }

    /// <summary>
    /// Makes sure the per-thread document holder exists for the current
    /// document and discards any postings still buffered for this field.
    /// </summary>
    private void PrepareDocumentForVectors()
    {
        if (perThread.doc == null)
        {
            perThread.doc = termsWriter.GetPerDoc();
            perThread.doc.docID = docState.docID;
            System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
            System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
            System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
        }

        System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);

        if (termsHashPerField.numPostings != 0)
        {
            // Leftover postings are only expected when the previous
            // document hit a non-aborting exception while writing
            // vectors in this field.
            termsHashPerField.Reset();
            perThread.termsHashPerThread.Reset(false);
        }
    }

    /// <summary>Intentionally a no-op.</summary>
    public void Abort()
    {
    }

    /// <summary>
    /// Called once per field per document if term vectors are enabled, to
    /// write the vectors to RAMOutputStream, which is then quickly flushed
    /// to the real term vectors files in the Directory.
    /// </summary>
    internal override void Finish()
    {
        System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

        int termCount = termsHashPerField.numPostings;

        System.Diagnostics.Debug.Assert(termCount >= 0);

        bool nothingToWrite = !doVectors || termCount == 0;
        if (nothingToWrite)
        {
            return;
        }

        // Remember the high-water mark so ShrinkHash() can size the hash.
        if (termCount > maxNumPostings)
        {
            maxNumPostings = termCount;
        }

        IndexOutput output = perThread.doc.perDocTvf;

        // This runs once, after all occurrences of the field in the
        // document have been inverted; the hash is now flushed into
        // the per-document output.

        System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
        System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

        perThread.doc.AddField(termsHashPerField.fieldInfo.number);

        RawPostingList[] sortedPostings = termsHashPerField.SortPostings();

        // Field header: number of terms followed by the flags byte.
        output.WriteVInt(termCount);
        output.WriteByte(EncodeVectorFlags());

        ByteSliceReader sliceReader = perThread.vectorSliceReader;
        char[][] charBlocks = perThread.termsHashPerThread.charPool.buffers;

        // Two UTF-8 encoders are used alternately so the previous term's
        // bytes stay available for prefix comparison without copying.
        int activeEncoder = 0;
        int previousLength = 0;

        for (int termIndex = 0; termIndex < termCount; termIndex++)
        {
            TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) sortedPostings[termIndex];

            // Locate the term's characters inside the shared char pool.
            char[] block = charBlocks[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            int blockOffset = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

            UnicodeUtil.UTF8Result encoded = perThread.utf8Results[activeEncoder];

            // TODO: we could do this incrementally
            UnicodeUtil.UTF16toUTF8(block, blockOffset, encoded);
            int encodedLength = encoded.length;

            // TODO: UTF16toUTF8 could tell us this prefix
            // The first term has no predecessor, so its prefix is 0.
            int sharedPrefix = 0;
            if (termIndex > 0)
            {
                UnicodeUtil.UTF8Result previous = perThread.utf8Results[1 - activeEncoder];
                sharedPrefix = SharedPrefixLength(previous.result, previousLength, encoded.result, encodedLength);
            }

            // Swap encoders for the next iteration.
            activeEncoder = 1 - activeEncoder;
            previousLength = encodedLength;

            int suffixLength = encodedLength - sharedPrefix;
            output.WriteVInt(sharedPrefix);
            output.WriteVInt(suffixLength);
            output.WriteBytes(encoded.result, sharedPrefix, suffixLength);
            output.WriteVInt(posting.freq);

            if (doVectorPositions)
            {
                CopyStream(sliceReader, posting, PositionStream, output);
            }

            if (doVectorOffsets)
            {
                CopyStream(sliceReader, posting, OffsetStream, output);
            }
        }

        termsHashPerField.Reset();

        // NOTE: we clear, per-field, at the thread level,
        // because term vectors fully write themselves on each
        // field; this saves RAM (eg if large doc has two large
        // fields w/ term vectors on) because we recycle/reuse
        // all RAM after each field:
        perThread.termsHashPerThread.Reset(false);
    }

    /// <summary>Builds the flags byte that records whether positions and/or offsets are stored.</summary>
    private byte EncodeVectorFlags()
    {
        byte flags = 0;
        if (doVectorPositions)
        {
            flags |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
        }
        if (doVectorOffsets)
        {
            flags |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
        }
        return flags;
    }

    /// <summary>
    /// Counts how many leading bytes two byte sequences have in common,
    /// looking only at the first <paramref name="previousLength"/> and
    /// <paramref name="currentLength"/> bytes respectively.
    /// </summary>
    private static int SharedPrefixLength(byte[] previous, int previousLength, byte[] current, int currentLength)
    {
        int limit = Math.Min(previousLength, currentLength);
        int matched = 0;
        while (matched < limit && previous[matched] == current[matched])
        {
            matched++;
        }
        return matched;
    }

    /// <summary>Positions the reader on one stream of a posting and copies that stream to the output.</summary>
    private void CopyStream(ByteSliceReader reader, TermVectorsTermsWriter.PostingList posting, int stream, IndexOutput output)
    {
        termsHashPerField.InitReader(reader, posting, stream);
        reader.WriteTo(output);
    }

    /// <summary>
    /// Asks the terms hash to shrink, passing the largest posting count
    /// recorded so far, then clears that high-water mark.
    /// </summary>
    internal void ShrinkHash()
    {
        termsHashPerField.ShrinkHash(maxNumPostings);
        maxNumPostings = 0;
    }

    /// <summary>
    /// Called per field instance: obtains the offset attribute from the
    /// field's attribute source when offsets are recorded, otherwise
    /// clears it.
    /// </summary>
    /// <param name="f">The field instance about to be processed (not read here).</param>
    internal override void Start(IFieldable f)
    {
        offsetAttribute = doVectorOffsets
            ? fieldState.attributeSource.AddAttribute<IOffsetAttribute>()
            : null;
    }

    /// <summary>
    /// Handles the first occurrence of a term in the current field:
    /// sets its frequency to 1 and records the occurrence with absolute
    /// offset/position values.
    /// </summary>
    /// <param name="p0">Posting of the newly seen term.</param>
    internal override void NewTerm(RawPostingList p0)
    {
        System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));

        TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) p0;
        posting.freq = 1;

        WriteOccurrence(posting, true);
    }

    /// <summary>
    /// Handles a repeated occurrence of a term: increments its frequency
    /// and records the occurrence as deltas from the previous one.
    /// </summary>
    /// <param name="p0">Posting of the already known term.</param>
    internal override void AddTerm(RawPostingList p0)
    {
        System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

        TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) p0;
        posting.freq++;

        WriteOccurrence(posting, false);
    }

    /// <summary>
    /// Appends the current token's offsets and/or position to the term's
    /// byte streams and updates the posting's last-seen values.
    /// </summary>
    /// <param name="posting">Posting being updated.</param>
    /// <param name="firstOccurrence">
    /// When <c>true</c> the start offset and position are written as-is;
    /// otherwise they are written relative to the posting's last end
    /// offset and last position.
    /// </param>
    private void WriteOccurrence(TermVectorsTermsWriter.PostingList posting, bool firstOccurrence)
    {
        if (doVectorOffsets)
        {
            int startOffset = fieldState.offset + offsetAttribute.StartOffset;
            int endOffset = fieldState.offset + offsetAttribute.EndOffset;
            int offsetBase = firstOccurrence ? 0 : posting.lastOffset;

            termsHashPerField.WriteVInt(OffsetStream, startOffset - offsetBase);
            termsHashPerField.WriteVInt(OffsetStream, endOffset - startOffset);
            posting.lastOffset = endOffset;
        }

        if (doVectorPositions)
        {
            int positionBase = firstOccurrence ? 0 : posting.lastPosition;

            termsHashPerField.WriteVInt(PositionStream, fieldState.position - positionBase);
            posting.lastPosition = fieldState.position;
        }
    }

    /// <summary>Intentionally a no-op.</summary>
    internal override void SkippingLongTerm()
    {
    }
}
| } |