/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
namespace Lucene.Net.Index
{
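/// <summary>Per-field consumer that inverts the term vector data for a
/// single field into two in-memory byte streams (positions and offsets),
/// then flushes them to the per-document tvf buffer in Finish.</summary>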
sealed class TermVectorsTermsWriterPerField : TermsHashConsumerPerField
{
internal TermVectorsTermsWriterPerThread perThread;
internal TermsHashPerField termsHashPerField;
internal TermVectorsTermsWriter termsWriter;
internal FieldInfo fieldInfo;
internal DocumentsWriter.DocState docState;
internal FieldInvertState fieldState;
internal bool doVectors;
internal bool doVectorPositions;
internal bool doVectorOffsets;
internal int maxNumPostings;
internal IOffsetAttribute offsetAttribute = null;
public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo)
{
this.termsHashPerField = termsHashPerField;
this.perThread = perThread;
this.termsWriter = perThread.termsWriter;
this.fieldInfo = fieldInfo;
docState = termsHashPerField.docState;
fieldState = termsHashPerField.fieldState;
}
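/// <summary>Two byte streams are written per posting: stream 0 holds
/// positions, stream 1 holds offsets.</summary>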
internal override int GetStreamCount()
{
return 2;
}
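/// <summary>Looks at every instance of this field in the document and
/// enables vector, position and offset writing as requested; returns
/// true if term vectors must be written for this field.</summary>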
internal override bool Start(IFieldable[] fields, int count)
{
doVectors = false;
doVectorPositions = false;
doVectorOffsets = false;
for (int i = 0; i < count; i++)
{
IFieldable field = fields[i];
if (field.IsIndexed && field.IsTermVectorStored)
{
doVectors = true;
doVectorPositions |= field.IsStorePositionWithTermVector;
doVectorOffsets |= field.IsStoreOffsetWithTermVector;
}
}
if (doVectors)
{
if (perThread.doc == null)
{
perThread.doc = termsWriter.GetPerDoc();
perThread.doc.docID = docState.docID;
System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
}
System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);
if (termsHashPerField.numPostings != 0)
{
// Only necessary if previous doc hit a
// non-aborting exception while writing vectors in
// this field:
termsHashPerField.Reset();
perThread.termsHashPerThread.Reset(false);
}
}
// TODO: only if needed for performance
//perThread.postingsCount = 0;
return doVectors;
}
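/// <summary>No per-field state needs to be discarded on abort.</summary>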
public void Abort()
{
}
/// <summary>Called once per field per document, if term vectors
/// are enabled, to write the vectors to a RAMOutputStream, which is
/// then quickly flushed to the real term vectors files in the
/// Directory.</summary>
internal override void Finish()
{
System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));
int numPostings = termsHashPerField.numPostings;
System.Diagnostics.Debug.Assert(numPostings >= 0);
if (!doVectors || numPostings == 0)
return;
if (numPostings > maxNumPostings)
maxNumPostings = numPostings;
IndexOutput tvf = perThread.doc.perDocTvf;
// This is called once, after inverting all occurrences
// of a given field in the doc. At this point we flush
// our hash into the DocWriter.
System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));
perThread.doc.AddField(termsHashPerField.fieldInfo.number);
RawPostingList[] postings = termsHashPerField.SortPostings();
tvf.WriteVInt(numPostings);
byte bits = 0;
if (doVectorPositions)
bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
if (doVectorOffsets)
bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
tvf.WriteByte(bits);
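// Each term is written as: VInt prefix length, VInt suffix length,
// the suffix bytes, VInt freq, then (optionally) the position and
// offset deltas accumulated in streams 0 and 1.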
int encoderUpto = 0;
int lastTermBytesCount = 0;
ByteSliceReader reader = perThread.vectorSliceReader;
char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
for (int j = 0; j < numPostings; j++)
{
TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
int freq = posting.freq;
char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
// We swap between two encoders to save copying
// last Term's byte array
UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
// TODO: we could do this incrementally
UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
int termBytesCount = utf8Result.length;
// TODO: UTF16toUTF8 could tell us this prefix
// Compute common prefix between last term and
// this term
int prefix = 0;
if (j > 0)
{
byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
byte[] termBytes = perThread.utf8Results[encoderUpto].result;
while (prefix < lastTermBytesCount && prefix < termBytesCount)
{
if (lastTermBytes[prefix] != termBytes[prefix])
break;
prefix++;
}
}
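// e.g. if the previous term's UTF-8 bytes were "appl" + "e" and this
// term's were "appl" + "y", prefix would be 4 and only the one-byte
// suffix "y" would be written below.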
encoderUpto = 1 - encoderUpto;
lastTermBytesCount = termBytesCount;
int suffix = termBytesCount - prefix;
tvf.WriteVInt(prefix);
tvf.WriteVInt(suffix);
tvf.WriteBytes(utf8Result.result, prefix, suffix);
tvf.WriteVInt(freq);
if (doVectorPositions)
{
termsHashPerField.InitReader(reader, posting, 0);
reader.WriteTo(tvf);
}
if (doVectorOffsets)
{
termsHashPerField.InitReader(reader, posting, 1);
reader.WriteTo(tvf);
}
}
termsHashPerField.Reset();
// NOTE: we clear, per-field, at the thread level,
// because term vectors fully write themselves on each
// field; this saves RAM (e.g. if a large doc has two large
// fields with term vectors on) because we recycle/reuse
// all RAM after each field:
perThread.termsHashPerThread.Reset(false);
}
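/// <summary>Shrinks the postings hash, passing the peak posting count
/// observed since the last call, then resets that peak.</summary>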
internal void ShrinkHash()
{
termsHashPerField.ShrinkHash(maxNumPostings);
maxNumPostings = 0;
}
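/// <summary>Caches the offset attribute from the field's attribute
/// source when offsets are being written.</summary>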
internal override void Start(IFieldable f)
{
if (doVectorOffsets)
{
offsetAttribute = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
}
else
{
offsetAttribute = null;
}
}
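/// <summary>First occurrence of a term in the current doc: sets freq to 1
/// and writes the absolute position (stream 0) and the absolute start
/// offset plus length (stream 1).</summary>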
internal override void NewTerm(RawPostingList p0)
{
System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));
TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
p.freq = 1;
if (doVectorOffsets)
{
int startOffset = fieldState.offset + offsetAttribute.StartOffset;
int endOffset = fieldState.offset + offsetAttribute.EndOffset;
termsHashPerField.WriteVInt(1, startOffset);
termsHashPerField.WriteVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
}
if (doVectorPositions)
{
termsHashPerField.WriteVInt(0, fieldState.position);
p.lastPosition = fieldState.position;
}
}
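/// <summary>Subsequent occurrence of a term: bumps freq and writes the
/// position and offset as deltas from the previous occurrence.</summary>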
internal override void AddTerm(RawPostingList p0)
{
System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));
TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
p.freq++;
if (doVectorOffsets)
{
int startOffset = fieldState.offset + offsetAttribute.StartOffset;
int endOffset = fieldState.offset + offsetAttribute.EndOffset;
termsHashPerField.WriteVInt(1, startOffset - p.lastOffset);
termsHashPerField.WriteVInt(1, endOffset - startOffset);
p.lastOffset = endOffset;
}
if (doVectorPositions)
{
termsHashPerField.WriteVInt(0, fieldState.position - p.lastPosition);
p.lastPosition = fieldState.position;
}
}
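/// <summary>Nothing to do when an overly long term is skipped.</summary>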
internal override void SkippingLongTerm()
{
}
}
}