// blob: e178c3e790a7884ba5d883ddc495acf400caffe3 [file] [log] [blame]
using Lucene.Net.Diagnostics;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
namespace Lucene.Net.Codecs.SimpleText
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BytesRef = Util.BytesRef;
using Directory = Store.Directory;
using FieldInfo = Index.FieldInfo;
using FieldInfos = Index.FieldInfos;
using IndexFileNames = Index.IndexFileNames;
using IndexOutput = Store.IndexOutput;
using IOContext = Store.IOContext;
using IOUtils = Util.IOUtils;
/// <summary>
/// Writes plain-text term vectors.
/// <para>
/// <b><font color="red">FOR RECREATIONAL USE ONLY</font></b>
/// </para>
/// @lucene.experimental
/// </summary>
public class SimpleTextTermVectorsWriter : TermVectorsWriter
{
// Line markers of the plain-text format. Each marker is written first on its
// line and is followed by the value noted below.

// Written by Finish after the last document, just before the checksum.
internal static readonly BytesRef END = new BytesRef("END");
// Written by StartDocument, followed by the number of documents written so far.
internal static readonly BytesRef DOC = new BytesRef("doc ");
// Written by StartDocument, followed by the document's vector field count.
internal static readonly BytesRef NUMFIELDS = new BytesRef(" numfields ");
// Written by StartField, followed by the field number.
internal static readonly BytesRef FIELD = new BytesRef(" field ");
// Written by StartField, followed by the field name.
internal static readonly BytesRef FIELDNAME = new BytesRef(" name ");
// Written by StartField, followed by the lower-cased positions flag.
internal static readonly BytesRef FIELDPOSITIONS = new BytesRef(" positions ");
// Written by StartField, followed by the lower-cased offsets flag.
internal static readonly BytesRef FIELDOFFSETS = new BytesRef(" offsets ");
// Written by StartField, followed by the lower-cased payloads flag.
internal static readonly BytesRef FIELDPAYLOADS = new BytesRef(" payloads ");
// Written by StartField, followed by the field's term count.
internal static readonly BytesRef FIELDTERMCOUNT = new BytesRef(" numterms ");
// Written by StartTerm, followed by the term bytes.
internal static readonly BytesRef TERMTEXT = new BytesRef(" term ");
// Written by StartTerm, followed by the term frequency.
internal static readonly BytesRef TERMFREQ = new BytesRef(" freq ");
// Written by AddPosition when positions are enabled for the current field.
internal static readonly BytesRef POSITION = new BytesRef(" position ");
// Written by AddPosition when positions and payloads are enabled; the value is
// left empty when the payload is null.
internal static readonly BytesRef PAYLOAD = new BytesRef(" payload ");
// Written by AddPosition when offsets are enabled for the current field.
internal static readonly BytesRef STARTOFFSET = new BytesRef(" startoffset ");
// Written by AddPosition right after the start offset line.
internal static readonly BytesRef ENDOFFSET = new BytesRef(" endoffset ");
// Extension passed to IndexFileNames.SegmentFileName to name the output file.
internal const string VECTORS_EXTENSION = "vec";
// Directory the output is created in; Abort deletes the file from it.
private readonly Directory _directory;
// Segment name used together with VECTORS_EXTENSION to build the file name.
private readonly string _segment;
// Output created by the constructor; set to null once disposed.
private IndexOutput _output;
// Incremented once per StartDocument call; Finish compares it with numDocs.
private int _numDocsWritten;
// Scratch buffer handed to SimpleTextUtil when writing strings and the checksum.
private readonly BytesRef _scratch = new BytesRef();
// Flags captured by StartField and consulted by AddPosition.
private bool _offsets;
private bool _positions;
private bool _payloads;
/// <summary>
/// Creates a writer and opens the segment's term vectors file for output.
/// If the file cannot be opened, <see cref="Abort"/> is invoked before the
/// failure propagates to the caller.
/// </summary>
/// <param name="directory">Directory in which the output file is created.</param>
/// <param name="segment">Segment name used to derive the output file name.</param>
/// <param name="context">I/O context forwarded to <c>CreateOutput</c>.</param>
public SimpleTextTermVectorsWriter(Directory directory, string segment, IOContext context)
{
    _directory = directory;
    _segment = segment;

    bool opened = false;
    try
    {
        var fileName = IndexFileNames.SegmentFileName(segment, "", VECTORS_EXTENSION);
        _output = directory.CreateOutput(fileName, context);
        opened = true;
    }
    finally
    {
        // Reached without the flag set only when opening the output threw.
        if (!opened)
        {
            Abort();
        }
    }
}
/// <summary>
/// Writes the two header lines of a document: the <c>doc</c> line carrying the
/// number of documents written so far, and the <c>numfields</c> line.
/// </summary>
/// <param name="numVectorFields">Value written on the <c>numfields</c> line.</param>
public override void StartDocument(int numVectorFields)
{
    string docOrdinal = _numDocsWritten.ToString(CultureInfo.InvariantCulture);
    string fieldCount = numVectorFields.ToString(CultureInfo.InvariantCulture);

    Write(DOC);
    Write(docOrdinal);
    NewLine();

    Write(NUMFIELDS);
    Write(fieldCount);
    NewLine();

    // Count the document only after both header lines were written.
    _numDocsWritten += 1;
}
/// <summary>
/// Writes the header lines of a field (number, name, the three feature flags
/// and the term count) and remembers the flags for later
/// <c>AddPosition</c> calls.
/// </summary>
/// <param name="info">Field whose number and name are written.</param>
/// <param name="numTerms">Value written on the <c>numterms</c> line.</param>
/// <param name="positions">Whether positions are recorded for this field.</param>
/// <param name="offsets">Whether offsets are recorded for this field.</param>
/// <param name="payloads">Whether payloads are recorded for this field.</param>
public override void StartField(FieldInfo info, int numTerms, bool positions, bool offsets, bool payloads)
{
    Write(FIELD);
    Write(Convert.ToString(info.Number, CultureInfo.InvariantCulture));
    NewLine();

    Write(FIELDNAME);
    Write(info.Name);
    NewLine();

    // Flags are emitted as lower-case "true"/"false".
    Write(FIELDPOSITIONS);
    Write(positions ? "true" : "false");
    NewLine();

    Write(FIELDOFFSETS);
    Write(offsets ? "true" : "false");
    NewLine();

    Write(FIELDPAYLOADS);
    Write(payloads ? "true" : "false");
    NewLine();

    Write(FIELDTERMCOUNT);
    Write(numTerms.ToString(CultureInfo.InvariantCulture));
    NewLine();

    // Captured last, once the whole field header has been written.
    _positions = positions;
    _offsets = offsets;
    _payloads = payloads;
}
/// <summary>
/// Writes the <c>term</c> line followed by the <c>freq</c> line for one term.
/// </summary>
/// <param name="term">Term bytes written after the term marker.</param>
/// <param name="freq">Frequency written after the freq marker.</param>
public override void StartTerm(BytesRef term, int freq)
{
    string freqText = freq.ToString(CultureInfo.InvariantCulture);

    Write(TERMTEXT);
    Write(term);
    NewLine();

    Write(TERMFREQ);
    Write(freqText);
    NewLine();
}
/// <summary>
/// Writes one occurrence of the current term. Which lines are produced depends
/// on the flags captured by the last <c>StartField</c> call: position (and
/// payload) lines when positions are enabled, offset lines when offsets are
/// enabled.
/// </summary>
/// <param name="position">Value written on the <c>position</c> line.</param>
/// <param name="startOffset">Value written on the <c>startoffset</c> line.</param>
/// <param name="endOffset">Value written on the <c>endoffset</c> line.</param>
/// <param name="payload">Payload bytes; when null the payload line has no value.</param>
public override void AddPosition(int position, int startOffset, int endOffset, BytesRef payload)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(_positions || _offsets);
    }

    if (_positions)
    {
        Write(POSITION);
        Write(position.ToString(CultureInfo.InvariantCulture));
        NewLine();

        // Payload lines exist only nested under a position.
        if (_payloads)
        {
            Write(PAYLOAD);
            if (payload != null)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(payload.Length > 0);
                }
                Write(payload);
            }
            NewLine();
        }
    }

    if (!_offsets)
    {
        return;
    }

    Write(STARTOFFSET);
    Write(startOffset.ToString(CultureInfo.InvariantCulture));
    NewLine();

    Write(ENDOFFSET);
    Write(endOffset.ToString(CultureInfo.InvariantCulture));
    NewLine();
}
/// <summary>
/// Disposes this writer and then deletes the segment's term vectors file,
/// ignoring any exception raised by the deletion. The deletion is attempted
/// even when disposing throws.
/// </summary>
public override sealed void Abort()
{
    try
    {
        Dispose();
    }
    finally
    {
        var fileName = IndexFileNames.SegmentFileName(_segment, "", VECTORS_EXTENSION);
        IOUtils.DeleteFilesIgnoringExceptions(_directory, fileName);
    }
}
/// <summary>
/// Completes the file: verifies the number of documents written, then writes
/// the <c>END</c> line and the checksum.
/// </summary>
/// <param name="fis">Field infos of the segment (not used by this writer).</param>
/// <param name="numDocs">Number of documents the caller expects to have been written.</param>
/// <exception cref="InvalidOperationException">
/// <paramref name="numDocs"/> differs from the number of <c>StartDocument</c> calls.
/// </exception>
public override void Finish(FieldInfos fis, int numDocs)
{
    if (_numDocsWritten != numDocs)
    {
        // A specific exception type instead of the bare System.Exception; it still
        // derives from Exception, so existing catch blocks keep working.
        throw new InvalidOperationException("mergeVectors produced an invalid result: mergedDocs is " + numDocs +
            " but vec numDocs is " + _numDocsWritten + " file=" + _output +
            "; now aborting this merge to prevent index corruption");
    }

    Write(END);
    NewLine();
    SimpleTextUtil.WriteChecksum(_output, _scratch);
}
/// <summary>
/// Disposes the output when called with <paramref name="disposing"/> set to
/// true, and clears the reference afterwards even if disposing throws.
/// </summary>
/// <param name="disposing">False makes this call a no-op.</param>
protected override void Dispose(bool disposing)
{
    if (disposing)
    {
        try
        {
            IOUtils.Dispose(_output);
        }
        finally
        {
            _output = null;
        }
    }
}
/// <summary>
/// Gets the term comparer of this writer: <c>BytesRef.UTF8SortedAsUnicodeComparer</c>.
/// </summary>
public override IComparer<BytesRef> Comparer
{
    get { return BytesRef.UTF8SortedAsUnicodeComparer; }
}
/// <summary>Writes a string to the output through <c>SimpleTextUtil</c>, using the shared scratch buffer.</summary>
/// <param name="s">Text to write.</param>
private void Write(string s) => SimpleTextUtil.Write(_output, s, _scratch);
/// <summary>Writes the given bytes to the output through <c>SimpleTextUtil</c>.</summary>
/// <param name="bytes">Bytes to write.</param>
private void Write(BytesRef bytes) => SimpleTextUtil.Write(_output, bytes);
/// <summary>Terminates the current line of the output through <c>SimpleTextUtil</c>.</summary>
private void NewLine() => SimpleTextUtil.WriteNewline(_output);
}
}