// blob: 666f4672892d70580544579e6d7c93e6b7d5e835 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System.Linq;
using Lucene.Net.Support;
namespace Lucene.Net.Codecs.SimpleText
{
using System;
using System.Diagnostics;
using System.Collections.Generic;
using DocsAndPositionsEnum = Index.DocsAndPositionsEnum;
using DocsEnum = Index.DocsEnum;
using Fields = Index.Fields;
using IndexFileNames = Index.IndexFileNames;
using SegmentInfo = Index.SegmentInfo;
using Terms = Index.Terms;
using TermsEnum = Index.TermsEnum;
using AlreadyClosedException = Store.AlreadyClosedException;
using BufferedChecksumIndexInput = Store.BufferedChecksumIndexInput;
using ChecksumIndexInput = Store.ChecksumIndexInput;
using Directory = Store.Directory;
using IOContext = Store.IOContext;
using IndexInput = Store.IndexInput;
using ArrayUtil = Util.ArrayUtil;
using Bits = Util.Bits;
using BytesRef = Util.BytesRef;
using CharsRef = Util.CharsRef;
using IOUtils = Util.IOUtils;
using StringHelper = Util.StringHelper;
using UnicodeUtil = Util.UnicodeUtil;
/// <summary>
/// Reads plain-text term vectors.
/// <para>
/// <b><font color="red">FOR RECREATIONAL USE ONLY</font></b>
/// @lucene.experimental
/// </para>
/// </summary>
public class SimpleTextTermVectorsReader : TermVectorsReader
{
private long[] _offsets; // docid -> offset in .vec file
private IndexInput _input;
private readonly BytesRef _scratch = new BytesRef();
private readonly CharsRef _scratchUtf16 = new CharsRef();
/// <summary>
/// Opens the term vectors file of a segment and scans it once to record where
/// each document's entry begins.
/// </summary>
/// <param name="directory">Directory the vectors file is opened from.</param>
/// <param name="si">Segment info; its <c>Name</c> selects the file and its <c>DocCount</c> sizes the offset table.</param>
/// <param name="context">I/O context passed through to <c>OpenInput</c>.</param>
public SimpleTextTermVectorsReader(Directory directory, SegmentInfo si, IOContext context)
{
    bool success = false;
    try
    {
        _input = directory.OpenInput(IndexFileNames.SegmentFileName(si.Name, "", SimpleTextTermVectorsWriter.VECTORS_EXTENSION), context);
        // Build the offset table inside the guarded region: if scanning the file
        // fails, the input opened above must still be released.
        ReadIndex(si.DocCount);
        success = true;
    }
    finally
    {
        if (!success)
        {
            try
            {
                // _input is still null when OpenInput itself failed.
                if (_input != null)
                {
                    IOUtils.Close(_input);
                }
            }
            catch (Exception)
            {
                // ensure we throw our original exception
            }
            finally
            {
                _input = null;
                _offsets = null;
            }
        }
    }
}
/// <summary>
/// Wraps an existing offset table and an input. Used by <c>Clone</c>, which passes
/// the current offset table together with a clone of the current input.
/// </summary>
/// <param name="offsets">Per-document start offsets; stored as-is, not copied.</param>
/// <param name="input">Input this instance will read from.</param>
internal SimpleTextTermVectorsReader(long[] offsets, IndexInput input)
{
    _input = input;
    _offsets = offsets;
}
/// <summary>
/// Builds the docid -> file offset table. There is no separate index file for
/// the vectors; the whole vectors file is read line by line up front and the file
/// pointer following each DOC line is remembered so the entry can be sought later.
/// Reading stops once the END line has been seen, after which the footer is checked.
/// </summary>
/// <param name="maxDoc">Number of documents; the length of the offset table.</param>
private void ReadIndex(int maxDoc)
{
    ChecksumIndexInput checksumInput = new BufferedChecksumIndexInput(_input);
    _offsets = new long[maxDoc];
    int docUpto = 0;
    while (!_scratch.Equals(SimpleTextTermVectorsWriter.END))
    {
        SimpleTextUtil.ReadLine(checksumInput, _scratch);
        if (!StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.DOC))
        {
            continue;
        }
        // The pointer now sits just past the DOC line, i.e. at the start of the entry body.
        _offsets[docUpto++] = checksumInput.FilePointer;
    }
    SimpleTextUtil.CheckFooter(checksumInput);
    Debug.Assert(docUpto == _offsets.Length);
}
/// <summary>
/// Loads all term vectors stored for one document.
/// <para>
/// Seeks the shared input to the offset recorded for <paramref name="doc"/> and parses
/// the entry line by line: the field count; then for each field its number, name, the
/// positions/offsets/payloads flags and the term count; then for each term its text, its
/// frequency and, when the flags request it, one position, payload and start/end offset
/// per occurrence. Every line is read into the shared scratch buffer.
/// </para>
/// </summary>
/// <param name="doc">Index into the offset table built when the reader was opened.</param>
/// <returns>The fields parsed for the document, or <c>null</c> when the entry declares zero fields.</returns>
public override Fields Get(int doc)
{
    // Ordinal ordering keeps the field order independent of the current culture;
    // the default string comparer is culture-sensitive.
    var fields = new SortedDictionary<string, SimpleTVTerms>(StringComparer.Ordinal);
    _input.Seek(_offsets[doc]);
    ReadLine();
    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.NUMFIELDS));
    var numFields = ParseIntAt(SimpleTextTermVectorsWriter.NUMFIELDS.Length);
    if (numFields == 0)
    {
        return null; // no vectors for this doc
    }
    for (var i = 0; i < numFields; i++)
    {
        ReadLine();
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELD));
        // skip fieldNumber:
        ParseIntAt(SimpleTextTermVectorsWriter.FIELD.Length);

        ReadLine();
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDNAME));
        var fieldName = ReadString(SimpleTextTermVectorsWriter.FIELDNAME.Length, _scratch);

        ReadLine();
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPOSITIONS));
        var positions = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPOSITIONS.Length, _scratch));

        ReadLine();
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDOFFSETS));
        var offsets = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDOFFSETS.Length, _scratch));

        ReadLine();
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDPAYLOADS));
        var payloads = Convert.ToBoolean(ReadString(SimpleTextTermVectorsWriter.FIELDPAYLOADS.Length, _scratch));

        ReadLine();
        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.FIELDTERMCOUNT));
        var termCount = ParseIntAt(SimpleTextTermVectorsWriter.FIELDTERMCOUNT.Length);

        var terms = new SimpleTVTerms(offsets, positions, payloads);
        fields.Add(fieldName, terms);

        for (var j = 0; j < termCount; j++)
        {
            ReadLine();
            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMTEXT));
            // Copy the term bytes out of the scratch buffer, which is overwritten by the next ReadLine.
            var term = new BytesRef();
            var termLength = _scratch.Length - SimpleTextTermVectorsWriter.TERMTEXT.Length;
            term.Grow(termLength);
            term.Length = termLength;
            Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.TERMTEXT.Length, term.Bytes, term.Offset, termLength);

            var postings = new SimpleTVPostings();
            terms.TERMS.Add(term, postings);

            ReadLine();
            Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.TERMFREQ));
            postings.FREQ = ParseIntAt(SimpleTextTermVectorsWriter.TERMFREQ.Length);

            // Neither positions nor offsets: no per-occurrence lines follow for this term.
            if (!positions && !offsets)
            {
                continue;
            }

            if (positions)
            {
                postings.POSITIONS = new int[postings.FREQ];
                if (payloads)
                {
                    postings.PAYLOADS = new BytesRef[postings.FREQ];
                }
            }
            if (offsets)
            {
                postings.START_OFFSETS = new int[postings.FREQ];
                postings.END_OFFSETS = new int[postings.FREQ];
            }

            for (var k = 0; k < postings.FREQ; k++)
            {
                if (positions)
                {
                    ReadLine();
                    Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.POSITION));
                    postings.POSITIONS[k] = ParseIntAt(SimpleTextTermVectorsWriter.POSITION.Length);
                    if (payloads)
                    {
                        ReadLine();
                        Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.PAYLOAD));
                        // A payload line with nothing after the prefix is stored as null.
                        if (_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length == 0)
                        {
                            postings.PAYLOADS[k] = null;
                        }
                        else
                        {
                            var payloadBytes = new byte[_scratch.Length - SimpleTextTermVectorsWriter.PAYLOAD.Length];
                            Array.Copy(_scratch.Bytes, _scratch.Offset + SimpleTextTermVectorsWriter.PAYLOAD.Length, payloadBytes, 0,
                                payloadBytes.Length);
                            postings.PAYLOADS[k] = new BytesRef(payloadBytes);
                        }
                    }
                }

                if (!offsets)
                {
                    continue;
                }

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.STARTOFFSET));
                postings.START_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.STARTOFFSET.Length);

                ReadLine();
                Debug.Assert(StringHelper.StartsWith(_scratch, SimpleTextTermVectorsWriter.ENDOFFSET));
                postings.END_OFFSETS[k] = ParseIntAt(SimpleTextTermVectorsWriter.ENDOFFSET.Length);
            }
        }
    }
    return new SimpleTVFields(this, fields);
}
/// <summary>
/// Creates a reader that shares this reader's offset table and reads through a
/// clone of this reader's input.
/// </summary>
/// <returns>The new <c>SimpleTextTermVectorsReader</c>.</returns>
/// <exception cref="AlreadyClosedException">When the input of this reader is <c>null</c>.</exception>
public override object Clone()
{
    if (_input != null)
    {
        var clonedInput = (IndexInput)_input.Clone();
        return new SimpleTextTermVectorsReader(_offsets, clonedInput);
    }
    throw new AlreadyClosedException("this TermVectorsReader is closed");
}
/// <summary>
/// Closes the underlying input and drops the references to it and to the offset table.
/// </summary>
/// <param name="disposing">
/// <c>true</c> when called for an explicit dispose; only then is the input closed.
/// When <c>false</c> nothing is done.
/// </param>
protected override void Dispose(bool disposing)
{
    // The managed input must be released on an explicit dispose (disposing == true).
    if (!disposing)
    {
        return;
    }
    try
    {
        IOUtils.Close(_input);
    }
    finally
    {
        // Cleared even when closing throws; Clone uses a null input to detect a closed reader.
        _input = null;
        _offsets = null;
    }
}
/// <summary>
/// Reads the next line from <c>_input</c> into the shared <c>_scratch</c> buffer
/// via <c>SimpleTextUtil.ReadLine</c>.
/// </summary>
private void ReadLine()
{
SimpleTextUtil.ReadLine(_input, _scratch);
}
/// <summary>
/// Parses the integer held in the current scratch line after the first
/// <paramref name="offset"/> bytes. The bytes are converted from UTF-8 to UTF-16
/// into the shared char buffer and then parsed.
/// </summary>
/// <param name="offset">Number of leading bytes of the line to skip.</param>
/// <returns>The parsed integer.</returns>
private int ParseIntAt(int offset)
{
    int start = _scratch.Offset + offset;
    int byteCount = _scratch.Length - offset;
    UnicodeUtil.UTF8toUTF16(_scratch.Bytes, start, byteCount, _scratchUtf16);
    char[] chars = _scratchUtf16.Chars;
    return ArrayUtil.ParseInt(chars, 0, _scratchUtf16.Length);
}
/// <summary>
/// Decodes the text of <paramref name="scratch"/> after the first
/// <paramref name="offset"/> bytes. The bytes are converted from UTF-8 to UTF-16
/// into the shared char buffer, whose string form is returned.
/// </summary>
/// <param name="offset">Number of leading bytes to skip.</param>
/// <param name="scratch">Buffer holding the line to decode.</param>
/// <returns>The decoded text.</returns>
private string ReadString(int offset, BytesRef scratch)
{
    int start = scratch.Offset + offset;
    int byteCount = scratch.Length - offset;
    UnicodeUtil.UTF8toUTF16(scratch.Bytes, start, byteCount, _scratchUtf16);
    string text = _scratchUtf16.ToString();
    return text;
}
/// <summary>
/// Always reports 0; neither the offset table nor the scratch buffers are counted.
/// </summary>
/// <returns>The constant 0.</returns>
public override long RamBytesUsed()
{
return 0;
}
/// <summary>
/// Does nothing: the body is empty, so no verification happens when this is called.
/// </summary>
// NOTE(review): ReadIndex calls SimpleTextUtil.CheckFooter while the reader is being opened;
// presumably that is why nothing is re-checked here - confirm.
public override void CheckIntegrity()
{
}
/// <summary>
/// The fields of one document's term vectors, backed by a sorted dictionary from
/// field name to the terms parsed for that field.
/// </summary>
private class SimpleTVFields : Fields
{
    private readonly SimpleTextTermVectorsReader _outerInstance;
    private readonly SortedDictionary<string, SimpleTVTerms> _fields;

    /// <param name="outerInstance">The reader that produced these fields.</param>
    /// <param name="fields">Field name to terms map; stored as-is, not copied.</param>
    internal SimpleTVFields(SimpleTextTermVectorsReader outerInstance, SortedDictionary<string, SimpleTVTerms> fields)
    {
        _outerInstance = outerInstance;
        _fields = fields;
    }

    /// <summary>Enumerates the field names in the dictionary's key order.</summary>
    public override IEnumerator<string> GetEnumerator()
    {
        return _fields.Keys.GetEnumerator();
    }

    /// <summary>Looks up the terms of a field.</summary>
    /// <param name="field">Name of the field.</param>
    /// <returns>The terms of the field, or <c>null</c> when the document has no such field.</returns>
    public override Terms Terms(string field)
    {
        // The dictionary indexer throws KeyNotFoundException for a missing key;
        // an unknown field is reported as null instead.
        SimpleTVTerms found;
        return _fields.TryGetValue(field, out found) ? found : null;
    }

    /// <summary>Number of fields.</summary>
    public override int Size
    {
        get { return _fields.Count; }
    }
}
/// <summary>
/// Terms of a single field of a single document: a sorted map from term bytes to
/// that term's postings, plus the flags saying what was stored per occurrence.
/// </summary>
private class SimpleTVTerms : Terms
{
    /// <summary>Term to postings map; filled by the reader while parsing.</summary>
    internal readonly SortedDictionary<BytesRef, SimpleTVPostings> TERMS;

    private readonly bool _offsetsStored;
    private readonly bool _positionsStored;
    private readonly bool _payloadsStored;

    /// <param name="hasOffsets">Value reported by <c>HasOffsets</c>.</param>
    /// <param name="hasPositions">Value reported by <c>HasPositions</c>.</param>
    /// <param name="hasPayloads">Value reported by <c>HasPayloads</c>.</param>
    internal SimpleTVTerms(bool hasOffsets, bool hasPositions, bool hasPayloads)
    {
        TERMS = new SortedDictionary<BytesRef, SimpleTVPostings>();
        _payloadsStored = hasPayloads;
        _positionsStored = hasPositions;
        _offsetsStored = hasOffsets;
    }

    /// <summary>Creates a fresh enumerator over the term map; <paramref name="reuse"/> is ignored.</summary>
    public override TermsEnum Iterator(TermsEnum reuse)
    {
        // TODO: reuse
        var termsEnum = new SimpleTVTermsEnum(TERMS);
        return termsEnum;
    }

    public override IComparer<BytesRef> Comparator
    {
        get { return BytesRef.UTF8SortedAsUnicodeComparer; }
    }

    /// <summary>Number of terms in the map.</summary>
    public override long Size()
    {
        long termCount = TERMS.Count;
        return termCount;
    }

    /// <summary>Always -1.</summary>
    public override long SumTotalTermFreq
    {
        get { return -1; }
    }

    /// <summary>Equal to the number of terms in the map.</summary>
    public override long SumDocFreq
    {
        get
        {
            long termCount = TERMS.Count;
            return termCount;
        }
    }

    /// <summary>Always 1.</summary>
    public override int DocCount
    {
        get { return 1; }
    }

    /// <summary>Always <c>true</c>.</summary>
    public override bool HasFreqs()
    {
        return true;
    }

    public override bool HasOffsets()
    {
        return _offsetsStored;
    }

    public override bool HasPositions()
    {
        return _positionsStored;
    }

    public override bool HasPayloads()
    {
        return _payloadsStored;
    }
}
/// <summary>
/// Plain holder for what the reader parsed for one term of one field.
/// The arrays are only allocated by the reader when the field's flags ask for them,
/// and are otherwise left <c>null</c>.
/// </summary>
private class SimpleTVPostings
{
// Term frequency parsed from the TERMFREQ line; also the length of every array below.
internal int FREQ;
// One position per occurrence; allocated only when the field has positions.
internal int[] POSITIONS;
// One start offset per occurrence; allocated only when the field has offsets.
internal int[] START_OFFSETS;
// One end offset per occurrence; allocated only when the field has offsets.
internal int[] END_OFFSETS;
// One payload per occurrence; allocated only when the field has positions and payloads.
// An entry is null when the payload line carried no bytes.
internal BytesRef[] PAYLOADS;
}
/// <summary>
/// Enumerates the terms of one field in the order of the backing sorted dictionary.
/// The entry the enumerator is positioned on is kept in <c>_current</c> and backs
/// <c>Term</c>, <c>TotalTermFreq</c>, <c>Docs</c> and <c>DocsAndPositions</c>.
/// </summary>
private class SimpleTVTermsEnum : TermsEnum
{
    private readonly SortedDictionary<BytesRef, SimpleTVPostings> _terms;
    private IEnumerator<KeyValuePair<BytesRef, SimpleTVPostings>> _iterator;
    private KeyValuePair<BytesRef, SimpleTVPostings> _current;

    /// <param name="terms">Term to postings map to enumerate.</param>
    internal SimpleTVTermsEnum(SortedDictionary<BytesRef, SimpleTVPostings> terms)
    {
        _terms = terms;
        _iterator = terms.EntrySet().GetEnumerator();
    }

    /// <summary>
    /// Positions the enumerator on the first term that compares greater than or equal
    /// to <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Term to seek to.</param>
    /// <returns>
    /// <c>END</c> when no such term exists; otherwise <c>FOUND</c> when the term equals
    /// <paramref name="text"/> and <c>NOT_FOUND</c> when it is a larger term.
    /// </returns>
    public override SeekStatus SeekCeil(BytesRef text)
    {
        // Filtering the sorted dictionary keeps its order, so no re-sorting is needed.
        // The result is materialized immediately so later changes to 'text' cannot
        // affect which entries are enumerated.
        _iterator = _terms.Where(p => p.Key.CompareTo(text) >= 0).ToList().GetEnumerator();
        if (!_iterator.MoveNext())
        {
            return SeekStatus.END;
        }
        // Record the entry we landed on so Term(), TotalTermFreq(), Docs() and
        // DocsAndPositions() refer to the term that was just sought.
        _current = _iterator.Current;
        return _current.Key.Equals(text) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
    }

    /// <summary>Not supported; always throws <see cref="NotSupportedException"/>.</summary>
    public override void SeekExact(long ord)
    {
        throw new NotSupportedException();
    }

    /// <summary>Advances to the next term.</summary>
    /// <returns>The next term, or <c>null</c> when there are no more terms.</returns>
    public override BytesRef Next()
    {
        // Use the MoveNext result to detect the end instead of relying on an
        // exception from reading Current past the last element.
        if (!_iterator.MoveNext())
        {
            return null;
        }
        _current = _iterator.Current;
        return _current.Key;
    }

    /// <summary>The term the enumerator is currently positioned on.</summary>
    public override BytesRef Term()
    {
        return _current.Key;
    }

    /// <summary>Not supported; always throws <see cref="NotSupportedException"/>.</summary>
    public override long Ord()
    {
        throw new NotSupportedException();
    }

    /// <summary>Always 1.</summary>
    public override int DocFreq()
    {
        return 1;
    }

    /// <summary>Frequency stored for the current term.</summary>
    public override long TotalTermFreq()
    {
        return _current.Value.FREQ;
    }

    /// <summary>
    /// Creates a docs enum for the current term. The reported frequency is the stored
    /// one when <c>DocsEnum.FLAG_FREQS</c> is set in <paramref name="flags"/>, otherwise 1.
    /// <paramref name="reuse"/> is ignored.
    /// </summary>
    public override DocsEnum Docs(Bits liveDocs, DocsEnum reuse, int flags)
    {
        // TODO: reuse
        var e = new SimpleTVDocsEnum();
        e.Reset(liveDocs, (flags & DocsEnum.FLAG_FREQS) == 0 ? 1 : _current.Value.FREQ);
        return e;
    }

    /// <summary>
    /// Creates a positions enum for the current term, or returns <c>null</c> when the term
    /// has neither positions nor start offsets. <paramref name="reuse"/> is ignored.
    /// </summary>
    public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags)
    {
        var postings = _current.Value;
        if (postings.POSITIONS == null && postings.START_OFFSETS == null)
        {
            return null;
        }
        // TODO: reuse
        var e = new SimpleTVDocsAndPositionsEnum();
        e.Reset(liveDocs, postings.POSITIONS, postings.START_OFFSETS, postings.END_OFFSETS, postings.PAYLOADS);
        return e;
    }

    public override IComparer<BytesRef> Comparator
    {
        get { return BytesRef.UTF8SortedAsUnicodeComparer; }
    }
}
// note: these two enum classes are exactly like the Default impl...
/// <summary>
/// Docs enum over the single document (doc id 0) that a term vector belongs to.
/// </summary>
private sealed class SimpleTVDocsEnum : DocsEnum
{
    private Bits _liveDocs;
    private int _freq;
    private int _doc = -1;
    private bool _didNext;

    /// <summary>Frequency handed to <c>Reset</c>.</summary>
    public override int Freq()
    {
        Debug.Assert(_freq != -1);
        return _freq;
    }

    public override int DocID()
    {
        return _doc;
    }

    /// <summary>
    /// Returns 0 on the first call when doc 0 is live (or no live docs were given);
    /// otherwise <c>NO_MORE_DOCS</c>.
    /// </summary>
    public override int NextDoc()
    {
        if (!_didNext && (_liveDocs == null || _liveDocs.Get(0)))
        {
            _didNext = true;
            _doc = 0;
        }
        else
        {
            _doc = NO_MORE_DOCS;
        }
        return _doc;
    }

    public override int Advance(int target)
    {
        return SlowAdvance(target);
    }

    /// <summary>Rewinds the enum to before the first document.</summary>
    /// <param name="liveDocs">Live docs filter, or <c>null</c> for none.</param>
    /// <param name="freq">Value to report from <c>Freq</c>.</param>
    public void Reset(Bits liveDocs, int freq)
    {
        _didNext = false;
        _doc = -1;
        _freq = freq;
        _liveDocs = liveDocs;
    }

    /// <summary>Always 1.</summary>
    public override long Cost()
    {
        return 1;
    }
}
/// <summary>
/// Positions enum over the single document (doc id 0) that a term vector belongs to.
/// Positions, offsets and payloads are served from the arrays passed to <c>Reset</c>.
/// </summary>
private sealed class SimpleTVDocsAndPositionsEnum : DocsAndPositionsEnum
{
    private Bits _liveDocs;
    private int[] _positions;
    private int[] _startOffsets;
    private int[] _endOffsets;
    private BytesRef[] _payloads;
    private int _doc = -1;
    private int _nextPos;
    private bool _didNext;

    /// <summary>
    /// Length of the positions array when present, otherwise of the start offsets array.
    /// </summary>
    public override int Freq()
    {
        if (_positions == null)
        {
            Debug.Assert(_startOffsets != null);
            return _startOffsets.Length;
        }
        return _positions.Length;
    }

    public override int DocID()
    {
        return _doc;
    }

    /// <summary>
    /// Returns 0 on the first call when doc 0 is live (or no live docs were given);
    /// otherwise <c>NO_MORE_DOCS</c>.
    /// </summary>
    public override int NextDoc()
    {
        if (!_didNext && (_liveDocs == null || _liveDocs.Get(0)))
        {
            _didNext = true;
            _doc = 0;
        }
        else
        {
            _doc = NO_MORE_DOCS;
        }
        return _doc;
    }

    public override int Advance(int target)
    {
        return SlowAdvance(target);
    }

    /// <summary>Installs new backing arrays and rewinds to before the first document and position.</summary>
    public void Reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets,
        BytesRef[] payloads)
    {
        _nextPos = 0;
        _didNext = false;
        _doc = -1;
        _payloads = payloads;
        _endOffsets = endOffsets;
        _startOffsets = startOffsets;
        _positions = positions;
        _liveDocs = liveDocs;
    }

    /// <summary>Payload of the most recently returned position, or <c>null</c> when there are no payloads.</summary>
    public override BytesRef Payload
    {
        get
        {
            if (_payloads == null)
            {
                return null;
            }
            return _payloads[_nextPos - 1];
        }
    }

    /// <summary>
    /// Moves to the next occurrence and returns its position, or -1 when only offsets are stored.
    /// </summary>
    public override int NextPosition()
    {
        Debug.Assert((_positions != null && _nextPos < _positions.Length) ||
                     _startOffsets != null && _nextPos < _startOffsets.Length);
        if (_positions == null)
        {
            _nextPos++;
            return -1;
        }
        return _positions[_nextPos++];
    }

    /// <summary>Start offset of the most recently returned position, or -1 when offsets are not stored.</summary>
    public override int StartOffset()
    {
        return _startOffsets == null ? -1 : _startOffsets[_nextPos - 1];
    }

    /// <summary>End offset of the most recently returned position, or -1 when offsets are not stored.</summary>
    public override int EndOffset()
    {
        return _endOffsets == null ? -1 : _endOffsets[_nextPos - 1];
    }

    /// <summary>Always 1.</summary>
    public override long Cost()
    {
        return 1;
    }
}
}
}