using Lucene.Net.Analysis;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Documents;
using Lucene.Net.Index.Extensions;
using NUnit.Framework;
using System;
using System.IO;
using Assert = Lucene.Net.TestFramework.Assert;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BytesRef = Lucene.Net.Util.BytesRef;
using Codec = Lucene.Net.Codecs.Codec;
using Directory = Lucene.Net.Store.Directory;
using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
using Document = Documents.Document;
using Field = Field;
using FieldType = FieldType;
using LuceneTestCase = Lucene.Net.Util.LuceneTestCase;
using TermVectorsReader = Lucene.Net.Codecs.TermVectorsReader;
using TestUtil = Lucene.Net.Util.TestUtil;
using TextField = TextField;
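// Exercises TermVectorsReader against a small fixed index: four fields covering
// every combination of stored term vector positions/offsets, four sorted terms,
// and five identical documents produced by an analyzer that replays precomputed tokens.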
[TestFixture]
public class TestTermVectorsReader : LuceneTestCase
{
public TestTermVectorsReader()
{
positions = new int[testTerms.Length][];
tokens = new TestToken[testTerms.Length * TERM_FREQ];
}
// Must be lexicographically sorted; we sort in SetUp() rather than maintain the order by hand here.
private string[] testFields = new string[] { "f1", "f2", "f3", "f4" };
private bool[] testFieldsStorePos = new bool[] { true, false, true, false };
private bool[] testFieldsStoreOff = new bool[] { true, false, false, true };
private string[] testTerms = new string[] { "this", "is", "a", "test" };
private int[][] positions;
private Directory dir;
private SegmentCommitInfo seg;
private FieldInfos fieldInfos = new FieldInfos(new FieldInfo[0]);
private static int TERM_FREQ = 3;
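// A precomputed token (text, absolute position, start/end offset); instances are
// sorted by position so MyTokenizer can replay them in index order.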
internal class TestToken : IComparable<TestToken>
{
private readonly TestTermVectorsReader outerInstance;
public TestToken(TestTermVectorsReader outerInstance)
{
this.outerInstance = outerInstance;
}
internal string text;
internal int pos;
internal int startOffset;
internal int endOffset;
public virtual int CompareTo(TestToken other)
{
return pos - other.pos;
}
}
internal TestToken[] tokens;
[SetUp]
public override void SetUp()
{
base.SetUp();
/*
for (int i = 0; i < testFields.length; i++) {
fieldInfos.Add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
}
*/
Array.Sort(testTerms);
int tokenUpto = 0;
for (int i = 0; i < testTerms.Length; i++)
{
positions[i] = new int[TERM_FREQ];
// Positions are j * 10 plus a random offset in [0, 10), so they are
// always sorted in strictly increasing order within a term.
for (int j = 0; j < TERM_FREQ; j++)
{
// Use the test's shared Random: constructing a new Random(1) on every
// iteration would return the same "random" offset each time.
positions[i][j] = (int)(j * 10 + Random.NextDouble() * 10);
TestToken token = tokens[tokenUpto++] = new TestToken(this);
token.text = testTerms[i];
token.pos = positions[i][j];
token.startOffset = j * 10;
token.endOffset = j * 10 + testTerms[i].Length;
}
}
Array.Sort(tokens);
dir = NewDirectory();
IndexWriter writer = new IndexWriter(dir, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MyAnalyzer(this)).SetMaxBufferedDocs(-1).SetMergePolicy(NewLogMergePolicy(false, 10)).SetUseCompoundFile(false));
Document doc = new Document();
for (int i = 0; i < testFields.Length; i++)
{
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
if (testFieldsStorePos[i] && testFieldsStoreOff[i])
{
customType.StoreTermVectors = true;
customType.StoreTermVectorPositions = true;
customType.StoreTermVectorOffsets = true;
}
else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
{
customType.StoreTermVectors = true;
customType.StoreTermVectorPositions = true;
}
else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
{
customType.StoreTermVectors = true;
customType.StoreTermVectorOffsets = true;
}
else
{
customType.StoreTermVectors = true;
}
doc.Add(new Field(testFields[i], "", customType));
}
// Create 5 documents for testing; they all contain the same terms.
for (int j = 0; j < 5; j++)
{
writer.AddDocument(doc);
}
writer.Commit();
seg = writer.NewestSegment();
writer.Dispose();
fieldInfos = SegmentReader.ReadFieldInfos(seg);
}
[TearDown]
public override void TearDown()
{
dir.Dispose();
base.TearDown();
}
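// Tokenizer that ignores its input and replays the fixture's precomputed,
// position-sorted tokens, setting term, offset, and position increment attributes.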
private class MyTokenizer : Tokenizer
{
private readonly TestTermVectorsReader outerInstance;
private int tokenUpto;
private readonly ICharTermAttribute termAtt;
private readonly IPositionIncrementAttribute posIncrAtt;
private readonly IOffsetAttribute offsetAtt;
public MyTokenizer(TestTermVectorsReader outerInstance, TextReader reader)
: base(reader)
{
this.outerInstance = outerInstance;
termAtt = AddAttribute<ICharTermAttribute>();
posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
}
public sealed override bool IncrementToken()
{
if (tokenUpto >= outerInstance.tokens.Length)
{
return false;
}
else
{
TestToken testToken = outerInstance.tokens[tokenUpto++];
ClearAttributes();
termAtt.Append(testToken.text);
offsetAtt.SetOffset(testToken.startOffset, testToken.endOffset);
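// Emit the absolute position as a position increment: the delta from the
// previous token's position, or pos + 1 for the first token (positions
// implicitly start from -1).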
if (tokenUpto > 1)
{
posIncrAtt.PositionIncrement = testToken.pos - outerInstance.tokens[tokenUpto - 2].pos;
}
else
{
posIncrAtt.PositionIncrement = testToken.pos + 1;
}
return true;
}
}
public override void Reset()
{
base.Reset();
this.tokenUpto = 0;
}
}
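// Analyzer whose token stream is just MyTokenizer, so IndexWriter indexes
// the fixture's precomputed tokens for every field.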
private class MyAnalyzer : Analyzer
{
private readonly TestTermVectorsReader outerInstance;
public MyAnalyzer(TestTermVectorsReader outerInstance)
{
this.outerInstance = outerInstance;
}
protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
return new TokenStreamComponents(new MyTokenizer(outerInstance, reader));
}
}
[Test]
public virtual void Test()
{
// Check that the term vector files were created properly in SetUp.
DirectoryReader reader = DirectoryReader.Open(dir);
foreach (AtomicReaderContext ctx in reader.Leaves)
{
SegmentReader sr = (SegmentReader)ctx.Reader;
Assert.IsTrue(sr.FieldInfos.HasVectors);
}
reader.Dispose();
}
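// Reads the term vector for field f1 of each document and verifies the terms
// enumerate in sorted order with the expected text.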
[Test]
public virtual void TestReader()
{
TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
for (int j = 0; j < 5; j++)
{
Terms vector = reader.Get(j).GetTerms(testFields[0]);
Assert.IsNotNull(vector);
Assert.AreEqual(testTerms.Length, vector.Count);
TermsEnum termsEnum = vector.GetEnumerator();
for (int i = 0; i < testTerms.Length; i++)
{
Assert.IsTrue(termsEnum.MoveNext());
BytesRef text = termsEnum.Term;
string term = text.Utf8ToString();
Assert.AreEqual(testTerms[i], term);
}
Assert.IsFalse(termsEnum.MoveNext());
}
reader.Dispose();
}
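// Same as TestReader, but also walks a DocsEnum for each term: the enum starts
// at -1, yields exactly one document (a term vector is per-document), then exhausts.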
[Test]
public virtual void TestDocsEnum()
{
TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
for (int j = 0; j < 5; j++)
{
Terms vector = reader.Get(j).GetTerms(testFields[0]);
Assert.IsNotNull(vector);
Assert.AreEqual(testTerms.Length, vector.Count);
TermsEnum termsEnum = vector.GetEnumerator();
DocsEnum docsEnum = null;
for (int i = 0; i < testTerms.Length; i++)
{
Assert.IsTrue(termsEnum.MoveNext());
BytesRef text = termsEnum.Term;
string term = text.Utf8ToString();
Assert.AreEqual(testTerms[i], term);
docsEnum = TestUtil.Docs(Random, termsEnum, null, docsEnum, DocsFlags.NONE);
Assert.IsNotNull(docsEnum);
int doc = docsEnum.DocID;
Assert.AreEqual(-1, doc);
Assert.IsTrue(docsEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, docsEnum.NextDoc());
}
Assert.IsFalse(termsEnum.MoveNext());
}
reader.Dispose();
}
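// Verifies positions (and offsets, for f1, which stores both) via
// DocsAndPositionsEnum, and that f2 (a freq-only vector) returns a DocsEnum
// but no DocsAndPositionsEnum.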
[Test]
public virtual void TestPositionReader()
{
TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
//BytesRef[] terms; // LUCENENET NOTE: Not used in Lucene
Terms vector = reader.Get(0).GetTerms(testFields[0]);
Assert.IsNotNull(vector);
Assert.AreEqual(testTerms.Length, vector.Count);
TermsEnum termsEnum = vector.GetEnumerator();
DocsAndPositionsEnum dpEnum = null;
for (int i = 0; i < testTerms.Length; i++)
{
Assert.IsTrue(termsEnum.MoveNext());
BytesRef text = termsEnum.Term;
string term = text.Utf8ToString();
Assert.AreEqual(testTerms[i], term);
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
Assert.IsNotNull(dpEnum);
int doc = dpEnum.DocID;
Assert.AreEqual(-1, doc);
Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
Assert.AreEqual(positions[i].Length, dpEnum.Freq);
for (int j = 0; j < positions[i].Length; j++)
{
Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
}
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
doc = dpEnum.DocID;
Assert.AreEqual(-1, doc);
Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
Assert.IsNotNull(dpEnum);
Assert.AreEqual(positions[i].Length, dpEnum.Freq);
for (int j = 0; j < positions[i].Length; j++)
{
Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
Assert.AreEqual(j * 10, dpEnum.StartOffset);
Assert.AreEqual(j * 10 + testTerms[i].Length, dpEnum.EndOffset);
}
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
}
Terms freqVector = reader.Get(0).GetTerms(testFields[1]); //no pos, no offset
Assert.IsNotNull(freqVector);
Assert.AreEqual(testTerms.Length, freqVector.Count);
termsEnum = freqVector.GetEnumerator();
Assert.IsNotNull(termsEnum);
for (int i = 0; i < testTerms.Length; i++)
{
Assert.IsTrue(termsEnum.MoveNext());
BytesRef text = termsEnum.Term;
string term = text.Utf8ToString();
Assert.AreEqual(testTerms[i], term);
Assert.IsNotNull(termsEnum.Docs(null, null));
Assert.IsNull(termsEnum.DocsAndPositions(null, null)); // no pos
}
reader.Dispose();
}
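// Like TestPositionReader, but focused on f1's offsets: each term's j-th
// occurrence starts at j * 10 and ends at j * 10 + term.Length.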
[Test]
public virtual void TestOffsetReader()
{
TermVectorsReader reader = Codec.Default.TermVectorsFormat.VectorsReader(dir, seg.Info, fieldInfos, NewIOContext(Random));
Terms vector = reader.Get(0).GetTerms(testFields[0]);
Assert.IsNotNull(vector);
TermsEnum termsEnum = vector.GetEnumerator();
Assert.IsNotNull(termsEnum);
Assert.AreEqual(testTerms.Length, vector.Count);
DocsAndPositionsEnum dpEnum = null;
for (int i = 0; i < testTerms.Length; i++)
{
Assert.IsTrue(termsEnum.MoveNext());
BytesRef text = termsEnum.Term;
string term = text.Utf8ToString();
Assert.AreEqual(testTerms[i], term);
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
Assert.IsNotNull(dpEnum);
Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
Assert.AreEqual(positions[i].Length, dpEnum.Freq);
for (int j = 0; j < positions[i].Length; j++)
{
Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
}
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
dpEnum = termsEnum.DocsAndPositions(null, dpEnum);
Assert.IsTrue(dpEnum.NextDoc() != DocIdSetIterator.NO_MORE_DOCS);
Assert.IsNotNull(dpEnum);
Assert.AreEqual(positions[i].Length, dpEnum.Freq);
for (int j = 0; j < positions[i].Length; j++)
{
Assert.AreEqual(positions[i][j], dpEnum.NextPosition());
Assert.AreEqual(j * 10, dpEnum.StartOffset);
Assert.AreEqual(j * 10 + testTerms[i].Length, dpEnum.EndOffset);
}
Assert.AreEqual(DocIdSetIterator.NO_MORE_DOCS, dpEnum.NextDoc());
}
reader.Dispose();
}
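// Illegal term vector option combinations (payloads without positions, or any
// vector option while StoreTermVectors is false) must be rejected by
// AddDocument with a descriptive ArgumentException.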
[Test]
public virtual void TestIllegalIndexableField()
{
Directory dir = NewDirectory();
RandomIndexWriter w = new RandomIndexWriter(
#if FEATURE_INSTANCE_TESTDATA_INITIALIZATION
this,
#endif
Random, dir);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.StoreTermVectors = true;
ft.StoreTermVectorPayloads = true;
Document doc = new Document();
doc.Add(new Field("field", "value", ft));
try
{
w.AddDocument(doc);
Assert.Fail("did not hit exception");
}
catch (ArgumentException iae)
{
// Expected
Assert.AreEqual("cannot index term vector payloads without term vector positions (field=\"field\")", iae.Message);
}
ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.StoreTermVectors = false;
ft.StoreTermVectorOffsets = true;
doc = new Document();
doc.Add(new Field("field", "value", ft));
try
{
w.AddDocument(doc);
Assert.Fail("did not hit exception");
}
catch (ArgumentException iae)
{
// Expected
Assert.AreEqual("cannot index term vector offsets when term vectors are not indexed (field=\"field\")", iae.Message);
}
ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.StoreTermVectors = false;
ft.StoreTermVectorPositions = true;
doc = new Document();
doc.Add(new Field("field", "value", ft));
try
{
w.AddDocument(doc);
Assert.Fail("did not hit exception");
}
catch (ArgumentException iae)
{
// Expected
Assert.AreEqual("cannot index term vector positions when term vectors are not indexed (field=\"field\")", iae.Message);
}
ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.StoreTermVectors = false;
ft.StoreTermVectorPayloads = true;
doc = new Document();
doc.Add(new Field("field", "value", ft));
try
{
w.AddDocument(doc);
Assert.Fail("did not hit exception");
}
catch (ArgumentException iae)
{
// Expected
Assert.AreEqual("cannot index term vector payloads when term vectors are not indexed (field=\"field\")", iae.Message);
}
w.Dispose();
dir.Dispose();
}
}
}