/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
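/**
 * Tests the codec-level {@link TermVectorsReader}. setUp() indexes five identical
 * documents whose fields store different combinations of term vector positions
 * and offsets; the tests then read the vectors back and verify terms,
 * frequencies, positions, and offsets.
 */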
public class TestTermVectorsReader extends LuceneTestCase {
private String[] testFields = {"f1", "f2", "f3", "f4"};
private boolean[] testFieldsStorePos = {true, false, true, false};
private boolean[] testFieldsStoreOff = {true, false, false, true};
// Must be lexicographically sorted; setUp() sorts this array rather than
// relying on the literal order here.
private String[] testTerms = {"this", "is", "a", "test"};
private int[][] positions = new int[testTerms.length][];
private Directory dir;
private SegmentCommitInfo seg;
private FieldInfos fieldInfos = FieldInfos.EMPTY;
private static final int TERM_FREQ = 3;
private static class TestToken implements Comparable<TestToken> {
String text;
int pos;
int startOffset;
int endOffset;
@Override
public int compareTo(TestToken other) {
return pos - other.pos;
}
}
TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];
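/**
 * Builds the shared fixture: sorted test terms, TERM_FREQ occurrences of each
 * term with increasing positions, and a five-document index whose fields store
 * the term vector flavors declared in testFieldsStorePos/testFieldsStoreOff.
 */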
@Override
public void setUp() throws Exception {
super.setUp();
Arrays.sort(testTerms);
int tokenUpto = 0;
Random rnd = random();
for (int i = 0; i < testTerms.length; i++) {
positions[i] = new int[TERM_FREQ];
// generate TERM_FREQ occurrences of each term at increasing positions
for (int j = 0; j < TERM_FREQ; j++) {
// positions are always sorted in increasing order
positions[i][j] = (int) (j * 10 + rnd.nextDouble() * 10);
TestToken token = tokens[tokenUpto++] = new TestToken();
token.text = testTerms[i];
token.pos = positions[i][j];
token.startOffset = j * 10;
token.endOffset = j * 10 + testTerms[i].length();
}
}
Arrays.sort(tokens);
dir = newDirectory();
IndexWriter writer = new IndexWriter(
dir,
newIndexWriterConfig(new MyAnalyzer())
.setMaxBufferedDocs(-1)
.setMergePolicy(newLogMergePolicy(false, 10))
.setUseCompoundFile(false));
Document doc = new Document();
for (int i = 0; i < testFields.length; i++) {
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
// Store positions and offsets per field exactly as declared in
// testFieldsStorePos and testFieldsStoreOff.
customType.setStoreTermVectorPositions(testFieldsStorePos[i]);
customType.setStoreTermVectorOffsets(testFieldsStoreOff[i]);
doc.add(new Field(testFields[i], "", customType));
}
// Create 5 documents for testing; they all contain the same terms.
for (int j = 0; j < 5; j++) {
writer.addDocument(doc);
}
writer.commit();
seg = writer.newestSegment();
writer.close();
fieldInfos = IndexWriter.readFieldInfos(seg);
}
@Override
public void tearDown() throws Exception {
dir.close();
super.tearDown();
}
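/** Tokenizer that replays the pre-built, position-sorted tokens array. */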
private class MyTokenizer extends Tokenizer {
private int tokenUpto;
private final CharTermAttribute termAtt;
private final PositionIncrementAttribute posIncrAtt;
private final OffsetAttribute offsetAtt;
public MyTokenizer() {
super();
termAtt = addAttribute(CharTermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
offsetAtt = addAttribute(OffsetAttribute.class);
}
@Override
public boolean incrementToken() {
if (tokenUpto >= tokens.length) {
return false;
} else {
final TestToken testToken = tokens[tokenUpto++];
clearAttributes();
termAtt.append(testToken.text);
offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
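// Lucene encodes positions as deltas: a token's position is the running sum
// of increments minus one, so the first token needs an increment of pos+1 and
// each later token advances by the gap from the previous token's position.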
if (tokenUpto > 1) {
posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
} else {
posIncrAtt.setPositionIncrement(testToken.pos+1);
}
return true;
}
}
@Override
public void reset() throws IOException {
super.reset();
this.tokenUpto = 0;
}
}
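/** Analyzer whose token stream replays the fixture tokens via MyTokenizer. */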
private class MyAnalyzer extends Analyzer {
@Override
public TokenStreamComponents createComponents(String fieldName) {
return new TokenStreamComponents(new MyTokenizer());
}
}
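/** Sanity check: every segment written in setUp() reports term vectors. */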
public void test() throws IOException {
//Check to see the files were created properly in setup
DirectoryReader reader = DirectoryReader.open(dir);
for (LeafReaderContext ctx : reader.leaves()) {
SegmentReader sr = (SegmentReader) ctx.reader();
assertTrue(sr.getFieldInfos().hasVectors());
}
reader.close();
}
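/** Iterates f1's vector terms for each document and checks them against testTerms. */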
public void testReader() throws IOException {
TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
for (int j = 0; j < 5; j++) {
Terms vector = reader.get(j).terms(testFields[0]);
assertNotNull(vector);
assertEquals(testTerms.length, vector.size());
TermsEnum termsEnum = vector.iterator();
for (int i = 0; i < testTerms.length; i++) {
final BytesRef text = termsEnum.next();
assertNotNull(text);
String term = text.utf8ToString();
assertEquals(testTerms[i], term);
}
assertNull(termsEnum.next());
}
reader.close();
}
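/** Like testReader(), but also walks a docs-only PostingsEnum for every vector term. */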
public void testDocsEnum() throws IOException {
TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
for (int j = 0; j < 5; j++) {
Terms vector = reader.get(j).terms(testFields[0]);
assertNotNull(vector);
assertEquals(testTerms.length, vector.size());
TermsEnum termsEnum = vector.iterator();
PostingsEnum postingsEnum = null;
for (int i = 0; i < testTerms.length; i++) {
final BytesRef text = termsEnum.next();
assertNotNull(text);
String term = text.utf8ToString();
assertEquals(testTerms[i], term);
postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
assertNotNull(postingsEnum);
int doc = postingsEnum.docID();
assertEquals(-1, doc);
assertTrue(postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
}
assertNull(termsEnum.next());
}
reader.close();
}
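/**
 * Reads f1's vector with a full PostingsEnum and verifies frequencies,
 * positions, and offsets, then checks that f2 (no positions, no offsets)
 * still exposes a usable TermsEnum.
 */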
public void testPositionReader() throws IOException {
TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
BytesRef[] terms;
Terms vector = reader.get(0).terms(testFields[0]);
assertNotNull(vector);
assertEquals(testTerms.length, vector.size());
TermsEnum termsEnum = vector.iterator();
PostingsEnum dpEnum = null;
for (int i = 0; i < testTerms.length; i++) {
final BytesRef text = termsEnum.next();
assertNotNull(text);
String term = text.utf8ToString();
assertEquals(testTerms[i], term);
dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
assertNotNull(dpEnum);
int doc = dpEnum.docID();
assertEquals(-1, doc);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(positions[i].length, dpEnum.freq());
for (int j = 0; j < positions[i].length; j++) {
assertEquals(positions[i][j], dpEnum.nextPosition());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
assertNotNull(dpEnum);
doc = dpEnum.docID();
assertEquals(-1, doc);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(positions[i].length, dpEnum.freq());
for (int j = 0; j < positions[i].length; j++) {
assertEquals(positions[i][j], dpEnum.nextPosition());
assertEquals(j*10, dpEnum.startOffset());
assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
}
Terms freqVector = reader.get(0).terms(testFields[1]); // f2: no positions, no offsets
assertNotNull(freqVector);
assertEquals(testTerms.length, freqVector.size());
termsEnum = freqVector.iterator();
assertNotNull(termsEnum);
for (int i = 0; i < testTerms.length; i++) {
final BytesRef text = termsEnum.next();
assertNotNull(text);
String term = text.utf8ToString();
assertEquals(testTerms[i], term);
assertNotNull(termsEnum.postings(null));
assertNotNull(termsEnum.postings(null, PostingsEnum.ALL));
}
reader.close();
}
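/** Same as the position checks above, but focused on f1's start/end offsets. */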
public void testOffsetReader() throws IOException {
TermVectorsReader reader = Codec.getDefault().termVectorsFormat().vectorsReader(dir, seg.info, fieldInfos, newIOContext(random()));
Terms vector = reader.get(0).terms(testFields[0]);
assertNotNull(vector);
TermsEnum termsEnum = vector.iterator();
assertNotNull(termsEnum);
assertEquals(testTerms.length, vector.size());
PostingsEnum dpEnum = null;
for (int i = 0; i < testTerms.length; i++) {
final BytesRef text = termsEnum.next();
assertNotNull(text);
String term = text.utf8ToString();
assertEquals(testTerms[i], term);
dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(positions[i].length, dpEnum.freq());
for (int j = 0; j < positions[i].length; j++) {
assertEquals(positions[i][j], dpEnum.nextPosition());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
dpEnum = termsEnum.postings(dpEnum, PostingsEnum.ALL);
assertNotNull(dpEnum);
assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(positions[i].length, dpEnum.freq());
for (int j = 0; j < positions[i].length; j++) {
assertEquals(positions[i][j], dpEnum.nextPosition());
assertEquals(j*10, dpEnum.startOffset());
assertEquals(j*10 + testTerms[i].length(), dpEnum.endOffset());
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
}
reader.close();
}
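/**
 * A minimal extra check, sketched here as an illustration rather than taken
 * from the original test: it reads the same vectors through the public
 * {@link IndexReader#getTermVectors(int)} API instead of a codec-level
 * {@link TermVectorsReader}, reusing the fixture built in setUp().
 */
public void testReadVectorsViaIndexReader() throws IOException {
DirectoryReader ir = DirectoryReader.open(dir);
Terms vector = ir.getTermVectors(0).terms(testFields[0]);
assertNotNull(vector);
assertEquals(testTerms.length, vector.size());
ir.close();
}
// The following tests verify that IndexWriter rejects FieldTypes whose term
// vector options are inconsistent: payloads without positions, or any term
// vector option without term vectors / without indexing the field at all.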
public void testIllegalPayloadsWithoutPositions() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPayloads(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot index term vector payloads without term vector positions (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalOffsetsWithoutVectors() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(false);
ft.setStoreTermVectorOffsets(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot index term vector offsets when term vectors are not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalPositionsWithoutVectors() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(false);
ft.setStoreTermVectorPositions(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot index term vector positions when term vectors are not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalVectorPayloadsWithoutVectors() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setStoreTermVectors(false);
ft.setStoreTermVectorPayloads(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot index term vector payloads when term vectors are not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalVectorsWithoutIndexed() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(StoredField.TYPE);
ft.setStoreTermVectors(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot store term vectors for a field that is not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalVectorPositionsWithoutIndexed() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(StoredField.TYPE);
ft.setStoreTermVectorPositions(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot store term vector positions for a field that is not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalVectorOffsetsWithoutIndexed() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(StoredField.TYPE);
ft.setStoreTermVectorOffsets(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot store term vector offsets for a field that is not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
public void testIllegalVectorPayloadsWithoutIndexed() throws Exception {
Directory dir = newDirectory();
MockAnalyzer a = new MockAnalyzer(random());
a.setEnableChecks(false);
RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
FieldType ft = new FieldType(StoredField.TYPE);
ft.setStoreTermVectorPayloads(true);
Document doc = new Document();
doc.add(new Field("field", "value", ft));
IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
w.addDocument(doc);
});
assertEquals("cannot store term vector payloads for a field that is not indexed (field=\"field\")", expected.getMessage());
w.close();
dir.close();
}
}