/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
import org.junit.BeforeClass;
// TODO: test multiple codecs here?
// TODO
// - test across fields
// - fix this test to run once for all codecs
// - make more docs per term, to test > 1 level skipping
// - test all combinations of payloads/not and omitTF/not
// - test w/ different indexDivisor
// - test field where payload length rarely changes
// - 0-term fields
// - seek/skip to same term/doc i'm already on
// - mix in deleted docs
// - seek, skip beyond end -- assert returns false
// - seek, skip to things that don't exist -- ensure it
// goes to 1 before next one known to exist
// - skipTo(term)
// - skipTo(doc)
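// Exercises a codec's postings format directly: builds in-memory fields/terms/postings,
// writes them through FieldsConsumer, then reads them back via FieldsProducer and
// verifies enumeration, seeking and skipping.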
public class TestCodecs extends LuceneTestCase {
private static final String[] fieldNames = new String[] {"one", "two", "three", "four"};
private static int NUM_TEST_ITER;
private final static int NUM_TEST_THREADS = 3;
private final static int NUM_FIELDS = 4;
private final static int NUM_TERMS_RAND = 50; // must be > 16 to test skipping
private final static int DOC_FREQ_RAND = 500; // must be > 16 to test skipping
private final static int TERM_DOC_FREQ_RAND = 20;
@BeforeClass
public static void beforeClass() {
NUM_TEST_ITER = atLeast(20);
}
static class FieldData implements Comparable<FieldData> {
final FieldInfo fieldInfo;
final TermData[] terms;
final boolean omitTF;
final boolean storePayloads;
public FieldData(final String name, final FieldInfos.Builder fieldInfos, final TermData[] terms, final boolean omitTF, final boolean storePayloads) {
this.omitTF = omitTF;
this.storePayloads = storePayloads;
// TODO: change this test to use all three
fieldInfo = fieldInfos.getOrAdd(name);
if (omitTF) {
fieldInfo.setIndexOptions(IndexOptions.DOCS);
} else {
fieldInfo.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
}
if (storePayloads) {
fieldInfo.setStorePayloads();
}
this.terms = terms;
for(int i=0;i<terms.length;i++) {
terms[i].field = this;
}
Arrays.sort(terms);
}
@Override
public int compareTo(final FieldData other) {
return fieldInfo.name.compareTo(other.fieldInfo.name);
}
}
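// One position within a document, plus its optional payload.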
static class PositionData {
int pos;
BytesRef payload;
PositionData(final int pos, final BytesRef payload) {
this.pos = pos;
this.payload = payload;
}
}
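// One term's postings: the docIDs containing it and, unless omitTF, per-doc positions.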
static class TermData implements Comparable<TermData> {
String text2;
final BytesRef text;
int[] docs;
PositionData[][] positions;
FieldData field;
public TermData(final String text, final int[] docs, final PositionData[][] positions) {
this.text = new BytesRef(text);
this.text2 = text;
this.docs = docs;
this.positions = positions;
}
@Override
public int compareTo(final TermData o) {
return text.compareTo(o.text);
}
}
private static final String SEGMENT = "0";
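// Generates random terms with random docs/positions/payloads. Term texts are unique
// and never end with '.', which Verify reserves for seeking to non-existent terms.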
TermData[] makeRandomTerms(final boolean omitTF, final boolean storePayloads) {
final int numTerms = 1+random().nextInt(NUM_TERMS_RAND);
//final int numTerms = 2;
final TermData[] terms = new TermData[numTerms];
final HashSet<String> termsSeen = new HashSet<>();
for(int i=0;i<numTerms;i++) {
// Make term text
String text2;
while(true) {
text2 = TestUtil.randomUnicodeString(random());
if (!termsSeen.contains(text2) && !text2.endsWith(".")) {
termsSeen.add(text2);
break;
}
}
final int docFreq = 1+random().nextInt(DOC_FREQ_RAND);
final int[] docs = new int[docFreq];
final PositionData[][] positions;
if (!omitTF) {
positions = new PositionData[docFreq][];
} else {
positions = null;
}
int docID = 0;
for(int j=0;j<docFreq;j++) {
docID += TestUtil.nextInt(random(), 1, 10);
docs[j] = docID;
if (!omitTF) {
final int termFreq = 1+random().nextInt(TERM_DOC_FREQ_RAND);
positions[j] = new PositionData[termFreq];
int position = 0;
for(int k=0;k<termFreq;k++) {
position += TestUtil.nextInt(random(), 1, 10);
final BytesRef payload;
if (storePayloads && random().nextInt(4) == 0) {
final byte[] bytes = new byte[1+random().nextInt(5)];
for(int l=0;l<bytes.length;l++) {
bytes[l] = (byte) random().nextInt(255);
}
payload = new BytesRef(bytes);
} else {
payload = null;
}
positions[j][k] = new PositionData(position, payload);
}
}
}
terms[i] = new TermData(text2, docs, positions);
}
return terms;
}
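// Writes 100 docs-only terms (one doc each), then verifies exhaustive enumeration,
// enum reuse across terms, and seekCeil on every term.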
public void testFixedPostings() throws Throwable {
final int NUM_TERMS = 100;
final TermData[] terms = new TermData[NUM_TERMS];
for(int i=0;i<NUM_TERMS;i++) {
final int[] docs = new int[] {i};
final String text = Integer.toString(i, Character.MAX_RADIX);
terms[i] = new TermData(text, docs, null);
}
final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
final FieldData field = new FieldData("field", builder, terms, true, false);
final FieldData[] fields = new FieldData[] {field};
final FieldInfos fieldInfos = builder.finish();
final Directory dir = newDirectory();
Codec codec = Codec.getDefault();
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, SEGMENT, 10000, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null);
this.write(si, fieldInfos, dir, fields);
final FieldsProducer reader = codec.postingsFormat().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, newIOContext(random())));
final Iterator<String> fieldsEnum = reader.iterator();
String fieldName = fieldsEnum.next();
assertNotNull(fieldName);
final Terms terms2 = reader.terms(fieldName);
assertNotNull(terms2);
final TermsEnum termsEnum = terms2.iterator();
PostingsEnum postingsEnum = null;
for(int i=0;i<NUM_TERMS;i++) {
final BytesRef term = termsEnum.next();
assertNotNull(term);
assertEquals(terms[i].text2, term.utf8ToString());
// do this twice to stress test the codec's reuse, ie,
// make sure it properly fully resets (rewinds) its
// internal state:
for(int iter=0;iter<2;iter++) {
postingsEnum = TestUtil.docs(random(), termsEnum, postingsEnum, PostingsEnum.NONE);
assertEquals(terms[i].docs[0], postingsEnum.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
}
}
assertNull(termsEnum.next());
for(int i=0;i<NUM_TERMS;i++) {
assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(terms[i].text2)));
}
assertFalse(fieldsEnum.hasNext());
reader.close();
dir.close();
}
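// Writes NUM_FIELDS fields with random terms, cycling omitTF/storePayloads per field,
// then verifies the postings from NUM_TEST_THREADS threads concurrently.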
public void testRandomPostings() throws Throwable {
final FieldInfos.Builder builder = new FieldInfos.Builder(new FieldInfos.FieldNumbers(null));
final FieldData[] fields = new FieldData[NUM_FIELDS];
for(int i=0;i<NUM_FIELDS;i++) {
final boolean omitTF = 0==(i%3);
final boolean storePayloads = 1==(i%3);
fields[i] = new FieldData(fieldNames[i], builder, this.makeRandomTerms(omitTF, storePayloads), omitTF, storePayloads);
}
final Directory dir = newDirectory();
final FieldInfos fieldInfos = builder.finish();
if (VERBOSE) {
System.out.println("TEST: now write postings");
}
Codec codec = Codec.getDefault();
final SegmentInfo si = new SegmentInfo(dir, Version.LATEST, Version.LATEST, SEGMENT, 10000, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>(), null);
this.write(si, fieldInfos, dir, fields);
if (VERBOSE) {
System.out.println("TEST: now read postings");
}
final FieldsProducer terms = codec.postingsFormat().fieldsProducer(new SegmentReadState(dir, si, fieldInfos, newIOContext(random())));
final Verify[] threads = new Verify[NUM_TEST_THREADS-1];
for(int i=0;i<NUM_TEST_THREADS-1;i++) {
threads[i] = new Verify(si, fields, terms);
threads[i].setDaemon(true);
threads[i].start();
}
new Verify(si, fields, terms).run();
for(int i=0;i<NUM_TEST_THREADS-1;i++) {
threads[i].join();
assertFalse(threads[i].failed);
}
terms.close();
dir.close();
}
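// Re-reads the written postings and checks them against the expected FieldData;
// testRandomPostings runs several instances concurrently over the same reader.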
private static class Verify extends Thread {
final Fields termsDict;
final FieldData[] fields;
final SegmentInfo si;
volatile boolean failed;
Verify(final SegmentInfo si, final FieldData[] fields, final Fields termsDict) {
this.fields = fields;
this.termsDict = termsDict;
this.si = si;
}
@Override
public void run() {
try {
this._run();
} catch (final Throwable t) {
failed = true;
throw new RuntimeException(t);
}
}
private void verifyDocs(final int[] docs, final PositionData[][] positions, final PostingsEnum postingsEnum, final boolean doPos) throws Throwable {
for(int i=0;i<docs.length;i++) {
final int doc = postingsEnum.nextDoc();
assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS);
assertEquals(docs[i], doc);
if (doPos) {
this.verifyPositions(positions[i], postingsEnum);
}
}
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postingsEnum.nextDoc());
}
private void verifyPositions(final PositionData[] positions, final PostingsEnum posEnum) throws Throwable {
for(int i=0;i<positions.length;i++) {
final int pos = posEnum.nextPosition();
assertEquals(positions[i].pos, pos);
if (positions[i].payload != null) {
assertNotNull(posEnum.getPayload());
if (random().nextInt(3) < 2) {
// Verify the payload bytes
final BytesRef otherPayload = posEnum.getPayload();
assertTrue("expected=" + positions[i].payload.toString() + " got=" + otherPayload.toString(), positions[i].payload.equals(otherPayload));
}
} else {
assertNull(posEnum.getPayload());
}
}
}
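// Runs NUM_TEST_ITER rounds; each round picks a random field and checks full term
// enumeration, random seeks by term and by ord, seeks to non-existent terms,
// backwards seeks, and doc/position iteration with random advance() skips.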
public void _run() throws Throwable {
for(int iter=0;iter<NUM_TEST_ITER;iter++) {
final FieldData field = fields[random().nextInt(fields.length)];
final TermsEnum termsEnum = termsDict.terms(field.fieldInfo.name).iterator();
int upto = 0;
// Test straight enum of the terms:
while(true) {
final BytesRef term = termsEnum.next();
if (term == null) {
break;
}
final BytesRef expected = new BytesRef(field.terms[upto++].text2);
assertTrue("expected=" + expected + " vs actual " + term, expected.bytesEquals(term));
}
assertEquals(upto, field.terms.length);
// Test random seek:
TermData term = field.terms[random().nextInt(field.terms.length)];
TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(term.text2));
assertEquals(TermsEnum.SeekStatus.FOUND, status);
assertEquals(term.docs.length, termsEnum.docFreq());
if (field.omitTF) {
this.verifyDocs(term.docs, term.positions, TestUtil.docs(random(), termsEnum, null, PostingsEnum.NONE), false);
} else {
this.verifyDocs(term.docs, term.positions, termsEnum.postings(null, PostingsEnum.ALL), true);
}
// Test random seek by ord:
final int idx = random().nextInt(field.terms.length);
term = field.terms[idx];
boolean success = false;
try {
termsEnum.seekExact(idx);
success = true;
} catch (UnsupportedOperationException uoe) {
// ok -- skip it
}
if (success) {
// seekExact(ord) returns no status; verify the positioned term and stats directly:
assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
assertEquals(term.docs.length, termsEnum.docFreq());
if (field.omitTF) {
this.verifyDocs(term.docs, term.positions, TestUtil.docs(random(), termsEnum, null, PostingsEnum.NONE), false);
} else {
this.verifyDocs(term.docs, term.positions, termsEnum.postings(null, PostingsEnum.ALL), true);
}
}
// Test seek to non-existent terms:
if (VERBOSE) {
System.out.println("TEST: seek non-exist terms");
}
for(int i=0;i<100;i++) {
final String text2 = TestUtil.randomUnicodeString(random()) + ".";
status = termsEnum.seekCeil(new BytesRef(text2));
assertTrue(status == TermsEnum.SeekStatus.NOT_FOUND ||
status == TermsEnum.SeekStatus.END);
}
// Seek to each term, backwards:
if (VERBOSE) {
System.out.println("TEST: seek terms backwards");
}
for(int i=field.terms.length-1;i>=0;i--) {
assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(field.terms[i].text2)));
assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
}
// Seek to each term by ord, backwards
for(int i=field.terms.length-1;i>=0;i--) {
try {
termsEnum.seekExact(i);
assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
} catch (UnsupportedOperationException uoe) {
// ok -- this terms dict does not support seeking by ord
}
}
// Seek to non-existent empty-string term
status = termsEnum.seekCeil(new BytesRef(""));
assertNotNull(status);
//assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status);
// Make sure we're now pointing to first term
assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[0].text2)));
// Test docs enum
termsEnum.seekCeil(new BytesRef(""));
upto = 0;
do {
term = field.terms[upto];
if (random().nextInt(3) == 1) {
final PostingsEnum postings;
if (!field.omitTF) {
// TODO: we should randomize which postings features are available, but
// need to coordinate this with the checks below that rely on such features
postings = termsEnum.postings(null, PostingsEnum.ALL);
} else {
postings = TestUtil.docs(random(), termsEnum, null, PostingsEnum.FREQS);
}
assertNotNull(postings);
int upto2 = -1;
boolean ended = false;
while(upto2 < term.docs.length-1) {
// Maybe skip:
final int left = term.docs.length-upto2;
int doc;
if (random().nextInt(3) == 1 && left >= 1) {
final int inc = 1+random().nextInt(left-1);
upto2 += inc;
if (random().nextInt(2) == 1) {
doc = postings.advance(term.docs[upto2]);
assertEquals(term.docs[upto2], doc);
} else {
doc = postings.advance(1+term.docs[upto2]);
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
// skipped past last doc
assert upto2 == term.docs.length-1;
ended = true;
break;
} else {
// skipped to next doc
assert upto2 < term.docs.length-1;
if (doc >= term.docs[1+upto2]) {
upto2++;
}
}
}
} else {
doc = postings.nextDoc();
assertTrue(doc != DocIdSetIterator.NO_MORE_DOCS);
upto2++;
}
assertEquals(term.docs[upto2], doc);
if (!field.omitTF) {
assertEquals(term.positions[upto2].length, postings.freq());
if (random().nextInt(2) == 1) {
this.verifyPositions(term.positions[upto2], postings);
}
}
}
if (!ended) {
assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
}
}
upto++;
} while (termsEnum.next() != null);
assertEquals(upto, field.terms.length);
}
}
}
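// In-memory Fields implementation used to feed the pre-built FieldData to FieldsConsumer.write.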
private static class DataFields extends Fields {
private final FieldData[] fields;
public DataFields(FieldData[] fields) {
// already sorted:
this.fields = fields;
}
@Override
public Iterator<String> iterator() {
return new Iterator<String>() {
int upto = -1;
@Override
public boolean hasNext() {
return upto+1 < fields.length;
}
@Override
public String next() {
upto++;
return fields[upto].fieldInfo.name;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@Override
public Terms terms(String field) {
// Slow linear search:
for(FieldData fieldData : fields) {
if (fieldData.fieldInfo.name.equals(field)) {
return new DataTerms(fieldData);
}
}
return null;
}
@Override
public int size() {
return fields.length;
}
}
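// Minimal Terms over a single FieldData: supports iteration and index-option flags only;
// the stats methods are unsupported.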
private static class DataTerms extends Terms {
final FieldData fieldData;
public DataTerms(FieldData fieldData) {
this.fieldData = fieldData;
}
@Override
public TermsEnum iterator() {
return new DataTermsEnum(fieldData);
}
@Override
public long size() {
throw new UnsupportedOperationException();
}
@Override
public long getSumTotalTermFreq() {
throw new UnsupportedOperationException();
}
@Override
public long getSumDocFreq() {
throw new UnsupportedOperationException();
}
@Override
public int getDocCount() {
throw new UnsupportedOperationException();
}
@Override
public boolean hasFreqs() {
return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
}
@Override
public boolean hasOffsets() {
return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldData.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return fieldData.fieldInfo.hasPayloads();
}
}
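// TermsEnum over the sorted in-memory terms; seekCeil is a simple linear scan and
// seeking by ord is unsupported.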
private static class DataTermsEnum extends BaseTermsEnum {
final FieldData fieldData;
private int upto = -1;
public DataTermsEnum(FieldData fieldData) {
this.fieldData = fieldData;
}
@Override
public BytesRef next() {
upto++;
if (upto == fieldData.terms.length) {
return null;
}
return term();
}
@Override
public BytesRef term() {
return fieldData.terms[upto].text;
}
@Override
public SeekStatus seekCeil(BytesRef text) {
// Stupid linear impl:
for(int i=0;i<fieldData.terms.length;i++) {
int cmp = fieldData.terms[i].text.compareTo(text);
if (cmp == 0) {
upto = i;
return SeekStatus.FOUND;
} else if (cmp > 0) {
upto = i;
return SeekStatus.NOT_FOUND;
}
}
return SeekStatus.END;
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public int docFreq() {
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() {
throw new UnsupportedOperationException();
}
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) {
return new DataPostingsEnum(fieldData.terms[upto]);
}
@Override
public ImpactsEnum impacts(int flags) throws IOException {
throw new UnsupportedOperationException();
}
}
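// PostingsEnum over a single TermData's docs, positions and payloads.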
private static class DataPostingsEnum extends PostingsEnum {
final TermData termData;
int docUpto = -1;
int posUpto;
public DataPostingsEnum(TermData termData) {
this.termData = termData;
}
@Override
public long cost() {
throw new UnsupportedOperationException();
}
@Override
public int nextDoc() {
docUpto++;
if (docUpto == termData.docs.length) {
return NO_MORE_DOCS;
}
posUpto = -1;
return docID();
}
@Override
public int docID() {
return termData.docs[docUpto];
}
@Override
public int advance(int target) {
// Slow linear impl; stop at NO_MORE_DOCS instead of reading past the last doc:
int doc = nextDoc();
while (doc != NO_MORE_DOCS && doc < target) {
doc = nextDoc();
}
return doc;
}
@Override
public int freq() {
return termData.positions[docUpto].length;
}
@Override
public int nextPosition() {
posUpto++;
return termData.positions[docUpto][posUpto].pos;
}
@Override
public BytesRef getPayload() {
return termData.positions[docUpto][posUpto].payload;
}
@Override
public int startOffset() {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() {
throw new UnsupportedOperationException();
}
}
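// Writes the given fields through the codec's postings format, supplying a stub
// NormsProducer that returns norm=1 for every document.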
private void write(SegmentInfo si, final FieldInfos fieldInfos, final Directory dir, final FieldData[] fields) throws Throwable {
final Codec codec = si.getCodec();
final SegmentWriteState state = new SegmentWriteState(InfoStream.getDefault(), dir, si, fieldInfos, null, newIOContext(random()));
Arrays.sort(fields);
FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(state);
NormsProducer fakeNorms = new NormsProducer() {
@Override
public long ramBytesUsed() {
return 0;
}
@Override
public void close() throws IOException {}
@Override
public NumericDocValues getNorms(FieldInfo field) throws IOException {
return new NumericDocValues() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return si.maxDoc();
}
@Override
public int advance(int target) throws IOException {
if (target >= si.maxDoc()) {
return doc = NO_MORE_DOCS;
} else {
return doc = target;
}
}
@Override
public boolean advanceExact(int target) throws IOException {
doc = target;
return true;
}
@Override
public long longValue() throws IOException {
return 1;
}
};
}
@Override
public void checkIntegrity() throws IOException {}
};
boolean success = false;
try {
consumer.write(new DataFields(fields), fakeNorms);
success = true;
} finally {
if (success) {
IOUtils.close(consumer);
} else {
IOUtils.closeWhileHandlingException(consumer);
}
}
}
public void testDocsOnlyFreq() throws Exception {
// tests that when fields are indexed with DOCS_ONLY, the Codec
// returns 1 in docsEnum.freq()
Directory dir = newDirectory();
Random random = random();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random)));
// we don't need many documents to assert this, but don't use one document either
int numDocs = atLeast(random, 50);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(new StringField("f", "doc", Store.NO));
writer.addDocument(doc);
}
writer.close();
Term term = new Term("f", new BytesRef("doc"));
DirectoryReader reader = DirectoryReader.open(dir);
for (LeafReaderContext ctx : reader.leaves()) {
PostingsEnum de = ctx.reader().postings(term);
assertNotNull(de);
while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
assertEquals("wrong freq for doc " + de.docID(), 1, de.freq());
}
}
reader.close();
dir.close();
}
}