package org.apache.lucene.index;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.DocTermOrds.TermOrdsIterator;
import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CoreCodecProvider;
import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.FixedGapTermsIndexReader;
import org.apache.lucene.index.codecs.FixedGapTermsIndexWriter;
import org.apache.lucene.index.codecs.PerDocConsumer;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
// TODO:
// - test w/ del docs
// - test prefix
// - test w/ cutoff
// - crank docs way up so we get some merging sometimes
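/**
 * Tests {@link DocTermOrds}, which uninverts a multi-valued indexed
 * field into the sorted term ordinals seen in each document.  The
 * usage pattern exercised by these tests:
 *
 * <pre>
 *   DocTermOrds dto = new DocTermOrds(reader, "field");
 *   TermOrdsIterator iter = dto.lookup(docID, null);  // null allocates a new iterator
 *   int[] buffer = new int[5];
 *   int count = iter.read(buffer);                    // ords, in increasing order
 * </pre>
 */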
public class TestDocTermOrds extends LuceneTestCase {
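  // A tiny hand-checkable case: three docs over terms a..f, so every
  // expected ord can be verified by inspection.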
  public void testSimple() throws Exception {
    Directory dir = newDirectory();
    final RandomIndexWriter w = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    Document doc = new Document();
    Field field = newField("field", "", Field.Index.ANALYZED);
    doc.add(field);

    field.setValue("a b c");
    w.addDocument(doc);

    field.setValue("d e f");
    w.addDocument(doc);

    field.setValue("a f");
    w.addDocument(doc);

    final IndexReader r = w.getReader();
    w.close();

    final DocTermOrds dto = new DocTermOrds(r, "field");
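    // The term dictionary sorts to a=0, b=1, c=2, d=3, e=4, f=5, so each
    // doc's expected ords follow directly from the terms it was given.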
    TermOrdsIterator iter = dto.lookup(0, null);
    final int[] buffer = new int[5];
    assertEquals(3, iter.read(buffer));
    assertEquals(0, buffer[0]);
    assertEquals(1, buffer[1]);
    assertEquals(2, buffer[2]);

    iter = dto.lookup(1, iter);
    assertEquals(3, iter.read(buffer));
    assertEquals(3, buffer[0]);
    assertEquals(4, buffer[1]);
    assertEquals(5, buffer[2]);

    iter = dto.lookup(2, iter);
    assertEquals(2, iter.read(buffer));
    assertEquals(0, buffer[0]);
    assertEquals(5, buffer[1]);

    r.close();
    dir.close();
  }
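  /**
   * Codec that pairs the standard postings with a fixed-gap terms index,
   * so TermsEnum.ord() is supported.  When the random tests swap this
   * codec in, DocTermOrds can use the ord-capable enum directly instead
   * of going through its ord wrapper.
   */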
  private static class StandardCodecWithOrds extends Codec {

    public StandardCodecWithOrds() {
      name = "StandardOrds";
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
      PostingsWriterBase docs = new StandardPostingsWriter(state);

      // TODO: should we make the terms index more easily
      // pluggable?  Ie so that this codec would record which
      // index impl was used, and switch on loading?
      // Or... you must make a new Codec for this?
      TermsIndexWriterBase indexWriter;
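      // If any later component fails to open, close the ones already
      // opened so file handles are not leaked.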
      boolean success = false;
      try {
        indexWriter = new FixedGapTermsIndexWriter(state);
        success = true;
      } finally {
        if (!success) {
          docs.close();
        }
      }

      success = false;
      try {
        FieldsConsumer ret = new BlockTermsWriter(indexWriter, state, docs);
        success = true;
        return ret;
      } finally {
        if (!success) {
          try {
            docs.close();
          } finally {
            indexWriter.close();
          }
        }
      }
    }
    public final static int TERMS_CACHE_SIZE = 1024;
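    // Read side: open the same components as fieldsConsumer, with the
    // same close-on-failure handling.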
    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
      PostingsReaderBase postings = new StandardPostingsReader(state.dir, state.segmentInfo, state.readBufferSize, state.codecId);
      TermsIndexReaderBase indexReader;

      boolean success = false;
      try {
        indexReader = new FixedGapTermsIndexReader(state.dir,
                                                   state.fieldInfos,
                                                   state.segmentInfo.name,
                                                   state.termsIndexDivisor,
                                                   BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                   state.codecId);
        success = true;
      } finally {
        if (!success) {
          postings.close();
        }
      }

      success = false;
      try {
        FieldsProducer ret = new BlockTermsReader(indexReader,
                                                  state.dir,
                                                  state.fieldInfos,
                                                  state.segmentInfo.name,
                                                  postings,
                                                  state.readBufferSize,
                                                  TERMS_CACHE_SIZE,
                                                  state.codecId);
        success = true;
        return ret;
      } finally {
        if (!success) {
          try {
            postings.close();
          } finally {
            indexReader.close();
          }
        }
      }
    }
    /** Extension of freq postings file */
    static final String FREQ_EXTENSION = "frq";

    /** Extension of prox postings file */
    static final String PROX_EXTENSION = "prx";

    @Override
    public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
      StandardPostingsReader.files(dir, segmentInfo, id, files);
      BlockTermsReader.files(dir, segmentInfo, id, files);
      FixedGapTermsIndexReader.files(dir, segmentInfo, id, files);
      DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
    }

    @Override
    public void getExtensions(Set<String> extensions) {
      getStandardExtensions(extensions);
      DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
    }

    public static void getStandardExtensions(Set<String> extensions) {
      extensions.add(FREQ_EXTENSION);
      extensions.add(PROX_EXTENSION);
      BlockTermsReader.getExtensions(extensions);
      FixedGapTermsIndexReader.getIndexExtensions(extensions);
    }

    @Override
    public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
      return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
    }

    @Override
    public PerDocValues docsProducer(SegmentReadState state) throws IOException {
      return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
    }
  }
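  /**
   * Indexes NUM_DOCS docs, each holding a random subset of NUM_TERMS
   * random terms, then verifies DocTermOrds against the per-doc ords
   * computed up front from the sorted term array.
   */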
  public void testRandom() throws Exception {
    MockDirectoryWrapper dir = newDirectory();

    final int NUM_TERMS = atLeast(20);
    final Set<BytesRef> terms = new HashSet<BytesRef>();
    while(terms.size() < NUM_TERMS) {
      final String s = _TestUtil.randomRealisticUnicodeString(random);
      //final String s = _TestUtil.randomSimpleString(random);
      if (s.length() > 0) {
        terms.add(new BytesRef(s));
      }
    }
    final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
    Arrays.sort(termsArray);

    final int NUM_DOCS = atLeast(100);

    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));

    // Sometimes swap in codec that impls ord():
    if (random.nextInt(10) == 7) {
      // Make sure terms index has ords:
      CoreCodecProvider cp = new CoreCodecProvider();
      cp.register(new StandardCodecWithOrds());
      cp.setDefaultFieldCodec("StandardOrds");

      // So checkIndex on close works
      dir.setCodecProvider(cp);
      conf.setCodecProvider(cp);
    }

    final RandomIndexWriter w = new RandomIndexWriter(random, dir, conf);
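    // idToOrds[id] holds the sorted ords (indexes into termsArray) that
    // doc id is expected to return.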
    final int[][] idToOrds = new int[NUM_DOCS][];
    final Set<Integer> ordsForDocSet = new HashSet<Integer>();

    for(int id=0;id<NUM_DOCS;id++) {
      Document doc = new Document();

      NumericField idField = new NumericField("id");
      doc.add(idField.setIntValue(id));

      final int termCount = _TestUtil.nextInt(random, 0, 20*RANDOM_MULTIPLIER);
      while(ordsForDocSet.size() < termCount) {
        ordsForDocSet.add(random.nextInt(termsArray.length));
      }
      final int[] ordsForDoc = new int[termCount];
      int upto = 0;
      if (VERBOSE) {
        System.out.println("TEST: doc id=" + id);
      }
      for(int ord : ordsForDocSet) {
        ordsForDoc[upto++] = ord;
        Field field = newField("field", termsArray[ord].utf8ToString(), Field.Index.NOT_ANALYZED_NO_NORMS);
        if (VERBOSE) {
          System.out.println(" f=" + termsArray[ord].utf8ToString());
        }
        doc.add(field);
      }
      ordsForDocSet.clear();
      Arrays.sort(ordsForDoc);
      idToOrds[id] = ordsForDoc;
      w.addDocument(doc);
    }
    final IndexReader r = w.getReader();
    w.close();

    if (VERBOSE) {
      System.out.println("TEST: reader=" + r);
    }

    for(IndexReader subR : r.getSequentialSubReaders()) {
      if (VERBOSE) {
        System.out.println("\nTEST: sub=" + subR);
      }
      verify(subR, idToOrds, termsArray, null);
    }

    // Also test top-level reader: its enum does not support
    // ord, so this forces the OrdWrapper to run:
    if (VERBOSE) {
      System.out.println("TEST: top reader");
    }
    verify(r, idToOrds, termsArray, null);

    FieldCache.DEFAULT.purge(r);

    r.close();
    dir.close();
  }
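  /**
   * Like testRandom, but every term starts with one of a few random
   * prefixes, and verification runs once per prefix against the subset
   * of each doc's ords whose terms carry that prefix.
   */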
  public void testRandomWithPrefix() throws Exception {
    MockDirectoryWrapper dir = newDirectory();

    final Set<String> prefixes = new HashSet<String>();
    final int numPrefix = _TestUtil.nextInt(random, 2, 7);
    if (VERBOSE) {
      System.out.println("TEST: use " + numPrefix + " prefixes");
    }
    while(prefixes.size() < numPrefix) {
      prefixes.add(_TestUtil.randomRealisticUnicodeString(random));
      //prefixes.add(_TestUtil.randomSimpleString(random));
    }
    final String[] prefixesArray = prefixes.toArray(new String[prefixes.size()]);

    final int NUM_TERMS = atLeast(20);
    final Set<BytesRef> terms = new HashSet<BytesRef>();
    while(terms.size() < NUM_TERMS) {
      final String s = prefixesArray[random.nextInt(prefixesArray.length)] + _TestUtil.randomRealisticUnicodeString(random);
      //final String s = prefixesArray[random.nextInt(prefixesArray.length)] + _TestUtil.randomSimpleString(random);
      if (s.length() > 0) {
        terms.add(new BytesRef(s));
      }
    }
    final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
    Arrays.sort(termsArray);

    final int NUM_DOCS = atLeast(100);

    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));

    // Sometimes swap in codec that impls ord():
    if (random.nextInt(10) == 7) {
      // Make sure terms index has ords:
      CoreCodecProvider cp = new CoreCodecProvider();
      cp.register(new StandardCodecWithOrds());
      cp.setDefaultFieldCodec("StandardOrds");

      // So checkIndex on close works
      dir.setCodecProvider(cp);
      conf.setCodecProvider(cp);
    }

    final RandomIndexWriter w = new RandomIndexWriter(random, dir, conf);

    final int[][] idToOrds = new int[NUM_DOCS][];
    final Set<Integer> ordsForDocSet = new HashSet<Integer>();

    for(int id=0;id<NUM_DOCS;id++) {
      Document doc = new Document();

      NumericField idField = new NumericField("id");
      doc.add(idField.setIntValue(id));

      final int termCount = _TestUtil.nextInt(random, 0, 20*RANDOM_MULTIPLIER);
      while(ordsForDocSet.size() < termCount) {
        ordsForDocSet.add(random.nextInt(termsArray.length));
      }
      final int[] ordsForDoc = new int[termCount];
      int upto = 0;
      if (VERBOSE) {
        System.out.println("TEST: doc id=" + id);
      }
      for(int ord : ordsForDocSet) {
        ordsForDoc[upto++] = ord;
        Field field = newField("field", termsArray[ord].utf8ToString(), Field.Index.NOT_ANALYZED_NO_NORMS);
        if (VERBOSE) {
          System.out.println(" f=" + termsArray[ord].utf8ToString());
        }
        doc.add(field);
      }
      ordsForDocSet.clear();
      Arrays.sort(ordsForDoc);
      idToOrds[id] = ordsForDoc;
      w.addDocument(doc);
    }

    final IndexReader r = w.getReader();
    w.close();

    if (VERBOSE) {
      System.out.println("TEST: reader=" + r);
    }
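    // For each prefix, filter each doc's expected ords down to the terms
    // carrying that prefix, then verify a prefix-restricted DocTermOrds.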
    for(String prefix : prefixesArray) {
      final BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix);

      final int[][] idToOrdsPrefix = new int[NUM_DOCS][];
      for(int id=0;id<NUM_DOCS;id++) {
        final int[] docOrds = idToOrds[id];
        final List<Integer> newOrds = new ArrayList<Integer>();
        for(int ord : docOrds) {
          if (termsArray[ord].startsWith(prefixRef)) {
            newOrds.add(ord);
          }
        }
        final int[] newOrdsArray = new int[newOrds.size()];
        int upto = 0;
        for(int ord : newOrds) {
          newOrdsArray[upto++] = ord;
        }
        idToOrdsPrefix[id] = newOrdsArray;
      }
      for(IndexReader subR : r.getSequentialSubReaders()) {
        if (VERBOSE) {
          System.out.println("\nTEST: sub=" + subR);
        }
        verify(subR, idToOrdsPrefix, termsArray, prefixRef);
      }

      // Also test top-level reader: its enum does not support
      // ord, so this forces the OrdWrapper to run:
      if (VERBOSE) {
        System.out.println("TEST: top reader");
      }
      verify(r, idToOrdsPrefix, termsArray, prefixRef);
    }

    FieldCache.DEFAULT.purge(r);

    r.close();
    dir.close();
  }
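  /**
   * Builds a DocTermOrds over the given reader (restricted to prefixRef
   * when non-null) and checks that, for every doc, the ords it returns
   * seek to exactly the expected terms.
   */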
  private void verify(IndexReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception {
    final DocTermOrds dto = new DocTermOrds(r,
                                            "field",
                                            prefixRef,
                                            Integer.MAX_VALUE,
                                            _TestUtil.nextInt(random, 2, 10));

    final int[] docIDToID = FieldCache.DEFAULT.getInts(r, "id");
    /*
      for(int docID=0;docID<r.maxDoc();docID++) {
        System.out.println("  docID=" + docID + " id=" + docIDToID[docID]);
      }
    */
    if (VERBOSE) {
      System.out.println("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString()));
      System.out.println("TEST: all TERMS:");
      TermsEnum allTE = MultiFields.getTerms(r, "field").iterator();
      int ord = 0;
      while(allTE.next() != null) {
        System.out.println(" ord=" + (ord++) + " term=" + allTE.term().utf8ToString());
      }
    }
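    // A null enum means DocTermOrds saw no terms: either the field is
    // entirely absent, or no term starts with the prefix.  Confirm that
    // against the index and bail out.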
    //final TermsEnum te = subR.fields().terms("field").iterator();
    final TermsEnum te = dto.getOrdTermsEnum(r);
    if (te == null) {
      if (prefixRef == null) {
        assertNull(MultiFields.getTerms(r, "field"));
      } else {
        Terms terms = MultiFields.getTerms(r, "field");
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator();
          TermsEnum.SeekStatus result = termsEnum.seek(prefixRef, false);
          if (result != TermsEnum.SeekStatus.END) {
            assertFalse("term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), termsEnum.term().startsWith(prefixRef));
          } else {
            // ok
          }
        } else {
          // ok
        }
      }
      return;
    }
    if (VERBOSE) {
      System.out.println("TEST: TERMS:");
      te.seek(0);
      while(true) {
        System.out.println(" ord=" + te.ord() + " term=" + te.term().utf8ToString());
        if (te.next() == null) {
          break;
        }
      }
    }
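    // Walk every doc: pull its ords in chunks of buffer.length; a short
    // read signals the last chunk.  Each ord must seek (by ord) to the
    // expected term.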
    TermOrdsIterator iter = null;
    final int[] buffer = new int[5];
    for(int docID=0;docID<r.maxDoc();docID++) {
      if (VERBOSE) {
        System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID[docID] + ")");
      }
      iter = dto.lookup(docID, iter);
      final int[] answers = idToOrds[docIDToID[docID]];
      int upto = 0;
      while(true) {
        final int chunk = iter.read(buffer);
        for(int idx=0;idx<chunk;idx++) {
          assertEquals(TermsEnum.SeekStatus.FOUND, te.seek((long) buffer[idx]));
          final BytesRef expected = termsArray[answers[upto++]];
          if (VERBOSE) {
            System.out.println("  exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString());
          }
          assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + buffer[idx], expected, te.term());
        }

        if (chunk < buffer.length) {
          assertEquals(answers.length, upto);
          break;
        }
      }
    }
  }
}