package org.apache.lucene.index.values;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.IndexDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiPerDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.index.values.IndexDocValues.Source;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FloatsRef;
import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util._TestUtil;
import org.junit.Before;
/**
* Tests DocValues integration into IndexWriter and codecs.
*/
public class TestDocValuesIndexing extends LuceneTestCase {
/*
* - add test for unoptimized case with deletes
* - add multithreaded tests / integrate into stress indexing?
*/
@Before
public void setUp() throws Exception {
super.setUp();
assumeFalse("cannot work with preflex codec", CodecProvider.getDefault().getDefaultFieldCodec().equals("PreFlex"));
}
/*
* Simple test case to show how to use the API
*/
public void testDocValuesSimple() throws CorruptIndexException, IOException,
ParseException {
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, writerConfig(false));
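// add five documents, each with an indexed "docId" field and a matching
// int doc value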
for (int i = 0; i < 5; i++) {
Document doc = new Document();
IndexDocValuesField valuesField = new IndexDocValuesField("docId");
valuesField.setInt(i);
doc.add(valuesField);
doc.add(new Field("docId", "" + i, Store.NO, Index.ANALYZED));
writer.addDocument(doc);
}
writer.commit();
writer.optimize(true);
writer.close(true);
IndexReader reader = IndexReader.open(dir, null, true, 1);
assertTrue(reader.isOptimized());
IndexSearcher searcher = new IndexSearcher(reader);
QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "docId",
new MockAnalyzer(random));
TopDocs search = searcher.search(parser.parse("0 OR 1 OR 2 OR 3 OR 4"), 10);
assertEquals(5, search.totalHits);
ScoreDoc[] scoreDocs = search.scoreDocs;
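// read the doc values for the "docId" field through a random-access Source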
IndexDocValues docValues = MultiPerDocValues.getPerDocs(reader).docValues("docId");
Source source = docValues.getSource();
for (int i = 0; i < scoreDocs.length; i++) {
assertEquals(i, scoreDocs[i].doc);
assertEquals(i, source.getInt(scoreDocs[i].doc));
}
reader.close();
dir.close();
}
/**
* Tests complete indexing of {@link ValueType} values, including deletions,
* merging, and sparse value fields, with compound files enabled.
*/
public void testIndexBytesNoDeletesCFS() throws IOException {
runTestIndexBytes(writerConfig(true), false);
}
public void testIndexBytesDeletesCFS() throws IOException {
runTestIndexBytes(writerConfig(true), true);
}
public void testIndexNumericsNoDeletesCFS() throws IOException {
runTestNumerics(writerConfig(true), false);
}
public void testIndexNumericsDeletesCFS() throws IOException {
runTestNumerics(writerConfig(true), true);
}
/**
* Tests complete indexing of {@link ValueType} values, including deletions,
* merging, and sparse value fields, with compound files disabled.
*/
public void testIndexBytesNoDeletes() throws IOException {
runTestIndexBytes(writerConfig(false), false);
}
public void testIndexBytesDeletes() throws IOException {
runTestIndexBytes(writerConfig(false), true);
}
public void testIndexNumericsNoDeletes() throws IOException {
runTestNumerics(writerConfig(false), false);
}
public void testIndexNumericsDeletes() throws IOException {
runTestNumerics(writerConfig(false), true);
}
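/**
* Tests that doc values survive {@link IndexWriter#addIndexes}: two indexes
* with different randomly chosen {@link ValueType}s are merged, either via
* directories or via readers, and the merged values are verified against
* the original ones.
*/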
public void testAddIndexes() throws IOException {
int valuesPerIndex = 10;
List<ValueType> values = Arrays.asList(ValueType.values());
Collections.shuffle(values, random);
ValueType first = values.get(0);
ValueType second = values.get(1);
String msg = "[first=" + first.name() + ", second=" + second.name() + "]";
// build the first index
Directory d_1 = newDirectory();
IndexWriter w_1 = new IndexWriter(d_1, writerConfig(random.nextBoolean()));
indexValues(w_1, valuesPerIndex, first, values, false, 7);
w_1.commit();
assertEquals(valuesPerIndex, w_1.maxDoc());
_TestUtil.checkIndex(d_1, w_1.getConfig().getCodecProvider());
// build the second index
Directory d_2 = newDirectory();
IndexWriter w_2 = new IndexWriter(d_2, writerConfig(random.nextBoolean()));
indexValues(w_2, valuesPerIndex, second, values, false, 7);
w_2.commit();
assertEquals(valuesPerIndex, w_2.maxDoc());
_TestUtil.checkIndex(d_2, w_2.getConfig().getCodecProvider());
Directory target = newDirectory();
IndexWriter w = new IndexWriter(target, writerConfig(random.nextBoolean()));
IndexReader r_1 = IndexReader.open(w_1, true);
IndexReader r_2 = IndexReader.open(w_2, true);
if (random.nextBoolean()) {
w.addIndexes(d_1, d_2);
} else {
w.addIndexes(r_1, r_2);
}
w.optimize(true);
w.commit();
_TestUtil.checkIndex(target, w.getConfig().getCodecProvider());
assertEquals(valuesPerIndex * 2, w.maxDoc());
// check values
IndexReader merged = IndexReader.open(w, true);
ValuesEnum vE_1 = getValuesEnum(getDocValues(r_1, first.name()));
ValuesEnum vE_2 = getValuesEnum(getDocValues(r_2, second.name()));
ValuesEnum vE_1_merged = getValuesEnum(getDocValues(merged, first.name()));
ValuesEnum vE_2_merged = getValuesEnum(getDocValues(merged, second.name()));
switch (second) { // these variants don't advance over missing values
case BYTES_FIXED_STRAIGHT:
case BYTES_VAR_STRAIGHT:
case FLOAT_32:
case FLOAT_64:
case INTS:
assertEquals(msg, valuesPerIndex-1, vE_2_merged.advance(valuesPerIndex-1));
}
for (int i = 0; i < valuesPerIndex; i++) {
assertEquals(msg, i, vE_1.nextDoc());
assertEquals(msg, i, vE_1_merged.nextDoc());
assertEquals(msg, i, vE_2.nextDoc());
assertEquals(msg, i + valuesPerIndex, vE_2_merged.nextDoc());
}
assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_1.nextDoc());
assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_2.nextDoc());
assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_1_merged.advance(valuesPerIndex*2));
assertEquals(msg, ValuesEnum.NO_MORE_DOCS, vE_2_merged.nextDoc());
// close resources
r_1.close();
r_2.close();
merged.close();
w_1.close(true);
w_2.close(true);
w.close(true);
d_1.close();
d_2.close();
target.close();
}
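/**
* Creates a fresh {@link IndexWriterConfig} using a {@link LogDocMergePolicy}
* that either uses or avoids compound files, as requested.
*/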
private IndexWriterConfig writerConfig(boolean useCompoundFile) {
final IndexWriterConfig cfg = newIndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer(random));
LogMergePolicy policy = new LogDocMergePolicy();
cfg.setMergePolicy(policy);
policy.setUseCompoundFile(useCompoundFile);
return cfg;
}
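/**
* Indexes all numeric {@link ValueType}s in random order, optionally with
* deletions, and verifies random access via {@link Source} as well as
* sequential access via {@link ValuesEnum}, including the zero-filled slots
* of documents that have no value for the current field.
*/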
public void runTestNumerics(IndexWriterConfig cfg, boolean withDeletions)
throws IOException {
Directory d = newDirectory();
IndexWriter w = new IndexWriter(d, cfg);
final int numValues = 179 + random.nextInt(151);
final List<ValueType> numVariantList = new ArrayList<ValueType>(NUMERICS);
// run in random order to test if fill works correctly during merges
Collections.shuffle(numVariantList, random);
for (ValueType val : numVariantList) {
OpenBitSet deleted = indexValues(w, numValues, val, numVariantList,
withDeletions, 7);
List<Closeable> closeables = new ArrayList<Closeable>();
IndexReader r = IndexReader.open(w, true);
final int numRemainingValues = (int) (numValues - deleted.cardinality());
final int base = r.numDocs() - numRemainingValues;
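// docs below the base were added in earlier iterations for other value
// types; they carry no value for this field and must read as 0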
switch (val) {
case INTS: {
IndexDocValues intsReader = getDocValues(r, val.name());
assertNotNull(intsReader);
Source ints = getSource(intsReader);
for (int i = 0; i < base; i++) {
long value = ints.getInt(i);
assertEquals("index " + i, 0, value);
}
ValuesEnum intsEnum = getValuesEnum(intsReader);
assertTrue(intsEnum.advance(base) >= base);
intsEnum = getValuesEnum(intsReader);
LongsRef enumRef = intsEnum.getInt();
int expected = 0;
for (int i = base; i < r.numDocs(); i++, expected++) {
while (deleted.get(expected)) {
expected++;
}
assertEquals("advance failed at index: " + i + " of " + r.numDocs()
+ " docs", i, intsEnum.advance(i));
assertEquals(expected, ints.getInt(i));
assertEquals(expected, enumRef.get());
}
}
break;
case FLOAT_32:
case FLOAT_64: {
IndexDocValues floatReader = getDocValues(r, val.name());
assertNotNull(floatReader);
Source floats = getSource(floatReader);
for (int i = 0; i < base; i++) {
double value = floats.getFloat(i);
assertEquals(val + " failed for doc: " + i + " base: " + base,
0.0d, value, 0.0d);
}
ValuesEnum floatEnum = getValuesEnum(floatReader);
assertTrue(floatEnum.advance(base) >= base);
floatEnum = getValuesEnum(floatReader);
FloatsRef enumRef = floatEnum.getFloat();
int expected = 0;
for (int i = base; i < r.numDocs(); i++, expected++) {
while (deleted.get(expected)) {
expected++;
}
assertEquals("advance failed at index: " + i + " of " + r.numDocs()
+ " docs base:" + base, i, floatEnum.advance(i));
assertEquals(floatEnum.getClass() + " index " + i, 2.0 * expected,
enumRef.get(), 0.00001);
assertEquals("index " + i, 2.0 * expected, floats.getFloat(i),
0.00001);
}
}
break;
default:
fail("unexpected value " + val);
}
closeables.add(r);
for (Closeable toClose : closeables) {
toClose.close();
}
}
w.close();
d.close();
}
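/**
* Indexes all bytes {@link ValueType}s in random order, optionally with
* deletions, and verifies both the filled default slots and the actual
* payloads via {@link Source} and {@link ValuesEnum}.
*/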
public void runTestIndexBytes(IndexWriterConfig cfg, boolean withDeletions)
throws CorruptIndexException, LockObtainFailedException, IOException {
final Directory d = newDirectory();
IndexWriter w = new IndexWriter(d, cfg);
final List<ValueType> byteVariantList = new ArrayList<ValueType>(BYTES);
// run in random order to test if fill works correctly during merges
Collections.shuffle(byteVariantList, random);
final int numValues = 179 + random.nextInt(151);
for (ValueType byteIndexValue : byteVariantList) {
List<Closeable> closeables = new ArrayList<Closeable>();
int bytesSize = 7 + random.nextInt(128);
OpenBitSet deleted = indexValues(w, numValues, byteIndexValue,
byteVariantList, withDeletions, bytesSize);
final IndexReader r = IndexReader.open(w, withDeletions);
assertEquals(0, r.numDeletedDocs());
final int numRemainingValues = (int) (numValues - deleted.cardinality());
final int base = r.numDocs() - numRemainingValues;
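// docs below the base were added in earlier iterations for other value
// types and hold no bytes for this field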
IndexDocValues bytesReader = getDocValues(r, byteIndexValue.name());
assertNotNull("field " + byteIndexValue.name()
+ " returned null reader - maybe merged failed", bytesReader);
Source bytes = getSource(bytesReader);
byte upto = 0;
// test the filled up slots for correctness
for (int i = 0; i < base; i++) {
BytesRef br = bytes.getBytes(i, new BytesRef());
String msg = " field: " + byteIndexValue.name() + " at index: " + i
+ " base: " + base + " numDocs:" + r.numDocs();
switch (byteIndexValue) {
case BYTES_VAR_STRAIGHT:
case BYTES_FIXED_STRAIGHT:
// straight variants return a BytesRef of all-zero bytes, either empty
// or of the fixed length
assertNotNull("expected non-null - " + msg, br);
if (br.length != 0) {
assertEquals("expected zero bytes of length " + bytesSize + " - "
+ msg, bytesSize, br.length);
for (int j = 0; j < br.length; j++) {
assertEquals("Byte at index " + j + " doesn't match - " + msg, 0,
br.bytes[br.offset + j]);
}
}
break;
case BYTES_VAR_SORTED:
case BYTES_FIXED_SORTED:
case BYTES_VAR_DEREF:
case BYTES_FIXED_DEREF:
default:
assertNotNull("expected none null - " + msg, br);
assertEquals(0, br.length);
// make sure we advance at least until base
ValuesEnum bytesEnum = getValuesEnum(bytesReader);
final int advancedTo = bytesEnum.advance(0);
assertTrue(byteIndexValue.name() + " advance failed base:" + base
+ " advancedTo: " + advancedTo, base <= advancedTo);
}
}
ValuesEnum bytesEnum = getValuesEnum(bytesReader);
final BytesRef enumRef = bytesEnum.bytes();
// test the actual doc values added in this iteration
assertEquals(base + numRemainingValues, r.numDocs());
int v = 0;
for (int i = base; i < r.numDocs(); i++) {
String msg = " field: " + byteIndexValue.name() + " at index: " + i
+ " base: " + base + " numDocs:" + r.numDocs() + " bytesSize: "
+ bytesSize + " src: " + bytes;
while (withDeletions && deleted.get(v++)) {
upto += bytesSize;
}
BytesRef br = bytes.getBytes(i, new BytesRef());
if (bytesEnum.docID() != i) {
assertEquals("seek failed for index " + i + " " + msg, i, bytesEnum
.advance(i));
}
for (int j = 0; j < br.length; j++, upto++) {
assertTrue(" enumRef not initialized " + msg,
enumRef.bytes.length > 0);
assertEquals(
"EnumRef Byte at index " + j + " doesn't match - " + msg, upto,
enumRef.bytes[enumRef.offset + j]);
if (!(br.bytes.length > br.offset + j)) {
br = bytes.getBytes(i, new BytesRef());
}
assertTrue("BytesRef index exceeded [" + msg + "] offset: "
+ br.offset + " length: " + br.length + " index: "
+ (br.offset + j), br.bytes.length > br.offset + j);
assertEquals("SourceRef Byte at index " + j + " doesn't match - "
+ msg, upto, br.bytes[br.offset + j]);
}
}
// clean up
closeables.add(r);
for (Closeable toClose : closeables) {
toClose.close();
}
}
w.close();
d.close();
}
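/**
* Resolves the {@link IndexDocValues} for the given field through
* {@link PerDocValues} (per segment or via {@link MultiPerDocValues}) or,
* on an optimized index, directly from the single sub reader, chosen at
* random.
*/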
private IndexDocValues getDocValues(IndexReader reader, String field)
throws IOException {
boolean optimized = reader.isOptimized();
PerDocValues perDoc = optimized ? reader.getSequentialSubReaders()[0].perDocValues()
: MultiPerDocValues.getPerDocs(reader);
switch (random.nextInt(optimized ? 3 : 2)) { // case 2 only if optimized
case 0:
return perDoc.docValues(field);
case 1:
IndexDocValues docValues = perDoc.docValues(field);
if (docValues != null) {
return docValues;
}
throw new RuntimeException("no such field " + field);
case 2: // this only works on an optimized index!
return reader.getSequentialSubReaders()[0].docValues(field);
}
throw new RuntimeException();
}
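/**
* Returns a random-access {@link Source} for the given values, occasionally
* bypassing the internal cache via {@link IndexDocValues#load()}.
*/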
private Source getSource(IndexDocValues values) throws IOException {
Source source;
if (random.nextInt(10) == 0) {
source = values.load();
} else {
// getSource uses cache internally
source = values.getSource();
}
assertNotNull(source);
return source;
}
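/**
* Returns a {@link ValuesEnum}, occasionally obtained through a cached
* {@link Source} instead of directly from the values; the {@link Source}
* path is not yet supported for {@link MultiIndexDocValues}.
*/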
private ValuesEnum getValuesEnum(IndexDocValues values) throws IOException {
ValuesEnum valuesEnum;
if (!(values instanceof MultiIndexDocValues) && random.nextInt(10) == 0) {
// TODO not supported by MultiDocValues yet!
valuesEnum = getSource(values).getEnum();
} else {
valuesEnum = values.getEnum();
}
assertNotNull(valuesEnum);
return valuesEnum;
}
private static EnumSet<ValueType> BYTES = EnumSet.of(ValueType.BYTES_FIXED_DEREF,
ValueType.BYTES_FIXED_SORTED, ValueType.BYTES_FIXED_STRAIGHT, ValueType.BYTES_VAR_DEREF,
ValueType.BYTES_VAR_SORTED, ValueType.BYTES_VAR_STRAIGHT);
private static EnumSet<ValueType> NUMERICS = EnumSet.of(ValueType.INTS,
ValueType.FLOAT_32, ValueType.FLOAT_64);
private static Index[] IDX_VALUES = new Index[] { Index.ANALYZED,
Index.ANALYZED_NO_NORMS, Index.NOT_ANALYZED, Index.NOT_ANALYZED_NO_NORMS,
Index.NO };
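/**
* Adds numValues documents for the given {@link ValueType}: numeric variants
* store i (INTS) or 2*i (floats), bytes variants store multOfSeven bytes
* taken from a running counter. Optionally deletes documents along the way
* and returns a bit set of the deleted ids that belong to the current type.
*/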
private OpenBitSet indexValues(IndexWriter w, int numValues, ValueType value,
List<ValueType> valueVarList, boolean withDeletions, int multOfSeven)
throws CorruptIndexException, IOException {
final boolean isNumeric = NUMERICS.contains(value);
OpenBitSet deleted = new OpenBitSet(numValues);
Document doc = new Document();
Index idx = IDX_VALUES[random.nextInt(IDX_VALUES.length)];
AbstractField field = random.nextBoolean() ? new IndexDocValuesField(value.name())
: newField(value.name(), _TestUtil.randomRealisticUnicodeString(random,
10), idx == Index.NO ? Store.YES : Store.NO, idx);
doc.add(field);
IndexDocValuesField valField = new IndexDocValuesField("prototype");
final BytesRef bytesRef = new BytesRef();
final String idBase = value.name() + "_";
final byte[] b = new byte[multOfSeven];
bytesRef.bytes = b;
bytesRef.length = b.length;
bytesRef.offset = 0;
byte upto = 0;
for (int i = 0; i < numValues; i++) {
if (isNumeric) {
switch (value) {
case INTS:
valField.setInt(i);
break;
case FLOAT_32:
valField.setFloat(2.0f * i);
break;
case FLOAT_64:
valField.setFloat(2.0d * i);
break;
default:
fail("unexpected value " + value);
}
} else {
for (int j = 0; j < b.length; j++) {
b[j] = upto++;
}
valField.setBytes(bytesRef, value);
}
doc.removeFields("id");
doc.add(new Field("id", idBase + i, Store.YES,
Index.NOT_ANALYZED_NO_NORMS));
valField.set(field);
w.addDocument(doc);
if (i % 7 == 0) {
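// occasionally delete a random, previously added document; only deletions
// that hit the current value type are recorded in the returned bit set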
if (withDeletions && random.nextBoolean()) {
ValueType val = valueVarList.get(random.nextInt(1 + valueVarList
.indexOf(value)));
final int randInt = val == value ? random.nextInt(1 + i) : random
.nextInt(numValues);
w.deleteDocuments(new Term("id", val.name() + "_" + randInt));
if (val == value) {
deleted.set(randInt);
}
}
if (random.nextInt(10) == 0) {
w.commit();
}
}
}
w.commit();
// TODO test unoptimized with deletions
if (withDeletions || random.nextBoolean()) {
w.optimize(true);
}
return deleted;
}
}