blob: 38456a588e4af61b16e77326434e86c2896a5d0e [file] [log] [blame]
package org.apache.lucene.index.sorter;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.index.sorter.SortingAtomicReader.SortingDocsAndPositionsEnum;
import org.apache.lucene.index.sorter.SortingAtomicReader.SortingDocsEnum;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public abstract class SorterTestBase extends LuceneTestCase {
/**
 * Similarity wrapper used to make norms predictable: for {@code NORMS_FIELD} the norm is
 * the raw int bits of the field boost, for every other field (and for all scoring calls)
 * the wrapped similarity is consulted.
 */
static final class NormsSimilarity extends Similarity {

  /** The similarity that handles everything except the norms of {@code NORMS_FIELD}. */
  private final Similarity delegate;

  public NormsSimilarity(Similarity in) {
    this.delegate = in;
  }

  @Override
  public long computeNorm(FieldInvertState state) {
    if (!state.getName().equals(NORMS_FIELD)) {
      return delegate.computeNorm(state);
    }
    // The boost's bit pattern is stored as the norm so it can be read back verbatim.
    return Float.floatToIntBits(state.getBoost());
  }

  @Override
  public SimWeight computeWeight(float queryBoost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return delegate.computeWeight(queryBoost, collectionStats, termStats);
  }

  @Override
  public SimScorer simScorer(SimWeight weight, AtomicReaderContext context) throws IOException {
    return delegate.simScorer(weight, context);
  }
}
/**
 * Reusable token stream that emits the term {@code DOC_POSITIONS_TERM} a configurable
 * number of times.
 *
 * <p>After {@link #setId(int)} the stream produces {@code id / 10 + 1} tokens. Each token
 * carries a payload holding the number of tokens still to be emitted (including itself) as
 * a decimal string, and a zero-length offset that starts at 0 and grows by one per token.
 */
static final class PositionsTokenStream extends TokenStream {
  private final CharTermAttribute term;
  private final PayloadAttribute payload;
  private final OffsetAttribute offset;
  /** Number of tokens left to emit. */
  private int pos;
  /** Offset (used as both start and end) of the next token. */
  private int off;

  public PositionsTokenStream() {
    term = addAttribute(CharTermAttribute.class);
    payload = addAttribute(PayloadAttribute.class);
    offset = addAttribute(OffsetAttribute.class);
  }

  /**
   * Emits the next token, or returns {@code false} once the configured number of tokens
   * has been produced.
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (pos == 0) {
      return false;
    }
    // Producers must reset all attributes before populating a token. The term text is
    // therefore (re)written for every token instead of once in the constructor, so the
    // stream stays correct even if its attributes are cleared between documents.
    clearAttributes();
    term.append(DOC_POSITIONS_TERM);
    payload.setPayload(new BytesRef(Integer.toString(pos)));
    offset.setOffset(off, off);
    --pos;
    ++off;
    return true;
  }

  /**
   * Re-arms the stream for the document with the given ID.
   *
   * @param id document ID; the stream will emit {@code id / 10 + 1} tokens
   */
  void setId(int id) {
    pos = id / 10 + 1;
    off = 0;
  }
}
/** Stored string field holding the document's ID as a decimal string. */
protected static final String ID_FIELD = "id";
/** String field that contains {@link #DOCS_ENUM_TERM} in every document. */
protected static final String DOCS_ENUM_FIELD = "docs";
/** The single term indexed into {@link #DOCS_ENUM_FIELD}. */
protected static final String DOCS_ENUM_TERM = "$all$";
/** Field fed by a {@link PositionsTokenStream}; carries positions, payloads and (if supported) offsets. */
protected static final String DOC_POSITIONS_FIELD = "positions";
/** The term emitted by {@link PositionsTokenStream} for {@link #DOC_POSITIONS_FIELD}. */
protected static final String DOC_POSITIONS_TERM = "$all$";
/** Numeric doc-values field whose value is the document ID. */
protected static final String NUMERIC_DV_FIELD = "numeric";
/** Text field whose boost encodes the document ID; {@link NormsSimilarity} turns that boost into the norm. */
protected static final String NORMS_FIELD = "norm";
/** Binary doc-values field holding the document ID as a decimal string. */
protected static final String BINARY_DV_FIELD = "binary";
/** Sorted doc-values field holding the document ID as a decimal string. */
protected static final String SORTED_DV_FIELD = "sorted";
/** Sorted-set doc-values field holding {@code id} and {@code id + 1}; only added when the default codec supports it. */
protected static final String SORTED_SET_DV_FIELD = "sorted_set";
/** Field indexed with term vectors; its text is the document ID. */
protected static final String TERM_VECTORS_FIELD = "term_vectors";
// Not-stored text type with term vectors enabled, frozen so it cannot be modified afterwards.
private static final FieldType TERM_VECTORS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
static {
TERM_VECTORS_TYPE.setStoreTermVectors(true);
TERM_VECTORS_TYPE.freeze();
}
// Not-stored text type that additionally indexes offsets; used when the postings format supports them.
private static final FieldType POSITIONS_TYPE = new FieldType(TextField.TYPE_NOT_STORED);
static {
POSITIONS_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
POSITIONS_TYPE.freeze();
}
/** Directory holding the test index; created in {@code beforeClassSorterTestBase} and closed in {@code afterClassSorterTestBase}. */
protected static Directory dir;
/** Reader the tests run against; opened in {@code beforeClassSorterTestBase} and closed in {@code afterClassSorterTestBase}. */
protected static AtomicReader reader;
/**
 * Expected document ID for each doc of {@link #reader}, indexed by doc number.
 * NOTE(review): never assigned in this class -- presumably populated by subclasses; confirm.
 */
protected static Integer[] sortedValues;
/**
 * Builds the test document for {@code id}, adding one field per feature the tests inspect
 * (postings, positions, doc values, norms, term vectors).
 *
 * @param id the document ID; all field values are derived from it
 * @param positions shared token stream, re-armed for {@code id} and used for the positions field
 */
private static Document doc(final int id, PositionsTokenStream positions) {
  final String idText = Integer.toString(id);
  final Document document = new Document();

  document.add(new StringField(ID_FIELD, idText, Store.YES));
  document.add(new StringField(DOCS_ENUM_FIELD, DOCS_ENUM_TERM, Store.NO));

  positions.setId(id);
  // If the codec cannot store offsets, fall back to a type that only indexes positions.
  final boolean offsetsUnsupported = doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD));
  final FieldType positionsType = offsetsUnsupported ? TextField.TYPE_NOT_STORED : POSITIONS_TYPE;
  document.add(new Field(DOC_POSITIONS_FIELD, positions, positionsType));

  document.add(new NumericDocValuesField(NUMERIC_DV_FIELD, id));

  // The ID is smuggled into the norm through the boost's bit pattern.
  final TextField normsField = new TextField(NORMS_FIELD, idText, Store.NO);
  normsField.setBoost(Float.intBitsToFloat(id));
  document.add(normsField);

  document.add(new BinaryDocValuesField(BINARY_DV_FIELD, new BytesRef(idText)));
  document.add(new SortedDocValuesField(SORTED_DV_FIELD, new BytesRef(idText)));
  if (defaultCodecSupportsSortedSet()) {
    final String nextIdText = Integer.toString(id + 1);
    document.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(idText)));
    document.add(new SortedSetDocValuesField(SORTED_SET_DV_FIELD, new BytesRef(nextIdText)));
  }

  document.add(new Field(TERM_VECTORS_FIELD, idText, TERM_VECTORS_TYPE));
  return document;
}
/**
 * Creates an index for sorting.
 *
 * <p>Documents get the IDs {@code 0, 10, 20, ...} and are added in shuffled order, with a
 * small buffer so that several segments are produced. After a commit, each document is
 * deleted with a probability of 0.2.
 *
 * @param dir directory to write the index to
 * @param numDocs number of documents to add
 * @param random source of randomness for shuffling, indexing and deletions
 */
public static void createIndex(Directory dir, int numDocs, Random random) throws IOException {
  final List<Integer> shuffledIds = new ArrayList<Integer>();
  for (int n = 0; n < numDocs; n++) {
    shuffledIds.add(Integer.valueOf(n * 10));
  }
  // index in random order so that sorting actually has something to do
  Collections.shuffle(shuffledIds, random);
  if (VERBOSE) {
    System.out.println("Shuffled IDs for indexing: " + Arrays.toString(shuffledIds.toArray()));
  }

  final PositionsTokenStream positions = new PositionsTokenStream();
  final MockAnalyzer analyzer = new MockAnalyzer(random);
  final IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
  config.setMaxBufferedDocs(4); // create some segments
  config.setSimilarity(new NormsSimilarity(config.getSimilarity())); // for testing norms field

  final RandomIndexWriter indexWriter = new RandomIndexWriter(random, dir, config);
  indexWriter.setDoRandomForceMerge(false);
  for (Integer id : shuffledIds) {
    indexWriter.addDocument(doc(id.intValue(), positions));
  }
  indexWriter.commit();

  // delete some documents
  for (Integer id : shuffledIds) {
    final boolean delete = random.nextDouble() < 0.2;
    if (!delete) {
      continue;
    }
    if (VERBOSE) {
      System.out.println("delete doc_id " + id);
    }
    indexWriter.deleteDocuments(new Term(ID_FIELD, id.toString()));
  }
  indexWriter.close();
}
/** Builds the shared index (at least 20 documents) and opens a single atomic view over it. */
@BeforeClass
public static void beforeClassSorterTestBase() throws Exception {
  dir = newDirectory();
  createIndex(dir, atLeast(20), random());
  final DirectoryReader directoryReader = DirectoryReader.open(dir);
  reader = SlowCompositeReaderWrapper.wrap(directoryReader);
}
/**
 * Releases the shared reader and directory.
 *
 * <p>This also runs when class setup failed part-way, so either field may still be
 * {@code null}. The directory is closed even if closing the reader throws, and the static
 * references are always cleared so the suite does not keep the index reachable.
 */
@AfterClass
public static void afterClassSorterTestBase() throws Exception {
  try {
    if (reader != null) {
      reader.close();
    }
  } finally {
    try {
      if (dir != null) {
        dir.close();
      }
    } finally {
      reader = null;
      dir = null;
      sortedValues = null;
    }
  }
}
/** Checks that every document's binary doc value equals its expected ID rendered as a string. */
@Test
public void testBinaryDocValuesField() throws Exception {
  final BinaryDocValues values = reader.getBinaryDocValues(BINARY_DV_FIELD);
  final BytesRef scratch = new BytesRef();
  final int maxDoc = reader.maxDoc();
  for (int docID = 0; docID < maxDoc; docID++) {
    values.get(docID, scratch);
    final String expected = sortedValues[docID].toString();
    final String actual = scratch.utf8ToString();
    assertEquals("incorrect binary DocValues for doc " + docID, expected, actual);
  }
}
/**
 * Walks the postings of {@code DOC_POSITIONS_TERM} twice -- once with {@code nextDoc()},
 * once with random {@code advance()} steps on a reused enum -- and verifies freq,
 * positions, offsets (when supported) and payloads of every visited document.
 */
@Test
public void testDocsAndPositionsEnum() throws Exception {
  final TermsEnum termsEnum = reader.terms(DOC_POSITIONS_FIELD).iterator(null);
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOC_POSITIONS_TERM)));

  // test nextDoc()
  final DocsAndPositionsEnum sequential = termsEnum.docsAndPositions(null, null);
  for (int doc = sequential.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = sequential.nextDoc()) {
    checkCurrentDocPositions(sequential, doc);
  }

  // test advance()
  final DocsAndPositionsEnum skipping = termsEnum.docsAndPositions(null, sequential);
  if (skipping instanceof SortingDocsAndPositionsEnum) {
    assertTrue(((SortingDocsAndPositionsEnum) skipping).reused(sequential)); // make sure reuse worked
  }
  int doc = 0;
  while (true) {
    doc = skipping.advance(doc + _TestUtil.nextInt(random(), 1, 5));
    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
    checkCurrentDocPositions(skipping, doc);
  }
}

/** Verifies freq, positions, offsets (when supported) and payloads of the document {@code postings} is positioned on. */
private void checkCurrentDocPositions(DocsAndPositionsEnum postings, int doc) throws IOException {
  final int freq = postings.freq();
  assertEquals("incorrect freq for doc=" + doc, sortedValues[doc].intValue() / 10 + 1, freq);
  for (int i = 0; i < freq; i++) {
    assertEquals("incorrect position for doc=" + doc, i, postings.nextPosition());
    if (!doesntSupportOffsets.contains(_TestUtil.getPostingsFormat(DOC_POSITIONS_FIELD))) {
      assertEquals("incorrect startOffset for doc=" + doc, i, postings.startOffset());
      assertEquals("incorrect endOffset for doc=" + doc, i, postings.endOffset());
    }
    // payloads count down from freq to 1
    assertEquals("incorrect payload for doc=" + doc, freq - i, Integer.parseInt(postings.getPayload().utf8ToString()));
  }
}
/**
 * Produces a random live-docs filter for {@code maxDoc} documents.
 *
 * <p>Rarely this is {@code null} or a filter matching no document; otherwise it is a bit
 * set with between 1 and {@code maxDoc - 1} randomly chosen bits set.
 */
Bits randomLiveDocs(int maxDoc) {
  if (rarely()) {
    return random().nextBoolean() ? null : new Bits.MatchNoBits(maxDoc);
  }
  final FixedBitSet live = new FixedBitSet(maxDoc);
  final int toSet = _TestUtil.nextInt(random(), 1, maxDoc - 1);
  for (int n = 0; n < toSet; ++n) {
    // draw until we hit a bit that is still clear
    int candidate;
    do {
      candidate = random().nextInt(maxDoc);
    } while (live.get(candidate));
    live.set(candidate);
  }
  return live;
}
/**
 * Iterates the postings of {@code DOCS_ENUM_TERM} under a random live-docs filter, first
 * with {@code nextDoc()} and then with {@code advance()} on a reused enum. Every returned
 * document must be live and carry the expected ID; every skipped document must not be live.
 */
@Test
public void testDocsEnum() throws Exception {
  final Bits mappedLiveDocs = randomLiveDocs(reader.maxDoc());
  final TermsEnum termsEnum = reader.terms(DOCS_ENUM_FIELD).iterator(null);
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef(DOCS_ENUM_TERM)));

  final DocsEnum sequential = termsEnum.docs(mappedLiveDocs, null);
  int prev = -1;
  for (int doc = sequential.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = sequential.nextDoc()) {
    checkReturnedDoc(mappedLiveDocs, doc);
    while (++prev < doc) {
      checkSkippedDoc(mappedLiveDocs, prev);
    }
  }
  while (++prev < reader.maxDoc()) {
    checkSkippedDoc(mappedLiveDocs, prev);
  }

  final DocsEnum skipping = termsEnum.docs(mappedLiveDocs, sequential);
  if (skipping instanceof SortingDocsEnum) {
    assertTrue(((SortingDocsEnum) skipping).reused(sequential)); // make sure reuse worked
  }
  prev = -1;
  for (int doc = skipping.advance(0); doc != DocIdSetIterator.NO_MORE_DOCS; doc = skipping.advance(doc + 1)) {
    checkReturnedDoc(mappedLiveDocs, doc);
    while (++prev < doc) {
      checkSkippedDoc(mappedLiveDocs, prev);
    }
  }
  while (++prev < reader.maxDoc()) {
    checkSkippedDoc(mappedLiveDocs, prev);
  }
}

/** Asserts that a document returned by the enum is live and stores the expected ID. */
private void checkReturnedDoc(Bits liveDocs, int doc) throws IOException {
  assertTrue("document " + doc + " marked as deleted", liveDocs == null || liveDocs.get(doc));
  assertEquals("incorrect value; doc " + doc, sortedValues[doc].intValue(), Integer.parseInt(reader.document(doc).get(ID_FIELD)));
}

/** Asserts that a document the enum skipped over is not live. */
private void checkSkippedDoc(Bits liveDocs, int doc) {
  assertFalse("document " + doc + " not marked as deleted", liveDocs == null || liveDocs.get(doc));
}
/** Checks that the norm of {@code NORMS_FIELD} equals the expected ID for every document. */
@Test
public void testNormValues() throws Exception {
  final NumericDocValues norms = reader.getNormValues(NORMS_FIELD);
  final int docCount = reader.maxDoc();
  for (int docID = 0; docID < docCount; docID++) {
    final int expected = sortedValues[docID].intValue();
    assertEquals("incorrect norm value for doc " + docID, expected, norms.get(docID));
  }
}
/** Checks that the numeric doc value equals the expected ID for every document. */
@Test
public void testNumericDocValuesField() throws Exception {
  final NumericDocValues values = reader.getNumericDocValues(NUMERIC_DV_FIELD);
  final int docCount = reader.maxDoc();
  for (int docID = 0; docID < docCount; docID++) {
    final int expected = sortedValues[docID].intValue();
    assertEquals("incorrect numeric DocValues for doc " + docID, expected, values.get(docID));
  }
}
/** Checks that the sorted doc value equals the expected ID (as a string) for every document. */
@Test
public void testSortedDocValuesField() throws Exception {
  final SortedDocValues values = reader.getSortedDocValues(SORTED_DV_FIELD);
  final int docCount = reader.maxDoc();
  final BytesRef scratch = new BytesRef();
  for (int docID = 0; docID < docCount; docID++) {
    values.get(docID, scratch);
    final String expected = sortedValues[docID].toString();
    final String actual = scratch.utf8ToString();
    assertEquals("incorrect sorted DocValues for doc " + docID, expected, actual);
  }
}
/**
 * Checks that every document has exactly two sorted-set values, {@code id} followed by
 * {@code id + 1}. Skipped when the default codec does not support sorted-set doc values.
 */
@Test
public void testSortedSetDocValuesField() throws Exception {
  assumeTrue("default codec does not support SORTED_SET", defaultCodecSupportsSortedSet());
  final SortedSetDocValues values = reader.getSortedSetDocValues(SORTED_SET_DV_FIELD);
  final int docCount = reader.maxDoc();
  final BytesRef scratch = new BytesRef();
  for (int docID = 0; docID < docCount; docID++) {
    values.setDocument(docID);

    final long firstOrd = values.nextOrd();
    values.lookupOrd(firstOrd, scratch);
    final int id = sortedValues[docID].intValue();
    assertEquals("incorrect sorted-set DocValues for doc " + docID, Integer.toString(id), scratch.utf8ToString());

    final long secondOrd = values.nextOrd();
    values.lookupOrd(secondOrd, scratch);
    assertEquals("incorrect sorted-set DocValues for doc " + docID, Integer.toString(id + 1), scratch.utf8ToString());

    // no third value expected
    assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
  }
}
/** Checks that every document has a term vector whose first term is the expected ID. */
@Test
public void testTermVectors() throws Exception {
  final int docCount = reader.maxDoc();
  for (int docID = 0; docID < docCount; docID++) {
    final Terms vector = reader.getTermVector(docID, TERM_VECTORS_FIELD);
    assertNotNull("term vectors not found for doc " + docID + " field [" + TERM_VECTORS_FIELD + "]", vector);
    final String firstTerm = vector.iterator(null).next().utf8ToString();
    assertEquals("incorrect term vector for doc " + docID, sortedValues[docID].toString(), firstTerm);
  }
}
}