blob: cc1f2be04b9c26f19c4959362c7bdc12df27aba5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Tests the {@link PostingsEnum} contract: position iteration, frequency
 * reporting, the interaction of {@code advance()} and {@code nextDoc()},
 * and enum reuse, over randomly built indexes.
 */
public class TestDocsAndPositions extends LuceneTestCase {
  // Randomized per test run so each run indexes under a fresh field name.
  private String fieldName;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    fieldName = "field" + random().nextInt();
  }

  /**
   * Simple testcase for {@link PostingsEnum}: indexes 39 documents, each
   * containing the tokens "1".."10" repeated four times, then verifies that
   * for term "1" every matching doc reports freq == 4 and positions
   * 0, 10, 20, 30 — starting iteration from a randomly chosen advance target.
   */
  public void testPositionsSimple() throws IOException {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        newIndexWriterConfig(new MockAnalyzer(random())));
    for (int i = 0; i < 39; i++) {
      Document doc = new Document();
      FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
      customType.setOmitNorms(true);
      // Four repetitions of "1 2 ... 10": term "1" lands at positions 0, 10, 20, 30.
      doc.add(newField(fieldName, "1 2 3 4 5 6 7 8 9 10 "
          + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 "
          + "1 2 3 4 5 6 7 8 9 10", customType));
      writer.addDocument(doc);
    }
    IndexReader reader = writer.getReader();
    writer.close();
    int num = atLeast(13);
    for (int i = 0; i < num; i++) {
      BytesRef bytes = new BytesRef("1");
      IndexReaderContext topReaderContext = reader.getContext();
      for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) {
        PostingsEnum docsAndPosEnum = getDocsAndPositions(
            leafReaderContext.reader(), bytes);
        assertNotNull(docsAndPosEnum);
        if (leafReaderContext.reader().maxDoc() == 0) {
          continue;
        }
        // Every doc in the segment contains the term, so advancing to any
        // in-range target must land on a match.
        final int advance = docsAndPosEnum.advance(random().nextInt(leafReaderContext.reader().maxDoc()));
        do {
          String msg = "Advanced to: " + advance + " current doc: "
              + docsAndPosEnum.docID(); // TODO: + " usePayloads: " + usePayload;
          // freq() must remain stable while positions are being consumed.
          assertEquals(msg, 4, docsAndPosEnum.freq());
          assertEquals(msg, 0, docsAndPosEnum.nextPosition());
          assertEquals(msg, 4, docsAndPosEnum.freq());
          assertEquals(msg, 10, docsAndPosEnum.nextPosition());
          assertEquals(msg, 4, docsAndPosEnum.freq());
          assertEquals(msg, 20, docsAndPosEnum.nextPosition());
          assertEquals(msg, 4, docsAndPosEnum.freq());
          assertEquals(msg, 30, docsAndPosEnum.nextPosition());
        } while (docsAndPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      }
    }
    reader.close();
    directory.close();
  }

  /**
   * Seeks to {@code bytes} in {@link #fieldName} on the given leaf reader and
   * returns a positions-capable {@link PostingsEnum} ({@link PostingsEnum#ALL}
   * requests positions, offsets and payloads).
   *
   * @return the postings enum, or {@code null} if the field or term is absent
   */
  public PostingsEnum getDocsAndPositions(LeafReader reader,
      BytesRef bytes) throws IOException {
    Terms terms = reader.terms(fieldName);
    if (terms != null) {
      TermsEnum te = terms.iterator();
      if (te.seekExact(bytes)) {
        return te.postings(null, PostingsEnum.ALL);
      }
    }
    return null;
  }

  /**
   * this test indexes random numbers within a range into a field and checks
   * their occurrences by searching for a number from that range selected at
   * random. All positions for that number are saved up front and compared to
   * the enums positions.
   */
  public void testRandomPositions() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random()))
            .setMergePolicy(newLogMergePolicy()));
    int numDocs = atLeast(47);
    int max = 1051;
    int term = random().nextInt(max);
    // positionsInDoc[docID] = expected positions of `term` in that doc.
    Integer[][] positionsInDoc = new Integer[numDocs][];
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setOmitNorms(true);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      ArrayList<Integer> positions = new ArrayList<>();
      StringBuilder builder = new StringBuilder();
      int num = atLeast(131);
      for (int j = 0; j < num; j++) {
        int nextInt = random().nextInt(max);
        builder.append(nextInt).append(" ");
        if (nextInt == term) {
          positions.add(Integer.valueOf(j));
        }
      }
      // Guarantee at least one occurrence per doc so every doc matches.
      if (positions.size() == 0) {
        builder.append(term);
        positions.add(num);
      }
      doc.add(newField(fieldName, builder.toString(), customType));
      positionsInDoc[i] = positions.toArray(new Integer[0]);
      writer.addDocument(doc);
    }
    IndexReader reader = writer.getReader();
    writer.close();
    int num = atLeast(13);
    for (int i = 0; i < num; i++) {
      BytesRef bytes = new BytesRef("" + term);
      IndexReaderContext topReaderContext = reader.getContext();
      for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) {
        PostingsEnum docsAndPosEnum = getDocsAndPositions(
            leafReaderContext.reader(), bytes);
        assertNotNull(docsAndPosEnum);
        int initDoc = 0;
        int maxDoc = leafReaderContext.reader().maxDoc();
        // initially advance or do next doc
        if (random().nextBoolean()) {
          initDoc = docsAndPosEnum.nextDoc();
        } else {
          initDoc = docsAndPosEnum.advance(random().nextInt(maxDoc));
        }
        // now run through the scorer and check if all positions are there...
        do {
          int docID = docsAndPosEnum.docID();
          if (docID == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // docBase converts the segment-local docID to the global doc index.
          Integer[] pos = positionsInDoc[leafReaderContext.docBase + docID];
          assertEquals(pos.length, docsAndPosEnum.freq());
          // number of positions read should be random - don't always read all
          // of them, to exercise partially-consumed enums
          final int howMany = random().nextInt(20) == 0 ? pos.length
              - random().nextInt(pos.length) : pos.length;
          for (int j = 0; j < howMany; j++) {
            assertEquals("iteration: " + i + " initDoc: " + initDoc + " doc: "
                + docID + " base: " + leafReaderContext.docBase
                + " positions: " + Arrays.toString(pos) /* TODO: + " usePayloads: "
                + usePayload*/, pos[j].intValue(), docsAndPosEnum.nextPosition());
          }
          // once in a while advance instead of stepping to the next doc
          if (random().nextInt(10) == 0) {
            if (docsAndPosEnum.advance(docID + 1 + random().nextInt((maxDoc - docID))) == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
          }
        } while (docsAndPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      }
    }
    reader.close();
    dir.close();
  }

  /**
   * Indexes random numbers and records how often the randomly chosen term
   * occurs per document, then checks that docs/freqs enumeration (including
   * occasional advance() calls) matches the recorded frequencies. Unlike
   * {@link #testRandomPositions()}, the term may be entirely absent from a
   * segment, in which case the postings enum must be null.
   */
  public void testRandomDocs() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random()))
            .setMergePolicy(newLogMergePolicy()));
    int numDocs = atLeast(49);
    int max = 15678;
    int term = random().nextInt(max);
    // freqInDoc[docID] = expected frequency of `term` in that doc (0 = no match).
    int[] freqInDoc = new int[numDocs];
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setOmitNorms(true);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      StringBuilder builder = new StringBuilder();
      for (int j = 0; j < 199; j++) {
        int nextInt = random().nextInt(max);
        builder.append(nextInt).append(' ');
        if (nextInt == term) {
          freqInDoc[i]++;
        }
      }
      doc.add(newField(fieldName, builder.toString(), customType));
      writer.addDocument(doc);
    }
    IndexReader reader = writer.getReader();
    writer.close();
    int num = atLeast(13);
    for (int i = 0; i < num; i++) {
      BytesRef bytes = new BytesRef("" + term);
      IndexReaderContext topReaderContext = reader.getContext();
      for (LeafReaderContext context : topReaderContext.leaves()) {
        int maxDoc = context.reader().maxDoc();
        PostingsEnum postingsEnum = TestUtil.docs(random(), context.reader(), fieldName, bytes, null, PostingsEnum.FREQS);
        // If no doc in this segment contains the term, the enum must be null.
        if (findNext(freqInDoc, context.docBase, context.docBase + maxDoc) == Integer.MAX_VALUE) {
          assertNull(postingsEnum);
          continue;
        }
        assertNotNull(postingsEnum);
        postingsEnum.nextDoc();
        for (int j = 0; j < maxDoc; j++) {
          if (freqInDoc[context.docBase + j] != 0) {
            assertEquals(j, postingsEnum.docID());
            assertEquals(postingsEnum.freq(), freqInDoc[context.docBase +j]);
            // occasionally advance() to the next matching doc instead of nextDoc()
            if (i % 2 == 0 && random().nextInt(10) == 0) {
              int next = findNext(freqInDoc, context.docBase+j+1, context.docBase + maxDoc) - context.docBase;
              int advancedTo = postingsEnum.advance(next);
              if (next >= maxDoc) {
                assertEquals(DocIdSetIterator.NO_MORE_DOCS, advancedTo);
              } else {
                // advance() may legally land before `next` only in the sense
                // that it must not overshoot the next known match
                assertTrue("advanced to: " +advancedTo + " but should be <= " + next, next >= advancedTo);
              }
            } else {
              postingsEnum.nextDoc();
            }
          }
        }
        // After the loop every matching doc has been consumed.
        assertEquals("docBase: " + context.docBase + " maxDoc: " + maxDoc + " " + postingsEnum.getClass(), DocIdSetIterator.NO_MORE_DOCS, postingsEnum.docID());
      }
    }
    reader.close();
    dir.close();
  }

  /**
   * Returns the first index in {@code [pos, max)} whose entry is non-zero,
   * i.e. the next doc (global index) containing the term, or
   * {@link Integer#MAX_VALUE} if there is none.
   */
  private static int findNext(int[] docs, int pos, int max) {
    for (int i = pos; i < max; i++) {
      if( docs[i] != 0) {
        return i;
      }
    }
    return Integer.MAX_VALUE;
  }

  /**
   * tests retrieval of positions for terms that have a large number of
   * occurrences to force test of buffer refill during positions iteration.
   */
  public void testLargeNumberOfPositions() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random())));
    int howMany = 1000;
    FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
    customType.setOmitNorms(true);
    for (int i = 0; i < 39; i++) {
      Document doc = new Document();
      StringBuilder builder = new StringBuilder();
      // "even" occupies every even position, "odd" every odd one, so "even"
      // has freq howMany/2 at positions 0, 2, 4, ...
      for (int j = 0; j < howMany; j++) {
        if (j % 2 == 0) {
          builder.append("even ");
        } else {
          builder.append("odd ");
        }
      }
      doc.add(newField(fieldName, builder.toString(), customType));
      writer.addDocument(doc);
    }
    // now do searches
    IndexReader reader = writer.getReader();
    writer.close();
    int num = atLeast(13);
    for (int i = 0; i < num; i++) {
      BytesRef bytes = new BytesRef("even");
      IndexReaderContext topReaderContext = reader.getContext();
      for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) {
        PostingsEnum docsAndPosEnum = getDocsAndPositions(
            leafReaderContext.reader(), bytes);
        assertNotNull(docsAndPosEnum);
        int initDoc = 0;
        int maxDoc = leafReaderContext.reader().maxDoc();
        // initially advance or do next doc
        if (random().nextBoolean()) {
          initDoc = docsAndPosEnum.nextDoc();
        } else {
          initDoc = docsAndPosEnum.advance(random().nextInt(maxDoc));
        }
        String msg = "Iteration: " + i + " initDoc: " + initDoc; // TODO: + " payloads: " + usePayload;
        assertEquals(howMany / 2, docsAndPosEnum.freq());
        // Reading all howMany/2 positions forces internal buffer refills.
        for (int j = 0; j < howMany; j += 2) {
          assertEquals("position missmatch index: " + j + " with freq: "
              + docsAndPosEnum.freq() + " -- " + msg, j,
              docsAndPosEnum.nextPosition());
        }
      }
    }
    reader.close();
    dir.close();
  }

  /**
   * Checks that a freshly obtained (and a reused) docs-only enum is
   * positioned at -1 before the first {@code nextDoc()} call.
   */
  public void testDocsEnumStart() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newStringField("foo", "bar", Field.Store.NO));
    writer.addDocument(doc);
    DirectoryReader reader = writer.getReader();
    LeafReader r = getOnlyLeafReader(reader);
    PostingsEnum disi = TestUtil.docs(random(), r, "foo", new BytesRef("bar"), null, PostingsEnum.NONE);
    // docID() must be -1 (unpositioned) before the first nextDoc().
    int docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // now reuse and check again
    TermsEnum te = r.terms("foo").iterator();
    assertTrue(te.seekExact(new BytesRef("bar")));
    disi = TestUtil.docs(random(), te, disi, PostingsEnum.NONE);
    docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    writer.close();
    r.close();
    dir.close();
  }

  /**
   * Same as {@link #testDocsEnumStart()} but for a positions-capable enum
   * ({@link PostingsEnum#ALL}), both freshly obtained and reused.
   */
  public void testDocsAndPositionsEnumStart() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("foo", "bar", Field.Store.NO));
    writer.addDocument(doc);
    DirectoryReader reader = writer.getReader();
    LeafReader r = getOnlyLeafReader(reader);
    PostingsEnum disi = r.postings(new Term("foo", "bar"), PostingsEnum.ALL);
    // docID() must be -1 (unpositioned) before the first nextDoc().
    int docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    // now reuse and check again
    TermsEnum te = r.terms("foo").iterator();
    assertTrue(te.seekExact(new BytesRef("bar")));
    disi = te.postings(disi, PostingsEnum.ALL);
    docid = disi.docID();
    assertEquals(-1, docid);
    assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    writer.close();
    r.close();
    dir.close();
  }
}