blob: dd50d8df95dc8177afeca9a3de841f9eb1d248b8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
@SuppressCodecs({ "SimpleText", "Direct" })
public class TestLongPostings extends LuceneTestCase {
// Produces a realistic unicode random string that
// survives MockAnalyzer unchanged:
private String getRandomTerm(String other) throws IOException {
Analyzer a = new MockAnalyzer(random());
while(true) {
String s = TestUtil.randomRealisticUnicodeString(random());
if (other != null && s.equals(other)) {
continue;
}
try (TokenStream ts = a.tokenStream("foo", s)) {
final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
ts.reset();
int count = 0;
boolean changed = false;
while(ts.incrementToken()) {
final BytesRef termBytes = termAtt.getBytesRef();
if (count == 0 && !termBytes.utf8ToString().equals(s)) {
// The value was changed during analysis. Keep iterating so the
// tokenStream is exhausted.
changed = true;
}
count++;
}
ts.end();
// Did we iterate just once and the value was unchanged?
if (!changed && count == 1) {
return s;
}
}
}
}
public void testLongPostings() throws Exception {
// Don't use _TestUtil.getTempDir so that we own the
// randomness (ie same seed will point to same dir):
Directory dir = newFSDirectory(createTempDir("longpostings" + "." + random().nextLong()));
final int NUM_DOCS = atLeast(1000);
if (VERBOSE) {
System.out.println("TEST: NUM_DOCS=" + NUM_DOCS);
}
final String s1 = getRandomTerm(null);
final String s2 = getRandomTerm(s1);
if (VERBOSE) {
System.out.println("\nTEST: s1=" + s1 + " s2=" + s2);
/*
for(int idx=0;idx<s1.length();idx++) {
System.out.println(" s1 ch=0x" + Integer.toHexString(s1.charAt(idx)));
}
for(int idx=0;idx<s2.length();idx++) {
System.out.println(" s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
}
*/
}
final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
for(int idx=0;idx<NUM_DOCS;idx++) {
if (random().nextBoolean()) {
isS1.set(idx);
}
}
final IndexReader r;
final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy());
iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble());
iwc.setMaxBufferedDocs(-1);
final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
for(int idx=0;idx<NUM_DOCS;idx++) {
final Document doc = new Document();
String s = isS1.get(idx) ? s1 : s2;
final Field f = newTextField("field", s, Field.Store.NO);
final int count = TestUtil.nextInt(random(), 1, 4);
for(int ct=0;ct<count;ct++) {
doc.add(f);
}
riw.addDocument(doc);
}
r = riw.getReader();
riw.close();
/*
if (VERBOSE) {
System.out.println("TEST: terms");
TermEnum termEnum = r.terms();
while(termEnum.next()) {
System.out.println(" term=" + termEnum.term() + " len=" + termEnum.term().text().length());
assertTrue(termEnum.docFreq() > 0);
System.out.println(" s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length());
System.out.println(" s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length());
final String s = termEnum.term().text();
for(int idx=0;idx<s.length();idx++) {
System.out.println(" ch=0x" + Integer.toHexString(s.charAt(idx)));
}
}
}
*/
assertEquals(NUM_DOCS, r.numDocs());
assertTrue(r.docFreq(new Term("field", s1)) > 0);
assertTrue(r.docFreq(new Term("field", s2)) > 0);
int num = atLeast(1000);
for(int iter=0;iter<num;iter++) {
final String term;
final boolean doS1;
if (random().nextBoolean()) {
term = s1;
doS1 = true;
} else {
term = s2;
doS1 = false;
}
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1);
}
final PostingsEnum postings = MultiTerms.getTermPostingsEnum(r, "field", new BytesRef(term));
int docID = -1;
while(docID < DocIdSetIterator.NO_MORE_DOCS) {
final int what = random().nextInt(3);
if (what == 0) {
if (VERBOSE) {
System.out.println("TEST: docID=" + docID + "; do next()");
}
// nextDoc
int expected = docID+1;
while(true) {
if (expected == NUM_DOCS) {
expected = Integer.MAX_VALUE;
break;
} else if (isS1.get(expected) == doS1) {
break;
} else {
expected++;
}
}
docID = postings.nextDoc();
if (VERBOSE) {
System.out.println(" got docID=" + docID);
}
assertEquals(expected, docID);
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (random().nextInt(6) == 3) {
if (VERBOSE) {
System.out.println(" check positions");
}
final int freq = postings.freq();
assertTrue(freq >=1 && freq <= 4);
for(int pos=0;pos<freq;pos++) {
assertEquals(pos, postings.nextPosition());
if (random().nextBoolean()) {
postings.getPayload();
if (random().nextBoolean()) {
postings.getPayload(); // get it again
}
}
}
}
} else {
// advance
final int targetDocID;
if (docID == -1) {
targetDocID = random().nextInt(NUM_DOCS+1);
} else {
targetDocID = docID + TestUtil.nextInt(random(), 1, NUM_DOCS - docID);
}
if (VERBOSE) {
System.out.println("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
}
int expected = targetDocID;
while(true) {
if (expected == NUM_DOCS) {
expected = Integer.MAX_VALUE;
break;
} else if (isS1.get(expected) == doS1) {
break;
} else {
expected++;
}
}
docID = postings.advance(targetDocID);
if (VERBOSE) {
System.out.println(" got docID=" + docID);
}
assertEquals(expected, docID);
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (random().nextInt(6) == 3) {
final int freq = postings.freq();
assertTrue(freq >=1 && freq <= 4);
for(int pos=0;pos<freq;pos++) {
assertEquals(pos, postings.nextPosition());
if (random().nextBoolean()) {
postings.getPayload();
if (random().nextBoolean()) {
postings.getPayload(); // get it again
}
}
}
}
}
}
}
r.close();
dir.close();
}
// a weaker form of testLongPostings, that doesnt check positions
public void testLongPostingsNoPositions() throws Exception {
doTestLongPostingsNoPositions(IndexOptions.DOCS);
doTestLongPostingsNoPositions(IndexOptions.DOCS_AND_FREQS);
}
public void doTestLongPostingsNoPositions(IndexOptions options) throws Exception {
// Don't use _TestUtil.getTempDir so that we own the
// randomness (ie same seed will point to same dir):
Directory dir = newFSDirectory(createTempDir("longpostings" + "." + random().nextLong()));
final int NUM_DOCS = atLeast(1000);
if (VERBOSE) {
System.out.println("TEST: NUM_DOCS=" + NUM_DOCS);
}
final String s1 = getRandomTerm(null);
final String s2 = getRandomTerm(s1);
if (VERBOSE) {
System.out.println("\nTEST: s1=" + s1 + " s2=" + s2);
/*
for(int idx=0;idx<s1.length();idx++) {
System.out.println(" s1 ch=0x" + Integer.toHexString(s1.charAt(idx)));
}
for(int idx=0;idx<s2.length();idx++) {
System.out.println(" s2 ch=0x" + Integer.toHexString(s2.charAt(idx)));
}
*/
}
final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS);
for(int idx=0;idx<NUM_DOCS;idx++) {
if (random().nextBoolean()) {
isS1.set(idx);
}
}
final IndexReader r;
if (true) {
final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy());
iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble());
iwc.setMaxBufferedDocs(-1);
final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setIndexOptions(options);
for(int idx=0;idx<NUM_DOCS;idx++) {
final Document doc = new Document();
String s = isS1.get(idx) ? s1 : s2;
final Field f = newField("field", s, ft);
final int count = TestUtil.nextInt(random(), 1, 4);
for(int ct=0;ct<count;ct++) {
doc.add(f);
}
riw.addDocument(doc);
}
r = riw.getReader();
riw.close();
} else {
r = DirectoryReader.open(dir);
}
/*
if (VERBOSE) {
System.out.println("TEST: terms");
TermEnum termEnum = r.terms();
while(termEnum.next()) {
System.out.println(" term=" + termEnum.term() + " len=" + termEnum.term().text().length());
assertTrue(termEnum.docFreq() > 0);
System.out.println(" s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length());
System.out.println(" s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length());
final String s = termEnum.term().text();
for(int idx=0;idx<s.length();idx++) {
System.out.println(" ch=0x" + Integer.toHexString(s.charAt(idx)));
}
}
}
*/
assertEquals(NUM_DOCS, r.numDocs());
assertTrue(r.docFreq(new Term("field", s1)) > 0);
assertTrue(r.docFreq(new Term("field", s2)) > 0);
int num = atLeast(1000);
for(int iter=0;iter<num;iter++) {
final String term;
final boolean doS1;
if (random().nextBoolean()) {
term = s1;
doS1 = true;
} else {
term = s2;
doS1 = false;
}
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1 + " term=" + term);
}
final PostingsEnum docs;
final PostingsEnum postings;
if (options == IndexOptions.DOCS) {
docs = TestUtil.docs(random(), r, "field", new BytesRef(term), null, PostingsEnum.NONE);
postings = null;
} else {
docs = postings = TestUtil.docs(random(), r, "field", new BytesRef(term), null, PostingsEnum.FREQS);
assert postings != null;
}
assert docs != null;
int docID = -1;
while(docID < DocIdSetIterator.NO_MORE_DOCS) {
final int what = random().nextInt(3);
if (what == 0) {
if (VERBOSE) {
System.out.println("TEST: docID=" + docID + "; do next()");
}
// nextDoc
int expected = docID+1;
while(true) {
if (expected == NUM_DOCS) {
expected = Integer.MAX_VALUE;
break;
} else if (isS1.get(expected) == doS1) {
break;
} else {
expected++;
}
}
docID = docs.nextDoc();
if (VERBOSE) {
System.out.println(" got docID=" + docID);
}
assertEquals(expected, docID);
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (random().nextInt(6) == 3 && postings != null) {
final int freq = postings.freq();
assertTrue(freq >=1 && freq <= 4);
}
} else {
// advance
final int targetDocID;
if (docID == -1) {
targetDocID = random().nextInt(NUM_DOCS+1);
} else {
targetDocID = docID + TestUtil.nextInt(random(), 1, NUM_DOCS - docID);
}
if (VERBOSE) {
System.out.println("TEST: docID=" + docID + "; do advance(" + targetDocID + ")");
}
int expected = targetDocID;
while(true) {
if (expected == NUM_DOCS) {
expected = Integer.MAX_VALUE;
break;
} else if (isS1.get(expected) == doS1) {
break;
} else {
expected++;
}
}
docID = docs.advance(targetDocID);
if (VERBOSE) {
System.out.println(" got docID=" + docID);
}
assertEquals(expected, docID);
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
if (random().nextInt(6) == 3 && postings != null) {
final int freq = postings.freq();
assertTrue("got invalid freq=" + freq, freq >=1 && freq <= 4);
}
}
}
}
r.close();
dir.close();
}
}