| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| |
| @SuppressCodecs({ "SimpleText", "Direct" }) |
| public class TestLongPostings extends LuceneTestCase { |
| |
| // Produces a realistic unicode random string that |
| // survives MockAnalyzer unchanged: |
| private String getRandomTerm(String other) throws IOException { |
| Analyzer a = new MockAnalyzer(random()); |
| while(true) { |
| String s = TestUtil.randomRealisticUnicodeString(random()); |
| if (other != null && s.equals(other)) { |
| continue; |
| } |
| try (TokenStream ts = a.tokenStream("foo", s)) { |
| final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class); |
| ts.reset(); |
| |
| int count = 0; |
| boolean changed = false; |
| |
| while(ts.incrementToken()) { |
| final BytesRef termBytes = termAtt.getBytesRef(); |
| if (count == 0 && !termBytes.utf8ToString().equals(s)) { |
| // The value was changed during analysis. Keep iterating so the |
| // tokenStream is exhausted. |
| changed = true; |
| } |
| count++; |
| } |
| |
| ts.end(); |
| // Did we iterate just once and the value was unchanged? |
| if (!changed && count == 1) { |
| return s; |
| } |
| } |
| } |
| } |
| |
| public void testLongPostings() throws Exception { |
| // Don't use _TestUtil.getTempDir so that we own the |
| // randomness (ie same seed will point to same dir): |
| Directory dir = newFSDirectory(createTempDir("longpostings" + "." + random().nextLong())); |
| |
| final int NUM_DOCS = atLeast(1000); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: NUM_DOCS=" + NUM_DOCS); |
| } |
| |
| final String s1 = getRandomTerm(null); |
| final String s2 = getRandomTerm(s1); |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: s1=" + s1 + " s2=" + s2); |
| /* |
| for(int idx=0;idx<s1.length();idx++) { |
| System.out.println(" s1 ch=0x" + Integer.toHexString(s1.charAt(idx))); |
| } |
| for(int idx=0;idx<s2.length();idx++) { |
| System.out.println(" s2 ch=0x" + Integer.toHexString(s2.charAt(idx))); |
| } |
| */ |
| } |
| |
| final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS); |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| if (random().nextBoolean()) { |
| isS1.set(idx); |
| } |
| } |
| |
| final IndexReader r; |
| final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())) |
| .setOpenMode(IndexWriterConfig.OpenMode.CREATE) |
| .setMergePolicy(newLogMergePolicy()); |
| iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble()); |
| iwc.setMaxBufferedDocs(-1); |
| final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc); |
| |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| final Document doc = new Document(); |
| String s = isS1.get(idx) ? s1 : s2; |
| final Field f = newTextField("field", s, Field.Store.NO); |
| final int count = TestUtil.nextInt(random(), 1, 4); |
| for(int ct=0;ct<count;ct++) { |
| doc.add(f); |
| } |
| riw.addDocument(doc); |
| } |
| |
| r = riw.getReader(); |
| riw.close(); |
| |
| /* |
| if (VERBOSE) { |
| System.out.println("TEST: terms"); |
| TermEnum termEnum = r.terms(); |
| while(termEnum.next()) { |
| System.out.println(" term=" + termEnum.term() + " len=" + termEnum.term().text().length()); |
| assertTrue(termEnum.docFreq() > 0); |
| System.out.println(" s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length()); |
| System.out.println(" s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length()); |
| final String s = termEnum.term().text(); |
| for(int idx=0;idx<s.length();idx++) { |
| System.out.println(" ch=0x" + Integer.toHexString(s.charAt(idx))); |
| } |
| } |
| } |
| */ |
| |
| assertEquals(NUM_DOCS, r.numDocs()); |
| assertTrue(r.docFreq(new Term("field", s1)) > 0); |
| assertTrue(r.docFreq(new Term("field", s2)) > 0); |
| |
| int num = atLeast(1000); |
| for(int iter=0;iter<num;iter++) { |
| |
| final String term; |
| final boolean doS1; |
| if (random().nextBoolean()) { |
| term = s1; |
| doS1 = true; |
| } else { |
| term = s2; |
| doS1 = false; |
| } |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1); |
| } |
| |
| final PostingsEnum postings = MultiTerms.getTermPostingsEnum(r, "field", new BytesRef(term)); |
| |
| int docID = -1; |
| while(docID < DocIdSetIterator.NO_MORE_DOCS) { |
| final int what = random().nextInt(3); |
| if (what == 0) { |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do next()"); |
| } |
| // nextDoc |
| int expected = docID+1; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| docID = postings.nextDoc(); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| if (docID == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| |
| if (random().nextInt(6) == 3) { |
| if (VERBOSE) { |
| System.out.println(" check positions"); |
| } |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| for(int pos=0;pos<freq;pos++) { |
| assertEquals(pos, postings.nextPosition()); |
| if (random().nextBoolean()) { |
| postings.getPayload(); |
| if (random().nextBoolean()) { |
| postings.getPayload(); // get it again |
| } |
| } |
| } |
| } |
| } else { |
| // advance |
| final int targetDocID; |
| if (docID == -1) { |
| targetDocID = random().nextInt(NUM_DOCS+1); |
| } else { |
| targetDocID = docID + TestUtil.nextInt(random(), 1, NUM_DOCS - docID); |
| } |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do advance(" + targetDocID + ")"); |
| } |
| int expected = targetDocID; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| |
| docID = postings.advance(targetDocID); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| if (docID == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| |
| if (random().nextInt(6) == 3) { |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| for(int pos=0;pos<freq;pos++) { |
| assertEquals(pos, postings.nextPosition()); |
| if (random().nextBoolean()) { |
| postings.getPayload(); |
| if (random().nextBoolean()) { |
| postings.getPayload(); // get it again |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| r.close(); |
| dir.close(); |
| } |
| |
| // a weaker form of testLongPostings, that doesnt check positions |
| public void testLongPostingsNoPositions() throws Exception { |
| doTestLongPostingsNoPositions(IndexOptions.DOCS); |
| doTestLongPostingsNoPositions(IndexOptions.DOCS_AND_FREQS); |
| } |
| |
| public void doTestLongPostingsNoPositions(IndexOptions options) throws Exception { |
| // Don't use _TestUtil.getTempDir so that we own the |
| // randomness (ie same seed will point to same dir): |
| Directory dir = newFSDirectory(createTempDir("longpostings" + "." + random().nextLong())); |
| |
| final int NUM_DOCS = atLeast(1000); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: NUM_DOCS=" + NUM_DOCS); |
| } |
| |
| final String s1 = getRandomTerm(null); |
| final String s2 = getRandomTerm(s1); |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: s1=" + s1 + " s2=" + s2); |
| /* |
| for(int idx=0;idx<s1.length();idx++) { |
| System.out.println(" s1 ch=0x" + Integer.toHexString(s1.charAt(idx))); |
| } |
| for(int idx=0;idx<s2.length();idx++) { |
| System.out.println(" s2 ch=0x" + Integer.toHexString(s2.charAt(idx))); |
| } |
| */ |
| } |
| |
| final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS); |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| if (random().nextBoolean()) { |
| isS1.set(idx); |
| } |
| } |
| |
| final IndexReader r; |
| if (true) { |
| final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())) |
| .setOpenMode(IndexWriterConfig.OpenMode.CREATE) |
| .setMergePolicy(newLogMergePolicy()); |
| iwc.setRAMBufferSizeMB(16.0 + 16.0 * random().nextDouble()); |
| iwc.setMaxBufferedDocs(-1); |
| final RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc); |
| |
| FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); |
| ft.setIndexOptions(options); |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| final Document doc = new Document(); |
| String s = isS1.get(idx) ? s1 : s2; |
| final Field f = newField("field", s, ft); |
| final int count = TestUtil.nextInt(random(), 1, 4); |
| for(int ct=0;ct<count;ct++) { |
| doc.add(f); |
| } |
| riw.addDocument(doc); |
| } |
| |
| r = riw.getReader(); |
| riw.close(); |
| } else { |
| r = DirectoryReader.open(dir); |
| } |
| |
| /* |
| if (VERBOSE) { |
| System.out.println("TEST: terms"); |
| TermEnum termEnum = r.terms(); |
| while(termEnum.next()) { |
| System.out.println(" term=" + termEnum.term() + " len=" + termEnum.term().text().length()); |
| assertTrue(termEnum.docFreq() > 0); |
| System.out.println(" s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length()); |
| System.out.println(" s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length()); |
| final String s = termEnum.term().text(); |
| for(int idx=0;idx<s.length();idx++) { |
| System.out.println(" ch=0x" + Integer.toHexString(s.charAt(idx))); |
| } |
| } |
| } |
| */ |
| |
| assertEquals(NUM_DOCS, r.numDocs()); |
| assertTrue(r.docFreq(new Term("field", s1)) > 0); |
| assertTrue(r.docFreq(new Term("field", s2)) > 0); |
| |
| int num = atLeast(1000); |
| for(int iter=0;iter<num;iter++) { |
| |
| final String term; |
| final boolean doS1; |
| if (random().nextBoolean()) { |
| term = s1; |
| doS1 = true; |
| } else { |
| term = s2; |
| doS1 = false; |
| } |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1 + " term=" + term); |
| } |
| |
| final PostingsEnum docs; |
| final PostingsEnum postings; |
| |
| if (options == IndexOptions.DOCS) { |
| docs = TestUtil.docs(random(), r, "field", new BytesRef(term), null, PostingsEnum.NONE); |
| postings = null; |
| } else { |
| docs = postings = TestUtil.docs(random(), r, "field", new BytesRef(term), null, PostingsEnum.FREQS); |
| assert postings != null; |
| } |
| assert docs != null; |
| |
| int docID = -1; |
| while(docID < DocIdSetIterator.NO_MORE_DOCS) { |
| final int what = random().nextInt(3); |
| if (what == 0) { |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do next()"); |
| } |
| // nextDoc |
| int expected = docID+1; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| docID = docs.nextDoc(); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| if (docID == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| |
| if (random().nextInt(6) == 3 && postings != null) { |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| } |
| } else { |
| // advance |
| final int targetDocID; |
| if (docID == -1) { |
| targetDocID = random().nextInt(NUM_DOCS+1); |
| } else { |
| targetDocID = docID + TestUtil.nextInt(random(), 1, NUM_DOCS - docID); |
| } |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do advance(" + targetDocID + ")"); |
| } |
| int expected = targetDocID; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| |
| docID = docs.advance(targetDocID); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| if (docID == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| |
| if (random().nextInt(6) == 3 && postings != null) { |
| final int freq = postings.freq(); |
| assertTrue("got invalid freq=" + freq, freq >=1 && freq <= 4); |
| } |
| } |
| } |
| } |
| r.close(); |
| dir.close(); |
| } |
| } |