| package org.apache.lucene.index; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.io.StringReader; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.TermAttribute; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.FixedBitSet; |
| import org.apache.lucene.util._TestUtil; |
| |
| public class TestLongPostings extends LuceneTestCase { |
| |
| // Produces a realistic unicode random string that |
| // survives MockAnalyzer unchanged: |
| private String getRandomTerm(String other) throws IOException { |
| Analyzer a = new MockAnalyzer(random); |
| while(true) { |
| String s = _TestUtil.randomRealisticUnicodeString(random); |
| if (other != null && s.equals(other)) { |
| continue; |
| } |
| final TokenStream ts = a.tokenStream("foo", new StringReader(s)); |
| final TermAttribute termAtt = ts.getAttribute(TermAttribute.class); |
| int count = 0; |
| ts.reset(); |
| while(ts.incrementToken()) { |
| if (count == 0 && !termAtt.term().equals(s)) { |
| break; |
| } |
| count++; |
| } |
| if (count == 1) { |
| return s; |
| } |
| } |
| } |
| |
| public void testLongPostings() throws Exception { |
| // Don't use _TestUtil.getTempDir so that we own the |
| // randomness (ie same seed will point to same dir): |
| Directory dir = newFSDirectory(_TestUtil.getTempDir("longpostings" + "." + random.nextLong())); |
| |
| final int NUM_DOCS = atLeast(2000); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: NUM_DOCS=" + NUM_DOCS); |
| } |
| |
| final String s1 = getRandomTerm(null); |
| final String s2 = getRandomTerm(s1); |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: s1=" + s1 + " s2=" + s2); |
| /* |
| for(int idx=0;idx<s1.length();idx++) { |
| System.out.println(" s1 ch=0x" + Integer.toHexString(s1.charAt(idx))); |
| } |
| for(int idx=0;idx<s2.length();idx++) { |
| System.out.println(" s2 ch=0x" + Integer.toHexString(s2.charAt(idx))); |
| } |
| */ |
| } |
| |
| final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS); |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| if (random.nextBoolean()) { |
| isS1.set(idx); |
| } |
| } |
| |
| final IndexReader r; |
| if (true) { |
| final IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) |
| .setOpenMode(IndexWriterConfig.OpenMode.CREATE) |
| .setMergePolicy(newLogMergePolicy()); |
| iwc.setRAMBufferSizeMB(16.0 + 16.0 * random.nextDouble()); |
| iwc.setMaxBufferedDocs(-1); |
| final RandomIndexWriter riw = new RandomIndexWriter(random, dir, iwc); |
| |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| final Document doc = new Document(); |
| String s = isS1.get(idx) ? s1 : s2; |
| final Field f = newField("field", s, Field.Index.ANALYZED); |
| final int count = _TestUtil.nextInt(random, 1, 4); |
| for(int ct=0;ct<count;ct++) { |
| doc.add(f); |
| } |
| riw.addDocument(doc); |
| } |
| |
| r = riw.getReader(); |
| riw.close(); |
| } else { |
| r = IndexReader.open(dir); |
| } |
| |
| /* |
| if (VERBOSE) { |
| System.out.println("TEST: terms"); |
| TermEnum termEnum = r.terms(); |
| while(termEnum.next()) { |
| System.out.println(" term=" + termEnum.term() + " len=" + termEnum.term().text().length()); |
| assertTrue(termEnum.docFreq() > 0); |
| System.out.println(" s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length()); |
| System.out.println(" s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length()); |
| final String s = termEnum.term().text(); |
| for(int idx=0;idx<s.length();idx++) { |
| System.out.println(" ch=0x" + Integer.toHexString(s.charAt(idx))); |
| } |
| } |
| } |
| */ |
| |
| assertEquals(NUM_DOCS, r.numDocs()); |
| assertTrue(r.docFreq(new Term("field", s1)) > 0); |
| assertTrue(r.docFreq(new Term("field", s2)) > 0); |
| |
| final byte[] payload = new byte[100]; |
| |
| int num = atLeast(1000); |
| for(int iter=0;iter<num;iter++) { |
| |
| final String term; |
| final boolean doS1; |
| if (random.nextBoolean()) { |
| term = s1; |
| doS1 = true; |
| } else { |
| term = s2; |
| doS1 = false; |
| } |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1); |
| } |
| |
| final TermPositions postings = r.termPositions(new Term("field", term)); |
| |
| int docID = -1; |
| while(docID < Integer.MAX_VALUE) { |
| final int what = random.nextInt(3); |
| if (what == 0) { |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do next()"); |
| } |
| // nextDoc |
| int expected = docID+1; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| boolean result = postings.next(); |
| if (!result) { |
| assertEquals(Integer.MAX_VALUE, expected); |
| if (VERBOSE) { |
| System.out.println(" end"); |
| } |
| break; |
| } else { |
| docID = postings.doc(); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| |
| if (random.nextInt(6) == 3) { |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| for(int pos=0;pos<freq;pos++) { |
| assertEquals(pos, postings.nextPosition()); |
| if (random.nextBoolean() && postings.isPayloadAvailable()) { |
| postings.getPayload(payload, 0); |
| } |
| } |
| } |
| } |
| } else { |
| // advance |
| final int targetDocID; |
| if (docID == -1) { |
| targetDocID = random.nextInt(NUM_DOCS+1); |
| } else { |
| targetDocID = docID + _TestUtil.nextInt(random, 1, NUM_DOCS - docID); |
| } |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do skipTo(" + targetDocID + ")"); |
| } |
| int expected = targetDocID; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| |
| final boolean result = postings.skipTo(targetDocID); |
| if (!result) { |
| assertEquals(Integer.MAX_VALUE, expected); |
| if (VERBOSE) { |
| System.out.println(" end"); |
| } |
| break; |
| } else { |
| docID = postings.doc(); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| |
| if (random.nextInt(6) == 3) { |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| for(int pos=0;pos<freq;pos++) { |
| assertEquals(pos, postings.nextPosition()); |
| if (random.nextBoolean() && postings.isPayloadAvailable()) { |
| postings.getPayload(payload, 0); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| r.close(); |
| dir.close(); |
| } |
| |
| // a weaker form of testLongPostings, that doesnt check positions |
| public void testLongPostingsNoPositions() throws Exception { |
| doTestLongPostingsNoPositions(IndexOptions.DOCS_ONLY); |
| doTestLongPostingsNoPositions(IndexOptions.DOCS_AND_FREQS); |
| } |
| |
| public void doTestLongPostingsNoPositions(IndexOptions options) throws Exception { |
| // Don't use _TestUtil.getTempDir so that we own the |
| // randomness (ie same seed will point to same dir): |
| Directory dir = newFSDirectory(_TestUtil.getTempDir("longpostings" + "." + random.nextLong())); |
| |
| final int NUM_DOCS = atLeast(2000); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: NUM_DOCS=" + NUM_DOCS); |
| } |
| |
| final String s1 = getRandomTerm(null); |
| final String s2 = getRandomTerm(s1); |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: s1=" + s1 + " s2=" + s2); |
| /* |
| for(int idx=0;idx<s1.length();idx++) { |
| System.out.println(" s1 ch=0x" + Integer.toHexString(s1.charAt(idx))); |
| } |
| for(int idx=0;idx<s2.length();idx++) { |
| System.out.println(" s2 ch=0x" + Integer.toHexString(s2.charAt(idx))); |
| } |
| */ |
| } |
| |
| final FixedBitSet isS1 = new FixedBitSet(NUM_DOCS); |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| if (random.nextBoolean()) { |
| isS1.set(idx); |
| } |
| } |
| |
| final IndexReader r; |
| if (true) { |
| final IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)) |
| .setOpenMode(IndexWriterConfig.OpenMode.CREATE) |
| .setMergePolicy(newLogMergePolicy()); |
| iwc.setRAMBufferSizeMB(16.0 + 16.0 * random.nextDouble()); |
| iwc.setMaxBufferedDocs(-1); |
| final RandomIndexWriter riw = new RandomIndexWriter(random, dir, iwc); |
| |
| for(int idx=0;idx<NUM_DOCS;idx++) { |
| final Document doc = new Document(); |
| String s = isS1.get(idx) ? s1 : s2; |
| final Field f = newField("field", s, Field.Index.ANALYZED); |
| f.setIndexOptions(options); |
| final int count = _TestUtil.nextInt(random, 1, 4); |
| for(int ct=0;ct<count;ct++) { |
| doc.add(f); |
| } |
| riw.addDocument(doc); |
| } |
| |
| r = riw.getReader(); |
| riw.close(); |
| } else { |
| r = IndexReader.open(dir); |
| } |
| |
| /* |
| if (VERBOSE) { |
| System.out.println("TEST: terms"); |
| TermEnum termEnum = r.terms(); |
| while(termEnum.next()) { |
| System.out.println(" term=" + termEnum.term() + " len=" + termEnum.term().text().length()); |
| assertTrue(termEnum.docFreq() > 0); |
| System.out.println(" s1?=" + (termEnum.term().text().equals(s1)) + " s1len=" + s1.length()); |
| System.out.println(" s2?=" + (termEnum.term().text().equals(s2)) + " s2len=" + s2.length()); |
| final String s = termEnum.term().text(); |
| for(int idx=0;idx<s.length();idx++) { |
| System.out.println(" ch=0x" + Integer.toHexString(s.charAt(idx))); |
| } |
| } |
| } |
| */ |
| |
| assertEquals(NUM_DOCS, r.numDocs()); |
| assertTrue(r.docFreq(new Term("field", s1)) > 0); |
| assertTrue(r.docFreq(new Term("field", s2)) > 0); |
| |
| final byte[] payload = new byte[100]; |
| |
| int num = atLeast(1000); |
| for(int iter=0;iter<num;iter++) { |
| |
| final String term; |
| final boolean doS1; |
| if (random.nextBoolean()) { |
| term = s1; |
| doS1 = true; |
| } else { |
| term = s2; |
| doS1 = false; |
| } |
| |
| if (VERBOSE) { |
| System.out.println("\nTEST: iter=" + iter + " doS1=" + doS1); |
| } |
| |
| final TermDocs postings = r.termDocs(new Term("field", term)); |
| |
| int docID = -1; |
| while(docID < Integer.MAX_VALUE) { |
| final int what = random.nextInt(3); |
| if (what == 0) { |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do next()"); |
| } |
| // nextDoc |
| int expected = docID+1; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| boolean result = postings.next(); |
| if (!result) { |
| assertEquals(Integer.MAX_VALUE, expected); |
| if (VERBOSE) { |
| System.out.println(" end"); |
| } |
| break; |
| } else { |
| docID = postings.doc(); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| |
| if (random.nextInt(6) == 3) { |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| } |
| } |
| } else { |
| // advance |
| final int targetDocID; |
| if (docID == -1) { |
| targetDocID = random.nextInt(NUM_DOCS+1); |
| } else { |
| targetDocID = docID + _TestUtil.nextInt(random, 1, NUM_DOCS - docID); |
| } |
| if (VERBOSE) { |
| System.out.println("TEST: docID=" + docID + "; do skipTo(" + targetDocID + ")"); |
| } |
| int expected = targetDocID; |
| while(true) { |
| if (expected == NUM_DOCS) { |
| expected = Integer.MAX_VALUE; |
| break; |
| } else if (isS1.get(expected) == doS1) { |
| break; |
| } else { |
| expected++; |
| } |
| } |
| |
| final boolean result = postings.skipTo(targetDocID); |
| if (!result) { |
| assertEquals(Integer.MAX_VALUE, expected); |
| if (VERBOSE) { |
| System.out.println(" end"); |
| } |
| break; |
| } else { |
| docID = postings.doc(); |
| if (VERBOSE) { |
| System.out.println(" got docID=" + docID); |
| } |
| assertEquals(expected, docID); |
| |
| if (random.nextInt(6) == 3) { |
| final int freq = postings.freq(); |
| assertTrue(freq >=1 && freq <= 4); |
| } |
| } |
| } |
| } |
| } |
| r.close(); |
| dir.close(); |
| } |
| } |