/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.index; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| |
| public class TestDocsAndPositions extends LuceneTestCase { |
| private String fieldName; |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| fieldName = "field" + random().nextInt(); |
| } |
| |
| /** |
| * Simple testcase for {@link PostingsEnum} |
| */ |
| public void testPositionsSimple() throws IOException { |
| Directory directory = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), directory, |
| newIndexWriterConfig(new MockAnalyzer(random()))); |
| for (int i = 0; i < 39; i++) { |
| Document doc = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setOmitNorms(true); |
| doc.add(newField(fieldName, "1 2 3 4 5 6 7 8 9 10 " |
| + "1 2 3 4 5 6 7 8 9 10 " + "1 2 3 4 5 6 7 8 9 10 " |
| + "1 2 3 4 5 6 7 8 9 10", customType)); |
| writer.addDocument(doc); |
| } |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| |
| int num = atLeast(13); |
| for (int i = 0; i < num; i++) { |
| BytesRef bytes = new BytesRef("1"); |
| IndexReaderContext topReaderContext = reader.getContext(); |
| for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) { |
| PostingsEnum docsAndPosEnum = getDocsAndPositions( |
| leafReaderContext.reader(), bytes); |
| assertNotNull(docsAndPosEnum); |
| if (leafReaderContext.reader().maxDoc() == 0) { |
| continue; |
| } |
| final int advance = docsAndPosEnum.advance(random().nextInt(leafReaderContext.reader().maxDoc())); |
| do { |
| String msg = "Advanced to: " + advance + " current doc: " |
| + docsAndPosEnum.docID(); // TODO: + " usePayloads: " + usePayload; |
| assertEquals(msg, 4, docsAndPosEnum.freq()); |
| assertEquals(msg, 0, docsAndPosEnum.nextPosition()); |
| assertEquals(msg, 4, docsAndPosEnum.freq()); |
| assertEquals(msg, 10, docsAndPosEnum.nextPosition()); |
| assertEquals(msg, 4, docsAndPosEnum.freq()); |
| assertEquals(msg, 20, docsAndPosEnum.nextPosition()); |
| assertEquals(msg, 4, docsAndPosEnum.freq()); |
| assertEquals(msg, 30, docsAndPosEnum.nextPosition()); |
| } while (docsAndPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| } |
| } |
| reader.close(); |
| directory.close(); |
| } |
| |
| public PostingsEnum getDocsAndPositions(LeafReader reader, |
| BytesRef bytes) throws IOException { |
| Terms terms = reader.terms(fieldName); |
| if (terms != null) { |
| TermsEnum te = terms.iterator(); |
| if (te.seekExact(bytes)) { |
| return te.postings(null, PostingsEnum.ALL); |
| } |
| } |
| return null; |
| } |
| |
| /** |
| * this test indexes random numbers within a range into a field and checks |
| * their occurrences by searching for a number from that range selected at |
| * random. All positions for that number are saved up front and compared to |
| * the enums positions. |
| */ |
| public void testRandomPositions() throws IOException { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, |
| newIndexWriterConfig(new MockAnalyzer(random())) |
| .setMergePolicy(newLogMergePolicy())); |
| int numDocs = atLeast(47); |
| int max = 1051; |
| int term = random().nextInt(max); |
| Integer[][] positionsInDoc = new Integer[numDocs][]; |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setOmitNorms(true); |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| ArrayList<Integer> positions = new ArrayList<>(); |
| StringBuilder builder = new StringBuilder(); |
| int num = atLeast(131); |
| for (int j = 0; j < num; j++) { |
| int nextInt = random().nextInt(max); |
| builder.append(nextInt).append(" "); |
| if (nextInt == term) { |
| positions.add(Integer.valueOf(j)); |
| } |
| } |
| if (positions.size() == 0) { |
| builder.append(term); |
| positions.add(num); |
| } |
| doc.add(newField(fieldName, builder.toString(), customType)); |
| positionsInDoc[i] = positions.toArray(new Integer[0]); |
| writer.addDocument(doc); |
| } |
| |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| |
| int num = atLeast(13); |
| for (int i = 0; i < num; i++) { |
| BytesRef bytes = new BytesRef("" + term); |
| IndexReaderContext topReaderContext = reader.getContext(); |
| for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) { |
| PostingsEnum docsAndPosEnum = getDocsAndPositions( |
| leafReaderContext.reader(), bytes); |
| assertNotNull(docsAndPosEnum); |
| int initDoc = 0; |
| int maxDoc = leafReaderContext.reader().maxDoc(); |
| // initially advance or do next doc |
| if (random().nextBoolean()) { |
| initDoc = docsAndPosEnum.nextDoc(); |
| } else { |
| initDoc = docsAndPosEnum.advance(random().nextInt(maxDoc)); |
| } |
| // now run through the scorer and check if all positions are there... |
| do { |
| int docID = docsAndPosEnum.docID(); |
| if (docID == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| Integer[] pos = positionsInDoc[leafReaderContext.docBase + docID]; |
| assertEquals(pos.length, docsAndPosEnum.freq()); |
| // number of positions read should be random - don't read all of them |
| // allways |
| final int howMany = random().nextInt(20) == 0 ? pos.length |
| - random().nextInt(pos.length) : pos.length; |
| for (int j = 0; j < howMany; j++) { |
| assertEquals("iteration: " + i + " initDoc: " + initDoc + " doc: " |
| + docID + " base: " + leafReaderContext.docBase |
| + " positions: " + Arrays.toString(pos) /* TODO: + " usePayloads: " |
| + usePayload*/, pos[j].intValue(), docsAndPosEnum.nextPosition()); |
| } |
| |
| if (random().nextInt(10) == 0) { // once is a while advance |
| if (docsAndPosEnum.advance(docID + 1 + random().nextInt((maxDoc - docID))) == DocIdSetIterator.NO_MORE_DOCS) { |
| break; |
| } |
| } |
| |
| } while (docsAndPosEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| } |
| |
| } |
| reader.close(); |
| dir.close(); |
| } |
| |
| public void testRandomDocs() throws IOException { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, |
| newIndexWriterConfig(new MockAnalyzer(random())) |
| .setMergePolicy(newLogMergePolicy())); |
| int numDocs = atLeast(49); |
| int max = 15678; |
| int term = random().nextInt(max); |
| int[] freqInDoc = new int[numDocs]; |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setOmitNorms(true); |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| StringBuilder builder = new StringBuilder(); |
| for (int j = 0; j < 199; j++) { |
| int nextInt = random().nextInt(max); |
| builder.append(nextInt).append(' '); |
| if (nextInt == term) { |
| freqInDoc[i]++; |
| } |
| } |
| doc.add(newField(fieldName, builder.toString(), customType)); |
| writer.addDocument(doc); |
| } |
| |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| |
| int num = atLeast(13); |
| for (int i = 0; i < num; i++) { |
| BytesRef bytes = new BytesRef("" + term); |
| IndexReaderContext topReaderContext = reader.getContext(); |
| for (LeafReaderContext context : topReaderContext.leaves()) { |
| int maxDoc = context.reader().maxDoc(); |
| PostingsEnum postingsEnum = TestUtil.docs(random(), context.reader(), fieldName, bytes, null, PostingsEnum.FREQS); |
| if (findNext(freqInDoc, context.docBase, context.docBase + maxDoc) == Integer.MAX_VALUE) { |
| assertNull(postingsEnum); |
| continue; |
| } |
| assertNotNull(postingsEnum); |
| postingsEnum.nextDoc(); |
| for (int j = 0; j < maxDoc; j++) { |
| if (freqInDoc[context.docBase + j] != 0) { |
| assertEquals(j, postingsEnum.docID()); |
| assertEquals(postingsEnum.freq(), freqInDoc[context.docBase +j]); |
| if (i % 2 == 0 && random().nextInt(10) == 0) { |
| int next = findNext(freqInDoc, context.docBase+j+1, context.docBase + maxDoc) - context.docBase; |
| int advancedTo = postingsEnum.advance(next); |
| if (next >= maxDoc) { |
| assertEquals(DocIdSetIterator.NO_MORE_DOCS, advancedTo); |
| } else { |
| assertTrue("advanced to: " +advancedTo + " but should be <= " + next, next >= advancedTo); |
| } |
| } else { |
| postingsEnum.nextDoc(); |
| } |
| } |
| } |
| assertEquals("docBase: " + context.docBase + " maxDoc: " + maxDoc + " " + postingsEnum.getClass(), DocIdSetIterator.NO_MORE_DOCS, postingsEnum.docID()); |
| } |
| |
| } |
| |
| reader.close(); |
| dir.close(); |
| } |
| |
| private static int findNext(int[] docs, int pos, int max) { |
| for (int i = pos; i < max; i++) { |
| if( docs[i] != 0) { |
| return i; |
| } |
| } |
| return Integer.MAX_VALUE; |
| } |
| |
| /** |
| * tests retrieval of positions for terms that have a large number of |
| * occurrences to force test of buffer refill during positions iteration. |
| */ |
| public void testLargeNumberOfPositions() throws IOException { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, |
| newIndexWriterConfig(new MockAnalyzer(random()))); |
| int howMany = 1000; |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setOmitNorms(true); |
| for (int i = 0; i < 39; i++) { |
| Document doc = new Document(); |
| StringBuilder builder = new StringBuilder(); |
| for (int j = 0; j < howMany; j++) { |
| if (j % 2 == 0) { |
| builder.append("even "); |
| } else { |
| builder.append("odd "); |
| } |
| } |
| doc.add(newField(fieldName, builder.toString(), customType)); |
| writer.addDocument(doc); |
| } |
| |
| // now do searches |
| IndexReader reader = writer.getReader(); |
| writer.close(); |
| |
| int num = atLeast(13); |
| for (int i = 0; i < num; i++) { |
| BytesRef bytes = new BytesRef("even"); |
| |
| IndexReaderContext topReaderContext = reader.getContext(); |
| for (LeafReaderContext leafReaderContext : topReaderContext.leaves()) { |
| PostingsEnum docsAndPosEnum = getDocsAndPositions( |
| leafReaderContext.reader(), bytes); |
| assertNotNull(docsAndPosEnum); |
| |
| int initDoc = 0; |
| int maxDoc = leafReaderContext.reader().maxDoc(); |
| // initially advance or do next doc |
| if (random().nextBoolean()) { |
| initDoc = docsAndPosEnum.nextDoc(); |
| } else { |
| initDoc = docsAndPosEnum.advance(random().nextInt(maxDoc)); |
| } |
| String msg = "Iteration: " + i + " initDoc: " + initDoc; // TODO: + " payloads: " + usePayload; |
| assertEquals(howMany / 2, docsAndPosEnum.freq()); |
| for (int j = 0; j < howMany; j += 2) { |
| assertEquals("position missmatch index: " + j + " with freq: " |
| + docsAndPosEnum.freq() + " -- " + msg, j, |
| docsAndPosEnum.nextPosition()); |
| } |
| } |
| } |
| reader.close(); |
| dir.close(); |
| } |
| |
| public void testDocsEnumStart() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir); |
| Document doc = new Document(); |
| doc.add(newStringField("foo", "bar", Field.Store.NO)); |
| writer.addDocument(doc); |
| DirectoryReader reader = writer.getReader(); |
| LeafReader r = getOnlyLeafReader(reader); |
| PostingsEnum disi = TestUtil.docs(random(), r, "foo", new BytesRef("bar"), null, PostingsEnum.NONE); |
| int docid = disi.docID(); |
| assertEquals(-1, docid); |
| assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| |
| // now reuse and check again |
| TermsEnum te = r.terms("foo").iterator(); |
| assertTrue(te.seekExact(new BytesRef("bar"))); |
| disi = TestUtil.docs(random(), te, disi, PostingsEnum.NONE); |
| docid = disi.docID(); |
| assertEquals(-1, docid); |
| assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| writer.close(); |
| r.close(); |
| dir.close(); |
| } |
| |
| public void testDocsAndPositionsEnumStart() throws Exception { |
| Directory dir = newDirectory(); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir); |
| Document doc = new Document(); |
| doc.add(newTextField("foo", "bar", Field.Store.NO)); |
| writer.addDocument(doc); |
| DirectoryReader reader = writer.getReader(); |
| LeafReader r = getOnlyLeafReader(reader); |
| PostingsEnum disi = r.postings(new Term("foo", "bar"), PostingsEnum.ALL); |
| int docid = disi.docID(); |
| assertEquals(-1, docid); |
| assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| |
| // now reuse and check again |
| TermsEnum te = r.terms("foo").iterator(); |
| assertTrue(te.seekExact(new BytesRef("bar"))); |
| disi = te.postings(disi, PostingsEnum.ALL); |
| docid = disi.docID(); |
| assertEquals(-1, docid); |
| assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); |
| writer.close(); |
| r.close(); |
| dir.close(); |
| } |
| } |