| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.perfield; |
| |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.codecs.Codec; |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.FieldsProducer; |
| import org.apache.lucene.codecs.NormsProducer; |
| import org.apache.lucene.codecs.PostingsFormat; |
| import org.apache.lucene.codecs.asserting.AssertingCodec; |
| import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval; |
| import org.apache.lucene.codecs.memory.DirectPostingsFormat; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.IntPoint; |
| import org.apache.lucene.document.StringField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
| import org.apache.lucene.index.LogDocMergePolicy; |
| import org.apache.lucene.index.MergeState; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.TopDocs; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.Test; |
| |
| /** |
| * |
| * |
| */ |
| //TODO: would be better in this test to pull termsenums and instanceof or something? |
| // this way we can verify PFPF is doing the right thing. |
| // for now we do termqueries. |
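// A hedged sketch of what that instanceof check might look like (assumes a
// single-segment index; SegmentReader.getPostingsReader() exposes the
// per-field producer):
//
//   try (DirectoryReader r = DirectoryReader.open(dir)) {
//     SegmentReader sr = (SegmentReader) r.leaves().get(0).reader();
//     FieldsProducer perField = sr.getPostingsReader();
//     // ... walk fields and assert instanceof on the per-field delegates
//   }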
| @LuceneTestCase.SuppressCodecs("SimpleText") |
| public class TestPerFieldPostingsFormat2 extends LuceneTestCase { |
| |
  /** Creates an {@link IndexWriter} with a merge policy that never uses compound files. */
  private IndexWriter newWriter(Directory dir, IndexWriterConfig conf)
      throws IOException {
    LogDocMergePolicy logDocMergePolicy = new LogDocMergePolicy();
    logDocMergePolicy.setNoCFSRatio(0.0); // make sure we use plain (non-compound) files
    conf.setMergePolicy(logDocMergePolicy);
| |
| final IndexWriter writer = new IndexWriter(dir, conf); |
| return writer; |
| } |
| |
| private void addDocs(IndexWriter writer, int numDocs) throws IOException { |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| doc.add(newTextField("content", "aaa", Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| } |
| |
| private void addDocs2(IndexWriter writer, int numDocs) throws IOException { |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| doc.add(newTextField("content", "bbb", Field.Store.NO)); |
| writer.addDocument(doc); |
| } |
| } |
| |
| private void addDocs3(IndexWriter writer, int numDocs) throws IOException { |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| doc.add(newTextField("content", "ccc", Field.Store.NO)); |
| doc.add(newStringField("id", "" + i, Field.Store.YES)); |
| writer.addDocument(doc); |
| } |
| } |
| |
| /* |
| * Test that heterogeneous index segments are merge successfully |
| */ |
| @Test |
| public void testMergeUnusedPerFieldCodec() throws IOException { |
| Directory dir = newDirectory(); |
| IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random())) |
| .setOpenMode(OpenMode.CREATE).setCodec(new MockCodec()); |
| IndexWriter writer = newWriter(dir, iwconf); |
| addDocs(writer, 10); |
| writer.commit(); |
| addDocs3(writer, 10); |
| writer.commit(); |
| addDocs2(writer, 10); |
| writer.commit(); |
| assertEquals(30, writer.getDocStats().maxDoc); |
| TestUtil.checkIndex(dir); |
| writer.forceMerge(1); |
| assertEquals(30, writer.getDocStats().maxDoc); |
| writer.close(); |
| dir.close(); |
| } |
| |
| /* |
| * Test that heterogeneous index segments are merged sucessfully |
| */ |
| // TODO: not sure this test is that great, we should probably peek inside PerFieldPostingsFormat or something?! |
| @Test |
| public void testChangeCodecAndMerge() throws IOException { |
| Directory dir = newDirectory(); |
| if (VERBOSE) { |
| System.out.println("TEST: make new index"); |
| } |
| IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random())) |
| .setOpenMode(OpenMode.CREATE).setCodec(new MockCodec()); |
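    // disable flushing by doc count, so new segments only appear at the explicit
    // commits below (the docs here are tiny, so RAM-based flushing is not a factor)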
| iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH); |
| //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10); |
| IndexWriter writer = newWriter(dir, iwconf); |
| |
| addDocs(writer, 10); |
| writer.commit(); |
| assertQuery(new Term("content", "aaa"), dir, 10); |
| if (VERBOSE) { |
| System.out.println("TEST: addDocs3"); |
| } |
| addDocs3(writer, 10); |
| writer.commit(); |
| writer.close(); |
| |
| assertQuery(new Term("content", "ccc"), dir, 10); |
| assertQuery(new Term("content", "aaa"), dir, 10); |
| Codec codec = iwconf.getCodec(); |
| |
| iwconf = newIndexWriterConfig(new MockAnalyzer(random())) |
| .setOpenMode(OpenMode.APPEND).setCodec(codec); |
| //((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0); |
| //((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10); |
| iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH); |
| |
    // swap in a fresh MockCodec for segments written from here on; it keeps
    // using the default postings format for the "content" field
    iwconf.setCodec(new MockCodec());
    writer = newWriter(dir, iwconf);
| if (VERBOSE) { |
| System.out.println("TEST: add docs w/ Standard codec for content field"); |
| } |
| addDocs2(writer, 10); |
| writer.commit(); |
| assertEquals(30, writer.getDocStats().maxDoc); |
| assertQuery(new Term("content", "bbb"), dir, 10); |
| assertQuery(new Term("content", "ccc"), dir, 10); //// |
| assertQuery(new Term("content", "aaa"), dir, 10); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: add more docs w/ new codec"); |
| } |
| addDocs2(writer, 10); |
| writer.commit(); |
| assertQuery(new Term("content", "ccc"), dir, 10); |
| assertQuery(new Term("content", "bbb"), dir, 20); |
| assertQuery(new Term("content", "aaa"), dir, 10); |
| assertEquals(40, writer.getDocStats().maxDoc); |
| |
| if (VERBOSE) { |
| System.out.println("TEST: now optimize"); |
| } |
| writer.forceMerge(1); |
| assertEquals(40, writer.getDocStats().maxDoc); |
| writer.close(); |
| assertQuery(new Term("content", "ccc"), dir, 10); |
| assertQuery(new Term("content", "bbb"), dir, 20); |
| assertQuery(new Term("content", "aaa"), dir, 10); |
| |
| dir.close(); |
| } |
| |
  /** Asserts that a {@link TermQuery} for {@code t} matches exactly {@code num} documents. */
  public void assertQuery(Term t, Directory dir, int num) throws IOException {
| if (VERBOSE) { |
| System.out.println("\nTEST: assertQuery " + t); |
| } |
| IndexReader reader = DirectoryReader.open(dir); |
| IndexSearcher searcher = newSearcher(reader); |
| TopDocs search = searcher.search(new TermQuery(t), num + 10); |
| assertEquals(num, search.totalHits.value); |
    reader.close();
  }
| |
  /** Codec that writes the "id" field with {@link DirectPostingsFormat} and every other field with the default postings format. */
  public static class MockCodec extends AssertingCodec {
| final PostingsFormat luceneDefault = TestUtil.getDefaultPostingsFormat(); |
| final PostingsFormat direct = new DirectPostingsFormat(); |
| |
| @Override |
| public PostingsFormat getPostingsFormatForField(String field) { |
| if (field.equals("id")) { |
| return direct; |
| } else { |
| return luceneDefault; |
| } |
| } |
| } |
| |
| /* |
| * Test per field codec support - adding fields with random codecs |
| */ |
| @Test |
| public void testStressPerFieldCodec() throws IOException { |
| Directory dir = newDirectory(random()); |
| final int docsPerRound = 97; |
| int numRounds = atLeast(1); |
| for (int i = 0; i < numRounds; i++) { |
| int num = TestUtil.nextInt(random(), 30, 60); |
| IndexWriterConfig config = newIndexWriterConfig(random(), |
| new MockAnalyzer(random())); |
| config.setOpenMode(OpenMode.CREATE_OR_APPEND); |
| IndexWriter writer = newWriter(dir, config); |
| for (int j = 0; j < docsPerRound; j++) { |
| final Document doc = new Document(); |
| for (int k = 0; k < num; k++) { |
| FieldType customType = new FieldType(TextField.TYPE_NOT_STORED); |
| customType.setTokenized(random().nextBoolean()); |
| customType.setOmitNorms(random().nextBoolean()); |
| Field field = newField("" + k, TestUtil |
| .randomRealisticUnicodeString(random(), 128), customType); |
| doc.add(field); |
| } |
| writer.addDocument(doc); |
| } |
| if (random().nextBoolean()) { |
| writer.forceMerge(1); |
| } |
| writer.commit(); |
| assertEquals((i + 1) * docsPerRound, writer.getDocStats().maxDoc); |
| writer.close(); |
| } |
| dir.close(); |
| } |
| |
| public void testSameCodecDifferentInstance() throws Exception { |
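    // two distinct DirectPostingsFormat instances registered under the same format
    // name; PerFieldPostingsFormat must still keep their per-field state apart
    // (e.g. via distinct segment suffixes) so both stay readable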
| Codec codec = new AssertingCodec() { |
| @Override |
| public PostingsFormat getPostingsFormatForField(String field) { |
| if ("id".equals(field)) { |
| return new DirectPostingsFormat(); |
| } else if ("date".equals(field)) { |
| return new DirectPostingsFormat(); |
| } else { |
| return super.getPostingsFormatForField(field); |
| } |
| } |
| }; |
| doTestMixedPostings(codec); |
| } |
| |
| public void testSameCodecDifferentParams() throws Exception { |
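    // the same format class configured with different fixed intervals (1 vs 2);
    // the two configurations must not clobber each other's files within a segment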
| Codec codec = new AssertingCodec() { |
| @Override |
| public PostingsFormat getPostingsFormatForField(String field) { |
| if ("id".equals(field)) { |
| return new LuceneVarGapFixedInterval(1); |
| } else if ("date".equals(field)) { |
| return new LuceneVarGapFixedInterval(2); |
| } else { |
| return super.getPostingsFormatForField(field); |
| } |
| } |
| }; |
| doTestMixedPostings(codec); |
| } |
| |
| private void doTestMixedPostings(Codec codec) throws Exception { |
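    // shared body for the two tests above: both "id" and "date" carry postings
    // (plus term vectors for cross-checking), and the CheckIndex pass when the
    // directory is closed verifies the mixed per-field formats are readable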
| Directory dir = newDirectory(); |
| IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random())); |
| iwc.setCodec(codec); |
| RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); |
| Document doc = new Document(); |
| FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); |
| // turn on vectors for the checkindex cross-check |
| ft.setStoreTermVectors(true); |
| ft.setStoreTermVectorOffsets(true); |
| ft.setStoreTermVectorPositions(true); |
| Field idField = new Field("id", "", ft); |
| Field dateField = new Field("date", "", ft); |
| doc.add(idField); |
| doc.add(dateField); |
| for (int i = 0; i < 100; i++) { |
| idField.setStringValue(Integer.toString(random().nextInt(50))); |
| dateField.setStringValue(Integer.toString(random().nextInt(100))); |
| iw.addDocument(doc); |
| } |
| iw.close(); |
    dir.close(); // the test framework's MockDirectoryWrapper runs CheckIndex on close
| } |
| |
  /** Verifies that each per-field postings format is asked to merge exactly once, and only with its own fields. */
  @SuppressWarnings("deprecation")
  public void testMergeCalledOnTwoFormats() throws IOException {
| MergeRecordingPostingsFormatWrapper pf1 = new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat()); |
| MergeRecordingPostingsFormatWrapper pf2 = new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat()); |
| |
| IndexWriterConfig iwc = new IndexWriterConfig(); |
| iwc.setCodec(new AssertingCodec() { |
| @Override |
| public PostingsFormat getPostingsFormatForField(String field) { |
| switch (field) { |
| case "f1": |
| case "f2": |
| return pf1; |
| |
| case "f3": |
| case "f4": |
| return pf2; |
| |
| default: |
| return super.getPostingsFormatForField(field); |
| } |
| } |
| }); |
| |
| Directory directory = newDirectory(); |
| |
| IndexWriter iwriter = new IndexWriter(directory, iwc); |
| |
| Document doc = new Document(); |
| doc.add(new StringField("f1", "val1", Field.Store.NO)); |
| doc.add(new StringField("f2", "val2", Field.Store.YES)); |
| doc.add(new IntPoint("f3", 3)); // Points are not indexed as postings and should not appear in the merge fields |
| doc.add(new StringField("f4", "val4", Field.Store.NO)); |
| iwriter.addDocument(doc); |
| iwriter.commit(); |
| |
| doc = new Document(); |
| doc.add(new StringField("f1", "val5", Field.Store.NO)); |
| doc.add(new StringField("f2", "val6", Field.Store.YES)); |
| doc.add(new IntPoint("f3", 7)); |
| doc.add(new StringField("f4", "val8", Field.Store.NO)); |
| iwriter.addDocument(doc); |
| iwriter.commit(); |
| |
| iwriter.forceMerge(1, true); |
| iwriter.close(); |
| |
| assertEquals(1, pf1.nbMergeCalls); |
| assertEquals(new HashSet<>(Arrays.asList("f1", "f2")), new HashSet<>(pf1.fieldNames)); |
| assertEquals(1, pf2.nbMergeCalls); |
| assertEquals(Collections.singletonList("f4"), pf2.fieldNames); |
| |
| directory.close(); |
| } |
| |
  /** {@link PostingsFormat} wrapper that counts merge calls and records the field names passed to each merge. */
  private static final class MergeRecordingPostingsFormatWrapper extends PostingsFormat {
| private final PostingsFormat delegate; |
| final List<String> fieldNames = new ArrayList<>(); |
| int nbMergeCalls = 0; |
| |
| MergeRecordingPostingsFormatWrapper(PostingsFormat delegate) { |
| super(delegate.getName()); |
| this.delegate = delegate; |
| } |
| |
| @Override |
| public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { |
| final FieldsConsumer consumer = delegate.fieldsConsumer(state); |
| return new FieldsConsumer() { |
| @Override |
| public void write(Fields fields, NormsProducer norms) throws IOException { |
| consumer.write(fields, norms); |
| } |
| |
| @Override |
| public void merge(MergeState mergeState, NormsProducer norms) throws IOException { |
| nbMergeCalls++; |
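        // record every field this format instance is asked to merge; the test
        // asserts that each wrapped format only sees the fields routed to it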
| for (FieldInfo fi : mergeState.mergeFieldInfos) { |
| fieldNames.add(fi.name); |
| } |
| consumer.merge(mergeState, norms); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| consumer.close(); |
| } |
| }; |
| } |
| |
| @Override |
| public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { |
| return delegate.fieldsProducer(state); |
| } |
| } |
| } |