/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.perfield;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blockterms.LuceneVarGapFixedInterval;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.Test;

/**
 * Tests that {@link PerFieldPostingsFormat} writes, reads, and merges segments whose fields use
 * different postings formats.
 */
// TODO: it would be better in this test to pull terms enums and use instanceof checks or something,
// so we can verify PerFieldPostingsFormat is doing the right thing;
// for now we just run term queries.
@LuceneTestCase.SuppressCodecs("SimpleText")
public class TestPerFieldPostingsFormat2 extends LuceneTestCase {
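
  /** Creates an IndexWriter whose merge policy never writes compound files, so the per-field postings files stay visible on disk. */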
  private IndexWriter newWriter(Directory dir, IndexWriterConfig conf) throws IOException {
    LogDocMergePolicy logDocMergePolicy = new LogDocMergePolicy();
    logDocMergePolicy.setNoCFSRatio(0.0); // make sure we use plain files
    conf.setMergePolicy(logDocMergePolicy);
    return new IndexWriter(dir, conf);
  }
private void addDocs(IndexWriter writer, int numDocs) throws IOException {
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(newTextField("content", "aaa", Field.Store.NO));
writer.addDocument(doc);
}
}
private void addDocs2(IndexWriter writer, int numDocs) throws IOException {
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(newTextField("content", "bbb", Field.Store.NO));
writer.addDocument(doc);
}
}
private void addDocs3(IndexWriter writer, int numDocs) throws IOException {
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(newTextField("content", "ccc", Field.Store.NO));
doc.add(newStringField("id", "" + i, Field.Store.YES));
writer.addDocument(doc);
}
}

  /*
   * Test that heterogeneous index segments are merged successfully.
   */
@Test
public void testMergeUnusedPerFieldCodec() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.CREATE).setCodec(new MockCodec());
IndexWriter writer = newWriter(dir, iwconf);
addDocs(writer, 10);
writer.commit();
addDocs3(writer, 10);
writer.commit();
addDocs2(writer, 10);
writer.commit();
assertEquals(30, writer.getDocStats().maxDoc);
TestUtil.checkIndex(dir);
writer.forceMerge(1);
assertEquals(30, writer.getDocStats().maxDoc);
writer.close();
dir.close();
}

  /*
   * Test that heterogeneous index segments are merged successfully after the codec has changed.
   */
  // TODO: not sure this test is that great, we should probably peek inside PerFieldPostingsFormat or something?!
@Test
public void testChangeCodecAndMerge() throws IOException {
Directory dir = newDirectory();
if (VERBOSE) {
System.out.println("TEST: make new index");
}
IndexWriterConfig iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.CREATE).setCodec(new MockCodec());
iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
//((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
IndexWriter writer = newWriter(dir, iwconf);
addDocs(writer, 10);
writer.commit();
assertQuery(new Term("content", "aaa"), dir, 10);
if (VERBOSE) {
System.out.println("TEST: addDocs3");
}
addDocs3(writer, 10);
writer.commit();
writer.close();
assertQuery(new Term("content", "ccc"), dir, 10);
assertQuery(new Term("content", "aaa"), dir, 10);
Codec codec = iwconf.getCodec();
iwconf = newIndexWriterConfig(new MockAnalyzer(random()))
.setOpenMode(OpenMode.APPEND).setCodec(codec);
//((LogMergePolicy) iwconf.getMergePolicy()).setNoCFSRatio(0.0);
//((LogMergePolicy) iwconf.getMergePolicy()).setMergeFactor(10);
iwconf.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);
    iwconf.setCodec(new MockCodec()); // MockCodec uses the default postings format for the "content" field
writer = newWriter(dir, iwconf);
// swap in new codec for currently written segments
if (VERBOSE) {
System.out.println("TEST: add docs w/ Standard codec for content field");
}
addDocs2(writer, 10);
writer.commit();
codec = iwconf.getCodec();
assertEquals(30, writer.getDocStats().maxDoc);
assertQuery(new Term("content", "bbb"), dir, 10);
assertQuery(new Term("content", "ccc"), dir, 10); ////
assertQuery(new Term("content", "aaa"), dir, 10);
if (VERBOSE) {
System.out.println("TEST: add more docs w/ new codec");
}
addDocs2(writer, 10);
writer.commit();
assertQuery(new Term("content", "ccc"), dir, 10);
assertQuery(new Term("content", "bbb"), dir, 20);
assertQuery(new Term("content", "aaa"), dir, 10);
assertEquals(40, writer.getDocStats().maxDoc);
if (VERBOSE) {
System.out.println("TEST: now optimize");
}
writer.forceMerge(1);
assertEquals(40, writer.getDocStats().maxDoc);
writer.close();
assertQuery(new Term("content", "ccc"), dir, 10);
assertQuery(new Term("content", "bbb"), dir, 20);
assertQuery(new Term("content", "aaa"), dir, 10);
dir.close();
}
  public void assertQuery(Term t, Directory dir, int num) throws IOException {
if (VERBOSE) {
System.out.println("\nTEST: assertQuery " + t);
}
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = newSearcher(reader);
TopDocs search = searcher.search(new TermQuery(t), num + 10);
assertEquals(num, search.totalHits.value);
reader.close();
}
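
  /**
   * Codec that writes the "id" field with {@link DirectPostingsFormat} and every other field with
   * the default postings format, so a single segment mixes two formats.
   */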
public static class MockCodec extends AssertingCodec {
final PostingsFormat luceneDefault = TestUtil.getDefaultPostingsFormat();
final PostingsFormat direct = new DirectPostingsFormat();
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if (field.equals("id")) {
return direct;
} else {
return luceneDefault;
}
}
}

  /*
   * Test per-field codec support - adding fields with random codecs
   */
@Test
public void testStressPerFieldCodec() throws IOException {
Directory dir = newDirectory(random());
final int docsPerRound = 97;
int numRounds = atLeast(1);
for (int i = 0; i < numRounds; i++) {
int num = TestUtil.nextInt(random(), 30, 60);
IndexWriterConfig config = newIndexWriterConfig(random(),
new MockAnalyzer(random()));
config.setOpenMode(OpenMode.CREATE_OR_APPEND);
IndexWriter writer = newWriter(dir, config);
for (int j = 0; j < docsPerRound; j++) {
final Document doc = new Document();
for (int k = 0; k < num; k++) {
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setTokenized(random().nextBoolean());
customType.setOmitNorms(random().nextBoolean());
Field field = newField("" + k, TestUtil
.randomRealisticUnicodeString(random(), 128), customType);
doc.add(field);
}
writer.addDocument(doc);
}
if (random().nextBoolean()) {
writer.forceMerge(1);
}
writer.commit();
assertEquals((i + 1) * docsPerRound, writer.getDocStats().maxDoc);
writer.close();
}
dir.close();
}
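
  /** Two fields mapped to two distinct instances of the same postings format must still write and merge cleanly. */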
public void testSameCodecDifferentInstance() throws Exception {
Codec codec = new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if ("id".equals(field)) {
return new DirectPostingsFormat();
} else if ("date".equals(field)) {
return new DirectPostingsFormat();
} else {
return super.getPostingsFormatForField(field);
}
}
};
doTestMixedPostings(codec);
}
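
  /** Same postings format for two fields, but constructed with different parameters per field. */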
public void testSameCodecDifferentParams() throws Exception {
Codec codec = new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
if ("id".equals(field)) {
return new LuceneVarGapFixedInterval(1);
} else if ("date".equals(field)) {
return new LuceneVarGapFixedInterval(2);
} else {
return super.getPostingsFormatForField(field);
}
}
};
doTestMixedPostings(codec);
}
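
  /** Indexes two low-cardinality postings fields ("id" and "date") under the given codec; CheckIndex on close cross-checks the result. */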
private void doTestMixedPostings(Codec codec) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setCodec(codec);
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
// turn on vectors for the checkindex cross-check
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
Field idField = new Field("id", "", ft);
Field dateField = new Field("date", "", ft);
doc.add(idField);
doc.add(dateField);
for (int i = 0; i < 100; i++) {
idField.setStringValue(Integer.toString(random().nextInt(50)));
dateField.setStringValue(Integer.toString(random().nextInt(100)));
iw.addDocument(doc);
}
iw.close();
    dir.close(); // newDirectory() returns a MockDirectoryWrapper, which runs CheckIndex on close
}
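
  /**
   * Verifies that a forced merge calls FieldsConsumer#merge exactly once per distinct per-field
   * postings format, and that each format only sees the fields assigned to it.
   */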
@SuppressWarnings("deprecation")
public void testMergeCalledOnTwoFormats() throws IOException {
MergeRecordingPostingsFormatWrapper pf1 = new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat());
MergeRecordingPostingsFormatWrapper pf2 = new MergeRecordingPostingsFormatWrapper(TestUtil.getDefaultPostingsFormat());
IndexWriterConfig iwc = new IndexWriterConfig();
iwc.setCodec(new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
switch (field) {
case "f1":
case "f2":
return pf1;
case "f3":
case "f4":
return pf2;
default:
return super.getPostingsFormatForField(field);
}
}
});
Directory directory = newDirectory();
IndexWriter iwriter = new IndexWriter(directory, iwc);
Document doc = new Document();
doc.add(new StringField("f1", "val1", Field.Store.NO));
doc.add(new StringField("f2", "val2", Field.Store.YES));
doc.add(new IntPoint("f3", 3)); // Points are not indexed as postings and should not appear in the merge fields
doc.add(new StringField("f4", "val4", Field.Store.NO));
iwriter.addDocument(doc);
iwriter.commit();
doc = new Document();
doc.add(new StringField("f1", "val5", Field.Store.NO));
doc.add(new StringField("f2", "val6", Field.Store.YES));
doc.add(new IntPoint("f3", 7));
doc.add(new StringField("f4", "val8", Field.Store.NO));
iwriter.addDocument(doc);
iwriter.commit();
iwriter.forceMerge(1, true);
iwriter.close();
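    // pf1 handled f1 and f2; pf2 handled only f4, since the f3 points field never produces postings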
assertEquals(1, pf1.nbMergeCalls);
assertEquals(new HashSet<>(Arrays.asList("f1", "f2")), new HashSet<>(pf1.fieldNames));
assertEquals(1, pf2.nbMergeCalls);
assertEquals(Collections.singletonList("f4"), pf2.fieldNames);
directory.close();
}
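
  /**
   * PostingsFormat that delegates all work to a wrapped format while counting merge invocations
   * and recording the field names present in the merged FieldInfos.
   */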
private static final class MergeRecordingPostingsFormatWrapper extends PostingsFormat {
private final PostingsFormat delegate;
final List<String> fieldNames = new ArrayList<>();
int nbMergeCalls = 0;
MergeRecordingPostingsFormatWrapper(PostingsFormat delegate) {
super(delegate.getName());
this.delegate = delegate;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
final FieldsConsumer consumer = delegate.fieldsConsumer(state);
return new FieldsConsumer() {
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
consumer.write(fields, norms);
}
@Override
public void merge(MergeState mergeState, NormsProducer norms) throws IOException {
nbMergeCalls++;
for (FieldInfo fi : mergeState.mergeFieldInfos) {
fieldNames.add(fi.name);
}
consumer.merge(mergeState, norms);
}
@Override
public void close() throws IOException {
consumer.close();
}
};
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
return delegate.fieldsProducer(state);
}
}
}