/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.backward_codecs.lucene80;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
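
/**
 * Tests terms dictionary compression of {@link Lucene80DocValuesFormat}, comparing its BEST_SPEED
 * and BEST_COMPRESSION modes.
 */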
public class TestDocValuesCompression extends LuceneTestCase {
  private final Codec bestSpeed =
      TestUtil.alwaysDocValuesFormat(
          new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_SPEED));
  private final Codec bestCompression =
      TestUtil.alwaysDocValuesFormat(
          new Lucene80DocValuesFormat(Lucene80DocValuesFormat.Mode.BEST_COMPRESSION));
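
  /**
   * With a cardinality below the block-compression threshold the terms dictionary is left
   * uncompressed, so BEST_SPEED and BEST_COMPRESSION should produce doc values files of equal
   * size.
   */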
  public void testTermsDictCompressionForLowCardinalityFields() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD - 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 10, 30);
      String value = TestUtil.randomSimpleString(random(), length);
      valuesSet.add(value);
    }

    List<String> values = new ArrayList<>(valuesSet);
    long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values);
    long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values);

    // Ensure terms dict data was not compressed for low-cardinality fields.
    assertEquals(sizeForBestSpeed, sizeForBestCompression);
  }
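
  /**
   * With a cardinality above the block-compression threshold the terms dictionary blocks are
   * compressed, so BEST_COMPRESSION should produce a smaller doc values file than BEST_SPEED.
   */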
  public void testTermsDictCompressionForHighCardinalityFields() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 10, 30);
      String value = TestUtil.randomSimpleString(random(), length);
      // Add common suffix for better compression ratio.
      valuesSet.add(value + "_CommonPartBetterForCompression");
    }

    List<String> values = new ArrayList<>(valuesSet);
    long sizeForBestSpeed = writeAndGetDocValueFileSize(bestSpeed, values);
    long sizeForBestCompression = writeAndGetDocValueFileSize(bestCompression, values);

    // Compression happened.
    assertTrue(sizeForBestCompression < sizeForBestSpeed);
  }
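
  /**
   * Looks up a value that sorts between the first and second terms dictionary blocks, exercising
   * the re-seek logic after skip-decompression.
   */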
  public void testReseekAfterSkipDecompression() throws IOException {
    final int CARDINALITY = (Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE << 1) + 11;
    Set<String> valueSet = new HashSet<>(CARDINALITY);
    for (int i = 0; i < CARDINALITY; i++) {
      valueSet.add(TestUtil.randomSimpleString(random(), 64));
    }
    List<String> values = new ArrayList<>(valueSet);
    Collections.sort(values);
    // Create a value that does not exist but sorts just between block 1 and block 2.
    String nonexistentValue =
        values.get(Lucene80DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SIZE - 1)
            + TestUtil.randomSimpleString(random(), 64, 128);
    int docValues = values.size();

    try (Directory directory = newDirectory()) {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(bestCompression);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 280; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % docValues))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      DirectoryReader dReader = DirectoryReader.open(writer);
      writer.close();

      LeafReader reader = getOnlyLeafReader(dReader);
      // Check values count.
      SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv");
      assertEquals(docValues, ssdvMulti.getValueCount());

      // Seek to first block.
      int ord1 = ssdvMulti.lookupTerm(new BytesRef(values.get(0)));
      assertTrue(ord1 >= 0);
      int ord2 = ssdvMulti.lookupTerm(new BytesRef(values.get(1)));
      assertTrue(ord2 >= ord1);

      // Ensure re-seek logic is correct after skip-decompression.
      int nonexistentOrd2 = ssdvMulti.lookupTerm(new BytesRef(nonexistentValue));
      assertTrue(nonexistentOrd2 < 0);
      dReader.close();
    }
  }
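
  /**
   * Indexes large terms (512-1024 chars) with BEST_COMPRESSION and checks that the expected
   * number of distinct values can still be read back from the terms dictionary.
   */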
  public void testLargeTermsCompression() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 512, 1024);
      valuesSet.add(TestUtil.randomSimpleString(random(), length));
    }
    int valuesCount = valuesSet.size();
    List<String> values = new ArrayList<>(valuesSet);

    try (Directory directory = newDirectory()) {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(bestCompression);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 256; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      DirectoryReader ireader = DirectoryReader.open(writer);
      writer.close();

      LeafReader reader = getOnlyLeafReader(ireader);
      // Check values count.
      SortedDocValues ssdvMulti = reader.getSortedDocValues("sdv");
      assertEquals(valuesCount, ssdvMulti.getValueCount());
      ireader.close();
    }
  }

  // Ensure an old uncompressed segment can be merged together with a new compressed segment.
  public void testMergeWithUncompressedSegment() throws IOException {
    final int CARDINALITY = Lucene80DocValuesFormat.TERMS_DICT_BLOCK_COMPRESSION_THRESHOLD << 1;
    Set<String> valuesSet = new HashSet<>();
    for (int i = 0; i < CARDINALITY; ++i) {
      final int length = TestUtil.nextInt(random(), 10, 30);
      valuesSet.add(TestUtil.randomSimpleString(random(), length));
    }
    List<String> values = new ArrayList<>(valuesSet);
    int valuesCount = values.size();

    try (Directory directory = newDirectory()) {
      // 1. Write 256 documents without terms dict compression.
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(bestSpeed);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 256; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount))));
        doc.add(
            new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 1) % valuesCount))));
        doc.add(
            new SortedSetDocValuesField("ssdv", new BytesRef(values.get((i + 2) % valuesCount))));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();

      DirectoryReader ireader = DirectoryReader.open(writer);
      assertEquals(256, ireader.numDocs());
      LeafReader reader = getOnlyLeafReader(ireader);
      SortedSetDocValues ssdv = reader.getSortedSetDocValues("ssdv");
      assertEquals(valuesCount, ssdv.getValueCount());
      SortedDocValues sdv = reader.getSortedDocValues("sdv");
      assertEquals(valuesCount, sdv.getValueCount());
      ireader.close();
      writer.close();

      // 2. Add another 100 documents with terms dict compression enabled.
      config = new IndexWriterConfig(analyzer);
      config.setCodec(bestCompression);
      config.setUseCompoundFile(false);
      writer = new IndexWriter(directory, config);
      // Add 2 new values.
      valuesSet.add(TestUtil.randomSimpleString(random(), 10));
      valuesSet.add(TestUtil.randomSimpleString(random(), 10));
      values = new ArrayList<>(valuesSet);
      valuesCount = valuesSet.size();

      for (int i = 256; i < 356; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        doc.add(new SortedSetDocValuesField("ssdv", new BytesRef(values.get(i % valuesCount))));
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);

      ireader = DirectoryReader.open(writer);
      assertEquals(356, ireader.numDocs());
      reader = getOnlyLeafReader(ireader);
      ssdv = reader.getSortedSetDocValues("ssdv");
      assertEquals(valuesCount, ssdv.getValueCount());
      ireader.close();
      writer.close();
    }
  }
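
  /**
   * Indexes 256 documents with sorted and sorted-set doc values using the given codec, verifies
   * that every value can be looked up, and returns the size of the resulting doc values data
   * file.
   */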
  private static long writeAndGetDocValueFileSize(Codec codec, List<String> values)
      throws IOException {
    int valuesCount = values.size();
    long dvdFileSize = -1;
    try (Directory directory = newDirectory()) {
      Analyzer analyzer = new StandardAnalyzer();
      IndexWriterConfig config = new IndexWriterConfig(analyzer);
      config.setCodec(codec);
      config.setUseCompoundFile(false);
      IndexWriter writer = new IndexWriter(directory, config);
      for (int i = 0; i < 256; i++) {
        Document doc = new Document();
        doc.add(new StringField("id", "Doc" + i, Field.Store.NO));
        // Multi value sorted-set field.
        doc.add(
            new SortedSetDocValuesField("ssdv_multi_", new BytesRef(values.get(i % valuesCount))));
        doc.add(
            new SortedSetDocValuesField(
                "ssdv_multi_", new BytesRef(values.get((i + 1) % valuesCount))));
        doc.add(
            new SortedSetDocValuesField(
                "ssdv_multi_", new BytesRef(values.get((i + 2) % valuesCount))));
        // Single value sorted-set field.
        doc.add(
            new SortedSetDocValuesField("ssdv_single_", new BytesRef(values.get(i % valuesCount))));
        // Sorted field.
        doc.add(new SortedDocValuesField("sdv", new BytesRef(values.get(i % valuesCount))));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.forceMerge(1);
      DirectoryReader ireader = DirectoryReader.open(writer);
      writer.close();

      LeafReader reader = getOnlyLeafReader(ireader);
      // Check values count.
      SortedSetDocValues ssdvMulti = reader.getSortedSetDocValues("ssdv_multi_");
      assertEquals(valuesCount, ssdvMulti.getValueCount());
      for (int i = 0; i < valuesCount; i++) {
        BytesRef term = ssdvMulti.lookupOrd(i);
        assertTrue(term.bytes.length > 0);
      }
      for (int i = 0; i < valuesCount; i++) {
        for (int j = 0; j < 3; j++) {
          assertTrue(ssdvMulti.lookupTerm(new BytesRef(values.get((i + j) % valuesCount))) >= 0);
        }
      }

      SortedSetDocValues ssdvSingle = reader.getSortedSetDocValues("ssdv_single_");
      assertEquals(valuesCount, ssdvSingle.getValueCount());
      for (int i = 0; i < valuesCount; i++) {
        assertTrue(ssdvSingle.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0);
      }

      SortedDocValues sdv = reader.getSortedDocValues("sdv");
      assertEquals(valuesCount, sdv.getValueCount());
      for (int i = 0; i < valuesCount; i++) {
        assertTrue(sdv.lookupTerm(new BytesRef(values.get(i % valuesCount))) >= 0);
      }

      dvdFileSize = docValueFileSize(directory);
      assertTrue(dvdFileSize > 0);
      ireader.close();
    }
    return dvdFileSize;
  }
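
  /** Returns the size of the doc values data file in the directory, or -1 if none is found. */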
  static long docValueFileSize(Directory d) throws IOException {
    for (String file : d.listAll()) {
      if (file.endsWith(Lucene80DocValuesFormat.DATA_EXTENSION)) {
        return d.fileLength(file);
      }
    }
    return -1;
  }
}