blob: bac65ba6a6f76264cccb279c89f0f5a18a6b3e33 [file] [log] [blame]
Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java (revision 1689996)
+++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java (working copy)
@@ -23,6 +23,11 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
@@ -34,6 +39,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LongsRef;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
@@ -463,11 +469,22 @@
// The field is single-valued, we can encode it as NUMERIC
addNumericField(field, singletonView(docToValueCount, values, null));
} else {
- meta.writeVInt(SORTED_WITH_ADDRESSES);
- // write the stream of values as a numeric field
- addNumericField(field, values, true);
- // write the doc -> ord count as a absolute index to the stream
- addAddresses(field, docToValueCount);
+ final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
+ if (uniqueValueSets != null) {
+ meta.writeVInt(SORTED_SET_TABLE);
+
+ // write the set_id -> values mapping
+ writeDictionary(uniqueValueSets);
+
+ // write the doc -> set_id as a numeric field
+ addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
+ } else {
+ meta.writeVInt(SORTED_WITH_ADDRESSES);
+ // write the stream of values as a numeric field
+ addNumericField(field, values, true);
+ // write the doc -> ord count as an absolute index to the stream
+ addAddresses(field, docToValueCount);
+ }
}
}
@@ -481,20 +498,120 @@
// The field is single-valued, we can encode it as SORTED
addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
} else {
- meta.writeVInt(SORTED_WITH_ADDRESSES);
+ final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
+ if (uniqueValueSets != null) {
+ meta.writeVInt(SORTED_SET_TABLE);
- // write the ord -> byte[] as a binary field
- addTermsDict(field, values);
+ // write the set_id -> ords mapping
+ writeDictionary(uniqueValueSets);
- // write the stream of ords as a numeric field
- // NOTE: we could return an iterator that delta-encodes these within a doc
- addNumericField(field, ords, false);
+ // write the ord -> byte[] as a binary field
+ addTermsDict(field, values);
- // write the doc -> ord count as a absolute index to the stream
- addAddresses(field, docToOrdCount);
+ // write the doc -> set_id as a numeric field
+ addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
+ } else {
+ meta.writeVInt(SORTED_WITH_ADDRESSES);
+
+ // write the ord -> byte[] as a binary field
+ addTermsDict(field, values);
+
+ // write the stream of ords as a numeric field
+ // NOTE: we could return an iterator that delta-encodes these within a doc
+ addNumericField(field, ords, false);
+
+ // write the doc -> ord count as an absolute index to the stream
+ addAddresses(field, docToOrdCount);
+ }
}
}
-
+
+ /** Returns the distinct per-document value sets as a TreeSet, or null if they exceed the table limits. */
+ private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
+ final Set<LongsRef> distinctSets = new HashSet<>();
+ final LongsRef scratch = new LongsRef(256); // reused buffer holding the current document's values
+ final Iterator<Number> counts = docToValueCount.iterator();
+ final Iterator<Number> valueIt = values.iterator();
+ int dictSize = 0; // number of values summed over all distinct sets
+ while (counts.hasNext()) {
+ scratch.length = counts.next().intValue();
+ if (scratch.length > 256) {
+ return null; // this document alone has more values than the scratch buffer holds
+ }
+ for (int k = 0; k < scratch.length; ++k) {
+ scratch.longs[k] = valueIt.next().longValue();
+ }
+ if (distinctSets.contains(scratch) == false) {
+ dictSize += scratch.length;
+ if (dictSize > 256) {
+ return null; // the distinct sets hold more than 256 values in total
+ }
+ // store a copy: scratch is overwritten for the next document
+ distinctSets.add(new LongsRef(Arrays.copyOf(scratch.longs, scratch.length), 0, scratch.length));
+ }
+ }
+ assert valueIt.hasNext() == false;
+ return new TreeSet<>(distinctSets);
+ }
+
+ /** Writes the table to meta: total value count, all values, set count, then each set's length. */
+ private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
+ int totalValues = 0;
+ for (LongsRef set : uniqueValueSets) {
+ totalValues += set.length;
+ }
+ meta.writeInt(totalValues);
+ // the values of every set, concatenated in the set's iteration order
+ for (LongsRef set : uniqueValueSets) {
+ for (int pos = set.offset, end = set.offset + set.length; pos < end; ++pos) {
+ meta.writeLong(set.longs[pos]);
+ }
+ }
+ meta.writeInt(uniqueValueSets.size());
+ for (LongsRef set : uniqueValueSets) {
+ meta.writeInt(set.length);
+ }
+ }
+
+ /** Returns a per-document view mapping each document to the index of its value set within uniqueValueSets. */
+ private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
+ // ids are assigned in the iteration order of the sorted set
+ final Map<LongsRef, Integer> idBySet = new HashMap<>();
+ int nextId = 0;
+ for (LongsRef set : uniqueValueSets) {
+ idBySet.put(set, nextId);
+ nextId++;
+ }
+ assert nextId == uniqueValueSets.size();
+ // each call to iterator() starts a fresh pass over the counts and values
+ return () -> {
+ final Iterator<Number> counts = docToValueCount.iterator();
+ final Iterator<Number> valueIt = values.iterator();
+ final LongsRef scratch = new LongsRef(256); // reused lookup key for the current document
+ return new Iterator<Number>() {
+
+ @Override
+ public boolean hasNext() {
+ return counts.hasNext();
+ }
+
+ @Override
+ public Number next() {
+ // rebuild the document's value set in scratch, then look up its id
+ final int count = counts.next().intValue();
+ scratch.length = count;
+ for (int k = 0; k < count; ++k) {
+ scratch.longs[k] = valueIt.next().longValue();
+ }
+ final Integer setId = idBySet.get(scratch);
+ assert setId != null;
+ return setId;
+ }
+
+ };
+ };
+ }
+
// writes addressing information as MONOTONIC_COMPRESSED integer
private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException {
meta.writeVInt(field.number);
Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java (revision 1689996)
+++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java (working copy)
@@ -72,13 +72,21 @@
* <p>
* {@link DocValuesType#SORTED_SET SORTED_SET}:
* <ul>
+ * <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
+ * <li>SortedSet table: when the unique sets of values hold at most 256 values in total, each set is assigned
+ * an id, a lookup table is written and the mapping from document to set id is written using the
+ * numeric strategies above.
* <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
* an ordinal list and per-document index into this list are written using the numeric strategies
- * above.
+ * above.
* </ul>
* <p>
* {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
* <ul>
+ * <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
+ * <li>SortedSet table: when the unique sets of values hold at most 256 values in total, each set is assigned
+ * an id, a lookup table is written and the mapping from document to set id is written using the
+ * numeric strategies above.
* <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
* strategies above.
* </ul>
@@ -108,21 +116,24 @@
* <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
* <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
* <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
- * <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
- * <li>SortedNumericEntry --&gt; EntryType,NumericEntry,NumericEntry</li>
+ * <li>SortedSetEntry --&gt; SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
+ * <li>SingleSortedSetEntry --&gt; SetHeader,SortedEntry</li>
+ * <li>AddressesSortedSetEntry --&gt; SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
+ * <li>TableSortedSetEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
+ * <li>SetHeader --&gt; FieldNumber,EntryType,SetType</li>
+ * <li>SortedNumericEntry --&gt; SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
+ * <li>SingleSortedNumericEntry --&gt; SetHeader,NumericEntry</li>
+ * <li>AddressesSortedNumericEntry --&gt; SetHeader,NumericEntry,NumericEntry</li>
+ * <li>TableSortedNumericEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
* <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
* <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li>
- * <li>TableSize,BitsPerValue --&gt; {@link DataOutput#writeVInt vInt}</li>
+ * <li>TableSize,BitsPerValue --&gt; {@link DataOutput#writeVInt vInt}</li> <li>TotalTableLength --&gt; {@link DataOutput#writeInt Int32}</li>
* <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
* </ul>
* <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
* and an ordinary NumericEntry for the document-to-ord metadata.</p>
- * <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
- * and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
- * <p>SortedNumeric fields have two entries: A NumericEntry with the value metadata,
- * and a numeric entry with the document-to-value index.</p>
* <p>FieldNumber of -1 indicates the end of metadata.</p>
* <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
* <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
@@ -144,6 +155,15 @@
* <li>1 --&gt; variable-width. An address for each value is stored.
* <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
* </ul>
+ * <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
+ * <ul>
+ * <li>0 --&gt; with addresses. There are two numeric entries: a first one from document to start
+ * offset, and a second one from offset to ord/value.
+ * <li>1 --&gt; single-valued. Used when all documents have at most one value and is encoded like
+ * a regular Sorted/Numeric entry.
+ * <li>2 --&gt; table-encoded. A lookup table of unique sets of values is written, followed by a
+ * numeric entry that maps each document to an ordinal in this table.
+ * </ul>
* <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
* If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
* Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
@@ -187,7 +207,8 @@
static final String META_CODEC = "Lucene50DocValuesMetadata";
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
- static final int VERSION_CURRENT = VERSION_START;
+ static final int VERSION_SORTEDSET_TABLE = 1;
+ static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
// indicates docvalues type
static final byte NUMERIC = 0;
@@ -235,6 +256,9 @@
/** Single-valued sorted set values, encoded as sorted values, so no level
* of indirection: {@code docId -> ord}. */
static final int SORTED_SINGLE_VALUED = 1;
+ /** Compressed by giving IDs to unique sets of values:
+ * {@code docId -> setId -> ords} */
+ static final int SORTED_SET_TABLE = 2;
/** placeholder for missing offset that means there are no missing values */
static final int ALL_LIVE = -1;
Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java (revision 1689996)
+++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java (working copy)
@@ -206,6 +206,28 @@
ordIndexes.put(info.name, n2);
}
+ /** Reads the metadata of a table-encoded sorted set field: a binary entry followed by a numeric entry. */
+ private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException {
+ // sortedset table = binary + ordset table + ordset index
+
+ // binary entry header; the type byte is only read when the field number matches
+ if (meta.readVInt() != info.number
+ || meta.readByte() != Lucene50DocValuesFormat.BINARY) {
+ throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
+ }
+ final BinaryEntry termsEntry = readBinaryEntry(meta);
+ binaries.put(info.name, termsEntry);
+
+ // numeric entry header, checked the same way; the entry is registered in ords
+ if (meta.readVInt() != info.number
+ || meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+ throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
+ }
+ final NumericEntry setIdEntry = readNumericEntry(meta);
+ ords.put(info.name, setIdEntry);
+ }
+
+
private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
int numFields = 0;
int fieldNumber = meta.readVInt();
@@ -229,6 +251,8 @@
sortedSets.put(info.name, ss);
if (ss.format == SORTED_WITH_ADDRESSES) {
readSortedSetFieldWithAddresses(info, meta);
+ } else if (ss.format == SORTED_SET_TABLE) {
+ readSortedSetFieldWithTable(info, meta);
} else if (ss.format == SORTED_SINGLE_VALUED) {
if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
@@ -243,13 +267,6 @@
} else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) {
SortedSetEntry ss = readSortedSetEntry(meta);
sortedNumerics.put(info.name, ss);
- if (meta.readVInt() != fieldNumber) {
- throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
- }
- if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
- throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
- }
- numerics.put(info.name, readNumericEntry(meta));
if (ss.format == SORTED_WITH_ADDRESSES) {
if (meta.readVInt() != fieldNumber) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
@@ -257,9 +274,33 @@
if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
}
+ numerics.put(info.name, readNumericEntry(meta));
+ if (meta.readVInt() != fieldNumber) {
+ throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+ }
+ if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+ throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+ }
NumericEntry ordIndex = readNumericEntry(meta);
ordIndexes.put(info.name, ordIndex);
- } else if (ss.format != SORTED_SINGLE_VALUED) {
+ } else if (ss.format == SORTED_SET_TABLE) {
+ if (meta.readVInt() != info.number) {
+ throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+ }
+ if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+ throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+ }
+ NumericEntry n = readNumericEntry(meta);
+ ords.put(info.name, n);
+ } else if (ss.format == SORTED_SINGLE_VALUED) {
+ if (meta.readVInt() != fieldNumber) {
+ throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+ }
+ if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+ throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+ }
+ numerics.put(info.name, readNumericEntry(meta));
+ } else {
throw new AssertionError();
}
} else {
@@ -346,7 +387,24 @@
SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
SortedSetEntry entry = new SortedSetEntry();
entry.format = meta.readVInt();
- if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
+ if (entry.format == SORTED_SET_TABLE) {
+ final int totalTableLength = meta.readInt(); // number of values over all sets of the table
+ if (totalTableLength < 0 || totalTableLength > 256) { // a negative length must be reported as corruption, not reach the array allocation
+ throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta);
+ }
+ entry.table = new long[totalTableLength];
+ for (int i = 0; i < totalTableLength; ++i) {
+ entry.table[i] = meta.readLong();
+ }
+ final int tableSize = meta.readInt(); // number of sets in the table
+ if (tableSize < 0 || tableSize > totalTableLength + 1) { // +1 because of the empty set; negative sizes are corrupt
+ throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta);
+ }
+ entry.tableOffsets = new int[tableSize + 1]; // set i spans table[tableOffsets[i]] .. table[tableOffsets[i + 1] - 1]
+ for (int i = 1; i < entry.tableOffsets.length; ++i) {
+ entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt(); // running sum of the stored set lengths
+ }
+ } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
throw new CorruptIndexException("Unknown format: " + entry.format, meta);
}
return entry;
@@ -611,12 +669,14 @@
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
SortedSetEntry ss = sortedNumerics.get(field.name);
- NumericEntry numericEntry = numerics.get(field.name);
- final LongValues values = getNumeric(numericEntry);
if (ss.format == SORTED_SINGLE_VALUED) {
+ NumericEntry numericEntry = numerics.get(field.name);
+ final LongValues values = getNumeric(numericEntry);
final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
return DocValues.singleton(values, docsWithField);
} else if (ss.format == SORTED_WITH_ADDRESSES) {
+ NumericEntry numericEntry = numerics.get(field.name);
+ final LongValues values = getNumeric(numericEntry);
final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name));
return new SortedNumericDocValues() {
@@ -639,6 +699,33 @@
return (int) (endOffset - startOffset);
}
};
+ } else if (ss.format == SORTED_SET_TABLE) {
+ NumericEntry entry = ords.get(field.name);
+ final LongValues ordinals = getNumeric(entry);
+
+ final long[] table = ss.table;
+ final int[] offsets = ss.tableOffsets;
+ return new SortedNumericDocValues() {
+ int startOffset;
+ int endOffset;
+
+ @Override
+ public void setDocument(int doc) {
+ final int ord = (int) ordinals.get(doc);
+ startOffset = offsets[ord];
+ endOffset = offsets[ord + 1];
+ }
+
+ @Override
+ public long valueAt(int index) {
+ return table[startOffset + index];
+ }
+
+ @Override
+ public int count() {
+ return endOffset - startOffset;
+ }
+ };
} else {
throw new AssertionError();
}
@@ -647,13 +734,20 @@
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
SortedSetEntry ss = sortedSets.get(field.name);
- if (ss.format == SORTED_SINGLE_VALUED) {
- final SortedDocValues values = getSorted(field);
- return DocValues.singleton(values);
- } else if (ss.format != SORTED_WITH_ADDRESSES) {
- throw new AssertionError();
+ switch (ss.format) {
+ case SORTED_SINGLE_VALUED:
+ final SortedDocValues values = getSorted(field);
+ return DocValues.singleton(values);
+ case SORTED_WITH_ADDRESSES:
+ return getSortedSetWithAddresses(field);
+ case SORTED_SET_TABLE:
+ return getSortedSetTable(field, ss);
+ default:
+ throw new AssertionError();
}
+ }
+ private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException {
final long valueCount = binaries.get(field.name).count;
// we keep the byte[]s and list of ords on disk, these could be large
final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
@@ -722,7 +816,76 @@
}
};
}
-
+
+ /** Returns sorted set values for a table-encoded field: each document's numeric value selects a slice of ss.table. */
+ private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException {
+ final long termCount = binaries.get(field.name).count;
+ final LongBinaryDocValues terms = (LongBinaryDocValues) getBinary(field);
+ final LongValues setIds = getNumeric(ords.get(field.name));
+
+ final long[] ordTable = ss.table;
+ final int[] setBounds = ss.tableOffsets;
+
+ return new RandomAccessOrds() {
+ // slice [begin, end) of ordTable for the current document; cursor is the nextOrd() position
+ int cursor, begin, end;
+
+ @Override
+ public void setDocument(int docID) {
+ final int setId = (int) setIds.get(docID);
+ begin = setBounds[setId];
+ cursor = begin;
+ end = setBounds[setId + 1];
+ }
+
+ @Override
+ public long ordAt(int index) {
+ return ordTable[begin + index];
+ }
+
+ @Override
+ public long nextOrd() {
+ if (cursor == end) {
+ return NO_MORE_ORDS;
+ }
+ return ordTable[cursor++];
+ }
+
+ @Override
+ public int cardinality() {
+ return end - begin;
+ }
+
+ @Override
+ public BytesRef lookupOrd(long ord) {
+ return terms.get(ord);
+ }
+
+ @Override
+ public long getValueCount() {
+ return termCount;
+ }
+
+ @Override
+ public long lookupTerm(BytesRef key) {
+ // delegate to CompressedBinaryDocValues when the terms are of that type
+ if (terms instanceof CompressedBinaryDocValues) {
+ return ((CompressedBinaryDocValues) terms).lookupTerm(key);
+ }
+ return super.lookupTerm(key);
+ }
+
+ @Override
+ public TermsEnum termsEnum() {
+ // same delegation as in lookupTerm
+ if (terms instanceof CompressedBinaryDocValues) {
+ return ((CompressedBinaryDocValues) terms).getTermsEnum();
+ }
+ return super.termsEnum();
+ }
+ };
+ }
+
private Bits getLiveBits(final long offset, final int count) throws IOException {
if (offset == ALL_MISSING) {
return new Bits.MatchNoBits(count);
@@ -831,6 +994,9 @@
static class SortedSetEntry {
private SortedSetEntry() {}
int format;
+
+ long[] table;
+ int[] tableOffsets;
}
// internally we compose complex dv (sorted/sortedset) from other ones
Index: lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java (revision 1689996)
+++ lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java (working copy)
@@ -64,7 +64,7 @@
public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
- doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16);
+ doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
}
}
@@ -72,7 +72,7 @@
public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
- doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16);
+ doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
}
}
Index: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
===================================================================
--- lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (revision 1689996)
+++ lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (working copy)
@@ -24,6 +24,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -62,6 +63,8 @@
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
/**
@@ -1940,29 +1943,30 @@
directory.close();
}
- protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc) throws Exception {
+ protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues) throws Exception {
Directory dir = newFSDirectory(createTempDir("dvduel"));
IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-
+
+ Set<String> valueSet = new HashSet<String>();
+ for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) {
+ final int length = TestUtil.nextInt(random(), minLength, maxLength);
+ valueSet.add(TestUtil.randomSimpleString(random(), length));
+ }
+ String[] uniqueValues = valueSet.toArray(new String[0]);
+
// index some docs
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
doc.add(idField);
- final int length;
- if (minLength == maxLength) {
- length = minLength; // fixed length
- } else {
- length = TestUtil.nextInt(random(), minLength, maxLength);
- }
int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc);
// create a random set of strings
Set<String> values = new TreeSet<>();
for (int v = 0; v < numValues; v++) {
- values.add(TestUtil.randomSimpleString(random(), length));
+ values.add(RandomPicks.randomFrom(random(), uniqueValues));
}
-
+
// add ordered to the stored field
for (String v : values) {
doc.add(new StoredField("stored", v));
@@ -2041,7 +2045,7 @@
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
int fixedLength = TestUtil.nextInt(random(), 1, 10);
- doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16);
+ doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100);
}
}
@@ -2107,12 +2111,37 @@
);
}
}
-
+
+ // Exercises sorted numerics whose values are all drawn from a small fixed pool of random longs.
+ public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception {
+ assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric());
+ final long[] pool = new long[TestUtil.nextInt(random(), 2, 6)];
+ for (int slot = 0; slot < pool.length; ++slot) {
+ pool[slot] = random().nextLong();
+ }
+ for (int iterationsLeft = atLeast(1); iterationsLeft > 0; --iterationsLeft) {
+ doTestSortedNumericsVsStoredFields(
+ new LongProducer() { // presumably the number of values per document - confirm against the helper
+ @Override
+ long next() {
+ return TestUtil.nextLong(random(), 0, 6);
+ }
+ },
+ new LongProducer() { // picks a random element of the pool
+ @Override
+ long next() {
+ return pool[random().nextInt(pool.length)];
+ }
+ }
+ );
+ }
+ }
+
public void testSortedSetVariableLengthVsStoredFields() throws Exception {
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
- doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16);
+ doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100);
}
}
@@ -2121,7 +2150,7 @@
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
int fixedLength = TestUtil.nextInt(random(), 1, 10);
- doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1);
+ doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100);
}
}
@@ -2129,10 +2158,42 @@
assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {
- doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1);
+ doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100);
}
}
+ // minLength = maxLength = 10, at most 6 values per document, at most 6 unique values
+ public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception {
+ assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+ for (int iterationsLeft = atLeast(1); iterationsLeft > 0; --iterationsLeft) {
+ doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6);
+ }
+ }
+
+ // lengths between 1 and 10, at most 6 values per document, at most 6 unique values
+ public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception {
+ assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+ for (int iterationsLeft = atLeast(1); iterationsLeft > 0; --iterationsLeft) {
+ doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6);
+ }
+ }
+
+ // lengths between 1 and 10, at most 500 values per document, at most 1000 unique values
+ public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception {
+ assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+ for (int iterationsLeft = atLeast(1); iterationsLeft > 0; --iterationsLeft) {
+ doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000);
+ }
+ }
+
+ // minLength = maxLength = 10, at most 500 values per document, at most 1000 unique values
+ public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception {
+ assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+ for (int iterationsLeft = atLeast(1); iterationsLeft > 0; --iterationsLeft) {
+ doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000);
+ }
+ }
+
public void testGCDCompression() throws Exception {
int numIterations = atLeast(1);
for (int i = 0; i < numIterations; i++) {