| Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java (revision 1689996) |
| +++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java (working copy) |
| @@ -23,6 +23,11 @@ |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| +import java.util.Iterator; |
| +import java.util.Map; |
| +import java.util.Set; |
| +import java.util.SortedSet; |
| +import java.util.TreeSet; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.DocValuesConsumer; |
| @@ -34,6 +39,7 @@ |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IOUtils; |
| +import org.apache.lucene.util.LongsRef; |
| import org.apache.lucene.util.MathUtil; |
| import org.apache.lucene.util.PagedBytes; |
| import org.apache.lucene.util.PagedBytes.PagedBytesDataInput; |
| @@ -463,11 +469,22 @@ |
| // The field is single-valued, we can encode it as NUMERIC |
| addNumericField(field, singletonView(docToValueCount, values, null)); |
| } else { |
| - meta.writeVInt(SORTED_WITH_ADDRESSES); |
| - // write the stream of values as a numeric field |
| - addNumericField(field, values, true); |
| - // write the doc -> ord count as a absolute index to the stream |
| - addAddresses(field, docToValueCount); |
| + final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values); |
| + if (uniqueValueSets != null) { |
| + meta.writeVInt(SORTED_SET_TABLE); |
| + |
| + // write the set_id -> values mapping |
| + writeDictionary(uniqueValueSets); |
| + |
| + // write the doc -> set_id as a numeric field |
| + addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false); |
| + } else { |
| + meta.writeVInt(SORTED_WITH_ADDRESSES); |
| + // write the stream of values as a numeric field |
| + addNumericField(field, values, true); |
| + // write the doc -> ord count as an absolute index to the stream |
| + addAddresses(field, docToValueCount); |
| + } |
| } |
| } |
| |
| @@ -481,20 +498,120 @@ |
| // The field is single-valued, we can encode it as SORTED |
| addSortedField(field, values, singletonView(docToOrdCount, ords, -1L)); |
| } else { |
| - meta.writeVInt(SORTED_WITH_ADDRESSES); |
| + final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords); |
| + if (uniqueValueSets != null) { |
| + meta.writeVInt(SORTED_SET_TABLE); |
| |
| - // write the ord -> byte[] as a binary field |
| - addTermsDict(field, values); |
| + // write the set_id -> ords mapping |
| + writeDictionary(uniqueValueSets); |
| |
| - // write the stream of ords as a numeric field |
| - // NOTE: we could return an iterator that delta-encodes these within a doc |
| - addNumericField(field, ords, false); |
| + // write the ord -> byte[] as a binary field |
| + addTermsDict(field, values); |
| |
| - // write the doc -> ord count as a absolute index to the stream |
| - addAddresses(field, docToOrdCount); |
| + // write the doc -> set_id as a numeric field |
| + addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false); |
| + } else { |
| + meta.writeVInt(SORTED_WITH_ADDRESSES); |
| + |
| + // write the ord -> byte[] as a binary field |
| + addTermsDict(field, values); |
| + |
| + // write the stream of ords as a numeric field |
| + // NOTE: we could return an iterator that delta-encodes these within a doc |
| + addNumericField(field, ords, false); |
| + |
| + // write the doc -> ord count as an absolute index to the stream |
| + addAddresses(field, docToOrdCount); |
| + } |
| } |
| } |
| - |
| + |
| + private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) { |
| + Set<LongsRef> uniqueValueSet = new HashSet<>(); |
| + LongsRef docValues = new LongsRef(256); |
| + |
| + Iterator<Number> valueCountIterator = docToValueCount.iterator(); |
| + Iterator<Number> valueIterator = values.iterator(); |
| + int totalDictSize = 0; |
| + while (valueCountIterator.hasNext()) { |
| + docValues.length = valueCountIterator.next().intValue(); |
| + if (docValues.length > 256) { |
| + return null; |
| + } |
| + for (int i = 0; i < docValues.length; ++i) { |
| + docValues.longs[i] = valueIterator.next().longValue(); |
| + } |
| + if (uniqueValueSet.contains(docValues)) { |
| + continue; |
| + } |
| + totalDictSize += docValues.length; |
| + if (totalDictSize > 256) { |
| + return null; |
| + } |
| + uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length)); |
| + } |
| + assert valueIterator.hasNext() == false; |
| + return new TreeSet<>(uniqueValueSet); |
| + } |
| + |
| + private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException { |
| + int lengthSum = 0; |
| + for (LongsRef longs : uniqueValueSets) { |
| + lengthSum += longs.length; |
| + } |
| + |
| + meta.writeInt(lengthSum); |
| + for (LongsRef valueSet : uniqueValueSets) { |
| + for (int i = 0; i < valueSet.length; ++i) { |
| + meta.writeLong(valueSet.longs[valueSet.offset + i]); |
| + } |
| + } |
| + |
| + meta.writeInt(uniqueValueSets.size()); |
| + for (LongsRef valueSet : uniqueValueSets) { |
| + meta.writeInt(valueSet.length); |
| + } |
| + } |
| + |
| + private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) { |
| + final Map<LongsRef, Integer> setIds = new HashMap<>(); |
| + int i = 0; |
| + for (LongsRef set : uniqueValueSets) { |
| + setIds.put(set, i++); |
| + } |
| + assert i == uniqueValueSets.size(); |
| + |
| + return new Iterable<Number>() { |
| + |
| + @Override |
| + public Iterator<Number> iterator() { |
| + final Iterator<Number> valueCountIterator = docToValueCount.iterator(); |
| + final Iterator<Number> valueIterator = values.iterator(); |
| + final LongsRef docValues = new LongsRef(256); |
| + return new Iterator<Number>() { |
| + |
| + @Override |
| + public boolean hasNext() { |
| + return valueCountIterator.hasNext(); |
| + } |
| + |
| + @Override |
| + public Number next() { |
| + docValues.length = valueCountIterator.next().intValue(); |
| + for (int i = 0; i < docValues.length; ++i) { |
| + docValues.longs[i] = valueIterator.next().longValue(); |
| + } |
| + final Integer id = setIds.get(docValues); |
| + assert id != null; |
| + return id; |
| + } |
| + |
| + }; |
| + |
| + } |
| + }; |
| + } |
| + |
| // writes addressing information as MONOTONIC_COMPRESSED integer |
| private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException { |
| meta.writeVInt(field.number); |
| Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java (revision 1689996) |
| +++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java (working copy) |
| @@ -72,13 +72,21 @@ |
| * <p> |
| * {@link DocValuesType#SORTED_SET SORTED_SET}: |
| * <ul> |
| + * <li>Single: if all documents have 0 or 1 value, then data are written like SORTED. |
| + * <li>SortedSet table: when there are few unique sets of values (at most 256 values in total across all of them) then each set is assigned |
| + * an id, a lookup table is written and the mapping from document to set id is written using the |
| + * numeric strategies above. |
| * <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary, |
| * an ordinal list and per-document index into this list are written using the numeric strategies |
| - * above. |
| + * above. |
| * </ul> |
| * <p> |
| * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}: |
| * <ul> |
| + * <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC. |
| + * <li>SortedSet table: when there are few unique sets of values (at most 256 values in total across all of them) then each set is assigned |
| + * an id, a lookup table is written and the mapping from document to set id is written using the |
| + * numeric strategies above. |
| * <li>SortedNumeric: a value list and per-document index into this list are written using the numeric |
| * strategies above. |
| * </ul> |
| @@ -108,21 +116,24 @@ |
| * <li>PrefixBinaryEntry --> BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li> |
| * <li>BinaryHeader --> FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li> |
| * <li>SortedEntry --> FieldNumber,EntryType,BinaryEntry,NumericEntry</li> |
| - * <li>SortedSetEntry --> EntryType,BinaryEntry,NumericEntry,NumericEntry</li> |
| - * <li>SortedNumericEntry --> EntryType,NumericEntry,NumericEntry</li> |
| + * <li>SortedSetEntry --> SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li> |
| + * <li>SingleSortedSetEntry --> SetHeader,SortedEntry</li> |
| + * <li>AddressesSortedSetEntry --> SetHeader,BinaryEntry,NumericEntry,NumericEntry</li> |
| + * <li>TableSortedSetEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li> |
| + * <li>SetHeader --> FieldNumber,EntryType,SetType</li> |
| + * <li>SortedNumericEntry --> SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li> |
| + * <li>SingleSortedNumericEntry --> SetHeader,NumericEntry</li> |
| + * <li>AddressesSortedNumericEntry --> SetHeader,NumericEntry,NumericEntry</li> |
| + * <li>TableSortedNumericEntry --> SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li> |
| * <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --> {@link DataOutput#writeVInt VInt}</li> |
| * <li>EntryType,CompressionType --> {@link DataOutput#writeByte Byte}</li> |
| * <li>Header --> {@link CodecUtil#writeIndexHeader IndexHeader}</li> |
| * <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --> {@link DataOutput#writeLong Int64}</li> |
| - * <li>TableSize,BitsPerValue --> {@link DataOutput#writeVInt vInt}</li> |
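| + * <li>TableSize,BitsPerValue,SetType --> {@link DataOutput#writeVInt vInt}</li><li>TotalTableLength, and TableSize inside TableSortedSetEntry/TableSortedNumericEntry --> {@link DataOutput#writeInt Int32}</li> |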
| * <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li> |
| * </ul> |
| * <p>Sorted fields have two entries: a BinaryEntry with the value metadata, |
| * and an ordinary NumericEntry for the document-to-ord metadata.</p> |
| - * <p>SortedSet fields have three entries: a BinaryEntry with the value metadata, |
| - * and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p> |
| - * <p>SortedNumeric fields have two entries: A NumericEntry with the value metadata, |
| - * and a numeric entry with the document-to-value index.</p> |
| * <p>FieldNumber of -1 indicates the end of metadata.</p> |
| * <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p> |
| * <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p> |
| @@ -144,6 +155,15 @@ |
| * <li>1 --> variable-width. An address for each value is stored. |
| * <li>2 --> prefix-compressed. An address to the start of every interval'th value is stored. |
| * </ul> |
| + * <p>SetType indicates how SortedSet and SortedNumeric values will be stored: |
| + * <ul> |
| + * <li>0 --> with addresses. There are two numeric entries: a first one from document to start |
| + * offset, and a second one from offset to ord/value. |
| + * <li>1 --> single-valued. Used when all documents have at most one value and is encoded like |
| + * a regular Sorted/Numeric entry. |
| + * <li>2 --> table-encoded. A lookup table of unique sets of values is written, followed by a |
| + * numeric entry that maps each document to an ordinal in this table. |
| + * </ul> |
| * <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values. |
| * If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length). |
| * Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize) |
| @@ -187,7 +207,8 @@ |
| static final String META_CODEC = "Lucene50DocValuesMetadata"; |
| static final String META_EXTENSION = "dvm"; |
| static final int VERSION_START = 0; |
| - static final int VERSION_CURRENT = VERSION_START; |
| + static final int VERSION_SORTEDSET_TABLE = 1; |
| + static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE; |
| |
| // indicates docvalues type |
| static final byte NUMERIC = 0; |
| @@ -235,6 +256,9 @@ |
| /** Single-valued sorted set values, encoded as sorted values, so no level |
| * of indirection: {@code docId -> ord}. */ |
| static final int SORTED_SINGLE_VALUED = 1; |
| + /** Compressed by giving IDs to unique sets of values: |
| + * {@code docId -> setId -> ords} */ |
| + static final int SORTED_SET_TABLE = 2; |
| |
| /** placeholder for missing offset that means there are no missing values */ |
| static final int ALL_LIVE = -1; |
| Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java |
| =================================================================== |
| --- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java (revision 1689996) |
| +++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java (working copy) |
| @@ -206,6 +206,28 @@ |
| ordIndexes.put(info.name, n2); |
| } |
| |
| + private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException { |
| + // sortedset table = binary + ordset table + ordset index |
| + if (meta.readVInt() != info.number) { |
| + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + if (meta.readByte() != Lucene50DocValuesFormat.BINARY) { |
| + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + |
| + BinaryEntry b = readBinaryEntry(meta); |
| + binaries.put(info.name, b); |
| + |
| + if (meta.readVInt() != info.number) { |
| + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { |
| + throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + NumericEntry n = readNumericEntry(meta); |
| + ords.put(info.name, n); |
| + } |
| + |
| private int readFields(IndexInput meta, FieldInfos infos) throws IOException { |
| int numFields = 0; |
| int fieldNumber = meta.readVInt(); |
| @@ -229,6 +251,8 @@ |
| sortedSets.put(info.name, ss); |
| if (ss.format == SORTED_WITH_ADDRESSES) { |
| readSortedSetFieldWithAddresses(info, meta); |
| + } else if (ss.format == SORTED_SET_TABLE) { |
| + readSortedSetFieldWithTable(info, meta); |
| } else if (ss.format == SORTED_SINGLE_VALUED) { |
| if (meta.readVInt() != fieldNumber) { |
| throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta); |
| @@ -243,13 +267,6 @@ |
| } else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) { |
| SortedSetEntry ss = readSortedSetEntry(meta); |
| sortedNumerics.put(info.name, ss); |
| - if (meta.readVInt() != fieldNumber) { |
| - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| - } |
| - if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { |
| - throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| - } |
| - numerics.put(info.name, readNumericEntry(meta)); |
| if (ss.format == SORTED_WITH_ADDRESSES) { |
| if (meta.readVInt() != fieldNumber) { |
| throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| @@ -257,9 +274,33 @@ |
| if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { |
| throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| } |
| + numerics.put(info.name, readNumericEntry(meta)); |
| + if (meta.readVInt() != fieldNumber) { |
| + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { |
| + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| + } |
| NumericEntry ordIndex = readNumericEntry(meta); |
| ordIndexes.put(info.name, ordIndex); |
| - } else if (ss.format != SORTED_SINGLE_VALUED) { |
| + } else if (ss.format == SORTED_SET_TABLE) { |
| + if (meta.readVInt() != info.number) { |
| + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { |
| + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + NumericEntry n = readNumericEntry(meta); |
| + ords.put(info.name, n); |
| + } else if (ss.format == SORTED_SINGLE_VALUED) { |
| + if (meta.readVInt() != fieldNumber) { |
| + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) { |
| + throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta); |
| + } |
| + numerics.put(info.name, readNumericEntry(meta)); |
| + } else { |
| throw new AssertionError(); |
| } |
| } else { |
| @@ -346,7 +387,24 @@ |
| SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException { |
| SortedSetEntry entry = new SortedSetEntry(); |
| entry.format = meta.readVInt(); |
| - if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { |
| + if (entry.format == SORTED_SET_TABLE) { |
| + final int totalTableLength = meta.readInt(); |
| + if (totalTableLength > 256) { |
| + throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta); |
| + } |
| + entry.table = new long[totalTableLength]; |
| + for (int i = 0; i < totalTableLength; ++i) { |
| + entry.table[i] = meta.readLong(); |
| + } |
| + final int tableSize = meta.readInt(); |
| + if (tableSize > totalTableLength + 1) { // +1 because of the empty set |
| + throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta); |
| + } |
| + entry.tableOffsets = new int[tableSize + 1]; |
| + for (int i = 1; i < entry.tableOffsets.length; ++i) { |
| + entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt(); |
| + } |
| + } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) { |
| throw new CorruptIndexException("Unknown format: " + entry.format, meta); |
| } |
| return entry; |
| @@ -611,12 +669,14 @@ |
| @Override |
| public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException { |
| SortedSetEntry ss = sortedNumerics.get(field.name); |
| - NumericEntry numericEntry = numerics.get(field.name); |
| - final LongValues values = getNumeric(numericEntry); |
| if (ss.format == SORTED_SINGLE_VALUED) { |
| + NumericEntry numericEntry = numerics.get(field.name); |
| + final LongValues values = getNumeric(numericEntry); |
| final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc); |
| return DocValues.singleton(values, docsWithField); |
| } else if (ss.format == SORTED_WITH_ADDRESSES) { |
| + NumericEntry numericEntry = numerics.get(field.name); |
| + final LongValues values = getNumeric(numericEntry); |
| final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name)); |
| |
| return new SortedNumericDocValues() { |
| @@ -639,6 +699,33 @@ |
| return (int) (endOffset - startOffset); |
| } |
| }; |
| + } else if (ss.format == SORTED_SET_TABLE) { |
| + NumericEntry entry = ords.get(field.name); |
| + final LongValues ordinals = getNumeric(entry); |
| + |
| + final long[] table = ss.table; |
| + final int[] offsets = ss.tableOffsets; |
| + return new SortedNumericDocValues() { |
| + int startOffset; |
| + int endOffset; |
| + |
| + @Override |
| + public void setDocument(int doc) { |
| + final int ord = (int) ordinals.get(doc); |
| + startOffset = offsets[ord]; |
| + endOffset = offsets[ord + 1]; |
| + } |
| + |
| + @Override |
| + public long valueAt(int index) { |
| + return table[startOffset + index]; |
| + } |
| + |
| + @Override |
| + public int count() { |
| + return endOffset - startOffset; |
| + } |
| + }; |
| } else { |
| throw new AssertionError(); |
| } |
| @@ -647,13 +734,20 @@ |
| @Override |
| public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { |
| SortedSetEntry ss = sortedSets.get(field.name); |
| - if (ss.format == SORTED_SINGLE_VALUED) { |
| - final SortedDocValues values = getSorted(field); |
| - return DocValues.singleton(values); |
| - } else if (ss.format != SORTED_WITH_ADDRESSES) { |
| - throw new AssertionError(); |
| + switch (ss.format) { |
| + case SORTED_SINGLE_VALUED: |
| + final SortedDocValues values = getSorted(field); |
| + return DocValues.singleton(values); |
| + case SORTED_WITH_ADDRESSES: |
| + return getSortedSetWithAddresses(field); |
| + case SORTED_SET_TABLE: |
| + return getSortedSetTable(field, ss); |
| + default: |
| + throw new AssertionError(); |
| } |
| + } |
| |
| + private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException { |
| final long valueCount = binaries.get(field.name).count; |
| // we keep the byte[]s and list of ords on disk, these could be large |
| final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); |
| @@ -722,7 +816,76 @@ |
| } |
| }; |
| } |
| - |
| + |
| + private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException { |
| + final long valueCount = binaries.get(field.name).count; |
| + final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field); |
| + final LongValues ordinals = getNumeric(ords.get(field.name)); |
| + |
| + final long[] table = ss.table; |
| + final int[] offsets = ss.tableOffsets; |
| + |
| + return new RandomAccessOrds() { |
| + |
| + int offset, startOffset, endOffset; |
| + |
| + @Override |
| + public void setDocument(int docID) { |
| + final int ord = (int) ordinals.get(docID); |
| + offset = startOffset = offsets[ord]; |
| + endOffset = offsets[ord + 1]; |
| + } |
| + |
| + @Override |
| + public long ordAt(int index) { |
| + return table[startOffset + index]; |
| + } |
| + |
| + @Override |
| + public long nextOrd() { |
| + if (offset == endOffset) { |
| + return NO_MORE_ORDS; |
| + } else { |
| + return table[offset++]; |
| + } |
| + } |
| + |
| + @Override |
| + public int cardinality() { |
| + return endOffset - startOffset; |
| + } |
| + |
| + @Override |
| + public BytesRef lookupOrd(long ord) { |
| + return binary.get(ord); |
| + } |
| + |
| + @Override |
| + public long getValueCount() { |
| + return valueCount; |
| + } |
| + |
| + @Override |
| + public long lookupTerm(BytesRef key) { |
| + if (binary instanceof CompressedBinaryDocValues) { |
| + return ((CompressedBinaryDocValues) binary).lookupTerm(key); |
| + } else { |
| + return super.lookupTerm(key); |
| + } |
| + } |
| + |
| + @Override |
| + public TermsEnum termsEnum() { |
| + if (binary instanceof CompressedBinaryDocValues) { |
| + return ((CompressedBinaryDocValues) binary).getTermsEnum(); |
| + } else { |
| + return super.termsEnum(); |
| + } |
| + } |
| + |
| + }; |
| + } |
| + |
| private Bits getLiveBits(final long offset, final int count) throws IOException { |
| if (offset == ALL_MISSING) { |
| return new Bits.MatchNoBits(count); |
| @@ -831,6 +994,9 @@ |
| static class SortedSetEntry { |
| private SortedSetEntry() {} |
| int format; |
| + |
| + long[] table; |
| + int[] tableOffsets; |
| } |
| |
| // internally we compose complex dv (sorted/sortedset) from other ones |
| Index: lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java |
| =================================================================== |
| --- lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java (revision 1689996) |
| +++ lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java (working copy) |
| @@ -64,7 +64,7 @@ |
| public void testSortedSetVariableLengthBigVsStoredFields() throws Exception { |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| - doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16); |
| + doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100); |
| } |
| } |
| |
| @@ -72,7 +72,7 @@ |
| public void testSortedSetVariableLengthManyVsStoredFields() throws Exception { |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| - doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16); |
| + doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100); |
| } |
| } |
| |
| Index: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java |
| =================================================================== |
| --- lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (revision 1689996) |
| +++ lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (working copy) |
| @@ -24,6 +24,7 @@ |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashMap; |
| +import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Map.Entry; |
| @@ -62,6 +63,8 @@ |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.TestUtil; |
| |
| +import com.carrotsearch.randomizedtesting.generators.RandomPicks; |
| + |
| import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; |
| |
| /** |
| @@ -1940,29 +1943,30 @@ |
| directory.close(); |
| } |
| |
| - protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc) throws Exception { |
| + protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues) throws Exception { |
| Directory dir = newFSDirectory(createTempDir("dvduel")); |
| IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random())); |
| RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf); |
| - |
| + |
| + Set<String> valueSet = new HashSet<String>(); |
| + for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) { |
| + final int length = TestUtil.nextInt(random(), minLength, maxLength); |
| + valueSet.add(TestUtil.randomSimpleString(random(), length)); |
| + } |
| + String[] uniqueValues = valueSet.toArray(new String[0]); |
| + |
| // index some docs |
| for (int i = 0; i < numDocs; i++) { |
| Document doc = new Document(); |
| Field idField = new StringField("id", Integer.toString(i), Field.Store.NO); |
| doc.add(idField); |
| - final int length; |
| - if (minLength == maxLength) { |
| - length = minLength; // fixed length |
| - } else { |
| - length = TestUtil.nextInt(random(), minLength, maxLength); |
| - } |
| int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc); |
| // create a random set of strings |
| Set<String> values = new TreeSet<>(); |
| for (int v = 0; v < numValues; v++) { |
| - values.add(TestUtil.randomSimpleString(random(), length)); |
| + values.add(RandomPicks.randomFrom(random(), uniqueValues)); |
| } |
| - |
| + |
| // add ordered to the stored field |
| for (String v : values) { |
| doc.add(new StoredField("stored", v)); |
| @@ -2041,7 +2045,7 @@ |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| int fixedLength = TestUtil.nextInt(random(), 1, 10); |
| - doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16); |
| + doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100); |
| } |
| } |
| |
| @@ -2107,12 +2111,37 @@ |
| ); |
| } |
| } |
| - |
| + |
| + public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception { |
| + assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric()); |
| + final long[] values = new long[TestUtil.nextInt(random(), 2, 6)]; |
| + for (int i = 0; i < values.length; ++i) { |
| + values[i] = random().nextLong(); |
| + } |
| + int numIterations = atLeast(1); |
| + for (int i = 0; i < numIterations; i++) { |
| + doTestSortedNumericsVsStoredFields( |
| + new LongProducer() { |
| + @Override |
| + long next() { |
| + return TestUtil.nextLong(random(), 0, 6); |
| + } |
| + }, |
| + new LongProducer() { |
| + @Override |
| + long next() { |
| + return values[random().nextInt(values.length)]; |
| + } |
| + } |
| + ); |
| + } |
| + } |
| + |
| public void testSortedSetVariableLengthVsStoredFields() throws Exception { |
| assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| - doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16); |
| + doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100); |
| } |
| } |
| |
| @@ -2121,7 +2150,7 @@ |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| int fixedLength = TestUtil.nextInt(random(), 1, 10); |
| - doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1); |
| + doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100); |
| } |
| } |
| |
| @@ -2129,10 +2158,42 @@ |
| assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |
| - doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1); |
| + doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100); |
| } |
| } |
| |
| + public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception { |
| + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); |
| + int numIterations = atLeast(1); |
| + for (int i = 0; i < numIterations; i++) { |
| + doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6); |
| + } |
| + } |
| + |
| + public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception { |
| + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); |
| + int numIterations = atLeast(1); |
| + for (int i = 0; i < numIterations; i++) { |
| + doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6); |
| + } |
| + } |
| + |
| + public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception { |
| + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); |
| + int numIterations = atLeast(1); |
| + for (int i = 0; i < numIterations; i++) { |
| + doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000); |
| + } |
| + } |
| + |
| + public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception { |
| + assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet()); |
| + int numIterations = atLeast(1); |
| + for (int i = 0; i < numIterations; i++) { |
| + doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000); |
| + } |
| + } |
| + |
| public void testGCDCompression() throws Exception { |
| int numIterations = atLeast(1); |
| for (int i = 0; i < numIterations; i++) { |