docs/attachments/LUCENE-6668/LUCENE-6668.patch - lucene-jira-archive - Git at Google

 Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java
 ===================================================================
 --- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java	(revision 1689996)
 +++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java	(working copy)
 @@ -23,6 +23,11 @@
  import java.util.Collections;
  import java.util.HashMap;
  import java.util.HashSet;
 +import java.util.Iterator;
 +import java.util.Map;
 +import java.util.Set;
 +import java.util.SortedSet;
 +import java.util.TreeSet;

  import org.apache.lucene.codecs.CodecUtil;
  import org.apache.lucene.codecs.DocValuesConsumer;
 @@ -34,6 +39,7 @@
  import org.apache.lucene.util.BytesRef;
  import org.apache.lucene.util.BytesRefBuilder;
  import org.apache.lucene.util.IOUtils;
 +import org.apache.lucene.util.LongsRef;
  import org.apache.lucene.util.MathUtil;
  import org.apache.lucene.util.PagedBytes;
  import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
 @@ -463,11 +469,22 @@
        // The field is single-valued, we can encode it as NUMERIC
        addNumericField(field, singletonView(docToValueCount, values, null));
      } else {
 -      meta.writeVInt(SORTED_WITH_ADDRESSES);
 -      // write the stream of values as a numeric field
 -      addNumericField(field, values, true);
 -      // write the doc -> ord count as a absolute index to the stream
 -      addAddresses(field, docToValueCount);
 +      final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
 +      if (uniqueValueSets != null) {
 +        meta.writeVInt(SORTED_SET_TABLE);
 +
 +        // write the set_id -> values mapping
 +        writeDictionary(uniqueValueSets);
 +
 +        // write the doc -> set_id as a numeric field
 +        addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
 +      } else {
 +        meta.writeVInt(SORTED_WITH_ADDRESSES);
 +        // write the stream of values as a numeric field
 +        addNumericField(field, values, true);
 +        // write the doc -> ord count as a absolute index to the stream
 +        addAddresses(field, docToValueCount);
 +      }
      }
    }

 @@ -481,20 +498,120 @@
        // The field is single-valued, we can encode it as SORTED
        addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
      } else {
 -      meta.writeVInt(SORTED_WITH_ADDRESSES);
 +      final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
 +      if (uniqueValueSets != null) {
 +        meta.writeVInt(SORTED_SET_TABLE);

 -      // write the ord -> byte[] as a binary field
 -      addTermsDict(field, values);
 +        // write the set_id -> ords mapping
 +        writeDictionary(uniqueValueSets);

 -      // write the stream of ords as a numeric field
 -      // NOTE: we could return an iterator that delta-encodes these within a doc
 -      addNumericField(field, ords, false);
 +        // write the ord -> byte[] as a binary field
 +        addTermsDict(field, values);

 -      // write the doc -> ord count as a absolute index to the stream
 -      addAddresses(field, docToOrdCount);
 +        // write the doc -> set_id as a numeric field
 +        addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
 +      } else {
 +        meta.writeVInt(SORTED_WITH_ADDRESSES);
 +
 +        // write the ord -> byte[] as a binary field
 +        addTermsDict(field, values);
 +
 +        // write the stream of ords as a numeric field
 +        // NOTE: we could return an iterator that delta-encodes these within a doc
 +        addNumericField(field, ords, false);
 +
 +        // write the doc -> ord count as a absolute index to the stream
 +        addAddresses(field, docToOrdCount);
 +      }
      }
    }
 -
 +
 +  private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
 +    Set<LongsRef> uniqueValueSet = new HashSet<>();
 +    LongsRef docValues = new LongsRef(256);
 +
 +    Iterator<Number> valueCountIterator = docToValueCount.iterator();
 +    Iterator<Number> valueIterator = values.iterator();
 +    int totalDictSize = 0;
 +    while (valueCountIterator.hasNext()) {
 +      docValues.length = valueCountIterator.next().intValue();
 +      if (docValues.length > 256) {
 +        return null;
 +      }
 +      for (int i = 0; i < docValues.length; ++i) {
 +        docValues.longs[i] = valueIterator.next().longValue();
 +      }
 +      if (uniqueValueSet.contains(docValues)) {
 +        continue;
 +      }
 +      totalDictSize += docValues.length;
 +      if (totalDictSize > 256) {
 +        return null;
 +      }
 +      uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length));
 +    }
 +    assert valueIterator.hasNext() == false;
 +    return new TreeSet<>(uniqueValueSet);
 +  }
 +
 +  private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
 +    int lengthSum = 0;
 +    for (LongsRef longs : uniqueValueSets) {
 +      lengthSum += longs.length;
 +    }
 +
 +    meta.writeInt(lengthSum);
 +    for (LongsRef valueSet : uniqueValueSets) {
 +      for (int  i = 0; i < valueSet.length; ++i) {
 +        meta.writeLong(valueSet.longs[valueSet.offset + i]);
 +      }
 +    }
 +
 +    meta.writeInt(uniqueValueSets.size());
 +    for (LongsRef valueSet : uniqueValueSets) {
 +      meta.writeInt(valueSet.length);
 +    }
 +  }
 +
 +  private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, Iterable<Number> docToValueCount, Iterable<Number> values) {
 +    final Map<LongsRef, Integer> setIds = new HashMap<>();
 +    int i = 0;
 +    for (LongsRef set : uniqueValueSets) {
 +      setIds.put(set, i++);
 +    }
 +    assert i == uniqueValueSets.size();
 +
 +    return new Iterable<Number>() {
 +
 +      @Override
 +      public Iterator<Number> iterator() {
 +        final Iterator<Number> valueCountIterator = docToValueCount.iterator();
 +        final Iterator<Number> valueIterator = values.iterator();
 +        final LongsRef docValues = new LongsRef(256);
 +        return new Iterator<Number>() {
 +
 +          @Override
 +          public boolean hasNext() {
 +            return valueCountIterator.hasNext();
 +          }
 +
 +          @Override
 +          public Number next() {
 +            docValues.length = valueCountIterator.next().intValue();
 +            for (int i = 0; i < docValues.length; ++i) {
 +              docValues.longs[i] = valueIterator.next().longValue();
 +            }
 +            final Integer id = setIds.get(docValues);
 +            assert id != null;
 +            return id;
 +          }
 +
 +        };
 +
 +      }
 +    };
 +  }
 +
    // writes addressing information as MONOTONIC_COMPRESSED integer
    private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException {
      meta.writeVInt(field.number);
 Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java
 ===================================================================
 --- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java	(revision 1689996)
 +++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java	(working copy)
 @@ -72,13 +72,21 @@
   * <p>
   * {@link DocValuesType#SORTED_SET SORTED_SET}:
   * <ul>
 + *    <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
 + *    <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
 + *        an id, a lookup table is written and the mapping from document to set id is written using the
 + *        numeric strategies above.
   *    <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary,
   *        an ordinal list and per-document index into this list are written using the numeric strategies
 - *        above.
 + *        above.
   * </ul>
   * <p>
   * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
   * <ul>
 + *    <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
 + *    <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
 + *        an id, a lookup table is written and the mapping from document to set id is written using the
 + *        numeric strategies above.
   *    <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
   *        strategies above.
   * </ul>
 @@ -108,21 +116,24 @@
   *     <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
   *     <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
   *     <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
 - *     <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
 - *     <li>SortedNumericEntry --&gt; EntryType,NumericEntry,NumericEntry</li>
 + *     <li>SortedSetEntry --&gt; SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
 + *     <li>SingleSortedSetEntry --&gt; SetHeader,SortedEntry</li>
 + *     <li>AddressesSortedSetEntry --&gt; SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
 + *     <li>TableSortedSetEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
 + *     <li>SetHeader --&gt; FieldNumber,EntryType,SetType</li>
 + *     <li>SortedNumericEntry --&gt; SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
 + *     <li>SingleNumericEntry --&gt; SetHeader,NumericEntry</li>
 + *     <li>AddressesSortedNumericEntry --&gt; SetHeader,NumericEntry,NumericEntry</li>
 + *     <li>TableSortedNumericEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
   *     <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
   *     <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
   *     <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
   *     <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li>
 - *     <li>TableSize,BitsPerValue --&gt; {@link DataOutput#writeVInt vInt}</li>
 + *     <li>TableSize,BitsPerValue,TotalTableLength --&gt; {@link DataOutput#writeVInt vInt}</li>
   *     <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
   *   </ul>
   *   <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
   *      and an ordinary NumericEntry for the document-to-ord metadata.</p>
 - *   <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
 - *      and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
 - *   <p>SortedNumeric fields have two entries: A NumericEntry with the value metadata,
 - *      and a numeric entry with the document-to-value index.</p>
   *   <p>FieldNumber of -1 indicates the end of metadata.</p>
   *   <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
   *   <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
 @@ -144,6 +155,15 @@
   *         <li>1 --&gt; variable-width. An address for each value is stored.
   *         <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
   *      </ul>
 + *   <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
 + *       <ul>
 + *         <li>0 --&gt; with addresses. There are two numeric entries: a first one from document to start
 + *             offset, and a second one from offset to ord/value.
 + *         <li>1 --&gt; single-valued. Used when all documents have at most one value and is encoded like
 + *             a regular Sorted/Numeric entry.
 + *         <li>2 --&gt; table-encoded. A lookup table of unique sets of values is written, followed by a
 + *             numeric entry that maps each document to an ordinal in this table.
 + *       </ul>
   *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
   *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
   *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
 @@ -187,7 +207,8 @@
    static final String META_CODEC = "Lucene50DocValuesMetadata";
    static final String META_EXTENSION = "dvm";
    static final int VERSION_START = 0;
 -  static final int VERSION_CURRENT = VERSION_START;
 +  static final int VERSION_SORTEDSET_TABLE = 1;
 +  static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;

    // indicates docvalues type
    static final byte NUMERIC = 0;
 @@ -235,6 +256,9 @@
    /** Single-valued sorted set values, encoded as sorted values, so no level
     *  of indirection: {@code docId -> ord}. */
    static final int SORTED_SINGLE_VALUED = 1;
 +  /** Compressed giving IDs to unique sets of values:
 +   * {@code docId -> setId -> ords} */
 +  static final int SORTED_SET_TABLE = 2;

    /** placeholder for missing offset that means there are no missing values */
    static final int ALL_LIVE = -1;
 Index: lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java
 ===================================================================
 --- lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java	(revision 1689996)
 +++ lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java	(working copy)
 @@ -206,6 +206,28 @@
      ordIndexes.put(info.name, n2);
    }

 +  private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException {
 +    // sortedset table = binary + ordset table + ordset index
 +    if (meta.readVInt() != info.number) {
 +      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
 +    }
 +    if (meta.readByte() != Lucene50DocValuesFormat.BINARY) {
 +      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
 +    }
 +
 +    BinaryEntry b = readBinaryEntry(meta);
 +    binaries.put(info.name, b);
 +
 +    if (meta.readVInt() != info.number) {
 +      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
 +    }
 +    if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
 +      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
 +    }
 +    NumericEntry n = readNumericEntry(meta);
 +    ords.put(info.name, n);
 +  }
 +
    private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
      int numFields = 0;
      int fieldNumber = meta.readVInt();
 @@ -229,6 +251,8 @@
          sortedSets.put(info.name, ss);
          if (ss.format == SORTED_WITH_ADDRESSES) {
            readSortedSetFieldWithAddresses(info, meta);
 +        } else if (ss.format == SORTED_SET_TABLE) {
 +          readSortedSetFieldWithTable(info, meta);
          } else if (ss.format == SORTED_SINGLE_VALUED) {
            if (meta.readVInt() != fieldNumber) {
              throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
 @@ -243,13 +267,6 @@
        } else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) {
          SortedSetEntry ss = readSortedSetEntry(meta);
          sortedNumerics.put(info.name, ss);
 -        if (meta.readVInt() != fieldNumber) {
 -          throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 -        }
 -        if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
 -          throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 -        }
 -        numerics.put(info.name, readNumericEntry(meta));
          if (ss.format == SORTED_WITH_ADDRESSES) {
            if (meta.readVInt() != fieldNumber) {
              throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 @@ -257,9 +274,33 @@
            if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
              throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
            }
 +          numerics.put(info.name, readNumericEntry(meta));
 +          if (meta.readVInt() != fieldNumber) {
 +            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 +          }
 +          if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
 +            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 +          }
            NumericEntry ordIndex = readNumericEntry(meta);
            ordIndexes.put(info.name, ordIndex);
 -        } else if (ss.format != SORTED_SINGLE_VALUED) {
 +        } else if (ss.format == SORTED_SET_TABLE) {
 +          if (meta.readVInt() != info.number) {
 +            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 +          }
 +          if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
 +            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 +          }
 +          NumericEntry n = readNumericEntry(meta);
 +          ords.put(info.name, n);
 +        } else if (ss.format == SORTED_SINGLE_VALUED) {
 +          if (meta.readVInt() != fieldNumber) {
 +            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 +          }
 +          if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
 +            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
 +          }
 +          numerics.put(info.name, readNumericEntry(meta));
 +        } else {
            throw new AssertionError();
          }
        } else {
 @@ -346,7 +387,24 @@
    SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
      SortedSetEntry entry = new SortedSetEntry();
      entry.format = meta.readVInt();
 -    if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
 +    if (entry.format == SORTED_SET_TABLE) {
 +      final int totalTableLength = meta.readInt();
 +      if (totalTableLength > 256) {
 +        throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta);
 +      }
 +      entry.table = new long[totalTableLength];
 +      for (int i = 0; i < totalTableLength; ++i) {
 +        entry.table[i] = meta.readLong();
 +      }
 +      final int tableSize = meta.readInt();
 +      if (tableSize > totalTableLength + 1) { // +1 because of the empty set
 +        throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta);
 +      }
 +      entry.tableOffsets = new int[tableSize + 1];
 +      for (int i = 1; i < entry.tableOffsets.length; ++i) {
 +        entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt();
 +      }
 +    } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
        throw new CorruptIndexException("Unknown format: " + entry.format, meta);
      }
      return entry;
 @@ -611,12 +669,14 @@
    @Override
    public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
      SortedSetEntry ss = sortedNumerics.get(field.name);
 -    NumericEntry numericEntry = numerics.get(field.name);
 -    final LongValues values = getNumeric(numericEntry);
      if (ss.format == SORTED_SINGLE_VALUED) {
 +      NumericEntry numericEntry = numerics.get(field.name);
 +      final LongValues values = getNumeric(numericEntry);
        final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
        return DocValues.singleton(values, docsWithField);
      } else if (ss.format == SORTED_WITH_ADDRESSES) {
 +      NumericEntry numericEntry = numerics.get(field.name);
 +      final LongValues values = getNumeric(numericEntry);
        final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name));

        return new SortedNumericDocValues() {
 @@ -639,6 +699,33 @@
            return (int) (endOffset - startOffset);
          }
        };
 +    } else if (ss.format == SORTED_SET_TABLE) {
 +      NumericEntry entry = ords.get(field.name);
 +      final LongValues ordinals = getNumeric(entry);
 +
 +      final long[] table = ss.table;
 +      final int[] offsets = ss.tableOffsets;
 +      return new SortedNumericDocValues() {
 +        int startOffset;
 +        int endOffset;
 +
 +        @Override
 +        public void setDocument(int doc) {
 +          final int ord = (int) ordinals.get(doc);
 +          startOffset = offsets[ord];
 +          endOffset = offsets[ord + 1];
 +        }
 +
 +        @Override
 +        public long valueAt(int index) {
 +          return table[startOffset + index];
 +        }
 +
 +        @Override
 +        public int count() {
 +          return endOffset - startOffset;
 +        }
 +      };
      } else {
        throw new AssertionError();
      }
 @@ -647,13 +734,20 @@
    @Override
    public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
      SortedSetEntry ss = sortedSets.get(field.name);
 -    if (ss.format == SORTED_SINGLE_VALUED) {
 -      final SortedDocValues values = getSorted(field);
 -      return DocValues.singleton(values);
 -    } else if (ss.format != SORTED_WITH_ADDRESSES) {
 -      throw new AssertionError();
 +    switch (ss.format) {
 +      case SORTED_SINGLE_VALUED:
 +        final SortedDocValues values = getSorted(field);
 +        return DocValues.singleton(values);
 +      case SORTED_WITH_ADDRESSES:
 +        return getSortedSetWithAddresses(field);
 +      case SORTED_SET_TABLE:
 +        return getSortedSetTable(field, ss);
 +      default:
 +        throw new AssertionError();
      }
 +  }

 +  private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException {
      final long valueCount = binaries.get(field.name).count;
      // we keep the byte[]s and list of ords on disk, these could be large
      final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
 @@ -722,7 +816,76 @@
        }
      };
    }
 -
 +
 +  private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException {
 +    final long valueCount = binaries.get(field.name).count;
 +    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
 +    final LongValues ordinals = getNumeric(ords.get(field.name));
 +
 +    final long[] table = ss.table;
 +    final int[] offsets = ss.tableOffsets;
 +
 +    return new RandomAccessOrds() {
 +
 +      int offset, startOffset, endOffset;
 +
 +      @Override
 +      public void setDocument(int docID) {
 +        final int ord = (int) ordinals.get(docID);
 +        offset = startOffset = offsets[ord];
 +        endOffset = offsets[ord + 1];
 +      }
 +
 +      @Override
 +      public long ordAt(int index) {
 +        return table[startOffset + index];
 +      }
 +
 +      @Override
 +      public long nextOrd() {
 +        if (offset == endOffset) {
 +          return NO_MORE_ORDS;
 +        } else {
 +          return table[offset++];
 +        }
 +      }
 +
 +      @Override
 +      public int cardinality() {
 +        return endOffset - startOffset;
 +      }
 +
 +      @Override
 +      public BytesRef lookupOrd(long ord) {
 +        return binary.get(ord);
 +      }
 +
 +      @Override
 +      public long getValueCount() {
 +        return valueCount;
 +      }
 +
 +      @Override
 +      public long lookupTerm(BytesRef key) {
 +        if (binary instanceof CompressedBinaryDocValues) {
 +          return ((CompressedBinaryDocValues) binary).lookupTerm(key);
 +        } else {
 +          return super.lookupTerm(key);
 +        }
 +      }
 +
 +      @Override
 +      public TermsEnum termsEnum() {
 +        if (binary instanceof CompressedBinaryDocValues) {
 +          return ((CompressedBinaryDocValues) binary).getTermsEnum();
 +        } else {
 +          return super.termsEnum();
 +        }
 +      }
 +
 +    };
 +  }
 +
    private Bits getLiveBits(final long offset, final int count) throws IOException {
      if (offset == ALL_MISSING) {
        return new Bits.MatchNoBits(count);
 @@ -831,6 +994,9 @@
    static class SortedSetEntry {
      private SortedSetEntry() {}
      int format;
 +
 +    long[] table;
 +    int[] tableOffsets;
    }

    // internally we compose complex dv (sorted/sortedset) from other ones
 Index: lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java
 ===================================================================
 --- lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java	(revision 1689996)
 +++ lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java	(working copy)
 @@ -64,7 +64,7 @@
    public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {
 -      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16);
 +      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
      }
    }

 @@ -72,7 +72,7 @@
    public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {
 -      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16);
 +      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
      }
    }

 Index: lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
 ===================================================================
 --- lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java	(revision 1689996)
 +++ lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java	(working copy)
 @@ -24,6 +24,7 @@
  import java.util.Arrays;
  import java.util.Collections;
  import java.util.HashMap;
 +import java.util.HashSet;
  import java.util.List;
  import java.util.Map;
  import java.util.Map.Entry;
 @@ -62,6 +63,8 @@
  import org.apache.lucene.util.IOUtils;
  import org.apache.lucene.util.TestUtil;

 +import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 +
  import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;

  /**
 @@ -1940,29 +1943,30 @@
      directory.close();
    }

 -  protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc) throws Exception {
 +  protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues) throws Exception {
      Directory dir = newFSDirectory(createTempDir("dvduel"));
      IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
 -
 +
 +    Set<String> valueSet = new HashSet<String>();
 +    for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) {
 +      final int length = TestUtil.nextInt(random(), minLength, maxLength);
 +      valueSet.add(TestUtil.randomSimpleString(random(), length));
 +    }
 +    String[] uniqueValues = valueSet.toArray(new String[0]);
 +
      // index some docs
      for (int i = 0; i < numDocs; i++) {
        Document doc = new Document();
        Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
        doc.add(idField);
 -      final int length;
 -      if (minLength == maxLength) {
 -        length = minLength; // fixed length
 -      } else {
 -        length = TestUtil.nextInt(random(), minLength, maxLength);
 -      }
        int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc);
        // create a random set of strings
        Set<String> values = new TreeSet<>();
        for (int v = 0; v < numValues; v++) {
 -        values.add(TestUtil.randomSimpleString(random(), length));
 +        values.add(RandomPicks.randomFrom(random(), uniqueValues));
        }
 -
 +
        // add ordered to the stored field
        for (String v : values) {
          doc.add(new StoredField("stored", v));
 @@ -2041,7 +2045,7 @@
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {
        int fixedLength = TestUtil.nextInt(random(), 1, 10);
 -      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16);
 +      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100);
      }
    }

 @@ -2107,12 +2111,37 @@
        );
      }
    }
 -
 +
 +  public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception {
 +    assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric());
 +    final long[] values = new long[TestUtil.nextInt(random(), 2, 6)];
 +    for (int i = 0; i < values.length; ++i) {
 +      values[i] = random().nextLong();
 +    }
 +    int numIterations = atLeast(1);
 +    for (int i = 0; i < numIterations; i++) {
 +      doTestSortedNumericsVsStoredFields(
 +          new LongProducer() {
 +            @Override
 +            long next() {
 +              return TestUtil.nextLong(random(), 0, 6);
 +            }
 +          },
 +          new LongProducer() {
 +            @Override
 +            long next() {
 +              return values[random().nextInt(values.length)];
 +            }
 +          }
 +      );
 +    }
 +  }
 +
    public void testSortedSetVariableLengthVsStoredFields() throws Exception {
      assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {
 -      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16);
 +      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100);
      }
    }

 @@ -2121,7 +2150,7 @@
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {
        int fixedLength = TestUtil.nextInt(random(), 1, 10);
 -      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1);
 +      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100);
      }
    }

 @@ -2129,10 +2158,42 @@
      assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {
 -      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1);
 +      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100);
      }
    }

 +  public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception {
 +    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
 +    int numIterations = atLeast(1);
 +    for (int i = 0; i < numIterations; i++) {
 +      doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6);
 +    }
 +  }
 +
 +  public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception {
 +    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
 +    int numIterations = atLeast(1);
 +    for (int i = 0; i < numIterations; i++) {
 +      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6);
 +    }
 +  }
 +
 +  public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception {
 +    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
 +    int numIterations = atLeast(1);
 +    for (int i = 0; i < numIterations; i++) {
 +      doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000);
 +    }
 +  }
 +
 +  public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception {
 +    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
 +    int numIterations = atLeast(1);
 +    for (int i = 0; i < numIterations; i++) {
 +      doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000);
 +    }
 +  }
 +
    public void testGCDCompression() throws Exception {
      int numIterations = atLeast(1);
      for (int i = 0; i < numIterations; i++) {