docs/attachments/LUCENE-7306/LUCENE-7903.patch - lucene-jira-archive - Git at Google

 diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefSliceComparator.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefSliceComparator.java
 new file mode 100644
 index 0000000..41351c1
 --- /dev/null
 +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefSliceComparator.java
 @@ -0,0 +1,38 @@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one or more
 + * contributor license agreements.  See the NOTICE file distributed with
 + * this work for additional information regarding copyright ownership.
 + * The ASF licenses this file to You under the Apache License, Version 2.0
 + * (the "License"); you may not use this file except in compliance with
 + * the License.  You may obtain a copy of the License at
 + *
 + *     http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing, software
 + * distributed under the License is distributed on an "AS IS" BASIS,
 + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 + * See the License for the specific language governing permissions and
 + * limitations under the License.
 + */
 +package org.apache.lucene.util;
 +
 +import java.util.Comparator;
 +
 +/** A {@link Comparator} that orders {@link BytesRef}s by {@code length} bytes read {@code offset} bytes past each ref's own offset; the refs' own {@code length} is not consulted.
 + *  @lucene.internal */
 +public final class BytesRefSliceComparator implements Comparator<BytesRef> {
 +
 +  final int offset, length; // package-private: FixedLengthBytesRefArray reads both to set up its radix-sort path
 +
 +  /** Sole constructor: {@code offset} is added to each ref's offset, {@code length} is the byte count compared; neither is validated here. */
 +  public BytesRefSliceComparator(int offset, int length) {
 +    this.offset = offset;
 +    this.length = length;
 +  }
 +
 +  @Override
 +  public int compare(BytesRef a, BytesRef b) {
 +    return StringHelper.compare(length, a.bytes, a.offset + offset, b.bytes, b.offset + offset);
 +  }
 +
 +}
 diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java b/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java
 index 346b908..b2bbf12 100644
 --- a/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java
 +++ b/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java
 @@ -105,6 +105,37 @@ final class FixedLengthBytesRefArray implements SortableBytesRefArray {
        orderedEntries[i] = i;
      }

 +    if (comp instanceof BytesRefSliceComparator) {
 +      int offset = ((BytesRefSliceComparator) comp).offset;
 +      int length = ((BytesRefSliceComparator) comp).length;
 +
 +      new StringMSBRadixSorter() {
 +
 +        final BytesRef scratch;
 +
 +        {
 +          scratch = new BytesRef();
 +          scratch.length = length;
 +        }
 +
 +        @Override
 +        protected void swap(int i, int j) {
 +          int o = orderedEntries[i];
 +          orderedEntries[i] = orderedEntries[j];
 +          orderedEntries[j] = o;
 +        }
 +
 +        @Override
 +        protected BytesRef get(int i) {
 +          int index = orderedEntries[i];
 +          scratch.bytes = blocks[index / valuesPerBlock];
 +          scratch.offset = (index % valuesPerBlock) * valueLength + offset;
 +          return scratch;
 +        }
 +      }.sort(0, size());
 +      return orderedEntries;
 +    }
 +
      final BytesRef pivot = new BytesRef();
      final BytesRef scratch1 = new BytesRef();
      final BytesRef scratch2 = new BytesRef();
 @@ -120,7 +151,7 @@ final class FixedLengthBytesRefArray implements SortableBytesRefArray {
          orderedEntries[i] = orderedEntries[j];
          orderedEntries[j] = o;
        }
 -
 +
        @Override
        protected int compare(int i, int j) {
          int index1 = orderedEntries[i];
 diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
 index 09eef26..8daba0a 100644
 --- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
 +++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
 @@ -25,7 +25,6 @@ import java.util.List;

  import org.apache.lucene.codecs.CodecUtil;
  import org.apache.lucene.index.MergeState;
 -import org.apache.lucene.store.ByteArrayDataInput;
  import org.apache.lucene.store.ChecksumIndexInput;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.IOContext;
 @@ -33,6 +32,7 @@ import org.apache.lucene.store.IndexOutput;
  import org.apache.lucene.store.TrackingDirectoryWrapper;
  import org.apache.lucene.util.ArrayUtil;
  import org.apache.lucene.util.BytesRef;
 +import org.apache.lucene.util.BytesRefSliceComparator;
  import org.apache.lucene.util.FixedBitSet;
  import org.apache.lucene.util.IOUtils;
  import org.apache.lucene.util.IntroSorter;
 @@ -718,28 +718,34 @@ public class BKDWriter implements Closeable {

        final int offset = bytesPerDim * dim;

 -      Comparator<BytesRef> cmp = new Comparator<BytesRef>() {
 -
 -        final ByteArrayDataInput reader = new ByteArrayDataInput();
 +      Comparator<BytesRef> cmp;
 +      if (dim == numDims - 1) {
 +        // For the last dimension, the value bytes and the doc ID are contiguous, so a single
 +        // BytesRefSliceComparator covers both; sorting recognizes this comparator and
 +        // uses radix sort rather than a comparison-based sort.
 +        cmp = new BytesRefSliceComparator(packedBytesLength - bytesPerDim, bytesPerDim + Integer.BYTES);
 +      } else {
 +        cmp = new Comparator<BytesRef>() {

 -        @Override
 -        public int compare(BytesRef a, BytesRef b) {
 -          // First compare by the requested dimension we are sorting by:
 -          int cmp = StringHelper.compare(bytesPerDim, a.bytes, a.offset + offset, b.bytes, b.offset + offset);
 +          @Override
 +          public int compare(BytesRef a, BytesRef b) {
 +            // First compare by the requested dimension we are sorting by:
 +            int cmp = StringHelper.compare(bytesPerDim, a.bytes, a.offset + offset, b.bytes, b.offset + offset);

 -          if (cmp != 0) {
 -            return cmp;
 -          }
 +            if (cmp != 0) {
 +              return cmp;
 +            }

 -          // Tie-break by docID ... no need to tie break on ord, for the case where the same doc has
 -          // the same value in a given dimension indexed more than once: it can't matter at search
 -          // time since we don't write ords into the index:
 +            // Tie-break by docID ... no need to tie break on ord, for the case where the same doc has
 +            // the same value in a given dimension indexed more than once: it can't matter at search
 +            // time since we don't write ords into the index:

 -          return StringHelper.compare(Integer.BYTES,
 -                                      a.bytes, a.offset + packedBytesLength,
 -                                      b.bytes, b.offset + packedBytesLength);
 -        }
 -      };
 +            return StringHelper.compare(Integer.BYTES,
 +                                        a.bytes, a.offset + packedBytesLength,
 +                                        b.bytes, b.offset + packedBytesLength);
 +          }
 +        };
 +      }

        OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {
	diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefSliceComparator.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefSliceComparator.java
	new file mode 100644
	index 0000000..41351c1
	--- /dev/null
	+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefSliceComparator.java
	@@ -0,0 +1,38 @@
	+/*
	+ * Licensed to the Apache Software Foundation (ASF) under one or more
	+ * contributor license agreements. See the NOTICE file distributed with
	+ * this work for additional information regarding copyright ownership.
	+ * The ASF licenses this file to You under the Apache License, Version 2.0
	+ * (the "License"); you may not use this file except in compliance with
	+ * the License. You may obtain a copy of the License at
	+ *
	+ * http://www.apache.org/licenses/LICENSE-2.0
	+ *
	+ * Unless required by applicable law or agreed to in writing, software
	+ * distributed under the License is distributed on an "AS IS" BASIS,
	+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+ * See the License for the specific language governing permissions and
	+ * limitations under the License.
	+ */
	+package org.apache.lucene.util;
	+
	+import java.util.Comparator;
	+
	+/** A {@link Comparator} that orders {@link BytesRef}s by {@code length} bytes read {@code offset} bytes past each ref's own offset; the refs' own {@code length} is not consulted.
	+ * @lucene.internal */
	+public final class BytesRefSliceComparator implements Comparator<BytesRef> {
	+
	+ final int offset, length; // package-private: FixedLengthBytesRefArray reads both to set up its radix-sort path
	+
	+ /** Sole constructor: {@code offset} is added to each ref's offset, {@code length} is the byte count compared; neither is validated here. */
	+ public BytesRefSliceComparator(int offset, int length) {
	+ this.offset = offset;
	+ this.length = length;
	+ }
	+
	+ @Override
	+ public int compare(BytesRef a, BytesRef b) {
	+ return StringHelper.compare(length, a.bytes, a.offset + offset, b.bytes, b.offset + offset);
	+ }
	+
	+}
	diff --git a/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java b/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java
	index 346b908..b2bbf12 100644
	--- a/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java
	+++ b/lucene/core/src/java/org/apache/lucene/util/FixedLengthBytesRefArray.java
	@@ -105,6 +105,37 @@ final class FixedLengthBytesRefArray implements SortableBytesRefArray {
	orderedEntries[i] = i;
	}

	+ if (comp instanceof BytesRefSliceComparator) {
	+ int offset = ((BytesRefSliceComparator) comp).offset;
	+ int length = ((BytesRefSliceComparator) comp).length;
	+
	+ new StringMSBRadixSorter() {
	+
	+ final BytesRef scratch;
	+
	+ {
	+ scratch = new BytesRef();
	+ scratch.length = length;
	+ }
	+
	+ @Override
	+ protected void swap(int i, int j) {
	+ int o = orderedEntries[i];
	+ orderedEntries[i] = orderedEntries[j];
	+ orderedEntries[j] = o;
	+ }
	+
	+ @Override
	+ protected BytesRef get(int i) {
	+ int index = orderedEntries[i];
	+ scratch.bytes = blocks[index / valuesPerBlock];
	+ scratch.offset = (index % valuesPerBlock) * valueLength + offset;
	+ return scratch;
	+ }
	+ }.sort(0, size());
	+ return orderedEntries;
	+ }
	+
	final BytesRef pivot = new BytesRef();
	final BytesRef scratch1 = new BytesRef();
	final BytesRef scratch2 = new BytesRef();
	@@ -120,7 +151,7 @@ final class FixedLengthBytesRefArray implements SortableBytesRefArray {
	orderedEntries[i] = orderedEntries[j];
	orderedEntries[j] = o;
	}
	-
	+
	@Override
	protected int compare(int i, int j) {
	int index1 = orderedEntries[i];
	diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
	index 09eef26..8daba0a 100644
	--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
	+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
	@@ -25,7 +25,6 @@ import java.util.List;

	import org.apache.lucene.codecs.CodecUtil;
	import org.apache.lucene.index.MergeState;
	-import org.apache.lucene.store.ByteArrayDataInput;
	import org.apache.lucene.store.ChecksumIndexInput;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.IOContext;
	@@ -33,6 +32,7 @@ import org.apache.lucene.store.IndexOutput;
	import org.apache.lucene.store.TrackingDirectoryWrapper;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	+import org.apache.lucene.util.BytesRefSliceComparator;
	import org.apache.lucene.util.FixedBitSet;
	import org.apache.lucene.util.IOUtils;
	import org.apache.lucene.util.IntroSorter;
	@@ -718,28 +718,34 @@ public class BKDWriter implements Closeable {

	final int offset = bytesPerDim * dim;

	- Comparator<BytesRef> cmp = new Comparator<BytesRef>() {
	-
	- final ByteArrayDataInput reader = new ByteArrayDataInput();
	+ Comparator<BytesRef> cmp;
	+ if (dim == numDims - 1) {
	+ // For the last dimension, the value bytes and the doc ID are contiguous, so a single
	+ // BytesRefSliceComparator covers both; sorting recognizes this comparator and
	+ // uses radix sort rather than a comparison-based sort.
	+ cmp = new BytesRefSliceComparator(packedBytesLength - bytesPerDim, bytesPerDim + Integer.BYTES);
	+ } else {
	+ cmp = new Comparator<BytesRef>() {

	- @Override
	- public int compare(BytesRef a, BytesRef b) {
	- // First compare by the requested dimension we are sorting by:
	- int cmp = StringHelper.compare(bytesPerDim, a.bytes, a.offset + offset, b.bytes, b.offset + offset);
	+ @Override
	+ public int compare(BytesRef a, BytesRef b) {
	+ // First compare by the requested dimension we are sorting by:
	+ int cmp = StringHelper.compare(bytesPerDim, a.bytes, a.offset + offset, b.bytes, b.offset + offset);

	- if (cmp != 0) {
	- return cmp;
	- }
	+ if (cmp != 0) {
	+ return cmp;
	+ }

	- // Tie-break by docID ... no need to tie break on ord, for the case where the same doc has
	- // the same value in a given dimension indexed more than once: it can't matter at search
	- // time since we don't write ords into the index:
	+ // Tie-break by docID ... no need to tie break on ord, for the case where the same doc has
	+ // the same value in a given dimension indexed more than once: it can't matter at search
	+ // time since we don't write ords into the index:

	- return StringHelper.compare(Integer.BYTES,
	- a.bytes, a.offset + packedBytesLength,
	- b.bytes, b.offset + packedBytesLength);
	- }
	- };
	+ return StringHelper.compare(Integer.BYTES,
	+ a.bytes, a.offset + packedBytesLength,
	+ b.bytes, b.offset + packedBytesLength);
	+ }
	+ };
	+ }

	OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix + "_bkd" + dim, cmp, offlineSorterBufferMB, offlineSorterMaxTempFiles, bytesPerDoc) {