| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util; |
| |
| import java.util.Arrays; |
| |
/** Radix sorter for variable-length strings. This class sorts based on the most
 * significant byte first and falls back to {@link IntroSorter} when the size
 * of the buckets to sort becomes small. It is <b>NOT</b> stable.
 * Worst-case memory usage is about {@code 2.3 KB}.
 * Entries are abstract: subclasses expose them only through {@link #byteAt}
 * and {@link Sorter#swap}, so this sorter never materializes the keys.
 * @lucene.internal */
public abstract class MSBRadixSorter extends Sorter {

  // after that many levels of recursion we fall back to introsort anyway
  // this is used as a protection against the fact that radix sort performs
  // worse when there are long common prefixes (probably because of cache
  // locality)
  private static final int LEVEL_THRESHOLD = 8;
  // size of histograms: 256 + 1 to indicate that the string is finished
  // (bucket 0 holds entries whose length is <= k, see getBucket)
  private static final int HISTOGRAM_SIZE = 257;
  // buckets below this size will be sorted with introsort
  private static final int LENGTH_THRESHOLD = 100;

  // we store one histogram per recursion level, allocated lazily and reused
  // across calls so repeated sorts do not re-allocate
  private final int[][] histograms = new int[LEVEL_THRESHOLD][];
  private final int[] endOffsets = new int[HISTOGRAM_SIZE];
  // scratch buffer holding the common prefix of the slice being sorted;
  // capped at 24 bytes to bound memory usage (see constructor)
  private final int[] commonPrefix;

  private final int maxLength;

  /**
   * Sole constructor.
   * @param maxLength the maximum length of keys, pass {@link Integer#MAX_VALUE} if unknown.
   */
  protected MSBRadixSorter(int maxLength) {
    this.maxLength = maxLength;
    this.commonPrefix = new int[Math.min(24, maxLength)];
  }

  /** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
   * its length is less than or equal to {@code k}. This may only be called
   * with a value of {@code k} between {@code 0} included and
   * {@code maxLength} excluded. */
  protected abstract int byteAt(int i, int k);

  /** Get a fall-back sorter which may assume that the first k bytes of all compared strings are equal.
   *  Used for small buckets and when recursion exceeds {@link #LEVEL_THRESHOLD}. */
  protected Sorter getFallbackSorter(int k) {
    return new IntroSorter() {
      @Override
      protected void swap(int i, int j) {
        // delegate to the enclosing sorter, which knows how to move entries
        MSBRadixSorter.this.swap(i, j);
      }

      @Override
      protected int compare(int i, int j) {
        // byte-wise comparison starting at offset k; byteAt returns -1 at
        // end-of-string, so shorter strings sort before their extensions
        for (int o = k; o < maxLength; ++o) {
          final int b1 = byteAt(i, o);
          final int b2 = byteAt(j, o);
          if (b1 != b2) {
            return b1 - b2;
          } else if (b1 == -1) {
            // both strings ended at the same offset: equal
            break;
          }
        }
        return 0;
      }

      @Override
      protected void setPivot(int i) {
        // copy the suffix of entry i (from offset k to end) into the pivot
        // buffer so comparePivot does not need to re-read entry i
        pivot.setLength(0);
        for (int o = k; o < maxLength; ++o) {
          final int b = byteAt(i, o);
          if (b == -1) {
            break;
          }
          pivot.append((byte) b);
        }
      }

      @Override
      protected int comparePivot(int j) {
        for (int o = 0; o < pivot.length(); ++o) {
          final int b1 = pivot.byteAt(o) & 0xff;
          final int b2 = byteAt(j, k + o);
          // if entry j ended here, b2 == -1 and b1 - b2 > 0: pivot sorts after
          if (b1 != b2) {
            return b1 - b2;
          }
        }
        if (k + pivot.length() == maxLength) {
          // pivot was truncated at maxLength, entry j cannot be longer
          return 0;
        }
        // pivot ended: equal (0) if entry j also ends here (byteAt == -1),
        // otherwise negative so the shorter pivot sorts first
        return -1 - byteAt(j, k + pivot.length());
      }

      private final BytesRefBuilder pivot = new BytesRefBuilder();
    };
  }

  @Override
  protected final int compare(int i, int j) {
    // radix sort never compares entries directly; comparisons only happen
    // inside the fall-back sorter
    throw new UnsupportedOperationException("unused: not a comparison-based sort");
  }

  @Override
  public void sort(int from, int to) {
    checkRange(from, to);
    // start at byte offset 0, recursion level 0
    sort(from, to, 0, 0);
  }

  /** Sort the slice [from, to), assuming the first {@code k} bytes of all
   *  entries are equal; {@code l} is the recursion depth. */
  private void sort(int from, int to, int k, int l) {
    if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
      // small bucket or too deep: comparison sort is cheaper/safer
      introSort(from, to, k);
    } else {
      radixSort(from, to, k, l);
    }
  }

  // delegate to the fall-back sorter, which may skip the first k bytes
  private void introSort(int from, int to, int k) {
    getFallbackSorter(k).sort(from, to);
  }

  /**
   * @param k the character number to compare
   * @param l the level of recursion
   */
  private void radixSort(int from, int to, int k, int l) {
    // lazily allocate (or clear) the histogram for this recursion level
    int[] histogram = histograms[l];
    if (histogram == null) {
      histogram = histograms[l] = new int[HISTOGRAM_SIZE];
    } else {
      Arrays.fill(histogram, 0);
    }

    final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram);
    if (commonPrefixLength > 0) {
      // if there are no more chars to compare or if all entries fell into the
      // first bucket (which means strings are shorter than k) then we are done
      // otherwise recurse
      if (k + commonPrefixLength < maxLength
          && histogram[0] < to - from) {
        // skip the shared prefix in one step; same level l since no
        // partitioning happened
        radixSort(from, to, k + commonPrefixLength, l);
      }
      return;
    }
    assert assertHistogram(commonPrefixLength, histogram);

    // histogram doubles as the startOffsets array: sumHistogram converts
    // counts into start offsets in place and fills endOffsets
    int[] startOffsets = histogram;
    int[] endOffsets = this.endOffsets;
    sumHistogram(histogram, endOffsets);
    reorder(from, to, startOffsets, endOffsets, k);
    // reorder advanced each startOffsets[i] up to the bucket end, so
    // startOffsets now holds the end offsets
    endOffsets = startOffsets;

    if (k + 1 < maxLength) {
      // recurse on all but the first bucket since all keys are equals in this
      // bucket (we already compared all bytes)
      for (int prev = endOffsets[0], i = 1; i < HISTOGRAM_SIZE; ++i) {
        int h = endOffsets[i];
        final int bucketLen = h - prev;
        if (bucketLen > 1) {
          sort(from + prev, from + h, k + 1, l + 1);
        }
        prev = h;
      }
    }
  }

  // only used from assert: a single non-empty bucket must coincide with a
  // non-zero common prefix, and vice versa
  private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
    int numberOfUniqueBytes = 0;
    for (int freq : histogram) {
      if (freq > 0) {
        numberOfUniqueBytes++;
      }
    }
    if (numberOfUniqueBytes == 1) {
      assert commonPrefixLength >= 1;
    } else {
      assert commonPrefixLength == 0 : commonPrefixLength;
    }
    return true;
  }

  /** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}.
   *  The +1 shifts the end-of-string marker (-1) into bucket 0. */
  private int getBucket(int i, int k) {
    return byteAt(i, k) + 1;
  }

  /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
   * and return a common prefix length for all visited values.
   * Scanning for the common prefix and counting are fused: as long as every
   * entry matches the prefix of the first entry, no counting is needed.
   * @see #buildHistogram */
  private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
    final int[] commonPrefix = this.commonPrefix;
    // the candidate prefix is taken from the first entry, truncated at the
    // scratch-buffer capacity, the key end (-1 included), or maxLength
    int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
    for (int j = 0; j < commonPrefixLength; ++j) {
      final int b = byteAt(from, k + j);
      commonPrefix[j] = b;
      if (b == -1) {
        commonPrefixLength = j + 1;
        break;
      }
    }

    int i;
    outer: for (i = from + 1; i < to; ++i) {
      for (int j = 0; j < commonPrefixLength; ++j) {
        final int b = byteAt(i, k + j);
        if (b != commonPrefix[j]) {
          // shrink the prefix to the longest length still shared by all
          // entries seen so far
          commonPrefixLength = j;
          if (commonPrefixLength == 0) { // we have no common prefix
            // seed the histogram: entries [from, i) all share first byte
            // commonPrefix[0], and entry i contributes byte b
            histogram[commonPrefix[0] + 1] = i - from;
            histogram[b + 1] = 1;
            break outer;
          }
          break;
        }
      }
    }

    if (i < to) {
      // the loop got broken because there is no common prefix
      assert commonPrefixLength == 0;
      // count the remaining entries; [from, i] were counted above
      buildHistogram(i + 1, to, k, histogram);
    } else {
      assert commonPrefixLength > 0;
      // all entries share the first byte: a single full bucket
      histogram[commonPrefix[0] + 1] = to - from;
    }

    return commonPrefixLength;
  }

  /** Build an histogram of the k-th characters of values occurring between
   * offsets {@code from} and {@code to}, using {@link #getBucket}. */
  private void buildHistogram(int from, int to, int k, int[] histogram) {
    for (int i = from; i < to; ++i) {
      histogram[getBucket(i, k)]++;
    }
  }

  /** Accumulate values of the histogram so that it does not store counts but
   * start offsets. {@code endOffsets} will store the end offsets. */
  private static void sumHistogram(int[] histogram, int[] endOffsets) {
    int accum = 0;
    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
      final int count = histogram[i];
      histogram[i] = accum;
      accum += count;
      endOffsets[i] = accum;
    }
  }

  /**
   * Reorder based on start/end offsets for each bucket. When this method
   * returns, startOffsets and endOffsets are equal.
   * Each swap places the entry at h2 into its final bucket and advances that
   * bucket's start offset, so every entry is moved at most once past its
   * bucket boundary and the permutation completes in linear time.
   * @param startOffsets start offsets per bucket
   * @param endOffsets end offsets per bucket
   */
  private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
    // reorder in place, like the dutch flag problem
    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
      final int limit = endOffsets[i];
      for (int h1 = startOffsets[i]; h1 < limit; h1 = startOffsets[i]) {
        final int b = getBucket(from + h1, k);
        final int h2 = startOffsets[b]++;
        swap(from + h1, from + h2);
      }
    }
  }
}