lucene/core/src/test/org/apache/lucene/util/TestMSBRadixSorter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util;

 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Set;

 public class TestMSBRadixSorter extends LuceneTestCase {

   private void test(BytesRef[] refs, int len) {
     BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len);
     Arrays.sort(expected);

     int maxLength = 0;
     for (int i = 0; i < len; ++i) {
       BytesRef ref = refs[i];
       maxLength = Math.max(maxLength, ref.length);
     }
     switch (random().nextInt(3)) {
       case 0:
         maxLength += TestUtil.nextInt(random(), 1, 5);
         break;
       case 1:
         maxLength = Integer.MAX_VALUE;
         break;
       default:
         // leave unchanged
         break;
     }

     final int finalMaxLength = maxLength;
     new MSBRadixSorter(maxLength) {

       @Override
       protected int byteAt(int i, int k) {
         assertTrue(k < finalMaxLength);
         BytesRef ref = refs[i];
         if (ref.length <= k) {
           return -1;
         }
         return ref.bytes[ref.offset + k] & 0xff;
       }

       @Override
       protected void swap(int i, int j) {
         BytesRef tmp = refs[i];
         refs[i] = refs[j];
         refs[j] = tmp;
       }
     }.sort(0, len);
     BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len);
     assertArrayEquals(expected, actual);
   }

   public void testEmpty() {
     test(new BytesRef[random().nextInt(5)], 0);
   }

   public void testOneValue() {
     BytesRef bytes = new BytesRef(TestUtil.randomSimpleString(random()));
     test(new BytesRef[] { bytes }, 1);
   }

   public void testTwoValues() {
     BytesRef bytes1 = new BytesRef(TestUtil.randomSimpleString(random()));
     BytesRef bytes2 = new BytesRef(TestUtil.randomSimpleString(random()));
     test(new BytesRef[] { bytes1, bytes2 }, 2);
   }

   private void testRandom(int commonPrefixLen, int maxLen) {
     byte[] commonPrefix = new byte[commonPrefixLen];
     random().nextBytes(commonPrefix);
     final int len = random().nextInt(100000);
     BytesRef[] bytes = new BytesRef[len + random().nextInt(50)];
     for (int i = 0; i < len; ++i) {
       byte[] b = new byte[commonPrefixLen + random().nextInt(maxLen)];
       random().nextBytes(b);
       System.arraycopy(commonPrefix, 0, b, 0, commonPrefixLen);
       bytes[i] = new BytesRef(b);
     }
     test(bytes, len);
   }

   public void testRandom() {
     for (int iter = 0; iter < 10; ++iter) {
       testRandom(0, 10);
     }
   }

   public void testRandomWithLotsOfDuplicates() {
     for (int iter = 0; iter < 10; ++iter) {
       testRandom(0, 2);
     }
   }

   public void testRandomWithSharedPrefix() {
     for (int iter = 0; iter < 10; ++iter) {
       testRandom(TestUtil.nextInt(random(), 1, 30), 10);
     }
   }

   public void testRandomWithSharedPrefixAndLotsOfDuplicates() {
     for (int iter = 0; iter < 10; ++iter) {
       testRandom(TestUtil.nextInt(random(), 1, 30), 2);
     }
   }

   public void testRandom2() {
     // how large our alphabet is
     int letterCount = TestUtil.nextInt(random(), 2, 10);

     // how many substring fragments to use
     int substringCount = TestUtil.nextInt(random(), 2, 10);
     Set<BytesRef> substringsSet = new HashSet<>();

     // how many strings to make
     int stringCount = atLeast(10000);

     //System.out.println("letterCount=" + letterCount + " substringCount=" + substringCount + " stringCount=" + stringCount);
     while(substringsSet.size() < substringCount) {
       int length = TestUtil.nextInt(random(), 2, 10);
       byte[] bytes = new byte[length];
       for(int i=0;i<length;i++) {
         bytes[i] = (byte) random().nextInt(letterCount);
       }
       BytesRef br = new BytesRef(bytes);
       substringsSet.add(br);
       //System.out.println("add substring count=" + substringsSet.size() + ": " + br);
     }

     BytesRef[] substrings = substringsSet.toArray(new BytesRef[substringsSet.size()]);
     double[] chance = new double[substrings.length];
     double sum = 0.0;
     for(int i=0;i<substrings.length;i++) {
       chance[i] = random().nextDouble();
       sum += chance[i];
     }

     // give each substring a random chance of occurring:
     double accum = 0.0;
     for(int i=0;i<substrings.length;i++) {
       accum += chance[i]/sum;
       chance[i] = accum;
     }

     Set<BytesRef> stringsSet = new HashSet<>();
     int iters = 0;
     while (stringsSet.size() < stringCount && iters < stringCount*5) {
       int count = TestUtil.nextInt(random(), 1, 5);
       BytesRefBuilder b = new BytesRefBuilder();
       for(int i=0;i<count;i++) {
         double v = random().nextDouble();
         accum = 0.0;
         for(int j=0;j<substrings.length;j++) {
           accum += chance[j];
           if (accum >= v) {
             b.append(substrings[j]);
             break;
           }
         }
       }
       BytesRef br = b.toBytesRef();
       stringsSet.add(br);
       //System.out.println("add string count=" + stringsSet.size() + ": " + br);
       iters++;
     }

     test(stringsSet.toArray(new BytesRef[stringsSet.size()]), stringsSet.size());
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util;

	import java.util.Arrays;
	import java.util.HashSet;
	import java.util.Set;

	public class TestMSBRadixSorter extends LuceneTestCase {

	private void test(BytesRef[] refs, int len) {
	BytesRef[] expected = ArrayUtil.copyOfSubArray(refs, 0, len);
	Arrays.sort(expected);

	int maxLength = 0;
	for (int i = 0; i < len; ++i) {
	BytesRef ref = refs[i];
	maxLength = Math.max(maxLength, ref.length);
	}
	switch (random().nextInt(3)) {
	case 0:
	maxLength += TestUtil.nextInt(random(), 1, 5);
	break;
	case 1:
	maxLength = Integer.MAX_VALUE;
	break;
	default:
	// leave unchanged
	break;
	}

	final int finalMaxLength = maxLength;
	new MSBRadixSorter(maxLength) {

	@Override
	protected int byteAt(int i, int k) {
	assertTrue(k < finalMaxLength);
	BytesRef ref = refs[i];
	if (ref.length <= k) {
	return -1;
	}
	return ref.bytes[ref.offset + k] & 0xff;
	}

	@Override
	protected void swap(int i, int j) {
	BytesRef tmp = refs[i];
	refs[i] = refs[j];
	refs[j] = tmp;
	}
	}.sort(0, len);
	BytesRef[] actual = ArrayUtil.copyOfSubArray(refs, 0, len);
	assertArrayEquals(expected, actual);
	}

	public void testEmpty() {
	test(new BytesRef[random().nextInt(5)], 0);
	}

	public void testOneValue() {
	BytesRef bytes = new BytesRef(TestUtil.randomSimpleString(random()));
	test(new BytesRef[] { bytes }, 1);
	}

	public void testTwoValues() {
	BytesRef bytes1 = new BytesRef(TestUtil.randomSimpleString(random()));
	BytesRef bytes2 = new BytesRef(TestUtil.randomSimpleString(random()));
	test(new BytesRef[] { bytes1, bytes2 }, 2);
	}

	private void testRandom(int commonPrefixLen, int maxLen) {
	byte[] commonPrefix = new byte[commonPrefixLen];
	random().nextBytes(commonPrefix);
	final int len = random().nextInt(100000);
	BytesRef[] bytes = new BytesRef[len + random().nextInt(50)];
	for (int i = 0; i < len; ++i) {
	byte[] b = new byte[commonPrefixLen + random().nextInt(maxLen)];
	random().nextBytes(b);
	System.arraycopy(commonPrefix, 0, b, 0, commonPrefixLen);
	bytes[i] = new BytesRef(b);
	}
	test(bytes, len);
	}

	public void testRandom() {
	for (int iter = 0; iter < 10; ++iter) {
	testRandom(0, 10);
	}
	}

	public void testRandomWithLotsOfDuplicates() {
	for (int iter = 0; iter < 10; ++iter) {
	testRandom(0, 2);
	}
	}

	public void testRandomWithSharedPrefix() {
	for (int iter = 0; iter < 10; ++iter) {
	testRandom(TestUtil.nextInt(random(), 1, 30), 10);
	}
	}

	public void testRandomWithSharedPrefixAndLotsOfDuplicates() {
	for (int iter = 0; iter < 10; ++iter) {
	testRandom(TestUtil.nextInt(random(), 1, 30), 2);
	}
	}

	public void testRandom2() {
	// how large our alphabet is
	int letterCount = TestUtil.nextInt(random(), 2, 10);

	// how many substring fragments to use
	int substringCount = TestUtil.nextInt(random(), 2, 10);
	Set<BytesRef> substringsSet = new HashSet<>();

	// how many strings to make
	int stringCount = atLeast(10000);

	//System.out.println("letterCount=" + letterCount + " substringCount=" + substringCount + " stringCount=" + stringCount);
	while(substringsSet.size() < substringCount) {
	int length = TestUtil.nextInt(random(), 2, 10);
	byte[] bytes = new byte[length];
	for(int i=0;i<length;i++) {
	bytes[i] = (byte) random().nextInt(letterCount);
	}
	BytesRef br = new BytesRef(bytes);
	substringsSet.add(br);
	//System.out.println("add substring count=" + substringsSet.size() + ": " + br);
	}

	BytesRef[] substrings = substringsSet.toArray(new BytesRef[substringsSet.size()]);
	double[] chance = new double[substrings.length];
	double sum = 0.0;
	for(int i=0;i<substrings.length;i++) {
	chance[i] = random().nextDouble();
	sum += chance[i];
	}

	// give each substring a random chance of occurring:
	double accum = 0.0;
	for(int i=0;i<substrings.length;i++) {
	accum += chance[i]/sum;
	chance[i] = accum;
	}

	Set<BytesRef> stringsSet = new HashSet<>();
	int iters = 0;
	while (stringsSet.size() < stringCount && iters < stringCount*5) {
	int count = TestUtil.nextInt(random(), 1, 5);
	BytesRefBuilder b = new BytesRefBuilder();
	for(int i=0;i<count;i++) {
	double v = random().nextDouble();
	accum = 0.0;
	for(int j=0;j<substrings.length;j++) {
	accum += chance[j];
	if (accum >= v) {
	b.append(substrings[j]);
	break;
	}
	}
	}
	BytesRef br = b.toBytesRef();
	stringsSet.add(br);
	//System.out.println("add string count=" + stringsSet.size() + ": " + br);
	iters++;
	}

	test(stringsSet.toArray(new BytesRef[stringsSet.size()]), stringsSet.size());
	}
	}