lucene/core/src/test/org/apache/lucene/util/automaton/TestUTF32ToUTF8.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util.automaton;


 import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;

 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.Util;

 public class TestUTF32ToUTF8 extends LuceneTestCase {

   @Override
   public void setUp() throws Exception {
     super.setUp();
   }

   private static final int MAX_UNICODE = 0x10FFFF;

   private boolean matches(ByteRunAutomaton a, int code) {
     char[] chars = Character.toChars(code);
     byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
     final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
     return a.run(b, 0, len);
   }

   private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) {

     // Verify correct ints are accepted
     final int nonSurrogateCount;
     final boolean ovSurStart;
     if (endCode < UnicodeUtil.UNI_SUR_HIGH_START ||
         startCode > UnicodeUtil.UNI_SUR_LOW_END) {
       // no overlap w/ surrogates
       nonSurrogateCount = endCode - startCode + 1;
       ovSurStart = false;
     } else if (isSurrogate(startCode)) {
       // start of range overlaps surrogates
       nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1);
       ovSurStart = false;
     } else if (isSurrogate(endCode)) {
       // end of range overlaps surrogates
       ovSurStart = true;
       nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1);
     } else {
       // range completely subsumes surrogates
       ovSurStart = true;
       nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1);
     }

     assert nonSurrogateCount > 0;

     for(int iter=0;iter<iters;iter++) {
       // pick random code point in-range

       int code = startCode + r.nextInt(nonSurrogateCount);
       if (isSurrogate(code)) {
         if (ovSurStart) {
           code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START);
         } else {
           code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode);
         }
       }

       assert code >= startCode && code <= endCode: "code=" + code + " start=" + startCode + " end=" + endCode;
       assert !isSurrogate(code);

       assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code,
                  matches(a, code));
     }

     // Verify invalid ints are not accepted
     final int invalidRange = MAX_UNICODE - (endCode - startCode + 1);
     if (invalidRange > 0) {
       for(int iter=0;iter<iters;iter++) {
         int x = TestUtil.nextInt(r, 0, invalidRange - 1);
         final int code;
         if (x >= startCode) {
           code = endCode + 1 + x - startCode;
         } else {
           code = x;
         }
         if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) |
             (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) {
           iter--;
           continue;
         }
         assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code,
                     matches(a, code));

       }
     }
   }

   // Evenly picks random code point from the 4 "buckets"
   // (bucket = same #bytes when encoded to utf8)
   private int getCodeStart(Random r) {
     switch(r.nextInt(4)) {
     case 0:
       return TestUtil.nextInt(r, 0, 128);
     case 1:
       return TestUtil.nextInt(r, 128, 2048);
     case 2:
       return TestUtil.nextInt(r, 2048, 65536);
     default:
       return TestUtil.nextInt(r, 65536, 1 + MAX_UNICODE);
     }
   }

   private static boolean isSurrogate(int code) {
     return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
   }

   public void testRandomRanges() throws Exception {
     final Random r = random();
     int ITERS = atLeast(10);
     int ITERS_PER_DFA = atLeast(100);
     for(int iter=0;iter<ITERS;iter++) {
       int x1 = getCodeStart(r);
       int x2 = getCodeStart(r);
       final int startCode, endCode;

       if (x1 < x2) {
         startCode = x1;
         endCode = x2;
       } else {
         startCode = x2;
         endCode = x1;
       }

       if (isSurrogate(startCode) && isSurrogate(endCode)) {
         iter--;
         continue;
       }

       Automaton a = Automata.makeCharRange(startCode, endCode);
       testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA);
     }
   }

   public void testSpecialCase() {
     RegExp re = new RegExp(".?");
     Automaton automaton = re.toAutomaton();
     CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
     ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
     // make sure character dfa accepts empty string
     assertTrue(cra.isAccept(0));
     assertTrue(cra.run(""));
     assertTrue(cra.run(new char[0], 0, 0));

     // make sure byte dfa accepts empty string
     assertTrue(bra.isAccept(0));
     assertTrue(bra.run(new byte[0], 0, 0));
   }

   public void testSpecialCase2() throws Exception {
     RegExp re = new RegExp(".+\u0775");
     String input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";
     Automaton automaton = re.toAutomaton();
     CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
     ByteRunAutomaton bra = new ByteRunAutomaton(automaton);

     assertTrue(cra.run(input));

     byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
     assertTrue(bra.run(bytes, 0, bytes.length)); // this one fails!
   }

   public void testSpecialCase3() throws Exception {
     RegExp re = new RegExp("(\\鯺)*(.)*\\Ӕ");
     String input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";
     Automaton automaton = re.toAutomaton();
     CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
     ByteRunAutomaton bra = new ByteRunAutomaton(automaton);

     assertTrue(cra.run(input));

     byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
     assertTrue(bra.run(bytes, 0, bytes.length));
   }

   public void testRandomRegexes() throws Exception {
     int num = atLeast(50);
     for (int i = 0; i < num; i++) {
       assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton());
     }
   }

   public void testSingleton() throws Exception {
     int iters = atLeast(100);
     for(int iter=0;iter<iters;iter++) {
       String s = TestUtil.randomRealisticUnicodeString(random());
       Automaton a = Automata.makeString(s);
       Automaton utf8 = new UTF32ToUTF8().convert(a);
       IntsRefBuilder ints = new IntsRefBuilder();
       Util.toIntsRef(new BytesRef(s), ints);
       Set<IntsRef> set = new HashSet<>();
       set.add(ints.get());
       assertEquals(set, TestOperations.getFiniteStrings(utf8));
     }
   }

   private void assertAutomaton(Automaton automaton) throws Exception {
     CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
     ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
     final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

     int num = atLeast(1000);
     for (int i = 0; i < num; i++) {
       final String string;
       if (random().nextBoolean()) {
         // likely not accepted
         string = TestUtil.randomUnicodeString(random());
       } else {
         // will be accepted
         int[] codepoints = ras.getRandomAcceptedString(random());
         try {
           string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
         } catch (Exception e) {
           System.out.println(codepoints.length + " codepoints:");
           for(int j=0;j<codepoints.length;j++) {
             System.out.println("  " + Integer.toHexString(codepoints[j]));
           }
           throw e;
         }
       }
       byte bytes[] = string.getBytes(StandardCharsets.UTF_8);
       assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length));
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util.automaton;


	import java.nio.charset.StandardCharsets;
	import java.util.HashSet;
	import java.util.Random;
	import java.util.Set;

	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.IntsRef;
	import org.apache.lucene.util.IntsRefBuilder;
	import org.apache.lucene.util.LuceneTestCase;
	import org.apache.lucene.util.TestUtil;
	import org.apache.lucene.util.UnicodeUtil;
	import org.apache.lucene.util.fst.Util;

	public class TestUTF32ToUTF8 extends LuceneTestCase {

	@Override
	public void setUp() throws Exception {
	super.setUp();
	}

	private static final int MAX_UNICODE = 0x10FFFF;

	private boolean matches(ByteRunAutomaton a, int code) {
	char[] chars = Character.toChars(code);
	byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
	final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
	return a.run(b, 0, len);
	}

	private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) {

	// Verify correct ints are accepted
	final int nonSurrogateCount;
	final boolean ovSurStart;
	if (endCode < UnicodeUtil.UNI_SUR_HIGH_START \|\|
	startCode > UnicodeUtil.UNI_SUR_LOW_END) {
	// no overlap w/ surrogates
	nonSurrogateCount = endCode - startCode + 1;
	ovSurStart = false;
	} else if (isSurrogate(startCode)) {
	// start of range overlaps surrogates
	nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1);
	ovSurStart = false;
	} else if (isSurrogate(endCode)) {
	// end of range overlaps surrogates
	ovSurStart = true;
	nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1);
	} else {
	// range completely subsumes surrogates
	ovSurStart = true;
	nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1);
	}

	assert nonSurrogateCount > 0;

	for(int iter=0;iter<iters;iter++) {
	// pick random code point in-range

	int code = startCode + r.nextInt(nonSurrogateCount);
	if (isSurrogate(code)) {
	if (ovSurStart) {
	code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START);
	} else {
	code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode);
	}
	}

	assert code >= startCode && code <= endCode: "code=" + code + " start=" + startCode + " end=" + endCode;
	assert !isSurrogate(code);

	assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code,
	matches(a, code));
	}

	// Verify invalid ints are not accepted
	final int invalidRange = MAX_UNICODE - (endCode - startCode + 1);
	if (invalidRange > 0) {
	for(int iter=0;iter<iters;iter++) {
	int x = TestUtil.nextInt(r, 0, invalidRange - 1);
	final int code;
	if (x >= startCode) {
	code = endCode + 1 + x - startCode;
	} else {
	code = x;
	}
	if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) \|
	(code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) {
	iter--;
	continue;
	}
	assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code,
	matches(a, code));

	}
	}
	}

	// Evenly picks random code point from the 4 "buckets"
	// (bucket = same #bytes when encoded to utf8)
	private int getCodeStart(Random r) {
	switch(r.nextInt(4)) {
	case 0:
	return TestUtil.nextInt(r, 0, 128);
	case 1:
	return TestUtil.nextInt(r, 128, 2048);
	case 2:
	return TestUtil.nextInt(r, 2048, 65536);
	default:
	return TestUtil.nextInt(r, 65536, 1 + MAX_UNICODE);
	}
	}

	private static boolean isSurrogate(int code) {
	return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
	}

	public void testRandomRanges() throws Exception {
	final Random r = random();
	int ITERS = atLeast(10);
	int ITERS_PER_DFA = atLeast(100);
	for(int iter=0;iter<ITERS;iter++) {
	int x1 = getCodeStart(r);
	int x2 = getCodeStart(r);
	final int startCode, endCode;

	if (x1 < x2) {
	startCode = x1;
	endCode = x2;
	} else {
	startCode = x2;
	endCode = x1;
	}

	if (isSurrogate(startCode) && isSurrogate(endCode)) {
	iter--;
	continue;
	}

	Automaton a = Automata.makeCharRange(startCode, endCode);
	testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA);
	}
	}

	public void testSpecialCase() {
	RegExp re = new RegExp(".?");
	Automaton automaton = re.toAutomaton();
	CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
	ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
	// make sure character dfa accepts empty string
	assertTrue(cra.isAccept(0));
	assertTrue(cra.run(""));
	assertTrue(cra.run(new char[0], 0, 0));

	// make sure byte dfa accepts empty string
	assertTrue(bra.isAccept(0));
	assertTrue(bra.run(new byte[0], 0, 0));
	}

	public void testSpecialCase2() throws Exception {
	RegExp re = new RegExp(".+\u0775");
	String input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";
	Automaton automaton = re.toAutomaton();
	CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
	ByteRunAutomaton bra = new ByteRunAutomaton(automaton);

	assertTrue(cra.run(input));

	byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
	assertTrue(bra.run(bytes, 0, bytes.length)); // this one fails!
	}

	public void testSpecialCase3() throws Exception {
	RegExp re = new RegExp("(\\鯺)(.)\\Ӕ");
	String input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";
	Automaton automaton = re.toAutomaton();
	CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
	ByteRunAutomaton bra = new ByteRunAutomaton(automaton);

	assertTrue(cra.run(input));

	byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
	assertTrue(bra.run(bytes, 0, bytes.length));
	}

	public void testRandomRegexes() throws Exception {
	int num = atLeast(50);
	for (int i = 0; i < num; i++) {
	assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton());
	}
	}

	public void testSingleton() throws Exception {
	int iters = atLeast(100);
	for(int iter=0;iter<iters;iter++) {
	String s = TestUtil.randomRealisticUnicodeString(random());
	Automaton a = Automata.makeString(s);
	Automaton utf8 = new UTF32ToUTF8().convert(a);
	IntsRefBuilder ints = new IntsRefBuilder();
	Util.toIntsRef(new BytesRef(s), ints);
	Set<IntsRef> set = new HashSet<>();
	set.add(ints.get());
	assertEquals(set, TestOperations.getFiniteStrings(utf8));
	}
	}

	private void assertAutomaton(Automaton automaton) throws Exception {
	CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
	ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
	final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

	int num = atLeast(1000);
	for (int i = 0; i < num; i++) {
	final String string;
	if (random().nextBoolean()) {
	// likely not accepted
	string = TestUtil.randomUnicodeString(random());
	} else {
	// will be accepted
	int[] codepoints = ras.getRandomAcceptedString(random());
	try {
	string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
	} catch (Exception e) {
	System.out.println(codepoints.length + " codepoints:");
	for(int j=0;j<codepoints.length;j++) {
	System.out.println(" " + Integer.toHexString(codepoints[j]));
	}
	throw e;
	}
	}
	byte bytes[] = string.getBytes(StandardCharsets.UTF_8);
	assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length));
	}
	}
	}