/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.automaton;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.Util;
public class TestUTF32ToUTF8 extends LuceneTestCase {
public void setUp() throws Exception {
private static final int MAX_UNICODE = 0x10FFFF;
private boolean matches(ByteRunAutomaton a, int code) {
char[] chars = Character.toChars(code);
byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)];
final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b);
return, 0, len);
private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) {
// Verify correct ints are accepted
final int nonSurrogateCount;
final boolean ovSurStart;
if (endCode < UnicodeUtil.UNI_SUR_HIGH_START ||
startCode > UnicodeUtil.UNI_SUR_LOW_END) {
// no overlap w/ surrogates
nonSurrogateCount = endCode - startCode + 1;
ovSurStart = false;
} else if (isSurrogate(startCode)) {
// start of range overlaps surrogates
nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1);
ovSurStart = false;
} else if (isSurrogate(endCode)) {
// end of range overlaps surrogates
ovSurStart = true;
nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1);
} else {
// range completely subsumes surrogates
ovSurStart = true;
nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1);
assert nonSurrogateCount > 0;
for(int iter=0;iter<iters;iter++) {
// pick random code point in-range
int code = startCode + r.nextInt(nonSurrogateCount);
if (isSurrogate(code)) {
if (ovSurStart) {
code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START);
} else {
code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode);
assert code >= startCode && code <= endCode: "code=" + code + " start=" + startCode + " end=" + endCode;
assert !isSurrogate(code);
assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code,
matches(a, code));
// Verify invalid ints are not accepted
final int invalidRange = MAX_UNICODE - (endCode - startCode + 1);
if (invalidRange > 0) {
for(int iter=0;iter<iters;iter++) {
int x = TestUtil.nextInt(r, 0, invalidRange - 1);
final int code;
if (x >= startCode) {
code = endCode + 1 + x - startCode;
} else {
code = x;
if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) |
(code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) {
assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code,
matches(a, code));
// Evenly picks random code point from the 4 "buckets"
// (bucket = same #bytes when encoded to utf8)
private int getCodeStart(Random r) {
switch(r.nextInt(4)) {
case 0:
return TestUtil.nextInt(r, 0, 128);
case 1:
return TestUtil.nextInt(r, 128, 2048);
case 2:
return TestUtil.nextInt(r, 2048, 65536);
return TestUtil.nextInt(r, 65536, 1 + MAX_UNICODE);
private static boolean isSurrogate(int code) {
return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END;
public void testRandomRanges() throws Exception {
final Random r = random();
int ITERS = atLeast(10);
int ITERS_PER_DFA = atLeast(100);
for(int iter=0;iter<ITERS;iter++) {
int x1 = getCodeStart(r);
int x2 = getCodeStart(r);
final int startCode, endCode;
if (x1 < x2) {
startCode = x1;
endCode = x2;
} else {
startCode = x2;
endCode = x1;
if (isSurrogate(startCode) && isSurrogate(endCode)) {
Automaton a = Automata.makeCharRange(startCode, endCode);
testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA);
public void testSpecialCase() {
RegExp re = new RegExp(".?");
Automaton automaton = re.toAutomaton();
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
// make sure character dfa accepts empty string
assertTrue( char[0], 0, 0));
// make sure byte dfa accepts empty string
assertTrue( byte[0], 0, 0));
public void testSpecialCase2() throws Exception {
RegExp re = new RegExp(".+\u0775");
String input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775";
Automaton automaton = re.toAutomaton();
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
assertTrue(, 0, bytes.length)); // this one fails!
public void testSpecialCase3() throws Exception {
RegExp re = new RegExp("(\\鯺)*(.)*\\Ӕ");
String input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4";
Automaton automaton = re.toAutomaton();
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
assertTrue(, 0, bytes.length));
public void testRandomRegexes() throws Exception {
int num = atLeast(50);
for (int i = 0; i < num; i++) {
assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton());
public void testSingleton() throws Exception {
int iters = atLeast(100);
for(int iter=0;iter<iters;iter++) {
String s = TestUtil.randomRealisticUnicodeString(random());
Automaton a = Automata.makeString(s);
Automaton utf8 = new UTF32ToUTF8().convert(a);
IntsRefBuilder ints = new IntsRefBuilder();
Util.toIntsRef(new BytesRef(s), ints);
Set<IntsRef> set = new HashSet<>();
assertEquals(set, TestOperations.getFiniteStrings(utf8));
private void assertAutomaton(Automaton automaton) throws Exception {
CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton);
ByteRunAutomaton bra = new ByteRunAutomaton(automaton);
final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton);
int num = atLeast(1000);
for (int i = 0; i < num; i++) {
final String string;
if (random().nextBoolean()) {
// likely not accepted
string = TestUtil.randomUnicodeString(random());
} else {
// will be accepted
int[] codepoints = ras.getRandomAcceptedString(random());
try {
string = UnicodeUtil.newString(codepoints, 0, codepoints.length);
} catch (Exception e) {
System.out.println(codepoints.length + " codepoints:");
for(int j=0;j<codepoints.length;j++) {
System.out.println(" " + Integer.toHexString(codepoints[j]));
throw e;
byte bytes[] = string.getBytes(StandardCharsets.UTF_8);
assertEquals(,, 0, bytes.length));