| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| |
| import java.nio.charset.StandardCharsets; |
| import java.util.HashSet; |
| import java.util.Random; |
| import java.util.Set; |
| |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.util.fst.Util; |
| |
| public class TestUTF32ToUTF8 extends LuceneTestCase { |
| |
| @Override |
| public void setUp() throws Exception { |
| super.setUp(); |
| } |
| |
| private static final int MAX_UNICODE = 0x10FFFF; |
| |
| private boolean matches(ByteRunAutomaton a, int code) { |
| char[] chars = Character.toChars(code); |
| byte[] b = new byte[UnicodeUtil.maxUTF8Length(chars.length)]; |
| final int len = UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, b); |
| return a.run(b, 0, len); |
| } |
| |
| private void testOne(Random r, ByteRunAutomaton a, int startCode, int endCode, int iters) { |
| |
| // Verify correct ints are accepted |
| final int nonSurrogateCount; |
| final boolean ovSurStart; |
| if (endCode < UnicodeUtil.UNI_SUR_HIGH_START || |
| startCode > UnicodeUtil.UNI_SUR_LOW_END) { |
| // no overlap w/ surrogates |
| nonSurrogateCount = endCode - startCode + 1; |
| ovSurStart = false; |
| } else if (isSurrogate(startCode)) { |
| // start of range overlaps surrogates |
| nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - startCode + 1); |
| ovSurStart = false; |
| } else if (isSurrogate(endCode)) { |
| // end of range overlaps surrogates |
| ovSurStart = true; |
| nonSurrogateCount = endCode - startCode + 1 - (endCode - UnicodeUtil.UNI_SUR_HIGH_START + 1); |
| } else { |
| // range completely subsumes surrogates |
| ovSurStart = true; |
| nonSurrogateCount = endCode - startCode + 1 - (UnicodeUtil.UNI_SUR_LOW_END - UnicodeUtil.UNI_SUR_HIGH_START + 1); |
| } |
| |
| assert nonSurrogateCount > 0; |
| |
| for(int iter=0;iter<iters;iter++) { |
| // pick random code point in-range |
| |
| int code = startCode + r.nextInt(nonSurrogateCount); |
| if (isSurrogate(code)) { |
| if (ovSurStart) { |
| code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - UnicodeUtil.UNI_SUR_HIGH_START); |
| } else { |
| code = UnicodeUtil.UNI_SUR_LOW_END + 1 + (code - startCode); |
| } |
| } |
| |
| assert code >= startCode && code <= endCode: "code=" + code + " start=" + startCode + " end=" + endCode; |
| assert !isSurrogate(code); |
| |
| assertTrue("DFA for range " + startCode + "-" + endCode + " failed to match code=" + code, |
| matches(a, code)); |
| } |
| |
| // Verify invalid ints are not accepted |
| final int invalidRange = MAX_UNICODE - (endCode - startCode + 1); |
| if (invalidRange > 0) { |
| for(int iter=0;iter<iters;iter++) { |
| int x = TestUtil.nextInt(r, 0, invalidRange - 1); |
| final int code; |
| if (x >= startCode) { |
| code = endCode + 1 + x - startCode; |
| } else { |
| code = x; |
| } |
| if ((code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_HIGH_END) | |
| (code >= UnicodeUtil.UNI_SUR_LOW_START && code <= UnicodeUtil.UNI_SUR_LOW_END)) { |
| iter--; |
| continue; |
| } |
| assertFalse("DFA for range " + startCode + "-" + endCode + " matched invalid code=" + code, |
| matches(a, code)); |
| |
| } |
| } |
| } |
| |
| // Evenly picks random code point from the 4 "buckets" |
| // (bucket = same #bytes when encoded to utf8) |
| private int getCodeStart(Random r) { |
| switch(r.nextInt(4)) { |
| case 0: |
| return TestUtil.nextInt(r, 0, 128); |
| case 1: |
| return TestUtil.nextInt(r, 128, 2048); |
| case 2: |
| return TestUtil.nextInt(r, 2048, 65536); |
| default: |
| return TestUtil.nextInt(r, 65536, 1 + MAX_UNICODE); |
| } |
| } |
| |
| private static boolean isSurrogate(int code) { |
| return code >= UnicodeUtil.UNI_SUR_HIGH_START && code <= UnicodeUtil.UNI_SUR_LOW_END; |
| } |
| |
| public void testRandomRanges() throws Exception { |
| final Random r = random(); |
| int ITERS = atLeast(10); |
| int ITERS_PER_DFA = atLeast(100); |
| for(int iter=0;iter<ITERS;iter++) { |
| int x1 = getCodeStart(r); |
| int x2 = getCodeStart(r); |
| final int startCode, endCode; |
| |
| if (x1 < x2) { |
| startCode = x1; |
| endCode = x2; |
| } else { |
| startCode = x2; |
| endCode = x1; |
| } |
| |
| if (isSurrogate(startCode) && isSurrogate(endCode)) { |
| iter--; |
| continue; |
| } |
| |
| Automaton a = Automata.makeCharRange(startCode, endCode); |
| testOne(r, new ByteRunAutomaton(a), startCode, endCode, ITERS_PER_DFA); |
| } |
| } |
| |
| public void testSpecialCase() { |
| RegExp re = new RegExp(".?"); |
| Automaton automaton = re.toAutomaton(); |
| CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); |
| ByteRunAutomaton bra = new ByteRunAutomaton(automaton); |
| // make sure character dfa accepts empty string |
| assertTrue(cra.isAccept(0)); |
| assertTrue(cra.run("")); |
| assertTrue(cra.run(new char[0], 0, 0)); |
| |
| // make sure byte dfa accepts empty string |
| assertTrue(bra.isAccept(0)); |
| assertTrue(bra.run(new byte[0], 0, 0)); |
| } |
| |
| public void testSpecialCase2() throws Exception { |
| RegExp re = new RegExp(".+\u0775"); |
| String input = "\ufadc\ufffd\ub80b\uda5a\udc68\uf234\u0056\uda5b\udcc1\ufffd\ufffd\u0775"; |
| Automaton automaton = re.toAutomaton(); |
| CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); |
| ByteRunAutomaton bra = new ByteRunAutomaton(automaton); |
| |
| assertTrue(cra.run(input)); |
| |
| byte[] bytes = input.getBytes(StandardCharsets.UTF_8); |
| assertTrue(bra.run(bytes, 0, bytes.length)); // this one fails! |
| } |
| |
| public void testSpecialCase3() throws Exception { |
| RegExp re = new RegExp("(\\鯺)*(.)*\\Ӕ"); |
| String input = "\u5cfd\ufffd\ub2f7\u0033\ue304\u51d7\u3692\udb50\udfb3\u0576\udae2\udc62\u0053\u0449\u04d4"; |
| Automaton automaton = re.toAutomaton(); |
| CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); |
| ByteRunAutomaton bra = new ByteRunAutomaton(automaton); |
| |
| assertTrue(cra.run(input)); |
| |
| byte[] bytes = input.getBytes(StandardCharsets.UTF_8); |
| assertTrue(bra.run(bytes, 0, bytes.length)); |
| } |
| |
| public void testRandomRegexes() throws Exception { |
| int num = atLeast(50); |
| for (int i = 0; i < num; i++) { |
| assertAutomaton(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton()); |
| } |
| } |
| |
| public void testSingleton() throws Exception { |
| int iters = atLeast(100); |
| for(int iter=0;iter<iters;iter++) { |
| String s = TestUtil.randomRealisticUnicodeString(random()); |
| Automaton a = Automata.makeString(s); |
| Automaton utf8 = new UTF32ToUTF8().convert(a); |
| IntsRefBuilder ints = new IntsRefBuilder(); |
| Util.toIntsRef(new BytesRef(s), ints); |
| Set<IntsRef> set = new HashSet<>(); |
| set.add(ints.get()); |
| assertEquals(set, TestOperations.getFiniteStrings(utf8)); |
| } |
| } |
| |
| private void assertAutomaton(Automaton automaton) throws Exception { |
| CharacterRunAutomaton cra = new CharacterRunAutomaton(automaton); |
| ByteRunAutomaton bra = new ByteRunAutomaton(automaton); |
| final AutomatonTestUtil.RandomAcceptedStrings ras = new AutomatonTestUtil.RandomAcceptedStrings(automaton); |
| |
| int num = atLeast(1000); |
| for (int i = 0; i < num; i++) { |
| final String string; |
| if (random().nextBoolean()) { |
| // likely not accepted |
| string = TestUtil.randomUnicodeString(random()); |
| } else { |
| // will be accepted |
| int[] codepoints = ras.getRandomAcceptedString(random()); |
| try { |
| string = UnicodeUtil.newString(codepoints, 0, codepoints.length); |
| } catch (Exception e) { |
| System.out.println(codepoints.length + " codepoints:"); |
| for(int j=0;j<codepoints.length;j++) { |
| System.out.println(" " + Integer.toHexString(codepoints[j])); |
| } |
| throw e; |
| } |
| } |
| byte bytes[] = string.getBytes(StandardCharsets.UTF_8); |
| assertEquals(cra.run(string), bra.run(bytes, 0, bytes.length)); |
| } |
| } |
| } |