| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| |
| import java.util.Arrays; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| // TODO |
| // - do we really need the .bits...? if not we can make util in UnicodeUtil to convert 1 char into a BytesRef |
| |
| /** |
| * Converts UTF-32 automata to the equivalent UTF-8 representation. |
| * @lucene.internal |
| */ |
| public final class UTF32ToUTF8 { |
| |
| // Unicode boundaries for UTF8 bytes 1,2,3,4 |
| private static final int[] startCodes = new int[] {0, 128, 2048, 65536}; |
| private static final int[] endCodes = new int[] {127, 2047, 65535, 1114111}; |
| |
| static int[] MASKS = new int[32]; |
| static { |
| int v = 2; |
| for(int i=0;i<32;i++) { |
| MASKS[i] = v-1; |
| v *= 2; |
| } |
| } |
| |
| // Represents one of the N utf8 bytes that (in sequence) |
| // define a code point. value is the byte value; bits is |
| // how many bits are "used" by utf8 at that byte |
| private static class UTF8Byte { |
| int value; // TODO: change to byte |
| byte bits; |
| } |
| |
| // Holds a single code point, as a sequence of 1-4 utf8 bytes: |
| // TODO: maybe move to UnicodeUtil? |
| private static class UTF8Sequence { |
| private final UTF8Byte[] bytes; |
| private int len; |
| |
| public UTF8Sequence() { |
| bytes = new UTF8Byte[4]; |
| for(int i=0;i<4;i++) { |
| bytes[i] = new UTF8Byte(); |
| } |
| } |
| |
| public int byteAt(int idx) { |
| return bytes[idx].value; |
| } |
| |
| public int numBits(int idx) { |
| return bytes[idx].bits; |
| } |
| |
| private void set(int code) { |
| if (code < 128) { |
| // 0xxxxxxx |
| bytes[0].value = code; |
| bytes[0].bits = 7; |
| len = 1; |
| } else if (code < 2048) { |
| // 110yyyxx 10xxxxxx |
| bytes[0].value = (6 << 5) | (code >> 6); |
| bytes[0].bits = 5; |
| setRest(code, 1); |
| len = 2; |
| } else if (code < 65536) { |
| // 1110yyyy 10yyyyxx 10xxxxxx |
| bytes[0].value = (14 << 4) | (code >> 12); |
| bytes[0].bits = 4; |
| setRest(code, 2); |
| len = 3; |
| } else { |
| // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx |
| bytes[0].value = (30 << 3) | (code >> 18); |
| bytes[0].bits = 3; |
| setRest(code, 3); |
| len = 4; |
| } |
| } |
| |
| private void setRest(int code, int numBytes) { |
| for(int i=0;i<numBytes;i++) { |
| bytes[numBytes-i].value = 128 | (code & MASKS[5]); |
| bytes[numBytes-i].bits = 6; |
| code = code >> 6; |
| } |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder b = new StringBuilder(); |
| for(int i=0;i<len;i++) { |
| if (i > 0) { |
| b.append(' '); |
| } |
| b.append(Integer.toBinaryString(bytes[i].value)); |
| } |
| return b.toString(); |
| } |
| } |
| |
| /** Sole constructor. */ |
| public UTF32ToUTF8() { |
| } |
| |
| private final UTF8Sequence startUTF8 = new UTF8Sequence(); |
| private final UTF8Sequence endUTF8 = new UTF8Sequence(); |
| |
| private final UTF8Sequence tmpUTF8a = new UTF8Sequence(); |
| private final UTF8Sequence tmpUTF8b = new UTF8Sequence(); |
| |
| // Builds necessary utf8 edges between start & end |
| void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) { |
| startUTF8.set(startCodePoint); |
| endUTF8.set(endCodePoint); |
| build(start, end, startUTF8, endUTF8, 0); |
| } |
| |
| private void build(int start, int end, UTF8Sequence startUTF8, UTF8Sequence endUTF8, int upto) { |
| |
| // Break into start, middle, end: |
| if (startUTF8.byteAt(upto) == endUTF8.byteAt(upto)) { |
| // Degen case: lead with the same byte: |
| if (upto == startUTF8.len-1 && upto == endUTF8.len-1) { |
| // Super degen: just single edge, one UTF8 byte: |
| utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto)); |
| return; |
| } else { |
| assert startUTF8.len > upto+1; |
| assert endUTF8.len > upto+1; |
| int n = utf8.createState(); |
| |
| // Single value leading edge |
| utf8.addTransition(start, n, startUTF8.byteAt(upto)); |
| //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=single |
| |
| // Recurse for the rest |
| build(n, end, startUTF8, endUTF8, 1+upto); |
| } |
| } else if (startUTF8.len == endUTF8.len) { |
| if (upto == startUTF8.len-1) { |
| //start.addTransition(new Transition(startUTF8.byteAt(upto), endUTF8.byteAt(upto), end)); // type=startend |
| utf8.addTransition(start, end, startUTF8.byteAt(upto), endUTF8.byteAt(upto)); |
| } else { |
| start(start, end, startUTF8, upto, false); |
| if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { |
| // There is a middle |
| all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); |
| } |
| end(start, end, endUTF8, upto, false); |
| } |
| } else { |
| |
| // start |
| start(start, end, startUTF8, upto, true); |
| |
| // possibly middle, spanning multiple num bytes |
| int byteCount = 1+startUTF8.len-upto; |
| final int limit = endUTF8.len-upto; |
| while (byteCount < limit) { |
| // wasteful: we only need first byte, and, we should |
| // statically encode this first byte: |
| tmpUTF8a.set(startCodes[byteCount-1]); |
| tmpUTF8b.set(endCodes[byteCount-1]); |
| all(start, end, |
| tmpUTF8a.byteAt(0), |
| tmpUTF8b.byteAt(0), |
| tmpUTF8a.len - 1); |
| byteCount++; |
| } |
| |
| // end |
| end(start, end, endUTF8, upto, true); |
| } |
| } |
| |
| private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { |
| if (upto == startUTF8.len-1) { |
| // Done recursing |
| utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start |
| //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start |
| } else { |
| int n = utf8.createState(); |
| utf8.addTransition(start, n, startUTF8.byteAt(upto)); |
| //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start |
| start(n, end, startUTF8, 1+upto, true); |
| int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; |
| if (doAll && startUTF8.byteAt(upto) != endCode) { |
| all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); |
| } |
| } |
| } |
| |
| private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { |
| if (upto == endUTF8.len-1) { |
| // Done recursing |
| //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end |
| utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); |
| } else { |
| final int startCode; |
| if (endUTF8.numBits(upto) == 5) { |
| // special case -- avoid created unused edges (endUTF8 |
| // doesn't accept certain byte sequences) -- there |
| // are other cases we could optimize too: |
| startCode = 194; |
| } else { |
| startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); |
| } |
| if (doAll && endUTF8.byteAt(upto) != startCode) { |
| all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); |
| } |
| int n = utf8.createState(); |
| //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end |
| utf8.addTransition(start, n, endUTF8.byteAt(upto)); |
| end(n, end, endUTF8, 1+upto, true); |
| } |
| } |
| |
| private void all(int start, int end, int startCode, int endCode, int left) { |
| if (left == 0) { |
| //start.addTransition(new Transition(startCode, endCode, end)); // type=all |
| utf8.addTransition(start, end, startCode, endCode); |
| } else { |
| int lastN = utf8.createState(); |
| //start.addTransition(new Transition(startCode, endCode, lastN)); // type=all |
| utf8.addTransition(start, lastN, startCode, endCode); |
| while (left > 1) { |
| int n = utf8.createState(); |
| //lastN.addTransition(new Transition(128, 191, n)); // type=all* |
| utf8.addTransition(lastN, n, 128, 191); // type=all* |
| left--; |
| lastN = n; |
| } |
| //lastN.addTransition(new Transition(128, 191, end)); // type = all* |
| utf8.addTransition(lastN, end, 128, 191); // type = all* |
| } |
| } |
| |
| Automaton.Builder utf8; |
| |
| /** Converts an incoming utf32 automaton to an equivalent |
| * utf8 one. The incoming automaton need not be |
| * deterministic. Note that the returned automaton will |
| * not in general be deterministic, so you must |
| * determinize it if that's needed. */ |
| public Automaton convert(Automaton utf32) { |
| if (utf32.getNumStates() == 0) { |
| return utf32; |
| } |
| |
| int[] map = new int[utf32.getNumStates()]; |
| Arrays.fill(map, -1); |
| |
| List<Integer> pending = new ArrayList<>(); |
| int utf32State = 0; |
| pending.add(utf32State); |
| utf8 = new Automaton.Builder(); |
| |
| int utf8State = utf8.createState(); |
| |
| utf8.setAccept(utf8State, utf32.isAccept(utf32State)); |
| |
| map[utf32State] = utf8State; |
| |
| Transition scratch = new Transition(); |
| |
| while (pending.size() != 0) { |
| utf32State = pending.remove(pending.size()-1); |
| utf8State = map[utf32State]; |
| assert utf8State != -1; |
| |
| int numTransitions = utf32.getNumTransitions(utf32State); |
| utf32.initTransition(utf32State, scratch); |
| for(int i=0;i<numTransitions;i++) { |
| utf32.getNextTransition(scratch); |
| int destUTF32 = scratch.dest; |
| int destUTF8 = map[destUTF32]; |
| if (destUTF8 == -1) { |
| destUTF8 = utf8.createState(); |
| utf8.setAccept(destUTF8, utf32.isAccept(destUTF32)); |
| map[destUTF32] = destUTF8; |
| pending.add(destUTF32); |
| } |
| |
| // Writes new transitions into pendingTransitions: |
| convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max); |
| } |
| } |
| |
| return utf8.finish(); |
| } |
| } |