| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.flink.table.runtime.util; |
| |
| import org.apache.flink.core.memory.MemorySegment; |
| |
| import java.io.UnsupportedEncodingException; |
| import java.util.Arrays; |
| |
| /** |
| * String utf-8 utils. |
| * |
| * <p>{@code StringUtf8Utils} refers to the implementation from SerializeWriter and IOUtils of Alibaba fastjson. |
| * The difference is that StringUtf8Utils need to handle the wrong code, just like StringCoding.decode. |
| */ |
| public class StringUtf8Utils { |
| |
| private static final int MAX_CHARS_LENGTH = 1024 * 32; |
| private static final int MAX_BYTES_LENGTH = 1024 * 64; |
| public static final int MAX_BYTES_PER_CHAR = 3; |
| |
| private static final ThreadLocal<char[]> CHARS_LOCAL = new ThreadLocal<>(); |
| private static final ThreadLocal<byte[]> BYTES_LOCAL = new ThreadLocal<>(); |
| |
| public static char[] allocateChars(int length) { |
| char[] chars = CHARS_LOCAL.get(); |
| |
| if (chars == null) { |
| if (length <= MAX_CHARS_LENGTH) { |
| chars = new char[MAX_CHARS_LENGTH]; |
| CHARS_LOCAL.set(chars); |
| } else { |
| chars = new char[length]; |
| } |
| } else if (chars.length < length) { |
| chars = new char[length]; |
| } |
| |
| return chars; |
| } |
| |
| public static byte[] allocateBytes(int length) { |
| byte[] bytes = BYTES_LOCAL.get(); |
| |
| if (bytes == null) { |
| if (length <= MAX_BYTES_LENGTH) { |
| bytes = new byte[MAX_BYTES_LENGTH]; |
| BYTES_LOCAL.set(bytes); |
| } else { |
| bytes = new byte[length]; |
| } |
| } else if (bytes.length < length) { |
| bytes = new byte[length]; |
| } |
| |
| return bytes; |
| } |
| |
| /** |
| * This method must have the same result with JDK's String.getBytes. |
| */ |
| public static byte[] encodeUTF8(String str) { |
| byte[] bytes = allocateBytes(str.length() * MAX_BYTES_PER_CHAR); |
| int len = encodeUTF8(str, bytes); |
| return Arrays.copyOf(bytes, len); |
| } |
| |
| public static int encodeUTF8(String str, byte[] bytes) { |
| int offset = 0; |
| int len = str.length(); |
| int sl = offset + len; |
| int dp = 0; |
| int dlASCII = dp + Math.min(len, bytes.length); |
| |
| // ASCII only optimized loop |
| while (dp < dlASCII && str.charAt(offset) < '\u0080') { |
| bytes[dp++] = (byte) str.charAt(offset++); |
| } |
| |
| while (offset < sl) { |
| char c = str.charAt(offset++); |
| if (c < 0x80) { |
| // Have at most seven bits |
| bytes[dp++] = (byte) c; |
| } else if (c < 0x800) { |
| // 2 bytes, 11 bits |
| bytes[dp++] = (byte) (0xc0 | (c >> 6)); |
| bytes[dp++] = (byte) (0x80 | (c & 0x3f)); |
| } else if (Character.isSurrogate(c)) { |
| final int uc; |
| int ip = offset - 1; |
| if (Character.isHighSurrogate(c)) { |
| if (sl - ip < 2) { |
| uc = -1; |
| } else { |
| char d = str.charAt(ip + 1); |
| if (Character.isLowSurrogate(d)) { |
| uc = Character.toCodePoint(c, d); |
| } else { |
| // for some illegal character |
| // the jdk will ignore the origin character and cast it to '?' |
| // this acts the same with jdk |
| return defaultEncodeUTF8(str, bytes); |
| } |
| } |
| } else { |
| if (Character.isLowSurrogate(c)) { |
| // for some illegal character |
| // the jdk will ignore the origin character and cast it to '?' |
| // this acts the same with jdk |
| return defaultEncodeUTF8(str, bytes); |
| } else { |
| uc = c; |
| } |
| } |
| |
| if (uc < 0) { |
| bytes[dp++] = (byte) '?'; |
| } else { |
| bytes[dp++] = (byte) (0xf0 | ((uc >> 18))); |
| bytes[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f)); |
| bytes[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f)); |
| bytes[dp++] = (byte) (0x80 | (uc & 0x3f)); |
| offset++; // 2 chars |
| } |
| } else { |
| // 3 bytes, 16 bits |
| bytes[dp++] = (byte) (0xe0 | ((c >> 12))); |
| bytes[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f)); |
| bytes[dp++] = (byte) (0x80 | (c & 0x3f)); |
| } |
| } |
| return dp; |
| } |
| |
| public static int defaultEncodeUTF8(String str, byte[] bytes) { |
| try { |
| byte[] buffer = str.getBytes("UTF-8"); |
| System.arraycopy(buffer, 0, bytes, 0, buffer.length); |
| return buffer.length; |
| } catch (UnsupportedEncodingException e) { |
| throw new RuntimeException("encodeUTF8 error", e); |
| } |
| } |
| |
| public static String decodeUTF8(byte[] input, int offset, int byteLen) { |
| char[] chars = allocateChars(byteLen); |
| int len = decodeUTF8Strict(input, offset, byteLen, chars); |
| if (len < 0) { |
| return defaultDecodeUTF8(input, offset, byteLen); |
| } |
| return new String(chars, 0, len); |
| } |
| |
| public static int decodeUTF8Strict(byte[] sa, int sp, int len, char[] da) { |
| final int sl = sp + len; |
| int dp = 0; |
| int dlASCII = Math.min(len, da.length); |
| |
| // ASCII only optimized loop |
| while (dp < dlASCII && sa[sp] >= 0) { |
| da[dp++] = (char) sa[sp++]; |
| } |
| |
| while (sp < sl) { |
| int b1 = sa[sp++]; |
| if (b1 >= 0) { |
| // 1 byte, 7 bits: 0xxxxxxx |
| da[dp++] = (char) b1; |
| } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { |
| // 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
| if (sp < sl) { |
| int b2 = sa[sp++]; |
| if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2) |
| return -1; |
| } else { |
| da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80))); |
| } |
| continue; |
| } |
| return -1; |
| } else if ((b1 >> 4) == -2) { |
| // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
| if (sp + 1 < sl) { |
| int b2 = sa[sp++]; |
| int b3 = sa[sp++]; |
| if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) |
| || (b2 & 0xc0) != 0x80 |
| || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3) |
| return -1; |
| } else { |
| char c = (char) ((b1 << 12) ^ (b2 << 6) ^ (b3 ^ |
| (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)))); |
| if (Character.isSurrogate(c)) { |
| return -1; |
| } else { |
| da[dp++] = c; |
| } |
| } |
| continue; |
| } |
| return -1; |
| } else if ((b1 >> 3) == -2) { |
| // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| if (sp + 2 < sl) { |
| int b2 = sa[sp++]; |
| int b3 = sa[sp++]; |
| int b4 = sa[sp++]; |
| int uc = ((b1 << 18) ^ |
| (b2 << 12) ^ |
| (b3 << 6) ^ |
| (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ |
| ((byte) 0x80 << 6) ^ ((byte) 0x80)))); |
| // isMalformed4 and shortest form check |
| if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80) |
| || !Character.isSupplementaryCodePoint(uc)) { |
| return -1; |
| } else { |
| da[dp++] = Character.highSurrogate(uc); |
| da[dp++] = Character.lowSurrogate(uc); |
| } |
| continue; |
| } |
| return -1; |
| } else { |
| return -1; |
| } |
| } |
| return dp; |
| } |
| |
| public static String decodeUTF8(MemorySegment input, int offset, int byteLen) { |
| char[] chars = allocateChars(byteLen); |
| int len = decodeUTF8Strict(input, offset, byteLen, chars); |
| if (len < 0) { |
| byte[] bytes = allocateBytes(byteLen); |
| input.get(offset, bytes, 0, byteLen); |
| return defaultDecodeUTF8(bytes, 0, byteLen); |
| } |
| return new String(chars, 0, len); |
| } |
| |
| public static int decodeUTF8Strict(MemorySegment segment, int sp, int len, char[] da) { |
| final int sl = sp + len; |
| int dp = 0; |
| int dlASCII = Math.min(len, da.length); |
| |
| // ASCII only optimized loop |
| while (dp < dlASCII && segment.get(sp) >= 0) { |
| da[dp++] = (char) segment.get(sp++); |
| } |
| |
| while (sp < sl) { |
| int b1 = segment.get(sp++); |
| if (b1 >= 0) { |
| // 1 byte, 7 bits: 0xxxxxxx |
| da[dp++] = (char) b1; |
| } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { |
| // 2 bytes, 11 bits: 110xxxxx 10xxxxxx |
| if (sp < sl) { |
| int b2 = segment.get(sp++); |
| if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2) |
| return -1; |
| } else { |
| da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80))); |
| } |
| continue; |
| } |
| return -1; |
| } else if ((b1 >> 4) == -2) { |
| // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx |
| if (sp + 1 < sl) { |
| int b2 = segment.get(sp++); |
| int b3 = segment.get(sp++); |
| if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) |
| || (b2 & 0xc0) != 0x80 |
| || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3) |
| return -1; |
| } else { |
| char c = (char) ((b1 << 12) ^ (b2 << 6) ^ (b3 ^ |
| (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)))); |
| if (Character.isSurrogate(c)) { |
| return -1; |
| } else { |
| da[dp++] = c; |
| } |
| } |
| continue; |
| } |
| return -1; |
| } else if ((b1 >> 3) == -2) { |
| // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| if (sp + 2 < sl) { |
| int b2 = segment.get(sp++); |
| int b3 = segment.get(sp++); |
| int b4 = segment.get(sp++); |
| int uc = ((b1 << 18) ^ |
| (b2 << 12) ^ |
| (b3 << 6) ^ |
| (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ |
| ((byte) 0x80 << 6) ^ ((byte) 0x80)))); |
| // isMalformed4 and shortest form check |
| if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80) |
| || !Character.isSupplementaryCodePoint(uc)) { |
| return -1; |
| } else { |
| da[dp++] = Character.highSurrogate(uc); |
| da[dp++] = Character.lowSurrogate(uc); |
| } |
| continue; |
| } |
| return -1; |
| } else { |
| return -1; |
| } |
| } |
| return dp; |
| } |
| |
| public static String defaultDecodeUTF8(byte[] bytes, int offset, int len) { |
| try { |
| return new String(bytes, offset, len, "UTF-8"); |
| } catch (UnsupportedEncodingException e) { |
| throw new RuntimeException("encodeUTF8 error", e); |
| } |
| } |
| } |