| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.tajo.util; |
| |
| import io.netty.buffer.ByteBuf; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| |
| import static io.netty.util.internal.StringUtil.isSurrogate; |
| |
| /** |
| * Extra utilities for bytes |
| */ |
| public class BytesUtils { |
| |
| /** |
| * Parse the first byte of a vint/vlong to determine the number of bytes |
| * @param value the first byte of the vint/vlong |
| * @return the total number of bytes (1 to 9) |
| */ |
| public static int decodeVIntSize(byte value) { |
| if (value >= -112) { |
| return 1; |
| } else if (value < -120) { |
| return -119 - value; |
| } |
| return -111 - value; |
| } |
| |
| /** |
| * @param n Long to make a VLong of. |
| * @return VLong as bytes array. |
| */ |
| public static byte[] vlongToBytes(long n) { |
| byte [] result; |
| int offset = 0; |
| if (n >= -112 && n <= 127) { |
| result = new byte[1]; |
| result[offset] = (byte) n; |
| return result; |
| } |
| |
| int len = -112; |
| if (n < 0) { |
| n ^= -1L; // take one's complement' |
| len = -120; |
| } |
| |
| long tmp = n; |
| while (tmp != 0) { |
| tmp = tmp >> 8; |
| len--; |
| } |
| |
| int size = decodeVIntSize((byte) len); |
| |
| result = new byte[size]; |
| result[offset++] = (byte) len; |
| len = (len < -120) ? -(len + 120) : -(len + 112); |
| |
| for (int idx = len; idx != 0; idx--) { |
| int shiftbits = (idx - 1) * 8; |
| long mask = 0xFFL << shiftbits; |
| result[offset++] = (byte)((n & mask) >> shiftbits); |
| } |
| return result; |
| } |
| |
| public static void writeVLong(ByteArrayOutputStream byteStream, long l) { |
| byte[] vLongBytes = vlongToBytes(l); |
| byteStream.write(vLongBytes, 0, vLongBytes.length); |
| } |
| |
| /** |
| * Converts a char array to a ascii byte array. |
| * |
| * @param chars string |
| * @return the byte array |
| */ |
| static byte[] toASCIIBytes(char[] chars) { |
| byte[] buffer = new byte[chars.length]; |
| for (int i = 0; i < chars.length; i++) { |
| buffer[i] = (byte) chars[i]; |
| } |
| return buffer; |
| } |
| |
| public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) { |
| return splitWorker(str, 0, -1, separatorChar, target, numColumns); |
| } |
| |
| public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { |
| return splitWorker(str, offset, length, separator, target, numColumns); |
| } |
| |
| public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) { |
| return splitWorker(str, 0, -1, separatorChar, null, numColumns); |
| } |
| |
| private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar, |
| int[] target, int numColumns) { |
| return splitWorker(str, offset, length, new byte[] {(byte)separatorChar}, target, numColumns); |
| } |
| |
| /** |
| * Performs the logic for the <code>split</code> and |
| * <code>splitPreserveAllTokens</code> methods that do not return a |
| * maximum array length. |
| * |
| * @param str the String to parse, may be <code>null</code> |
| * @param length amount of bytes to str |
| * @param separator the ascii separate characters |
| * @param target the projection target |
| * @param numColumns number of columns to be retrieved |
| * @return an array of parsed Strings, <code>null</code> if null String input |
| */ |
| private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { |
| if (str == null) { |
| return null; |
| } |
| if (length == 0) { |
| return new byte[numColumns][0]; |
| } |
| if (length < 0) { |
| length = str.length - offset; |
| } |
| int indexMax = 0; |
| if (target != null) { |
| for (int index : target) { |
| indexMax = Math.max(indexMax, index + 1); |
| } |
| } else { |
| indexMax = numColumns; |
| } |
| |
| int[][] indices = split(str, offset, length, separator, new int[indexMax][]); |
| byte[][] result = new byte[numColumns][]; |
| |
| // not-picked -> null, picked but not-exists -> byte[0] |
| if (target != null) { |
| for (int i : target) { |
| int[] index = indices[i]; |
| result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); |
| } |
| } else { |
| for (int i = 0; i < result.length; i++) { |
| int[] index = indices[i]; |
| result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); |
| } |
| } |
| return result; |
| } |
| |
| public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) { |
| if (indices.length == 0) { |
| return indices; // trivial |
| } |
| final int limit = offset + length; |
| |
| int start = offset; |
| int colIndex = 0; |
| for (int index = offset; index < limit;) { |
| if (onDelimiter(str, index, limit, separator)) { |
| indices[colIndex++] = new int[] {start, index}; |
| if (colIndex >= indices.length) { |
| return indices; |
| } |
| index += separator.length; |
| start = index; |
| } else { |
| index++; |
| } |
| } |
| if (colIndex < indices.length) { |
| indices[colIndex] = new int[]{start, limit}; |
| } |
| return indices; |
| } |
| |
| private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) { |
| for (int i = 0; i < delimiter.length; i++) { |
| if (offset + i >= limit || input[offset + i] != delimiter[i]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| public static byte[][] splitTrivial(byte[] value, byte delimiter) { |
| List<byte[]> split = new ArrayList<>(); |
| int prev = 0; |
| for (int i = 0; i < value.length; i++) { |
| if (value[i] == delimiter) { |
| split.add(Arrays.copyOfRange(value, prev, i)); |
| prev = i + 1; |
| } |
| } |
| if (prev <= value.length) { |
| split.add(Arrays.copyOfRange(value, prev, value.length)); |
| } |
| return split.toArray(new byte[split.size()][]); |
| } |
| |
| /** |
| * It gets the maximum length among all given the array of bytes. |
| * Then, it adds padding (i.e., \0) to byte arrays which are shorter |
| * than the maximum length. |
| * |
| * @param bytes Byte arrays to be padded |
| * @return The array of padded bytes |
| */ |
| public static byte[][] padBytes(byte []...bytes) { |
| byte [][] padded = new byte[bytes.length][]; |
| |
| int maxLen = Integer.MIN_VALUE; |
| |
| for (byte[] aByte : bytes) { |
| maxLen = Math.max(maxLen, aByte.length); |
| } |
| |
| for (int i = 0; i < bytes.length; i++) { |
| int padLen = maxLen - bytes[i].length; |
| if (padLen == 0) { |
| padded[i] = bytes[i]; |
| } else if (padLen > 0) { |
| padded[i] = Bytes.padTail(bytes[i], padLen); |
| } else { |
| throw new RuntimeException("maximum length: " + maxLen + ", bytes[" + i + "].length:" + bytes[i].length); |
| } |
| } |
| |
| return padded; |
| } |
| |
| public static byte [] trimBytes(byte [] bytes) { |
| return new String(bytes).trim().getBytes(); |
| } |
| |
| /** |
| * this is an implementation copied from ByteBufUtil in netty4 |
| */ |
| public static int writeUtf8(ByteBuf buffer, char[] chars, boolean ignoreSurrogate) { |
| int oldWriterIndex = buffer.writerIndex(); |
| int writerIndex = oldWriterIndex; |
| |
| // We can use the _set methods as these not need to do any index checks and reference checks. |
| // This is possible as we called ensureWritable(...) before. |
| for (int i = 0; i < chars.length; i++) { |
| char c = chars[i]; |
| if (c < 0x80) { |
| buffer.setByte(writerIndex++, (byte) c); |
| } else if (c < 0x800) { |
| buffer.setByte(writerIndex++, (byte) (0xc0 | (c >> 6))); |
| buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f))); |
| } else if (!ignoreSurrogate && isSurrogate(c)) { |
| if (!Character.isHighSurrogate(c)) { |
| throw new IllegalArgumentException("Invalid encoding. " + |
| "Expected high (leading) surrogate at index " + i + " but got " + c); |
| } |
| final char c2; |
| try { |
| // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to avoid |
| // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we will |
| // re-throw a more informative exception describing the problem. |
| c2 = chars[++i]; |
| } catch (IndexOutOfBoundsException e) { |
| throw new IllegalArgumentException("Underflow. " + |
| "Expected low (trailing) surrogate at index " + i + " but no more characters found.", e); |
| } |
| if (!Character.isLowSurrogate(c2)) { |
| throw new IllegalArgumentException("Invalid encoding. " + |
| "Expected low (trailing) surrogate at index " + i + " but got " + c2); |
| } |
| int codePoint = Character.toCodePoint(c, c2); |
| // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630. |
| buffer.setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18))); |
| buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f))); |
| buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f))); |
| buffer.setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f))); |
| } else { |
| buffer.setByte(writerIndex++, (byte) (0xe0 | (c >> 12))); |
| buffer.setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f))); |
| buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f))); |
| } |
| } |
| // update the writerIndex without any extra checks for performance reasons |
| buffer.writerIndex(writerIndex); |
| return writerIndex - oldWriterIndex; |
| } |
| } |