| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with this |
| * work for additional information regarding copyright ownership. The ASF |
| * licenses this file to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| * License for the specific language governing permissions and limitations under |
| * the License. |
| */ |
| package org.apache.hadoop.yarn.server.timelineservice.storage.common; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.hadoop.hbase.util.Bytes; |
| |
| /** |
| * Used to separate row qualifiers, column qualifiers and compound fields. |
| */ |
| public enum Separator { |
| |
| /** |
| * separator in key or column qualifier fields. |
| */ |
| QUALIFIERS("!", "%0$"), |
| |
| /** |
| * separator in values, and/or compound key/column qualifier fields. |
| */ |
| VALUES("=", "%1$"), |
| |
| /** |
| * separator in values, often used to avoid having these in qualifiers and |
| * names. Note that if we use HTML form encoding through URLEncoder, we end up |
| * getting a + for a space, which may already occur in strings, so we don't |
| * want that. |
| */ |
| SPACE(" ", "%2$"), |
| |
| /** |
| * separator in values, often used to avoid having these in qualifiers and |
| * names. |
| */ |
| TAB("\t", "%3$"); |
| |
| // a reserved character that starts each of the encoded values and is encoded |
| // first in order to escape naturally occurring instances of encoded values |
| // although it can be expressed as an enum instance, we define them as private |
| // variables to hide it from callers |
| private static final String PERCENT = "%"; |
| private static final String PERCENT_ENCODED = "%9$"; |
| |
| private static final Pattern PERCENT_PATTERN = |
| Pattern.compile(PERCENT, Pattern.LITERAL); |
| private static final String PERCENT_REPLACEMENT = |
| Matcher.quoteReplacement(PERCENT); |
| |
| private static final Pattern PERCENT_ENCODED_PATTERN = |
| Pattern.compile(PERCENT_ENCODED, Pattern.LITERAL); |
| private static final String PERCENT_ENCODED_REPLACEMENT = |
| Matcher.quoteReplacement(PERCENT_ENCODED); |
| |
| /** |
| * The string value of this separator. |
| */ |
| private final String value; |
| |
| /** |
| * The bye representation of value. |
| */ |
| private final byte[] bytes; |
| |
| // pre-compiled patterns and quoted replacements for optimization |
| private final Pattern valuePattern; |
| private final String valueReplacement; |
| |
| private final Pattern encodedValuePattern; |
| private final String encodedValueReplacement; |
| |
| /** |
| * Indicator for variable size of an individual segment in a split. The |
| * segment ends wherever separator is encountered. |
| * Typically used for string. |
| * Also used to indicate that there is no fixed number of splits which need to |
| * be returned. If split limit is specified as this, all possible splits are |
| * returned. |
| */ |
| public static final int VARIABLE_SIZE = 0; |
| |
| |
| /** empty string. */ |
| public static final String EMPTY_STRING = ""; |
| |
| /** empty bytes. */ |
| public static final byte[] EMPTY_BYTES = new byte[0]; |
| |
| /** |
| * @param value of the separator to use. Cannot be null or empty string. |
| * @param encodedValue choose something that isn't likely to occur in the data |
| * itself. Cannot be null or empty string. |
| */ |
| private Separator(String value, String encodedValue) { |
| this.value = value; |
| |
| // validation |
| if (value == null || value.length() == 0 || encodedValue == null |
| || encodedValue.length() == 0) { |
| throw new IllegalArgumentException( |
| "Cannot create separator from null or empty string."); |
| } |
| |
| this.bytes = Bytes.toBytes(value); |
| this.valuePattern = Pattern.compile(value, Pattern.LITERAL); |
| this.valueReplacement = Matcher.quoteReplacement(value); |
| |
| this.encodedValuePattern = Pattern.compile(encodedValue, Pattern.LITERAL); |
| this.encodedValueReplacement = Matcher.quoteReplacement(encodedValue); |
| } |
| |
| /** |
| * @return the original value of the separator |
| */ |
| public String getValue() { |
| return value; |
| } |
| |
| /** |
| * Used to make token safe to be used with this separator without collisions. |
| * It <em>must</em> be paired with {@link #decode(String)} for it to be |
| * decoded correctly. |
| * <p> |
| * If you need to encode a given string for multiple separators, |
| * {@link #encode(String, Separator...)} should be used over successive |
| * invocations of this method. It will result in a more compact version of the |
| * encoded value. |
| * |
| * @param token Token to be encoded. |
| * @return the token with any occurrences of this separator URLEncoded. |
| */ |
| public String encode(String token) { |
| if (token == null || token.length() == 0) { |
| // Nothing to replace |
| return token; |
| } |
| // first encode the percent to escape naturally occurring encoded values |
| String escaped = encodePercent(token); |
| return encodeSingle(escaped, this); |
| } |
| |
| private static String replace(String token, Pattern pattern, |
| String replacement) { |
| return pattern.matcher(token).replaceAll(replacement); |
| } |
| |
| private static String encodeSingle(String token, Separator separator) { |
| return replace(token, separator.valuePattern, |
| separator.encodedValueReplacement); |
| } |
| |
| private static String encodePercent(String token) { |
| return replace(token, PERCENT_PATTERN, PERCENT_ENCODED_REPLACEMENT); |
| } |
| |
| /** |
| * Decode the token encoded using {@link #encode(String)}. It <em>must</em> be |
| * used for the result encoded with {@link #encode(String)} to be able to |
| * recover the original. |
| * |
| * @param token Token to be decoded. |
| * @return the token with any occurrences of the encoded separator replaced by |
| * the separator itself. |
| */ |
| public String decode(String token) { |
| if (token == null || token.length() == 0) { |
| // Nothing to replace |
| return token; |
| } |
| String escaped = decodeSingle(token, this); |
| // decode percent to de-escape |
| return decodePercent(escaped); |
| } |
| |
| private static String decodeSingle(String token, Separator separator) { |
| return replace(token, separator.encodedValuePattern, |
| separator.valueReplacement); |
| } |
| |
| private static String decodePercent(String token) { |
| return replace(token, PERCENT_ENCODED_PATTERN, PERCENT_REPLACEMENT); |
| } |
| |
| /** |
| * Encode the given separators in the token with their encoding equivalents. |
| * It <em>must</em> be paired with {@link #decode(byte[], Separator...)} or |
| * {@link #decode(String, Separator...)} with the same separators for it to be |
| * decoded correctly. |
| * <p> |
| * If you need to encode a given string for multiple separators, this form of |
| * encoding should be used over successive invocations of |
| * {@link #encode(String)}. It will result in a more compact version of the |
| * encoded value. |
| * |
| * @param token containing possible separators that need to be encoded. |
| * @param separators to be encoded in the token with their URLEncoding |
| * equivalent. |
| * @return non-null byte representation of the token with occurrences of the |
| * separators encoded. |
| */ |
| public static byte[] encode(String token, Separator... separators) { |
| if (token == null || token.length() == 0) { |
| return EMPTY_BYTES; |
| } |
| String result = token; |
| // first encode the percent to escape naturally occurring encoded values |
| result = encodePercent(token); |
| for (Separator separator : separators) { |
| if (separator != null) { |
| result = encodeSingle(result, separator); |
| } |
| } |
| return Bytes.toBytes(result); |
| } |
| |
| /** |
| * Decode the given separators in the token with their decoding equivalents. |
| * It <em>must</em> be used for the result encoded with |
| * {@link #encode(String, Separator...)} with the same separators to be able |
| * to recover the original. |
| * |
| * @param token containing possible separators that need to be encoded. |
| * @param separators to be encoded in the token with their URLEncoding |
| * equivalent. |
| * @return String representation of the token with occurrences of the URL |
| * encoded separators decoded. |
| */ |
| public static String decode(byte[] token, Separator... separators) { |
| if (token == null) { |
| return null; |
| } |
| return decode(Bytes.toString(token), separators); |
| } |
| |
| /** |
| * Decode the given separators in the token with their decoding equivalents. |
| * It <em>must</em> be used for the result encoded with |
| * {@link #encode(String, Separator...)} with the same separators to be able |
| * to recover the original. |
| * |
| * @param token containing possible separators that need to be encoded. |
| * @param separators to be encoded in the token with their URLEncoding |
| * equivalent. |
| * @return String representation of the token with occurrences of the URL |
| * encoded separators decoded. |
| */ |
| public static String decode(String token, Separator... separators) { |
| if (token == null) { |
| return null; |
| } |
| String result = token; |
| for (Separator separator : separators) { |
| if (separator != null) { |
| result = decodeSingle(result, separator); |
| } |
| } |
| // decode percent to de-escape |
| return decodePercent(result); |
| } |
| |
| /** |
| * Returns a single byte array containing all of the individual arrays |
| * components separated by this separator. |
| * |
| * @param components Byte array components to be joined together. |
| * @return byte array after joining the components |
| */ |
| public byte[] join(byte[]... components) { |
| if (components == null || components.length == 0) { |
| return EMPTY_BYTES; |
| } |
| |
| int finalSize = 0; |
| finalSize = this.value.length() * (components.length - 1); |
| for (byte[] comp : components) { |
| if (comp != null) { |
| finalSize += comp.length; |
| } |
| } |
| |
| byte[] buf = new byte[finalSize]; |
| int offset = 0; |
| for (int i = 0; i < components.length; i++) { |
| if (components[i] != null) { |
| System.arraycopy(components[i], 0, buf, offset, components[i].length); |
| offset += components[i].length; |
| } |
| if (i < (components.length - 1)) { |
| System.arraycopy(this.bytes, 0, buf, offset, this.value.length()); |
| offset += this.value.length(); |
| } |
| } |
| return buf; |
| } |
| |
| /** |
| * Concatenates items (as String), using this separator. |
| * |
| * @param items Items join, {@code toString()} will be called in each item. |
| * Any occurrence of the separator in the individual strings will be |
| * first encoded. Cannot be null. |
| * @return non-null joined result. Note that when separator is {@literal null} |
| * the result is simply all items concatenated and the process is not |
| * reversible through {@link #splitEncoded(String)} |
| */ |
| public String joinEncoded(String... items) { |
| if (items == null || items.length == 0) { |
| return ""; |
| } |
| |
| StringBuilder sb = new StringBuilder(encode(items[0].toString())); |
| // Start at 1, we've already grabbed the first value at index 0 |
| for (int i = 1; i < items.length; i++) { |
| sb.append(this.value); |
| sb.append(encode(items[i].toString())); |
| } |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Concatenates items (as String), using this separator. |
| * |
| * @param items Items join, {@code toString()} will be called in each item. |
| * Any occurrence of the separator in the individual strings will be |
| * first encoded. Cannot be null. |
| * @return non-null joined result. Note that when separator is {@literal null} |
| * the result is simply all items concatenated and the process is not |
| * reversible through {@link #splitEncoded(String)} |
| */ |
| public String joinEncoded(Iterable<?> items) { |
| if (items == null) { |
| return ""; |
| } |
| Iterator<?> i = items.iterator(); |
| if (!i.hasNext()) { |
| return ""; |
| } |
| |
| StringBuilder sb = new StringBuilder(encode(i.next().toString())); |
| while (i.hasNext()) { |
| sb.append(this.value); |
| sb.append(encode(i.next().toString())); |
| } |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * @param compoundValue containing individual values separated by this |
| * separator, which have that separator encoded. |
| * @return non-null set of values from the compoundValue with the separator |
| * decoded. |
| */ |
| public Collection<String> splitEncoded(String compoundValue) { |
| List<String> result = new ArrayList<String>(); |
| if (compoundValue != null) { |
| for (String val : valuePattern.split(compoundValue)) { |
| result.add(decode(val)); |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Splits the source array into multiple array segments using this separator, |
| * up to a maximum of count items. This will naturally produce copied byte |
| * arrays for each of the split segments. |
| * |
| * @param source to be split |
| * @param limit on how many segments are supposed to be returned. A |
| * non-positive value indicates no limit on number of segments. |
| * @return source split by this separator. |
| */ |
| public byte[][] split(byte[] source, int limit) { |
| return split(source, this.bytes, limit); |
| } |
| |
| /** |
| * Splits the source array into multiple array segments using this separator. |
| * The sizes indicate the sizes of the relative components/segments. |
| * In case one of the segments contains this separator before the specified |
| * size is reached, the separator will be considered part of that segment and |
| * we will continue till size is reached. |
| * Variable length strings cannot contain this separator and are indiced with |
| * a size of {@value #VARIABLE_SIZE}. Such strings are encoded for this |
| * separator and decoded after the results from split is returned. |
| * |
| * @param source byte array to be split. |
| * @param sizes sizes of relative components/segments. |
| * @return source split by this separator as per the sizes specified.. |
| */ |
| public byte[][] split(byte[] source, int[] sizes) { |
| return split(source, this.bytes, sizes); |
| } |
| |
| /** |
| * Splits the source array into multiple array segments using this separator, |
| * as many times as splits are found. This will naturally produce copied byte |
| * arrays for each of the split segments. |
| * |
| * @param source byte array to be split |
| * @return source split by this separator. |
| */ |
| public byte[][] split(byte[] source) { |
| return split(source, this.bytes); |
| } |
| |
| /** |
| * Returns a list of ranges identifying [start, end) -- closed, open -- |
| * positions within the source byte array that would be split using the |
| * separator byte array. |
| * The sizes indicate the sizes of the relative components/segments. |
| * In case one of the segments contains this separator before the specified |
| * size is reached, the separator will be considered part of that segment and |
| * we will continue till size is reached. |
| * Variable length strings cannot contain this separator and are indiced with |
| * a size of {@value #VARIABLE_SIZE}. Such strings are encoded for this |
| * separator and decoded after the results from split is returned. |
| * |
| * @param source the source data |
| * @param separator the separator pattern to look for |
| * @param sizes indicate the sizes of the relative components/segments. |
| * @return a list of ranges. |
| */ |
| private static List<Range> splitRanges(byte[] source, byte[] separator, |
| int[] sizes) { |
| List<Range> segments = new ArrayList<Range>(); |
| if (source == null || separator == null) { |
| return segments; |
| } |
| // VARIABLE_SIZE here indicates that there is no limit to number of segments |
| // to return. |
| int limit = VARIABLE_SIZE; |
| if (sizes != null && sizes.length > 0) { |
| limit = sizes.length; |
| } |
| int start = 0; |
| int currentSegment = 0; |
| itersource: for (int i = 0; i < source.length; i++) { |
| for (int j = 0; j < separator.length; j++) { |
| if (source[i + j] != separator[j]) { |
| continue itersource; |
| } |
| } |
| // all separator elements matched |
| if (limit > VARIABLE_SIZE) { |
| if (segments.size() >= (limit - 1)) { |
| // everything else goes in one final segment |
| break; |
| } |
| if (sizes != null) { |
| int currentSegExpectedSize = sizes[currentSegment]; |
| if (currentSegExpectedSize > VARIABLE_SIZE) { |
| int currentSegSize = i - start; |
| if (currentSegSize < currentSegExpectedSize) { |
| // Segment not yet complete. More bytes to parse. |
| continue itersource; |
| } else if (currentSegSize > currentSegExpectedSize) { |
| // Segment is not as per size. |
| throw new IllegalArgumentException( |
| "Segments not separated as per expected sizes"); |
| } |
| } |
| } |
| } |
| segments.add(new Range(start, i)); |
| start = i + separator.length; |
| // i will be incremented again in outer for loop |
| i += separator.length - 1; |
| currentSegment++; |
| } |
| // add in remaining to a final range |
| if (start <= source.length) { |
| if (sizes != null) { |
| // Check if final segment is as per size specified. |
| if (sizes[currentSegment] > VARIABLE_SIZE && |
| source.length - start > sizes[currentSegment]) { |
| // Segment is not as per size. |
| throw new IllegalArgumentException( |
| "Segments not separated as per expected sizes"); |
| } |
| } |
| segments.add(new Range(start, source.length)); |
| } |
| return segments; |
| } |
| |
| /** |
| * Splits based on segments calculated based on limit/sizes specified for the |
| * separator. |
| * |
| * @param source byte array to be split. |
| * @param segments specifies the range for each segment. |
| * @return a byte[][] split as per the segment ranges. |
| */ |
| private static byte[][] split(byte[] source, List<Range> segments) { |
| byte[][] splits = new byte[segments.size()][]; |
| for (int i = 0; i < segments.size(); i++) { |
| Range r = segments.get(i); |
| byte[] tmp = new byte[r.length()]; |
| if (tmp.length > 0) { |
| System.arraycopy(source, r.start(), tmp, 0, r.length()); |
| } |
| splits[i] = tmp; |
| } |
| return splits; |
| } |
| |
| /** |
| * Splits the source array into multiple array segments using the given |
| * separator based on the sizes. This will naturally produce copied byte |
| * arrays for each of the split segments. |
| * |
| * @param source source array. |
| * @param separator separator represented as a byte array. |
| * @param sizes sizes of relative components/segments. |
| * @return byte[][] after splitting the source. |
| */ |
| private static byte[][] split(byte[] source, byte[] separator, int[] sizes) { |
| List<Range> segments = splitRanges(source, separator, sizes); |
| return split(source, segments); |
| } |
| |
| /** |
| * Splits the source array into multiple array segments using the given |
| * separator. This will naturally produce copied byte arrays for each of the |
| * split segments. |
| * |
| * @param source Source array. |
| * @param separator Separator represented as a byte array. |
| * @return byte[][] after splitting the source. |
| */ |
| private static byte[][] split(byte[] source, byte[] separator) { |
| return split(source, separator, (int[]) null); |
| } |
| |
| /** |
| * Splits the source array into multiple array segments using the given |
| * separator, up to a maximum of count items. This will naturally produce |
| * copied byte arrays for each of the split segments. |
| * |
| * @param source Source array. |
| * @param separator Separator represented as a byte array. |
| * @param limit a non-positive value indicates no limit on number of segments. |
| * @return byte[][] after splitting the input source. |
| */ |
| private static byte[][] split(byte[] source, byte[] separator, int limit) { |
| int[] sizes = null; |
| if (limit > VARIABLE_SIZE) { |
| sizes = new int[limit]; |
| } |
| List<Range> segments = splitRanges(source, separator, sizes); |
| return split(source, segments); |
| } |
| } |