| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hadoop.hive.ql.udf; |
| |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import com.google.common.collect.Iterators; |
| import org.apache.hadoop.hive.ql.exec.Description; |
| import org.apache.hadoop.hive.ql.exec.UDF; |
| import org.apache.hadoop.io.Text; |
| import org.codehaus.jackson.JsonFactory; |
| import org.codehaus.jackson.JsonParser.Feature; |
| import org.codehaus.jackson.map.ObjectMapper; |
| import org.codehaus.jackson.map.type.TypeFactory; |
| import org.codehaus.jackson.type.JavaType; |
| |
| /** |
| * UDFJson. |
| * |
| */ |
| @Description(name = "get_json_object", |
| value = "_FUNC_(json_txt, path) - Extract a json object from path ", |
| extended = "Extract json object from a json string based on json path " |
| + "specified, and return json string of the extracted json object. It " |
| + "will return null if the input json string is invalid.\n" |
| + "A limited version of JSONPath supported:\n" |
| + " $ : Root object\n" |
| + " . : Child operator\n" |
| + " [] : Subscript operator for array\n" |
| + " * : Wildcard for []\n" |
| + "Syntax not supported that's worth noticing:\n" |
| + " '' : Zero length string as key\n" |
| + " .. : Recursive descent\n" |
| + " @ : Current object/element\n" |
| + " () : Script expression\n" |
| + " ?() : Filter (script) expression.\n" |
| + " [,] : Union operator\n" |
| + " [start:end:step] : array slice operator\n") |
| public class UDFJson extends UDF { |
| private static final Pattern patternKey = Pattern.compile("^([a-zA-Z0-9_\\-\\:\\s]+).*"); |
| private static final Pattern patternIndex = Pattern.compile("\\[([0-9]+|\\*)\\]"); |
| private static final JavaType MAP_TYPE = TypeFactory.fromClass(Map.class); |
| private static final JavaType LIST_TYPE = TypeFactory.fromClass(List.class); |
| |
| private final JsonFactory jsonFactory = new JsonFactory(); |
| private final ObjectMapper objectMapper = new ObjectMapper(jsonFactory); |
| |
| // An LRU cache using a linked hash map |
| static class HashCache<K, V> extends LinkedHashMap<K, V> { |
| |
| private static final int CACHE_SIZE = 16; |
| private static final int INIT_SIZE = 32; |
| private static final float LOAD_FACTOR = 0.6f; |
| |
| HashCache() { |
| super(INIT_SIZE, LOAD_FACTOR); |
| } |
| |
| private static final long serialVersionUID = 1; |
| |
| @Override |
| protected boolean removeEldestEntry(Map.Entry<K, V> eldest) { |
| return size() > CACHE_SIZE; |
| } |
| |
| } |
| |
| Map<String, Object> extractObjectCache = new HashCache<String, Object>(); |
| Map<String, String[]> pathExprCache = new HashCache<String, String[]>(); |
| Map<String, ArrayList<String>> indexListCache = |
| new HashCache<String, ArrayList<String>>(); |
| Map<String, String> mKeyGroup1Cache = new HashCache<String, String>(); |
| Map<String, Boolean> mKeyMatchesCache = new HashCache<String, Boolean>(); |
| |
| public UDFJson() { |
| // Allows for unescaped ASCII control characters in JSON values |
| jsonFactory.enable(Feature.ALLOW_UNQUOTED_CONTROL_CHARS); |
| // Enabled to accept quoting of all character backslash qooting mechanism |
| jsonFactory.enable(Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER); |
| } |
| |
| /** |
| * Extract json object from a json string based on json path specified, and |
| * return json string of the extracted json object. It will return null if the |
| * input json string is invalid. |
| * |
| * A limited version of JSONPath supported: $ : Root object . : Child operator |
| * [] : Subscript operator for array * : Wildcard for [] |
| * |
| * Syntax not supported that's worth noticing: '' : Zero length string as key |
| * .. : Recursive descent &#064; : Current object/element () : Script |
| * expression ?() : Filter (script) expression. [,] : Union operator |
| * [start:end:step] : array slice operator |
| * |
| * @param jsonString |
| * the json string. |
| * @param pathString |
| * the json path expression. |
| * @return json string or null when an error happens. |
| */ |
| public Text evaluate(String jsonString, String pathString) { |
| if (jsonString == null || jsonString.isEmpty() || pathString == null |
| || pathString.isEmpty() || pathString.charAt(0) != '$') { |
| return null; |
| } |
| |
| int pathExprStart = 1; |
| boolean unknownType = pathString.equals("$"); |
| boolean isRootArray = false; |
| |
| if (pathString.length() > 1) { |
| if (pathString.charAt(1) == '[') { |
| pathExprStart = 0; |
| isRootArray = true; |
| } else if (pathString.charAt(1) == '.') { |
| isRootArray = pathString.length() > 2 && pathString.charAt(2) == '['; |
| } else { |
| return null; |
| } |
| } |
| |
| // Cache pathExpr |
| String[] pathExpr = pathExprCache.get(pathString); |
| if (pathExpr == null) { |
| pathExpr = pathString.split("\\.", -1); |
| pathExprCache.put(pathString, pathExpr); |
| } |
| |
| // Cache extractObject |
| Object extractObject = extractObjectCache.get(jsonString); |
| if (extractObject == null) { |
| if (unknownType) { |
| try { |
| extractObject = objectMapper.readValue(jsonString, LIST_TYPE); |
| } catch (Exception e) { |
| // Ignore exception |
| } |
| if (extractObject == null) { |
| try { |
| extractObject = objectMapper.readValue(jsonString, MAP_TYPE); |
| } catch (Exception e) { |
| return null; |
| } |
| } |
| } else { |
| JavaType javaType = isRootArray ? LIST_TYPE : MAP_TYPE; |
| try { |
| extractObject = objectMapper.readValue(jsonString, javaType); |
| } catch (Exception e) { |
| return null; |
| } |
| } |
| extractObjectCache.put(jsonString, extractObject); |
| } |
| |
| for (int i = pathExprStart; i < pathExpr.length; i++) { |
| if (extractObject == null) { |
| return null; |
| } |
| extractObject = extract(extractObject, pathExpr[i], i == pathExprStart && isRootArray); |
| } |
| |
| Text result = new Text(); |
| if (extractObject instanceof Map || extractObject instanceof List) { |
| try { |
| result.set(objectMapper.writeValueAsString(extractObject)); |
| } catch (Exception e) { |
| return null; |
| } |
| } else if (extractObject != null) { |
| result.set(extractObject.toString()); |
| } else { |
| return null; |
| } |
| return result; |
| } |
| |
| private Object extract(Object json, String path, boolean skipMapProc) { |
| // skip MAP processing for the first path element if root is array |
| if (!skipMapProc) { |
| // Cache patternkey.matcher(path).matches() |
| Matcher mKey = null; |
| Boolean mKeyMatches = mKeyMatchesCache.get(path); |
| if (mKeyMatches == null) { |
| mKey = patternKey.matcher(path); |
| mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; |
| mKeyMatchesCache.put(path, mKeyMatches); |
| } |
| if (!mKeyMatches.booleanValue()) { |
| return null; |
| } |
| |
| // Cache mkey.group(1) |
| String mKeyGroup1 = mKeyGroup1Cache.get(path); |
| if (mKeyGroup1 == null) { |
| if (mKey == null) { |
| mKey = patternKey.matcher(path); |
| mKeyMatches = mKey.matches() ? Boolean.TRUE : Boolean.FALSE; |
| mKeyMatchesCache.put(path, mKeyMatches); |
| if (!mKeyMatches.booleanValue()) { |
| return null; |
| } |
| } |
| mKeyGroup1 = mKey.group(1); |
| mKeyGroup1Cache.put(path, mKeyGroup1); |
| } |
| json = extract_json_withkey(json, mKeyGroup1); |
| } |
| // Cache indexList |
| ArrayList<String> indexList = indexListCache.get(path); |
| if (indexList == null) { |
| Matcher mIndex = patternIndex.matcher(path); |
| indexList = new ArrayList<String>(); |
| while (mIndex.find()) { |
| indexList.add(mIndex.group(1)); |
| } |
| indexListCache.put(path, indexList); |
| } |
| |
| if (indexList.size() > 0) { |
| json = extract_json_withindex(json, indexList); |
| } |
| |
| return json; |
| } |
| |
| private transient AddingList jsonList = new AddingList(); |
| |
| private static class AddingList extends ArrayList<Object> { |
| private static final long serialVersionUID = 1L; |
| |
| @Override |
| public Iterator<Object> iterator() { |
| return Iterators.forArray(toArray()); |
| } |
| @Override |
| public void removeRange(int fromIndex, int toIndex) { |
| super.removeRange(fromIndex, toIndex); |
| } |
| }; |
| |
| @SuppressWarnings("unchecked") |
| private Object extract_json_withindex(Object json, ArrayList<String> indexList) { |
| jsonList.clear(); |
| jsonList.add(json); |
| for (String index : indexList) { |
| int targets = jsonList.size(); |
| if (index.equalsIgnoreCase("*")) { |
| for (Object array : jsonList) { |
| if (array instanceof List) { |
| for (int j = 0; j < ((List<Object>)array).size(); j++) { |
| jsonList.add(((List<Object>)array).get(j)); |
| } |
| } |
| } |
| } else { |
| for (Object array : jsonList) { |
| int indexValue = Integer.parseInt(index); |
| if (!(array instanceof List)) { |
| continue; |
| } |
| List<Object> list = (List<Object>) array; |
| if (indexValue >= list.size()) { |
| continue; |
| } |
| jsonList.add(list.get(indexValue)); |
| } |
| } |
| if (jsonList.size() == targets) { |
| return null; |
| } |
| jsonList.removeRange(0, targets); |
| } |
| if (jsonList.isEmpty()) { |
| return null; |
| } |
| return (jsonList.size() > 1) ? new ArrayList<Object>(jsonList) : jsonList.get(0); |
| } |
| |
| @SuppressWarnings("unchecked") |
| private Object extract_json_withkey(Object json, String path) { |
| if (json instanceof List) { |
| List<Object> jsonArray = new ArrayList<Object>(); |
| for (int i = 0; i < ((List<Object>) json).size(); i++) { |
| Object json_elem = ((List<Object>) json).get(i); |
| Object json_obj = null; |
| if (json_elem instanceof Map) { |
| json_obj = ((Map<String, Object>) json_elem).get(path); |
| } else { |
| continue; |
| } |
| if (json_obj instanceof List) { |
| for (int j = 0; j < ((List<Object>) json_obj).size(); j++) { |
| jsonArray.add(((List<Object>) json_obj).get(j)); |
| } |
| } else if (json_obj != null) { |
| jsonArray.add(json_obj); |
| } |
| } |
| return (jsonArray.size() == 0) ? null : jsonArray; |
| } else if (json instanceof Map) { |
| return ((Map<String, Object>) json).get(path); |
| } else { |
| return null; |
| } |
| } |
| } |