This page describes a list of useful Hivemall generic functions. See also a list of machine-learning-related functions.
arange([int start=0, ] int stop, [int step=1])
- Return evenly spaced values within a given interval
SELECT arange(5), arange(1, 5), arange(1, 5, 1), arange(0, 5, 1); > [0,1,2,3,4] [1,2,3,4] [1,2,3,4] [0,1,2,3,4] SELECT arange(1, 6, 2); > 1, 3, 5 SELECT arange(-1, -6, 2); > -1, -3, -5
argmax(array<T> a)
- Returns the first index of the maximum value
SELECT argmax(array(5,2,0,1)); > 0
argmin(array<T> a)
- Returns the first index of the minimum value
SELECT argmin(array(5,2,0,1)); > 2
argrank(array<ANY> a)
- Returns the indices that would sort an array.
SELECT argrank(array(5,2,0,1)), argsort(argsort(array(5,2,0,1))); > [3, 2, 0, 1] [3, 2, 0, 1]
argsort(array<ANY> a)
- Returns the indices that would sort an array.
SELECT argsort(array(5,2,0,1)); > 2, 3, 1, 0 SELECT array_slice(array(5,2,0,1), argsort(array(5,2,0,1))); > 0, 1, 2, 5
array_append(array<T> arr, T elem)
- Append an element to the end of an array
SELECT array_append(array(1,2),3); 1,2,3 SELECT array_append(array('a','b'),'c'); "a","b","c"
array_avg(array<number>)
- Returns an array<double> in which each element is the mean of a set of numbers
WITH input as ( select array(1.0, 2.0, 3.0) as nums UNION ALL select array(2.0, 3.0, 4.0) as nums ) select array_avg(nums) from input; ["1.5","2.5","3.5"]
array_concat(array<ANY> x1, array<ANY> x2, ..)
- Returns a concatenated array
SELECT array_concat(array(1),array(2,3)); [1,2,3]
array_flatten(array<array<ANY>>)
- Returns an array with the elements flattened.
SELECT array_flatten(array(array(1,2,3),array(4,5),array(6,7,8))); [1,2,3,4,5,6,7,8]
array_intersect(array<ANY> x1, array<ANY> x2, ..)
- Returns an intersect of given arrays
SELECT array_intersect(array(1,3,4),array(2,3,4),array(3,5)); [3]
array_remove(array<int|text> original, int|text|array<int> target)
- Returns an array that the target is removed from the original array
SELECT array_remove(array(1,null,3),array(null)); [3] SELECT array_remove(array("aaa","bbb"),"bbb"); ["aaa"]
array_slice(array<ANY> values, int offset [, int length])
- Slices the given array by the given offset and length parameters.
SELECT array_slice(array(1,2,3,4,5,6),2,4), array_slice( array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), 0, -- offset 2 -- length ), array_slice( array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), 6, -- offset 3 -- length ), array_slice( array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), 6, -- offset 10 -- length ), array_slice( array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), 6 -- offset ), array_slice( array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), -3 -- offset ), array_slice( array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), -3, -- offset 2 -- length ); [3,4] ["zero","one"] ["six","seven","eight"] ["six","seven","eight","nine","ten"] ["six","seven","eight","nine","ten"] ["eight","nine","ten"] ["eight","nine"]
array_sum(array<number>)
- Returns an array<double> in which each element is summed up
WITH input as ( select array(1.0, 2.0, 3.0) as nums UNION ALL select array(2.0, 3.0, 4.0) as nums ) select array_sum(nums) from input; ["3.0","5.0","7.0"]
array_to_str(array arr [, string sep=','])
- Convert array to string using a sperator
SELECT array_to_str(array(1,2,3),'-'); 1-2-3
array_union(array1, array2, ...)
- Returns the union of a set of arrays
SELECT array_union(array(1,2),array(1,2)); [1,2] SELECT array_union(array(1,2),array(2,3),array(2,5)); [1,2,3,5]
conditional_emit(array<boolean> conditions, array<primitive> features)
- Emit features of a row according to various conditions
WITH input as ( select array(true, false, true) as conditions, array("one", "two", "three") as features UNION ALL select array(true, true, false), array("four", "five", "six") ) SELECT conditional_emit( conditions, features ) FROM input; one three four five
element_at(array<T> list, int pos)
- Returns an element at the given position
SELECT element_at(array(1,2,3,4),0); 1 SELECT element_at(array(1,2,3,4),-2); 3
first_element(x)
- Returns the first element in an array
SELECT first_element(array('a','b','c')); a SELECT first_element(array()); NULL
float_array(nDims)
- Returns an array<float> of nDims elements
last_element(x)
- Return the last element in an array
SELECT last_element(array('a','b','c')); c
select_k_best(array<number> array, const array<number> importance, const int k)
- Returns selected top-k elements as array<double>
sort_and_uniq_array(array<int>)
- Takes array<int> and returns a sorted array with duplicate elements eliminated
SELECT sort_and_uniq_array(array(3,1,1,-2,10)); [-2,1,3,10]
subarray(array<ANY> values, int fromIndex [, int toIndex])
- Returns a slice of the original array between the inclusive fromIndex and the exclusive toIndex.
SELECT subarray(array(0,1,2,3,4,5),4), subarray(array(0,1,2,3,4,5),3,4), subarray(array(0,1,2,3,4,5),3,3), subarray(array(0,1,2,3,4,5),3,2), subarray(array(0,1,2,3,4,5),0,2), subarray(array(0,1,2,3,4,5),-1,2), subarray(array(1,2,3,4,5,6),4), subarray(array(1,2,3,4,5,6),4,6), subarray(array(1,2,3,4,5,6),2,4), subarray(array(1,2,3,4,5,6),0,2), subarray(array(1,2,3,4,5,6),4,6), subarray(array(1,2,3,4,5,6),4,7); [4,5] [3] [] [] [0,1] [0,1] [5,6] [5,6] [3,4] [1,2] [5,6] [5,6]
subarray_endwith(array<int|text> original, int|text key)
- Returns an array that ends with the specified key
SELECT subarray_endwith(array(1,2,3,4), 3); [1,2,3]
subarray_startwith(array<int|text> original, int|text key)
- Returns an array that starts with the specified key
SELECT subarray_startwith(array(1,2,3,4), 2); [2,3,4]
to_string_array(array<ANY>)
- Returns an array of strings
select to_string_array(array(1.0,2.0,3.0)); ["1.0","2.0","3.0"]
to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])
- Return list of values sorted by value itself or specific key
WITH t as ( SELECT 5 as key, 'apple' as value UNION ALL SELECT 3 as key, 'banana' as value UNION ALL SELECT 4 as key, 'candy' as value UNION ALL SELECT 2 as key, 'donut' as value UNION ALL SELECT 3 as key, 'egg' as value ) SELECT -- expected output to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order) to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k) to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), dunut] to_ordered_list(value, key, '-k 2 -reverse'), -- [donut, (banana | egg)] (reverse top-k = tail-k) to_ordered_list(value, key), -- [donut, (banana, egg | egg, banana), candy, apple] (natural order) to_ordered_list(value, key, '-k -2'), -- [donut, (banana | egg)] (tail-k) to_ordered_list(value, key, '-k -100'), -- [donut, (banana, egg | egg, banana), candy, apple] to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] (reverse tail-k = top-k) to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically) to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys) to_ordered_list(key), -- [2, 3, 3, 4, 5] (natural ordered keys) to_ordered_list(value, key, '-k 2 -kv_map'), -- {4:"candy",5:"apple"} to_ordered_list(value, key, '-k 2 -vk_map') -- {"candy":4,"apple":5} FROM t
bits_collect(int|long x)
- Returns a bitset in array<long>
bits_or(array<long> b1, array<long> b2, ..)
- Returns a logical OR given bitsets
SELECT unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); [1,2,3,4]
to_bits(int[] indexes)
- Returns an bitset representation if the given indexes in long[]
SELECT to_bits(array(1,2,3,128)); [14,-9223372036854775808]
unbits(long[] bitset)
- Returns an long array of the give bitset representation
SELECT unbits(to_bits(array(1,4,2,3))); [1,2,3,4]
deflate(TEXT data [, const int compressionLevel])
- Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9]
SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); AA+=kaIM|WTt!+wbGAA
inflate(BINARY compressedData)
- Returns a decompressed STRING by using Inflater
SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); aaaaaaaaaaaaaaaabbbbccc
sessionize(long timeInSec, long thresholdInSec [, String subject])
- Returns a UUID string of a session.SELECT sessionize(time, 3600, ip_addr) as session_id, time, ip_addr FROM ( SELECT time, ipaddr FROM weblog DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC ) t1
from_json(string jsonString, const string returnTypes [, const array<string>|const string columnNames])
- Return Hive object.
SELECT from_json(to_json(map('one',1,'two',2)), 'map<string,int>'), from_json( '{ "person" : { "name" : "makoto" , "age" : 37 } }', 'struct<name:string,age:int>', array('person') ), from_json( '[0.1,1.1,2.2]', 'array<double>' ), from_json(to_json( ARRAY( NAMED_STRUCT("country", "japan", "city", "tokyo"), NAMED_STRUCT("country", "japan", "city", "osaka") ) ),'array<struct<country:string,city:string>>'), from_json(to_json( ARRAY( NAMED_STRUCT("country", "japan", "city", "tokyo"), NAMED_STRUCT("country", "japan", "city", "osaka") ), array('city') ), 'array<struct<country:string,city:string>>'), from_json(to_json( ARRAY( NAMED_STRUCT("country", "japan", "city", "tokyo"), NAMED_STRUCT("country", "japan", "city", "osaka") ) ),'array<struct<city:string>>');
{"one":1,"two":2} {"name":"makoto","age":37} [0.1,1.1,2.2] [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] [{"city":"tokyo"},{"city":"osaka"}]
to_json(ANY object [, const array<string>|const string columnNames])
- Returns Json string
SELECT NAMED_STRUCT("Name", "John", "age", 31), to_json( NAMED_STRUCT("Name", "John", "age", 31) ), to_json( NAMED_STRUCT("Name", "John", "age", 31), array('Name', 'age') ), to_json( NAMED_STRUCT("Name", "John", "age", 31), array('name', 'age') ), to_json( NAMED_STRUCT("Name", "John", "age", 31), array('age') ), to_json( NAMED_STRUCT("Name", "John", "age", 31), array() ), to_json( null, array() ), to_json( struct("123", "456", 789, array(314,007)), array('ti','si','i','bi') ), to_json( struct("123", "456", 789, array(314,007)), 'ti,si,i,bi' ), to_json( struct("123", "456", 789, array(314,007)) ), to_json( NAMED_STRUCT("country", "japan", "city", "tokyo") ), to_json( NAMED_STRUCT("country", "japan", "city", "tokyo"), array('city') ), to_json( ARRAY( NAMED_STRUCT("country", "japan", "city", "tokyo"), NAMED_STRUCT("country", "japan", "city", "osaka") ) ), to_json( ARRAY( NAMED_STRUCT("country", "japan", "city", "tokyo"), NAMED_STRUCT("country", "japan", "city", "osaka") ), array('city') );
{"name":"John","age":31} {"name":"John","age":31} {"Name":"John","age":31} {"name":"John","age":31} {"age":31} {} NULL {"ti":"123","si":"456","i":789,"bi":[314,7]} {"ti":"123","si":"456","i":789,"bi":[314,7]} {"col1":"123","col2":"456","col3":789,"col4":[314,7]} {"country":"japan","city":"tokyo"} {"city":"tokyo"} [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}]
map_exclude_keys(Map<K,V> map, array<K> filteringKeys)
- Returns the filtered entries of a map not having specified keys
SELECT map_exclude_keys(map(1,'one',2,'two',3,'three'),array(2,3)); {1:"one"}
map_get(MAP<K> a, K n)
- Returns the value corresponding to the key in the map.
Note this is a workaround for a Hive issue that non-constant expression for map indexes not supported. See https://issues.apache.org/jira/browse/HIVE-1955 WITH tmp as ( SELECT "one" as key UNION ALL SELECT "two" as key ) SELECT map_get(map("one",1,"two",2),key) FROM tmp; > 1 > 2
map_get_sum(map<int,float> src, array<int> keys)
- Returns sum of values that are retrieved by keys
map_include_keys(Map<K,V> map, array<K> filteringKeys)
- Returns the filtered entries of a map having specified keys
SELECT map_include_keys(map(1,'one',2,'two',3,'three'),array(2,3)); {2:"two",3:"three"}
map_key_values(MAP<K, V> map)
- Returns a array of key-value pairs in array<named_struct<key,value>>
SELECT map_key_values(map("one",1,"two",2)); > [{"key":"one","value":1},{"key":"two","value":2}]
map_roulette(Map<K, number> map [, (const)
int/bigint seed]) - Returns a map key based on weighted random sampling of map values. Average of values is used for null values
-- `map_roulette(map<key, number> [, integer seed])` returns key by weighted random selection SELECT map_roulette(to_map(a, b)) -- 25% Tom, 21% Zhang, 54% Wang FROM ( -- see https://issues.apache.org/jira/browse/HIVE-17406 select 'Wang' as a, 54 as b union all select 'Zhang' as a, 21 as b union all select 'Tom' as a, 25 as b ) tmp; > Wang -- Weight random selection with using filling nulls with the average value SELECT map_roulette(map(1, 0.5, 'Wang', null)), -- 50% Wang, 50% 1 map_roulette(map(1, 0.5, 'Wang', null, 'Zhang', null)) -- 1/3 Wang, 1/3 1, 1/3 Zhang ; -- NULL will be returned if every key is null SELECT map_roulette(map()), map_roulette(map(null, null, null, null)); > NULL NULL -- Return NULL if all weights are zero SELECT map_roulette(map(1, 0)), map_roulette(map(1, 0, '5', 0)) ; > NULL NULL -- map_roulette does not support non-numeric weights or negative weights. SELECT map_roulette(map('Wong', 'A string', 'Zhao', 2)); > HiveException: Error evaluating map_roulette(map('Wong':'A string','Zhao':2)) SELECT map_roulette(map('Wong', 'A string', 'Zhao', 2)); > UDFArgumentException: Map value must be greather than or equals to zero: -2
map_tail_n(map SRC, int N)
- Returns the last N elements from a sorted array of SRC
merge_maps(Map x)
- Returns a map which contains the union of an aggregation of maps. Note that an existing value of a key can be replaced with the other duplicate key entry.
SELECT merge_maps(m) FROM ( SELECT map('A',10,'B',20,'C',30) UNION ALL SELECT map('A',10,'B',20,'C',30) ) t
to_map(key, value)
- Convert two aggregated columns into a key-value map
WITH input as ( select 'aaa' as key, 111 as value UNION all select 'bbb' as key, 222 as value ) select to_map(key, value) from input; > {"bbb":222,"aaa":111}
to_ordered_map(key, value [, const int k|const boolean reverseOrder=false])
- Convert two aggregated columns into an ordered key-value map
with t as ( select 10 as key, 'apple' as value union all select 3 as key, 'banana' as value union all select 4 as key, 'candy' as value ) select to_ordered_map(key, value, true), -- {10:"apple",4:"candy",3:"banana"} (reverse) to_ordered_map(key, value, 1), -- {10:"apple"} (top-1) to_ordered_map(key, value, 2), -- {10:"apple",4:"candy"} (top-2) to_ordered_map(key, value, 3), -- {10:"apple",4:"candy",3:"banana"} (top-3) to_ordered_map(key, value, 100), -- {10:"apple",4:"candy",3:"banana"} (top-100) to_ordered_map(key, value), -- {3:"banana",4:"candy",10:"apple"} (natural) to_ordered_map(key, value, -1), -- {3:"banana"} (tail-1) to_ordered_map(key, value, -2), -- {3:"banana",4:"candy"} (tail-2) to_ordered_map(key, value, -3), -- {3:"banana",4:"candy",10:"apple"} (tail-3) to_ordered_map(key, value, -100) -- {3:"banana",4:"candy",10:"apple"} (tail-100) from t
distcache_gets(filepath, key, default_value [, parseKey])
- Returns map<key_type, value_type>|value_type
jobconf_gets()
- Returns the value from JobConf
jobid()
- Returns the value of mapred.job.id
rowid()
- Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER}
rownum()
- Returns a generated row number sprintf(
%d%04d,sequence,taskId)
in long
SELECT rownum() as rownum, xxx from ...
taskid()
- Returns the value of mapred.task.partition
infinity()
- Returns the constant representing positive infinity.
is_finite(x)
- Determine if x is finite.
SELECT is_finite(333), is_finite(infinity()); > true false
is_infinite(x)
- Determine if x is infinite.
is_nan(x)
- Determine if x is not-a-number.
l2_norm(double x)
- Return a L2 norm of the given input x.
WITH input as ( select generate_series(1,3) as v ) select l2_norm(v) as l2norm from input; > 3.7416573867739413 = sqrt(1^2+2^2+3^2))
nan()
- Returns the constant representing not-a-number.
SELECT nan(), is_nan(nan()); > NaN true
sigmoid(x)
- Returns 1.0 / (1.0 + exp(-x))
WITH input as ( SELECT 3.0 as x UNION ALL SELECT -3.0 as x ) select 1.0 / (1.0 + exp(-x)), sigmoid(x) from input; > 0.04742587317756678 0.04742587357759476 > 0.9525741268224334 0.9525741338729858
transpose_and_dot(array<number> X, array<number> Y)
- Returns dot(X.T, Y) as array<array<double>>, shape = (X.#cols, Y.#cols)
WITH input as ( select array(1.0, 2.0, 3.0, 4.0) as x, array(1, 2) as y UNION ALL select array(2.0, 3.0, 4.0, 5.0) as x, array(1, 2) as y ) select transpose_and_dot(x, y) as xy, transpose_and_dot(y, x) as yx from input; > [["3.0","6.0"],["5.0","10.0"],["7.0","14.0"],["9.0","18.0"]] [["3.0","5.0","7.0","9.0"],["6.0","10.0","14.0","18.0"]]
vector_add(array<NUMBER> x, array<NUMBER> y)
- Perform vector ADD operation.
SELECT vector_add(array(1.0,2.0,3.0), array(2, 3, 4)); [3.0,5.0,7.0]
vector_dot(array<NUMBER> x, array<NUMBER> y)
- Performs vector dot product.
SELECT vector_dot(array(1.0,2.0,3.0),array(2.0,3.0,4.0)); 20 SELECT vector_dot(array(1.0,2.0,3.0),2); [2.0,4.0,6.0]
assert(boolean condition)
or FUNC(boolean condition, string errMsg)- Throws HiveException if condition is not met
SELECT count(1) FROM stock_price WHERE assert(price > 0.0); SELECT count(1) FROM stock_price WHERE assert(price > 0.0, 'price MUST be more than 0.0')
raise_error()
or FUNC(string msg) - Throws an error
SELECT product_id, price, raise_error('Found an invalid record') FROM xxx WHERE price < 0.0
base91(BINARY bin)
- Convert the argument from binary to a BASE91 string
SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); AA+=kaIM|WTt!+wbGAA
is_stopword(string word)
- Returns whether English stopword or not
normalize_unicode(string str [, string form])
- Transforms str
with the specified normalization form. The form
takes one of NFC (default), NFD, NFKC, or NFKD
SELECT normalize_unicode('ハンカクカナ','NFKC'); ハンカクカナ SELECT normalize_unicode('㈱㌧㌦Ⅲ','NFKC'); (株)トンドルIII
singularize(string word)
- Returns singular form of a given English word
SELECT singularize(lower("Apples")); "apple"
split_words(string query [, string regex])
- Returns an array<text> containing splitted strings
tokenize(string englishText [, boolean toLowerCase])
- Returns tokenized words in array<string>
unbase91(string)
- Convert a BASE91 string to a binary
SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); aaaaaaaaaaaaaaaabbbbccc
word_ngrams(array<string> words, int minSize, int maxSize])
- Returns list of n-grams for given words, where minSize <= n <= maxSize
SELECT word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); ["machine","machine learning","learning","learning is","is","is fun","fun"]
moving_avg(NUMBER value, const int windowSize)
- Returns moving average of a time series using a given windowSELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) series; 1.0 1.5 2.0 3.0 4.0 5.0 6.0
convert_label(const int|const float)
- Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1
each_top_k(int K, Object group, double cmpKey, *)
- Returns top-K values (or tail-K values when k is less than 0)
generate_series(const int|bigint start, const int|bigint end)
- Generate a series of values, from start to end. A similar function to PostgreSQL's generate_serics
SELECT generate_series(2,4); 2 3 4 SELECT generate_series(5,1,-2); 5 3 1 SELECT generate_series(4,3); (no return) SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t; 2018-04-21 1 2018-04-22 2 2018-04-23 3 WITH input as ( SELECT 1 as c1, 10 as c2, 3 as step UNION ALL SELECT 10, 2, -3 ) SELECT generate_series(c1, c2, step) as series FROM input; 1 4 7 10 10 7 4
try_cast(ANY src, const string typeName)
- Explicitly cast a value as a type. Returns null if cast fails.
SELECT try_cast(array(1.0,2.0,3.0), 'array<string>') SELECT try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')
x_rank(KEY)
- Generates a pseudo sequence number starting from 1 for each key