This page describes a list of useful Hivemall generic functions.
array_concat(array<ANY> x1, array<ANY> x2, ..)
- Returns a concatenated array
select array_concat(array(1),array(2,3)); > [1,2,3]
array_intersect(array<ANY> x1, array<ANY> x2, ..)
- Returns an intersect of given arrays
select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); > [3]
array_remove(array<int|text> original, int|text|array<int> target)
- Returns an array that the target is removed from the original array
select array_remove(array(1,null,3),array(null)); > [3] select array_remove(array("aaa","bbb"),"bbb"); > ["aaa"]
sort_and_uniq_array(array<int>)
- Takes an array of type INT and returns a sorted array in a natural order with duplicate elements eliminated
select sort_and_uniq_array(array(3,1,1,-2,10)); > [-2,1,3,10]
subarray_endwith(array<int|text> original, int|text key)
- Returns an array that ends with the specified key
select subarray_endwith(array(1,2,3,4), 3); > [1,2,3]
subarray_startwith(array<int|text> original, int|text key)
- Returns an array that starts with the specified key
select subarray_startwith(array(1,2,3,4), 2); > [2,3,4]
subarray(array<int> orignal, int fromIndex, int toIndex)
- Returns a slice of the original array between the inclusive fromIndex
and the exclusive toIndex
select subarray(array(1,2,3,4,5,6), 2,4); > [3,4]
array_avg(array<NUMBER>)
- Returns an array in which each element is the mean of a set of numbers
array_sum(array<NUMBER>)
- Returns an array in which each element is summed up
to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])
or to_ordered_list(value, key [, const string options])
- Return list of values sorted by value itself or specific key
with t as ( select 5 as key, 'apple' as value union all select 3 as key, 'banana' as value union all select 4 as key, 'candy' as value union all select 2 as key, 'donut' as value union all select 3 as key, 'egg' as value ) select -- expected output to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order) to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k) to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), dunut] to_ordered_list(value, key, '-k 2 -reverse'), -- [donut, (banana | egg)] (reverse top-k = tail-k) to_ordered_list(value, key), -- [donut, (banana, egg | egg, banana), candy, apple] (natural order) to_ordered_list(value, key, '-k -2'), -- [donut, (banana | egg)] (tail-k) to_ordered_list(value, key, '-k -100'), -- [donut, (banana, egg | egg, banana), candy, apple] to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] (reverse tail-k = top-k) to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically) to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys) to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys) from t ;
to_bits(int[] indexes)
- Returns an bitset representation if the given indexes in long[]
select to_bits(array(1,2,3,128)); >[14,-9223372036854775808]
unbits(long[] bitset)
- Returns an long array of the give bitset representation
select unbits(to_bits(array(1,4,2,3))); > [1,2,3,4]
bits_or(array<long> b1, array<long> b2, ..)
- Returns a logical OR given bitsets
select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); > [1,2,3,4]
bits_collect(int|long x)
- Returns a bitset in arraydeflate(TEXT data [, const int compressionLevel])
- Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9]
select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); > AA+=kaIM|WTt!+wbGAA
inflate(BINARY compressedData)
- Returns a decompressed STRING by using Inflater
select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); > aaaaaaaaaaaaaaaabbbbccc
map_get_sum(map<int,float> src, array<int> keys)
- Returns sum of values that are retrieved by keys
map_tail_n(map SRC, int N)
- Returns the last N elements from a sorted array of SRC
to_map(key, value)
- Convert two aggregated columns into a key-value map
to_ordered_map(key, value [, const int k|const boolean reverseOrder=false])
- Convert two aggregated columns into an ordered key-value map
with t as ( select 10 as key, 'apple' as value union all select 3 as key, 'banana' as value union all select 4 as key, 'candy' as value ) select to_ordered_map(key, value, true), -- {10:"apple",4:"candy",3:"banana"} (reverse) to_ordered_map(key, value, 1), -- {10:"apple"} (top-1) to_ordered_map(key, value, 2), -- {10:"apple",4:"candy"} (top-2) to_ordered_map(key, value, 3), -- {10:"apple",4:"candy",3:"banana"} (top-3) to_ordered_map(key, value, 100), -- {10:"apple",4:"candy",3:"banana"} (top-100) to_ordered_map(key, value), -- {3:"banana",4:"candy",10:"apple"} (natural) to_ordered_map(key, value, -1), -- {3:"banana"} (tail-1) to_ordered_map(key, value, -2), -- {3:"banana",4:"candy"} (tail-2) to_ordered_map(key, value, -3), -- {3:"banana",4:"candy",10:"apple"} (tail-3) to_ordered_map(key, value, -100) -- {3:"banana",4:"candy",10:"apple"} (tail-100) from t ;
rowid()
- Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER}
taskid()
- Returns the value of mapred.task.partition
sigmoid(x)
- Returns 1.0 / (1.0 + exp(-x))
base91(binary)
- Convert the argument from binary to a BASE91 string
select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); > AA+=kaIM|WTt!+wbGAA
unbase91(string)
- Convert a BASE91 string to a binary
select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); > aaaaaaaaaaaaaaaabbbbccc
normalize_unicode(string str [, string form])
- Transforms str
with the specified normalization form. The form
takes one of NFC (default), NFD, NFKC, or NFKD
select normalize_unicode('ハンカクカナ','NFKC'); > ハンカクカナ select normalize_unicode('㈱㌧㌦Ⅲ','NFKC'); > (株)トンドルIII
split_words(string query [, string regex])
- Returns an array containing splitted strings
is_stopword(string word)
- Returns whether English stopword or not
singularize(string word)
- Returns singular form of a given English word
select singularize(lower("Apples")); > "apple"
tokenize(string englishText [, boolean toLowerCase])
- Returns words in array
tokenize_ja(String line [, const string mode = "normal", const list<string> stopWords, const list<string> stopTags])
- returns tokenized strings in array. Refer this article for detail.
select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。"); > ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal"," モード"]
word_ngrams(array<string> words, int minSize, int maxSize)
- Returns list of n-grams where minSize <= n <= maxSize
select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); > ["machine","machine learning","learning","learning is","is","is fun","fun"]
convert_label(const int|const float)
- Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1
each_top_k(int K, Object group, double cmpKey, *)
- Returns top-K values (or tail-K values when k is less than 0). Refer this article for detail.
generate_series(const int|bigint start, const int|bigint end)
- Generate a series of values, from start to end
select generate_series(1,9); 1 2 3 4 5 6 7 8 9
A similar function to PostgreSQL's generate_serics
. http://www.postgresql.org/docs/current/static/functions-srf.html
x_rank(KEY)
- Generates a pseudo sequence number starting from 1 for each key