Dataset: KDD Cup 2010 (bridge to algebra), available from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010
-- Register the Hivemall jar, then source its define-all.hive script
-- (expected to register the Hivemall functions used later, such as
-- add_bias and rand_amplify).
add jar ./tmp/hivemall.jar;
source ./tmp/define-all.hive;

-- Work inside a dedicated database. "if not exists" keeps this setup
-- re-runnable instead of failing when the database is already there.
create database if not exists kdd2010;
use kdd2010;

-- Training set. Each text row is rowid <TAB> label <TAB> features, where
-- features is a comma-separated list loaded as an array of strings.
-- The table is external, so it only points at the files under LOCATION.
create external table if not exists kdd10b_train (
  rowid int,
  label int,
  features array<string>
)
row format delimited
  fields terminated by '\t'
  collection items terminated by ","
stored as textfile
location '/dataset/kdd10b/train';

-- Test set. Same layout as the training table, different directory.
create external table if not exists kdd10b_test (
  rowid int,
  label int,
  features array<string>
)
row format delimited
  fields terminated by '\t'
  collection items terminated by ","
stored as textfile
location '/dataset/kdd10b/test';
# Run these from a shell, not from the Hive prompt.
# Each file is piped through conv.awk and streamed into HDFS. The "-" source
# argument makes "hadoop fs -put" read from stdin.
# NOTE(review): conv.awk is not shown here. Presumably it rewrites every input
# row as rowid<TAB>label<TAB>comma-separated features so the output matches
# the kdd10b_train / kdd10b_test table layout. Confirm against the script.

# Training data -> /dataset/kdd10b/train
awk -f conv.awk kddb | hadoop fs -put - /dataset/kdd10b/train/kddb

# Test data -> /dataset/kdd10b/test
awk -f conv.awk kddb.t | hadoop fs -put - /dataset/kdd10b/test/kddb.t
-- Flatten the test set to one row per (rowid, feature) pair.
-- Every element of the exploded array is a string that gets split on the
-- colon. The part before it becomes the feature name and the part after it
-- is cast to a float value.
-- NOTE(review): add_bias presumably appends a bias entry to the feature
-- array before it is exploded. Confirm against the Hivemall documentation.
create table kdd10b_test_exploded as
select
  rowid,
  label,
  split(feature, ":")[0] as feature,
  cast(split(feature, ":")[1] as float) as value
from
  kdd10b_test
  lateral view explode(add_bias(features)) feature_rows as feature;

-- Settings substituted into the rand_amplify call of the view below.
set hivevar:xtimes=3;
set hivevar:shufflebuffersize=1000;

-- Training view that passes every kdd10b_train row through rand_amplify.
-- NOTE(review): rand_amplify presumably emits each row xtimes times and
-- shuffles them within a buffer of shufflebuffersize rows. Confirm against
-- the Hivemall documentation.
create or replace view kdd10b_train_x3 as
select
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features)
from
  kdd10b_train;