Dataset: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (KDD Cup 2010, algebra)
-- Register the Hivemall jar, then run its define-all.hive script
-- (presumably declares the Hivemall functions; confirm against the script).
add jar ./tmp/hivemall.jar;
source ./tmp/define-all.hive;

-- Keep every KDD2010 object in its own database.
create database kdd2010;
use kdd2010;

-- Training split: external text table over /dataset/kdd10a/train.
-- Columns are tab-separated; the features array is comma-separated.
create external table kdd10a_train (
  rowid int,
  label int,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10a/train';

-- Test split: same layout as the training table, over /dataset/kdd10a/test.
create external table kdd10a_test (
  rowid int,
  label int,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10a/test';
# Convert each raw file with conv.awk and stream the result straight into HDFS
# ("-put -" reads from stdin). conv.awk is not shown here; presumably it emits
# the tab-separated rowid / label / comma-separated-features layout that the
# kdd10a_train and kdd10a_test tables expect -- confirm against the script.
# The two pipelines must stay on separate lines: fused together, the second
# awk command is parsed as extra arguments to the first "hadoop fs -put".
awk -f conv.awk kdda | hadoop fs -put - /dataset/kdd10a/train/kdda
awk -f conv.awk kdda.t | hadoop fs -put - /dataset/kdd10a/test/kdda.t
-- NOTE: every "--" comment below must sit on its own line. When this section
-- is collapsed onto one line, the first "--" comments out all statements that
-- follow it.

-- ORC (Snappy-compressed) copy of the training data.
create table kdd10a_train_orcfile (
  rowid bigint,
  label int,
  features array<string>
)
STORED AS orc
tblproperties ("orc.compress"="SNAPPY");

-- Fill the ORC table in random row order (CLUSTER BY rand() distributes and
-- sorts rows by a random key). The commented SET lines are optional tuning:
-- pin the reducer count for this shuffle, then restore the default (-1).
-- SET mapred.reduce.tasks=64;
INSERT OVERWRITE TABLE kdd10a_train_orcfile
select
  rowid,
  label,
  features
from
  kdd10a_train
CLUSTER BY rand();
-- SET mapred.reduce.tasks=-1;

-- Test data flattened to one row per feature. add_bias(features) is a
-- Hivemall function (presumably appends a bias feature; see Hivemall docs).
-- Each exploded "name:value" string is split into the feature name and a
-- float value.
create table kdd10a_test_exploded as
select
  rowid,
  label,
  split(feature,":")[0] as feature,
  cast(split(feature,":")[1] as float) as value
from
  kdd10a_test
  LATERAL VIEW explode(add_bias(features)) t AS feature;

-- Parameters for the amplified training view below.
set hivevar:xtimes=3;
set hivevar:shufflebuffersize=1000;
-- Optional: uncomment to set the amplifier's seed.
-- set hivemall.amplify.seed=32;

-- View over the ORC training table passed through Hivemall's rand_amplify
-- with the two parameters above (presumably: emit each row ${xtimes} times,
-- shuffled within a buffer of ${shufflebuffersize} rows; see Hivemall docs).
-- The "*" is required here: it passes every source column to the function.
create or replace view kdd10a_train_x3
as
select
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features)
from
  kdd10a_train_orcfile;