Get the dataset from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam
# Create the HDFS directory that holds the converted dataset.
hadoop fs -mkdir -p /dataset/webspam/raw

# Convert the LIBSVM-format file with conv.awk and stream the result
# straight into HDFS ("-put -" reads from standard input).
awk -f conv.awk webspam_wc_normalized_trigram.svm | \
  hadoop fs -put - /dataset/webspam/raw/
-- Dedicated database for the webspam tables.
CREATE DATABASE webspam;
USE webspam;

-- External table over the files uploaded to /dataset/webspam/raw.
-- Row layout: rowid <TAB> label <TAB> comma-separated feature strings.
CREATE EXTERNAL TABLE webspam_raw (
    rowid    INT,
    label    INT,
    features ARRAY<STRING>
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/webspam/raw';

-- Seed used by Hive's sampling.
SET hive.sample.seednumber=43;

-- Hold-out test set: sample from the raw table, shuffle via
-- CLUSTER BY rand(43), and keep at most 70000 rows.
-- NOTE(review): per the Hive sampling docs, TABLESAMPLE(n ROWS) is applied
-- per input split, so the pre-LIMIT row count depends on the number of
-- splits -- confirm the resulting test-set size on your cluster.
CREATE TABLE webspam_test AS
SELECT
    rowid,
    label,
    features
FROM webspam_raw TABLESAMPLE(1000 ROWS) s
CLUSTER BY rand(43)
LIMIT 70000;
-- Training table, stored as Snappy-compressed ORC.
CREATE TABLE webspam_train_orcfile (
    rowid    INT,
    label    INT,
    features ARRAY<STRING>
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");

-- Optional tuning: uncomment to force the number of reducers for the
-- shuffle below. Kept on its own line so the "--" comment does not
-- swallow the statements that follow.
-- SET mapred.reduce.tasks=128;

-- Training set = every raw row whose rowid is NOT in webspam_test,
-- with add_bias() applied to the feature array, shuffled by
-- CLUSTER BY rand(43).
INSERT OVERWRITE TABLE webspam_train_orcfile
SELECT
    s.rowid,
    s.label,
    add_bias(s.features) AS features
FROM webspam_raw s
WHERE NOT EXISTS (
    SELECT t.rowid
    FROM webspam_test t
    WHERE s.rowid = t.rowid
)
CLUSTER BY rand(43);

-- Optional: uncomment to restore the reducer setting if it was changed above.
-- SET mapred.reduce.tasks=-1;

-- Parameters for rand_amplify(): amplification factor, shuffle buffer size,
-- and the seed used by the amplifier.
SET hivevar:xtimes=3;
SET hivevar:shufflebuffersize=100;
SET hivemall.amplify.seed=32;

-- View that passes the training rows through rand_amplify() with the
-- parameters set above.
CREATE OR REPLACE VIEW webspam_train_x3 AS
SELECT
    rand_amplify(${xtimes}, ${shufflebuffersize}, *) AS (rowid, label, features)
FROM webspam_train_orcfile;

-- Test set with one row per feature: each "name:value" string is split
-- into the feature name (text before ":") and its value cast to FLOAT.
CREATE TABLE webspam_test_exploded AS
SELECT
    rowid,
    label,
    split(feature, ":")[0] AS feature,
    CAST(split(feature, ":")[1] AS FLOAT) AS value
FROM webspam_test
LATERAL VIEW explode(add_bias(features)) t AS feature;
Caution: For this dataset, use a small shufflebuffersize. Each training example has a large number of features, and (xtimes * shufflebuffersize * N) training examples are cached in memory.