https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (algebra)

  • the number of classes: 2
  • the number of data: 8,407,752 (training) / 510,302 (testing)
  • the number of features: 20,216,830 in about 2.73 GB (training) / 20,216,830 (testing)

Define training/testing tables

add jar ./tmp/hivemall.jar;
source ./tmp/define-all.hive;

create database kdd2010;
use kdd2010;

create external table kdd10a_train (
  rowid int,
  label int,
  features ARRAY<STRING>
) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," 
STORED AS TEXTFILE LOCATION '/dataset/kdd10a/train';

create external table kdd10a_test (
  rowid int, 
  label int,
  features ARRAY<STRING>
) 
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY "," 
STORED AS TEXTFILE LOCATION '/dataset/kdd10a/test';

Putting data into HDFS

conv.awk

awk -f conv.awk kdda | hadoop fs -put - /dataset/kdd10a/train/kdda
awk -f conv.awk kdda.t | hadoop fs -put - /dataset/kdd10a/test/kdda.t

Make auxiliary tables

create table kdd10a_train_orcfile (
 rowid bigint,
 label int,
 features array<string>
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");

-- SET mapred.reduce.tasks=64;
INSERT OVERWRITE TABLE kdd10a_train_orcfile
select * from kdd10a_train
CLUSTER BY rand();
-- SET mapred.reduce.tasks=-1;

create table kdd10a_test_exploded as
select 
  rowid,
  label,
  split(feature,":")[0] as feature,
  cast(split(feature,":")[1] as float) as value
from 
  kdd10a_test LATERAL VIEW explode(add_bias(features)) t AS feature;

set hivevar:xtimes=3;
set hivevar:shufflebuffersize=1000;
-- set hivemall.amplify.seed=32;
create or replace view kdd10a_train_x3
as
select
   rand_amplify(${xtimes}, ${shufflebuffersize}, *) as (rowid, label, features)
from  
   kdd10a_train_orcfile;