Get the dataset of the Kaggle Display Advertising Challenge from one of the following sources:
Note that you must accept and agree to the CRITEO LABS DATA TERM OF USE before downloading the data.
Here, you can use a script prepared by one of the Hivemall PPMC members: takuti/criteo-ffm.
Clone the repository:
# Fetch the helper repository and work from inside it.
# This uses the SSH remote, which requires an SSH key registered with GitHub;
# the HTTPS remote https://github.com/takuti/criteo-ffm.git can be used instead.
git clone git@github.com:takuti/criteo-ffm.git
cd criteo-ffm
The script `data.sh` downloads the original data and converts it into CSV format:
./data.sh  # downloads the original data and generates `train.csv` and `test.csv`
# Expose the generated files under the short names used in the following steps.
ln -s train.csv tr.csv
ln -s test.csv te.csv
Alternatively, since the original data is very large, it is better to start with the tiny sample data bundled in the repository:
# Point the short names at the bundled sample files instead of the full dataset.
ln -s train.tiny.csv tr.csv
ln -s test.tiny.csv te.csv
Load the CSV files into Hive tables as follows:
# Create the target directories first: the LOCATION of a Hive external table
# must be an HDFS directory, and `-put` to a path that does not exist yet would
# either fail or create a plain file with that name instead.
hadoop fs -mkdir -p /criteo/train /criteo/test
# Upload one CSV file into each table directory.
hadoop fs -put tr.csv /criteo/train
hadoop fs -put te.csv /criteo/test
-- Create the `criteo` database on first use, then make it the current database
-- so that the table definitions below are created inside it.
CREATE DATABASE IF NOT EXISTS criteo;
USE criteo;
-- External table over the comma-separated training data stored under
-- /criteo/train. The table is dropped first so the script can be re-run.
-- Each `--` comment must sit on its own line: it extends to the end of the
-- line and would otherwise comment out the rest of the statement.
DROP TABLE IF EXISTS train;
CREATE EXTERNAL TABLE train (
  id bigint,
  label int,
  -- quantitative features
  i1 int, i2 int, i3 int, i4 int, i5 int, i6 int, i7 int,
  i8 int, i9 int, i10 int, i11 int, i12 int, i13 int,
  -- categorical features
  c1 string, c2 string, c3 string, c4 string, c5 string, c6 string, c7 string,
  c8 string, c9 string, c10 string, c11 string, c12 string, c13 string,
  c14 string, c15 string, c16 string, c17 string, c18 string, c19 string,
  c20 string, c21 string, c22 string, c23 string, c24 string, c25 string,
  c26 string
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/criteo/train';
-- External table over the comma-separated test data stored under
-- /criteo/test. The table is dropped first so the script can be re-run.
-- Each `--` comment must sit on its own line: it extends to the end of the
-- line and would otherwise comment out the rest of the statement.
-- NOTE(review): unlike `train`, this table declares no leading `id` column;
-- confirm that this matches the column layout of te.csv.
DROP TABLE IF EXISTS test;
CREATE EXTERNAL TABLE test (
  label int,
  -- quantitative features
  i1 int, i2 int, i3 int, i4 int, i5 int, i6 int, i7 int,
  i8 int, i9 int, i10 int, i11 int, i12 int, i13 int,
  -- categorical features
  c1 string, c2 string, c3 string, c4 string, c5 string, c6 string, c7 string,
  c8 string, c9 string, c10 string, c11 string, c12 string, c13 string,
  c14 string, c15 string, c16 string, c17 string, c18 string, c19 string,
  c20 string, c21 string, c22 string, c23 string, c24 string, c25 string,
  c26 string
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/criteo/test';