The task is to predict the click-through rate (CTR) of ads, i.e., the probability of each ad being clicked: http://www.kddcup2012.org/c/kddcup2012-track2


Dataset

File                             Size   Records
KDD_Track2_solution.csv          244MB  20,297,595 (20,297,594 w/o header)
descriptionid_tokensid.txt       268MB  3,171,830
purchasedkeywordid_tokensid.txt  26MB   1,249,785
queryid_tokensid.txt             704MB  26,243,606
test.txt                         1.3GB  20,297,594
titleid_tokensid.txt             171MB  4,051,441
training.txt                     9.9GB  149,639,105
userid_profile.txt               283MB  23,669,283

Tables

Note that the token files are not used in this example; try incorporating them on your own, e.g., along the lines of the sketch below.
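For instance, a minimal sketch of one way to fold the query tokens into the feature set, relying on the query table and the mhash function introduced later on this page (the "13:" namespace prefix and the '|' token separator are assumptions, not part of the original recipe):

-- Hypothetical extension: hash each query token into the shared feature space,
-- using a fresh namespace prefix ("13:") so token hashes cannot collide with
-- the 12 column namespaces used below.
select
  queryid,
  collect_set(mhash(concat("13:", token))) as query_token_features
from
  query
  LATERAL VIEW explode(split(tokens, "\\|")) t AS token
group by queryid;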


create database kdd12track2;
use kdd12track2;

delete jar /tmp/hivemall.jar;
add jar /tmp/hivemall.jar;
source /tmp/define-all.hive;

create external table training (
  RowID BIGINT,
  Clicks INT, 
  Impression INT, 
  DisplayURL STRING, 
  AdID INT,
  AdvertiserID INT, 
  Depth SMALLINT, 
  Position SMALLINT, 
  QueryID INT, 
  KeywordID INT,
  TitleID INT, 
  DescriptionID INT, 
  UserID INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/training';

create external table testing (
  RowID BIGINT,
  DisplayURL STRING, 
  AdID INT,
  AdvertiserID INT, 
  Depth SMALLINT, 
  Position SMALLINT, 
  QueryID INT, 
  KeywordID INT,
  TitleID INT, 
  DescriptionID INT, 
  UserID INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/testing';

create external table user (
  UserID INT, 
  Gender TINYINT,
  Age TINYINT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/user';

create external table query (
  QueryID INT,
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/query';

create external table keyword (
  KeywordID INT,
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/keyword';

create external table title (
  TitleID INT, 
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/title';

create external table description (
  DescriptionID INT,
  Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/description';

create external table solution (
   RowID BIGINT,
   Clicks INT,
   Impressions INT,
   Private BOOLEAN 
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/solution';

Then load the files into HDFS, prepending a sequential RowID to the training and test records, and converting the solution file's Public/Private flag into a BOOLEAN:

gawk '{print NR"\t"$0;}' training.txt | \
hadoop fs -put - /kddcup2012/track2/training/training.tsv

gawk '{print NR"\t"$0;}' test.txt | \
hadoop fs -put - /kddcup2012/track2/testing/test.tsv

hadoop fs -put userid_profile.txt /kddcup2012/track2/user/user.tsv

tail -n +2 KDD_Track2_solution.csv | sed -e 's/Public/FALSE/g' -e 's/Private/TRUE/g' | gawk '{print NR","$0;}' | \
hadoop fs -put - /kddcup2012/track2/solution/solution.csv

hadoop fs -put  queryid_tokensid.txt /kddcup2012/track2/query/tokensid.tsv
hadoop fs -put purchasedkeywordid_tokensid.txt /kddcup2012/track2/keyword/tokensid.tsv
hadoop fs -put titleid_tokensid.txt /kddcup2012/track2/title/tokensid.tsv
hadoop fs -put descriptionid_tokensid.txt /kddcup2012/track2/description/tokensid.tsv
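
After loading, a quick sanity check (not part of the original steps) is to compare the row counts against the Dataset table above:

select count(1) from training;  -- expected: 149,639,105
select count(1) from testing;   -- expected: 20,297,594
select count(1) from user;      -- expected: 23,669,283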

Converting feature representation by feature hashing

http://en.wikipedia.org/wiki/Feature_hashing

mhash is Hivemall's MurmurHash3-based function; it converts a feature string into an integer hash value. The "1:", "2:", ... prefixes attached below namespace each column, so that identical raw IDs coming from different columns (e.g., an AdID and a UserID that both happen to be 10) hash to different features.
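A small illustration of that pattern (the literal IDs are made up; `from solution limit 1` is just a way to evaluate the expressions once on older Hive versions):

select
  mhash(concat("2:", 10))  as adid_10,     -- AdID namespace
  mhash(concat("10:", 10)) as userid_10,   -- UserID namespace; differs barring a hash collision
  mhash(concat("2:", 10))  as adid_again   -- identical input, identical hash value
from solution limit 1;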

create or replace view training2 as
select
  rowid,
  clicks,
  (impression - clicks) as noclick,
  mhash(concat("1:", displayurl)) as displayurl, 
  mhash(concat("2:", adid)) as adid, 
  mhash(concat("3:", advertiserid)) as advertiserid, 
  mhash(concat("4:", depth)) as depth, 
  mhash(concat("5:", position)) as position, 
  mhash(concat("6:", queryid)) as queryid, 
  mhash(concat("7:", keywordid)) as keywordid, 
  mhash(concat("8:", titleid)) as titleid, 
  mhash(concat("9:", descriptionid)) as descriptionid, 
  mhash(concat("10:", userid)) as userid, 
  mhash(concat("11:", COALESCE(gender,"0"))) as gender, 
  mhash(concat("12:", COALESCE(age,"-1"))) as age, 
  -1 as bias
from (
select
  t.*,
  u.gender,
  u.age
from 
  training t 
  LEFT OUTER JOIN user u 
    on t.userid = u.userid
) t;

create or replace view testing2 as
select
  rowid, 
  array(displayurl, adid, advertiserid, depth, position, queryid, keywordid, titleid, descriptionid, userid, gender, age, bias) 
    as features
from (
select
  rowid,
  mhash(concat("1:", displayurl)) as displayurl, 
  mhash(concat("2:", adid)) as adid, 
  mhash(concat("3:", advertiserid)) as advertiserid, 
  mhash(concat("4:", depth)) as depth, 
  mhash(concat("5:", position)) as position, 
  mhash(concat("6:", queryid)) as queryid, 
  mhash(concat("7:", keywordid)) as keywordid, 
  mhash(concat("8:", titleid)) as titleid, 
  mhash(concat("9:", descriptionid)) as descriptionid, 
  mhash(concat("10:", userid)) as userid, 
  mhash(concat("11:", COALESCE(gender,"0"))) as gender, 
  mhash(concat("12:", COALESCE(age,"-1"))) as age, 
  -1 as bias
from (
select
  t.*,
  u.gender,
  u.age
from 
  testing t 
  LEFT OUTER JOIN user u 
    on t.userid = u.userid
) t1
) t2;
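
Each feature vector built above should contain exactly 13 elements (12 hashed features plus the constant bias term). A quick check, not in the original recipe:

select size(features) as num_features, count(1) as cnt
from testing2
group by size(features);
-- expect a single group: num_features = 13, cnt = 20,297,594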

Compressing large training tables

create table training_orcfile (
 rowid bigint,
 label float,
 features array<int>
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");

Caution: joining the training table with the user table takes a long time. If your Hadoop cluster is small, consider dropping the gender and age features so that the join can be avoided.

The transform below relies on the kddconv.awk script, which expands each aggregated (clicks, noclick) row into individual labeled examples: one positive example per click and one negative example per non-clicked impression.
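A minimal sketch of such a script, assuming the input schema of training2 (rowid, clicks, noclick, then the feature columns, tab-separated) and the output row format declared in the query below:

#!/usr/bin/gawk -f
# Input : rowid \t clicks \t noclick \t feature1 \t ... \t featureN
# Output: "rowid \t label \t feature1,...,featureN", one line per impression:
#         label 1.0 repeated `clicks` times, label 0.0 repeated `noclick` times.
{
    features = $4;
    for (i = 5; i <= NF; i++) {
        features = features "," $i;
    }
    for (c = 0; c < $2; c++) { print $1 "\t" "1.0" "\t" features; }
    for (c = 0; c < $3; c++) { print $1 "\t" "0.0" "\t" features; }
}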

add file /tmp/kddconv.awk;

-- SET mapred.reduce.tasks=64;
-- SET hive.auto.convert.join=false;

INSERT OVERWRITE TABLE training_orcfile 
select transform(*) 
  ROW FORMAT DELIMITED
     FIELDS TERMINATED BY "\t"
     LINES TERMINATED BY "\n"
using 'gawk -f kddconv.awk' 
  as (rowid BIGINT, label FLOAT, features ARRAY<INT>)
  ROW FORMAT DELIMITED
     FIELDS TERMINATED BY "\t"
     COLLECTION ITEMS TERMINATED BY ","
     LINES TERMINATED BY "\n"
from training2
CLUSTER BY rand();

-- SET mapred.reduce.tasks=-1;
-- SET hive.auto.convert.join=true;

create table testing_exploded as
select 
  rowid,
  feature
from 
  testing2 
  LATERAL VIEW explode(features) t AS feature;
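
Since every test row expands into 13 (rowid, feature) pairs, one more optional sanity check:

select count(1) from testing_exploded;
-- expected: 13 x 20,297,594 = 263,868,722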

Caution: we recommend setting "mapred.reduce.tasks" in the example above so that the training_orcfile table is partitioned into multiple pieces (the CLUSTER BY rand() shuffle produces one output file per reducer).