<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
The task is to predict the click-through rate (CTR) of advertisements, i.e., the probability that each ad will be clicked.
http://www.kddcup2012.org/c/kddcup2012-track2
---
**Dataset**
| File | Size | Records |
|:-----------|------------:|:------------:|
| KDD_Track2_solution.csv | 244MB | 20,297,595 (20,297,594 w/o header) |
| descriptionid_tokensid.txt | 268MB | 3,171,830 |
| purchasedkeywordid_tokensid.txt | 26MB | 1,249,785 |
| queryid_tokensid.txt | 704MB | 26,243,606 |
| test.txt | 1.3GB | 20,297,594 |
| titleid_tokensid.txt | 171MB | 4,051,441 |
| training.txt | 9.9GB | 149,639,105 |
| userid_profile.txt | 283MB | 23,669,283 |
![tables](../resources/images/kddtrack2tables.png)
_Tokens are not used in this example. Try using them on your own (a rough sketch appears at the end of the feature hashing section below)._
---
```sql
create database kdd12track2;
use kdd12track2;
delete jar /tmp/hivemall.jar;
add jar /tmp/hivemall.jar;
source /tmp/define-all.hive;
Create external table training (
RowID BIGINT,
Clicks INT,
Impression INT,
DisplayURL STRING,
AdID INT,
AdvertiserID INT,
Depth SMALLINT,
Position SMALLINT,
QueryID INT,
KeywordID INT,
TitleID INT,
DescriptionID INT,
UserID INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/training';
Create external table testing (
RowID BIGINT,
DisplayURL STRING,
AdID INT,
AdvertiserID INT,
Depth SMALLINT,
Position SMALLINT,
QueryID INT,
KeywordID INT,
TitleID INT,
DescriptionID INT,
UserID INT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/testing';
Create external table user (
UserID INT,
Gender TINYINT,
Age TINYINT
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/user';
Create external table query (
QueryID INT,
Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/query';
Create external table keyword (
KeywordID INT,
Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/keyword';
Create external table title (
TitleID INT,
Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/title';
Create external table description (
DescriptionID INT,
Tokens STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/description';
Create external table solution (
RowID BIGINT,
Clicks INT,
Impressions INT,
Private BOOLEAN
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/kddcup2012/track2/solution';
```
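Depending on your Hive version, the external-table locations above may or may not be created automatically. If they are not, a small helper like the following (a sketch, not part of the original script) creates them before the files are pushed:
```sh
# Create the HDFS directories that back the external tables defined above.
for d in training testing user query keyword title description solution; do
  hadoop fs -mkdir -p /kddcup2012/track2/$d
done
```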
```sh
gawk '{print NR"\t"$0;}' training.txt | \
hadoop fs -put - /kddcup2012/track2/training/training.tsv
gawk '{print NR"\t"$0;}' test.txt | \
hadoop fs -put - /kddcup2012/track2/testing/test.tsv
hadoop fs -put userid_profile.txt /kddcup2012/track2/user/user.tsv
tail -n +2 KDD_Track2_solution.csv | sed -e 's/Public/FALSE/g' | sed -e 's/Private/TRUE/g' | gawk '{print NR","$0;}' | \
hadoop fs -put - /kddcup2012/track2/solution/solution.csv
hadoop fs -put queryid_tokensid.txt /kddcup2012/track2/query/tokensid.tsv
hadoop fs -put purchasedkeywordid_tokensid.txt /kddcup2012/track2/keyword/tokensid.tsv
hadoop fs -put titleid_tokensid.txt /kddcup2012/track2/title/tokensid.tsv
hadoop fs -put descriptionid_tokensid.txt /kddcup2012/track2/description/tokensid.tsv
```
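After loading, an optional sanity check (not part of the original walkthrough) is to compare the row counts of the external tables against the dataset table above:
```sql
-- Row counts should match the dataset table, e.g. 149,639,105 for training
-- and 20,297,594 for testing and solution (header removed).
select count(1) from training;
select count(1) from testing;
select count(1) from solution;
```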
## Converting feature representation by feature hashing
http://en.wikipedia.org/wiki/Feature_hashing
*mhash* applies the MurmurHash3 function to convert each feature (a string) into an integer hash value.
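The prefixes such as `"1:"` and `"2:"` act as feature namespaces, so that an `AdID` and a `QueryID` with the same numeric value do not collide into the same hashed feature. A quick sketch of how to inspect the hashed values (the ID below is made up, and the exact integers returned depend on the hash space size configured in your Hivemall version):
```sql
-- Illustrative only: the same numeric ID hashes differently per namespace.
select
  mhash(concat("2:", 20157098)) as hashed_adid,
  mhash(concat("6:", 20157098)) as hashed_queryid;
```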
```sql
create or replace view training2 as
select
rowid,
clicks,
(impression - clicks) as noclick,
mhash(concat("1:", displayurl)) as displayurl,
mhash(concat("2:", adid)) as adid,
mhash(concat("3:", advertiserid)) as advertiserid,
mhash(concat("4:", depth)) as depth,
mhash(concat("5:", position)) as position,
mhash(concat("6:", queryid)) as queryid,
mhash(concat("7:", keywordid)) as keywordid,
mhash(concat("8:", titleid)) as titleid,
mhash(concat("9:", descriptionid)) as descriptionid,
mhash(concat("10:", userid)) as userid,
mhash(concat("11:", COALESCE(gender,"0"))) as gender,
mhash(concat("12:", COALESCE(age,"-1"))) as age,
-1 as bias
from (
select
t.*,
u.gender,
u.age
from
training t
LEFT OUTER JOIN user u
on t.userid = u.userid
) t;
create or replace view testing2 as
select
rowid,
array(displayurl, adid, advertiserid, depth, position, queryid, keywordid, titleid, descriptionid, userid, gender, age, bias)
as features
from (
select
rowid,
mhash(concat("1:", displayurl)) as displayurl,
mhash(concat("2:", adid)) as adid,
mhash(concat("3:", advertiserid)) as advertiserid,
mhash(concat("4:", depth)) as depth,
mhash(concat("5:", position)) as position,
mhash(concat("6:", queryid)) as queryid,
mhash(concat("7:", keywordid)) as keywordid,
mhash(concat("8:", titleid)) as titleid,
mhash(concat("9:", descriptionid)) as descriptionid,
mhash(concat("10:", userid)) as userid,
mhash(concat("11:", COALESCE(gender,"0"))) as gender,
mhash(concat("12:", COALESCE(age,"-1"))) as age,
-1 as bias
from (
select
t.*,
u.gender,
u.age
from
testing t
LEFT OUTER JOIN user u
on t.userid = u.userid
) t1
) t2;
```
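As noted earlier, tokens are left out of these views. If you want to experiment with them, one rough sketch (not part of the original example, assuming the token lists are '|'-separated as in the original dataset files) is to explode each token list and hash the tokens into the same feature space; the `"13:"` namespace here is hypothetical, simply continuing the numbering above:
```sql
-- Sketch: hash query tokens as additional features (namespace "13:" is hypothetical).
select
  q.queryid,
  mhash(concat("13:", token)) as token_feature
from
  query q
  LATERAL VIEW explode(split(q.tokens, "\\|")) t AS token
limit 10;
```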
## Compressing large training tables
```sql
create table training_orcfile (
rowid bigint,
label float,
features array<int>
) STORED AS orc tblproperties ("orc.compress"="SNAPPY");
```
_Caution: Joining the training table with the user table takes a long time. If your Hadoop cluster is small, consider leaving out gender and age so that the join can be avoided; a sketch of such a variant follows._
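One way to do that is shown below (the view name `training2_nojoin` is made up here, and the `kddconv.awk` conversion that follows would need a matching change since the gender and age columns disappear):
```sql
-- Sketch: a join-free variant of training2 that omits gender and age.
create or replace view training2_nojoin as
select
  rowid,
  clicks,
  (impression - clicks) as noclick,
  mhash(concat("1:", displayurl)) as displayurl,
  mhash(concat("2:", adid)) as adid,
  mhash(concat("3:", advertiserid)) as advertiserid,
  mhash(concat("4:", depth)) as depth,
  mhash(concat("5:", position)) as position,
  mhash(concat("6:", queryid)) as queryid,
  mhash(concat("7:", keywordid)) as keywordid,
  mhash(concat("8:", titleid)) as titleid,
  mhash(concat("9:", descriptionid)) as descriptionid,
  mhash(concat("10:", userid)) as userid,
  -1 as bias
from training;
```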
[kddconv.awk](https://github.com/myui/hivemall/blob/master/resources/examples/kddtrack2/kddconv.awk)
```sql
add file /tmp/kddconv.awk;
-- SET mapred.reduce.tasks=64;
-- SET hive.auto.convert.join=false;
INSERT OVERWRITE TABLE training_orcfile
select transform(*)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
LINES TERMINATED BY "\n"
using 'gawk -f kddconv.awk'
as (rowid BIGINT, label FLOAT, features ARRAY<INT>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY "\t"
COLLECTION ITEMS TERMINATED BY ","
LINES TERMINATED BY "\n"
from training2
CLUSTER BY rand();
-- SET mapred.reduce.tasks=-1;
-- SET hive.auto.convert.join=true;
create table testing_exploded as
select
rowid,
feature
from
testing2
LATERAL VIEW explode(features) t AS feature;
```
_Caution: We recommend setting "mapred.reduce.tasks" in the above example so that the training_orcfile table is partitioned into multiple pieces._
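For instance (the value 64 is only illustrative; pick something that fits your cluster):
```sql
-- Set the number of reducers before the INSERT so that training_orcfile is
-- written as that many files, then restore the default afterwards.
SET mapred.reduce.tasks=64;
-- ... run the INSERT OVERWRITE TABLE training_orcfile query above ...
SET mapred.reduce.tasks=-1;
```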