Dataset preparation

Iris Dataset: https://archive.ics.uci.edu/ml/datasets/Iris

$ wget http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
$ less iris.data

   ...
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
   ...

Create a table for the raw data in Hive

create database iris;
use iris;

create external table iris_raw (
  rowid int,
  label string,
  features array<float>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '|'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/iris/raw';

Loading data into HDFS

$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | head -3

1|Iris-setosa|5.1,3.5,1.4,0.2
2|Iris-setosa|4.9,3.0,1.4,0.2
3|Iris-setosa|4.7,3.2,1.3,0.2
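
If the target HDFS directory does not exist yet, create it first (the path comes from the LOCATION clause of the external table above):

$ hadoop fs -mkdir -p /dataset/iris/raw
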
$ awk -F"," 'NF >0 {OFS="|"; print NR,$5,$1","$2","$3","$4}' iris.data | hadoop fs -put - /dataset/iris/raw/iris.data

Verify that all 150 records were loaded:

select count(1) from iris_raw;

> 150

Feature scaling

Normalizing feature values is important for obtaining good predictions in machine learning. Here min-max scaling is used, so first find the minimum and maximum of each feature:

select 
  min(features[0]), max(features[0]),
  min(features[1]), max(features[1]),
  min(features[2]), max(features[2]),
  min(features[3]), max(features[3])
from
  iris_raw;

> 4.3     7.9     2.0     4.4     1.0     6.9     0.1     2.5

Set the observed minimum/maximum values as Hive variables:

set hivevar:f0_min=4.3;
set hivevar:f0_max=7.9;
set hivevar:f1_min=2.0;
set hivevar:f1_max=4.4;
set hivevar:f2_min=1.0;
set hivevar:f2_max=6.9;
set hivevar:f3_min=0.1;
set hivevar:f3_max=2.5;

create or replace view iris_scaled
as
select
  rowid, 
  label,
  add_bias(array(
     concat("1:", rescale(features[0],${f0_min},${f0_max})), 
     concat("2:", rescale(features[1],${f1_min},${f1_max})), 
     concat("3:", rescale(features[2],${f2_min},${f2_max})), 
     concat("4:", rescale(features[3],${f3_min},${f3_max}))
  )) as features
from 
  iris_raw;

select * from iris_scaled limit 3;

> 1       Iris-setosa     ["1:0.22222215","2:0.625","3:0.0677966","4:0.041666664","0:1.0"]
> 2       Iris-setosa     ["1:0.16666664","2:0.41666666","3:0.0677966","4:0.041666664","0:1.0"]
> 3       Iris-setosa     ["1:0.11111101","2:0.5","3:0.05084745","4:0.041666664","0:1.0"]
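
The rescale UDF applies min-max scaling, mapping a value v to (v - min) / (max - min). As a quick check (a sketch; 5.1 is the sepal length of the first record in iris.data), the result should match the "1:0.22222215" entry for rowid 1 above:

select rescale(5.1, ${f0_min}, ${f0_max});

> 0.22222215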

The LibSVM web page provides a version of the Iris dataset that is already normalized using z-score.
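
If you prefer z-score scaling over min-max scaling, a similar transformation can be written with Hivemall's zscore UDF. The sketch below assumes zscore(value, mean, stddev) is available in your Hivemall build and scales only the first feature for brevity:

with stats as (
  select avg(features[0]) as m0, stddev_pop(features[0]) as s0
  from iris_raw
)
select
  r.rowid,
  r.label,
  concat("1:", zscore(r.features[0], s.m0, s.s0)) as f0_scaled
from
  iris_raw r
  cross join stats s
limit 3;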

Create training/test data

set hivevar:rand_seed=31;

create table iris_shuffled 
as
select rand(${rand_seed}) as rnd, * from iris_scaled;

-- 80% for training
create table train80p as
select * from iris_shuffled 
order by rnd DESC
limit 120;

-- 20% for testing
create table test20p as
select * from iris_shuffled 
order by rnd ASC
limit 30;
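
As a sanity check, the two tables should contain 120 and 30 rows, respectively:

select count(1) from train80p;

> 120

select count(1) from test20p;

> 30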

create table test20p_exploded 
as
select 
  rowid,
  label,
  extract_feature(feature) as feature,
  extract_weight(feature) as value
from 
  test20p LATERAL VIEW explode(features) t AS feature;
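
extract_feature and extract_weight split each "index:value" entry into its feature index and its numeric value, giving one row per feature, which is convenient when evaluating predictions later. A quick sketch of what the two UDFs return, using a value from the scaled rows shown earlier:

select extract_feature("1:0.22222215"), extract_weight("1:0.22222215");

> 1       0.22222215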

Define an amplified view for the training input

set hivevar:xtimes=10;
set hivevar:shufflebuffersize=1000;

create or replace view training_x10
as
select
   rand_amplify(${xtimes}, ${shufflebuffersize}, rowid, label, features) as (rowid, label, features)
from  
   train80p;
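
Since rand_amplify emits each input row ${xtimes} times (shuffling within a buffer of ${shufflebuffersize} rows), the amplified view should yield 120 × 10 = 1200 rows:

select count(1) from training_x10;

> 1200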