L1 and L2 normalization ensures that each feature vector has unit length:
select l1_normalize(array('apple:1.0', 'banana:0.5'))
[“apple:0.6666667”,“banana:0.33333334”]
select l2_normalize(array('apple:1.0', 'banana:0.5'))
[“apple:0.8944272”,“banana:0.4472136”]
https://en.wikipedia.org/wiki/Feature_scaling#Rescaling
select min(target), max(target) from ( select target from e2006tfidf_train -- union all -- select target from e2006tfidf_test ) t;
-7.899578 -0.51940954
set hivevar:min_target=-7.899578; set hivevar:max_target=-0.51940954; create or replace view e2006tfidf_train_scaled as select rowid, rescale(target, ${min_target}, ${max_target}) as target, features from e2006tfidf_train;
https://en.wikipedia.org/wiki/Standard_score
select avg(target), stddev_pop(target) from ( select target from e2006tfidf_train -- union all -- select target from e2006tfidf_test ) t;
-3.566241460963296 0.6278076335455348
set hivevar:mean_target=-3.566241460963296; set hivevar:stddev_target=0.6278076335455348; create or replace view e2006tfidf_train_scaled as select rowid, zscore(target, ${mean_target}, ${stddev_target}) as target, features from e2006tfidf_train;
Apply normalization to the following data.
select rowid, features from train limit 3;
1 ["weight:69.613","specific_heat:129.07","reflectance:52.111"] 2 ["weight:70.67","specific_heat:128.161","reflectance:52.446"] 3 ["weight:72.303","specific_heat:128.45","reflectance:52.853"]
We can create a normalized table as follows:
create table train_normalized as WITH fv as ( select rowid, extract_feature(feature) as feature, extract_weight(feature) as value from train LATERAL VIEW explode(features) exploded AS feature ), stats as ( select feature, -- avg(value) as mean, stddev_pop(value) as stddev min(value) as min, max(value) as max from fv group by feature ), norm as ( select rowid, t1.feature, -- zscore(t1.value, t2.mean, t2.stddev) as zscore rescale(t1.value, t2.min, t2.max) as minmax from fv t1 JOIN stats t2 ON (t1.feature = t2.feature) ), norm_fv as ( select rowid, -- concat(feature, ":", zscore) as feature -- concat(feature, ":", minmax) as feature -- Before Hivemall v0.3.2-1 feature(feature, minmax) as feature -- Hivemall v0.3.2-1 or later from norm ) select rowid, collect_list(feature) as features from norm_fv group by rowid ;
1 ["reflectance:0.5252967","specific_heat:0.19863537","weight:0.0"] 2 ["reflectance:0.5950446","specific_heat:0.09166764","weight:0.052084323"] 3 ["reflectance:0.6797837","specific_heat:0.12567581","weight:0.13255163"] ...
WITH quantative as ( select id, true as minmax, "age" as feature, age as value from train union all select id, false as minmax, "balance" as feature, balance as value from train union all select id, true as minmax, "day" as feature, day as value from train union all select id, false as minmax, "duration" as feature, duration as value from train union all select id, false as minmax, "campaign" as feature, campaign as value from train union all select id, false as minmax, "pdays" as feature, if(pdays = -1, 0, pdays) as value from train union all select id, false as minmax, "previous" as feature, previous as value from train ), quantative_stats as ( select feature, avg(value) as mean, stddev_pop(value) as stddev, min(value) as min, max(value) as max from quantative group by feature ), quantative_norm as ( select t1.id, collect_list( feature( t1.feature, if(t1.minmax,rescale(t1.value, t2.min, t2.max),zscore(t1.value, t2.mean, t2.stddev)) ) ) as features from quantative t1 JOIN quantative_stats t2 ON (t1.feature = t2.feature) group by t1.id ) ...