L1/L2 Normalization

L1 and L2 normalization ensures that each feature vector has unit length:

select l1_normalize(array('apple:1.0', 'banana:0.5'))

[“apple:0.6666667”,“banana:0.33333334”]

select l2_normalize(array('apple:1.0', 'banana:0.5'))

[“apple:0.8944272”,“banana:0.4472136”]

Min-Max Normalization

Min-max normalization converts values to range [0.0,1.0].

select 
  rescale(target, min(target) over (), max(target) over ()) as target
from
  e2006tfidf_train

It can also expressed without Windowing function as follows:

select min(target), max(target)
from (
  select target from e2006tfidf_train 
-- union all
-- select target from e2006tfidf_test 
) t;

-7.899578 -0.51940954

set hivevar:min_target=-7.899578;
set hivevar:max_target=-0.51940954;

create or replace view e2006tfidf_train_scaled 
as
select 
  rowid,
  rescale(target, ${min_target}, ${max_target}) as target, 
  features
from 
  e2006tfidf_train;

Feature scaling by zscore

Refer this article to get details about Zscore.

select 
  zscore(target, avg(target) over (), stddev_pop(target) over ()) as target
from 
  e2006tfidf_train;

Apply Normalization to more complex feature vector

Apply normalization to the following data.

create table train as 
select 
  1 as rowid, array("weight:69.613","specific_heat:129.07","reflectance:52.111") as features
UNION ALL
select 
  2 as rowid, array("weight:70.67","specific_heat:128.161","reflectance:52.446") as features
UNION ALL
select 
  3 as rowid, array("weight:72.303","specific_heat:128.45","reflectance:52.853") as features

select rowid, features from train;
1       ["weight:69.613","specific_heat:129.07","reflectance:52.111"]
2       ["weight:70.67","specific_heat:128.161","reflectance:52.446"]
3       ["weight:72.303","specific_heat:128.45","reflectance:52.853"]

We can create a normalized table as follows:

create table train_normalized
as
WITH exploded as (
  select 
    rowid, 
    extract_feature(feature) as feature,
    extract_weight(feature) as value
  from 
    train 
    LATERAL VIEW explode(features) exploded AS feature
), 
scaled as (
  select 
    rowid,
    feature,
    rescale(value, min(value) over (partition by feature), max(value) over (partition by feature)) as minmax,
    zscore(value, avg(value) over (partition by feature), stddev_pop(value) over (partition by feature)) as zscore
  from 
    exploded
)
select
  rowid,
  collect_list(feature(feature, minmax)) as features
from
  scaled
group by
  rowid;
1       ["reflectance:0.0","specific_heat:1.0","weight:0.0"]
2       ["reflectance:0.4514809","specific_heat:0.0","weight:0.39293614"]
3       ["reflectance:1.0","specific_heat:0.31792927","weight:1.0"]
...