docs/gitbook/ft_engineering/scaling.md - incubator-hivemall - Git at Google

 <!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
 -->

 <!-- toc -->

 # L1/L2 Normalization

 [L1](http://mathworld.wolfram.com/L1-Norm.html) and [L2](http://mathworld.wolfram.com/L2-Norm.html) normalization ensures that each feature vector has unit length:

 ```sql
 select l1_normalize(array('apple:1.0', 'banana:0.5'))
 ```

 > ["apple:0.6666667","banana:0.33333334"]

 ```sql
 select l2_normalize(array('apple:1.0', 'banana:0.5'))
 ```

 > ["apple:0.8944272","banana:0.4472136"]

 # Min-Max Normalization

 [Min-max normalization](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling) converts values to range `[0.0,1.0]`.

 ```sql
 select
   rescale(target, min(target) over (), max(target) over ()) as target
 from
   e2006tfidf_train
 ```

 It can also expressed without Windowing function as follows:

 ```sql
 select min(target), max(target)
 from (
   select target from e2006tfidf_train
 -- union all
 -- select target from e2006tfidf_test
 ) t;
 ```

 > -7.899578       -0.51940954

 ```sql
 set hivevar:min_target=-7.899578;
 set hivevar:max_target=-0.51940954;

 create or replace view e2006tfidf_train_scaled
 as
 select
   rowid,
   rescale(target, ${min_target}, ${max_target}) as target,
   features
 from
   e2006tfidf_train;
 ```

 # Feature scaling by zscore

 Refer [this article](https://en.wikipedia.org/wiki/Standard_score) to get details about Zscore.

 ```sql
 select
   zscore(target, avg(target) over (), stddev_pop(target) over ()) as target
 from
   e2006tfidf_train;
 ```

 # Apply Normalization to more complex feature vector

 Apply normalization to the following data.

 ```sql
 create table train as
 select
   1 as rowid, array("weight:69.613","specific_heat:129.07","reflectance:52.111") as features
 UNION ALL
 select
   2 as rowid, array("weight:70.67","specific_heat:128.161","reflectance:52.446") as features
 UNION ALL
 select
   3 as rowid, array("weight:72.303","specific_heat:128.45","reflectance:52.853") as features

 select rowid, features from train;
 ```

 ```
 1       ["weight:69.613","specific_heat:129.07","reflectance:52.111"]
 2       ["weight:70.67","specific_heat:128.161","reflectance:52.446"]
 3       ["weight:72.303","specific_heat:128.45","reflectance:52.853"]
 ```

 We can create a normalized table as follows:

 ```sql
 create table train_normalized
 as
 WITH exploded as (
   select
     rowid,
     extract_feature(feature) as feature,
     extract_weight(feature) as value
   from
     train
     LATERAL VIEW explode(features) exploded AS feature
 ),
 scaled as (
   select
     rowid,
     feature,
     rescale(value, min(value) over (partition by feature), max(value) over (partition by feature)) as minmax,
     zscore(value, avg(value) over (partition by feature), stddev_pop(value) over (partition by feature)) as zscore
   from
     exploded
 )
 select
   rowid,
   collect_list(feature(feature, minmax)) as features
 from
   scaled
 group by
   rowid;
 ```

 ```
 1       ["reflectance:0.0","specific_heat:1.0","weight:0.0"]
 2       ["reflectance:0.4514809","specific_heat:0.0","weight:0.39293614"]
 3       ["reflectance:1.0","specific_heat:0.31792927","weight:1.0"]
 ...
 ```
	<!--
	Licensed to the Apache Software Foundation (ASF) under one
	or more contributor license agreements. See the NOTICE file
	distributed with this work for additional information
	regarding copyright ownership. The ASF licenses this file
	to you under the Apache License, Version 2.0 (the
	"License"); you may not use this file except in compliance
	with the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing,
	software distributed under the License is distributed on an
	"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	KIND, either express or implied. See the License for the
	specific language governing permissions and limitations
	under the License.
	-->

	<!-- toc -->

	# L1/L2 Normalization

	[L1](http://mathworld.wolfram.com/L1-Norm.html) and [L2](http://mathworld.wolfram.com/L2-Norm.html) normalization ensures that each feature vector has unit length:

	```sql
	select l1_normalize(array('apple:1.0', 'banana:0.5'))
	```

	> ["apple:0.6666667","banana:0.33333334"]

	```sql
	select l2_normalize(array('apple:1.0', 'banana:0.5'))
	```

	> ["apple:0.8944272","banana:0.4472136"]

	# Min-Max Normalization

	[Min-max normalization](https://en.wikipedia.org/wiki/Feature_scaling#Rescaling) converts values to range `[0.0,1.0]`.

	```sql
	select
	rescale(target, min(target) over (), max(target) over ()) as target
	from
	e2006tfidf_train
	```

	It can also expressed without Windowing function as follows:

	```sql
	select min(target), max(target)
	from (
	select target from e2006tfidf_train
	-- union all
	-- select target from e2006tfidf_test
	) t;
	```

	> -7.899578 -0.51940954

	```sql
	set hivevar:min_target=-7.899578;
	set hivevar:max_target=-0.51940954;

	create or replace view e2006tfidf_train_scaled
	as
	select
	rowid,
	rescale(target, ${min_target}, ${max_target}) as target,
	features
	from
	e2006tfidf_train;
	```

	# Feature scaling by zscore

	Refer [this article](https://en.wikipedia.org/wiki/Standard_score) to get details about Zscore.

	```sql
	select
	zscore(target, avg(target) over (), stddev_pop(target) over ()) as target
	from
	e2006tfidf_train;
	```

	# Apply Normalization to more complex feature vector

	Apply normalization to the following data.

	```sql
	create table train as
	select
	1 as rowid, array("weight:69.613","specific_heat:129.07","reflectance:52.111") as features
	UNION ALL
	select
	2 as rowid, array("weight:70.67","specific_heat:128.161","reflectance:52.446") as features
	UNION ALL
	select
	3 as rowid, array("weight:72.303","specific_heat:128.45","reflectance:52.853") as features

	select rowid, features from train;
	```

	```
	1 ["weight:69.613","specific_heat:129.07","reflectance:52.111"]
	2 ["weight:70.67","specific_heat:128.161","reflectance:52.446"]
	3 ["weight:72.303","specific_heat:128.45","reflectance:52.853"]
	```

	We can create a normalized table as follows:

	```sql
	create table train_normalized
	as
	WITH exploded as (
	select
	rowid,
	extract_feature(feature) as feature,
	extract_weight(feature) as value
	from
	train
	LATERAL VIEW explode(features) exploded AS feature
	),
	scaled as (
	select
	rowid,
	feature,
	rescale(value, min(value) over (partition by feature), max(value) over (partition by feature)) as minmax,
	zscore(value, avg(value) over (partition by feature), stddev_pop(value) over (partition by feature)) as zscore
	from
	exploded
	)
	select
	rowid,
	collect_list(feature(feature, minmax)) as features
	from
	scaled
	group by
	rowid;
	```

	```
	1 ["reflectance:0.0","specific_heat:1.0","weight:0.0"]
	2 ["reflectance:0.4514809","specific_heat:0.0","weight:0.39293614"]
	3 ["reflectance:1.0","specific_heat:0.31792927","weight:1.0"]
	...
	```