docs/gitbook/tips/ensemble_learning.md - incubator-hivemall - Git at Google

 <!--
   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing,
   software distributed under the License is distributed on an
   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
   KIND, either express or implied.  See the License for the
   specific language governing permissions and limitations
   under the License.
 -->

 This example explains how to run ensemble learning in Hivemall.
 Two heads are better than one? Let's verify it by ensemble learning.

 <!-- toc -->

 ---

 ## UDF preparation
 ```sql
 -- Remove any previously added copy of the Hivemall jar, then add it again.
 -- NOTE: the /home/myui/tmp paths look environment-specific; substitute your own locations.
 delete jar /home/myui/tmp/hivemall.jar;
 add jar /home/myui/tmp/hivemall.jar;

 -- Run the define-all.hive script (presumably registers the Hivemall functions used below -- confirm).
 source /home/myui/tmp/define-all.hive;
 ```

 # [Case1] Model ensemble/mixing

 ## training
 ```sql
 -- Enable parallel execution of query stages (8 threads) and fix the reducer
 -- count at 4 for the training query.
 SET hive.exec.parallel=true;
 SET hive.exec.parallel.thread.number=8;
 SET mapred.reduce.tasks=4;

 -- IF EXISTS keeps the script re-runnable when the table has not been created yet,
 -- regardless of the hive.exec.drop.ignorenonexistent setting.
 drop table if exists news20mc_ensemble_model1;

 -- Train CW, AROW and SCW on the same training table and merge the three
 -- resulting models into a single row per (label, feature).
 -- NOTE(review): argmin_kld is applied to feature while weight is merged with
 -- voted_avg -- confirm this is the intended combination.
 create table news20mc_ensemble_model1 as
 select
   label,
   -- cast(feature as int) as feature, -- hivemall v0.1
   argmin_kld(feature, covar) as feature, -- hivemall v0.2 or later
   voted_avg(weight) as weight
 from (
   -- CW model
   select
     -- train_multiclass_cw(add_bias(features),label) as (label,feature,weight)      -- hivemall v0.1
     train_multiclass_cw(add_bias(features),label) as (label,feature,weight,covar)   -- hivemall v0.2 or later
   from
     news20mc_train_x3
   union all
   -- AROW model
   select
     -- train_multiclass_arow(add_bias(features),label) as (label,feature,weight)    -- hivemall v0.1
     train_multiclass_arow(add_bias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later
   from
     news20mc_train_x3
   union all
   -- SCW model
   select
     -- train_multiclass_scw(add_bias(features),label) as (label,feature,weight)     -- hivemall v0.1
     train_multiclass_scw(add_bias(features),label) as (label,feature,weight,covar)  -- hivemall v0.2 or later
   from
     news20mc_train_x3
 ) t
 group by label, feature;

 -- reset to the default
 SET hive.exec.parallel=false;
 SET mapred.reduce.tasks=-1;
 ```

 ## prediction
 ```sql
 -- For each test row, score every label with the ensemble model and keep one
 -- (score, label) pair per rowid.
 create or replace view news20mc_ensemble_predict1 as
 select
   rowid,
   m.col0 as score,
   m.col1 as label
 from (
   -- maxrow (Hivemall) is expected to return the (score, label) row with the
   -- largest score for each rowid.
   select
     rowid,
     maxrow(score, label) as m
   from (
     -- Score of each (rowid, label): sum of model weight * feature value.
     select
       te.rowid,
       mdl.label,
       sum(mdl.weight * te.value) as score
     from
       news20mc_test_exploded te
       left outer join news20mc_ensemble_model1 mdl
         on (te.feature = mdl.feature)
     group by
       te.rowid,
       mdl.label
   ) scored
   group by
     rowid
 ) best;
 ```

 ## evaluation
 ```sql
 -- Pair the label stored in news20mc_test with the ensemble's predicted label,
 -- matched on rowid.
 create or replace view news20mc_ensemble_submit1 as
 select
   truth.label as actual,
   pred.label as predicted
 from
   news20mc_test truth
   join news20mc_ensemble_predict1 pred
     on (truth.rowid = pred.rowid);
 ```

 ```sql
 -- Accuracy: number of correct predictions divided by 3993
 -- (presumably the number of test rows -- confirm).
 select
   count(1) / 3993
 from
   news20mc_ensemble_submit1
 where
   actual = predicted;
 ```

 > 0.8494866015527173

 ## Cleaning

 ```sql
 -- IF EXISTS makes the cleanup safe to re-run; the views are dropped before
 -- the model table they read from.
 drop view if exists news20mc_ensemble_submit1;
 drop view if exists news20mc_ensemble_predict1;
 drop table if exists news20mc_ensemble_model1;
 ```
 ---

 Unfortunately, too many cooks spoil the broth in this case :-( The model ensemble beats AROW and SCW2 but does not reach the accuracy of CW alone.

 | Algorithm | Accuracy |
 |:-----------|------------:|
 | AROW | 0.8474830954169797 |
 | SCW2 |  0.8482344102178813 |
 | Ensemble(model) | 0.8494866015527173 |
 | CW |  0.850488354620586 |


 ---

 # [Case2] Prediction ensemble

 ## prediction
 ```sql
 -- Prediction ensemble: for each test row, count how many of the three
 -- classifiers predicted each label and keep one label per rowid.
 create or replace view news20mc_pred_ensemble_predict1
 as
 select
   rowid,
   m.col1 as label
 from (
   -- maxrow (Hivemall) is expected to return the (cnt, label) row with the
   -- largest vote count for each rowid.
   select
     rowid,
     maxrow(cnt, label) as m
   from (
     select
       rowid,
       label,
       count(1) as cnt
     from (
       -- Only rowid and label are used for voting; listing them explicitly keeps
       -- the union valid even if the predict views differ in their other columns.
       select rowid, label from news20mc_arow_predict1
       union all
       select rowid, label from news20mc_scw2_predict1
       union all
       select rowid, label from news20mc_cw_predict1
     ) votes
     group by rowid, label
   ) tally
   group by rowid
 ) winner;
 ```

 ## evaluation
 ```sql
 -- Pair the label stored in news20mc_test with the label chosen by the
 -- prediction ensemble, matched on rowid.
 create or replace view news20mc_pred_ensemble_submit1 as
 select
   truth.label as actual,
   pred.label as predicted
 from
   news20mc_test truth
   join news20mc_pred_ensemble_predict1 pred
     on (truth.rowid = pred.rowid);
 ```

 ```sql
 -- Accuracy: number of correct predictions divided by 3993
 -- (presumably the number of test rows -- confirm).
 select
   count(1) / 3993
 from
   news20mc_pred_ensemble_submit1
 where
   actual = predicted;
 ```

 > 0.8499874780866516

 Unfortunately, too many cooks spoil the broth in this case too :-( The prediction ensemble edges out the model ensemble, but CW alone is still the most accurate.

 | Algorithm | Accuracy |
 |:-----------|------------:|
 | AROW | 0.8474830954169797 |
 | SCW2 |  0.8482344102178813 |
 | Ensemble(model) | 0.8494866015527173 |
 | Ensemble(prediction) | 0.8499874780866516 |
 | CW |  0.850488354620586 |
	<!--
	Licensed to the Apache Software Foundation (ASF) under one
	or more contributor license agreements. See the NOTICE file
	distributed with this work for additional information
	regarding copyright ownership. The ASF licenses this file
	to you under the Apache License, Version 2.0 (the
	"License"); you may not use this file except in compliance
	with the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing,
	software distributed under the License is distributed on an
	"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	KIND, either express or implied. See the License for the
	specific language governing permissions and limitations
	under the License.
	-->

	This example explains how to run ensemble learning in Hivemall.
	Two heads are better than one? Let's verify it by ensemble learning.

	<!-- toc -->

	---

	## UDF preparation
	```sql
	-- Remove any previously added copy of the Hivemall jar, then add it again.
	-- NOTE: the /home/myui/tmp paths look environment-specific; substitute your own locations.
	delete jar /home/myui/tmp/hivemall.jar;
	add jar /home/myui/tmp/hivemall.jar;

	-- Run the define-all.hive script (presumably registers the Hivemall functions used below -- confirm).
	source /home/myui/tmp/define-all.hive;
	```

	# [Case1] Model ensemble/mixing

	## training
	```sql
	-- Enable parallel execution of query stages (8 threads) and fix the reducer
	-- count at 4 for the training query.
	SET hive.exec.parallel=true;
	SET hive.exec.parallel.thread.number=8;
	SET mapred.reduce.tasks=4;

	-- IF EXISTS keeps the script re-runnable when the table has not been created yet,
	-- regardless of the hive.exec.drop.ignorenonexistent setting.
	drop table if exists news20mc_ensemble_model1;

	-- Train CW, AROW and SCW on the same training table and merge the three
	-- resulting models into a single row per (label, feature).
	-- NOTE(review): argmin_kld is applied to feature while weight is merged with
	-- voted_avg -- confirm this is the intended combination.
	create table news20mc_ensemble_model1 as
	select
	  label,
	  -- cast(feature as int) as feature, -- hivemall v0.1
	  argmin_kld(feature, covar) as feature, -- hivemall v0.2 or later
	  voted_avg(weight) as weight
	from (
	  -- CW model
	  select
	    -- train_multiclass_cw(add_bias(features),label) as (label,feature,weight) -- hivemall v0.1
	    train_multiclass_cw(add_bias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later
	  from
	    news20mc_train_x3
	  union all
	  -- AROW model
	  select
	    -- train_multiclass_arow(add_bias(features),label) as (label,feature,weight) -- hivemall v0.1
	    train_multiclass_arow(add_bias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later
	  from
	    news20mc_train_x3
	  union all
	  -- SCW model
	  select
	    -- train_multiclass_scw(add_bias(features),label) as (label,feature,weight) -- hivemall v0.1
	    train_multiclass_scw(add_bias(features),label) as (label,feature,weight,covar) -- hivemall v0.2 or later
	  from
	    news20mc_train_x3
	) t
	group by label, feature;

	-- reset to the default
	SET hive.exec.parallel=false;
	SET mapred.reduce.tasks=-1;
	```

	## prediction
	```sql
	-- For each test row, score every label with the ensemble model and keep one
	-- (score, label) pair per rowid.
	create or replace view news20mc_ensemble_predict1 as
	select
	  rowid,
	  m.col0 as score,
	  m.col1 as label
	from (
	  -- maxrow (Hivemall) is expected to return the (score, label) row with the
	  -- largest score for each rowid.
	  select
	    rowid,
	    maxrow(score, label) as m
	  from (
	    -- Score of each (rowid, label): sum of model weight * feature value.
	    select
	      te.rowid,
	      mdl.label,
	      sum(mdl.weight * te.value) as score
	    from
	      news20mc_test_exploded te
	      left outer join news20mc_ensemble_model1 mdl
	        on (te.feature = mdl.feature)
	    group by
	      te.rowid,
	      mdl.label
	  ) scored
	  group by
	    rowid
	) best;
	```

	## evaluation
	```sql
	-- Pair the label stored in news20mc_test with the ensemble's predicted label,
	-- matched on rowid.
	create or replace view news20mc_ensemble_submit1 as
	select
	  truth.label as actual,
	  pred.label as predicted
	from
	  news20mc_test truth
	  join news20mc_ensemble_predict1 pred
	    on (truth.rowid = pred.rowid);
	```

	```sql
	-- Accuracy: number of correct predictions divided by 3993
	-- (presumably the number of test rows -- confirm).
	select
	  count(1) / 3993
	from
	  news20mc_ensemble_submit1
	where
	  actual = predicted;
	```

	> 0.8494866015527173

	## Cleaning

	```sql
	-- IF EXISTS makes the cleanup safe to re-run; the views are dropped before
	-- the model table they read from.
	drop view if exists news20mc_ensemble_submit1;
	drop view if exists news20mc_ensemble_predict1;
	drop table if exists news20mc_ensemble_model1;
	```
	---

	Unfortunately, too many cooks spoil the broth in this case :-( The model ensemble beats AROW and SCW2 but does not reach the accuracy of CW alone.

	| Algorithm | Accuracy |
	|:-----------|------------:|
	| AROW | 0.8474830954169797 |
	| SCW2 | 0.8482344102178813 |
	| Ensemble(model) | 0.8494866015527173 |
	| CW | 0.850488354620586 |


	---

	# [Case2] Prediction ensemble

	## prediction
	```sql
	-- Prediction ensemble: for each test row, count how many of the three
	-- classifiers predicted each label and keep one label per rowid.
	create or replace view news20mc_pred_ensemble_predict1
	as
	select
	  rowid,
	  m.col1 as label
	from (
	  -- maxrow (Hivemall) is expected to return the (cnt, label) row with the
	  -- largest vote count for each rowid.
	  select
	    rowid,
	    maxrow(cnt, label) as m
	  from (
	    select
	      rowid,
	      label,
	      count(1) as cnt
	    from (
	      -- Only rowid and label are used for voting; listing them explicitly keeps
	      -- the union valid even if the predict views differ in their other columns.
	      select rowid, label from news20mc_arow_predict1
	      union all
	      select rowid, label from news20mc_scw2_predict1
	      union all
	      select rowid, label from news20mc_cw_predict1
	    ) votes
	    group by rowid, label
	  ) tally
	  group by rowid
	) winner;
	```

	## evaluation
	```sql
	-- Pair the label stored in news20mc_test with the label chosen by the
	-- prediction ensemble, matched on rowid.
	create or replace view news20mc_pred_ensemble_submit1 as
	select
	  truth.label as actual,
	  pred.label as predicted
	from
	  news20mc_test truth
	  join news20mc_pred_ensemble_predict1 pred
	    on (truth.rowid = pred.rowid);
	```

	```sql
	-- Accuracy: number of correct predictions divided by 3993
	-- (presumably the number of test rows -- confirm).
	select
	  count(1) / 3993
	from
	  news20mc_pred_ensemble_submit1
	where
	  actual = predicted;
	```

	> 0.8499874780866516

	Unfortunately, too many cooks spoil the broth in this case too :-( The prediction ensemble edges out the model ensemble, but CW alone is still the most accurate.

	| Algorithm | Accuracy |
	|:-----------|------------:|
	| AROW | 0.8474830954169797 |
	| SCW2 | 0.8482344102178813 |
	| Ensemble(model) | 0.8494866015527173 |
	| Ensemble(prediction) | 0.8499874780866516 |
	| CW | 0.850488354620586 |