Hivemall generally uses model averaging (i.e., model ensemble) for creating a unified prediction model. In this tutorial, we show how to apply bagging (i.e., prediction ensemble) for making a prediction.
-- set mapred.reduce.tasks=3; -- explicitly use 3 reducers

-- Build the bagging model table: one aggregated weight per (modelid, feature).
create table bagging_models as
with train as (
  -- train_classifier emits (feature, weight) rows; add_bias() is applied to
  -- the feature vector before it is passed to the trainer.
  select
    train_classifier(
      add_bias(features),
      label,
      '-loss logistic -opt AdamHD -reg l1 -iters 20'
    ) as (feature, weight)
  from
    news20b_train_x3
),
models as (
  -- Tag every emitted weight with taskid() so that rows can be grouped per
  -- model. NOTE(review): presumably each task trains its own model, which is
  -- why the reducer count above matters -- confirm against Hivemall docs.
  select
    taskid() as modelid,
    feature,
    weight
  from
    train
)
select
  modelid,
  feature,
  voted_avg(weight) as weight -- or simply avg(weight)
from
  models
group by
  modelid,
  feature;
-- Score every test row with each bagged model, then combine the per-model
-- scores into a single label per row.
create table bagging_predict as
with weights as (
  -- Per (rowid, modelid): sum of model weight * feature value.
  -- NOTE(review): because this is a left join, test features that are absent
  -- from bagging_models yield a NULL modelid / NULL product -- confirm that
  -- the downstream voted_avg() handles the resulting NULL total as intended.
  select
    t.rowid,
    m.modelid,
    sum(m.weight * t.value) as total_weight
  from
    news20b_test_exploded t
    left outer join bagging_models m
      on (t.feature = m.feature)
  group by
    t.rowid,
    m.modelid
),
voted as (
  -- Combine the per-model totals into one value per test row.
  select
    rowid,
    voted_avg(total_weight) as total_weight
  from
    weights
  group by
    rowid
)
select
  rowid,
  max(total_weight) as total_weight, -- max is dummy
  -- A positive combined score maps to label 1, anything else to -1.
  case when sum(total_weight) > 0.0 then 1 else -1 end as label
from
  voted
group by
  rowid;
-- Evaluate the bagged predictions: fraction of test rows whose predicted
-- label equals the actual label.
with submit as (
  -- Pair each test row's actual label with its predicted label.
  select
    t.label as actual,
    p.label as predicted
  from
    news20b_test t
    inner join bagging_predict p
      on (t.rowid = p.rowid)
)
select
  sum(case when actual = predicted then 1 else 0 end) / count(1) as accuracy
from
  submit;
-- accuracy: 0.9641713370696557