Hivemall generally uses model averaging (i.e., model ensemble) for creating a unified prediction model. In this tutorial, we show how to apply bagging (i.e., prediction ensemble) for making a prediction.

Training

-- set mapred.reduce.tasks=3; -- explicitly use 3 reducers

-- Train the classifier over news20b_train_x3 and keep the resulting models,
-- keyed by modelid, in a single table (one row per modelid/feature pair).
create table bagging_models as
with trained as (
  -- train_classifier emits the learned model as (feature, weight) rows
  select
    train_classifier(
      add_bias(features),
      label,
      '-loss logistic -opt AdamHD -reg l1 -iters 20'
    ) as (feature, weight)
  from news20b_train_x3
),
tagged as (
  -- tag every emitted row with taskid() so rows can be grouped per model
  -- NOTE(review): presumably taskid() identifies the task that produced the row; confirm
  select
    taskid() as modelid,
    tr.feature,
    tr.weight
  from trained tr
)
select
  tg.modelid,
  tg.feature,
  voted_avg(tg.weight) as weight -- or simply avg(weight)
from tagged tg
group by
  tg.modelid,
  tg.feature;

Prediction

-- Score every test row against each model in bagging_models, combine the
-- per-model scores with voted_avg, and derive a +1/-1 label from the sign.
create table bagging_predict
as
WITH weights as (
  -- one raw score per (test row, model): sum of weight * value over joined features
  select
    t.rowid,
    m.modelid,
    sum(m.weight * t.value) as total_weight
  from
    news20b_test_exploded t 
    -- outer join keeps test features that have no match in bagging_models;
    -- their product is NULL, which sum() ignores
    LEFT OUTER JOIN
    bagging_models m ON (t.feature = m.feature)
  group by
    t.rowid, m.modelid
),
voted as (
  -- collapse the per-model scores into a single score per test row
  select
    rowid,
    voted_avg(total_weight) as total_weight
  from 
    weights
  group by
    rowid 
)
-- voted already holds exactly one row per rowid, so no further aggregation
-- is needed: project the score and threshold it at 0.0
select
  rowid,
  total_weight,
  case when total_weight > 0.0 then 1 else -1 end as label
from
  voted;

Evaluation

-- Accuracy of bagging_predict: the fraction of joined test rows whose
-- predicted label equals the actual label in news20b_test.
with scored as (
  -- pair each actual label with its prediction by rowid
  select
    truth.label as actual,
    pred.label as predicted
  from news20b_test truth
  inner join bagging_predict pred
    on (truth.rowid = pred.rowid)
)
select
  sum(case when actual = predicted then 1 else 0 end) / count(1) as accuracy
from scored;

0.9641713370696557