https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf
#[PA1a]
##Training
set mapred.reduce.tasks=64; drop table e2006tfidf_pa1a_model ; create table e2006tfidf_pa1a_model as select feature, avg(weight) as weight from (select train_pa1a_regr(add_bias(features),target) as (feature,weight) from e2006tfidf_train_x3 ) t group by feature; set mapred.reduce.tasks=-1;
Caution: Do not use voted_avg() for regression. voted_avg() is for classification.
create or replace view e2006tfidf_pa1a_predict as select t.rowid, sum(m.weight * t.value) as predicted from e2006tfidf_test_exploded t LEFT OUTER JOIN e2006tfidf_pa1a_model m ON (t.feature = m.feature) group by t.rowid;
drop table e2006tfidf_pa1a_submit; create table e2006tfidf_pa1a_submit as select t.target as actual, p.predicted as predicted from e2006tfidf_test t JOIN e2006tfidf_pa1a_predict p on (t.rowid = p.rowid); select avg(actual), avg(predicted) from e2006tfidf_pa1a_submit;
-3.8200363760415414 -3.8869923258589476
set hivevar:mean_actual=-3.8200363760415414; select sqrt(sum(pow(predicted - actual,2.0))/count(1)) as RMSE, sum(pow(predicted - actual,2.0))/count(1) as MSE, sum(abs(predicted - actual))/count(1) as MAE, 1 - sum(pow(actual - predicted,2.0)) / sum(pow(actual - ${mean_actual},2.0)) as R2 from e2006tfidf_pa1a_submit;
0.3797959864675519 0.14424499133686086 0.23846059576113587 0.5010367946980386
#[PA2a]
##Training
set mapred.reduce.tasks=64; drop table e2006tfidf_pa2a_model; create table e2006tfidf_pa2a_model as select feature, avg(weight) as weight from (select train_pa2a_regr(add_bias(features),target) as (feature,weight) from e2006tfidf_train_x3 ) t group by feature; set mapred.reduce.tasks=-1;
create or replace view e2006tfidf_pa2a_predict as select t.rowid, sum(m.weight * t.value) as predicted from e2006tfidf_test_exploded t LEFT OUTER JOIN e2006tfidf_pa2a_model m ON (t.feature = m.feature) group by t.rowid;
drop table e2006tfidf_pa2a_submit; create table e2006tfidf_pa2a_submit as select t.target as actual, pd.predicted as predicted from e2006tfidf_test t JOIN e2006tfidf_pa2a_predict pd on (t.rowid = pd.rowid); select avg(actual), avg(predicted) from e2006tfidf_pa2a_submit;
-3.8200363760415414 -3.9124877451612488
set hivevar:mean_actual=-3.8200363760415414; select sqrt(sum(pow(predicted - actual,2.0))/count(1)) as RMSE, sum(pow(predicted - actual,2.0))/count(1) as MSE, sum(abs(predicted - actual))/count(1) as MAE, 1 - sum(pow(actual - predicted,2.0)) / sum(pow(actual - ${mean_actual},2.0)) as R2 from e2006tfidf_pa2a_submit;
0.38538660838804495 0.14852283792484033 0.2466732002711477 0.48623913673053565
#[AROW]
##Training
set mapred.reduce.tasks=64; drop table e2006tfidf_arow_model ; create table e2006tfidf_arow_model as select feature, -- avg(weight) as weight -- [hivemall v0.1] argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] from (select -- train_arow_regr(add_bias(features),target) as (feature,weight) -- [hivemall v0.1] train_arow_regr(add_bias(features),target) as (feature,weight,covar) -- [hivemall v0.2 or later] from e2006tfidf_train_x3 ) t group by feature; set mapred.reduce.tasks=-1;
create or replace view e2006tfidf_arow_predict as select t.rowid, sum(m.weight * t.value) as predicted from e2006tfidf_test_exploded t LEFT OUTER JOIN e2006tfidf_arow_model m ON (t.feature = m.feature) group by t.rowid;
drop table e2006tfidf_arow_submit; create table e2006tfidf_arow_submit as select t.target as actual, p.predicted as predicted from e2006tfidf_test t JOIN e2006tfidf_arow_predict p on (t.rowid = p.rowid); select avg(actual), avg(predicted) from e2006tfidf_arow_submit;
-3.8200363760415414 -3.8692518911517433
set hivevar:mean_actual=-3.8200363760415414; select sqrt(sum(pow(predicted - actual,2.0))/count(1)) as RMSE, sum(pow(predicted - actual,2.0))/count(1) as MSE, sum(abs(predicted - actual))/count(1) as MAE, 1 - sum(pow(actual - predicted,2.0)) / sum(pow(actual - ${mean_actual},2.0)) as R2 from e2006tfidf_arow_submit;
0.37862513029019407 0.14335698928726642 0.2368787001269389 0.5041085155590119
#[AROWe] AROWe is a modified version of AROW that uses Hinge loss (epsilion = 0.1)
##Training
set mapred.reduce.tasks=64; drop table e2006tfidf_arowe_model ; create table e2006tfidf_arowe_model as select feature, -- avg(weight) as weight -- [hivemall v0.1] argmin_kld(weight, covar) as weight -- [hivemall v0.2 or later] from (select -- train_arowe_regr(add_bias(features),target) as (feature,weight) -- [hivemall v0.1] train_arowe_regr(add_bias(features),target) as (feature,weight,covar) -- [hivemall v0.2 or later] from e2006tfidf_train_x3 ) t group by feature; set mapred.reduce.tasks=-1;
create or replace view e2006tfidf_arowe_predict as select t.rowid, sum(m.weight * t.value) as predicted from e2006tfidf_test_exploded t LEFT OUTER JOIN e2006tfidf_arowe_model m ON (t.feature = m.feature) group by t.rowid;
drop table e2006tfidf_arowe_submit; create table e2006tfidf_arowe_submit as select t.target as actual, p.predicted as predicted from e2006tfidf_test t JOIN e2006tfidf_arowe_predict p on (t.rowid = p.rowid); select avg(actual), avg(predicted) from e2006tfidf_arowe_submit;
-3.8200363760415414 -3.86494905688414
set hivevar:mean_actual=-3.8200363760415414; select sqrt(sum(pow(predicted - actual,2.0))/count(1)) as RMSE, sum(pow(predicted - actual,2.0))/count(1) as MSE, sum(abs(predicted - actual))/count(1) as MAE, 1 - sum(pow(actual - predicted,2.0)) / sum(pow(actual - ${mean_actual},2.0)) as R2 from e2006tfidf_arowe_submit;
0.37789148212861856 0.14280197226536404 0.2357339155291536 0.5060283955470721