Caution: Factorization Machines are supported in Hivemall v0.4 or later.
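
You can check which version is installed with the hivemall_version function:

select hivemall_version();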

Data preparation

First of all, please create the ratings table described in the MovieLens data preparation article.
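
For reference, the ratings table is expected to look roughly like the following (a sketch based on the MovieLens data preparation steps; the delimiter and location are assumptions that may differ in your setup):

CREATE EXTERNAL TABLE ratings (
  userid INT,
  movieid INT,
  rating INT,
  tstamp STRING
) ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '#'
STORED AS TEXTFILE
LOCATION '/dataset/movielens/ratings';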

use movielens;

SET hivevar:seed=31;

DROP TABLE ratings_fm;
CREATE TABLE ratings_fm
as
select
	rowid() as rowid,
	categorical_features(array("userid","movieid"), userid, movieid) 
	  as features,
	rating,
	rand(${seed}) as rnd
from
	ratings
CLUSTER BY rand(43); -- shuffle training input

select * from ratings_fm limit 2;
rowid      features                          rating  rnd
1-383970   ["userid#2244","movieid#1272"]    5       0.33947035987020546
1-557913   ["userid#3425","movieid#2791"]    4       0.12344886396954391
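
Here, categorical_features encodes each (name, value) pair into a single string feature of the form name#value. For example, the following minimal query reproduces the feature vector of the first row above:

select categorical_features(array("userid","movieid"), 2244, 1272);
-- ["userid#2244","movieid#1272"]
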
-- use 80% for training
DROP TABLE training_fm;
CREATE TABLE training_fm
as
select * from ratings_fm
order by rnd DESC
limit 800000;

-- use 20% for testing
DROP TABLE testing_fm;
CREATE TABLE testing_fm
as
select * from ratings_fm
order by rnd ASC
limit 200209;

-- testing table for prediction
CREATE OR REPLACE VIEW testing_fm_exploded
as 
select 
  rowid,
  extract_feature(fv) as feature,
  extract_weight(fv) as Xi,
  rating
from
  testing_fm t1 LATERAL VIEW explode(add_bias(features)) t2 as fv;

Caution: Don't forget to call add_bias in the above query. There is no need to call add_bias when preparing the training data because train_fm always considers a bias term internally.
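
For illustration, add_bias simply appends a bias feature to each feature vector (a minimal sketch; in Hivemall the bias is represented as feature 0 with weight 1.0):

select add_bias(array("userid#2244","movieid#1272"));
-- ["userid#2244","movieid#1272","0:1.0"]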

Training

Hyperparameters for Training

-- number of factors
set hivevar:factor=10;
-- maximum number of training iterations
set hivevar:iters=50;

Build a prediction model by Factorization Machine
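
For reference, a second-order Factorization Machine predicts the rating as

$$\hat{y}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n} \sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle \, x_i x_j$$

where the linear weights w_i and the factor vectors v_i correspond to the Wi and Vif columns of the model table built below, and the global bias w_0 is the reason add_bias is required on the test data.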

drop table fm_model;
create table fm_model
as
select
  feature,
  avg(Wi) as Wi,
  array_avg(Vif) as Vif
from (
  select 
    train_fm(features, rating, "-factor ${factor} -iters ${iters} -eta 0.01") 
    	as (feature, Wi, Vif)
  from 
    training_fm
) t
group by feature;

Note: The -eta option is optional; however, -eta 0.01 usually works well. In the query above, each task trains a local FM model in parallel, and the outer group by merges them into a single model by averaging Wi and Vif for each feature (model averaging).

Usage of train_fm

You can show the usage of train_fm by giving the -help option as follows:

select 
  train_fm(features, rating, "-help") as (feature, Wi, Vif)
from 
  training_fm;
usage: train_fm(array<string> x, double y [, const string options]) -
       Returns a prediction value [-adareg] [-c] [-cv_rate <arg>]
       [-disable_cv] [-eta <arg>] [-eta0 <arg>] [-f <arg>] [-help]
       [-init_v <arg>] [-int_feature] [-iters <arg>] [-lambda <arg>] [-max
       <arg>] [-maxval <arg>] [-min <arg>] [-min_init_stddev <arg>] [-p
       <arg>] [-power_t <arg>] [-seed <arg>] [-sigma <arg>] [-t <arg>]
       [-va_ratio <arg>] [-va_threshold <arg>]
 -adareg,--adaptive_regularizaion             Whether to enable adaptive
                                              regularization [default:
                                              OFF]
 -c,--classification                          Act as classification
 -cv_rate,--convergence_rate <arg>            Threshold to determine
                                              convergence [default: 0.005]
 -disable_cv,--disable_cvtest                 Whether to disable
                                              convergence check [default:
                                              OFF]
 -eta <arg>                                   The initial learning rate
 -eta0 <arg>                                  The initial learning rate
                                              [default 0.1]
 -f,--factor <arg>                            The number of the latent
                                              variables [default: 10]
 -help                                        Show function help
 -init_v <arg>                                Initialization strategy of
                                              matrix V [random, gaussian]
                                              (default: random)
 -int_feature,--feature_as_integer            Parse a feature as integer
                                              [default: OFF, ON if -p
                                              option is specified]
 -iters,--iterations <arg>                    The number of iterations
                                              [default: 1]
 -lambda,--lambda0 <arg>                      The initial lambda value for
                                              regularization [default:
                                              0.01]
 -max,--max_target <arg>                      The maximum value of target
                                              variable
 -maxval,--max_init_value <arg>               The maximum initial value in
                                              the matrix V [default: 1.0]
 -min,--min_target <arg>                      The minimum value of target
                                              variable
 -min_init_stddev <arg>                       The minimum standard
                                              deviation of initial matrix
                                              V [default: 0.1]
 -p,--size_x <arg>                            The size of x
 -power_t <arg>                               The exponent for inverse
                                              scaling learning rate
                                              [default 0.1]
 -seed <arg>                                  Seed value [default: -1
                                              (random)]
 -sigma <arg>                                 The standard deviation for
                                              initializing V [default:
                                              0.1]
 -t,--total_steps <arg>                       The total number of training
                                              examples
 -va_ratio,--validation_ratio <arg>           Ratio of training data used
                                              for validation [default:
                                              0.05f]
 -va_threshold,--validation_threshold <arg>   Threshold to start
                                              validation. At least N
                                              training examples are used
                                              before validation [default:
                                              1000]

Prediction

-- workaround for a bug 
-- https://issues.apache.org/jira/browse/HIVE-11051
set hive.mapjoin.optimized.hashtable=false;

drop table fm_predict;
create table fm_predict
as
select
  t1.rowid,
  -- fm_predict is an aggregate function: it sums the model's linear and
  -- pairwise interaction terms over each record's features
  fm_predict(p1.Wi, p1.Vif, t1.Xi) as predicted
from 
  testing_fm_exploded t1
  LEFT OUTER JOIN fm_model p1 ON (t1.feature = p1.feature)
group by
  t1.rowid;

Evaluation

select
  mae(p.predicted, rating) as mae,
  rmse(p.predicted, rating) as rmse
from
  testing_fm as t
  JOIN fm_predict as p on (t.rowid = p.rowid);

0.6736798239047873 (mae) 0.858938110314545 (rmse)
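
For reference, over the n test ratings with actual rating y_i and predicted rating ŷ_i, the metrics are defined as

$$\mathrm{MAE} = \frac{1}{n} \sum_{i=1}^{n} \lvert \hat{y}_i - y_i \rvert, \qquad \mathrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( \hat{y}_i - y_i \right)^2}$$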

Fast Factorization Machines Training using Int Features

Training of Factorization Machines (FM) can be done more efficiently, in terms of speed, by using INT features. In this section, we show how to run FM training on int features, more specifically, by using feature hashing.
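
For illustration, the following query (a minimal sketch) shows what feature_hashing does to this tutorial's features: each feature string is hashed to an integer index, in [1, 16777216] (2^24) by default, which is why the commented-out dense-array variant below passes -num_features 16777216.

-- inspect the hashed representation of a few rows
select
  features,
  feature_hashing(features) as hashed
from
  ratings_fm
limit 2;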

set hivevar:factor=10;
set hivevar:iters=50;

drop table fm_model;
create table fm_model
as
select
  feature,
  avg(Wi) as Wi,
  array_avg(Vif) as Vif
from (
  select 
    train_fm(feature_hashing(features), rating, "-factor ${factor} -iters ${iters} -eta 0.01 -int_feature")  -- internally use a sparse map
 -- train_fm(feature_hashing(features), rating, "-factor ${factor} -iters ${iters} -eta 0.01 -int_feature -num_features 16777216") -- internally use a dense array 
        as (feature, Wi, Vif)
  from 
    training_fm
) t
group by feature;

set hive.mapjoin.optimized.hashtable=false; -- workaround for https://issues.apache.org/jira/browse/HIVE-11051

WITH predicted as (
  select
    t1.rowid,
    fm_predict(p1.Wi, p1.Vif, t1.Xi) as predicted
  from 
    testing_fm_exploded t1
    LEFT OUTER JOIN fm_model p1 ON (feature_hashing(t1.feature) = p1.feature)
  group by
    t1.rowid
)
select
  mae(p.predicted, rating) as mae,
  rmse(p.predicted, rating) as rmse
from
  testing_fm as t
  JOIN predicted as p on (t.rowid = p.rowid);