This page explains how to run matrix factorization on MovieLens 1M dataset.

Calculate the mean rating in the training dataset

use movielens;

select avg(rating) from training;

3.593565

Set variables (hyperparameters) for training

-- mean rating
set hivevar:mu=3.593565;
-- number of factors
set hivevar:factor=10;
-- maximum number of training iterations
set hivevar:iters=50;

Note that there are no need to set an exact value for $mu. It actually works without setting $mu but recommended to set one for getting a better prediction.

Due to a bug in Hive, do not issue comments in CLI.

Training

create table sgd_model
as
select
  idx, 
  array_avg(u_rank) as Pu, 
  array_avg(m_rank) as Qi, 
  avg(u_bias) as Bu, 
  avg(m_bias) as Bi
from (
  select 
    train_mf_sgd(userid, movieid, rating, '-factor ${factor} -mu ${mu} -iter ${iters}') as (idx, u_rank, m_rank, u_bias, m_bias)
  from 
    training
) t
group by idx;

Note
Hivemall also provides train_mf_adagrad for training using AdaGrad. -help option shows a complete list of hyperparameters.

Predict

select
  t2.actual,
  mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
from (
  select
    t1.userid, 
    t1.movieid,
    t1.rating as actual,
    p1.Pu,
    p1.Bu
  from
    testing t1 LEFT OUTER JOIN sgd_model p1
    ON (t1.userid = p1.idx) 
) t2 
LEFT OUTER JOIN sgd_model p2
ON (t2.movieid = p2.idx);

Evaluate (computes MAE and RMSE)

select
  mae(predicted, actual) as mae,
  rmse(predicted, actual) as rmse
from (
  select
    t2.actual,
    mf_predict(t2.Pu, p2.Qi, t2.Bu, p2.Bi, ${mu}) as predicted
  from (
    select
      t1.userid, 
      t1.movieid,
      t1.rating as actual,
      p1.Pu,
      p1.Bu
    from
      testing t1 LEFT OUTER JOIN sgd_model p1
      ON (t1.userid = p1.idx) 
  ) t2 
  LEFT OUTER JOIN sgd_model p2
  ON (t2.movieid = p2.idx)
) t;

MAE	RMSE
0.6728969407733578	0.8584162122694449

Item Recommendation

Recommend top-k movies that a user have not ever seen.

set hivevar:userid=1;
set hivevar:topk=5;

select
  t1.movieid, 
  mf_predict(t2.Pu, t1.Qi, t2.Bu, t1.Bi, ${mu}) as predicted
from (
  select
    idx movieid,
    Qi, 
    Bi
  from
    sgd_model p
  where
    p.idx NOT IN 
      (select movieid from training where userid=${userid})
) t1 CROSS JOIN (
  select
    Pu,
    Bu
  from 
    sgd_model
  where
    idx = ${userid}
) t2
order by
  predicted DESC
limit ${topk};

movieid	predicted
318	4.8051853
2503	4.788541
53	4.7518783
904	4.7463417
953	4.732769