List related (similar) articles for each article.
-- Switch to the news20 database.
use news20;

-- Re-register the Hivemall jar for this session: remove the currently
-- registered copy (if any), then add it again from the same path.
delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;

-- Execute the define-all.hive script
-- (presumably registers the Hivemall functions such as minhash - confirm).
source /home/myui/tmp/define-all.hive;
-- Generate N sets of minhash values for each row (DEFAULT: 5).
-- Comments are kept on their own lines so they can never become part of the
-- variable value or hide the statements that follow.
set hivevar:hashes=100;

-- Use K minhash values for generating a resulting value (DEFAULT: 2).
set hivevar:keygroups=2;

-- Assign cluster ids to every row of news20mc_train using minhash.
-- Output columns: clusterId, rowid.
create table news20_clusterid_assign
as
select
  -- Variant relying on the default options:
  -- minhash(rowid, features) as (clusterId, rowid)
  minhash(rowid, features, "-n ${hashes} -k ${keygroups}") as (clusterId, rowid)
from
  news20mc_train
;

-- Optional: set hivevar:min_cluster_size=5 and enable the having clause below
-- to keep only clusters with more than that many members.

-- One row per cluster id with the set of rowids assigned to it.
create or replace view news20_cluster
as
select
  clusterId,
  collect_set(rowid) as rowids
from
  news20_clusterid_assign
group by
  clusterId
-- having size(rowids) > ${min_cluster_size}
;
-- For each article, collect the other articles that share at least 10 cluster
-- ids with it.
create table news20_similar_articles
as
with shared_clusters as (
  -- Count, for every ordered pair of distinct articles, how many cluster ids
  -- they have in common.
  select
    l.rowid,
    r.rowid as other_id,
    count(1) as cnt
  from
    news20_clusterid_assign l
    -- Inner join: the where clause below already discards rows without a
    -- right-hand match, so an outer join would add nothing.
    join news20_clusterid_assign r on (l.clusterid = r.clusterid)
  where
    l.rowid != r.rowid
  group by
    l.rowid, r.rowid
  having
    -- 10/${hashes}=10/100=0.1 (keep pairs whose pseudo Jaccard similarity by
    -- minhash is greater than or equal to 0.1). Adjust the constant when the
    -- hashes variable is changed.
    cnt >= 10
)
select
  rowid,
  collect_set(other_id) as related_articles
from
  shared_clusters
group by
  rowid
-- order by rowid asc
;
List all possible clusters without using a similarity threshold:
-- For each article, collect every other article that shares at least one
-- cluster id with it (no similarity threshold is applied).
create table news20_similar_articles2
as
select
  l.rowid,
  collect_set(r.rowid) as related_articles
from
  news20_clusterid_assign l
  -- Inner join: the where clause below already discards rows without a
  -- right-hand match, so an outer join would add nothing.
  join news20_clusterid_assign r on (l.clusterid = r.clusterid)
where
  l.rowid != r.rowid
group by
  l.rowid
-- order by rowid asc
;
-- Pairwise pseudo Jaccard similarity between articles: the number of cluster
-- ids two distinct articles share, divided by ${hashes}. Only pairs with a
-- similarity of at least 0.1 are stored, together with distance = 1 - similarity.
-- Requires the hashes variable to be set in the session.
create table news20_jaccard_similarity
as
with pair_similarity as (
  select
    a.rowid,
    b.rowid as other_id,
    count(1) / ${hashes} as similarity
  from
    news20_clusterid_assign a
    join news20_clusterid_assign b
      on (a.clusterid = b.clusterid)
  where
    a.rowid != b.rowid
  group by
    a.rowid,
    b.rowid
)
select
  rowid,
  other_id,
  similarity,
  1.0 - similarity as distance
from
  pair_similarity
where
  similarity >= 0.1
;