Function Signature of bbit_minhash
Text bbit_minhash(array<int|string> features)
Text bbit_minhash(array<int|string> features, int numHashes=128)
Text bbit_minhash(array<int|string> features, boolean discardWeight=false)
Text bbit_minhash(array<int|string> features, int numHashes=128, boolean discardWeight=false)
Create a signature for each article
-- Materialize one b-Bit minhash signature per training row.
-- The two-argument form passes discardWeight=false and relies on the
-- default numHashes=128 listed in the function signatures.
create table new20mc_with_signature as
select
  src.rowid,
  bbit_minhash(src.features, false) as signature
from news20mc_train src;
kNN brute-force search using b-Bit minhash
-- Number of most-similar rows to return.
set hivevar:topn=10;
-- Brute-force kNN: score every stored signature against the signature of
-- the single test row with rowid = 1, then keep the ${topn} highest scores.
select
  cand.rowid,
  -- NOTE(review): the literal 128 presumably has to equal the numHashes used
  -- to build both signatures (default 128) -- confirm against the UDF docs.
  jaccard_similarity(cand.signature, probe.signature, 128) as similarity
  -- Optional extra column (appears as "popcnt" in the sample output):
  -- , popcnt(cand.signature, probe.signature) as popcnt
from new20mc_with_signature cand
cross join (
  -- Signature of the query row, built with explicit numHashes=128 and discardWeight=false.
  select bbit_minhash(features, 128, false) as signature
  from news20mc_test
  where rowid = 1
) probe
order by similarity desc
limit ${topn};
rowid | similarity | popcnt |
---|---|---|
11952 | 0.390625 | 41 |
10748 | 0.359375 | 41 |
12902 | 0.34375 | 45 |
3087 | 0.328125 | 48 |
3 | 0.328125 | 37 |
11493 | 0.328125 | 38 |
3839 | 0.328125 | 41 |
12669 | 0.328125 | 37 |
13604 | 0.3125 | 41 |
6333 | 0.3125 | 39 |