Function signatures of bbit_minhash

Text bbit_minhash(array<int|string> features)
Text bbit_minhash(array<int|string> features, int numHashes=128)
Text bbit_minhash(array<int|string> features, boolean discardWeight=false)
Text bbit_minhash(array<int|string> features, int numHashes=128, boolean discardWeight=false)

Create a signature for each article

-- Materialize one b-Bit minhash signature per training article.
-- numHashes is passed explicitly (128, the documented default) so that it
-- visibly matches the value the search query passes to bbit_minhash and
-- jaccard_similarity when these stored signatures are compared.
create table new20mc_with_signature
as
select
  rowid,
  bbit_minhash(features, 128, false) as signature
from
  news20mc_train;

kNN brute-force search using b-Bit minhash

-- Number of nearest neighbours to report
set hivevar:topn=10;

-- Brute-force kNN: compare every stored signature with the signature of the
-- test article whose rowid is 1, then keep the best-scoring rows.
select
  cand.rowid,
  jaccard_similarity(cand.signature, probe.signature, 128) as similarity
-- , popcnt(cand.signature, probe.signature) as popcnt
from
  new20mc_with_signature cand
  cross join (
    select
      bbit_minhash(features, 128, false) as signature
    from
      news20mc_test
    where
      rowid = 1
  ) probe
order by
  similarity desc
limit ${topn};
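| rowid | similarity | popcnt |
|------:|-----------:|-------:|
| 11952 | 0.390625 | 41 |
| 10748 | 0.359375 | 41 |
| 12902 | 0.34375 | 45 |
| 3087 | 0.328125 | 48 |
| 3 | 0.328125 | 37 |
| 11493 | 0.328125 | 38 |
| 3839 | 0.328125 | 41 |
| 12669 | 0.328125 | 37 |
| 13604 | 0.3125 | 41 |
| 6333 | 0.3125 | 39 |

(The popcnt column appears only when the commented-out popcnt line in the query is enabled.)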