blob: 9f2fd076f35d587c1be711df2b24251c15a4e7a1 [file] [log] [blame]
// Start from a clean slate: drop any existing `fuzzyjoin` dataverse, recreate it,
// and make it the current dataverse for the statements that follow.
drop dataverse fuzzyjoin if exists;
create dataverse fuzzyjoin;
use dataverse fuzzyjoin;
// Record type used by the Users dataset. Declared `open`, so the four fields
// below are the declared ones rather than an exhaustive list.
create type UserType as open {
    uid:             int32,       // partitioning key of the Users dataset
    name:            string,
    lottery_numbers: [int32],     // ordered list; the field the fuzzy join runs on
    interests:       {{string}}   // unordered list (bag) of strings
}
// Record type used by the Visitors dataset. Same declared fields as UserType
// except that the identifier is `vid`. Declared `open`.
create type VisitorType as open {
    vid:             int32,       // partitioning key of the Visitors dataset
    name:            string,
    lottery_numbers: [int32],     // ordered list; the field the fuzzy join runs on
    interests:       {{string}}   // unordered list (bag) of strings
}
// One dataset per record type, each partitioned on its integer id field.
create dataset Users(UserType)
    partitioned by key uid;
create dataset Visitors(VisitorType)
    partitioned by key vid;
// Bulk-load both datasets from ADM-formatted files addressed through nc1.
load dataset Users
    using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
    (
        ("path"="nc1://data/users-visitors-small/users.json"),
        ("format"="adm")
    );

load dataset Visitors
    using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
    (
        ("path"="nc1://data/users-visitors-small/visitors.json"),
        ("format"="adm")
    );
// Direct the result of the query that follows to this file (addressed via nc1).
write output to nc1:'rttest/fuzzyjoin_user-vis-lot-3_1.adm';
//
// Three-stage prefix-filtering fuzzy join of Users and Visitors on
// `lottery_numbers`, keeping pairs whose Jaccard similarity is >= 0.5.
//
//   Stage 1: rank every lottery number seen in Users by ascending frequency.
//   Stage 2: translate each record's numbers into ranks, join the two sides
//            on a shared rank within their prefixes, verify the similarity,
//            and emit (uid, vid, sim) triples.
//   Stage 3: join the triples back to the full Users / Visitors records.
//
// Result: {'user', 'visitor', 'sim'} records ordered by uid, then vid.
//

//
// -- - Stage 3 - --
//
for $ridpair in
    //
    // -- - Stage 2 - --
    //
    for $user in dataset('Users')
    let $lenUser := len($user.lottery_numbers)
    // Ranks (positions in the Stage 1 ordering) of this user's numbers,
    // sorted ascending.
    let $tokensUser :=
        for $token in $user.lottery_numbers
        for $tokenRanked at $i in
            //
            // -- - Stage 1 - --
            //
            // Global token order: lottery numbers of Users, least frequent
            // first. The secondary key $tokenGroupped breaks frequency ties.
            // Without it, equally frequent tokens have no defined order, and
            // this ranking and the identical one on the visitor side could
            // assign different ranks to the same token, although the ranks
            // are compared across sides below.
            for $user in dataset('Users')
            for $token in $user.lottery_numbers
            group by $tokenGroupped := $token with $user
            order by count($user), $tokenGroupped
            return $tokenGroupped
        where $token = $tokenRanked
        order by $i
        return $i
    // Only the leading ranks (prefix length derived from the list length and
    // the 0.5 threshold) are used to generate join candidates.
    for $prefixTokenUser in subset-collection(
                                $tokensUser,
                                0,
                                prefix-len-jaccard($lenUser, .5f))

    for $visitor in dataset('Visitors')
    let $lenVisitor := len($visitor.lottery_numbers)
    // Same translation for the visitor. The ranking is still built from Users
    // only, so visitor numbers that never occur in Users produce no rank.
    let $tokensVisitor :=
        for $token in $visitor.lottery_numbers
        for $tokenRanked at $i in
            //
            // -- - Stage 1 - --
            //
            // Must stay identical to the ranking used on the user side
            // (including the tie-breaker) so that ranks are comparable.
            for $user in dataset('Users')
            for $token in $user.lottery_numbers
            group by $tokenGroupped := $token with $user
            order by count($user), $tokenGroupped
            return $tokenGroupped
        where $token = $tokenRanked
        order by $i
        return $i
    for $prefixTokenVisitor in subset-collection(
                                   $tokensVisitor,
                                   0,
                                   prefix-len-jaccard($lenVisitor, .5f))

    // Candidate pairs: the two prefixes share a rank.
    where $prefixTokenUser = $prefixTokenVisitor

    // Verify the candidate against the 0.5 threshold.
    let $sim := similarity-jaccard-prefix(
                    $lenUser,
                    $tokensUser,
                    $lenVisitor,
                    $tokensVisitor,
                    $prefixTokenUser,
                    .5f)
    where $sim >= .5f
    // A pair can be produced once per shared prefix rank; collapse those
    // duplicates and keep the first collected similarity.
    group by $uid := $user.uid, $vid := $visitor.vid with $sim
    return {'uid': $uid, 'vid': $vid, 'sim': $sim[0]}

// Re-attach the full records to each (uid, vid) pair.
for $user in dataset('Users')
for $visitor in dataset('Visitors')
where $ridpair.uid = $user.uid and $ridpair.vid = $visitor.vid
order by $user.uid, $visitor.vid
return {'user': $user, 'visitor': $visitor, 'sim': $ridpair.sim}