use dataverse fuzzyjoin;
// NOTE(review): some AsterixDB releases require
// `set import-private-functions 'true';` before the tokenizer/similarity
// functions used below can be called - confirm against the target version.

//
// Fuzzy join between the titles of dataset('DBLP') and dataset('CSX').
// Emits one record {'idDBLP', 'idCSX', 'sim'} per pair of papers whose
// similarity-jaccard-prefix() score is >= .5f, ordered by (idDBLP, idCSX).
//

//
// -- - Stage 2 - --
//
// For every DBLP paper, replace each title token with its position in the
// token ordering produced by Stage 1, sorted by that position.
for $paperDBLP in dataset('DBLP')
let $tokensDBLP :=
    for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        // All title tokens of DBLP, ordered by the number of grouped paper
        // ids (ascending) and then by the token itself.
        for $paper in dataset('DBLP')
        let $id := $paper.id
        for $token in counthashed-word-tokens($paper.title)
        group by $tokenGroupped := $token with $id
        order by count($id), $tokenGroupped
        return $tokenGroupped
    where $tokenUnranked = $tokenRanked
    order by $i
    return $i
// Only the leading tokens of the ranked list (length given by
// prefix-len-jaccard for threshold .5f) are used as join candidates.
for $prefixTokenDBLP in subset-collection(
                            $tokensDBLP,
                            0,
                            prefix-len-jaccard(len($tokensDBLP), .5f))

// Same ranking for every CSX paper. The Stage 1 subquery below reads
// dataset('DBLP') again, so both sides are ranked against the same token
// ordering; CSX title tokens with no match in that ordering are not part of
// $tokensCSX.
for $paperCSX in dataset('CSX')
let $tokensCSX :=
    for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
    for $tokenRanked at $i in
        //
        // -- - Stage 1 - --
        //
        for $paper in dataset('DBLP')
        let $id := $paper.id
        for $token in counthashed-word-tokens($paper.title)
        group by $tokenGroupped := $token with $id
        order by count($id), $tokenGroupped
        return $tokenGroupped
    where $tokenUnranked = $tokenRanked
    order by $i
    return $i
for $prefixTokenCSX in subset-collection(
                           $tokensCSX,
                           0,
                           prefix-len-jaccard(len($tokensCSX), .5f))

// Candidate pairs: papers sharing at least one prefix token.
where $prefixTokenDBLP = $prefixTokenCSX
// The lengths passed here are those of the unranked token lists of each
// title, not of $tokensDBLP / $tokensCSX.
let $sim := similarity-jaccard-prefix(
                len(counthashed-word-tokens($paperDBLP.title)),
                $tokensDBLP,
                len(counthashed-word-tokens($paperCSX.title)),
                $tokensCSX,
                $prefixTokenDBLP,
                .5f)
where $sim >= .5f
// A pair can be produced once per shared prefix token; grouping collapses
// those duplicates into a single output record.
// NOTE(review): $sim is used both as a grouping key and as the `with`
// variable, and is then indexed as a list ($sim[0]) - confirm this binding
// is the intended one.
group by $idDBLP := $paperDBLP.id, $idCSX := $paperCSX.id, $sim := $sim with $sim
order by $idDBLP, $idCSX
return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}