blob: 8847306e951cc1760401f3bd0fdf79f7b8e2b8a8 [file] [log] [blame]
//
// Prefix-filtered Jaccard similarity join between the titles of DBLP and CSX
// papers. Emits one record {'idDBLP', 'idCSX', 'sim'} per pair of papers whose
// computed similarity is >= .5f, ordered by (idDBLP, idCSX).
//
// The query is written as nested stages:
//   Stage 1 - build a ranked list of title tokens from DBLP (inlined twice,
//             once for each side of the join).
//   Stage 2 - replace each paper's title tokens by their rank, take a prefix
//             of the ranked tokens, join the two sides on a shared prefix
//             token, and verify the candidates with similarity-jaccard-prefix.
//
// NOTE(review): the literal .5f appears in both prefix-len-jaccard calls, in
// the similarity-jaccard-prefix call and in the final filter; they presumably
// have to be changed together - confirm before editing any one of them.
//
use dataverse fuzzyjoin;
//
// Stage 2: rank the tokens of every DBLP title, join with CSX on a common
// prefix token, then verify the similarity of each candidate pair.
//
for $paperDBLP in dataset('DBLP')
// $tokensDBLP: for every title token that also occurs in the ranked list,
// its position $i in that list, sorted ascending by $i.
let $tokensDBLP :=
for $tokenUnranked in counthashed-word-tokens($paperDBLP.title)
for $tokenRanked at $i in
//
// Stage 1: all tokens of DBLP titles, grouped by token and ordered by the
// number of paper ids collected per token (ascending), ties broken by the
// token value itself. The position in this list is the token's rank.
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in counthashed-word-tokens($paper.title)
group by $tokenGroupped := $token with $id
order by count($id), $tokenGroupped
return $tokenGroupped
// Equality join of the paper's token against the ranked list: yields the rank.
where $tokenUnranked = $tokenRanked
order by $i
return $i
// Iterate over a leading slice of the ranked token list; the slice starts at
// offset 0 and its length comes from prefix-len-jaccard for threshold .5f.
// NOTE(review): exact semantics of subset-collection / prefix-len-jaccard are
// not visible here - confirm against the function documentation.
for $prefixTokenDBLP in subset-collection(
$tokensDBLP,
0,
prefix-len-jaccard(len($tokensDBLP), .5f))
for $paperCSX in dataset('CSX')
// $tokensCSX: same construction as $tokensDBLP, applied to the CSX title.
let $tokensCSX :=
for $tokenUnranked in counthashed-word-tokens($paperCSX.title)
for $tokenRanked at $i in
//
// Stage 1 (repeated): the ranking is again computed from dataset('DBLP'),
// not from CSX, so both sides of the join use the same token order. Because
// the rank lookup below is an equality join, CSX tokens that do not occur in
// any DBLP title produce no rank and are absent from $tokensCSX.
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in counthashed-word-tokens($paper.title)
group by $tokenGroupped := $token with $id
order by count($id), $tokenGroupped
return $tokenGroupped
where $tokenUnranked = $tokenRanked
order by $i
return $i
// Prefix of the CSX ranked tokens, built the same way as for DBLP.
for $prefixTokenCSX in subset-collection(
$tokensCSX,
0,
prefix-len-jaccard(len($tokensCSX), .5f))
// Candidate generation: keep only pairs that share at least one prefix token.
where $prefixTokenDBLP = $prefixTokenCSX
// Verification of the candidate pair. The lengths passed in are those of the
// full tokenized titles, while the token lists are the ranked lists built
// above; the shared prefix token and the threshold are passed as well.
// NOTE(review): the meaning of each argument is inferred from its position
// and name - confirm against the similarity-jaccard-prefix documentation.
let $sim := similarity-jaccard-prefix(
len(counthashed-word-tokens($paperDBLP.title)),
$tokensDBLP,
len(counthashed-word-tokens($paperCSX.title)),
$tokensCSX,
$prefixTokenDBLP,
.5f)
where $sim >= .5f
// Group by the id pair and the similarity value. A pair can reach this point
// once per shared prefix token, so grouping presumably collapses those
// duplicates - confirm.
// NOTE(review): $sim is bound both as a grouping key and in the `with`
// clause; the return clause indexes it with [0], i.e. treats it as the
// collected list. Verify which binding wins before renaming anything here.
group by $idDBLP := $paperDBLP.id, $idCSX := $paperCSX.id, $sim := $sim with $sim
order by $idDBLP, $idCSX
return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}