// blob: de3c8e8e1c632f008aa2546e0ece44982ff81560 [file] [log] [blame]
// Fuzzy join of DBLP and CSX papers on their titles.
// Emits one {'idDBLP', 'idCSX', 'sim'} record per group of matching pairs
// whose computed similarity is at least .5f, ordered by idDBLP, then idCSX.
//
// The block comments that begin with a plus sign are compiler hints in
// AsterixDB's hint-comment syntax, not documentation: keep them unchanged
// and directly in front of the clause or operand they annotate.
use dataverse fuzzyjoin;
//
// -- - Stage 2 - --
// Rank each record's title tokens against the Stage 1 token ordering, then
// pair up DBLP and CSX records that share a rank inside their prefixes.
//
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $tokensUnrankedDBLP := counthashed-word-tokens($paperDBLP.title)
// The length is taken over the raw (unranked) token list, not $tokensDBLP.
let $lenDBLP := len($tokensUnrankedDBLP)
// $tokensDBLP is the list of positions ($i) at which this title's tokens
// occur in the Stage 1 token ordering, ordered by position.
let $tokensDBLP :=
for $tokenUnranked in $tokensUnrankedDBLP
for $tokenRanked at $i in
//
// -- - Stage 1 - --
// Global token ordering: every token of every DBLP title, grouped by token
// and ordered by count($id) (the ids collected for that token) and then by
// the token itself.
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in counthashed-word-tokens($paper.title)
/*+ hash */
group by $tokenGroupped := $token with $id
/*+ inmem 1 302 */
order by count($id), $tokenGroupped
return $tokenGroupped
// Keep the position of the ranked token that equals this record's token.
where $tokenUnranked = /*+ bcast */ $tokenRanked
order by $i
return $i
// Only a leading slice of the ranked tokens is used to find join candidates:
// it starts at offset 0 and its size comes from prefix-len-jaccard for the
// .5f threshold.
// NOTE(review): the (collection, start, length) reading of subset-collection
// is assumed from usage here - confirm against the function documentation.
for $prefixTokenDBLP in subset-collection(
$tokensDBLP,
0,
prefix-len-jaccard(len($tokensDBLP), .5f))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $tokensUnrankedCSX := counthashed-word-tokens($paperCSX.title)
// As on the DBLP side, the length is taken over the raw token list.
let $lenCSX := len($tokensUnrankedCSX)
// Same ranking as for DBLP. The Stage 1 subquery below reads dataset('DBLP')
// again, so CSX tokens are ranked against the DBLP token ordering; a CSX
// token with no equal DBLP token gets no rank because the where clause
// filters it out.
let $tokensCSX :=
for $tokenUnranked in $tokensUnrankedCSX
for $tokenRanked at $i in
//
// -- - Stage 1 - --
// Same global DBLP token ordering as in the DBLP branch above.
//
for $paper in dataset('DBLP')
let $id := $paper.id
for $token in counthashed-word-tokens($paper.title)
/*+ hash */
group by $tokenGroupped := $token with $id
/*+ inmem 1 302 */
order by count($id), $tokenGroupped
return $tokenGroupped
// Keep the position of the ranked token that equals this record's token.
where $tokenUnranked = /*+ bcast */ $tokenRanked
order by $i
return $i
// Prefix slice of the CSX ranks, built the same way as the DBLP prefix.
for $prefixTokenCSX in subset-collection(
$tokensCSX,
0,
prefix-len-jaccard(len($tokensCSX), .5f))
// Join condition: the two records share a rank inside their prefixes.
where $prefixTokenDBLP = $prefixTokenCSX
// NOTE(review): similarity-jaccard-prefix is an opaque builtin from this
// file's point of view; presumably it computes the Jaccard similarity of the
// two rank lists from their raw lengths, the matching prefix token and the
// .5f threshold - confirm against the function documentation.
let $sim := similarity-jaccard-prefix(
$lenDBLP,
$tokensDBLP,
$lenCSX,
$tokensCSX,
$prefixTokenDBLP,
.5f)
// Keep only pairs at or above the threshold.
where $sim >= .5f
// A pair is produced once per matching prefix token, so results are grouped
// per (idDBLP, idCSX, sim).
// NOTE(review): $sim is both a grouping key and listed in the with clause,
// and the return indexes $sim[0], i.e. treats it as the collected list -
// confirm which binding is in scope after the group by.
/*+ hash*/
group by $idDBLP := $idDBLP, $idCSX := $idCSX, $sim := $sim with $sim
order by $idDBLP, $idCSX
return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[0]}