blob: 4efb5eeb17e0b2447c2cb940412bf24adf12cc9c [file] [log] [blame]
/*
* Description : Fuzzy joins two datasets, Customers and Customers2, based on the Jaccard similarity of their interest sets.
* Customers has a keyword index on interests, and we expect the join to be transformed into an indexed nested-loop join.
* Success : Yes
*/
/* Start from a clean slate: remove any earlier 'test' dataverse, recreate it, and make it the current one. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;
/* Postal address record; referenced as an optional field by CustomerType. */
create type AddressType as closed {
  number: int32,
  street: string,
  city: string
}
/*
 * Customer record shared by both datasets in this test.
 * interests is an unordered list of strings; it is the field that gets the
 * keyword index and that the Jaccard join compares.
 */
create type CustomerType as closed {
  cid: int32,
  name: string,
  age: int32?,
  address: AddressType?,
  interests: {{string}},
  children: [
    {
      name: string,
      age: int32?
    }
  ]
}
/* Two datasets with the same record type, both keyed on cid. */
create dataset Customers(CustomerType)
  partitioned by key cid;
create dataset Customers2(CustomerType)
  partitioned by key cid;
/* Both datasets are bulk-loaded from the same ADM file, so they start out with identical contents. */
load dataset Customers
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(
  ("path"="nc1://data/semistructured/co1k/customer.adm"),
  ("format"="adm")
);

load dataset Customers2
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(
  ("path"="nc1://data/semistructured/co1k/customer.adm"),
  ("format"="adm")
);
/* Keyword index on the interests field of Customers only; no index is created on Customers2. */
create index interests_index on Customers(interests) type keyword;
/* Destination file for the query result. */
write output to nc1:"rttest/index-join_inverted-index-ulist-jaccard.adm";

/*
 * Fuzzy join of Customers with Customers2 on the Jaccard similarity of their
 * interests lists, keeping pairs that score at least 0.9.
 *
 * - The indexnl hint must stay directly in front of similarity-jaccard; it
 *   asks for the indexed nested-loop join that the file header expects.
 * - Both datasets were loaded from the same file, so the cid ordering
 *   predicate keeps a record from pairing with its own copy and keeps only
 *   one of each pair of mirrored matches.
 * - Only interests lists with more than one element take part.
 * - Ordering by both cids makes the output deterministic.
 */
for $left in dataset('Customers')
for $right in dataset('Customers2')
where /*+ indexnl */ similarity-jaccard($left.interests, $right.interests) >= 0.9f
  and $left.cid < $right.cid
  and len($left.interests) > 1
  and len($right.interests) > 1
order by $left.cid, $right.cid
return {
  "a": $left.interests,
  "b": $right.interests
}