blob: 0ea267c7da5f362270362b8ba60d4b5caff57d71 [file] [log] [blame]
/*
* Description : Fuzzy joins two datasets, Customers and Customers2, based on the edit-distance function of their names.
* Customers has a 3-gram index on name, and we expect the join to be transformed into an indexed nested-loop join.
* We test that inlining the variables enables the select to be pushed into the join for subsequent optimization with an index.
* We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary.
* Success : Yes
*/
/* Start from a fresh 'test' dataverse so the script can be re-run. */
drop dataverse test if exists;
create dataverse test;
use dataverse test;

/* Address record embedded in CustomerType. */
create type AddressType as open {
    number: int32,
    street: string,
    city: string
}

/* Customer record; 'name' is the field compared by the fuzzy join below. */
create type CustomerType as open {
    cid: int32,
    name: string,
    age: int32?,
    address: AddressType?,
    interests: [string],
    children: [ { name: string, age: int32? } ]
}

/* Two datasets with the same record type, both keyed on cid. */
create dataset Customers(CustomerType) partitioned by key cid;
create dataset Customers2(CustomerType) partitioned by key cid;
/* Both datasets are populated from the same ADM file on nc1. */
load dataset Customers
    using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
    (
        ("path"="nc1://data/semistructured/co1k_olist/customer.adm"),
        ("format"="adm")
    );

load dataset Customers2
    using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
    (
        ("path"="nc1://data/semistructured/co1k_olist/customer.adm"),
        ("format"="adm")
    );

/* 3-gram index on Customers.name only; Customers2 gets no secondary index. */
create index ngram_index on Customers(name) type ngram(3);

/* Destination file for the query result. */
write output to nc1:"rttest/inverted-index-join-noeqjoin_ngram-edit-distance-inline.adm";
/*
 * Pair each Customers record with each Customers2 record whose name is
 * within edit distance 4, keeping only pairs where the Customers cid is
 * strictly smaller than the Customers2 cid. The distance is bound once
 * with 'let' and reused in the filter, the ordering, and the result.
 */
for $cust in dataset('Customers')
for $cust2 in dataset('Customers2')
let $dist := edit-distance($cust.name, $cust2.name)
where $dist <= 4 and $cust.cid < $cust2.cid
/* Order by distance, then by both cids. */
order by $dist, $cust.cid, $cust2.cid
return {
    "a": $cust.name,
    "b": $cust2.name,
    "ed": $dist
}