/* Source: asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard-inline.aql (asterixdb, Git at Google) */

 /*
  * Description    : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens.
  *                  DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join.
  *                  We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index.
  * Success        : Yes
  */

 /* Start from a clean slate: recreate the `test` dataverse and switch to it. */
 drop dataverse test if exists;
 create dataverse test;
 use dataverse test;

 /* DBLP side of the join: closed type and its dataset, keyed on id. */
 create type DBLPType as closed {
     id:      int32,
     dblpid:  string,
     title:   string,
     authors: string,
     misc:    string
 }
 create dataset DBLP(DBLPType) partitioned by key id;

 /* CSX side of the join: same field layout, with csxid in place of dblpid. */
 create type CSXType as closed {
     id:      int32,
     csxid:   string,
     title:   string,
     authors: string,
     misc:    string
 }
 create dataset CSX(CSXType) partitioned by key id;

 /*
  * Load both datasets from ':'-delimited text files whose paths start with nc1://.
  * Only the DBLP load carries the pre-sorted keyword; the CSX load does not.
  * NOTE(review): pre-sorted presumably declares the input already ordered by
  * the partitioning key - confirm against the load statement documentation.
  */
 load dataset DBLP
 using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
 (
     ("path"="nc1://data/dblp-small/dblp-small-id.txt"),
     ("format"="delimited-text"),
     ("delimiter"=":")
 ) pre-sorted;

 load dataset CSX
 using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
 (
     ("path"="nc1://data/pub-small/csx-small-id.txt"),
     ("format"="delimited-text"),
     ("delimiter"=":")
 );

 /* 3-gram index on DBLP titles, created once both loads have been issued. */
 create index ngram_index on DBLP(title) type ngram(3);

 /* Query results go to this file on nc1. */
 write output to nc1:"rttest/inverted-index-join_ngram-jaccard-inline.adm";

 /*
  * Pair every DBLP record with every CSX record and score each pair by the
  * Jaccard similarity of the two titles' 3-gram tokens. Pairs are kept when the
  * score is at least 0.5 and the DBLP id is smaller than the CSX id; output is
  * ordered by score, then DBLP id, then CSX id.
  * The score is bound by a separate let instead of being written inside the
  * where clause; per the file's description, inlining that variable is what
  * this test exercises.
  * NOTE(review): the `false` argument of gram-tokens presumably turns off
  * pre/post padding of the string - confirm against the function reference.
  */
 for $dblp in dataset('DBLP')
 for $csx in dataset('CSX')
 let $sim := similarity-jaccard(gram-tokens($dblp.title, 3, false), gram-tokens($csx.title, 3, false))
 where $sim >= 0.5f and $dblp.id < $csx.id
 order by $sim, $dblp.id, $csx.id
 return { "arec": $dblp, "brec": $csx, "jacc": $sim }
	/*
	* Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens.
	* DBLP has a 3-gram index on title, and we expect the join to be transformed into an indexed nested-loop join.
	* We test the inlining of variables that enable the select to be pushed into the join for subsequent optimization with an index.
	* Success : Yes
	*/

	/*
	 * Start from a clean slate: recreate the `test` dataverse and switch to it.
	 * NOTE(review): from the second description header onward this file repeats
	 * the script above statement for statement - looks like an extraction
	 * artifact; confirm and keep only one copy.
	 */
	drop dataverse test if exists;
	create dataverse test;
	use dataverse test;

	/* DBLP side of the join: closed type and its dataset, keyed on id. */
	create type DBLPType as closed {
		id:      int32,
		dblpid:  string,
		title:   string,
		authors: string,
		misc:    string
	}
	create dataset DBLP(DBLPType) partitioned by key id;

	/* CSX side of the join: same field layout, with csxid in place of dblpid. */
	create type CSXType as closed {
		id:      int32,
		csxid:   string,
		title:   string,
		authors: string,
		misc:    string
	}
	create dataset CSX(CSXType) partitioned by key id;

	/*
	 * Load both datasets from ':'-delimited text files whose paths start with nc1://.
	 * Only the DBLP load carries the pre-sorted keyword; the CSX load does not.
	 * NOTE(review): pre-sorted presumably declares the input already ordered by
	 * the partitioning key - confirm against the load statement documentation.
	 */
	load dataset DBLP
	using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
	(
		("path"="nc1://data/dblp-small/dblp-small-id.txt"),
		("format"="delimited-text"),
		("delimiter"=":")
	) pre-sorted;

	load dataset CSX
	using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
	(
		("path"="nc1://data/pub-small/csx-small-id.txt"),
		("format"="delimited-text"),
		("delimiter"=":")
	);

	/* 3-gram index on DBLP titles, created once both loads have been issued. */
	create index ngram_index on DBLP(title) type ngram(3);

	/* Query results go to this file on nc1. */
	write output to nc1:"rttest/inverted-index-join_ngram-jaccard-inline.adm";

	/*
	 * Pair every DBLP record with every CSX record and score each pair by the
	 * Jaccard similarity of the two titles' 3-gram tokens. Pairs are kept when the
	 * score is at least 0.5 and the DBLP id is smaller than the CSX id; output is
	 * ordered by score, then DBLP id, then CSX id.
	 * The score is bound by a separate let instead of being written inside the
	 * where clause; per the file's description, inlining that variable is what
	 * this test exercises.
	 * NOTE(review): the `false` argument of gram-tokens presumably turns off
	 * pre/post padding of the string - confirm against the function reference.
	 */
	for $dblp in dataset('DBLP')
	for $csx in dataset('CSX')
	let $sim := similarity-jaccard(gram-tokens($dblp.title, 3, false), gram-tokens($csx.title, 3, false))
	where $sim >= 0.5f and $dblp.id < $csx.id
	order by $sim, $dblp.id, $csx.id
	return { "arec": $dblp, "brec": $csx, "jacc": $sim }