/* asterixdb/asterix-app/src/test/resources/optimizerts/queries_sqlpp/inverted-index-join-noeqjoin/word-jaccard.sqlpp - asterixdb - Git at Google */

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 /*
  * Description    : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' word tokens.
  *                  DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join.
  *                  We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary.
  * Success        : Yes
  */

 /* Recreate the `test` dataverse from scratch and make it the default for the statements that follow. */
 drop dataverse test if exists;
 create dataverse test;
 use test;


 /*
  * Record type backing the DBLP dataset.
  * Declared `closed`, so records carry exactly the five fields listed here.
  * `title` is the field the keyword index and the fuzzy join operate on.
  */
 create type test.DBLPType as closed {
     id      : int32,
     dblpid  : string,
     title   : string,
     authors : string,
     misc    : string
 }

 /*
  * Record type backing the CSX dataset.
  * Same layout as DBLPType except that the source identifier field is `csxid`.
  * Declared `closed`, so records carry exactly the five fields listed here.
  */
 create type test.CSXType as closed {
     id      : int32,
     csxid   : string,
     title   : string,
     authors : string,
     misc    : string
 }

 /*
  * Both datasets use `id` as their primary key.
  * Only DBLP gets a secondary index: a keyword index on `title`.
  * CSX has no secondary index in this script.
  */
 create dataset DBLP(DBLPType) primary key id;
 create dataset CSX(CSXType) primary key id;
 create index keyword_index on DBLP (title) type keyword;

 /*
  * Pair up DBLP and CSX records whose titles, split into word tokens, have a
  * Jaccard similarity of at least 0.5, keeping only pairs with a.id < b.id.
  * Each result is an object holding the two titles.
  *
  * NOTE(review): the description above says a specific plan is expected, so
  * the comma-separated FROM plus WHERE predicate is kept as written; only the
  * layout differs. Do not rewrite as an explicit JOIN without regenerating
  * the expected plan.
  */
 write output to asterix_nc1:"rttest/inverted-index-join-noeqjoin_word-jaccard.adm"
 select element {
     'atitle' : a.title,
     'btitle' : b.title
 }
 from DBLP as a, CSX as b
 where (
     (test.`similarity-jaccard`(
          test.`word-tokens`(a.title),
          test.`word-tokens`(b.title)
      ) >= 0.500000f)
     and (a.id < b.id)
 )
 ;
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	/*
	* Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' word tokens.
	* DBLP has a keyword index on title, and we expect the join to be transformed into an indexed nested-loop join.
	* We expect the top-level equi join introduced because of surrogate optimization to be removed, since it is not necessary.
	* Success : Yes
	*/

	/*
	 * Recreate the `test` dataverse from scratch and make it the default.
	 * NOTE(review): from the second license header onward this file repeats the
	 * script above verbatim (whitespace aside) - likely an extraction artifact;
	 * confirm against the upstream file before relying on the second copy.
	 */
	drop dataverse test if exists;
	create dataverse test;
	use test;


	/*
	 * Record type backing the DBLP dataset.
	 * Declared `closed`, so records carry exactly the five fields listed here.
	 */
	create type test.DBLPType as closed {
	    id      : int32,
	    dblpid  : string,
	    title   : string,
	    authors : string,
	    misc    : string
	}

	/*
	 * Record type backing the CSX dataset.
	 * Declared `closed`, so records carry exactly the five fields listed here.
	 */
	create type test.CSXType as closed {
	    id      : int32,
	    csxid   : string,
	    title   : string,
	    authors : string,
	    misc    : string
	}

	/*
	 * Both datasets use `id` as their primary key.
	 * Only DBLP gets a secondary index: a keyword index on `title`.
	 */
	create dataset DBLP(DBLPType) primary key id;
	create dataset CSX(CSXType) primary key id;
	create index keyword_index on DBLP (title) type keyword;

	/*
	 * Pair up DBLP and CSX records whose titles, split into word tokens, have a
	 * Jaccard similarity of at least 0.5, keeping only pairs with a.id < b.id.
	 * Each result is an object holding the two titles.
	 *
	 * NOTE(review): the comma-separated FROM plus WHERE predicate is kept as
	 * written because the description says a specific plan is expected; only
	 * the layout differs.
	 */
	write output to asterix_nc1:"rttest/inverted-index-join-noeqjoin_word-jaccard.adm"
	select element {
	    'atitle' : a.title,
	    'btitle' : b.title
	}
	from DBLP as a, CSX as b
	where (
	    (test.`similarity-jaccard`(
	         test.`word-tokens`(a.title),
	         test.`word-tokens`(b.title)
	     ) >= 0.500000f)
	    and (a.id < b.id)
	)
	;