blob: 900c29cec4c275443384695f06f296abd19bf3a2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Description : Fuzzy self joins a dataset, TweetMessages, based on the similarity-jaccard-check function of the word tokens of its message-text field.
* TweetMessages has a keyword index on message-text and a btree index on the primary key tweetid, and we expect the join to be
* transformed into btree and inverted indexed nested-loop joins. We test whether the join condition can be transformed into
* multiple indexed nested-loop joins over various types of indexes.
* Success : Yes
*/
/* Start from a clean slate: remove any existing `test` dataverse, then recreate it. */
drop dataverse test if exists;
create dataverse test;
/* Select `test` for the statements that follow. */
use test;
/*
 * Closed record type describing a Twitter user. It is embedded as the
 * `user` field of TweetMessageNestedType.
 * Hyphenated field names must be quoted with backticks.
 */
create type test.TwitterUserType as
 closed {
  `screen-name` : string,
  lang : string,
  `friends-count` : int32,
  `statuses-count` : int32,
  name : string,
  `followers-count` : int32
};
/*
 * Closed record type holding the actual tweet payload. It is not used as a
 * dataset type directly; TweetMessageType wraps it in a single `nested`
 * field, so the dataset's key and indexes are declared on nested.<field> paths.
 */
create type test.TweetMessageNestedType as
 closed {
  tweetid : int64,
  user : TwitterUserType,
  `sender-location` : point,
  `send-time` : datetime,
  `referred-topics` : {{string}},
  `message-text` : string,
  countA : int32,
  countB : int32
};
/*
 * Dataset record type: a closed record whose only field, `nested`, carries
 * the tweet payload (TweetMessageNestedType).
 */
create type test.TweetMessageType as
 closed {
  nested : TweetMessageNestedType
};
/*
 * The dataset under test, keyed on the nested tweet id.
 */
create dataset TweetMessages(TweetMessageType)
  primary key nested.tweetid;

/*
 * Secondary indexes on nested fields: one rtree, two btrees and one keyword
 * index. The keyword index on `message-text` covers the field that the
 * similarity join in this test compares.
 */
create index twmSndLocIx
  on TweetMessages (nested.`sender-location`)
  type rtree;

create index msgCountAIx
  on TweetMessages (nested.countA)
  type btree;

create index msgCountBIx
  on TweetMessages (nested.countB)
  type btree;

create index msgTextIx
  on TweetMessages (nested.`message-text`)
  type keyword;
/*
 * Output directive for the query below. It is terminated with `;` so that it
 * is a statement of its own, like every other statement in this script.
 */
write output to asterix_nc1:"rttest/inverted-index-join_word-jaccard-check-after-btree-access.adm";

/*
 * Fuzzy self-join of TweetMessages on the word tokens of `message-text`.
 *
 * `sim` binds the result of similarity-jaccard-check with a 0.6 threshold:
 * sim[0] is used as the join predicate and sim[1] is returned as 'sim'.
 *
 * The remaining predicates restrict t1 to tweet ids below 20 (a range on the
 * primary key) and exclude pairing a tweet with itself.
 *
 * The conjunct order and the comma-style self-join are kept exactly as
 * written, because this is an optimizer test and the query shape is what is
 * being checked.
 */
select element {
         't1'  : t1.nested.tweetid,
         't2'  : t2.nested.tweetid,
         'sim' : sim[1]
       }
from  TweetMessages as t1,
      TweetMessages as t2
with  sim as test.`similarity-jaccard-check`(
               test.`word-tokens`(t1.nested.`message-text`),
               test.`word-tokens`(t2.nested.`message-text`),
               0.600000f
             )
where (sim[0]
       and (t1.nested.tweetid < test.int64('20'))
       and (t2.nested.tweetid != t1.nested.tweetid))
;