blob: 25cd55ba5de50d0121509ea96ad929e1c6093ae0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import type { TabEntry } from '../../druid-models';
import { WorkbenchQuery } from '../../druid-models';
const BASE_QUERY = WorkbenchQuery.blank().changeQueryContext({ maxNumTasks: 2 });
export function getDemoQueries(): TabEntry[] {
function makeDemoQuery(queryString: string): WorkbenchQuery {
return BASE_QUERY.changeQueryString(queryString.trim());
}
return [
{
id: 'demo1',
tabName: 'Demo 1',
query: makeDemoQuery(
`
-- Demo 1 showcases the new SQL syntax available as part of the sql-msq-task engine.
--
-- The two syntax pieces highlighted in this demo are:
--
-- 1. The REPLACE clause wraps a query and specifies that the result of the underlying query should be written to a datasource.
-- You can also use INSERT instead of REPLACE to append data instead of replacing it.
--
-- 2. The TABLE(EXTERN(...)) function defines a external table that can be used anywhere a table can be used.
--
-- This demo reads the file located at https://static.imply.io/example-data/kttm-v2/kttm-v2-2019-08-25.json.gz and writes it into
-- the "kttm_simple" datasource (overwriting it if it already exists).
--
-- You can click the "Preview" button if you want to see a preview of the shape of the data that will be ingested and then
-- click "Run" to actually create the kttm_simple datasource (it will be needed for later demos).
--
-- The ingestion here is as simple as it could be (SELECT *), in "Demo 2" you will perform more transformations on the data.
REPLACE INTO "kttm_simple" OVERWRITE ALL
SELECT *
FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/kttm-v2/kttm-v2-2019-08-25.json.gz"]}',
'{"type":"json"}'
)
) EXTEND ("timestamp" VARCHAR, "agent_category" VARCHAR, "agent_type" VARCHAR, "browser" VARCHAR, "browser_version" VARCHAR, "city" VARCHAR, "continent" VARCHAR, "country" VARCHAR, "version" VARCHAR, "event_type" VARCHAR, "event_subtype" VARCHAR, "loaded_image" VARCHAR, "adblock_list" VARCHAR, "forwarded_for" VARCHAR, "language" VARCHAR, "number" VARCHAR, "os" VARCHAR, "path" VARCHAR, "platform" VARCHAR, "referrer" VARCHAR, "referrer_host" VARCHAR, "region" VARCHAR, "remote_address" VARCHAR, "screen" VARCHAR, "session" VARCHAR, "session_length" BIGINT, "timezone" VARCHAR, "timezone_offset" VARCHAR, "window" VARCHAR)
PARTITIONED BY ALL TIME
`,
),
},
{
id: 'demo2',
tabName: 'Demo 2',
query: makeDemoQuery(
`
-- In Demo 2, you build on the concepts of Demo 1 by transforming data.
-- You're no longer simply reading data from a file and writing it to a datasource.
--
-- In this ingestion, you:
-- - select only a handful of columns.
-- - parse the "timestamp" column, truncate it to MINUTE, and make it the "primary time column" by aliasing it to __time.
-- - filter the rows with a WHERE clause so that only data for 'iOS' is ingested.
-- - perform "rollup" by applying a GROUP BY clause and specifying some aggregates ("metrics").
-- - apply time based partitioning as well as clustering on two columns.
--
-- In Demo 3, you'll enrich data at ingestion time by transforming it further.
REPLACE INTO "kttm_rollup" OVERWRITE ALL
WITH kttm_data AS (
SELECT * FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/kttm-v2/kttm-v2-2019-08-25.json.gz"]}',
'{"type":"json"}'
)
) EXTEND ("timestamp" VARCHAR, "agent_category" VARCHAR, "agent_type" VARCHAR, "browser" VARCHAR, "browser_version" VARCHAR, "city" VARCHAR, "continent" VARCHAR, "country" VARCHAR, "version" VARCHAR, "event_type" VARCHAR, "event_subtype" VARCHAR, "loaded_image" VARCHAR, "adblock_list" VARCHAR, "forwarded_for" VARCHAR, "language" VARCHAR, "number" VARCHAR, "os" VARCHAR, "path" VARCHAR, "platform" VARCHAR, "referrer" VARCHAR, "referrer_host" VARCHAR, "region" VARCHAR, "remote_address" VARCHAR, "screen" VARCHAR, "session" VARCHAR, "session_length" BIGINT, "timezone" VARCHAR, "timezone_offset" VARCHAR, "window" VARCHAR))
SELECT
FLOOR(TIME_PARSE("timestamp") TO MINUTE) AS __time,
session,
agent_category,
agent_type,
browser,
browser_version,
MV_TO_ARRAY("language") AS "language",
os,
city,
country,
forwarded_for AS ip_address,
COUNT(*) AS "cnt",
SUM(session_length) AS session_length,
APPROX_COUNT_DISTINCT_DS_HLL(event_type) AS unique_event_types
FROM kttm_data
WHERE os = 'iOS'
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
PARTITIONED BY HOUR
CLUSTERED BY browser, session
`,
),
},
{
id: 'demo3',
tabName: 'Demo 3',
query: makeDemoQuery(
`
-- This demo has the same query as Demo 2, except that it utilizes an additional external data file (you can have as many
-- as you want) to enrich the data via a JOIN. Specifically for every country in the fact data you add a column for Capital
-- and that countries ISO3 code.
-- This query also computes a derived column, "browser_major", by applying a SQL transformation.
--
-- In the next demo you will see how you are not limited to reading external data in these queries.
REPLACE INTO "kttm_transformed" OVERWRITE ALL
WITH
kttm_data AS (
SELECT * FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/kttm-v2/kttm-v2-2019-08-25.json.gz"]}',
'{"type":"json"}'
)
) EXTEND ("timestamp" VARCHAR, "agent_category" VARCHAR, "agent_type" VARCHAR, "browser" VARCHAR, "browser_version" VARCHAR, "city" VARCHAR, "continent" VARCHAR, "country" VARCHAR, "version" VARCHAR, "event_type" VARCHAR, "event_subtype" VARCHAR, "loaded_image" VARCHAR, "adblock_list" VARCHAR, "forwarded_for" VARCHAR, "language" VARCHAR, "number" VARCHAR, "os" VARCHAR, "path" VARCHAR, "platform" VARCHAR, "referrer" VARCHAR, "referrer_host" VARCHAR, "region" VARCHAR, "remote_address" VARCHAR, "screen" VARCHAR, "session" VARCHAR, "session_length" BIGINT, "timezone" VARCHAR, "timezone_offset" VARCHAR, "window" VARCHAR)),
country_lookup AS (
SELECT * FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/lookup/countries.tsv"]}',
'{"type":"tsv","findColumnsFromHeader":true}'
)
) EXTEND ("Country" VARCHAR, "Capital" VARCHAR, "ISO3" VARCHAR, "ISO2" VARCHAR))
SELECT
FLOOR(TIME_PARSE("timestamp") TO MINUTE) AS __time,
session,
agent_category,
agent_type,
browser,
browser_version,
CAST(REGEXP_EXTRACT(browser_version, '^(\\d+)') AS BIGINT) AS browser_major,
MV_TO_ARRAY("language") AS "language",
os,
city,
country,
country_lookup.Capital AS capital,
country_lookup.ISO3 AS iso3,
forwarded_for AS ip_address,
COUNT(*) AS "cnt",
SUM(session_length) AS session_length,
APPROX_COUNT_DISTINCT_DS_HLL(event_type) AS unique_event_types
FROM kttm_data
LEFT JOIN country_lookup ON country_lookup.Country = kttm_data.country
WHERE os = 'iOS'
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
PARTITIONED BY HOUR
CLUSTERED BY browser, session
`,
),
},
{
id: 'demo4a',
tabName: 'Demo 4a',
query: makeDemoQuery(
`
-- This demo has a query that is identical to the previous demo except instead of reading the fact data from an external
-- file, it reads the "kttm_simple" datasource that was created in Demo 1.
-- This shows you that you can mix and match data already stored in Druid with external data and transform as needed.
--
-- In the next demo you will look at another type of data transformation.
REPLACE INTO "kttm_reingest" OVERWRITE ALL
WITH
country_lookup AS (
SELECT * FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/lookup/countries.tsv"]}',
'{"type":"tsv","findColumnsFromHeader":true}'
)
) EXTEND ("Country" VARCHAR, "Capital" VARCHAR, "ISO3" VARCHAR, "ISO2" VARCHAR))
SELECT
FLOOR(TIME_PARSE("timestamp") TO MINUTE) AS __time,
session,
agent_category,
agent_type,
browser,
browser_version,
CAST(REGEXP_EXTRACT(browser_version, '^(\\d+)') AS BIGINT) AS browser_major,
MV_TO_ARRAY("language") AS "language",
os,
city,
country,
country_lookup.Capital AS capital,
country_lookup.ISO3 AS iso3,
forwarded_for AS ip_address,
COUNT(*) AS "cnt",
SUM(session_length) AS session_length,
APPROX_COUNT_DISTINCT_DS_HLL(event_type) AS unique_event_types
FROM kttm_simple
LEFT JOIN country_lookup ON country_lookup.Country = kttm_simple.country
WHERE os = 'iOS'
GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
PARTITIONED BY HOUR
CLUSTERED BY browser, session
`,
),
},
{
id: 'demo4b',
tabName: 'Demo 4b',
query: makeDemoQuery(
`
-- Imagine you are an avid reader of https://www.reddit.com/r/MapsWithoutNZ and want your data to reflect your cartography
-- humor. In this demo you transform the datasource created in Demo 1 to remove all entries for 'New Zealand'.
-- Notice that you can read and write to the same datasource.
-- You can open a new tab and run 'SELECT COUNT(*) FROM kttm_simple' before and after running this query to see the change.
REPLACE INTO kttm_simple OVERWRITE ALL
SELECT *
FROM kttm_simple
WHERE NOT(country = 'New Zealand')
PARTITIONED BY ALL TIME
`,
),
},
{
id: 'demo5',
tabName: 'Demo 5',
query: makeDemoQuery(
`
-- You don't have to wrap your query in a REPLACE / INSERT clause to run it with the sql-msq-task engine. In fact, when you
-- use the "Preview" button for any of the earlier demos, it works by removing the REPLACE / INSERT clause and
-- running the query "inline" as a SELECT with a limit.
--
-- If you want to run a one time calculation on data, you can do it without ingesting it into Druid. Doing this
-- takes the same order of magnitude of resources and time as it would to ingest the data into Druid though.
--
-- This functionality should be considered as a tech preview of what the sql-msq-task engine can do and not something for
-- production use (outside of specific, narrow uses like the "Preview" feature in this console).
WITH
kttm_data AS (
SELECT * FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/kttm-v2/kttm-v2-2019-08-25.json.gz"]}',
'{"type":"json"}'
)
) EXTEND ("timestamp" VARCHAR, "agent_category" VARCHAR, "agent_type" VARCHAR, "browser" VARCHAR, "browser_version" VARCHAR, "city" VARCHAR, "continent" VARCHAR, "country" VARCHAR, "version" VARCHAR, "event_type" VARCHAR, "event_subtype" VARCHAR, "loaded_image" VARCHAR, "adblock_list" VARCHAR, "forwarded_for" VARCHAR, "language" VARCHAR, "number" VARCHAR, "os" VARCHAR, "path" VARCHAR, "platform" VARCHAR, "referrer" VARCHAR, "referrer_host" VARCHAR, "region" VARCHAR, "remote_address" VARCHAR, "screen" VARCHAR, "session" VARCHAR, "session_length" BIGINT, "timezone" VARCHAR, "timezone_offset" VARCHAR, "window" VARCHAR)),
country_lookup AS (
SELECT * FROM TABLE(
EXTERN(
'{"type":"http","uris":["https://static.imply.io/example-data/lookup/countries.tsv"]}',
'{"type":"tsv","findColumnsFromHeader":true}'
)
) EXTEND ("Country" VARCHAR, "Capital" VARCHAR, "ISO3" VARCHAR, "ISO2" VARCHAR))
SELECT
os,
CONCAT(country, ' (', country_lookup.ISO3, ')') AS "country",
COUNT(DISTINCT session) AS "unique_sessions"
FROM kttm_data
LEFT JOIN country_lookup ON country_lookup.Country = kttm_data.country
GROUP BY 1, 2
ORDER BY 3 DESC
LIMIT 10
`,
),
},
{
id: 'demo6',
tabName: 'Demo 6',
query: makeDemoQuery(
`
-- At the heart of the sql-msq-task engine is the ability to sort arbitrarily large amounts of data. Demo 6
-- re-sorts the kttm_simple datasource on an arbitrary sort condition. This isn't possible with the "sql" or "native"
-- engines since it requires the same order of magnitude of resources as ingesting the entire dataset over again.
SELECT
session,
number,
browser,
browser_version,
"language",
event_type,
event_subtype
FROM "kttm_simple"
ORDER BY
session ASC,
number ASC
`,
).changeEngine('sql-msq-task'),
},
];
}