-- disable ORCA
SET optimizer TO off;
-- Test case group 1: basic functions
CREATE TABLE agg_pushdown_parent (
i int primary key,
x int);
CREATE TABLE agg_pushdown_child1 (
j int,
parent int,
v double precision,
PRIMARY KEY (j, parent));
CREATE INDEX ON agg_pushdown_child1(parent);
CREATE TABLE agg_pushdown_child2 (
k int,
parent int,
v double precision,
PRIMARY KEY (k, parent));
INSERT INTO agg_pushdown_parent(i, x)
SELECT n, n
FROM generate_series(0, 7) AS s(n);
INSERT INTO agg_pushdown_child1(j, parent, v)
SELECT 128 * i + n, i, random()
FROM generate_series(0, 127) AS s(n), agg_pushdown_parent;
INSERT INTO agg_pushdown_child2(k, parent, v)
SELECT 128 * i + n, i, random()
FROM generate_series(0, 127) AS s(n), agg_pushdown_parent;
ANALYZE agg_pushdown_parent;
ANALYZE agg_pushdown_child1;
ANALYZE agg_pushdown_child2;
SET enable_nestloop TO on;
SET enable_hashjoin TO off;
SET enable_mergejoin TO off;
-- Perform a scan of one table, aggregate the result, join it to the other
-- table and finalize the aggregation.
--
-- In addition, check that the functionally dependent column "p.x" can be
-- referenced in the SELECT list although GROUP BY references "p.i".
SET gp_enable_agg_pushdown TO off;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.x, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.i GROUP BY p.i;
QUERY PLAN
----------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.x, (avg(c1.v)), p.i
-> HashAggregate
Output: p.x, avg(c1.v), p.i
Group Key: p.i
-> Nested Loop
Output: p.i, p.x, c1.v
Inner Unique: true
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.v, c1.parent
Hash Key: c1.parent
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.v, c1.parent
-> Memoize
Output: p.x, p.i
Cache Key: c1.parent
Cache Mode: logical
-> Index Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.x, p.i
Index Cond: (p.i = c1.parent)
Settings: enable_hashjoin = 'off', enable_mergejoin = 'off', enable_nestloop = 'on', gp_enable_agg_pushdown = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(22 rows)
SET gp_enable_agg_pushdown TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.x, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.i GROUP BY p.i;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.x, (avg(c1.v)), p.i
-> Finalize GroupAggregate
Output: p.x, avg(c1.v), p.i
Group Key: p.i
-> Sort
Output: p.i, p.x, (PARTIAL avg(c1.v))
Sort Key: p.i
-> Nested Loop
Output: p.i, p.x, (PARTIAL avg(c1.v))
Inner Unique: true
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.parent, (PARTIAL avg(c1.v))
Hash Key: c1.parent
-> Partial HashAggregate
Output: c1.parent, PARTIAL avg(c1.v)
Group Key: c1.parent
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Index Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.i, p.x
Index Cond: (p.i = c1.parent)
Settings: enable_hashjoin = 'off', enable_mergejoin = 'off', enable_nestloop = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(24 rows)
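-- Illustrative sketch (hedged, not executed by this test): the pushed-down
-- plan above partially aggregates the child by its join key and finalizes
-- after the join. Because "p.i" is unique here, it is logically equivalent to
-- pre-aggregating the child before joining, e.g.:
--   SELECT p.x, c1a.avg_v
--   FROM agg_pushdown_parent AS p
--   JOIN (SELECT parent, avg(v) AS avg_v
--         FROM agg_pushdown_child1
--         GROUP BY parent) AS c1a ON c1a.parent = p.i;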
-- The same for hash join.
SET enable_nestloop TO off;
SET enable_hashjoin TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.i GROUP BY p.i;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg(c1.v))
-> Finalize GroupAggregate
Output: p.i, avg(c1.v)
Group Key: p.i
-> Sort
Output: p.i, (PARTIAL avg(c1.v))
Sort Key: p.i
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: p.i, (PARTIAL avg(c1.v))
Hash Key: p.i
-> Hash Join
Output: p.i, (PARTIAL avg(c1.v))
Hash Cond: (p.i = c1.parent)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: p.i
-> Seq Scan on public.agg_pushdown_parent p
Output: p.i
-> Hash
Output: c1.parent, (PARTIAL avg(c1.v))
-> Partial HashAggregate
Output: c1.parent, PARTIAL avg(c1.v)
Group Key: c1.parent
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(27 rows)
-- The same for merge join.
SET enable_hashjoin TO off;
SET enable_mergejoin TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.i GROUP BY p.i;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg(c1.v))
-> Finalize HashAggregate
Output: p.i, avg(c1.v)
Group Key: p.i
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: p.i, (PARTIAL avg(c1.v))
Hash Key: p.i
-> Merge Join
Output: p.i, (PARTIAL avg(c1.v))
Merge Cond: (p.i = c1.parent)
-> Sort
Output: p.i
Sort Key: p.i
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: p.i
-> Seq Scan on public.agg_pushdown_parent p
Output: p.i
-> Sort
Output: c1.parent, (PARTIAL avg(c1.v))
Sort Key: c1.parent
-> Partial HashAggregate
Output: c1.parent, PARTIAL avg(c1.v)
Group Key: c1.parent
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
Settings: enable_hashjoin = 'off', enable_mergejoin = 'on', enable_nestloop = 'off', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(28 rows)
-- Restore the default values.
SET enable_nestloop TO on;
SET enable_hashjoin TO on;
-- Scan the index on the agg_pushdown_child1(parent) column and aggregate the
-- result using the AGG_SORTED strategy.
SET gp_enable_agg_pushdown TO off;
SET enable_seqscan TO off;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.i GROUP BY p.i;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg(c1.v))
-> HashAggregate
Output: p.i, avg(c1.v)
Group Key: p.i
-> Nested Loop
Output: p.i, c1.v
Inner Unique: true
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.v, c1.parent
Hash Key: c1.parent
-> Index Scan using agg_pushdown_child1_parent_idx on public.agg_pushdown_child1 c1
Output: c1.v, c1.parent
-> Memoize
Output: p.i
Cache Key: c1.parent
Cache Mode: logical
-> Index Only Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.i
Index Cond: (p.i = c1.parent)
Settings: enable_hashjoin = 'on', enable_mergejoin = 'on', enable_nestloop = 'on', enable_seqscan = 'off', gp_enable_agg_pushdown = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(22 rows)
SET gp_enable_agg_pushdown TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.i GROUP BY p.i;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg(c1.v))
-> Finalize GroupAggregate
Output: p.i, avg(c1.v)
Group Key: p.i
-> Sort
Output: p.i, (PARTIAL avg(c1.v))
Sort Key: p.i
-> Nested Loop
Output: p.i, (PARTIAL avg(c1.v))
Inner Unique: true
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.parent, (PARTIAL avg(c1.v))
Hash Key: c1.parent
-> Partial GroupAggregate
Output: c1.parent, PARTIAL avg(c1.v)
Group Key: c1.parent
-> Index Scan using agg_pushdown_child1_parent_idx on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Index Only Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.i
Index Cond: (p.i = c1.parent)
Settings: enable_hashjoin = 'on', enable_mergejoin = 'on', enable_nestloop = 'on', enable_seqscan = 'off', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(24 rows)
SET enable_seqscan TO on;
-- Join "c1" to "p.x" column, i.e. one that is not in the GROUP BY clause. The
-- planner should still use "c1.parent" as grouping expression for partial
-- aggregation, although it's not in the same equivalence class as the GROUP
-- BY expression ("p.i"). The reason to use "c1.parent" for partial
-- aggregation is that this is the only way for "c1" to provide the join
-- expression with input data.
SET gp_enable_agg_pushdown TO off;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.x GROUP BY p.i;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg(c1.v))
-> Finalize HashAggregate
Output: p.i, avg(c1.v)
Group Key: p.i
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: p.i, (PARTIAL avg(c1.v))
Hash Key: p.i
-> Partial HashAggregate
Output: p.i, PARTIAL avg(c1.v)
Group Key: p.i
-> Hash Join
Output: p.i, c1.v
Hash Cond: (c1.parent = p.x)
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Hash
Output: p.i, p.x
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: p.i, p.x
-> Seq Scan on public.agg_pushdown_parent p
Output: p.i, p.x
Settings: enable_hashjoin = 'on', enable_mergejoin = 'on', enable_nestloop = 'on', enable_seqscan = 'on', gp_enable_agg_pushdown = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(24 rows)
SET gp_enable_agg_pushdown TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v) FROM agg_pushdown_parent AS p JOIN agg_pushdown_child1
AS c1 ON c1.parent = p.x GROUP BY p.i;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg(c1.v))
-> Finalize GroupAggregate
Output: p.i, avg(c1.v)
Group Key: p.i
-> Sort
Output: p.i, (PARTIAL avg(c1.v))
Sort Key: p.i
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: p.i, (PARTIAL avg(c1.v))
Hash Key: p.i
-> Hash Join
Output: p.i, (PARTIAL avg(c1.v))
Hash Cond: (p.x = c1.parent)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: p.i, p.x
-> Seq Scan on public.agg_pushdown_parent p
Output: p.i, p.x
-> Hash
Output: c1.parent, (PARTIAL avg(c1.v))
-> Partial HashAggregate
Output: c1.parent, PARTIAL avg(c1.v)
Group Key: c1.parent
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
Settings: enable_hashjoin = 'on', enable_mergejoin = 'on', enable_nestloop = 'on', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(27 rows)
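-- Illustrative sketch (hedged, not executed by this test): the "PARTIAL avg"
-- above carries a combinable transition state (sum and count), which is what
-- keeps the result correct when the finalize step regroups by "p.i". A manual
-- equivalent spelled out with explicit sum/count would be:
--   SELECT p.i, sum(c1a.s) / sum(c1a.n) AS avg_v
--   FROM agg_pushdown_parent AS p
--   JOIN (SELECT parent, sum(v) AS s, count(v) AS n
--         FROM agg_pushdown_child1
--         GROUP BY parent) AS c1a ON c1a.parent = p.x
--   GROUP BY p.i;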
-- Perform a nestloop join between agg_pushdown_child1 and agg_pushdown_child2
-- and aggregate the result.
SET enable_nestloop TO on;
SET enable_hashjoin TO off;
SET enable_mergejoin TO off;
SET gp_enable_agg_pushdown TO off;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v + c2.v) FROM agg_pushdown_parent AS p JOIN
agg_pushdown_child1 AS c1 ON c1.parent = p.i JOIN agg_pushdown_child2 AS c2 ON
c2.parent = p.i WHERE c1.j = c2.k GROUP BY p.i;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg((c1.v + c2.v)))
-> HashAggregate
Output: p.i, avg((c1.v + c2.v))
Group Key: p.i
-> Nested Loop
Output: p.i, c1.v, c2.v
Inner Unique: true
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.v, c1.parent, c2.v, c2.parent
Hash Key: c1.parent
-> Nested Loop
Output: c1.v, c1.parent, c2.v, c2.parent
Inner Unique: true
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Index Scan using agg_pushdown_child2_pkey on public.agg_pushdown_child2 c2
Output: c2.k, c2.parent, c2.v
Index Cond: ((c2.k = c1.j) AND (c2.parent = c1.parent))
-> Memoize
Output: p.i
Cache Key: c1.parent
Cache Mode: logical
-> Index Only Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.i
Index Cond: (p.i = c1.parent)
Settings: enable_hashjoin = 'off', enable_mergejoin = 'off', enable_nestloop = 'on', enable_seqscan = 'on', gp_enable_agg_pushdown = 'off', optimizer = 'off'
Optimizer: Postgres query optimizer
(28 rows)
SET gp_enable_agg_pushdown TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v + c2.v) FROM agg_pushdown_parent AS p JOIN
agg_pushdown_child1 AS c1 ON c1.parent = p.i JOIN agg_pushdown_child2 AS c2 ON
c2.parent = p.i WHERE c1.j = c2.k GROUP BY p.i;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg((c1.v + c2.v)))
-> Finalize GroupAggregate
Output: p.i, avg((c1.v + c2.v))
Group Key: p.i
-> Sort
Output: p.i, (PARTIAL avg((c1.v + c2.v)))
Sort Key: p.i
-> Nested Loop
Output: p.i, (PARTIAL avg((c1.v + c2.v)))
Inner Unique: true
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.parent, c2.parent, (PARTIAL avg((c1.v + c2.v)))
Hash Key: c1.parent
-> Partial HashAggregate
Output: c1.parent, c2.parent, PARTIAL avg((c1.v + c2.v))
Group Key: c1.parent
-> Nested Loop
Output: c1.v, c1.parent, c2.v, c2.parent
Inner Unique: true
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Index Scan using agg_pushdown_child2_pkey on public.agg_pushdown_child2 c2
Output: c2.k, c2.parent, c2.v
Index Cond: ((c2.k = c1.j) AND (c2.parent = c1.parent))
-> Index Only Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.i
Index Cond: (p.i = c1.parent)
Settings: enable_hashjoin = 'off', enable_mergejoin = 'off', enable_nestloop = 'on', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(30 rows)
-- The same for hash join.
SET enable_nestloop TO off;
SET enable_hashjoin TO on;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v + c2.v) FROM agg_pushdown_parent AS p JOIN
agg_pushdown_child1 AS c1 ON c1.parent = p.i JOIN agg_pushdown_child2 AS c2 ON
c2.parent = p.i WHERE c1.j = c2.k GROUP BY p.i;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg((c1.v + c2.v)))
-> Finalize GroupAggregate
Output: p.i, avg((c1.v + c2.v))
Group Key: p.i
-> Sort
Output: p.i, (PARTIAL avg((c1.v + c2.v)))
Sort Key: p.i
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: p.i, (PARTIAL avg((c1.v + c2.v)))
Hash Key: p.i
-> Hash Join
Output: p.i, (PARTIAL avg((c1.v + c2.v)))
Hash Cond: (p.i = c1.parent)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: p.i
-> Seq Scan on public.agg_pushdown_parent p
Output: p.i
-> Hash
Output: c1.parent, c2.parent, (PARTIAL avg((c1.v + c2.v)))
-> Partial HashAggregate
Output: c1.parent, c2.parent, PARTIAL avg((c1.v + c2.v))
Group Key: c1.parent
-> Hash Join
Output: c1.v, c1.parent, c2.v, c2.parent
Inner Unique: true
Hash Cond: ((c1.parent = c2.parent) AND (c1.j = c2.k))
-> Seq Scan on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Hash
Output: c2.v, c2.parent, c2.k
-> Seq Scan on public.agg_pushdown_child2 c2
Output: c2.v, c2.parent, c2.k
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(35 rows)
-- The same for merge join.
SET enable_hashjoin TO off;
SET enable_mergejoin TO on;
SET enable_seqscan TO off;
EXPLAIN (VERBOSE on, COSTS off)
SELECT p.i, avg(c1.v + c2.v) FROM agg_pushdown_parent AS p JOIN
agg_pushdown_child1 AS c1 ON c1.parent = p.i JOIN agg_pushdown_child2 AS c2 ON
c2.parent = p.i WHERE c1.j = c2.k GROUP BY p.i;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: p.i, (avg((c1.v + c2.v)))
-> Finalize GroupAggregate
Output: p.i, avg((c1.v + c2.v))
Group Key: p.i
-> Merge Join
Output: p.i, (PARTIAL avg((c1.v + c2.v)))
Inner Unique: true
Merge Cond: (c1.parent = p.i)
-> Sort
Output: c1.parent, c2.parent, (PARTIAL avg((c1.v + c2.v)))
Sort Key: c1.parent
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: c1.parent, c2.parent, (PARTIAL avg((c1.v + c2.v)))
Hash Key: c1.parent
-> Partial HashAggregate
Output: c1.parent, c2.parent, PARTIAL avg((c1.v + c2.v))
Group Key: c1.parent
-> Merge Join
Output: c1.v, c1.parent, c2.v, c2.parent
Inner Unique: true
Merge Cond: ((c1.j = c2.k) AND (c1.parent = c2.parent))
-> Index Scan using agg_pushdown_child1_pkey on public.agg_pushdown_child1 c1
Output: c1.j, c1.parent, c1.v
-> Index Scan using agg_pushdown_child2_pkey on public.agg_pushdown_child2 c2
Output: c2.k, c2.parent, c2.v
-> Index Only Scan using agg_pushdown_parent_pkey on public.agg_pushdown_parent p
Output: p.i
Settings: enable_hashjoin = 'off', enable_mergejoin = 'on', enable_nestloop = 'off', enable_seqscan = 'off', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(30 rows)
SET enable_seqscan TO on;
-- Clear tables
DROP TABLE agg_pushdown_child1;
DROP TABLE agg_pushdown_child2;
DROP TABLE agg_pushdown_parent;
-- Test case group 2: Pushdown with different join keys and group keys.
DROP TABLE IF EXISTS t1, t2;
NOTICE: table "t1" does not exist, skipping
NOTICE: table "t2" does not exist, skipping
CREATE TABLE t1 (id int, val int, comment VARCHAR(20));
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE t2 (id int, val int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
SET enable_nestloop TO off;
SET enable_hashjoin TO on;
SET enable_mergejoin TO off;
SET gp_enable_agg_pushdown TO ON;
-- Join key and group key are the same.
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.id, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id GROUP BY t1.id;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.id, (sum(t1.val))
-> Finalize HashAggregate
Output: t1.id, sum(t1.val)
Group Key: t1.id
-> Hash Join
Output: t1.id, (PARTIAL sum(t1.val))
Hash Cond: (t2.id = t1.id)
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, (PARTIAL sum(t1.val))
-> Partial HashAggregate
Output: t1.id, PARTIAL sum(t1.val)
Group Key: t1.id
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(19 rows)
-- Join key and group key are different.
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.val, SUM(t1.id) FROM t1, t2 WHERE t1.id = t2.id GROUP BY t1.val;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.val, (sum(t1.id))
-> Finalize HashAggregate
Output: t1.val, sum(t1.id)
Group Key: t1.val
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: t1.val, (PARTIAL sum(t1.id))
Hash Key: t1.val
-> Hash Join
Output: t1.val, (PARTIAL sum(t1.id))
Hash Cond: (t2.id = t1.id)
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.val, t1.id, (PARTIAL sum(t1.id))
-> Partial HashAggregate
Output: t1.val, t1.id, PARTIAL sum(t1.id)
Group Key: t1.val, t1.id
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(22 rows)
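-- Illustrative sketch (hedged, not executed by this test): the partial
-- aggregate above keeps "t1.id" in its group key so that the join condition
-- can still be evaluated afterwards; semantically this matches the manual
-- two-level rewrite:
--   SELECT g.val, SUM(g.s)
--   FROM (SELECT val, id, SUM(id) AS s FROM t1 GROUP BY val, id) AS g, t2
--   WHERE g.id = t2.id
--   GROUP BY g.val;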
-- Pushdown through an equivalence class (GROUP BY t2.id, joined on t1.id = t2.id).
EXPLAIN (VERBOSE on, COSTS off)
SELECT t2.id, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id GROUP BY t2.id;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t2.id, (sum(t1.val))
-> Finalize HashAggregate
Output: t2.id, sum(t1.val)
Group Key: t2.id
-> Hash Join
Output: t2.id, (PARTIAL sum(t1.val))
Hash Cond: (t2.id = t1.id)
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, (PARTIAL sum(t1.val))
-> Partial HashAggregate
Output: t1.id, PARTIAL sum(t1.val)
Group Key: t1.id
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(19 rows)
-- Group by column from t2 and aggregate column from t1.
EXPLAIN (VERBOSE on, COSTS off)
SELECT t2.val, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id GROUP BY t2.val;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t2.val, (sum(t1.val))
-> Finalize HashAggregate
Output: t2.val, sum(t1.val)
Group Key: t2.val
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: t2.val, (PARTIAL sum(t1.val))
Hash Key: t2.val
-> Hash Join
Output: t2.val, (PARTIAL sum(t1.val))
Hash Cond: (t2.id = t1.id)
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, (PARTIAL sum(t1.val))
-> Partial HashAggregate
Output: t1.id, PARTIAL sum(t1.val)
Group Key: t1.id
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(22 rows)
-- Pushdown with multiple group keys.
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.id, t1.comment, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id GROUP BY t1.id, t1.comment;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.id, t1.comment, (sum(t1.val))
-> Finalize HashAggregate
Output: t1.id, t1.comment, sum(t1.val)
Group Key: t1.id, t1.comment
-> Hash Join
Output: t1.id, t1.comment, (PARTIAL sum(t1.val))
Hash Cond: (t2.id = t1.id)
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, t1.comment, (PARTIAL sum(t1.val))
-> Partial HashAggregate
Output: t1.id, t1.comment, PARTIAL sum(t1.val)
Group Key: t1.id, t1.comment
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(19 rows)
-- Pushdown with multiple join keys.
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.id, t1.comment, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id and t1.val = t2.val GROUP BY t1.id, t1.comment;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.id, t1.comment, (sum(t1.val))
-> Finalize GroupAggregate
Output: t1.id, t1.comment, sum(t1.val)
Group Key: t1.id, t1.comment
-> Sort
Output: t1.id, t1.comment, (PARTIAL sum(t1.val))
Sort Key: t1.id, t1.comment
-> Hash Join
Output: t1.id, t1.comment, (PARTIAL sum(t1.val))
Hash Cond: ((t2.id = t1.id) AND (t2.val = t1.val))
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, t1.comment, t1.val, (PARTIAL sum(t1.val))
-> Partial HashAggregate
Output: t1.id, t1.comment, t1.val, PARTIAL sum(t1.val)
Group Key: t1.id, t1.comment, t1.val
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(22 rows)
-- Test the above case with different data distributions.
INSERT INTO t1 SELECT i, i, 'asd' FROM generate_series(1, 10000) s(i);
ANALYZE t1;
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.id, t1.comment, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id and t1.val = t2.val GROUP BY t1.id, t1.comment;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.id, t1.comment, (sum(t1.val))
-> GroupAggregate
Output: t1.id, t1.comment, sum(t1.val)
Group Key: t1.id, t1.comment
-> Sort
Output: t1.id, t1.comment, t1.val
Sort Key: t1.id, t1.comment
-> Hash Join
Output: t1.id, t1.comment, t1.val
Hash Cond: ((t2.id = t1.id) AND (t2.val = t1.val))
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, t1.comment, t1.val
-> Seq Scan on public.t1
Output: t1.id, t1.comment, t1.val
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(19 rows)
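-- Note (hedged interpretation): with this data every t1.id is distinct, so a
-- partial aggregate below the join would not reduce the rows fed to the join
-- and the cost-based planner keeps the plain plan. The grouping cardinality of
-- the two data sets can be compared with something like:
--   SELECT count(*) AS total_rows, count(DISTINCT id) AS distinct_ids FROM t1;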
DELETE FROM t1;
INSERT INTO t1 SELECT i % 10, 1, 'asd' FROM generate_series(1, 10000) s(i);
ANALYZE t1;
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.id, t1.comment, SUM(t1.val) FROM t1, t2 WHERE t1.id = t2.id and t1.val = t2.val GROUP BY t1.id, t1.comment;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.id, t1.comment, (sum(t1.val))
-> Finalize GroupAggregate
Output: t1.id, t1.comment, sum(t1.val)
Group Key: t1.id, t1.comment
-> Sort
Output: t1.id, t1.comment, (PARTIAL sum(t1.val))
Sort Key: t1.id, t1.comment
-> Hash Join
Output: t1.id, t1.comment, (PARTIAL sum(t1.val))
Hash Cond: ((t2.id = t1.id) AND (t2.val = t1.val))
-> Seq Scan on public.t2
Output: t2.id, t2.val
-> Hash
Output: t1.id, t1.comment, t1.val, (PARTIAL sum(t1.val))
-> Partial HashAggregate
Output: t1.id, t1.comment, t1.val, PARTIAL sum(t1.val)
Group Key: t1.id, t1.comment, t1.val
-> Seq Scan on public.t1
Output: t1.id, t1.val, t1.comment
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(22 rows)
-- Clear tables
DROP TABLE t1, t2;
-- Test case group 3: Pushdown inside a subquery and grouping based on a subquery.
DROP TABLE IF EXISTS part, lineitem;
NOTICE: table "part" does not exist, skipping
NOTICE: table "lineitem" does not exist, skipping
CREATE TABLE part (p_partkey int, p_size int, p_price int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'p_partkey' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE lineitem (l_orderkey int, l_partkey int, l_amount int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'l_orderkey' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
SET enable_nestloop TO off;
SET enable_hashjoin TO on;
SET enable_mergejoin TO off;
SET gp_enable_agg_pushdown TO ON;
-- Pushdown within a subquery.
EXPLAIN (VERBOSE on, COSTS off)
SELECT SUM(slp) FROM
(SELECT l_partkey, SUM(p_price) from lineitem, part
WHERE l_partkey = p_partkey AND p_size < 40
GROUP BY l_partkey
ORDER BY l_partkey
LIMIT 100) temp(lp, slp)
WHERE slp > 10;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Aggregate
Output: sum(temp.slp)
-> Subquery Scan on temp
Output: temp.lp, temp.slp
Filter: (temp.slp > 10)
-> Limit
Output: lineitem.l_partkey, (sum(part.p_price))
-> Gather Motion 3:1 (slice1; segments: 3)
Output: lineitem.l_partkey, (sum(part.p_price))
Merge Key: lineitem.l_partkey
-> Limit
Output: lineitem.l_partkey, (sum(part.p_price))
-> Sort
Output: lineitem.l_partkey, (sum(part.p_price))
Sort Key: lineitem.l_partkey
-> Finalize HashAggregate
Output: lineitem.l_partkey, sum(part.p_price)
Group Key: lineitem.l_partkey
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: lineitem.l_partkey, (PARTIAL sum(part.p_price))
Hash Key: lineitem.l_partkey
-> Hash Join
Output: lineitem.l_partkey, (PARTIAL sum(part.p_price))
Hash Cond: (lineitem.l_partkey = part.p_partkey)
-> Seq Scan on public.lineitem
Output: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_amount
-> Hash
Output: part.p_partkey, (PARTIAL sum(part.p_price))
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: part.p_partkey, (PARTIAL sum(part.p_price))
-> Partial HashAggregate
Output: part.p_partkey, PARTIAL sum(part.p_price)
Group Key: part.p_partkey
-> Seq Scan on public.part
Output: part.p_partkey, part.p_size, part.p_price
Filter: (part.p_size < 40)
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(38 rows)
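-- Illustrative sketch (hedged, not executed by this test): inside the subquery
-- the aggregate is pushed onto the "part" side, grouped by its join key
-- "p_partkey", before joining to lineitem; the inner query is logically
-- equivalent to:
--   SELECT l_partkey, SUM(ps.s)
--   FROM lineitem,
--        (SELECT p_partkey, SUM(p_price) AS s
--         FROM part WHERE p_size < 40
--         GROUP BY p_partkey) AS ps
--   WHERE l_partkey = ps.p_partkey
--   GROUP BY l_partkey
--   ORDER BY l_partkey
--   LIMIT 100;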
-- Group based on a subquery.
EXPLAIN (VERBOSE on, COSTS off)
SELECT p_partkey, SUM(l_amount) FROM
part, (SELECT l_partkey, l_amount + 10
FROM lineitem ORDER BY l_partkey LIMIT 10000) li(l_partkey, l_amount)
WHERE l_partkey = p_partkey
GROUP BY p_partkey;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: part.p_partkey, (sum(((lineitem.l_amount + 10))))
-> Finalize HashAggregate
Output: part.p_partkey, sum(((lineitem.l_amount + 10)))
Group Key: part.p_partkey
-> Hash Join
Output: part.p_partkey, (PARTIAL sum(((lineitem.l_amount + 10))))
Hash Cond: (part.p_partkey = lineitem.l_partkey)
-> Seq Scan on public.part
Output: part.p_partkey, part.p_size, part.p_price
-> Hash
Output: lineitem.l_partkey, (PARTIAL sum(((lineitem.l_amount + 10))))
-> Redistribute Motion 1:3 (slice2; segments: 1)
Output: lineitem.l_partkey, (PARTIAL sum(((lineitem.l_amount + 10))))
Hash Key: lineitem.l_partkey
-> Partial GroupAggregate
Output: lineitem.l_partkey, PARTIAL sum(((lineitem.l_amount + 10)))
Group Key: lineitem.l_partkey
-> Limit
Output: lineitem.l_partkey, ((lineitem.l_amount + 10))
-> Gather Motion 3:1 (slice3; segments: 3)
Output: lineitem.l_partkey, ((lineitem.l_amount + 10))
Merge Key: lineitem.l_partkey
-> Limit
Output: lineitem.l_partkey, ((lineitem.l_amount + 10))
-> Sort
Output: lineitem.l_partkey, ((lineitem.l_amount + 10))
Sort Key: lineitem.l_partkey
-> Seq Scan on public.lineitem
Output: lineitem.l_partkey, (lineitem.l_amount + 10)
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(32 rows)
-- Clear tables
DROP TABLE part, lineitem;
-- Test case group 4: construct a grouped join rel from two plain rels
DROP TABLE IF EXISTS vendor_pd, customer_pd, nation_pd;
NOTICE: table "vendor_pd" does not exist, skipping
NOTICE: table "customer_pd" does not exist, skipping
NOTICE: table "nation_pd" does not exist, skipping
CREATE TABLE vendor_pd (v_id int, v_name VARCHAR(20)) WITH (APPENDONLY=true, ORIENTATION=column);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v_id' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE customer_pd (c_id int primary key, c_v_id int, c_n_id int, c_type int, c_consumption int);
CREATE TABLE nation_pd (n_id int, n_name VARCHAR(20), n_type int, n_population int) WITH (APPENDONLY=true, ORIENTATION=column);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'n_id' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO nation_pd SELECT i, 'abc', 1, 1 from generate_series(1, 100) s(i);
INSERT INTO customer_pd SELECT i, i % 100, i % 100, 1, 100 from generate_series(1, 10000) s(i);
ANALYZE nation_pd, customer_pd;
-- For each vendor, calculate the total consumption of qualified customers
EXPLAIN (VERBOSE on, COSTS off)
SELECT v_id, v_name, SUM(c_consumption)
FROM vendor_pd, customer_pd, nation_pd
WHERE v_id = c_v_id AND c_n_id = n_id AND c_id > n_population
GROUP BY v_id, v_name;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: vendor_pd.v_id, vendor_pd.v_name, (sum(customer_pd.c_consumption))
-> Finalize HashAggregate
Output: vendor_pd.v_id, vendor_pd.v_name, sum(customer_pd.c_consumption)
Group Key: vendor_pd.v_id, vendor_pd.v_name
-> Hash Join
Output: vendor_pd.v_id, vendor_pd.v_name, (PARTIAL sum(customer_pd.c_consumption))
Hash Cond: (vendor_pd.v_id = customer_pd.c_v_id)
-> Seq Scan on public.vendor_pd
Output: vendor_pd.v_id, vendor_pd.v_name
-> Hash
Output: customer_pd.c_v_id, (PARTIAL sum(customer_pd.c_consumption))
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: customer_pd.c_v_id, (PARTIAL sum(customer_pd.c_consumption))
Hash Key: customer_pd.c_v_id
-> Partial HashAggregate
Output: customer_pd.c_v_id, PARTIAL sum(customer_pd.c_consumption)
Group Key: customer_pd.c_v_id
-> Hash Join
Output: customer_pd.c_consumption, customer_pd.c_v_id
Hash Cond: (customer_pd.c_n_id = nation_pd.n_id)
Join Filter: (customer_pd.c_id > nation_pd.n_population)
-> Seq Scan on public.customer_pd
Output: customer_pd.c_id, customer_pd.c_v_id, customer_pd.c_n_id, customer_pd.c_type, customer_pd.c_consumption
-> Hash
Output: nation_pd.n_id, nation_pd.n_population
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: nation_pd.n_id, nation_pd.n_population
-> Seq Scan on public.nation_pd
Output: nation_pd.n_id, nation_pd.n_population
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(32 rows)
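-- Illustrative sketch (hedged, not executed by this test): here the partial
-- aggregate is built on top of the customer_pd/nation_pd join (a grouped join
-- rel) and keyed by the outer join key "c_v_id"; it corresponds to the manual
-- rewrite:
--   SELECT v_id, v_name, SUM(g.s)
--   FROM vendor_pd,
--        (SELECT c_v_id, SUM(c_consumption) AS s
--         FROM customer_pd, nation_pd
--         WHERE c_n_id = n_id AND c_id > n_population
--         GROUP BY c_v_id) AS g
--   WHERE v_id = g.c_v_id
--   GROUP BY v_id, v_name;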
-- For each vendor/c_type/n_type, calculate the total consumption of qualified customers
EXPLAIN (VERBOSE on, COSTS off)
SELECT v_id, c_type, n_type, SUM(c_consumption)
FROM vendor_pd, customer_pd, nation_pd
WHERE v_id = c_v_id AND c_n_id = n_id AND c_id > n_population
GROUP BY v_id, c_type, n_type;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: vendor_pd.v_id, customer_pd.c_type, nation_pd.n_type, (sum(customer_pd.c_consumption))
-> Finalize HashAggregate
Output: vendor_pd.v_id, customer_pd.c_type, nation_pd.n_type, sum(customer_pd.c_consumption)
Group Key: vendor_pd.v_id, customer_pd.c_type, nation_pd.n_type
-> Hash Join
Output: vendor_pd.v_id, customer_pd.c_type, nation_pd.n_type, (PARTIAL sum(customer_pd.c_consumption))
Hash Cond: (vendor_pd.v_id = customer_pd.c_v_id)
-> Seq Scan on public.vendor_pd
Output: vendor_pd.v_id
-> Hash
Output: customer_pd.c_type, customer_pd.c_v_id, nation_pd.n_type, (PARTIAL sum(customer_pd.c_consumption))
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: customer_pd.c_type, customer_pd.c_v_id, nation_pd.n_type, (PARTIAL sum(customer_pd.c_consumption))
Hash Key: customer_pd.c_v_id
-> Partial HashAggregate
Output: customer_pd.c_type, customer_pd.c_v_id, nation_pd.n_type, PARTIAL sum(customer_pd.c_consumption)
Group Key: customer_pd.c_type, customer_pd.c_v_id, nation_pd.n_type
-> Hash Join
Output: customer_pd.c_type, customer_pd.c_consumption, customer_pd.c_v_id, nation_pd.n_type
Hash Cond: (customer_pd.c_n_id = nation_pd.n_id)
Join Filter: (customer_pd.c_id > nation_pd.n_population)
-> Seq Scan on public.customer_pd
Output: customer_pd.c_id, customer_pd.c_v_id, customer_pd.c_n_id, customer_pd.c_type, customer_pd.c_consumption
-> Hash
Output: nation_pd.n_type, nation_pd.n_id, nation_pd.n_population
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: nation_pd.n_type, nation_pd.n_id, nation_pd.n_population
-> Seq Scan on public.nation_pd
Output: nation_pd.n_type, nation_pd.n_id, nation_pd.n_population
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(32 rows)
-- For each vendor/n_type, calculate the total consumption of customers from nations matching a condition.
EXPLAIN (VERBOSE on, COSTS off)
SELECT v_id, v_name, n_type, SUM(c_consumption)
FROM vendor_pd, customer_pd, nation_pd
WHERE v_id = c_v_id AND c_n_id = n_id AND n_population > 100
GROUP BY v_id, v_name, n_type;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: vendor_pd.v_id, vendor_pd.v_name, nation_pd.n_type, (sum(customer_pd.c_consumption))
-> Finalize HashAggregate
Output: vendor_pd.v_id, vendor_pd.v_name, nation_pd.n_type, sum(customer_pd.c_consumption)
Group Key: vendor_pd.v_id, vendor_pd.v_name, nation_pd.n_type
-> Hash Join
Output: vendor_pd.v_id, vendor_pd.v_name, nation_pd.n_type, (PARTIAL sum(customer_pd.c_consumption))
Hash Cond: (vendor_pd.v_id = customer_pd.c_v_id)
-> Seq Scan on public.vendor_pd
Output: vendor_pd.v_id, vendor_pd.v_name
-> Hash
Output: customer_pd.c_v_id, nation_pd.n_type, (PARTIAL sum(customer_pd.c_consumption))
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: customer_pd.c_v_id, nation_pd.n_type, (PARTIAL sum(customer_pd.c_consumption))
Hash Key: customer_pd.c_v_id
-> Partial HashAggregate
Output: customer_pd.c_v_id, nation_pd.n_type, PARTIAL sum(customer_pd.c_consumption)
Group Key: customer_pd.c_v_id, nation_pd.n_type
-> Hash Join
Output: customer_pd.c_consumption, customer_pd.c_v_id, nation_pd.n_type
Hash Cond: (customer_pd.c_n_id = nation_pd.n_id)
-> Seq Scan on public.customer_pd
Output: customer_pd.c_id, customer_pd.c_v_id, customer_pd.c_n_id, customer_pd.c_type, customer_pd.c_consumption
-> Hash
Output: nation_pd.n_type, nation_pd.n_id
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: nation_pd.n_type, nation_pd.n_id
-> Seq Scan on public.nation_pd
Output: nation_pd.n_type, nation_pd.n_id
Filter: (nation_pd.n_population > 100)
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(32 rows)
-- Clear tables
DROP TABLE vendor_pd, customer_pd, nation_pd;
-- Test case group 5: OLAP-like cases
DROP TABLE IF EXISTS fact, dim;
NOTICE: table "fact" does not exist, skipping
NOTICE: table "dim" does not exist, skipping
CREATE TABLE fact (id int, did int, fact_time int, val int) WITH (APPENDONLY=true, ORIENTATION=column);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE dim (did int, proj_name varchar(20), brand int, model int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'did' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO dim SELECT i % 100, 1, 1 FROM generate_series(1, 100) s(i);
INSERT INTO fact SELECT i % 10, i % 100, 30, 1 FROM generate_series(1, 10000) s(i);
ANALYZE dim, fact;
-- Sum the fact values, grouped by a dim column.
EXPLAIN (VERBOSE on, COSTS off)
SELECT dim.did, sum(val)
FROM fact JOIN dim ON fact.did = dim.did
WHERE fact_time > 10 AND fact_time < 2000
GROUP BY dim.did;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: dim.did, (sum(fact.val))
-> Finalize HashAggregate
Output: dim.did, sum(fact.val)
Group Key: dim.did
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: dim.did, (PARTIAL sum(fact.val))
Hash Key: dim.did
-> Hash Join
Output: dim.did, (PARTIAL sum(fact.val))
Hash Cond: (dim.did = fact.did)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: dim.did
-> Seq Scan on public.dim
Output: dim.did
-> Hash
Output: fact.did, (PARTIAL sum(fact.val))
-> Partial HashAggregate
Output: fact.did, PARTIAL sum(fact.val)
Group Key: fact.did
-> Seq Scan on public.fact
Output: fact.val, fact.did
Filter: ((fact.fact_time > 10) AND (fact.fact_time < 2000))
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(25 rows)
EXPLAIN (VERBOSE on, COSTS off)
SELECT dim.proj_name, sum(val)
FROM fact JOIN dim ON fact.did = dim.did
WHERE fact_time > 10 AND fact_time < 2000
GROUP BY dim.proj_name;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: dim.proj_name, (sum(fact.val))
-> Finalize HashAggregate
Output: dim.proj_name, sum(fact.val)
Group Key: dim.proj_name
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: dim.proj_name, (PARTIAL sum(fact.val))
Hash Key: dim.proj_name
-> Hash Join
Output: dim.proj_name, (PARTIAL sum(fact.val))
Hash Cond: (dim.did = fact.did)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: dim.proj_name, dim.did
-> Seq Scan on public.dim
Output: dim.proj_name, dim.did
-> Hash
Output: fact.did, (PARTIAL sum(fact.val))
-> Partial HashAggregate
Output: fact.did, PARTIAL sum(fact.val)
Group Key: fact.did
-> Seq Scan on public.fact
Output: fact.val, fact.did
Filter: ((fact.fact_time > 10) AND (fact.fact_time < 2000))
Settings: enable_hashjoin = 'on', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(25 rows)
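-- Illustrative sketch (hedged, not executed by this test): in both star-join
-- plans above the measure is pre-aggregated on the fact side, grouped by the
-- join key "fact.did", so only the per-did partial sums are joined to dim;
-- logically:
--   SELECT dim.proj_name, SUM(f.s)
--   FROM dim,
--        (SELECT did, SUM(val) AS s
--         FROM fact
--         WHERE fact_time > 10 AND fact_time < 2000
--         GROUP BY did) AS f
--   WHERE f.did = dim.did
--   GROUP BY dim.proj_name;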
-- Clear tables
DROP TABLE dim, fact;
-- Test case group 6: partitioned tables and inheritance tables
SET enable_incremental_sort TO off;
DROP TABLE IF EXISTS pagg_pd;
NOTICE: table "pagg_pd" does not exist, skipping
CREATE TABLE pagg_pd (a int, b int, c text, d int) PARTITION BY LIST(c);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE pagg_pd_p1 PARTITION OF pagg_pd FOR VALUES IN ('0000', '0001', '0002', '0003', '0004');
NOTICE: table has parent, setting distribution columns to match parent table
CREATE TABLE pagg_pd_p2 PARTITION OF pagg_pd FOR VALUES IN ('0005', '0006', '0007', '0008');
NOTICE: table has parent, setting distribution columns to match parent table
CREATE TABLE pagg_pd_p3 PARTITION OF pagg_pd FOR VALUES IN ('0009', '0010', '0011');
NOTICE: table has parent, setting distribution columns to match parent table
INSERT INTO pagg_pd SELECT i % 20, i % 30, to_char(i % 12, 'FM0000'), i % 30 FROM generate_series(0, 2999) i;
ANALYZE pagg_pd;
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.c, sum(t1.a)
FROM pagg_pd t1 JOIN pagg_pd t2 ON t1.c < t2.c
GROUP BY t1.c
ORDER BY 1, 2;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.c, (sum(t1.a))
Merge Key: t1.c, (sum(t1.a))
-> Sort
Output: t1.c, (sum(t1.a))
Sort Key: t1.c, (sum(t1.a))
-> Finalize GroupAggregate
Output: t1.c, sum(t1.a)
Group Key: t1.c
-> Sort
Output: t1.c, (PARTIAL sum(t1.a))
Sort Key: t1.c
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: t1.c, (PARTIAL sum(t1.a))
Hash Key: t1.c
-> Nested Loop
Output: t1.c, (PARTIAL sum(t1.a))
Join Filter: (t1.c < t2.c)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: t1.c, (PARTIAL sum(t1.a))
-> Partial HashAggregate
Output: t1.c, PARTIAL sum(t1.a)
Group Key: t1.c
-> Append
-> Seq Scan on public.pagg_pd_p1 t1_1
Output: t1_1.c, t1_1.a
-> Seq Scan on public.pagg_pd_p2 t1_2
Output: t1_2.c, t1_2.a
-> Seq Scan on public.pagg_pd_p3 t1_3
Output: t1_3.c, t1_3.a
-> Materialize
Output: t2.c
-> Append
-> Seq Scan on public.pagg_pd_p1 t2_1
Output: t2_1.c
-> Seq Scan on public.pagg_pd_p2 t2_2
Output: t2_2.c
-> Seq Scan on public.pagg_pd_p3 t2_3
Output: t2_3.c
Settings: enable_hashjoin = 'on', enable_incremental_sort = 'off', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(41 rows)
SELECT t1.c, sum(t1.a)
FROM pagg_pd t1 JOIN pagg_pd t2 ON t1.c < t2.c
GROUP BY t1.c
ORDER BY 1, 2;
c | sum
------+---------
0000 | 5500000
0001 | 5625000
0002 | 5625000
0003 | 5500000
0004 | 3500000
0005 | 3375000
0006 | 3125000
0007 | 2750000
0008 | 1500000
0009 | 1125000
0010 | 625000
(11 rows)
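-- Illustrative sketch (hedged, not executed by this test): with the
-- non-equijoin "t1.c < t2.c" each partial group from t1 is combined once per
-- matching t2 row, preserving the duplication the plain join would produce;
-- the query above is logically equivalent to:
--   SELECT g.c, SUM(g.s)
--   FROM (SELECT c, SUM(a) AS s FROM pagg_pd GROUP BY c) AS g
--   JOIN pagg_pd t2 ON g.c < t2.c
--   GROUP BY g.c
--   ORDER BY 1, 2;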
DROP TABLE pagg_pd;
CREATE TABLE pagg_pd_p (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE pagg_pd (c text, d int) inherits (pagg_pd_p) PARTITION BY LIST(c);
ERROR: cannot create partitioned table as inheritance child
DROP TABLE IF EXISTS pagg_pd, pagg_pd_p;
NOTICE: table "pagg_pd" does not exist, skipping
CREATE TABLE pagg_pd_p (a int, b int, c text) PARTITION BY LIST(c);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE pagg_pd (d int) inherits (pagg_pd_p);
NOTICE: table has parent, setting distribution columns to match parent table
ERROR: cannot inherit from partitioned table "pagg_pd_p"
DROP TABLE IF EXISTS pagg_pd, pagg_pd_p;
NOTICE: table "pagg_pd" does not exist, skipping
CREATE TABLE pagg_pd_p (a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE TABLE pagg_pd (c text, d int) inherits (pagg_pd_p);
NOTICE: table has parent, setting distribution columns to match parent table
INSERT INTO pagg_pd SELECT i % 20, i % 30, to_char(i % 12, 'FM0000'), i % 30 FROM generate_series(0, 2999) i;
ANALYZE pagg_pd;
EXPLAIN (VERBOSE on, COSTS off)
SELECT t1.c, sum(t1.a)
FROM pagg_pd t1 JOIN pagg_pd t2 ON t1.c < t2.c
GROUP BY t1.c
ORDER BY 1, 2;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
Output: t1.c, (sum(t1.a))
Merge Key: t1.c, (sum(t1.a))
-> Sort
Output: t1.c, (sum(t1.a))
Sort Key: t1.c, (sum(t1.a))
-> Finalize GroupAggregate
Output: t1.c, sum(t1.a)
Group Key: t1.c
-> Sort
Output: t1.c, (PARTIAL sum(t1.a))
Sort Key: t1.c
-> Redistribute Motion 3:3 (slice2; segments: 3)
Output: t1.c, (PARTIAL sum(t1.a))
Hash Key: t1.c
-> Nested Loop
Output: t1.c, (PARTIAL sum(t1.a))
Join Filter: (t1.c < t2.c)
-> Broadcast Motion 3:3 (slice3; segments: 3)
Output: t1.c, (PARTIAL sum(t1.a))
-> Partial HashAggregate
Output: t1.c, PARTIAL sum(t1.a)
Group Key: t1.c
-> Seq Scan on public.pagg_pd t1
Output: t1.a, t1.b, t1.c, t1.d
-> Materialize
Output: t2.c
-> Seq Scan on public.pagg_pd t2
Output: t2.c
Settings: enable_hashjoin = 'on', enable_incremental_sort = 'off', enable_mergejoin = 'off', enable_nestloop = 'off', enable_seqscan = 'on', gp_enable_agg_pushdown = 'on', optimizer = 'off'
Optimizer: Postgres query optimizer
(31 rows)
SELECT t1.c, sum(t1.a)
FROM pagg_pd t1 JOIN pagg_pd t2 ON t1.c < t2.c
GROUP BY t1.c
ORDER BY 1, 2;
c | sum
------+---------
0000 | 5500000
0001 | 5625000
0002 | 5625000
0003 | 5500000
0004 | 3500000
0005 | 3375000
0006 | 3125000
0007 | 2750000
0008 | 1500000
0009 | 1125000
0010 | 625000
(11 rows)
DROP TABLE pagg_pd, pagg_pd_p;
RESET enable_incremental_sort;
-- Clear settings
SET optimizer TO default;
SET gp_enable_agg_pushdown TO off;
SET enable_seqscan TO on;
SET enable_nestloop TO on;
SET enable_hashjoin TO on;
SET enable_mergejoin TO on;