blob: fa15d208caf3cec5793bdfd0c48c688642afbe56 [file] [log] [blame]
create table cost_agg_t1(a int, b int, c int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into cost_agg_t1 select i, random() * 99999, i % 2000 from generate_series(1, 1000000) i;
create table cost_agg_t2 as select * from cost_agg_t1 with no data;
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
insert into cost_agg_t2 select i, random() * 99999, i % 300000 from generate_series(1, 1000000) i;
analyze cost_agg_t1;
analyze cost_agg_t2;
--
-- Test planner's decisions on aggregates when only little memory is available.
--
set statement_mem= '1800 kB';
-- There are only 2000 distinct values of 'c' in the table, which fits
-- comfortably in an in-memory hash table.
explain select avg(b) from cost_agg_t1 group by c;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..485.40 rows=2001 width=8)
-> Finalize HashAggregate (cost=0.00..485.34 rows=667 width=8)
Group Key: c
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..485.26 rows=667 width=12)
Hash Key: c
-> Streaming Partial HashAggregate (cost=0.00..485.23 rows=667 width=12)
Group Key: c
-> Seq Scan on cost_agg_t1 (cost=0.00..438.70 rows=333334 width=8)
Optimizer: Pivotal Optimizer (GPORCA)
(9 rows)
-- In the other table, there are 300000 distinct values of 'c', which doesn't
-- fit in statement_mem. The planner chooses to do a single-phase agg for this.
--
-- In the single-phase plan, the aggregation is performed after redistrbuting
-- the data, which means that each node only has to process 1/(# of segments)
-- fraction of the data. That fits in memory, whereas an initial stage before
-- redistributing would not. And it would eliminate only a few rows, anyway.
explain select avg(b) from cost_agg_t2 group by c;
QUERY PLAN
-----------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..501.88 rows=305691 width=8)
-> HashAggregate (cost=0.00..492.76 rows=101897 width=8)
Group Key: c
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..452.01 rows=333334 width=8)
Hash Key: c
-> Seq Scan on cost_agg_t2 (cost=0.00..438.70 rows=333334 width=8)
Optimizer: Pivotal Optimizer (GPORCA)
(7 rows)
-- But if there are a lot more duplicate values, the two-stage plan becomes
-- cheaper again, even though it doesn't git in memory and has to spill.
insert into cost_agg_t2 select i, random() * 99999,1 from generate_series(1, 200000) i;
analyze cost_agg_t2;
explain select avg(b) from cost_agg_t2 group by c;
QUERY PLAN
-----------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..504.85 rows=102902 width=8)
-> Finalize HashAggregate (cost=0.00..501.78 rows=34301 width=8)
Group Key: c
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..497.48 rows=34301 width=12)
Hash Key: c
-> Streaming Partial HashAggregate (cost=0.00..496.19 rows=34301 width=12)
Group Key: c
-> Seq Scan on cost_agg_t2 (cost=0.00..440.24 rows=400000 width=8)
Optimizer: Pivotal Optimizer (GPORCA)
(9 rows)
drop table cost_agg_t1;
drop table cost_agg_t2;
reset statement_mem;
-- The following case is to test GUC gp_eager_two_phase_agg for planner
-- When it is set true, planner will choose two stage agg.
create table t_planner_force_multi_stage(a int, b int) distributed randomly;
analyze t_planner_force_multi_stage;
show gp_eager_two_phase_agg;
gp_eager_two_phase_agg
------------------------
off
(1 row)
-- the GUC gp_eager_two_phase_agg is default false, the table contains no data
-- so one stage agg will win.
explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> GroupAggregate
Group Key: b
-> Sort
Sort Key: b
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: b
-> Seq Scan on t_planner_force_multi_stage
Optimizer: Pivotal Optimizer (GPORCA)
(9 rows)
set gp_eager_two_phase_agg = on;
-- when forcing two stage, it should generate two stage agg plan.
explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> GroupAggregate
Group Key: b
-> Sort
Sort Key: b
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: b
-> Seq Scan on t_planner_force_multi_stage
Optimizer: Pivotal Optimizer (GPORCA)
(9 rows)
reset gp_eager_two_phase_agg;
drop table t_planner_force_multi_stage;
-- test operatorMem
begin;
create table test_operator_mem (i int, j int) distributed by (i);
insert into test_operator_mem select i, i+1 from generate_series(1, 100)i;
analyze test_operator_mem;
set local statement_mem=1024;
set local gp_resqueue_print_operator_memory_limits=on;
explain(costs off)
select count(*) from test_operator_mem;
QUERY PLAN
-----------------------------------------------------------------------
Finalize Aggregate (operatorMem: 100kB)
-> Gather Motion 3:1 (slice1; segments: 3) (operatorMem: 100kB)
-> Partial Aggregate (operatorMem: 100kB)
-> Seq Scan on test_operator_mem (operatorMem: 100kB)
Optimizer: Pivotal Optimizer (GPORCA)
(5 rows)
abort;
-- Test user-defined aggregate marked safe to execute on replicated slices without motion
CREATE AGGREGATE my_unsafe_avg (float8)
(
sfunc = float8_accum,
stype = float8[],
finalfunc = float8_avg,
initcond = '{0,0,0}'
);
CREATE AGGREGATE my_safe_avg (float8)
(
sfunc = float8_accum,
stype = float8[],
finalfunc = float8_avg,
initcond = '{0,0,0}',
repsafe = true
);
CREATE TABLE a_reptable (a int) DISTRIBUTED REPLICATED;
CREATE TABLE b_reptable (b int) DISTRIBUTED REPLICATED;
EXPLAIN INSERT INTO a_reptable(a) SELECT my_unsafe_avg(b) FROM b_reptable;
QUERY PLAN
---------------------------------------------------------------------------------------------
Insert on a_reptable (cost=0.00..431.03 rows=1 width=4)
-> Result (cost=0.00..431.00 rows=3 width=8)
-> Broadcast Motion 1:3 (slice1; segments: 1) (cost=0.00..431.00 rows=3 width=4)
-> Aggregate (cost=0.00..431.00 rows=3 width=8)
-> Seq Scan on b_reptable (cost=0.00..431.00 rows=3 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(6 rows)
EXPLAIN INSERT INTO a_reptable(a) SELECT my_safe_avg(b) FROM b_reptable;
QUERY PLAN
------------------------------------------------------------------------
Insert on a_reptable (cost=0.00..431.03 rows=1 width=4)
-> Aggregate (cost=0.00..431.00 rows=3 width=8)
-> Seq Scan on b_reptable (cost=0.00..431.00 rows=3 width=4)
Optimizer: Pivotal Optimizer (GPORCA)
(4 rows)