src/test/regress/expected/gp_aggregates_costs_optimizer.out - cloudberry - Git at Google

 create table cost_agg_t1(a int, b int, c int);
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
 HINT:  The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
 insert into cost_agg_t1 select i, random() * 99999, i % 2000 from generate_series(1, 1000000) i;
 create table cost_agg_t2 as select * from cost_agg_t1 with no data;
 NOTICE:  Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
 insert into cost_agg_t2 select i, random() * 99999, i % 300000 from generate_series(1, 1000000) i;
 analyze cost_agg_t1;
 analyze cost_agg_t2;
 --
 -- Test planner's decisions on aggregates when only little memory is available.
 --
 set statement_mem= '1800 kB';
 -- There are only 2000 distinct values of 'c' in the table, which fits
 -- comfortably in an in-memory hash table.
 explain select avg(b) from cost_agg_t1 group by c;
                                             QUERY PLAN
 ---------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..485.40 rows=2001 width=8)
    ->  Finalize HashAggregate  (cost=0.00..485.34 rows=667 width=8)
          Group Key: c
          ->  Redistribute Motion 3:3  (slice2; segments: 3)  (cost=0.00..485.26 rows=667 width=12)
                Hash Key: c
                ->  Streaming Partial HashAggregate  (cost=0.00..485.23 rows=667 width=12)
                      Group Key: c
                      ->  Seq Scan on cost_agg_t1  (cost=0.00..438.70 rows=333334 width=8)
  Optimizer: Pivotal Optimizer (GPORCA)
 (9 rows)

 -- In the other table, there are 300000 distinct values of 'c', which doesn't
 -- fit in statement_mem. The planner chooses to do a single-phase agg for this.
 --
 -- In the single-phase plan, the aggregation is performed after redistrbuting
 -- the data, which means that each node only has to process 1/(# of segments)
 -- fraction of the data. That fits in memory, whereas an initial stage before
 -- redistributing would not. And it would eliminate only a few rows, anyway.
 explain select avg(b) from cost_agg_t2 group by c;
                                              QUERY PLAN
 -----------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..501.88 rows=305691 width=8)
    ->  HashAggregate  (cost=0.00..492.76 rows=101897 width=8)
          Group Key: c
          ->  Redistribute Motion 3:3  (slice2; segments: 3)  (cost=0.00..452.01 rows=333334 width=8)
                Hash Key: c
                ->  Seq Scan on cost_agg_t2  (cost=0.00..438.70 rows=333334 width=8)
  Optimizer: Pivotal Optimizer (GPORCA)
 (7 rows)

 -- But if there are a lot more duplicate values, the two-stage plan becomes
 -- cheaper again, even though it doesn't git in memory and has to spill.
 insert into cost_agg_t2 select i, random() * 99999,1 from generate_series(1, 200000) i;
 analyze cost_agg_t2;
 explain select avg(b) from cost_agg_t2 group by c;
                                              QUERY PLAN
 -----------------------------------------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)  (cost=0.00..504.85 rows=102902 width=8)
    ->  Finalize HashAggregate  (cost=0.00..501.78 rows=34301 width=8)
          Group Key: c
          ->  Redistribute Motion 3:3  (slice2; segments: 3)  (cost=0.00..497.48 rows=34301 width=12)
                Hash Key: c
                ->  Streaming Partial HashAggregate  (cost=0.00..496.19 rows=34301 width=12)
                      Group Key: c
                      ->  Seq Scan on cost_agg_t2  (cost=0.00..440.24 rows=400000 width=8)
  Optimizer: Pivotal Optimizer (GPORCA)
 (9 rows)

 drop table cost_agg_t1;
 drop table cost_agg_t2;
 reset statement_mem;
 -- The following case is to test GUC gp_eager_two_phase_agg for planner
 -- When it is set true, planner will choose two stage agg.
 create table t_planner_force_multi_stage(a int, b int) distributed randomly;
 analyze t_planner_force_multi_stage;
 show gp_eager_two_phase_agg;
  gp_eager_two_phase_agg
 ------------------------
  off
 (1 row)

 -- the GUC gp_eager_two_phase_agg is default false, the table contains no data
 -- so one stage agg will win.
 explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
                             QUERY PLAN
 ------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)
    ->  GroupAggregate
          Group Key: b
          ->  Sort
                Sort Key: b
                ->  Redistribute Motion 3:3  (slice2; segments: 3)
                      Hash Key: b
                      ->  Seq Scan on t_planner_force_multi_stage
  Optimizer: Pivotal Optimizer (GPORCA)
 (9 rows)

 set gp_eager_two_phase_agg = on;
 -- when forcing two stage, it should generate two stage agg plan.
 explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
                             QUERY PLAN
 ------------------------------------------------------------------
  Gather Motion 3:1  (slice1; segments: 3)
    ->  GroupAggregate
          Group Key: b
          ->  Sort
                Sort Key: b
                ->  Redistribute Motion 3:3  (slice2; segments: 3)
                      Hash Key: b
                      ->  Seq Scan on t_planner_force_multi_stage
  Optimizer: Pivotal Optimizer (GPORCA)
 (9 rows)

 reset gp_eager_two_phase_agg;
 drop table t_planner_force_multi_stage;
 -- test operatorMem
 begin;
 create table test_operator_mem (i int, j int) distributed by (i);
 insert into test_operator_mem select i, i+1 from generate_series(1, 100)i;
 analyze test_operator_mem;
 set local statement_mem=1024;
 set local gp_resqueue_print_operator_memory_limits=on;
 explain(costs off)
 select count(*) from test_operator_mem;
                               QUERY PLAN
 -----------------------------------------------------------------------
  Finalize Aggregate  (operatorMem: 100kB)
    ->  Gather Motion 3:1  (slice1; segments: 3)  (operatorMem: 100kB)
          ->  Partial Aggregate  (operatorMem: 100kB)
                ->  Seq Scan on test_operator_mem  (operatorMem: 100kB)
  Optimizer: Pivotal Optimizer (GPORCA)
 (5 rows)

 abort;
 -- Test user-defined aggregate marked safe to execute on replicated slices without motion
 CREATE AGGREGATE my_unsafe_avg (float8)
 (
     sfunc = float8_accum,
     stype = float8[],
     finalfunc = float8_avg,
     initcond = '{0,0,0}'
 );
 CREATE AGGREGATE my_safe_avg (float8)
 (
     sfunc = float8_accum,
     stype = float8[],
     finalfunc = float8_avg,
     initcond = '{0,0,0}',
     repsafe = true
 );
 CREATE TABLE a_reptable (a int) DISTRIBUTED REPLICATED;
 CREATE TABLE b_reptable (b int) DISTRIBUTED REPLICATED;
 EXPLAIN INSERT INTO a_reptable(a) SELECT my_unsafe_avg(b) FROM b_reptable;
                                          QUERY PLAN
 ---------------------------------------------------------------------------------------------
  Insert on a_reptable  (cost=0.00..431.03 rows=1 width=4)
    ->  Result  (cost=0.00..431.00 rows=3 width=8)
          ->  Broadcast Motion 1:3  (slice1; segments: 1)  (cost=0.00..431.00 rows=3 width=4)
                ->  Aggregate  (cost=0.00..431.00 rows=3 width=8)
                      ->  Seq Scan on b_reptable  (cost=0.00..431.00 rows=3 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (6 rows)

 EXPLAIN INSERT INTO a_reptable(a) SELECT my_safe_avg(b) FROM b_reptable;
                                QUERY PLAN
 ------------------------------------------------------------------------
  Insert on a_reptable  (cost=0.00..431.03 rows=1 width=4)
    ->  Aggregate  (cost=0.00..431.00 rows=3 width=8)
          ->  Seq Scan on b_reptable  (cost=0.00..431.00 rows=3 width=4)
  Optimizer: Pivotal Optimizer (GPORCA)
 (4 rows)
	create table cost_agg_t1(a int, b int, c int);
	NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
	HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
	insert into cost_agg_t1 select i, random() * 99999, i % 2000 from generate_series(1, 1000000) i;
	create table cost_agg_t2 as select * from cost_agg_t1 with no data;
	NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry.
	insert into cost_agg_t2 select i, random() * 99999, i % 300000 from generate_series(1, 1000000) i;
	analyze cost_agg_t1;
	analyze cost_agg_t2;
	--
	-- Test planner's decisions on aggregates when only little memory is available.
	--
	set statement_mem= '1800 kB';
	-- There are only 2000 distinct values of 'c' in the table, which fits
	-- comfortably in an in-memory hash table.
	explain select avg(b) from cost_agg_t1 group by c;
	QUERY PLAN
	---------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..485.40 rows=2001 width=8)
	-> Finalize HashAggregate (cost=0.00..485.34 rows=667 width=8)
	Group Key: c
	-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..485.26 rows=667 width=12)
	Hash Key: c
	-> Streaming Partial HashAggregate (cost=0.00..485.23 rows=667 width=12)
	Group Key: c
	-> Seq Scan on cost_agg_t1 (cost=0.00..438.70 rows=333334 width=8)
	Optimizer: Pivotal Optimizer (GPORCA)
	(9 rows)

	-- In the other table, there are 300000 distinct values of 'c', which doesn't
	-- fit in statement_mem. The planner chooses to do a single-phase agg for this.
	--
	-- In the single-phase plan, the aggregation is performed after redistrbuting
	-- the data, which means that each node only has to process 1/(# of segments)
	-- fraction of the data. That fits in memory, whereas an initial stage before
	-- redistributing would not. And it would eliminate only a few rows, anyway.
	explain select avg(b) from cost_agg_t2 group by c;
	QUERY PLAN
	-----------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..501.88 rows=305691 width=8)
	-> HashAggregate (cost=0.00..492.76 rows=101897 width=8)
	Group Key: c
	-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..452.01 rows=333334 width=8)
	Hash Key: c
	-> Seq Scan on cost_agg_t2 (cost=0.00..438.70 rows=333334 width=8)
	Optimizer: Pivotal Optimizer (GPORCA)
	(7 rows)

	-- But if there are a lot more duplicate values, the two-stage plan becomes
	-- cheaper again, even though it doesn't git in memory and has to spill.
	insert into cost_agg_t2 select i, random() * 99999,1 from generate_series(1, 200000) i;
	analyze cost_agg_t2;
	explain select avg(b) from cost_agg_t2 group by c;
	QUERY PLAN
	-----------------------------------------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..504.85 rows=102902 width=8)
	-> Finalize HashAggregate (cost=0.00..501.78 rows=34301 width=8)
	Group Key: c
	-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..497.48 rows=34301 width=12)
	Hash Key: c
	-> Streaming Partial HashAggregate (cost=0.00..496.19 rows=34301 width=12)
	Group Key: c
	-> Seq Scan on cost_agg_t2 (cost=0.00..440.24 rows=400000 width=8)
	Optimizer: Pivotal Optimizer (GPORCA)
	(9 rows)

	drop table cost_agg_t1;
	drop table cost_agg_t2;
	reset statement_mem;
	-- The following case is to test GUC gp_eager_two_phase_agg for planner
	-- When it is set true, planner will choose two stage agg.
	create table t_planner_force_multi_stage(a int, b int) distributed randomly;
	analyze t_planner_force_multi_stage;
	show gp_eager_two_phase_agg;
	gp_eager_two_phase_agg
	------------------------
	off
	(1 row)

	-- the GUC gp_eager_two_phase_agg is default false, the table contains no data
	-- so one stage agg will win.
	explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
	QUERY PLAN
	------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3)
	-> GroupAggregate
	Group Key: b
	-> Sort
	Sort Key: b
	-> Redistribute Motion 3:3 (slice2; segments: 3)
	Hash Key: b
	-> Seq Scan on t_planner_force_multi_stage
	Optimizer: Pivotal Optimizer (GPORCA)
	(9 rows)

	set gp_eager_two_phase_agg = on;
	-- when forcing two stage, it should generate two stage agg plan.
	explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
	QUERY PLAN
	------------------------------------------------------------------
	Gather Motion 3:1 (slice1; segments: 3)
	-> GroupAggregate
	Group Key: b
	-> Sort
	Sort Key: b
	-> Redistribute Motion 3:3 (slice2; segments: 3)
	Hash Key: b
	-> Seq Scan on t_planner_force_multi_stage
	Optimizer: Pivotal Optimizer (GPORCA)
	(9 rows)

	reset gp_eager_two_phase_agg;
	drop table t_planner_force_multi_stage;
	-- test operatorMem
	begin;
	create table test_operator_mem (i int, j int) distributed by (i);
	insert into test_operator_mem select i, i+1 from generate_series(1, 100)i;
	analyze test_operator_mem;
	set local statement_mem=1024;
	set local gp_resqueue_print_operator_memory_limits=on;
	explain(costs off)
	select count(*) from test_operator_mem;
	QUERY PLAN
	-----------------------------------------------------------------------
	Finalize Aggregate (operatorMem: 100kB)
	-> Gather Motion 3:1 (slice1; segments: 3) (operatorMem: 100kB)
	-> Partial Aggregate (operatorMem: 100kB)
	-> Seq Scan on test_operator_mem (operatorMem: 100kB)
	Optimizer: Pivotal Optimizer (GPORCA)
	(5 rows)

	abort;
	-- Test user-defined aggregate marked safe to execute on replicated slices without motion
	CREATE AGGREGATE my_unsafe_avg (float8)
	(
	sfunc = float8_accum,
	stype = float8[],
	finalfunc = float8_avg,
	initcond = '{0,0,0}'
	);
	CREATE AGGREGATE my_safe_avg (float8)
	(
	sfunc = float8_accum,
	stype = float8[],
	finalfunc = float8_avg,
	initcond = '{0,0,0}',
	repsafe = true
	);
	CREATE TABLE a_reptable (a int) DISTRIBUTED REPLICATED;
	CREATE TABLE b_reptable (b int) DISTRIBUTED REPLICATED;
	EXPLAIN INSERT INTO a_reptable(a) SELECT my_unsafe_avg(b) FROM b_reptable;
	QUERY PLAN
	---------------------------------------------------------------------------------------------
	Insert on a_reptable (cost=0.00..431.03 rows=1 width=4)
	-> Result (cost=0.00..431.00 rows=3 width=8)
	-> Broadcast Motion 1:3 (slice1; segments: 1) (cost=0.00..431.00 rows=3 width=4)
	-> Aggregate (cost=0.00..431.00 rows=3 width=8)
	-> Seq Scan on b_reptable (cost=0.00..431.00 rows=3 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(6 rows)

	EXPLAIN INSERT INTO a_reptable(a) SELECT my_safe_avg(b) FROM b_reptable;
	QUERY PLAN
	------------------------------------------------------------------------
	Insert on a_reptable (cost=0.00..431.03 rows=1 width=4)
	-> Aggregate (cost=0.00..431.00 rows=3 width=8)
	-> Seq Scan on b_reptable (cost=0.00..431.00 rows=3 width=4)
	Optimizer: Pivotal Optimizer (GPORCA)
	(4 rows)