src/test/regress/sql/gp_aggregates_costs.sql - cloudberry - Git at Google

 create table cost_agg_t1(a int, b int, c int);
 insert into cost_agg_t1 select i, random() * 99999, i % 2000 from generate_series(1, 1000000) i;
 create table cost_agg_t2 as select * from cost_agg_t1 with no data;
 insert into cost_agg_t2 select i, random() * 99999, i % 300000 from generate_series(1, 1000000) i;
 analyze cost_agg_t1;
 analyze cost_agg_t2;

 --
 -- Test planner's decisions on aggregates when only little memory is available.
 --
 set statement_mem= '1800 kB';

 -- There are only 2000 distinct values of 'c' in the table, which fits
 -- comfortably in an in-memory hash table.
 explain select avg(b) from cost_agg_t1 group by c;

 -- In the other table, there are 300000 distinct values of 'c', which doesn't
 -- fit in statement_mem. The planner chooses to do a single-phase agg for this.
 --
 -- In the single-phase plan, the aggregation is performed after redistrbuting
 -- the data, which means that each node only has to process 1/(# of segments)
 -- fraction of the data. That fits in memory, whereas an initial stage before
 -- redistributing would not. And it would eliminate only a few rows, anyway.
 explain select avg(b) from cost_agg_t2 group by c;

 -- But if there are a lot more duplicate values, the two-stage plan becomes
 -- cheaper again, even though it doesn't git in memory and has to spill.
 insert into cost_agg_t2 select i, random() * 99999,1 from generate_series(1, 200000) i;
 analyze cost_agg_t2;
 explain select avg(b) from cost_agg_t2 group by c;


 drop table cost_agg_t1;
 drop table cost_agg_t2;
 reset statement_mem;

 -- The following case is to test GUC gp_eager_two_phase_agg for planner
 -- When it is set true, planner will choose two stage agg.
 create table t_planner_force_multi_stage(a int, b int) distributed randomly;
 analyze t_planner_force_multi_stage;
 show gp_eager_two_phase_agg;
 -- the GUC gp_eager_two_phase_agg is default false, the table contains no data
 -- so one stage agg will win.
 explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
 set gp_eager_two_phase_agg = on;
 -- when forcing two stage, it should generate two stage agg plan.
 explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
 reset gp_eager_two_phase_agg;
 drop table t_planner_force_multi_stage;

 -- test operatorMem
 begin;
 create table test_operator_mem (i int, j int) distributed by (i);
 insert into test_operator_mem select i, i+1 from generate_series(1, 100)i;
 analyze test_operator_mem;
 set local statement_mem=1024;
 set local gp_resqueue_print_operator_memory_limits=on;
 explain(costs off)
 select count(*) from test_operator_mem;

 abort;

 -- Test user-defined aggregate marked safe to execute on replicated slices without motion
 CREATE AGGREGATE my_unsafe_avg (float8)
 (
     sfunc = float8_accum,
     stype = float8[],
     finalfunc = float8_avg,
     initcond = '{0,0,0}'
 );
 CREATE AGGREGATE my_safe_avg (float8)
 (
     sfunc = float8_accum,
     stype = float8[],
     finalfunc = float8_avg,
     initcond = '{0,0,0}',
     repsafe = true
 );
 CREATE TABLE a_reptable (a int) DISTRIBUTED REPLICATED;
 CREATE TABLE b_reptable (b int) DISTRIBUTED REPLICATED;
 EXPLAIN INSERT INTO a_reptable(a) SELECT my_unsafe_avg(b) FROM b_reptable;
 EXPLAIN INSERT INTO a_reptable(a) SELECT my_safe_avg(b) FROM b_reptable;
	create table cost_agg_t1(a int, b int, c int);
	insert into cost_agg_t1 select i, random() * 99999, i % 2000 from generate_series(1, 1000000) i;
	create table cost_agg_t2 as select * from cost_agg_t1 with no data;
	insert into cost_agg_t2 select i, random() * 99999, i % 300000 from generate_series(1, 1000000) i;
	analyze cost_agg_t1;
	analyze cost_agg_t2;

	--
	-- Test planner's decisions on aggregates when only little memory is available.
	--
	set statement_mem= '1800 kB';

	-- There are only 2000 distinct values of 'c' in the table, which fits
	-- comfortably in an in-memory hash table.
	explain select avg(b) from cost_agg_t1 group by c;

	-- In the other table, there are 300000 distinct values of 'c', which doesn't
	-- fit in statement_mem. The planner chooses to do a single-phase agg for this.
	--
	-- In the single-phase plan, the aggregation is performed after redistrbuting
	-- the data, which means that each node only has to process 1/(# of segments)
	-- fraction of the data. That fits in memory, whereas an initial stage before
	-- redistributing would not. And it would eliminate only a few rows, anyway.
	explain select avg(b) from cost_agg_t2 group by c;

	-- But if there are a lot more duplicate values, the two-stage plan becomes
	-- cheaper again, even though it doesn't git in memory and has to spill.
	insert into cost_agg_t2 select i, random() * 99999,1 from generate_series(1, 200000) i;
	analyze cost_agg_t2;
	explain select avg(b) from cost_agg_t2 group by c;


	drop table cost_agg_t1;
	drop table cost_agg_t2;
	reset statement_mem;

	-- The following case is to test GUC gp_eager_two_phase_agg for planner
	-- When it is set true, planner will choose two stage agg.
	create table t_planner_force_multi_stage(a int, b int) distributed randomly;
	analyze t_planner_force_multi_stage;
	show gp_eager_two_phase_agg;
	-- the GUC gp_eager_two_phase_agg is default false, the table contains no data
	-- so one stage agg will win.
	explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
	set gp_eager_two_phase_agg = on;
	-- when forcing two stage, it should generate two stage agg plan.
	explain (costs off) select b, sum(a) from t_planner_force_multi_stage group by b;
	reset gp_eager_two_phase_agg;
	drop table t_planner_force_multi_stage;

	-- test operatorMem
	begin;
	create table test_operator_mem (i int, j int) distributed by (i);
	insert into test_operator_mem select i, i+1 from generate_series(1, 100)i;
	analyze test_operator_mem;
	set local statement_mem=1024;
	set local gp_resqueue_print_operator_memory_limits=on;
	explain(costs off)
	select count(*) from test_operator_mem;

	abort;

	-- Test user-defined aggregate marked safe to execute on replicated slices without motion
	CREATE AGGREGATE my_unsafe_avg (float8)
	(
	sfunc = float8_accum,
	stype = float8[],
	finalfunc = float8_avg,
	initcond = '{0,0,0}'
	);
	CREATE AGGREGATE my_safe_avg (float8)
	(
	sfunc = float8_accum,
	stype = float8[],
	finalfunc = float8_avg,
	initcond = '{0,0,0}',
	repsafe = true
	);
	CREATE TABLE a_reptable (a int) DISTRIBUTED REPLICATED;
	CREATE TABLE b_reptable (b int) DISTRIBUTED REPLICATED;
	EXPLAIN INSERT INTO a_reptable(a) SELECT my_unsafe_avg(b) FROM b_reptable;
	EXPLAIN INSERT INTO a_reptable(a) SELECT my_safe_avg(b) FROM b_reptable;