blob: 8e1c80c60a4aa2543f23b6477f176444d9cc0561 [file] [log] [blame]
-- The last digits of some of the results vary from one invocation to another,
-- because the intermediate operations are done in arbitrary order when rows
-- are flowing from different segments in different order. Mask those
-- differences by setting 'extra_float_digits'. This isn't enough for all of
-- the queries, so a few also use TO_CHAR() to truncate the results further.
set extra_float_digits=-1;
SET optimizer_trace_fallback to on;
drop table if exists dqa_t1;
drop table if exists dqa_t2;
create table dqa_t1 (d int, i int, c char, dt date);
create table dqa_t2 (d int, i int, c char, dt date);
insert into dqa_t1 select i%23, i%12, (i%10) || '', '2009-06-10'::date + ( (i%34) || ' days')::interval
from generate_series(0, 99) i;
insert into dqa_t2 select i%34, i%45, (i%10) || '', '2009-06-10'::date + ( (i%56) || ' days')::interval
from generate_series(0, 99) i;
analyze dqa_t1;
analyze dqa_t2;
-- With the default very small cost, the planner often prefer to just Gather
-- all the rows to the QD. We want to test the more complicated multi-tage DQA
-- plans here, without using a huge number of rows.
-- GPDB_14_MERGE_FIXME: `DedupCost` seems to be computed wrongly caused by pg's
-- change of get_agg_clause_costs implementation.
set gp_motion_cost_per_row=2;
set enable_hashagg=on;
set enable_groupagg=off;
-- Also run EXPLAIN on each of these queries, to make sure you get an efficient plan,
-- and not e.g. a naive one that just pulls all the rows to the QD.
-- Distinct keys are distribution keys
select count(distinct d) from dqa_t1;
explain (costs off) select count(distinct d) from dqa_t1;
select count(distinct d) from dqa_t1 group by i;
explain (costs off) select count(distinct d) from dqa_t1 group by i;
select count(distinct d), sum(distinct d) from dqa_t1 group by i;
explain (costs off) select count(distinct d), sum(distinct d) from dqa_t1 group by i;
select count(distinct d), count(distinct dt) from dqa_t1;
explain (costs off) select count(distinct d), count(distinct dt) from dqa_t1;
select count(distinct d), count(distinct c), count(distinct dt) from dqa_t1;
explain (costs off) select count(distinct d), count(distinct c), count(distinct dt) from dqa_t1;
select count(distinct d), count(distinct dt) from dqa_t1 group by c;
explain (costs off) select count(distinct d), count(distinct dt) from dqa_t1 group by c;
select count(distinct d), count(distinct dt) from dqa_t1 group by d;
explain (costs off) select count(distinct d), count(distinct dt) from dqa_t1 group by d;
select count(distinct dqa_t1.d) from dqa_t1, dqa_t2 where dqa_t1.d = dqa_t2.d;
explain (costs off) select count(distinct dqa_t1.d) from dqa_t1, dqa_t2 where dqa_t1.d = dqa_t2.d;
select count(distinct dqa_t1.d) from dqa_t1, dqa_t2 where dqa_t1.d = dqa_t2.d group by dqa_t2.dt;
explain (costs off) select count(distinct dqa_t1.d) from dqa_t1, dqa_t2 where dqa_t1.d = dqa_t2.d group by dqa_t2.dt;
-- Distinct keys are not distribution keys
select count(distinct c) from dqa_t1;
explain (costs off) select count(distinct c) from dqa_t1;
select count(distinct c) from dqa_t1 group by dt;
explain (costs off) select count(distinct c) from dqa_t1 group by dt;
select count(distinct c) from dqa_t1 group by d;
explain (costs off) select count(distinct c) from dqa_t1 group by d;
select count(distinct i), sum(distinct i) from dqa_t1 group by c;
explain (costs off) select count(distinct i), sum(distinct i) from dqa_t1 group by c;
select count(distinct c), count(distinct dt) from dqa_t1;
explain (costs off) select count(distinct c), count(distinct dt) from dqa_t1;
select count(distinct c), count(distinct dt), i from dqa_t1 group by i;
explain (costs off) select count(distinct c), count(distinct dt), i from dqa_t1 group by i;
select count(distinct i), count(distinct c), d from dqa_t1 group by d;
explain (costs off) select count(distinct i), count(distinct c), d from dqa_t1 group by d;
select count(distinct dqa_t1.dt) from dqa_t1, dqa_t2 where dqa_t1.c = dqa_t2.c;
explain (costs off) select count(distinct dqa_t1.dt) from dqa_t1, dqa_t2 where dqa_t1.c = dqa_t2.c;
select count(distinct dqa_t1.dt) from dqa_t1, dqa_t2 where dqa_t1.c = dqa_t2.c group by dqa_t2.dt;
explain (costs off) select count(distinct dqa_t1.dt) from dqa_t1, dqa_t2 where dqa_t1.c = dqa_t2.c group by dqa_t2.dt;
-- multidqa with groupby and order by
select sum(distinct d), count(distinct i), count(distinct c),i,c from dqa_t1 group by i,c order by i,c;
explain (costs off) select sum(distinct d), count(distinct i), count(distinct c),i,c from dqa_t1 group by i,c order by i,c;
-- multi args singledqa
select corr(distinct d, i) from dqa_t1;
explain (costs off) select corr(distinct d, i) from dqa_t1;
-- multi args singledqa with group by
select corr(distinct d, i) from dqa_t1 group by d;
explain (costs off) select corr(distinct d, i) from dqa_t1 group by d;
select to_char(corr(distinct d, i), '9.99999999999999') from dqa_t1 group by c;
explain (costs off) select to_char(corr(distinct d, i), '9.99999999999999') from dqa_t1 group by c;
-- multi args multidqa
select count(distinct c), corr(distinct d, i) from dqa_t1;
explain (costs off) select count(distinct c), corr(distinct d, i) from dqa_t1;
select count(distinct d), corr(distinct d, i) from dqa_t1;
explain (costs off) select count(distinct d), corr(distinct d, i) from dqa_t1;
select count(distinct d), count(distinct i), corr(distinct d, i) from dqa_t1;
explain (costs off) select count(distinct d), count(distinct i), corr(distinct d, i) from dqa_t1;
select count(distinct c), count(distinct d), count(distinct i), corr(distinct d, i) from dqa_t1;
explain (costs off) select count(distinct c), count(distinct d), count(distinct i), corr(distinct d, i) from dqa_t1;
-- multi args multidqa with group by
select count(distinct c), corr(distinct d, i), d from dqa_t1 group by d;
explain (costs off) select count(distinct c), corr(distinct d, i), d from dqa_t1 group by d;
select count(distinct c), corr(distinct d, i), d, i from dqa_t1 group by d,i;
explain (costs off) select count(distinct c), corr(distinct d, i), d, i from dqa_t1 group by d,i;
select count(distinct c), corr(distinct d, i), dt from dqa_t1 group by dt;
explain (costs off) select count(distinct c), corr(distinct d, i), dt from dqa_t1 group by dt;
select count(distinct d), corr(distinct d, i), i from dqa_t1 group by i;
explain (costs off) select count(distinct d), corr(distinct d, i), i from dqa_t1 group by i;
select count(distinct d), corr(distinct d, i), d from dqa_t1 group by d;
explain (costs off) select count(distinct d), corr(distinct d, i), d from dqa_t1 group by d;
select count(distinct d), to_char(corr(distinct d, i), '9.99999999999999'), c from dqa_t1 group by c;
explain (costs off) select count(distinct d), to_char(corr(distinct d, i), '9.99999999999999'), c from dqa_t1 group by c;
-- MPP-19037
drop table if exists fact_route_aggregation;
drop table if exists dim_devices;
CREATE TABLE fact_route_aggregation
(
device_id integer,
is_route integer ,
is_pedestrian integer,
user_id integer,
pedestrian_route_length_in_meters integer,
in_car_route_length_in_meters integer
) DISTRIBUTED BY (device_id);
insert into fact_route_aggregation select generate_series(1,700),generate_series(200,300),generate_series(300,400), generate_series(400,500),generate_series(500,600),generate_series(600,700);
CREATE TABLE dim_devices
(
device_id integer,
platform integer
) DISTRIBUTED BY (device_id);
-- Repro query from the JIRA
select distinct
count(distinct case when T218094.is_route >= 1 or T218094.is_pedestrian >= 1 then T218094.user_id else NULL end ) as c1,
sum(cast(T218094.is_route + T218094.is_pedestrian as DOUBLE PRECISION )) as c2,
sum(cast(T218094.is_pedestrian as DOUBLE PRECISION )) as c3,
count(distinct case when T218094.is_pedestrian >= 1 then T218094.user_id else NULL end ) as c4,
sum(T218094.pedestrian_route_length_in_meters / 1000.0) as c5,
sum(T218094.in_car_route_length_in_meters / 1000.0) as c6,
sum(cast(T218094.is_route as DOUBLE PRECISION )) as c7,
count(distinct case when T218094.is_route >= 1 then T218094.user_id else NULL end ) as c8,
T43883.platform as c9
from
dim_devices T43883,
fact_route_aggregation T218094
where ( T43883.device_id = T218094.device_id )
group by T43883.platform;
-- cleanup
drop table fact_route_aggreagation;
drop table dim_devices;
-- other test queries for mpp-19037
drop table if exists t1_mdqa;
drop table if exists t2_mdqa;
create table t1_mdqa(a int, b int, c varchar);
create table t2_mdqa(a int, b int, c varchar);
insert into t1_mdqa select i % 5 , i % 10, i || 'value' from generate_series(1, 20) i;
insert into t1_mdqa select i % 5 , i % 10, i || 'value' from generate_series(1, 20) i;
insert into t2_mdqa select i % 10 , i % 5, i || 'value' from generate_series(1, 20) i;
insert into t2_mdqa select i % 10 , i % 5, i || 'value' from generate_series(1, 20) i;
-- simple mdqa
select count(distinct t1.a), count(distinct t2.b), t1.c, t2.c from t1_mdqa t1, t2_mdqa t2 where t1.c = t2.c group by t1.c, t2.c order by t1.c;
-- distinct on top of some mdqas
select distinct sum(distinct t1.a), avg(t2.a), sum(distinct t2.b), t1.a, t2.b from t1_mdqa t1, t2_mdqa t2 where t1.a = t2.a group by t1.a, t2.b order by t1.a;
select distinct sum (distinct t1.a), avg(distinct t2.a), sum(distinct t2.b), t1.c from t1_mdqa t1, t2_mdqa t2 where t1.a = t2.a group by t1.c order by t1.c;
-- distinct on group by fields
select distinct t1.c , sum(distinct t1.a), count(t2.b), sum(distinct t2.b) from t1_mdqa t1, t2_mdqa t2 where t1.a = t2.a group by t1.c order by t1.c;
-- distinct on normal aggregates
select distinct sum(t1.a), avg(distinct t2.a), sum(distinct (t1.a + t2.a)), t1.a, t2.b from t1_mdqa t1, t2_mdqa t2 where t1.a = t2.a group by t1.a, t2.b order by t1.a;
select distinct avg(t1.a + t2.b), count(distinct t1.c), count(distinct char_length(t1.c)), t1.a, t2.b from t1_mdqa t1, t2_mdqa t2 where t1.a = t2.a group by t1.a, t2.b order by t1.a;
-- cleanup
drop table t1_mdqa;
drop table t2_mdqa;
-- other queries from MPP-19037
drop table if exists gp_dqa_r;
drop table if exists gp_dqa_s;
create table gp_dqa_r (a int, b int, c int);
create table gp_dqa_s (d int, e int, f int);
insert into gp_dqa_r select i , i %10, i%5 from generate_series(1,20) i;
insert into gp_dqa_s select i, i %15, i%10 from generate_series(1,30) i;
select a, d, count(distinct b) as c1, count(distinct c) as c2 from gp_dqa_r, gp_dqa_s where ( e = a ) group by d, a order by a,d;
select distinct
count(distinct case when b >= 1 or c >= 1 then b else NULL end ) as c1,
sum(cast(b + c as DOUBLE PRECISION )) as c2,
sum(cast(c as DOUBLE PRECISION )) as c3,
count(distinct case when b >= 1 then b else NULL end ) as c2,
d as c9
from gp_dqa_r, gp_dqa_s
where ( e = a )
group by d order by c9;
select distinct
count(distinct case when b >= 1 or c >= 1 then b else NULL end ) as c1,
count(distinct case when b >= 1 then b else NULL end ) as c2,
d as c9
from gp_dqa_r, gp_dqa_s
where ( e = a )
group by d order by c9;
select distinct count(distinct b) as c1, count(distinct c) as c2, d as c9
from gp_dqa_r, gp_dqa_s
where ( e = a )
group by d order by c9;
select distinct d, count(distinct b) as c1, count(distinct c) as c2, d as c9 from gp_dqa_r, gp_dqa_s group by d order by c9;
select distinct d, count(distinct b) as c1, count(distinct c) as c2, d as c9 from gp_dqa_r, gp_dqa_s group by d, a order by c9;
select distinct count(distinct b) as c1, count(distinct c) as c2 from gp_dqa_r, gp_dqa_s;
select distinct count(distinct b) as c1, count(distinct c) as c2 from gp_dqa_r;
select distinct count(distinct b) as c1, count(distinct c) as c2, d, a from gp_dqa_r, gp_dqa_s where ( e = a)group by d, a order by a,d;
select distinct count(distinct b) as c1, count(distinct c) as c2, d from gp_dqa_r, gp_dqa_s group by d, a order by d,a;
select distinct count(distinct b) as c1, count(distinct c) as c2, d from gp_dqa_r, gp_dqa_s group by d, a order by d;
select distinct count(distinct b) as c1, count(distinct c) as c2, d from gp_dqa_r, gp_dqa_s group by d order by d;
-- cleanup
drop table gp_dqa_r;
drop table gp_dqa_s;
-- setup
drop table if exists gp_dqa_t1;
drop table if exists gp_dqa_t2;
create table gp_dqa_t1 (a int, b int) distributed by (a);
create table gp_dqa_t2 (a int, c int) distributed by (a);
insert into gp_dqa_t1 select i , i %5 from generate_series(1,10) i;
insert into gp_dqa_t2 select i , i %4 from generate_series(1,10) i;
select distinct A.a, sum(distinct A.b), count(distinct B.c) from gp_dqa_t1 A left join gp_dqa_t2 B on (A.a = B.a) group by A.a order by A.a;
select distinct A.a, sum(distinct A.b), count(distinct B.c) from gp_dqa_t1 A right join gp_dqa_t2 B on (A.a = B.a) group by A.a order by A.a;
-- Most of the above test queries got planned as hash aggregates. Repeat
-- a few of them as group aggregates
set enable_hashagg=off;
set enable_groupagg=on;
select count(distinct d) from dqa_t1 group by i;
explain (costs off) select count(distinct d) from dqa_t1 group by i;
select count(distinct d), count(distinct c), count(distinct dt) from dqa_t1;
select count(distinct c), count(distinct dt), i from dqa_t1 group by i;
-- cleanup
drop table gp_dqa_t1;
drop table gp_dqa_t2;
create table foo_mdqa(x int, y int);
SELECT distinct C.z, count(distinct FS.x), count(distinct FS.y) FROM (SELECT 1 AS z FROM generate_series(1,10)) C, foo_mdqa FS GROUP BY z;
SELECT distinct C.z, count(distinct FS.x), count(distinct FS.y) FROM (SELECT i AS z FROM generate_series(1,10) i) C, foo_mdqa FS GROUP BY z;
drop table foo_mdqa;
-- non-strict agg test
-- Like COUNT(col), but also counts NULLs
create or replace function countall_trans(c int, newval int) returns int as $$
SELECT $1 + 1;
$$ language sql;
create aggregate countall(sfunc = countall_trans, basetype = int, stype = int, initcond = 0, combinefunc = int4pl);
-- Test table
create table nonullstab (a int, b int);
insert into nonullstab select 1, 1 from generate_series(1, 100);
-- This returns wrong result. countall(distinct a) should return 1.
select countall(distinct a), count(distinct b) from nonullstab;
-- multi DQA with filter test
set enable_hashagg=on;
set enable_groupagg=off;
create table dqa_f1(a int, b int, c int) distributed by (a);
create table dqa_f2(x int, y int, z int) distributed by (x);
insert into dqa_f1 select i%17, i%5 , i%3 from generate_series(1,1000) i;
insert into dqa_f2 select i % 13, i % 5 , i % 11 from generate_series(1,1000) i;
select sum(distinct a) filter (where a > 0), sum(distinct b) filter (where a > 0) from dqa_f1;
select sum(distinct a) filter (where a > 0), sum(distinct b) filter (where a > 0) from dqa_f1 group by b;
select sum(distinct a) filter (where a > 0), sum(distinct b) filter (where a > 0) from dqa_f1 group by c;
select sum(distinct a) filter (where a in (select x from dqa_f2 where x = a)), sum(distinct b) filter (where a > 0) from dqa_f1;
select sum(distinct a) filter (where a in (select x from dqa_f2 where x = a)), sum(distinct b) filter (where a > 0) from dqa_f1 group by c;
select count(distinct a) filter (where a > 3),count( distinct b) filter (where a > 4), sum(distinct b) filter( where a > 4) from dqa_f1;
-- fix hang of multi-dqa with filter (https://github.com/greenplum-db/gpdb/issues/14728)
select count(distinct a) filter (where a > 3), count(distinct b) from dqa_f1;
explain select sum(distinct a) filter (where a > 0), sum(distinct b) filter (where a > 0) from dqa_f1;
explain select sum(distinct a) filter (where a > 0), sum(distinct b) filter (where a > 0) from dqa_f1 group by b;
explain select sum(distinct a) filter (where a > 0), sum(distinct b) filter (where a > 0) from dqa_f1 group by c;
explain select sum(distinct a) filter (where a in (select x from dqa_f2 where x = a)), sum(distinct b) filter (where a > 0) from dqa_f1;
explain select sum(distinct a) filter (where a in (select x from dqa_f2 where x = a)), sum(distinct b) filter (where a > 0) from dqa_f1 group by c;
explain select count(distinct a) filter (where a > 3),count( distinct b) filter (where a > 4), sum(distinct b) filter( where a > 4) from dqa_f1;
-- single DQA with agg
-- the following SQL should use two stage agg
explain select count(distinct a), sum(b), sum(c) from dqa_f1;
select count(distinct a), sum(b), sum(c) from dqa_f1;
-- multi DQA with primary key
create table dqa_unique(a int, b int, c int, d int, primary key(a, b));
insert into dqa_unique select i%3, i%5, i%7, i%9 from generate_series(1, 10) i;
explain(verbose on, costs off) select count(distinct a), count(distinct d), c from dqa_unique group by a, b;
select count(distinct a), count(distinct d), c from dqa_unique group by a, b;
-- multi DQA with type conversions
create table dqa_f3(a character varying, b bigint) distributed by (a);
insert into dqa_f3 values ('123', 2), ('213', 0), ('231', 2), ('312', 0), ('321', 2), ('132', 1), ('4', 0);
-- Case 1: When converting the type of column 'a' from 'VARCHAR' to 'TEXT' in DQA expression, instead of generating a new column '(a)::text'
-- by TupleSplit, we can reference the column 'a' as part of hash-key in Redistribute-Motion directly, since the conversion is binary-compatible.
-- -> Redistribute Motion 3:3 (slice2; segments: 3)
-- Output: b, a, ((b)::text), (AggExprId)
-- Hash Key: ((b)::text), a, (AggExprId)
-- ...
-- -> TupleSplit
-- Output: b, a, ((b)::text), AggExprId
-- Split by Col: (((dqa_f3.b)::text)), (dqa_f3.a)
-- -> Seq Scan on public.dqa_f3
-- Output: b, a, (b)::text
select count(distinct (b)::text) as b, count(distinct (a)::text) as a from dqa_f3;
explain (verbose, costs off) select count(distinct (b)::text) as b, count(distinct (a)::text) as a from dqa_f3;
-- Case 2: Same as the above one, but convert the type of column 'a' to 'varchar' via binary-compatible types.
select count(distinct (b)::text) as b, count(distinct (a)::text::varchar) as a from dqa_f3;
explain (verbose, costs off) select count(distinct (b)::text) as b, count(distinct (a)::text::varchar) as a from dqa_f3;
-- Case 3: When converting the type of column 'a' from 'varchar' to 'int' in DQA expression, TupleSplit should generate an additional
-- column '(a)::integer' as part of hash-key in Redistribute-Motion, since the conversion is not binary-compatible.
-- -> Redistribute Motion 3:3 (slice2; segments: 3)
-- Output: b, a, ((b)::text), ((a)::integer), (AggExprId)
-- Hash Key: ((b)::text), ((a)::integer), (AggExprId)
-- ...
-- -> TupleSplit
-- Output: b, a, ((b)::text), ((a)::integer), AggExprId
-- Split by Col: (((dqa_f3.b)::text)), (((dqa_f3.a)::integer))
-- -> Seq Scan on public.dqa_f3
-- Output: b, a, (b)::text, (a)::integer
select count(distinct (b)::text) as b, count(distinct (a)::int) as a from dqa_f3;
explain (verbose, costs off) select count(distinct (b)::text) as b, count(distinct (a)::int) as a from dqa_f3;
-- Case 4: When converting the type of column 'a' from 'varchar' to 'int' to 'varchar', TupleSplit should generate an additional
-- column '(a)::integer::varchar' as part of hash-key in Redistribute-Motion.
select count(distinct (b)::text) as b, count(distinct (a)::int::varchar) as a from dqa_f3;
explain (verbose, costs off) select count(distinct (b)::text) as b, count(distinct (a)::int::varchar) as a from dqa_f3;
drop table dqa_f3;
-- Test 3-phase agg for DISTINCT on distribution keys
-- or DISTINCT when GROUP BY on distribution keys
create table t_issue_659(a int, b int) using ao_row;
insert into t_issue_659 select i from generate_series(1, 1000) i;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
insert into t_issue_659 select * from t_issue_659;
analyze t_issue_659;
explain(costs off)
select count(distinct a) from t_issue_659;
select count(distinct a) from t_issue_659;
set gp_eager_distinct_dedup = on;
-- for ORCA
set optimizer_force_three_stage_scalar_dqa to on;
set optimizer_force_multistage_agg to on;
set optimizer_enable_use_distribution_in_dqa to on;
explain(costs off)
select count(distinct a) from t_issue_659;
select count(distinct a) from t_issue_659;
reset gp_eager_distinct_dedup;
reset optimizer_force_three_stage_scalar_dqa;
reset optimizer_enable_use_distribution_in_dqa;
drop table t_issue_659;
-- fix dqa bug when optimizer_force_multistage_agg is on
set optimizer_force_multistage_agg = on;
create table multiagg1(a int, b bigint, c int);
create table multiagg2(a int, b bigint, c numeric(8, 4));
insert into multiagg1 values(generate_series(1, 10), generate_series(1, 10), generate_series(1, 10));
insert into multiagg2 values(generate_series(1, 10), generate_series(1, 10), 555.55);
analyze multiagg1;
analyze multiagg2;
explain (verbose, costs off) select count(distinct b), sum(c) from multiagg1;
select count(distinct b), sum(c) from multiagg1;
explain (verbose, costs off) select count(distinct b), sum(c) from multiagg2;
select count(distinct b), sum(c) from multiagg2;
drop table multiagg1;
drop table multiagg2;
-- Support Multi-stage DQA with ride along aggregation in ORCA
-- Historically, Agg aggsplit is identically equal to Aggref aggsplit
-- In ORCA's attempt to support intermediate aggregation
-- The two are allowed to differ
-- Now Agg aggsplit is derived as bitwise OR of its children Aggref aggsplit
-- The plan is to eventually make Agg aggsplit a dummy
-- And use Aggref aggsplit to build trans/combine functions
set optimizer_force_multistage_agg=on;
create table num_table(id int, a bigint, b int, c numeric);
insert into num_table values(1,1,1,1),(2,2,2,2),(3,3,3,3);
-- count(distinct a) is a simple aggregation
-- sum(b) is a split aggregation
-- Before the fix, in the final aggregation of sum(b)
-- the executor mistakenly built a trans func instead of a combine func
-- The trans func building process errored out due to mismatch between
-- the input type (int) and trans type (bigint), and caused missing plan
explain select count(distinct a), sum(b) from num_table;
select count(distinct a), sum(b) from num_table;
explain select count(distinct a), sum(b) from num_table group by id;
select count(distinct a), sum(b) from num_table group by id;
-- count(distinct a) is a simple aggregation
-- sum(c) is a split aggregation
-- Before the fix, the final aggregation of sum(c) was mistakenly
-- treated as simple aggregation, and led to the missing
-- deserialization step in the aggregation execution prep
-- Numeric aggregation serializes partial aggregation states
-- The executor then evaluated the aggregation state without deserializing it first
-- This led to the creation of garbage NaN count, and caused NaN output
explain select count(distinct a), sum(c) from num_table;
select count(distinct a), sum(c) from num_table;
explain select id, count(distinct a), avg(b), sum(c) from num_table group by grouping sets ((id,c));
select id, count(distinct a), avg(b), sum(c) from num_table group by grouping sets ((id,c));
explain (verbose on, costs off) select count(distinct b) from num_table group by c;
select count(distinct b) from num_table group by c;
reset optimizer_force_multistage_agg;
-- DQA with Agg(Intermediate Agg)
-- In PG optimizer Intermediate Agg have not support yet
-- Current test cases will be changed after commit 971fa82(Support intermediate Agg in planner (#13959))
set enable_hashagg=on;
set enable_groupagg=off;
create table dqa_f3(a int, b int, c int, d int, e int ) distributed by (a);
insert into dqa_f3 select i % 17, i % 5 , i % 3, i %10, i % 7 from generate_series(1,1000) i;
analyze dqa_f3;
/*
* Test distinct or group by column is distributed key
*
* 1. If the input's locus matches the DISTINCT, but not GROUP BY:
*
* HashAggregate
* -> Redistribute (according to GROUP BY)
* -> HashAggregate (to eliminate duplicates)
* -> input (hashed by GROUP BY + DISTINCT)
*
* 2. If the input's locus matches the GROUP BY(don't care DISTINCT any more):
*
* HashAggregate (to aggregate)
* -> HashAggregate (to eliminate duplicates)
* -> input (hashed by GROUP BY)
*
*/
explain (verbose on, costs off)select sum(Distinct a), count(b), sum(c) from dqa_f3 group by e;
select sum(Distinct a), count(b), sum(c) from dqa_f3 group by e;
explain (verbose on, costs off) select sum(Distinct e), count(b), sum(c) from dqa_f3 group by a;
select sum(Distinct e), count(b), sum(c) from dqa_f3 group by a;
/*
* Test both distinct and group by column are not distributed key
*
* HashAgg (to aggregate)
* -> HashAgg (to eliminate duplicates)
* -> Redistribute (according to GROUP BY)
* -> Streaming HashAgg (to eliminate duplicates)
* -> input
*
*/
explain (verbose on, costs off) select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b;
select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b;
explain (verbose on, costs off) select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b order by b;
select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b order by b;
explain (verbose on, costs off) select distinct sum(Distinct c), count(a), sum(d) from dqa_f3 group by b;
select distinct sum(Distinct c), count(a), sum(d) from dqa_f3 group by b;
explain (verbose on, costs off) select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b having avg(e) > 3;
select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b having avg(e) > 3;
-- flaky tests
-- explain (verbose on, costs off)
-- select sum(Distinct sub.c), count(a), sum(d)
-- from dqa_f3 left join(select x, coalesce(y, 5) as c from dqa_f2) as sub
-- on sub.x = dqa_f3.e group by b;
-- select sum(Distinct sub.c), count(a), sum(d)
-- from dqa_f3 left join(select x, coalesce(y, 5) as c from dqa_f2) as sub
-- on sub.x = dqa_f3.e group by b;
-- Test gp_enable_agg_distinct_pruning is off on this branch
set gp_enable_agg_distinct_pruning = off;
explain (verbose on, costs off) select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b;
select sum(Distinct c), count(a), sum(d) from dqa_f3 group by b;
reset gp_enable_agg_distinct_pruning;
/*
* Test multistage through Gather Motion(grouplocus cannot hashed or not exist)
*
* Finalize Aggregate
* -> Gather Motion
* -> Partial Aggregate
* -> HashAggregate, to remove duplicates
* -> Redistribute Motion (according to DISTINCT arg)
* -> Streaming HashAgg (to eliminate duplicates)
* -> input
*/
explain (verbose on, costs off) select sum(Distinct b), count(c), sum(a) from dqa_f3;
select sum(Distinct b), count(c), sum(a) from dqa_f3;
explain (verbose on, costs off) select distinct sum(Distinct b), count(c), sum(a) from dqa_f3;
select distinct sum(Distinct b), count(c), sum(a) from dqa_f3;
explain (verbose on, costs off) select sum(Distinct b), count(c) filter(where c > 1), sum(a) from dqa_f3;
select sum(Distinct b), count(c) filter(where c > 1), sum(a) from dqa_f3;
drop table dqa_f3;
-- Test some corner case of dqa ex.NULL
create table dqa_f4(a int, b int, c int);
insert into dqa_f4 values(null, null, null);
insert into dqa_f4 values(1, 1, 1);
insert into dqa_f4 values(2, 2, 2);
select count(distinct a), count(distinct b) from dqa_f4 group by c;
set optimizer_enable_multiple_distinct_aggs=on;
explain (verbose on, costs off) select count(distinct a), count(distinct b) from dqa_f4 group by c;
select count(distinct a), count(distinct b) from dqa_f4 group by c;
reset optimizer_enable_multiple_distinct_aggs;
drop table dqa_f4;