--
-- Test all the different plan shapes that the planner can generate for
-- DISTINCT queries
--
create table distinct_test (a int, b int, c int) distributed by (a);
insert into distinct_test select g / 1000, g / 2000, g from generate_series(1, 10000) g;
analyze distinct_test;
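-- (Illustrative aside, not part of the test: the per-segment placement of
-- rows could be inspected with the gp_segment_id system column, e.g.
--   select gp_segment_id, count(*) from distinct_test group by 1 order by 1;
-- the exact counts depend on how column "a" hashes across the 3 segments.)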
--
-- With the default cost settings, you get hashed plans
--
-- If the DISTINCT is a superset of the table's distribution keys, the
-- duplicates can be eliminated independently in the segments.
explain select distinct a, b from distinct_test;
QUERY PLAN
-------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=55.00..56.10 rows=66 width=8)
-> HashAggregate (cost=55.00..55.22 rows=22 width=8)
Group Key: a, b
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=8)
Optimizer: Postgres query optimizer
(5 rows)
select distinct a, b from distinct_test;
a | b
----+---
1 | 0
0 | 0
6 | 3
9 | 4
10 | 5
5 | 2
3 | 1
4 | 2
2 | 1
7 | 3
8 | 4
(11 rows)
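-- (The segment-local HashAggregate above is correct because the table is
-- distributed by "a": rows sharing an (a, b) value all hash to the same
-- segment, so no duplicate can span segments. A hypothetical way to confirm
-- this, output omitted:
--   select a, b, count(distinct gp_segment_id) as segs
--   from distinct_test group by a, b;
-- every "segs" value would be 1.)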
-- Otherwise, redistribution is needed
explain select distinct b from distinct_test;
QUERY PLAN
------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=46.86..46.96 rows=6 width=4)
-> HashAggregate (cost=46.86..46.88 rows=2 width=4)
Group Key: b
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=46.67..46.85 rows=6 width=4)
Hash Key: b
-> HashAggregate (cost=46.67..46.73 rows=6 width=4)
Group Key: b
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=4)
Optimizer: Postgres query optimizer
(9 rows)
select distinct b from distinct_test;
b
---
5
2
4
3
1
0
(6 rows)
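-- (Note the two HashAggregate nodes in the plan above: the lower one
-- pre-deduplicates within each segment before the Redistribute Motion, so
-- fewer rows cross the interconnect; the upper one then removes duplicates
-- that arrive from different segments.)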
-- The two-stage aggregation can be disabled with the gp_enable_preunique GUC
set gp_enable_preunique = off;
explain select distinct b from distinct_test;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=113.33..113.43 rows=6 width=4)
-> HashAggregate (cost=113.33..113.35 rows=2 width=4)
Group Key: b
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..105.00 rows=3333 width=4)
Hash Key: b
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=4)
Optimizer: Postgres query optimizer
(7 rows)
reset gp_enable_preunique;
-- If the input is already highly unique, the pre-Unique step is not worthwhile.
-- (Only print count(*) of the result, because it returns so many rows.)
explain select distinct c from distinct_test;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=113.33..280.00 rows=10000 width=4)
-> HashAggregate (cost=113.33..146.67 rows=3333 width=4)
Group Key: c
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..105.00 rows=3333 width=4)
Hash Key: c
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=4)
Optimizer: Postgres query optimizer
(7 rows)
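-- (The "offset 0" in the subquery below acts as an optimization fence: it
-- keeps the planner from flattening the subquery, so the DISTINCT plan shown
-- above is what actually feeds the count.)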
select count(*) from (
select distinct c from distinct_test
offset 0) as x;
count
-------
10000
(1 row)
--
-- Repeat the same tests with sorted Unique plans
--
set enable_hashagg=off;
set optimizer_enable_hashagg=off;
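-- (enable_hashagg covers the Postgres planner; optimizer_enable_hashagg is
-- the corresponding GUC for the ORCA optimizer, so sort-based plans are
-- forced regardless of which optimizer is active.)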
-- If the DISTINCT is a superset of the table's distribution keys, the
-- duplicates can be eliminated independently in the segments.
explain select distinct a, b from distinct_test;
QUERY PLAN
-----------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=233.38..259.26 rows=66 width=8)
Merge Key: a, b
-> Unique (cost=233.38..258.38 rows=22 width=8)
Group Key: a, b
-> Sort (cost=233.38..241.71 rows=3333 width=8)
Sort Key: a, b
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=8)
Optimizer: Postgres query optimizer
(8 rows)
select distinct a, b from distinct_test;
a | b
----+---
0 | 0
1 | 0
2 | 1
3 | 1
4 | 2
5 | 2
6 | 3
7 | 3
8 | 4
9 | 4
10 | 5
(11 rows)
-- Otherwise, redistribution is needed
explain select distinct b from distinct_test;
QUERY PLAN
-----------------------------------------------------------------------------------------
GroupAggregate (cost=233.38..250.45 rows=6 width=4)
Group Key: b
-> Gather Motion 3:1 (slice1; segments: 3) (cost=233.38..250.35 rows=18 width=4)
Merge Key: b
-> GroupAggregate (cost=233.38..250.11 rows=6 width=4)
Group Key: b
-> Sort (cost=233.38..241.71 rows=3333 width=4)
Sort Key: b
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=4)
Optimizer: Postgres query optimizer
(10 rows)
select distinct b from distinct_test;
b
---
0
1
2
3
4
5
(6 rows)
-- If the input is already highly unique, the pre-Unique step is not worthwhile.
-- (Only print count(*) of the result, because it returns so many rows.)
explain select distinct c from distinct_test;
QUERY PLAN
---------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=300.05..450.05 rows=10000 width=4)
Merge Key: c
-> Unique (cost=300.05..316.71 rows=3333 width=4)
Group Key: c
-> Sort (cost=300.05..308.38 rows=3333 width=4)
Sort Key: c
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..105.00 rows=3333 width=4)
Hash Key: c
-> Seq Scan on distinct_test (cost=0.00..38.33 rows=3333 width=4)
Optimizer: Postgres query optimizer
(10 rows)
select count(*) from (
select distinct c from distinct_test
offset 0) as x;
count
-------
10000
(1 row)
--
-- Also test paths where the explicit Sort is not needed
--
create index on distinct_test (a, b);
create index on distinct_test (b);
create index on distinct_test (c);
set random_page_cost=1;
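-- (Setting random_page_cost to 1 makes index access look as cheap as
-- sequential access, nudging the planner toward Index Only Scans, whose
-- already-sorted output removes the need for an explicit Sort node.)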
-- If the DISTINCT is a superset of the table's distribution keys, the
-- duplicates can be eliminated independently in the segments.
explain select distinct a, b from distinct_test;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.16..206.04 rows=66 width=8)
Merge Key: a, b
-> Unique (cost=0.16..205.16 rows=22 width=8)
Group Key: a, b
-> Index Only Scan using distinct_test_a_b_idx on distinct_test (cost=0.16..188.49 rows=3333 width=8)
Optimizer: Postgres query optimizer
(6 rows)
select distinct a, b from distinct_test;
a | b
----+---
0 | 0
1 | 0
2 | 1
3 | 1
4 | 2
5 | 2
6 | 3
7 | 3
8 | 4
9 | 4
10 | 5
(11 rows)
-- Otherwise, redistribution is needed
explain select distinct b from distinct_test;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------
GroupAggregate (cost=0.16..197.23 rows=6 width=4)
Group Key: b
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.16..197.13 rows=18 width=4)
Merge Key: b
-> GroupAggregate (cost=0.16..196.89 rows=6 width=4)
Group Key: b
-> Index Only Scan using distinct_test_b_idx on distinct_test (cost=0.16..188.49 rows=3333 width=4)
Optimizer: Postgres query optimizer
(8 rows)
select distinct b from distinct_test;
b
---
0
1
2
3
4
5
(6 rows)
-- If the input is already highly unique, the pre-Unique step is not worthwhile.
-- (Only print count(*) of the result, because it returns so many rows.)
explain select distinct c from distinct_test;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------
Unique (cost=0.16..346.83 rows=10000 width=4)
Group Key: c
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.16..321.83 rows=10000 width=4)
Merge Key: c
-> Index Only Scan using distinct_test_c_idx on distinct_test (cost=0.16..188.49 rows=3333 width=4)
Optimizer: Postgres query optimizer
(6 rows)
select count(*) from (
select distinct c from distinct_test
offset 0) as x;
count
-------
10000
(1 row)