blob: 363d90b6908afd60d196ef6859347f04ac7665fa [file] [log] [blame]
-- NOTE(review): this is a pg_regress expected-output (.out) file; the echoed
-- SQL, NOTICE/ERROR text, and result rows are diffed verbatim against actual
-- test output, so any edit here must be mirrored in the companion .sql input.
-- Check for MPP-19310 and MPP-19857 where mksort produces wrong result
-- on OPT build, and fails assertion on debug build if a "LIMIT" query
-- spills to disk.
-- tval defaults to a ~7800-byte string so each tuple is wide enough to
-- force a spill under a small statement_mem.
CREATE TABLE mksort_limit_test_table(dkey INT, jkey INT, rval REAL, tval TEXT default repeat('abcdefghijklmnopqrstuvwxyz', 300)) DISTRIBUTED BY (dkey);
INSERT INTO mksort_limit_test_table VALUES(generate_series(1, 10000), generate_series(10001, 20000), sqrt(generate_series(10001, 20000)));
-- Should fit slightly fewer (because of per-tuple overhead) than
-- (20 * 1024 * 1024) / (26 * 300 + 12) => 2684 tuples in memory;
-- after that the sort spills to disk.
SET statement_mem="20MB";
-- Should work in memory
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey LIMIT 200) as temp ORDER BY jkey LIMIT 3;
dkey | str
------+-----
1 | ab
2 | ab
3 | ab
(3 rows)
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey LIMIT 200) as temp ORDER BY jkey DESC LIMIT 3;
dkey | str
------+-----
200 | ab
199 | ab
198 | ab
(3 rows)
-- Should spill to disk (tested with 2 segments, for more segments it may not spill)
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey LIMIT 5000) as temp ORDER BY jkey LIMIT 3;
dkey | str
------+-----
1 | ab
2 | ab
3 | ab
(3 rows)
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey LIMIT 5000) as temp ORDER BY jkey DESC LIMIT 3;
dkey | str
------+-----
5000 | ab
4999 | ab
4998 | ab
(3 rows)
-- In memory descending sort
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey DESC LIMIT 200) as temp ORDER BY jkey LIMIT 3;
dkey | str
------+-----
9801 | ab
9802 | ab
9803 | ab
(3 rows)
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey DESC LIMIT 200) as temp ORDER BY jkey DESC LIMIT 3;
dkey | str
-------+-----
10000 | ab
9999 | ab
9998 | ab
(3 rows)
-- Spilled descending sort (tested with 2 segments, for more segments it may not spill)
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey DESC LIMIT 5000) as temp ORDER BY jkey LIMIT 3;
dkey | str
------+-----
5001 | ab
5002 | ab
5003 | ab
(3 rows)
SELECT dkey, substring(tval from 1 for 2) as str from (SELECT * from mksort_limit_test_table ORDER BY dkey DESC LIMIT 5000) as temp ORDER BY jkey DESC LIMIT 3;
dkey | str
-------+-----
10000 | ab
9999 | ab
9998 | ab
(3 rows)
DROP TABLE mksort_limit_test_table;
-- Check that invalid expressions in LIMIT are rejected with a parse error:
-- column references and aggregate functions are not allowed there.
select * from generate_series(1,10) g limit g;
ERROR: argument of LIMIT must not contain variables
LINE 1: select * from generate_series(1,10) g limit g;
^
select * from generate_series(1,10) g limit count(*);
ERROR: aggregate functions are not allowed in LIMIT
LINE 1: select * from generate_series(1,10) g limit count(*);
^
-- Check that a volatile limit is not pushed down.
create table t_volatile_limit (i int4);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'i' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table t_volatile_limit_1 (a int, b int) distributed randomly;
-- Cloudberry may generate a two-stage limit plan to improve performance.
-- But when the limit clause contains volatile functions, pushing them down
-- below the final gather motion would evaluate those volatile functions
-- many times. For such cases, we should not push down the limit.
-- The limit clauses of the test cases below call the volatile function
-- `random()`, which may return a different result each time it is invoked.
-- If it were pushed down into a two-stage limit plan, `random()` would
-- execute on each segment, yielding different limit values on the QEs and
-- the QD, which cannot guarantee correct results. For example, if seg 0
-- holds the top 3 minimum values but its random() returns 1, 2 values are lost.
-- Expected plans below have a single Limit node above the Gather Motion
-- (no per-segment Limit), confirming the pushdown was suppressed.
explain select * from t_volatile_limit order by i limit (random() * 10);
QUERY PLAN
------------------------------------------------------------------------------------
Limit (cost=0.00..431.00 rows=1 width=4)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=4)
Merge Key: i
-> Sort (cost=0.00..431.00 rows=1 width=4)
Sort Key: i
-> Seq Scan on t_volatile_limit (cost=0.00..431.00 rows=1 width=4)
Optimizer: Pivotal Optimizer (GPORCA) version 3.80.0
(7 rows)
explain select * from t_volatile_limit order by i limit 2 offset (random()*5);
QUERY PLAN
------------------------------------------------------------------------------------
Limit (cost=0.00..431.00 rows=1 width=4)
-> Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.00 rows=1 width=4)
Merge Key: i
-> Sort (cost=0.00..431.00 rows=1 width=4)
Sort Key: i
-> Seq Scan on t_volatile_limit (cost=0.00..431.00 rows=1 width=4)
Optimizer: Pivotal Optimizer (GPORCA) version 3.80.0
(7 rows)
explain select distinct(a), sum(b) from t_volatile_limit_1 group by a order by a, sum(b) limit (random()+3);
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Limit (cost=0.00..431.00 rows=1 width=12)
-> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=12)
Merge Key: a, (sum(b))
-> Sort (cost=0.00..431.00 rows=1 width=12)
Sort Key: a, (sum(b))
-> GroupAggregate (cost=0.00..431.00 rows=1 width=12)
Group Key: a
-> Sort (cost=0.00..431.00 rows=1 width=8)
Sort Key: a
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=8)
Hash Key: a
-> Seq Scan on t_volatile_limit_1 (cost=0.00..431.00 rows=1 width=8)
Optimizer: Pivotal Optimizer (GPORCA)
(13 rows)
explain select distinct(a), sum(b) from t_volatile_limit_1 group by a order by a, sum(b) limit 2 offset (random()*2);
QUERY PLAN
------------------------------------------------------------------------------------------------------------------
Limit (cost=0.00..431.00 rows=1 width=12)
-> Gather Motion 3:1 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=12)
Merge Key: a, (sum(b))
-> Sort (cost=0.00..431.00 rows=1 width=12)
Sort Key: a, (sum(b))
-> GroupAggregate (cost=0.00..431.00 rows=1 width=12)
Group Key: a
-> Sort (cost=0.00..431.00 rows=1 width=8)
Sort Key: a
-> Redistribute Motion 3:3 (slice2; segments: 3) (cost=0.00..431.00 rows=1 width=8)
Hash Key: a
-> Seq Scan on t_volatile_limit_1 (cost=0.00..431.00 rows=1 width=8)
Optimizer: Pivotal Optimizer (GPORCA)
(13 rows)
drop table t_volatile_limit;
drop table t_volatile_limit_1;
-- Check that LIMIT ALL is not considered when gathering data to a single node:
-- the expected plan has no Limit node above the Gather Motion, and all ten
-- rows come back in sorted order.
create table t_limit_all(a int, b int) distributed by (a);
insert into t_limit_all select i, i from generate_series(1,10)i;
explain (costs off)
select array(select b from t_limit_all order by b asc limit all) t;
QUERY PLAN
--------------------------------------------------
Result
InitPlan 1 (returns $0) (slice1)
-> Gather Motion 3:1 (slice2; segments: 3)
Merge Key: t_limit_all.b
-> Sort
Sort Key: t_limit_all.b
-> Seq Scan on t_limit_all
Optimizer: Postgres query optimizer
(8 rows)
select array(select b from t_limit_all order by b asc limit all) t;
t
------------------------
{1,2,3,4,5,6,7,8,9,10}
(1 row)
drop table t_limit_all;