blob: a5630447227b9522d8b01f0b104b58e5e79ededb [file] [log] [blame]
create schema bfv_statistic;
set search_path=bfv_statistic;
create table bfv_statistics_foo (a int, b int) distributed by (a);
insert into bfv_statistics_foo values (1,1);
insert into bfv_statistics_foo values (0,1);
insert into bfv_statistics_foo values (2,1);
insert into bfv_statistics_foo values (null,1);
analyze bfv_statistics_foo;
-- current statistics
select stanullfrac, stadistinct, stanumbers1 from pg_statistic where starelid='bfv_statistics_foo'::regclass and staattnum=1;
-- exercise GPORCA translator
explain select * from bfv_statistics_foo where a is not null and b >= 1;
create table bfv_statistics_foo2(a int) distributed by (a);
insert into bfv_statistics_foo2 select generate_series(1,5);
insert into bfv_statistics_foo2 select 1 from generate_series(1,5);
insert into bfv_statistics_foo2 select 2 from generate_series(1,4);
insert into bfv_statistics_foo2 select 3 from generate_series(1,3);
insert into bfv_statistics_foo2 select 4 from generate_series(1,2);
insert into bfv_statistics_foo2 select 5 from generate_series(1,1);
analyze bfv_statistics_foo2;
-- current stats
select stanumbers1, stavalues1 from pg_statistic where starelid='bfv_statistics_foo2'::regclass;
explain select a from bfv_statistics_foo2 where a > 1 order by a;
-- change stats manually so that MCV and MCF numbers do not match
set allow_system_table_mods=true;
update pg_statistic set stavalues1='{6,3,1,5,4,2}'::int[] where starelid='bfv_statistics_foo2'::regclass;
-- excercise the translator
explain select a from bfv_statistics_foo2 where a > 1 order by a;
--
-- test missing statistics
--
set gp_create_table_random_default_distribution=off;
create table bfv_statistics_foo3(a int);
select * from gp_toolkit.gp_stats_missing where smischema = 'bfv_statistic' AND smitable = 'bfv_statistics_foo3';
--
-- for Orca's Split Operator ensure that the columns needed for stats derivation is correct
--
set gp_create_table_random_default_distribution=off;
CREATE TABLE bar_dml (
vtrg character varying(6) NOT NULL,
tec_schuld_whg character varying(3) NOT NULL,
inv character varying(11) NOT NULL,
zed_id character varying(6) NOT NULL,
mkl_id character varying(6) NOT NULL,
zj integer NOT NULL,
folio integer NOT NULL,
zhlg_typ character varying(1) NOT NULL,
zhlg character varying(8) NOT NULL,
ant_zhlg double precision,
zuordn_sys_dat character varying(11),
zhlg_whg_bilkurs numeric(15,8),
tec_whg_bilkurs numeric(15,8),
zhlg_ziel_id character varying(1) NOT NULL,
btg_tec_whg_gesh numeric(13,2),
btg_tec_whg_makl numeric(13,2),
btg_zhlg_whg numeric(13,2),
zhlg_typ_org character varying(1),
zhlg_org character varying(8),
upd_dat date
)
WITH (appendonly=true) DISTRIBUTED RANDOMLY;
update bar_dml set (zhlg_org, zhlg_typ_org) = (zhlg, zhlg_typ);
--
-- Cardinality estimation when there is no histogram and MCV
--
create table bfv_statistics_foo4 (a int);
insert into bfv_statistics_foo4 select i from generate_series(1,99) i;
insert into bfv_statistics_foo4 values (NULL);
analyze bfv_statistics_foo4;
select stanullfrac, stadistinct, stanumbers1 from pg_statistic where starelid='bfv_statistics_foo4'::regclass and staattnum=1;
explain select a from bfv_statistics_foo4 where a > 888;
--
-- Testing that the merging of memo groups inside Orca does not crash cardinality estimation inside Orca
--
create table t1(c1 int);
insert into t1 values(1);
select v from (select max(c1) as v, 1 as r from t1 union select 1 as v, 2 as r ) as bfv_statistics_foo group by v;
select v from (select max(c1) as v, 1 as r from t1 union all select 1 as v, 2 as r ) as bfv_statistics_foo group by v;
select v from (select max(c1) as v, 1 as r from t1 union select 1 as v, 2 as r ) as bfv_statistics_foo;
--
-- test the generation of histogram boundaries for numeric and real data types
--
create table bfv_statistics_foo_real (a int4, b real) distributed randomly;
insert into bfv_statistics_foo_real values (0, 'Infinity');
insert into bfv_statistics_foo_real values (0, '-Infinity');
insert into bfv_statistics_foo_real values (0, 'NaN');
insert into bfv_statistics_foo_real values (0, 'Infinity');
insert into bfv_statistics_foo_real values (0, '-Infinity');
insert into bfv_statistics_foo_real values (0, 'NaN');
insert into bfv_statistics_foo_real values (0, 'Infinity');
insert into bfv_statistics_foo_real values (0, '-Infinity');
insert into bfv_statistics_foo_real values (0, 'NaN');
insert into bfv_statistics_foo_real values (0, 'Infinity');
insert into bfv_statistics_foo_real values (0, '-Infinity');
insert into bfv_statistics_foo_real values (0, 'NaN');
INSERT INTO bfv_statistics_foo_real VALUES (0, '0');
INSERT INTO bfv_statistics_foo_real VALUES (1, '0');
INSERT INTO bfv_statistics_foo_real VALUES (2, '-34338492.215397047');
INSERT INTO bfv_statistics_foo_real VALUES (3, '4.31');
INSERT INTO bfv_statistics_foo_real VALUES (4, '7799461.4119');
INSERT INTO bfv_statistics_foo_real VALUES (5, '16397.038491');
INSERT INTO bfv_statistics_foo_real VALUES (6, '93901.57763026');
INSERT INTO bfv_statistics_foo_real VALUES (7, '-83028485');
INSERT INTO bfv_statistics_foo_real VALUES (8, '74881');
INSERT INTO bfv_statistics_foo_real VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_real VALUES (0, '0');
INSERT INTO bfv_statistics_foo_real VALUES (1, '0');
INSERT INTO bfv_statistics_foo_real VALUES (2, '-34338492.215397047');
INSERT INTO bfv_statistics_foo_real VALUES (3, '4.31');
INSERT INTO bfv_statistics_foo_real VALUES (4, '7799461.4119');
INSERT INTO bfv_statistics_foo_real VALUES (5, '16397.038491');
INSERT INTO bfv_statistics_foo_real VALUES (6, '93901.57763026');
INSERT INTO bfv_statistics_foo_real VALUES (7, '-83028485');
INSERT INTO bfv_statistics_foo_real VALUES (8, '74881');
INSERT INTO bfv_statistics_foo_real VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_real VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_real VALUES (0, '0');
INSERT INTO bfv_statistics_foo_real VALUES (1, '0');
INSERT INTO bfv_statistics_foo_real VALUES (2, '-34338492.215397047');
INSERT INTO bfv_statistics_foo_real VALUES (3, '4.31');
INSERT INTO bfv_statistics_foo_real VALUES (4, '7799461.4119');
INSERT INTO bfv_statistics_foo_real VALUES (5, '16397.038491');
INSERT INTO bfv_statistics_foo_real VALUES (6, '93901.57763026');
INSERT INTO bfv_statistics_foo_real VALUES (7, '-83028485');
INSERT INTO bfv_statistics_foo_real VALUES (8, '74881');
INSERT INTO bfv_statistics_foo_real VALUES (9, '-24926804.045047420');
ANALYZE bfv_statistics_foo_real;
select histogram_bounds from pg_stats where tablename = 'bfv_statistics_foo_real' and attname = 'b';
select most_common_vals from pg_stats where tablename = 'bfv_statistics_foo_real' and attname = 'b';
create table bfv_statistics_foo_numeric (a int4, b numeric) distributed randomly;
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
insert into bfv_statistics_foo_numeric values (0, 'NaN');
INSERT INTO bfv_statistics_foo_numeric VALUES (0, '0');
INSERT INTO bfv_statistics_foo_numeric VALUES (1, '0');
INSERT INTO bfv_statistics_foo_numeric VALUES (2, '-34338492.215397047');
INSERT INTO bfv_statistics_foo_numeric VALUES (3, '4.31');
INSERT INTO bfv_statistics_foo_numeric VALUES (4, '7799461.4119');
INSERT INTO bfv_statistics_foo_numeric VALUES (5, '16397.038491');
INSERT INTO bfv_statistics_foo_numeric VALUES (6, '93901.57763026');
INSERT INTO bfv_statistics_foo_numeric VALUES (7, '-83028485');
INSERT INTO bfv_statistics_foo_numeric VALUES (8, '74881');
INSERT INTO bfv_statistics_foo_numeric VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_numeric VALUES (0, '0');
INSERT INTO bfv_statistics_foo_numeric VALUES (1, '0');
INSERT INTO bfv_statistics_foo_numeric VALUES (2, '-34338492.215397047');
INSERT INTO bfv_statistics_foo_numeric VALUES (3, '4.31');
INSERT INTO bfv_statistics_foo_numeric VALUES (4, '7799461.4119');
INSERT INTO bfv_statistics_foo_numeric VALUES (5, '16397.038491');
INSERT INTO bfv_statistics_foo_numeric VALUES (6, '93901.57763026');
INSERT INTO bfv_statistics_foo_numeric VALUES (7, '-83028485');
INSERT INTO bfv_statistics_foo_numeric VALUES (8, '74881');
INSERT INTO bfv_statistics_foo_numeric VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (10, NULL);
INSERT INTO bfv_statistics_foo_numeric VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_numeric VALUES (0, '0');
INSERT INTO bfv_statistics_foo_numeric VALUES (1, '0');
INSERT INTO bfv_statistics_foo_numeric VALUES (2, '-34338492.215397047');
INSERT INTO bfv_statistics_foo_numeric VALUES (3, '4.31');
INSERT INTO bfv_statistics_foo_numeric VALUES (4, '7799461.4119');
INSERT INTO bfv_statistics_foo_numeric VALUES (5, '16397.038491');
INSERT INTO bfv_statistics_foo_numeric VALUES (6, '93901.57763026');
INSERT INTO bfv_statistics_foo_numeric VALUES (7, '-83028485');
INSERT INTO bfv_statistics_foo_numeric VALUES (8, '74881');
INSERT INTO bfv_statistics_foo_numeric VALUES (9, '-24926804.045047420');
INSERT INTO bfv_statistics_foo_numeric SELECT i,i FROM generate_series(1,30) i;
ANALYZE bfv_statistics_foo_numeric;
select histogram_bounds from pg_stats where tablename = 'bfv_statistics_foo_numeric' and attname = 'b';
select most_common_vals from pg_stats where tablename = 'bfv_statistics_foo_numeric' and attname = 'b';
reset gp_create_table_random_default_distribution;
--
-- Ensure that VACUUM ANALYZE does not result in incorrect statistics
--
CREATE TABLE T25289_T1 (c int);
INSERT INTO T25289_T1 VALUES (1);
DELETE FROM T25289_T1;
ANALYZE T25289_T1;
--
-- expect NO more notice after customer run VACUUM FULL
--
CREATE TABLE T25289_T2 (c int);
INSERT INTO T25289_T2 VALUES (1);
DELETE FROM T25289_T2;
VACUUM FULL T25289_T2;
ANALYZE T25289_T2;
--
-- expect NO notice during analyze if table doesn't have empty pages
--
CREATE TABLE T25289_T3 (c int);
INSERT INTO T25289_T3 VALUES (1);
ANALYZE T25289_T3;
--
-- expect NO notice when analyzing append only tables
--
CREATE TABLE T25289_T4 (c int, d int)
WITH (APPENDONLY=ON) DISTRIBUTED BY (c)
PARTITION BY RANGE(d) (START(1) END (5) EVERY(1));
ANALYZE T25289_T4;
--
-- expect NO crash when the statistic slot for an attribute is broken
--
CREATE TABLE good_tab(a int, b text);
CREATE TABLE test_broken_stats(a int, b text);
INSERT INTO test_broken_stats VALUES(1, 'abc'), (2, 'cde'), (3, 'efg'), (3, 'efg'), (3, 'efg'), (1, 'abc'), (2, 'cde');
ANALYZE test_broken_stats;
SET allow_system_table_mods=true;
-- Simulate broken stats by changing the data type of MCV slot to a different type than in pg_attribute
-- start_matchsubs
-- m/ERROR: invalid .* of type .*, for attribute of type .* \(selfuncs\.c\:\d+\)/
-- s/\(selfuncs\.c:\d+\)//
-- end_matchsubs
-- Broken MCVs
UPDATE pg_statistic SET stavalues1='{1,2,3}'::int[] WHERE starelid ='test_broken_stats'::regclass AND staattnum=2;
SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
-- Broken histogram
UPDATE pg_statistic SET stakind2=2 WHERE starelid ='test_broken_stats'::regclass AND staattnum=2;
UPDATE pg_statistic SET stavalues2='{1,2,3}'::int[] WHERE starelid ='test_broken_stats'::regclass AND staattnum=2 and stakind2=2;
SELECT * FROM test_broken_stats t1, good_tab t2 WHERE t1.b = t2.b;
RESET allow_system_table_mods;
-- cardinality estimation for join on varchar, text, char and bpchar columns must account for FreqRemaining and NDVRemaining
-- resulting in better cardinality numbers for the joins in orca
-- start_ignore
DROP TABLE IF EXISTS test_join_card1;
DROP TABLE IF EXISTS test_join_card2;
-- end_ignore
CREATE TABLE test_join_card1 (a varchar, b varchar);
CREATE TABLE test_join_card2 (a varchar, b varchar);
CREATE TABLE test_join_card3 (a varchar, b varchar);
INSERT INTO test_join_card1 SELECT i::text, i::text FROM generate_series(1, 20000)i;
INSERT INTO test_join_card2 SELECT i::text, NULL FROM generate_series(1, 179)i;
INSERT INTO test_join_card2 SELECT 1::text, 'a' FROM generate_series(1, 5820)i;
INSERT INTO test_join_card3 SELECT i::text, i::text FROM generate_series(1,10000)i;
ANALYZE test_join_card1;
ANALYZE test_join_card2;
ANALYZE test_join_card3;
EXPLAIN SELECT * FROM test_join_card1 t1, test_join_card2 t2, test_join_card3 t3 WHERE t1.b = t2.b and t3.b = t2.b;
-- start_ignore
DROP TABLE IF EXISTS test_join_card1;
DROP TABLE IF EXISTS test_join_card2;
-- end_ignore
-- Test if the table pg_statistic has data in segments
DROP TABLE IF EXISTS test_statistic_1;
CREATE TABLE test_statistic_1(a int, b int);
INSERT INTO test_statistic_1 SELECT i, i FROM generate_series(1, 1000)i;
ANALYZE test_statistic_1;
select count(*) from pg_class c, pg_statistic s where c.oid = s.starelid and relname = 'test_statistic_1';
select count(*) from pg_class c, gp_dist_random('pg_statistic') s where c.oid = s.starelid and relname = 'test_statistic_1';
DROP TABLE test_statistic_1;
-- Test that the histogram looks reasonable.
--
-- We once had a bug where the samples gathered from the segments were
-- truncated, leading to highly biased samples.
CREATE TABLE uniformtest(i int4);
INSERT INTO uniformtest SELECT g/100 FROM generate_series(0, 9999) g;
BEGIN;
SET LOCAL default_statistics_target=10; -- don't need so many rows for testing
ANALYZE uniformtest;
COMMIT;
-- ANALYZE collects a random sample, so the exact values chosen for the
-- histogram are nondeterministic. But they should be roughly uniformly
-- distributed across the range 0-99. Show some characteristic values.
select case when avg(bound) between 40 and 60 then '40-60' else avg(bound)::text end as avg,
case when min(bound) <= 5 then '<= 5' else min(bound)::text end as min,
case when max(bound) >= 95 then '>= 95' else max(bound)::text end as max
from pg_stats s,
unnest(histogram_bounds::text::int4[]) as bound
where tablename = 'uniformtest';
-- ORCA: Test previous scenario with duplicate memo groups running multiple
-- xforms that need stats before applying and reset after applying. It
-- used to be that this scenario could lead to SIGSEGV where stats were
-- reset and were not re-derived between applying the xforms.
SET optimizer_join_order=exhaustive;
SET optimizer_trace_fallback=on;
CREATE TABLE duplicate_memo_group_test_t1 (c11 varchar, c12 integer) DISTRIBUTED BY (c11);
CREATE TABLE duplicate_memo_group_test_t2 (c2 varchar) DISTRIBUTED BY (c2);
CREATE TABLE duplicate_memo_group_test_t3 (c3 varchar) DISTRIBUTED BY (c3);
INSERT INTO duplicate_memo_group_test_t1 SELECT 'something', generate_series(1,900);
INSERT INTO duplicate_memo_group_test_t2 SELECT generate_series(1,900);
ANALYZE duplicate_memo_group_test_t1, duplicate_memo_group_test_t2;
SELECT
(SELECT c11 FROM duplicate_memo_group_test_t1 WHERE c12 = 100) AS column1,
(SELECT sum(c12)
FROM duplicate_memo_group_test_t1
INNER JOIN duplicate_memo_group_test_t2 ON c11 = c2
INNER JOIN duplicate_memo_group_test_t3 ON c2 = c3
INNER JOIN duplicate_memo_group_test_t3 a1 on a1.c3 = a2.c3
LEFT OUTER JOIN duplicate_memo_group_test_t3 a3 ON a1.c3 = a3.c3
LEFT OUTER JOIN duplicate_memo_group_test_t3 a4 ON a1.c3 = a4.c3
) AS column2
FROM duplicate_memo_group_test_t3 a2;
-- Tests ORCA coverage for time-related cross-type stats calculation
--
-- Previously, ORCA didn't support stats calculation for time-related
-- cross-type predicates. It used default scale factor for cardinality
-- estimate, that could sometimes be off by a few orders of magnitude,
-- thence affecting plan performance. This was because date type was
-- converted to int internally, whereas other time-related types were
-- converted to double.
--
-- Using int for date type allows an equality predicate that only
-- involves the date type to be always viewed as a singleton, rather
-- than a range in double in ORCA's constraint framework. This provided
-- convenience of implementing stats derivation. However, such choice
-- prevented ORCA from deriving stats from predicates that involve both
-- date type and other time-related types. Now, in an attempt of
-- supporting cross-type stats calcualtion, we convert date type to
-- double as well.
--
-- Test filter stats derivation in table scans
drop table if exists t1, t2;
create table t1 (a int, b date);
create table t2 (a int, b date);
insert into t1 select i, j::date from generate_series(1, 10) i, generate_series('2015-01-01','2021-12-31', '1 day'::interval) j;
insert into t2 select i, j::date from generate_series(1, 10) i, generate_series('2021-01-01','2021-12-31', '1 day'::interval) j;
analyze t1, t2;
-- The following two queries should generate the same plan, now that
-- we support time-related cross-type stats calculation. ORCA should
-- derive the same stats for t1 (small subset of the total) based on
-- predicates on t1.b. Prior to this commit, the date-timestamp cross
-- type predicates used in the following queries yielded a cardinality
-- estimate in the order of 3000.
--
-- inequality predicates:
explain select * from t1, t2 where t1.a = t2.a and t1.b < '2015-01-05'::date;
explain select * from t1, t2 where t1.a = t2.a and t1.b < '2015-01-05'::timestamp;
-- equality predicates:
explain select * from t1, t2 where t1.a = t2.a and t1.b = '2015-01-05'::date;
explain select * from t1, t2 where t1.a = t2.a and t1.b = '2015-01-05'::timestamp;
-- Test filter stats derivation in dynamic table scans
drop table if exists t1, t2;
create table t1 (a int, b date)
partition by range (b) (
start (date '2015-01-01') end (date '2021-01-01') every (interval '1' year),
default partition d);
create table t2 (a int, b date);
insert into t1 select i, j::date from generate_series(1, 10) i, generate_series('2015-01-01','2021-12-31', '1 day'::interval) j;
insert into t2 select i, j::date from generate_series(1, 10) i, generate_series('2015-01-01','2021-12-31', '1 day'::interval) j;
analyze t1, t2;
-- The following two queries should generate the same plan, now that
-- we support time-related cross-type comparison. ORCA should derive
-- the same stats for t1 (small number of partitions) and t2 (small
-- subset of the total) based on the predicates and allow DPE. Prior
-- to this commit, the date-timestamp cross-type predicates used in
-- the following queries yielded a cardinality estimate in the order
-- of 500~1000. Consequently, the partition selector wasn't propagated.
--
-- inequality predicates (2 out of 7 partitions):
explain select * from t1, t2 where t1.a = t2.a and t1.b = t2.b and t1.b < '2015-01-05'::date;
explain select * from t1, t2 where t1.a = t2.a and t1.b = t2.b and t1.b < '2015-01-05'::timestamp;
-- equality predicates (1 out of 7 partitions):
explain select * from t1, t2 where t1.a = t2.a and t1.b = t2.b and t1.b = '2015-01-05'::date;
explain select * from t1, t2 where t1.a = t2.a and t1.b = t2.b and t1.b = '2015-01-05'::timestamp;
RESET optimizer_join_order;
RESET optimizer_trace_fallback;