blob: e02b9a3f00ee41803acacf5adb82f41ec04df1c4 [file] [log] [blame]
--
-- Tests for legacy cdbhash opclasses
--
drop schema if exists gpdist_legacy_opclasses;
create schema gpdist_legacy_opclasses;
set search_path to gpdist_legacy_opclasses;
-- Basic sanity check of all the legacy hash opclasses. Create a table that
-- uses all of them in the distribution key, and insert a value.
set gp_use_legacy_hashops=on;
create table all_legacy_types(
int2_col int2,
int4_col int4,
int8_col int8,
float4_col float4,
float8_col float8,
numeric_col numeric,
char_col "char",
bpchar_col char(20),
text_col text,
varchar_col varchar(20),
bytea_col bytea,
name_col name,
oid_col oid,
tid_col tid,
timestamp_col timestamp,
timestamptz_col timestamptz,
date_col date,
time_col time,
timetz_col timetz,
interval_col interval,
inet_col inet,
cidr_col cidr,
macaddr_col macaddr,
bit_col bit(11),
varbit_col varbit(20),
bool_col bool,
oidvector_col oidvector,
money_col money,
--uuid_col uuid,
complex_col complex
) distributed by (
int2_col,
int4_col,
int8_col,
float4_col,
float8_col,
numeric_col,
char_col,
bpchar_col,
text_col,
varchar_col,
bytea_col,
name_col,
oid_col,
tid_col,
timestamp_col,
timestamptz_col,
date_col,
time_col,
timetz_col,
interval_col,
inet_col,
cidr_col,
macaddr_col,
bit_col,
varbit_col,
bool_col,
oidvector_col,
money_col,
--uuid_col,
complex_col
);
-- Verify that all columns are using the legacy hashops
select attno, opc.opcname from
(select unnest(distkey) as attno, unnest(distclass) as distclass from gp_distribution_policy
where localoid='all_legacy_types'::regclass) as d,
pg_opclass opc
where opc.oid=distclass
order by attno;
attno | opcname
-------+-------------------------
1 | cdbhash_int2_ops
2 | cdbhash_int4_ops
3 | cdbhash_int8_ops
4 | cdbhash_float4_ops
5 | cdbhash_float8_ops
6 | cdbhash_numeric_ops
7 | cdbhash_char_ops
8 | cdbhash_bpchar_ops
9 | cdbhash_text_ops
10 | cdbhash_text_ops
11 | cdbhash_bytea_ops
12 | cdbhash_name_ops
13 | cdbhash_oid_ops
14 | cdbhash_tid_ops
15 | cdbhash_timestamp_ops
16 | cdbhash_timestamptz_ops
17 | cdbhash_date_ops
18 | cdbhash_time_ops
19 | cdbhash_timetz_ops
20 | cdbhash_interval_ops
21 | cdbhash_inet_ops
22 | cdbhash_inet_ops
23 | cdbhash_macaddr_ops
24 | cdbhash_bit_ops
25 | cdbhash_bit_ops
26 | cdbhash_bool_ops
27 | cdbhash_oidvector_ops
28 | cdbhash_cash_ops
29 | cdbhash_complex_ops
(29 rows)
insert into all_legacy_types values (
'12345', -- int2
'12345678', -- int4
'1234567890123', -- int8
'1.2345', -- float4
'1.23456789', -- float8
'1.23456789', -- numeric
'x', -- "char"
'bpchar foobar', -- bpchar
'text foobar', -- text
'varchar foobar', -- varchar
'\xdeadbeef', -- bytea
'name foobar', -- name
'12345', -- oid
'(1,1)', -- tid
'2018-11-25 01:23:45+02', -- timestamp
'2018-11-25 01:23:45+02', -- timestamptz
'2018-11-25', -- date,
'01:23:45', -- time,
'01:23:45+02', -- timetz,
'6 years', -- interval,
'192.168.1.255/25', -- inet,
'10.1.2', -- cidr,
'08:00:2b:01:02:03', -- macaddr,
B'11011000000', -- bit,
B'01010101010', -- varbit,
'true', -- bool,
'1 2 3 4', -- oidvector,
'123.45', -- money,
--uuid_col -- uuid,
'5 + 3i' -- complex
);
-- Test that CTAS honors the gp_use_legacy_hashops GUC
-- Note: When ORCA is on, the distribution is RANDOM.
create table legacy_hashops_ctas as select 1;
select gpdp.distkey, pgopc.opcname
from gp_distribution_policy gpdp, pg_opclass pgopc
where gpdp.localoid='legacy_hashops_ctas'::regclass and pgopc.oid::text = gpdp.distclass::text;
distkey | opcname
---------+---------
(0 rows)
set gp_use_legacy_hashops=off;
--
-- Test joins between tables using a mix of default and legacy opclasses.
--
create table legacy_int (id int4) distributed by (id cdbhash_int4_ops);
insert into legacy_int values (1), (2), (3);
create table modern_int (id int4) distributed by (id);
insert into modern_int values (2), (3), (4);
analyze modern_int;
create table modern_text (t text) distributed by (t);
insert into modern_text values ('foo'), ('1');
-- Only legacy opclasses used. ORCA can deal with this
explain (costs off) select * from legacy_int a inner join legacy_int b on a.id = b.id;
QUERY PLAN
-------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (a.id = b.id)
-> Seq Scan on legacy_int a
-> Hash
-> Seq Scan on legacy_int b
Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0
(7 rows)
select * from legacy_int a inner join legacy_int b on a.id = b.id;
id | id
----+----
1 | 1
2 | 2
3 | 3
(3 rows)
-- A mixture of legacy and modern opclasses. Needs a Redistribute motion. ORCA
-- can't handle this, and falls back.
explain (costs off) select * from legacy_int a inner join modern_int b on a.id = b.id;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (a.id = b.id)
-> Seq Scan on legacy_int a
-> Hash
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: b.id
-> Seq Scan on modern_int b
Optimizer: Postgres query optimizer
(9 rows)
select * from legacy_int a inner join modern_int b on a.id = b.id;
id | id
----+----
2 | 2
3 | 3
(2 rows)
-- for the sake of completeness, modern opclasses only. No problem for ORCA.
explain (costs off) select * from modern_int a inner join modern_int b on a.id = b.id;
QUERY PLAN
-------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (a.id = b.id)
-> Seq Scan on modern_int a
-> Hash
-> Seq Scan on modern_int b
Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0
(7 rows)
select * from modern_int a inner join modern_int b on a.id = b.id;
id | id
----+----
2 | 2
3 | 3
4 | 4
(3 rows)
-- In principle, ORCA would create a correct plan for this, because even though it
-- mixes the legacy and other opclasses, they're not used in join conditions. But
-- the ORCA translator code is conservative, and falls back if there are any mixed
-- use, even if it would be safe.
explain (costs off) select * from legacy_int a inner join modern_text b on a.id::text = b.t;
QUERY PLAN
------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: ((a.id)::text = b.t)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: (a.id)::text
-> Seq Scan on legacy_int a
-> Hash
-> Seq Scan on modern_text b
Optimizer: Postgres query optimizer
(9 rows)
select * from legacy_int a inner join modern_text b on a.id::text = b.t;
id | t
----+---
1 | 1
(1 row)
-- Also test with a domain over a base type, and enums.
create domain intdom as integer;
create table legacy_domain_over_int(id intdom) distributed by(id cdbhash_int4_ops);
insert into legacy_domain_over_int values (1), (2), (3);
analyze legacy_domain_over_int;
explain (costs off) select * from legacy_domain_over_int a inner join legacy_domain_over_int b on a.id = b.id;
QUERY PLAN
----------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: ((a.id)::integer = (b.id)::integer)
-> Seq Scan on legacy_domain_over_int a
-> Hash
-> Seq Scan on legacy_domain_over_int b
Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0
(7 rows)
explain (costs off) select * from legacy_int a inner join legacy_domain_over_int b on a.id = b.id;
QUERY PLAN
---------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (a.id = (b.id)::integer)
-> Seq Scan on legacy_int a
-> Hash
-> Seq Scan on legacy_domain_over_int b
Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0
(7 rows)
explain (costs off) select * from modern_int a inner join legacy_domain_over_int b on a.id = b.id;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (a.id = (b.id)::integer)
-> Seq Scan on modern_int a
-> Hash
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: b.id
-> Seq Scan on legacy_domain_over_int b
Optimizer: Postgres query optimizer
(9 rows)
-- Distributing by enum has been banned, so this test is updated to instead distribute by a dummy int column
-- Banned because in backup/restore scenarios the data will be in the "wrong" segment as oids for each enum
-- entry are re-generated and hashing them will result in arbitrary segment assignment.
create type colors as enum ('red', 'green', 'blue');
create table legacy_enum(col1 int, color colors) distributed by(col1);
insert into legacy_enum values (1, 'red'), (2, 'green'), (3, 'blue');
explain (costs off) select * from legacy_enum a inner join legacy_enum b on a.color = b.color;
QUERY PLAN
---------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: ((a.color)::anyenum = (b.color)::anyenum)
-> Seq Scan on legacy_enum a
-> Hash
-> Broadcast Motion 3:3 (slice2; segments: 3)
-> Seq Scan on legacy_enum b
Optimizer: Pivotal Optimizer (GPORCA) version 3.41.0
(8 rows)
select * from legacy_enum a inner join legacy_enum b on a.color = b.color;
col1 | color | col1 | color
------+-------+------+-------
2 | green | 2 | green
3 | blue | 3 | blue
1 | red | 1 | red
(3 rows)
--
-- A regression issue that the data is reorganized incorrectly when
-- gp_use_legacy_hashops has non-default value.
--
-- The ALTER TABLE command reorganizes the data by using a temporary table, if
-- a "distributed by" clause is specified without the opclasses, the default
-- opclasses will be chosen. There was a bug that the non-legacy opclasses are
-- always chosen, regarding the setting of gp_use_legacy_hashops. However the
-- table's new opclasses are determined with gp_use_legacy_hashops, so when
-- gp_use_legacy_hashops is true the data will be incorrectly redistributed.
--
-- set the guc to the non-default value
set gp_use_legacy_hashops to on;
create table legacy_data_reorg (c1 int) distributed by (c1);
insert into legacy_data_reorg select i from generate_series(1, 10) i;
-- verify the opclass and data distribution
select gp_segment_id, c1 from legacy_data_reorg order by 1, 2;
gp_segment_id | c1
---------------+----
0 | 1
0 | 2
1 | 3
1 | 4
1 | 5
1 | 6
1 | 7
2 | 8
2 | 9
2 | 10
(10 rows)
select dp.localoid::regclass::name as name, oc.opcname
from gp_distribution_policy dp
join pg_opclass oc
on oc.oid::text = dp.distclass::text
where dp.localoid = 'legacy_data_reorg'::regclass::oid;
name | opcname
-------------------+------------------
legacy_data_reorg | cdbhash_int4_ops
(1 row)
-- when reorganizing the table we set the distributed-by without an explicit
-- opclass, so the default one should be chosen according to
-- gp_use_legacy_hashops.
alter table legacy_data_reorg set with (reorganize) distributed by (c1);
-- double-check the opclass and data distribution
select gp_segment_id, c1 from legacy_data_reorg order by 1, 2;
gp_segment_id | c1
---------------+----
0 | 1
0 | 2
1 | 3
1 | 4
1 | 5
1 | 6
1 | 7
2 | 8
2 | 9
2 | 10
(10 rows)
select dp.localoid::regclass::name as name, oc.opcname
from gp_distribution_policy dp
join pg_opclass oc
on oc.oid::text = dp.distclass::text
where dp.localoid = 'legacy_data_reorg'::regclass::oid;
name | opcname
-------------------+------------------
legacy_data_reorg | cdbhash_int4_ops
(1 row)
--
-- A regression issue similar to previous one, with CTAS.
--
-- The default opclasses in CTAS should also be determined with
-- gp_use_legacy_hashops.
--
set gp_use_legacy_hashops=off;
create table ctastest_off as select 123 as col distributed by (col);
set gp_use_legacy_hashops=on;
create table ctastest_on as select 123 as col distributed by (col);
select dp.localoid::regclass::name as name, oc.opcname
from gp_distribution_policy dp
join pg_opclass oc
on oc.oid::text = dp.distclass::text
where dp.localoid in ('ctastest_on'::regclass::oid,
'ctastest_off'::regclass::oid);
name | opcname
--------------+------------------
ctastest_off | int4_ops
ctastest_on | cdbhash_int4_ops
(2 rows)
set gp_use_legacy_hashops=on;
create table try_distinct_array (test_char varchar,test_array integer[]);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'test_char' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into try_distinct_array select 'y',string_to_array('1~1','~')::int[];
analyze try_distinct_array;
insert into try_distinct_array select 'n',string_to_array('1~1','~')::int[];
-- Aggregate with grouping column that does not have legacy hashop
explain (costs off) select distinct test_array from try_distinct_array;
QUERY PLAN
------------------------------------------------
HashAggregate
Group Key: test_array
-> Gather Motion 3:1 (slice1; segments: 3)
-> Seq Scan on try_distinct_array
Optimizer: Pivotal Optimizer (GPORCA)
(5 rows)
select distinct test_array from try_distinct_array;
test_array
------------
{1,1}
(1 row)
-- Hash join on column that does not have legacy hashop
explain (costs off) select * from try_distinct_array a, try_distinct_array b where a.test_array=b.test_array;
QUERY PLAN
--------------------------------------------------------------------------------------------------------
Hash Join
Hash Cond: ((a.test_array)::anyarray = (b.test_array)::anyarray)
-> Gather Motion 3:1 (slice1; segments: 3)
-> Seq Scan on try_distinct_array a
-> Hash
-> Gather Motion 3:1 (slice2; segments: 3)
-> Seq Scan on try_distinct_array b
Optimizer: Pivotal Optimizer (GPORCA)
(8 rows)
select * from try_distinct_array a, try_distinct_array b where a.test_array=b.test_array;
test_char | test_array | test_char | test_array
-----------+------------+-----------+------------
n | {1,1} | y | {1,1}
n | {1,1} | n | {1,1}
y | {1,1} | y | {1,1}
y | {1,1} | n | {1,1}
(4 rows)
-- CTAS should use value of gp_use_legacy_hashops when setting the distribution policy based on an existing table
set gp_use_legacy_hashops=on;
create table ctas_base_legacy as select unnest(array[1,2,3]) as col distributed by (col);
set gp_use_legacy_hashops=off;
create table ctas_from_legacy as select * from ctas_base_legacy distributed by (col);
create table ctas_explicit_legacy as select * from ctas_base_legacy distributed by (col cdbhash_int4_ops);
create table ctas_base_nonlegacy as select unnest(array[1,2,3]) as col distributed by (col);
set gp_use_legacy_hashops=on;
create table ctas_from_nonlegacy as select * from ctas_base_nonlegacy distributed by (col);
create table ctas_explicit_nonlegacy as select * from ctas_base_nonlegacy distributed by (col int4_ops);
select dp.localoid::regclass as name, opc.opcname
from gp_distribution_policy dp
join pg_opclass opc
on ARRAY[opc.oid]::oidvector = dp.distclass
where dp.localoid in ('ctas_base_legacy'::regclass,
'ctas_from_legacy'::regclass,
'ctas_base_nonlegacy'::regclass,
'ctas_from_nonlegacy'::regclass,
'ctas_explicit_legacy'::regclass,
'ctas_explicit_nonlegacy'::regclass);
name | opcname
-------------------------+------------------
ctas_explicit_nonlegacy | int4_ops
ctas_base_nonlegacy | int4_ops
ctas_from_legacy | int4_ops
ctas_from_nonlegacy | cdbhash_int4_ops
ctas_explicit_legacy | cdbhash_int4_ops
ctas_base_legacy | cdbhash_int4_ops
(6 rows)
select * from ctas_from_legacy where col=1;
col
-----
1
(1 row)
select * from ctas_explicit_legacy where col=1;
col
-----
1
(1 row)
select * from ctas_from_nonlegacy where col=1;
col
-----
1
(1 row)
select * from ctas_explicit_nonlegacy where col=1;
col
-----
1
(1 row)