| -- |
| -- Tests for legacy cdbhash opclasses |
| -- |
| drop schema if exists gpdist_legacy_opclasses; |
| create schema gpdist_legacy_opclasses; |
| set search_path to gpdist_legacy_opclasses; |
| -- Basic sanity check of all the legacy hash opclasses. Create a table that |
| -- uses all of them in the distribution key, and insert a value. |
| set gp_use_legacy_hashops=on; |
| create table all_legacy_types( |
| int2_col int2, |
| int4_col int4, |
| int8_col int8, |
| float4_col float4, |
| float8_col float8, |
| numeric_col numeric, |
| char_col "char", |
| bpchar_col char(20), |
| text_col text, |
| varchar_col varchar(20), |
| bytea_col bytea, |
| name_col name, |
| oid_col oid, |
| tid_col tid, |
| timestamp_col timestamp, |
| timestamptz_col timestamptz, |
| date_col date, |
| time_col time, |
| timetz_col timetz, |
| interval_col interval, |
| inet_col inet, |
| cidr_col cidr, |
| macaddr_col macaddr, |
| bit_col bit(11), |
| varbit_col varbit(20), |
| bool_col bool, |
| oidvector_col oidvector, |
| money_col money, |
| --uuid_col uuid, |
| complex_col complex |
| ) distributed by ( |
| int2_col, |
| int4_col, |
| int8_col, |
| float4_col, |
| float8_col, |
| numeric_col, |
| char_col, |
| bpchar_col, |
| text_col, |
| varchar_col, |
| bytea_col, |
| name_col, |
| oid_col, |
| tid_col, |
| timestamp_col, |
| timestamptz_col, |
| date_col, |
| time_col, |
| timetz_col, |
| interval_col, |
| inet_col, |
| cidr_col, |
| macaddr_col, |
| bit_col, |
| varbit_col, |
| bool_col, |
| oidvector_col, |
| money_col, |
| --uuid_col, |
| complex_col |
| ); |
| -- Verify that all columns are using the legacy hashops |
| select attno, opc.opcname from |
| (select unnest(distkey) as attno, unnest(distclass) as distclass from gp_distribution_policy |
| where localoid='all_legacy_types'::regclass) as d, |
| pg_opclass opc |
| where opc.oid=distclass |
| order by attno; |
| attno | opcname |
| -------+------------------------- |
| 1 | cdbhash_int2_ops |
| 2 | cdbhash_int4_ops |
| 3 | cdbhash_int8_ops |
| 4 | cdbhash_float4_ops |
| 5 | cdbhash_float8_ops |
| 6 | cdbhash_numeric_ops |
| 7 | cdbhash_char_ops |
| 8 | cdbhash_bpchar_ops |
| 9 | cdbhash_text_ops |
| 10 | cdbhash_text_ops |
| 11 | cdbhash_bytea_ops |
| 12 | cdbhash_name_ops |
| 13 | cdbhash_oid_ops |
| 14 | cdbhash_tid_ops |
| 15 | cdbhash_timestamp_ops |
| 16 | cdbhash_timestamptz_ops |
| 17 | cdbhash_date_ops |
| 18 | cdbhash_time_ops |
| 19 | cdbhash_timetz_ops |
| 20 | cdbhash_interval_ops |
| 21 | cdbhash_inet_ops |
| 22 | cdbhash_inet_ops |
| 23 | cdbhash_macaddr_ops |
| 24 | cdbhash_bit_ops |
| 25 | cdbhash_bit_ops |
| 26 | cdbhash_bool_ops |
| 27 | cdbhash_oidvector_ops |
| 28 | cdbhash_cash_ops |
| 29 | cdbhash_complex_ops |
| (29 rows) |
| |
| insert into all_legacy_types values ( |
| '12345', -- int2 |
| '12345678', -- int4 |
| '1234567890123', -- int8 |
| '1.2345', -- float4 |
| '1.23456789', -- float8 |
| '1.23456789', -- numeric |
| 'x', -- "char" |
| 'bpchar foobar', -- bpchar |
| 'text foobar', -- text |
| 'varchar foobar', -- varchar |
| '\xdeadbeef', -- bytea |
| 'name foobar', -- name |
| '12345', -- oid |
| '(1,1)', -- tid |
| '2018-11-25 01:23:45+02', -- timestamp |
| '2018-11-25 01:23:45+02', -- timestamptz |
| '2018-11-25', -- date, |
| '01:23:45', -- time, |
| '01:23:45+02', -- timetz, |
| '6 years', -- interval, |
| '192.168.1.255/25', -- inet, |
| '10.1.2', -- cidr, |
| '08:00:2b:01:02:03', -- macaddr, |
| B'11011000000', -- bit, |
| B'01010101010', -- varbit, |
| 'true', -- bool, |
| '1 2 3 4', -- oidvector, |
| '123.45', -- money, |
| --uuid_col -- uuid, |
| '5 + 3i' -- complex |
| ); |
| -- Test that CTAS honors the gp_use_legacy_hashops GUC |
| -- Note: When ORCA is on, the distribution is RANDOM. |
| create table legacy_hashops_ctas as select 1; |
| select gpdp.distkey, pgopc.opcname |
| from gp_distribution_policy gpdp, pg_opclass pgopc |
| where gpdp.localoid='legacy_hashops_ctas'::regclass and pgopc.oid::text = gpdp.distclass::text; |
| distkey | opcname |
| ---------+--------- |
| (0 rows) |
| |
| set gp_use_legacy_hashops=off; |
| -- |
| -- Test joins between tables using a mix of default and legacy opclasses. |
| -- |
| create table legacy_int (id int4) distributed by (id cdbhash_int4_ops); |
| insert into legacy_int values (1), (2), (3); |
| create table modern_int (id int4) distributed by (id); |
| insert into modern_int values (2), (3), (4); |
| analyze modern_int; |
| create table modern_text (t text) distributed by (t); |
| insert into modern_text values ('foo'), ('1'); |
| -- Only legacy opclasses used. ORCA can deal with this |
| explain (costs off) select * from legacy_int a inner join legacy_int b on a.id = b.id; |
| QUERY PLAN |
| ------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (a.id = b.id) |
| -> Seq Scan on legacy_int a |
| -> Hash |
| -> Seq Scan on legacy_int b |
| Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0 |
| (7 rows) |
| |
| select * from legacy_int a inner join legacy_int b on a.id = b.id; |
| id | id |
| ----+---- |
| 1 | 1 |
| 2 | 2 |
| 3 | 3 |
| (3 rows) |
| |
| -- A mixture of legacy and modern opclasses. Needs a Redistribute motion. ORCA |
| -- can't handle this, and falls back. |
| explain (costs off) select * from legacy_int a inner join modern_int b on a.id = b.id; |
| QUERY PLAN |
| ------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (a.id = b.id) |
| -> Seq Scan on legacy_int a |
| -> Hash |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: b.id |
| -> Seq Scan on modern_int b |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| select * from legacy_int a inner join modern_int b on a.id = b.id; |
| id | id |
| ----+---- |
| 2 | 2 |
| 3 | 3 |
| (2 rows) |
| |
| -- for the sake of completeness, modern opclasses only. No problem for ORCA. |
| explain (costs off) select * from modern_int a inner join modern_int b on a.id = b.id; |
| QUERY PLAN |
| ------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (a.id = b.id) |
| -> Seq Scan on modern_int a |
| -> Hash |
| -> Seq Scan on modern_int b |
| Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0 |
| (7 rows) |
| |
| select * from modern_int a inner join modern_int b on a.id = b.id; |
| id | id |
| ----+---- |
| 2 | 2 |
| 3 | 3 |
| 4 | 4 |
| (3 rows) |
| |
| -- In principle, ORCA would create a correct plan for this, because even though it |
| -- mixes the legacy and other opclasses, they're not used in join conditions. But |
| -- the ORCA translator code is conservative, and falls back if there are any mixed |
| -- use, even if it would be safe. |
| explain (costs off) select * from legacy_int a inner join modern_text b on a.id::text = b.t; |
| QUERY PLAN |
| ------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: ((a.id)::text = b.t) |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: (a.id)::text |
| -> Seq Scan on legacy_int a |
| -> Hash |
| -> Seq Scan on modern_text b |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| select * from legacy_int a inner join modern_text b on a.id::text = b.t; |
| id | t |
| ----+--- |
| 1 | 1 |
| (1 row) |
| |
| -- Also test with a domain over a base type, and enums. |
| create domain intdom as integer; |
| create table legacy_domain_over_int(id intdom) distributed by(id cdbhash_int4_ops); |
| insert into legacy_domain_over_int values (1), (2), (3); |
| analyze legacy_domain_over_int; |
| explain (costs off) select * from legacy_domain_over_int a inner join legacy_domain_over_int b on a.id = b.id; |
| QUERY PLAN |
| ---------------------------------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: ((a.id)::integer = (b.id)::integer) |
| -> Seq Scan on legacy_domain_over_int a |
| -> Hash |
| -> Seq Scan on legacy_domain_over_int b |
| Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0 |
| (7 rows) |
| |
| explain (costs off) select * from legacy_int a inner join legacy_domain_over_int b on a.id = b.id; |
| QUERY PLAN |
| --------------------------------------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (a.id = (b.id)::integer) |
| -> Seq Scan on legacy_int a |
| -> Hash |
| -> Seq Scan on legacy_domain_over_int b |
| Optimizer: Pivotal Optimizer (GPORCA) version 3.9.0 |
| (7 rows) |
| |
| explain (costs off) select * from modern_int a inner join legacy_domain_over_int b on a.id = b.id; |
| QUERY PLAN |
| ------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (a.id = (b.id)::integer) |
| -> Seq Scan on modern_int a |
| -> Hash |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: b.id |
| -> Seq Scan on legacy_domain_over_int b |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| -- Distributing by enum has been banned, so this test is updated to instead distribute by a dummy int column |
| -- Banned because in backup/restore scenarios the data will be in the "wrong" segment as oids for each enum |
| -- entry are re-generated and hashing them will result in arbitrary segment assignment. |
| create type colors as enum ('red', 'green', 'blue'); |
| create table legacy_enum(col1 int, color colors) distributed by(col1); |
| insert into legacy_enum values (1, 'red'), (2, 'green'), (3, 'blue'); |
| explain (costs off) select * from legacy_enum a inner join legacy_enum b on a.color = b.color; |
| QUERY PLAN |
| --------------------------------------------- |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: ((a.color)::anyenum = (b.color)::anyenum) |
| -> Seq Scan on legacy_enum a |
| -> Hash |
| -> Broadcast Motion 3:3 (slice2; segments: 3) |
| -> Seq Scan on legacy_enum b |
| Optimizer: Pivotal Optimizer (GPORCA) version 3.41.0 |
| (8 rows) |
| |
| select * from legacy_enum a inner join legacy_enum b on a.color = b.color; |
| col1 | color | col1 | color |
| ------+-------+------+------- |
| 2 | green | 2 | green |
| 3 | blue | 3 | blue |
| 1 | red | 1 | red |
| (3 rows) |
| |
| -- |
| -- A regression issue that the data is reorganized incorrectly when |
| -- gp_use_legacy_hashops has non-default value. |
| -- |
| -- The ALTER TABLE command reorganizes the data by using a temporary table, if |
| -- a "distributed by" clause is specified without the opclasses, the default |
| -- opclasses will be chosen. There was a bug that the non-legacy opclasses are |
| -- always chosen, regarding the setting of gp_use_legacy_hashops. However the |
| -- table's new opclasses are determined with gp_use_legacy_hashops, so when |
| -- gp_use_legacy_hashops is true the data will be incorrectly redistributed. |
| -- |
| -- set the guc to the non-default value |
| set gp_use_legacy_hashops to on; |
| create table legacy_data_reorg (c1 int) distributed by (c1); |
| insert into legacy_data_reorg select i from generate_series(1, 10) i; |
| -- verify the opclass and data distribution |
| select gp_segment_id, c1 from legacy_data_reorg order by 1, 2; |
| gp_segment_id | c1 |
| ---------------+---- |
| 0 | 1 |
| 0 | 2 |
| 1 | 3 |
| 1 | 4 |
| 1 | 5 |
| 1 | 6 |
| 1 | 7 |
| 2 | 8 |
| 2 | 9 |
| 2 | 10 |
| (10 rows) |
| |
| select dp.localoid::regclass::name as name, oc.opcname |
| from gp_distribution_policy dp |
| join pg_opclass oc |
| on oc.oid::text = dp.distclass::text |
| where dp.localoid = 'legacy_data_reorg'::regclass::oid; |
| name | opcname |
| -------------------+------------------ |
| legacy_data_reorg | cdbhash_int4_ops |
| (1 row) |
| |
| -- when reorganizing the table we set the distributed-by without an explicit |
| -- opclass, so the default one should be chosen according to |
| -- gp_use_legacy_hashops. |
| alter table legacy_data_reorg set with (reorganize) distributed by (c1); |
| -- double-check the opclass and data distribution |
| select gp_segment_id, c1 from legacy_data_reorg order by 1, 2; |
| gp_segment_id | c1 |
| ---------------+---- |
| 0 | 1 |
| 0 | 2 |
| 1 | 3 |
| 1 | 4 |
| 1 | 5 |
| 1 | 6 |
| 1 | 7 |
| 2 | 8 |
| 2 | 9 |
| 2 | 10 |
| (10 rows) |
| |
| select dp.localoid::regclass::name as name, oc.opcname |
| from gp_distribution_policy dp |
| join pg_opclass oc |
| on oc.oid::text = dp.distclass::text |
| where dp.localoid = 'legacy_data_reorg'::regclass::oid; |
| name | opcname |
| -------------------+------------------ |
| legacy_data_reorg | cdbhash_int4_ops |
| (1 row) |
| |
| -- |
| -- A regression issue similar to previous one, with CTAS. |
| -- |
| -- The default opclasses in CTAS should also be determined with |
| -- gp_use_legacy_hashops. |
| -- |
| set gp_use_legacy_hashops=off; |
| create table ctastest_off as select 123 as col distributed by (col); |
| set gp_use_legacy_hashops=on; |
| create table ctastest_on as select 123 as col distributed by (col); |
| select dp.localoid::regclass::name as name, oc.opcname |
| from gp_distribution_policy dp |
| join pg_opclass oc |
| on oc.oid::text = dp.distclass::text |
| where dp.localoid in ('ctastest_on'::regclass::oid, |
| 'ctastest_off'::regclass::oid); |
| name | opcname |
| --------------+------------------ |
| ctastest_off | int4_ops |
| ctastest_on | cdbhash_int4_ops |
| (2 rows) |
| |
| set gp_use_legacy_hashops=on; |
| create table try_distinct_array (test_char varchar,test_array integer[]); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'test_char' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| insert into try_distinct_array select 'y',string_to_array('1~1','~')::int[]; |
| analyze try_distinct_array; |
| insert into try_distinct_array select 'n',string_to_array('1~1','~')::int[]; |
| -- Aggregate with grouping column that does not have legacy hashop |
| explain (costs off) select distinct test_array from try_distinct_array; |
| QUERY PLAN |
| ------------------------------------------------ |
| HashAggregate |
| Group Key: test_array |
| -> Gather Motion 3:1 (slice1; segments: 3) |
| -> Seq Scan on try_distinct_array |
| Optimizer: Pivotal Optimizer (GPORCA) |
| (5 rows) |
| |
| select distinct test_array from try_distinct_array; |
| test_array |
| ------------ |
| {1,1} |
| (1 row) |
| |
| -- Hash join on column that does not have legacy hashop |
| explain (costs off) select * from try_distinct_array a, try_distinct_array b where a.test_array=b.test_array; |
| QUERY PLAN |
| -------------------------------------------------------------------------------------------------------- |
| Hash Join |
| Hash Cond: ((a.test_array)::anyarray = (b.test_array)::anyarray) |
| -> Gather Motion 3:1 (slice1; segments: 3) |
| -> Seq Scan on try_distinct_array a |
| -> Hash |
| -> Gather Motion 3:1 (slice2; segments: 3) |
| -> Seq Scan on try_distinct_array b |
| Optimizer: Pivotal Optimizer (GPORCA) |
| (8 rows) |
| |
| select * from try_distinct_array a, try_distinct_array b where a.test_array=b.test_array; |
| test_char | test_array | test_char | test_array |
| -----------+------------+-----------+------------ |
| n | {1,1} | y | {1,1} |
| n | {1,1} | n | {1,1} |
| y | {1,1} | y | {1,1} |
| y | {1,1} | n | {1,1} |
| (4 rows) |
| |
| -- CTAS should use value of gp_use_legacy_hashops when setting the distribution policy based on an existing table |
| set gp_use_legacy_hashops=on; |
| create table ctas_base_legacy as select unnest(array[1,2,3]) as col distributed by (col); |
| set gp_use_legacy_hashops=off; |
| create table ctas_from_legacy as select * from ctas_base_legacy distributed by (col); |
| create table ctas_explicit_legacy as select * from ctas_base_legacy distributed by (col cdbhash_int4_ops); |
| create table ctas_base_nonlegacy as select unnest(array[1,2,3]) as col distributed by (col); |
| set gp_use_legacy_hashops=on; |
| create table ctas_from_nonlegacy as select * from ctas_base_nonlegacy distributed by (col); |
| create table ctas_explicit_nonlegacy as select * from ctas_base_nonlegacy distributed by (col int4_ops); |
| select dp.localoid::regclass as name, opc.opcname |
| from gp_distribution_policy dp |
| join pg_opclass opc |
| on ARRAY[opc.oid]::oidvector = dp.distclass |
| where dp.localoid in ('ctas_base_legacy'::regclass, |
| 'ctas_from_legacy'::regclass, |
| 'ctas_base_nonlegacy'::regclass, |
| 'ctas_from_nonlegacy'::regclass, |
| 'ctas_explicit_legacy'::regclass, |
| 'ctas_explicit_nonlegacy'::regclass); |
| name | opcname |
| -------------------------+------------------ |
| ctas_explicit_nonlegacy | int4_ops |
| ctas_base_nonlegacy | int4_ops |
| ctas_from_legacy | int4_ops |
| ctas_from_nonlegacy | cdbhash_int4_ops |
| ctas_explicit_legacy | cdbhash_int4_ops |
| ctas_base_legacy | cdbhash_int4_ops |
| (6 rows) |
| |
| select * from ctas_from_legacy where col=1; |
| col |
| ----- |
| 1 |
| (1 row) |
| |
| select * from ctas_explicit_legacy where col=1; |
| col |
| ----- |
| 1 |
| (1 row) |
| |
| select * from ctas_from_nonlegacy where col=1; |
| col |
| ----- |
| 1 |
| (1 row) |
| |
| select * from ctas_explicit_nonlegacy where col=1; |
| col |
| ----- |
| 1 |
| (1 row) |
| |