| -- Test using different operator classes in DISTRIBUTED BY. |
| -- |
| -- Test joins involving tables with distribution keys using non-default |
| -- hash opclasses. |
| -- |
| -- For the tests, we define our own equality operator called |=|, which |
| -- compares the absolute values. For example -1 |=| 1. |
| CREATE FUNCTION abseq(int, int) RETURNS BOOL AS $$ |
| begin return abs($1) = abs($2); end; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| CREATE OPERATOR |=| ( |
| PROCEDURE = abseq, |
| LEFTARG = int, |
| RIGHTARG = int, |
| COMMUTATOR = |=|, |
| hashes, merges); |
| -- and a hash opclass to back it. |
| CREATE FUNCTION abshashfunc(int) RETURNS int AS $$ |
| begin return abs($1); end; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| CREATE OPERATOR FAMILY abs_int_hash_ops USING hash; |
| CREATE OPERATOR CLASS abs_int_hash_ops FOR TYPE int4 |
| USING hash FAMILY abs_int_hash_ops AS |
| OPERATOR 1 |=|, |
| FUNCTION 1 abshashfunc(int); |
| -- we need a btree opclass too. Otherwise the planner won't consider |
| -- collocation of joins. |
| CREATE FUNCTION abslt(int, int) RETURNS BOOL AS $$ |
| begin return abs($1) < abs($2); end; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| CREATE OPERATOR |<| (PROCEDURE = abslt, LEFTARG = int, RIGHTARG = int, hashes); |
| CREATE FUNCTION absgt(int, int) RETURNS BOOL AS $$ |
| begin return abs($1) > abs($2); end; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| CREATE OPERATOR |>| (PROCEDURE = absgt, LEFTARG = int, RIGHTARG = int, hashes); |
| CREATE FUNCTION abscmp(int, int) RETURNS int AS $$ |
| begin return btint4cmp(abs($1),abs($2)); end; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| CREATE OPERATOR CLASS abs_int_btree_ops FOR TYPE int4 |
| USING btree AS |
| OPERATOR 1 |<|, |
| OPERATOR 3 |=|, |
| OPERATOR 5 |>|, |
| FUNCTION 1 abscmp(int, int); |
| -- Create test tables. At first, use the default opclasses, suitable |
| -- for the normal = equality operator. |
| CREATE TABLE abstab_a (a int) DISTRIBUTED BY (a); |
| INSERT INTO abstab_a VALUES (-1), (0), (1); |
| CREATE TABLE abstab_b (b int) DISTRIBUTED BY (b); |
| INSERT INTO abstab_b VALUES (-1), (0), (1), (2); |
| ANALYZE abstab_a; |
| ANALYZE abstab_b; |
| -- The default opclass isn't helpful with the |=| operator, so this needs |
| -- a Motion. |
| EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b; |
| QUERY PLAN |
| ------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (abstab_b.b |=| abstab_a.a) |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: abstab_b.b |
| -> Seq Scan on abstab_b |
| -> Hash |
| -> Redistribute Motion 3:3 (slice3; segments: 3) |
| Hash Key: abstab_a.a |
| -> Seq Scan on abstab_a |
| Optimizer: Pivotal Optimizer (GPORCA) version 3.93.0 |
| (11 rows) |
| |
| SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b; |
| a | b |
| ----+---- |
| 0 | 0 |
| 1 | -1 |
| 1 | 1 |
| -1 | -1 |
| -1 | 1 |
| (5 rows) |
| |
| -- Change distribution key of abstab_a to use our fancy opclass. |
| DROP TABLE abstab_a; |
| CREATE TABLE abstab_a (a int) DISTRIBUTED BY (a abs_int_hash_ops); |
| INSERT INTO abstab_a VALUES (-1), (0), (1); |
| ANALYZE abstab_a; |
| -- The other side is still distributed using the default opclass, |
| -- so we still need a Motion. |
| EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b; |
| QUERY PLAN |
| ------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (abstab_a.a |=| abstab_b.b) |
| -> Seq Scan on abstab_a |
| -> Hash |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: abstab_b.b |
| -> Seq Scan on abstab_b |
| Optimizer: Postgres query optimizer |
| (9 rows) |
| |
| SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b; |
| a | b |
| ----+---- |
| -1 | 1 |
| -1 | -1 |
| 0 | 0 |
| 1 | 1 |
| 1 | -1 |
| (5 rows) |
| |
| -- Change distribution policy on abstab_a to match. (Drop and recreate, rather |
| -- than use ALTER TABLE, so that both CREATE and ALTER TABLE get tested.) |
| ALTER TABLE abstab_b SET DISTRIBUTED BY (b abs_int_hash_ops); |
| -- Now we should be able to answer this query without a Redistribute Motion |
| EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b; |
| QUERY PLAN |
| ------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (abstab_a.a |=| abstab_b.b) |
| -> Seq Scan on abstab_a |
| -> Hash |
| -> Seq Scan on abstab_b |
| Optimizer: Postgres query optimizer |
| (7 rows) |
| |
| SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b; |
| a | b |
| ----+---- |
| -1 | -1 |
| -1 | 1 |
| 0 | 0 |
| 1 | -1 |
| 1 | 1 |
| (5 rows) |
| |
| -- On the other hand, a regular equality join now needs a Redistribute Motion |
| -- (Given the way our hash function is defined, it could actually be used for |
| -- normal equality too. But the planner has no way to know that.) |
| EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a = b; |
| QUERY PLAN |
| ------------------------------------------------------------------ |
| Gather Motion 3:1 (slice1; segments: 3) |
| -> Hash Join |
| Hash Cond: (abstab_b.b = abstab_a.a) |
| -> Redistribute Motion 3:3 (slice2; segments: 3) |
| Hash Key: abstab_b.b |
| -> Seq Scan on abstab_b |
| -> Hash |
| -> Redistribute Motion 3:3 (slice3; segments: 3) |
| Hash Key: abstab_a.a |
| -> Seq Scan on abstab_a |
| Optimizer: Postgres query optimizer |
| (11 rows) |
| |
| SELECT a, b FROM abstab_a, abstab_b WHERE a = b; |
| a | b |
| ----+---- |
| -1 | -1 |
| 0 | 0 |
| 1 | 1 |
| (3 rows) |
| |
| -- Check dependency. The table should be marked as dependent on the operator |
| -- class, which prevents dropping it. |
| DROP OPERATOR CLASS abs_int_hash_ops USING hash; |
| ERROR: cannot drop operator class abs_int_hash_ops for access method hash because other objects depend on it |
| DETAIL: table abstab_b depends on operator class abs_int_hash_ops for access method hash |
| table abstab_a depends on operator class abs_int_hash_ops for access method hash |
| HINT: Use DROP ... CASCADE to drop the dependent objects too. |
| -- With CASCADE, though, the table should be dropped too. (It would be nicer |
| -- if it just changed the distribution policy to randomly distributed, but |
| -- that's not how it works currently.) |
| DROP OPERATOR CLASS abs_int_hash_ops USING hash CASCADE; |
| NOTICE: drop cascades to 2 other objects |
| DETAIL: drop cascades to table abstab_b |
| drop cascades to table abstab_a |
| -- Recreate the opclass, and a table using it again. This is just so that we |
| -- leave behind a custom operator class and a table that uses it in the |
| -- distribution key, so that it gets tested by pg_upgrade tests that run |
| -- on the regression database, after all the tests. |
| CREATE OPERATOR CLASS abs_int_hash_ops FOR TYPE int4 |
| USING hash FAMILY abs_int_hash_ops AS |
| OPERATOR 1 |=|, |
| FUNCTION 1 abshashfunc(int); |
| CREATE TABLE abs_opclass_test (i int, t text, j int) |
| DISTRIBUTED BY (i abs_int_hash_ops, t, j abs_int_hash_ops); |
| INSERT INTO abs_opclass_test VALUES |
| (-1, 'minus one', 1), |
| (0, 'zero', 0), |
| (1, 'one', 1); |
| -- Test deparsing of the non-default opclass |
| \d abs_opclass_test |
| Table "public.abs_opclass_test" |
| Column | Type | Collation | Nullable | Default |
| --------+---------+-----------+----------+--------- |
| i | integer | | | |
| t | text | | | |
| j | integer | | | |
| Distributed by: (i abs_int_hash_ops, t, j abs_int_hash_ops) |
| |
| -- |
| -- Test interaction between unique and primary key indexes and distribution keys. |
| -- |
| -- should fail, because the default index opclass is not compatible with the |=| operator |
| CREATE UNIQUE INDEX ON abs_opclass_test (i, j, t); |
| ERROR: UNIQUE index must contain all columns in the table's distribution key |
| DETAIL: Operator class abs_int_hash_ops of distribution key column "i" is not compatible with operator class int4_ops used in the constraint. |
| ALTER TABLE abs_opclass_test ADD PRIMARY KEY (i, j, t); |
| ERROR: PRIMARY KEY definition must contain all columns in the table's distribution key |
| DETAIL: Operator class abs_int_hash_ops of distribution key column "i" is not compatible with operator class int4_ops used in the constraint. |
| -- but this is allowed. (There is no syntax to specify the opclasses with ADD PRIMARY KEY) |
| CREATE UNIQUE INDEX ON abs_opclass_test (i abs_int_btree_ops, j abs_int_btree_ops, t); |
| -- ALTER TABLE should perform the same tests |
| ALTER TABLE abs_opclass_test SET DISTRIBUTED BY (i, j); -- not allowed |
| ERROR: distribution policy is not compatible with UNIQUE index "abs_opclass_test_i_j_t_idx" |
| DETAIL: Operator class int4_ops of distribution key column "i" is not compatible with operator class abs_int_btree_ops used in the constraint. |
| ALTER TABLE abs_opclass_test SET DISTRIBUTED BY (i abs_int_hash_ops, j abs_int_hash_ops); |
| -- Exclusion constraints work similarly. We can enforce them, as long as the |
| -- exclusion ops are the same equality ops as used in the distribution key. |
| -- |
| -- That may seem a bit pointless, but there are some use cases for it. For |
| -- example, imagine that you have a calendar system, where you can book rooms. |
| -- If you distribute the bookings table by room number, you could have an |
| -- exclusion constraint on (room_no WITH =, reservation WITH &&), to enforce |
| -- that there are no overlapping reservations for the same room. |
| -- |
| -- We can't use that exact example here, without the 'btree_gist' extension |
| -- that would provide the = gist opclass for basic types. So we use a more |
| -- contrived example using IP addresses rather than rooms. |
| CREATE TABLE ip_reservations (ip_addr inet, reserved tsrange) DISTRIBUTED BY (ip_addr); |
| -- these are not allowed |
| ALTER TABLE ip_reservations ADD EXCLUDE USING gist (reserved WITH &&); |
| ERROR: exclusion constraint is not compatible with the table's distribution policy |
| DETAIL: Distribution key column "ip_addr" is not included in the constraint. |
| HINT: Add "ip_addr" to the constraint with the =(inet,inet) operator. |
| ALTER TABLE ip_reservations ADD EXCLUDE USING gist (ip_addr inet_ops WITH &&); |
| ERROR: exclusion constraint is not compatible with the table's distribution policy |
| DETAIL: Distribution key column "ip_addr" is not included in the constraint. |
| HINT: Add "ip_addr" to the constraint with the =(inet,inet) operator. |
| -- but this is. |
| ALTER TABLE ip_reservations ADD EXCLUDE USING gist (ip_addr inet_ops WITH =, reserved WITH &&); |
| -- new distribution is incompatible with the constraint. |
| ALTER TABLE ip_reservations SET DISTRIBUTED BY (reserved); |
| ERROR: distribution policy is not compatible with exclusion constraint "ip_reservations_ip_addr_reserved_excl" |
| DETAIL: Distribution key column "reserved" is not included in the constraint. |
| HINT: Add "reserved" to the constraint with the =(anyrange,anyrange) operator. |
| -- After dropping the constraint, it's allowed. |
| ALTER TABLE ip_reservations DROP CONSTRAINT ip_reservations_ip_addr_reserved_excl; |
| ALTER TABLE ip_reservations SET DISTRIBUTED BY (reserved); |
| -- Test creating exclusion constraint on tsrange column. (The subtle |
| -- difference is there is no direct =(tsrange, tsrange) operator, we rely on |
| -- the implicit casts for it) |
| ALTER TABLE ip_reservations ADD EXCLUDE USING gist (reserved WITH =); |
| -- |
| -- Test scenario, where a type has a hash operator class, but not a default |
| -- one. |
| -- |
| -- Doesn't work, because 'point' doesn't have a hash opclass |
| CREATE TABLE dist_by_point1(p point) DISTRIBUTED BY (p); |
| ERROR: data type point has no default operator class for access method "hash" |
| HINT: You must specify an operator class or define a default operator class for the data type. |
| -- Neither does the same via ALTER TABLE |
| CREATE TABLE dist_by_point2(p point) DISTRIBUTED RANDOMLY; |
| ALTER TABLE dist_by_point2 SET DISTRIBUTED BY (p); |
| ERROR: data type point has no default operator class for access method "hash" |
| HINT: You must specify an operator class or define a default operator class for the data type. |
| -- But if we create it one, then we can use it. |
| CREATE FUNCTION pointhashfunc(point) RETURNS int AS $$ |
| begin return hashfloat8(point_distance(point(0,0), $1)); end; |
| $$ LANGUAGE plpgsql STRICT IMMUTABLE; |
| CREATE OPERATOR CLASS point_hash_ops FOR TYPE point |
| USING hash AS |
| OPERATOR 1 ~=, |
| FUNCTION 1 pointhashfunc(point); |
| -- This still doesn't work, because there's still no *default* opclass. |
| CREATE TABLE dist_by_point3(p point) DISTRIBUTED BY (p); |
| ERROR: data type point has no default operator class for access method "hash" |
| HINT: You must specify an operator class or define a default operator class for the data type. |
| CREATE TABLE dist_by_point3(p point) DISTRIBUTED RANDOMLY; |
| ALTER TABLE dist_by_point3 SET DISTRIBUTED BY (p); |
| ERROR: data type point has no default operator class for access method "hash" |
| HINT: You must specify an operator class or define a default operator class for the data type. |
| -- But explicitly specifying the opclass works. |
| CREATE TABLE dist_by_point4(p point) DISTRIBUTED BY (p point_hash_ops); |
| ALTER TABLE dist_by_point4 SET DISTRIBUTED RANDOMLY; |
| ALTER TABLE dist_by_point4 SET DISTRIBUTED BY (p point_hash_ops); |