blob: 64a10febbd5c7bc1661ca8856ebc64f43eeb5a0f [file] [log] [blame]
-- Test using different operator classes in DISTRIBUTED BY.
--
-- Test joins involving tables with distribution keys using non-default
-- hash opclasses.
--
-- For the tests, we define our own equality operator called |=|, which
-- compares the absolute values. For example -1 |=| 1.
CREATE FUNCTION abseq(int, int) RETURNS BOOL AS $$
begin return abs($1) = abs($2); end;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE OPERATOR |=| (
PROCEDURE = abseq,
LEFTARG = int,
RIGHTARG = int,
COMMUTATOR = |=|,
hashes, merges);
-- and a hash opclass to back it.
CREATE FUNCTION abshashfunc(int) RETURNS int AS $$
begin return abs($1); end;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE OPERATOR FAMILY abs_int_hash_ops USING hash;
CREATE OPERATOR CLASS abs_int_hash_ops FOR TYPE int4
USING hash FAMILY abs_int_hash_ops AS
OPERATOR 1 |=|,
FUNCTION 1 abshashfunc(int);
-- we need a btree opclass too. Otherwise the planner won't consider
-- collocation of joins.
CREATE FUNCTION abslt(int, int) RETURNS BOOL AS $$
begin return abs($1) < abs($2); end;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE OPERATOR |<| (PROCEDURE = abslt, LEFTARG = int, RIGHTARG = int, hashes);
CREATE FUNCTION absgt(int, int) RETURNS BOOL AS $$
begin return abs($1) > abs($2); end;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE OPERATOR |>| (PROCEDURE = absgt, LEFTARG = int, RIGHTARG = int, hashes);
CREATE FUNCTION abscmp(int, int) RETURNS int AS $$
begin return btint4cmp(abs($1),abs($2)); end;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE OPERATOR CLASS abs_int_btree_ops FOR TYPE int4
USING btree AS
OPERATOR 1 |<|,
OPERATOR 3 |=|,
OPERATOR 5 |>|,
FUNCTION 1 abscmp(int, int);
-- Create test tables. At first, use the default opclasses, suitable
-- for the normal = equality operator.
CREATE TABLE abstab_a (a int) DISTRIBUTED BY (a);
INSERT INTO abstab_a VALUES (-1), (0), (1);
CREATE TABLE abstab_b (b int) DISTRIBUTED BY (b);
INSERT INTO abstab_b VALUES (-1), (0), (1), (2);
ANALYZE abstab_a;
ANALYZE abstab_b;
-- The default opclass isn't helpful with the |=| operator, so this needs
-- a Motion.
EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (abstab_b.b |=| abstab_a.a)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: abstab_b.b
-> Seq Scan on abstab_b
-> Hash
-> Redistribute Motion 3:3 (slice3; segments: 3)
Hash Key: abstab_a.a
-> Seq Scan on abstab_a
Optimizer: Pivotal Optimizer (GPORCA) version 3.93.0
(11 rows)
SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b;
a | b
----+----
0 | 0
1 | -1
1 | 1
-1 | -1
-1 | 1
(5 rows)
-- Change distribution key of abstab_a to use our fancy opclass.
DROP TABLE abstab_a;
CREATE TABLE abstab_a (a int) DISTRIBUTED BY (a abs_int_hash_ops);
INSERT INTO abstab_a VALUES (-1), (0), (1);
ANALYZE abstab_a;
-- The other side is still distributed using the default opclass,
-- so we still need a Motion.
EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (abstab_a.a |=| abstab_b.b)
-> Seq Scan on abstab_a
-> Hash
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: abstab_b.b
-> Seq Scan on abstab_b
Optimizer: Postgres query optimizer
(9 rows)
SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b;
a | b
----+----
-1 | 1
-1 | -1
0 | 0
1 | 1
1 | -1
(5 rows)
-- Change distribution policy on abstab_a to match. (Drop and recreate, rather
-- than use ALTER TABLE, so that both CREATE and ALTER TABLE get tested.)
ALTER TABLE abstab_b SET DISTRIBUTED BY (b abs_int_hash_ops);
-- Now we should be able to answer this query without a Redistribute Motion
EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b;
QUERY PLAN
------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (abstab_a.a |=| abstab_b.b)
-> Seq Scan on abstab_a
-> Hash
-> Seq Scan on abstab_b
Optimizer: Postgres query optimizer
(7 rows)
SELECT a, b FROM abstab_a, abstab_b WHERE a |=| b;
a | b
----+----
-1 | -1
-1 | 1
0 | 0
1 | -1
1 | 1
(5 rows)
-- On the other hand, a regular equality join now needs a Redistribute Motion
-- (Given the way our hash function is defined, it could actually be used for
-- normal equality too. But the planner has no way to know that.)
EXPLAIN (COSTS OFF) SELECT a, b FROM abstab_a, abstab_b WHERE a = b;
QUERY PLAN
------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3)
-> Hash Join
Hash Cond: (abstab_b.b = abstab_a.a)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: abstab_b.b
-> Seq Scan on abstab_b
-> Hash
-> Redistribute Motion 3:3 (slice3; segments: 3)
Hash Key: abstab_a.a
-> Seq Scan on abstab_a
Optimizer: Postgres query optimizer
(11 rows)
SELECT a, b FROM abstab_a, abstab_b WHERE a = b;
a | b
----+----
-1 | -1
0 | 0
1 | 1
(3 rows)
-- Check dependency. The table should be marked as dependent on the operator
-- class, which prevents dropping it.
DROP OPERATOR CLASS abs_int_hash_ops USING hash;
ERROR: cannot drop operator class abs_int_hash_ops for access method hash because other objects depend on it
DETAIL: table abstab_b depends on operator class abs_int_hash_ops for access method hash
table abstab_a depends on operator class abs_int_hash_ops for access method hash
HINT: Use DROP ... CASCADE to drop the dependent objects too.
-- With CASCADE, though, the table should be dropped too. (It would be nicer
-- if it just changed the distribution policy to randomly distributed, but
-- that's not how it works currently.)
DROP OPERATOR CLASS abs_int_hash_ops USING hash CASCADE;
NOTICE: drop cascades to 2 other objects
DETAIL: drop cascades to table abstab_b
drop cascades to table abstab_a
-- Recreate the opclass, and a table using it again. This is just so that we
-- leave behind a custom operator class and a table that uses it in the
-- distribution key, so that it gets tested by pg_upgrade tests that run
-- on the regression database, after all the tests.
CREATE OPERATOR CLASS abs_int_hash_ops FOR TYPE int4
USING hash FAMILY abs_int_hash_ops AS
OPERATOR 1 |=|,
FUNCTION 1 abshashfunc(int);
CREATE TABLE abs_opclass_test (i int, t text, j int)
DISTRIBUTED BY (i abs_int_hash_ops, t, j abs_int_hash_ops);
INSERT INTO abs_opclass_test VALUES
(-1, 'minus one', 1),
(0, 'zero', 0),
(1, 'one', 1);
-- Test deparsing of the non-default opclass
\d abs_opclass_test
Table "public.abs_opclass_test"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
i | integer | | |
t | text | | |
j | integer | | |
Distributed by: (i abs_int_hash_ops, t, j abs_int_hash_ops)
--
-- Test interaction between unique and primary key indexes and distribution keys.
--
-- should fail, because the default index opclass is not compatible with the |=| operator
CREATE UNIQUE INDEX ON abs_opclass_test (i, j, t);
ERROR: UNIQUE index must contain all columns in the table's distribution key
DETAIL: Operator class abs_int_hash_ops of distribution key column "i" is not compatible with operator class int4_ops used in the constraint.
ALTER TABLE abs_opclass_test ADD PRIMARY KEY (i, j, t);
ERROR: PRIMARY KEY definition must contain all columns in the table's distribution key
DETAIL: Operator class abs_int_hash_ops of distribution key column "i" is not compatible with operator class int4_ops used in the constraint.
-- but this is allowed. (There is no syntax to specify the opclasses with ADD PRIMARY KEY)
CREATE UNIQUE INDEX ON abs_opclass_test (i abs_int_btree_ops, j abs_int_btree_ops, t);
-- ALTER TABLE should perform the same tests
ALTER TABLE abs_opclass_test SET DISTRIBUTED BY (i, j); -- not allowed
ERROR: distribution policy is not compatible with UNIQUE index "abs_opclass_test_i_j_t_idx"
DETAIL: Operator class int4_ops of distribution key column "i" is not compatible with operator class abs_int_btree_ops used in the constraint.
ALTER TABLE abs_opclass_test SET DISTRIBUTED BY (i abs_int_hash_ops, j abs_int_hash_ops);
-- Exclusion constraints work similarly. We can enforce them, as long as the
-- exclusion ops are the same equality ops as used in the distribution key.
--
-- That may seem a bit pointless, but there are some use cases for it. For
-- example, imagine that you have a calendar system, where you can book rooms.
-- If you distribute the bookings table by room number, you could have an
-- exclusion constraint on (room_no WITH =, reservation WITH &&), to enforce
-- that there are no overlapping reservations for the same room.
--
-- We can't use that exact example here, without the 'btree_gist' extension
-- that would provide the = gist opclass for basic types. So we use a more
-- contrived example using IP addresses rather than rooms.
CREATE TABLE ip_reservations (ip_addr inet, reserved tsrange) DISTRIBUTED BY (ip_addr);
-- these are not allowed
ALTER TABLE ip_reservations ADD EXCLUDE USING gist (reserved WITH &&);
ERROR: exclusion constraint is not compatible with the table's distribution policy
DETAIL: Distribution key column "ip_addr" is not included in the constraint.
HINT: Add "ip_addr" to the constraint with the =(inet,inet) operator.
ALTER TABLE ip_reservations ADD EXCLUDE USING gist (ip_addr inet_ops WITH &&);
ERROR: exclusion constraint is not compatible with the table's distribution policy
DETAIL: Distribution key column "ip_addr" is not included in the constraint.
HINT: Add "ip_addr" to the constraint with the =(inet,inet) operator.
-- but this is.
ALTER TABLE ip_reservations ADD EXCLUDE USING gist (ip_addr inet_ops WITH =, reserved WITH &&);
-- new distribution is incompatible with the constraint.
ALTER TABLE ip_reservations SET DISTRIBUTED BY (reserved);
ERROR: distribution policy is not compatible with exclusion constraint "ip_reservations_ip_addr_reserved_excl"
DETAIL: Distribution key column "reserved" is not included in the constraint.
HINT: Add "reserved" to the constraint with the =(anyrange,anyrange) operator.
-- After dropping the constraint, it's allowed.
ALTER TABLE ip_reservations DROP CONSTRAINT ip_reservations_ip_addr_reserved_excl;
ALTER TABLE ip_reservations SET DISTRIBUTED BY (reserved);
-- Test creating exclusion constraint on tsrange column. (The subtle
-- difference is there is no direct =(tsrange, tsrange) operator, we rely on
-- the implicit casts for it)
ALTER TABLE ip_reservations ADD EXCLUDE USING gist (reserved WITH =);
--
-- Test scenario, where a type has a hash operator class, but not a default
-- one.
--
-- Doesn't work, because 'point' doesn't have a hash opclass
CREATE TABLE dist_by_point1(p point) DISTRIBUTED BY (p);
ERROR: data type point has no default operator class for access method "hash"
HINT: You must specify an operator class or define a default operator class for the data type.
-- Neither does the same via ALTER TABLE
CREATE TABLE dist_by_point2(p point) DISTRIBUTED RANDOMLY;
ALTER TABLE dist_by_point2 SET DISTRIBUTED BY (p);
ERROR: data type point has no default operator class for access method "hash"
HINT: You must specify an operator class or define a default operator class for the data type.
-- But if we create it one, then we can use it.
CREATE FUNCTION pointhashfunc(point) RETURNS int AS $$
begin return hashfloat8(point_distance(point(0,0), $1)); end;
$$ LANGUAGE plpgsql STRICT IMMUTABLE;
CREATE OPERATOR CLASS point_hash_ops FOR TYPE point
USING hash AS
OPERATOR 1 ~=,
FUNCTION 1 pointhashfunc(point);
-- This still doesn't work, because there's still no *default* opclass.
CREATE TABLE dist_by_point3(p point) DISTRIBUTED BY (p);
ERROR: data type point has no default operator class for access method "hash"
HINT: You must specify an operator class or define a default operator class for the data type.
CREATE TABLE dist_by_point3(p point) DISTRIBUTED RANDOMLY;
ALTER TABLE dist_by_point3 SET DISTRIBUTED BY (p);
ERROR: data type point has no default operator class for access method "hash"
HINT: You must specify an operator class or define a default operator class for the data type.
-- But explicitly specifying the opclass works.
CREATE TABLE dist_by_point4(p point) DISTRIBUTED BY (p point_hash_ops);
ALTER TABLE dist_by_point4 SET DISTRIBUTED RANDOMLY;
ALTER TABLE dist_by_point4 SET DISTRIBUTED BY (p point_hash_ops);