| /* ----------------------------------------------------------------------- *//** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| * |
| *//* ----------------------------------------------------------------------- */ |
| |
| DROP TABLE IF EXISTS "TEST_s" CASCADE; |
| |
| CREATE TABLE "TEST_s"( |
| id1 INTEGER, |
| id2 INTEGER, |
| gr1 INTEGER, |
| gr2 INTEGER, |
| gr3 TEXT |
| ); |
| |
| INSERT INTO "TEST_s" VALUES |
| (1,0,1,1,'a'), |
| (1,0,1,1,'b'), |
| (1,0,1,2,'b'), |
| (1,0,1,2,'c'), |
| (1,0,1,2,'d'), |
| (1,0,1,2,'e'), |
| (1,0,1,5,'c'), |
| (1,0,1,6,'c'), |
| (2,0,1,6,'d'), |
| (3,0,1,1,'a'), |
| (4,0,1,1,'a'), |
| (5,0,1,1,'a'), |
| (6,0,1,1,'a'), |
| (7,0,1,1,'b'), |
| (8,0,1,1,'b'), |
| (9,0,1,1,'b'), |
| (19,0,1,1,'b'), |
| (29,0,1,1,'b'), |
| (39,0,1,1,'b'), |
| (0,1,1,2,'b'), |
| (0,2,1,2,'b'), |
| (0,3,1,2,'b'), |
| (0,4,1,2,'b'), |
| (0,5,1,2,'b'), |
| (0,6,1,2,'b'), |
| (10,10,2,2,'c'), |
| (20,20,2,2,'c'), |
| (30,30,2,2,'c'), |
| (40,40,2,2,'c'), |
| (50,50,2,2,'c'), |
| (60,60,2,2,'c'), |
| (70,70,2,2,'c'), |
| (10,10,5,5,'c'), |
| (50,50,5,5,'c'), |
| (88,88,5,5,'c'), |
| (40,40,5,6,'c'), |
| (50,50,5,6,'c'), |
| (60,60,5,6,'c'), |
| (70,70,5,6,'c'), |
| (10,10,6,6,'c'), |
| (60,60,6,6,'c'), |
| (30,30,6,6,'d'), |
| (40,40,6,6,'d'), |
| (50,50,6,6,'d'), |
| (60,60,6,6,'d'), |
| (70,70,6,6,'d'), |
| (50,50,4,2,'d'), |
| (60,60,4,2,'d'), |
| (70,70,4,2,'d'), |
| (50,50,3,2,'d'), |
| (60,60,3,2,'d'), |
| (70,70,3,2,'d'), |
| (500,50,NULL,2,'e'), |
| (600,60,NULL,2,'e'), |
| (700,70,NULL,2,'e') |
| ; |
| |
| -- SELECT gr1, count(*) AS c FROM "TEST_s" GROUP BY gr1; |
| -- gr1 | c |
| -- -----+---- |
| -- 4 | 3 |
| -- 1 | 18 |
| -- 5 | 7 |
| -- 3 | 3 |
| -- 6 | 7 |
| -- 2 | 7 |
| -- NULL | 3 |
| -- (6 rows) |
| |
| SELECT gr1, count(*) AS c FROM "TEST_s" GROUP BY gr1; |
| -- --- Test for random undersampling with replacement |
| DROP TABLE IF EXISTS out_sr2; |
| SELECT balance_sample('"TEST_s"', 'out_sr2', 'gr1', 'undersample ', NULL, NULL, TRUE, TRUE); |
| SELECT gr1, count(*) AS c FROM out_sr2 GROUP BY gr1; |
| -- SELECT gr1, count(*) AS c FROM out_sr2 GROUP BY gr1; |
| SELECT assert(count(*) = 0, 'Wrong number of samples on undersampling with replacement on gr1') FROM |
| (SELECT gr1, count(*) AS c FROM out_sr2 GROUP BY gr1) AS foo WHERE foo.c != 3; |
| |
| --- Test for random undersampling without replacement |
| DROP TABLE IF EXISTS out_s; |
| SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'undersample', NULL, 'gr2, gr3', FALSE); |
| SELECT * FROM out_s; |
| SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3; |
| DROP TABLE IF EXISTS out_s; |
| SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'oversample', NULL, 'gr2, gr3', FALSE); |
| SELECT * FROM out_s; |
| SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3; |
| DROP TABLE IF EXISTS out_s; |
| SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', 'uniform', NULL, 'gr2, gr3', FALSE); |
| SELECT * FROM out_s; |
| SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3; |
| DROP TABLE IF EXISTS out_s; |
| SELECT balance_sample('"TEST_s"', 'out_s', 'gr1', '1=3', NULL, 'gr2, gr3', FALSE); |
| SELECT * FROM out_s; |
| SELECT gr2, gr3, count(*) AS c FROM out_s GROUP BY gr3, gr2 ORDER BY gr2, gr3; |
| -- SELECT assert(count(*) = 0, 'Wrong number of samples on undersampling gr1') FROM |
| -- (SELECT gr1, gr2, count(*) AS c FROM out_s GROUP BY gr1, gr2) AS foo WHERE foo.c != 3; |
| |
| -- --- Test for random oversampling |
| DROP TABLE IF EXISTS out_or3; |
| SELECT balance_sample('"TEST_s"', 'out_or3', 'gr1', ' oVEr ', NULL, NULL); |
| SELECT gr1, count(*) AS c FROM out_or3 GROUP BY gr1; |
| SELECT assert(count(*) = 0, 'Wrong number of samples on oversampling') FROM |
| (SELECT gr1, count(*) AS c FROM out_or3 GROUP BY gr1) AS foo WHERE foo.c != 25; |
| |
| --- UNIFORM sampling |
| DROP TABLE IF EXISTS out_cd2; |
| SELECT balance_sample('"TEST_s"', 'out_cd2', 'gr1', 'Uniform', NULL, NULL); |
| SELECT gr1, count(*) AS c FROM out_cd2 GROUP BY gr1; |
| SELECT assert(count(*) = 0, 'Wrong number of samples on uniform sampling for gr1') FROM |
| (SELECT gr1, count(*) AS c FROM out_cd2 GROUP BY gr1) AS foo WHERE foo.c != 9; |
| |
| --- Default sampling should be uniform |
| DROP TABLE IF EXISTS out_cd3; |
| SELECT balance_sample('"TEST_s"', 'out_cd3', 'gr1', NULL, 100, NULL, NULL, TRUE); |
| SELECT gr1, count(*) AS c FROM out_cd3 GROUP BY gr1; |
| SELECT assert(count(*) = 0, 'Wrong number of samples on uniform sampling for gr1') FROM |
| (SELECT gr1, count(*) AS c FROM out_cd3 GROUP BY gr1) AS foo WHERE foo.c != 15; |
| |
| --- Only one class size is specified |
| DROP TABLE IF EXISTS out_cd4; |
| SELECT balance_sample('"TEST_s"', 'out_cd4', 'gr1', '2=10', NULL, NULL, TRUE); |
| SELECT gr1, count(*) AS c FROM out_cd4 GROUP BY gr1; |
| SELECT assert(count(*) = 10, 'Wrong number of samples on sampling with specified class sizes') from |
| out_cd4 where gr1 = 2; |
| SELECT assert(count(*) = 3, 'Wrong number of samples on sampling with specified class sizes') from |
| out_cd4 where gr1 = 3; |
| SELECT assert(count(*) = 25, 'Wrong number of samples on sampling with specified class sizes') from |
| out_cd4 where gr1 = 1; |
| |
| --- Multiple class sizes with comma delimited string |
| DROP TABLE IF EXISTS out_cd5; |
| SELECT balance_sample('"TEST_s"', 'out_cd5', 'gr1', '2= 10, 3=6, 1 = 10', 100, NULL); |
| select gr1, count(*) from out_cd5 group by gr1; |
| SELECT assert(count(*) >= 100, 'Wrong number of samples on sampling with comma-delimited list') from out_cd5; |
| SELECT assert(count(*) = 10, 'Wrong number of samples on sampling with comma-delimited list') from |
| out_cd5 where gr1 = 2; |
| SELECT assert(count(*) = 25, 'Wrong number of samples on sampling with comma-delimited list') from |
| out_cd5 where gr1 = 4; |
| SELECT assert(count(*) = 10, 'Wrong number of samples on sampling with comma-delimited list') from |
| out_cd5 where gr1 = 1; |
| SELECT assert(count(*) = 25, 'Wrong number of samples on sampling with comma-delimited list') from |
| out_cd5 where gr1 = 5; |
| SELECT assert(count(*) = 6, 'Wrong number of samples on sampling with comma-delimited list') from |
| out_cd5 where gr1 = 3; |