| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| package org.apache.impala.analysis; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertNotNull; |
| |
| import java.util.Set; |
| |
| import org.apache.curator.shaded.com.google.common.collect.Sets; |
| import org.apache.impala.catalog.Catalog; |
| import org.apache.impala.catalog.Column; |
| import org.apache.impala.catalog.ColumnStats; |
| import org.apache.impala.catalog.DatabaseNotFoundException; |
| import org.apache.impala.catalog.Db; |
| import org.apache.impala.catalog.Table; |
| import org.apache.impala.common.AnalysisSessionFixture; |
| import org.apache.impala.common.ImpalaException; |
| import org.apache.impala.common.InternalException; |
| import org.apache.impala.common.QueryFixture.SelectFixture; |
| import org.apache.impala.planner.CardinalityTest; |
| import org.junit.Test; |
| |
| /** |
| * Tests expression cardinality and selectivity, both of which are |
| * important inputs to scan and join cardinality estimates. |
| * |
| * In the comments below, the notation |x| means the cardinality of |
| * x. If x is a table, then it is the row count of x. If x is a column, |
| * then it is the number of distinct values (the cardinality of the |
| * domain of the column), also known as NDV. |
| * |
| * This test focuses on cardinality and the selectivity that determines |
| * derived cardinality. If |T| is the cardinality of table T, then |
| * |T'| is defined as the cardinality of table T after applying a selection s. |
| * Selectivity is defined as: |
| * |
| * sel(s) = |T'|/|T| |
| * |
| * Or |
| * |
| * |T'| = |T| * sel(s) |
| * |
| * Though not used here, it can be helpful to think of the selectivity as |
| * the probability p that some row r appears in the output after selection: |
| * |
| * sel(s) = p(r in |T'|) |
| * |
| * Tests here focus on the entire cardinality and NDV lifecycle up to an |
| * expression, ensuring that we produce proper overall estimates. See also: |
| * |
| * * {@link ExprNdvTest} which focuses on the actual NDV calculation |
| * method, |
| * * {@link CardinalityTest} which examines cardinality output from the |
| * planner. |
| * |
| * The tests here illustrate a number of known bugs, typically marked by |
| * their ticket number. IMPALA-7601 is a roll-up for the general case that |
| * Impala does not estimate selectivity except in the narrow (col = const) |
| * case. |
| */ |
| public class ExprCardinalityTest { |
| private static AnalysisSessionFixture session_ = new AnalysisSessionFixture(); |
| |
| private void verifyTableCol(Table table, String colName, |
| long expectedNdv, long expectedNullCount) { |
| Column col = table.getColumn(colName); |
| assertNotNull(col); |
| ColumnStats stats = col.getStats(); |
| assertNotNull(stats); |
| assertEquals(expectedNdv, stats.getNumDistinctValues()); |
| assertEquals(expectedNullCount, stats.getNumNulls()); |
| } |
| |
| /** |
| * Baseline test of metadata cardinality, NDVs and null count. |
| * Locks down the values used in later tests to catch external changes |
| * easily. |
| * |
| * Cases: |
| * - With stats |
| * - Columns without nulls |
| * - Columns with nulls |
| * - Without stats, estimated from file size and schema |
| * |
| * (The last bit is not yet available.) |
| */ |
| |
| @Test |
| public void testMetadata() throws DatabaseNotFoundException, InternalException { |
| Catalog catalog = session_.catalog(); |
| Db db = catalog.getDb("functional"); |
| StmtMetadataLoader mdLoader = |
| new StmtMetadataLoader(session_.frontend(), "functional", null); |
| Set<TableName> tables = Sets.newHashSet( |
| new TableName("functional", "alltypes"), |
| new TableName("functional", "nullrows"), |
| new TableName("functional", "manynulls")); |
| mdLoader.loadTables(tables); |
| |
| // Table with stats, no nulls |
| Table allTypes = db.getTable("alltypes"); |
| assertEquals(7300, allTypes.getTTableStats().getNum_rows()); |
| verifyTableCol(allTypes, "id", 7300, 0); |
| verifyTableCol(allTypes, "bool_col", 2, 0); |
| verifyTableCol(allTypes, "int_col", 10, 0); |
| // Bug: NDV of partition columns is -1 though it is listed as |
| // 2 in the shell with: SHOW COLUMN STATS alltypes |
| //verifyTableCol(allTypes, "year", 2, 0); |
| // Bug: When tests are run in Eclipse we get the result above. |
| // But, when the same test is run using maven from the command line, |
| // we get the result shown below. |
| // Unit test in Eclipse see the above, unit tests run from the |
| // Disabling both to avoid a flaky test, |
| // Same issue for the next three tests. |
| //verifyTableCol(allTypes, "year", -1, -1); |
| //verifyTableCol(allTypes, "month", 12, 0); |
| //verifyTableCol(allTypes, "month", -1, -1); |
| |
| // Table with stats and nulls |
| Table nullrows = db.getTable("nullrows"); |
| assertEquals(26, nullrows.getTTableStats().getNum_rows()); |
| verifyTableCol(nullrows, "id", 26, 0); |
| // Bug: NDV should be 1 to include nulls |
| verifyTableCol(nullrows, "null_str", 0, 26); |
| verifyTableCol(nullrows, "group_str", 6, 0); |
| verifyTableCol(nullrows, "some_nulls", 6, 20); |
| // Oddly, boolean columns DO include nulls in NDV. |
| verifyTableCol(nullrows, "bool_nulls", 3, 15); |
| |
| // Table without stats |
| Table manynulls = db.getTable("manynulls"); |
| // Bug: Table cardinality should be guessed from schema & file size. |
| assertEquals(-1, manynulls.getTTableStats().getNum_rows()); |
| verifyTableCol(manynulls, "id", -1, -1); |
| } |
| |
| public void verifySelectCol(String table, String col, |
| long expectedNdv, long expectedNullCount) throws ImpalaException { |
| SelectFixture select = new SelectFixture(session_) |
| .table("functional." + table) |
| .exprSql(col); |
| Expr expr = select.analyzeExpr(); |
| SlotRef colRef = (SlotRef) expr; |
| assertEquals(expectedNdv, expr.getNumDistinctValues()); |
| assertEquals(expectedNullCount, colRef.getDesc().getStats().getNumNulls()); |
| // Columns don't have selectivity, only expressions on columns |
| assertEquals(-1, expr.getSelectivity(), 0.001); |
| } |
| |
| /** |
| * Test cardinality of the column references within an AST. |
| * Ensures that the metadata cardinality was propagated into the |
| * AST, along with possible adjustments. |
| * |
| * Cases: |
| * - With stats |
| * - Normal NDV |
| * - Small NDV |
| * - Small NDV with nulls |
| * - NDV with all nulls |
| * - Constants |
| * - Without stats |
| * @throws ImpalaException |
| */ |
| @Test |
| public void testColumnCardinality() throws ImpalaException { |
| // Stats, no null values |
| verifySelectCol("alltypes", "id", 7300, 0); |
| verifySelectCol("alltypes", "bool_col", 2, 0); |
| verifySelectCol("alltypes", "int_col", 10, 0); |
| // Bug: Stats not available for partition columns |
| //verifySelectExpr("alltypes", "year", 2, 0); |
| // Bug: Unit test in Eclipse see the above, unit tests run from the |
| // command line see the below. Disabling to avoid a flaky test, |
| // here and below. |
| //verifySelectExpr("alltypes", "year", -1, -1); |
| //verifySelectExpr("alltypes", "month", 12, 0); |
| //verifySelectExpr("alltypes", "month", -1, -1); |
| |
| // Stats, with null values |
| verifySelectCol("nullrows", "id", 26, 0); |
| // Bug: NDV should be 1 to include nulls |
| verifySelectCol("nullrows", "null_str", 0, 26); |
| verifySelectCol("nullrows", "group_str", 6, 0); |
| verifySelectCol("nullrows", "some_nulls", 6, 20); |
| // Oddly, boolean columns DO include nulls in NDV. |
| verifySelectCol("nullrows", "bool_nulls", 3, 15); |
| |
| // No stats |
| verifySelectCol("manynulls", "id", -1, -1); |
| } |
| |
| public void verifySelectExpr(String table, String exprSql, |
| long expectedNdv, double expectedSel) throws ImpalaException { |
| SelectFixture select = new SelectFixture(session_) |
| .table("functional." + table) |
| .exprSql(exprSql); |
| Expr expr = select.analyzeExpr(); |
| assertEquals(expectedNdv, expr.getNumDistinctValues()); |
| assertEquals(expectedSel, expr.getSelectivity(), 0.00001); |
| } |
| |
| /** |
| * Constants have an NDV of 1, selectivity of -1. |
| */ |
| @Test |
| public void testConstants() throws ImpalaException { |
| verifySelectExpr("alltypes", "10", 1, -1); |
| verifySelectExpr("allTypes", "'foo'", 1, -1); |
| // Note that the constant NULL has an NDV = 1, but |
| // Null-only columns have an NDV=0... |
| // See IMPALA-8058 |
| verifySelectExpr("alltypes", "NULL", 1, -1); |
| verifySelectExpr("alltypes", "true", 1, -1); |
| } |
| |
| // Expression selectivity |
| // - Test for each expression type |
| // - Test for variety of situations |
| // - Valid/invalid table cardinality |
| // - Valid/invalid NDV |
| // - Valid/invalid null count |
| |
| /** |
| * Test col = const |
| * |
| * selectivity = 1 / |col| |
| */ |
| @Test |
| public void testEqSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "id = 10", 3, 1.0/7300); |
| verifySelectExpr("alltypes", "bool_col = true", 3, 1.0/2); |
| verifySelectExpr("alltypes", "int_col = 10", 3, 1.0/10); |
| |
| verifySelectExpr("nullrows", "id = 'foo'", 3, 1.0/26); |
| // Bug: All nulls, so NDV should = 1, so Sel should be 1.0/1 |
| //verifySelectExpr("nullrows", "c = 'foo'", 3, 1.0/1); |
| verifySelectExpr("nullrows", "null_str = 'foo'", 3, -1); |
| verifySelectExpr("nullrows", "group_str = 'foo'", 3, 1.0/6); |
| //verifySelectExpr("nullrows", "some_nulls = 'foo'", 3, 1.0/7); |
| verifySelectExpr("nullrows", "some_nulls = 'foo'", 3, 1.0/6); |
| |
| // Bug: Sel should default to good old 0.1 |
| verifySelectExpr("manynulls", "id = 10", 3, -1); |
| } |
| |
| /** |
| * Test col IS NOT DISTINCT FROM x |
| * |
| * Sel should be same as = if x is non-null, otherwise |
| * same as IS NULL |
| */ |
| @Test |
| public void testNotDistinctSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "id is not distinct from 10", 3, 1.0/7300); |
| // Bug: does not treat NULL specially |
| // Bug: NDV sould be 2 since IS NOT DISTINCT won't return NULL |
| //verifySelectExpr("alltypes", "id is not distinct from null", 2, 0); |
| verifySelectExpr("alltypes", "id is not distinct from null", 3, 1.0/7300); |
| verifySelectExpr("alltypes", "bool_col is not distinct from true", 3, 1.0/2); |
| //verifySelectExpr("alltypes", "bool_col is not distinct from null", 2, 0); |
| verifySelectExpr("alltypes", "bool_col is not distinct from null", 3, 1.0/2); |
| verifySelectExpr("alltypes", "int_col is not distinct from 10", 3, 1.0/10); |
| //verifySelectExpr("alltypes", "int_col is not distinct from null", 2, 0); |
| verifySelectExpr("alltypes", "int_col is not distinct from null", 3, 1.0/10); |
| |
| verifySelectExpr("nullrows", "id is not distinct from 'foo'", 3, 1.0/26); |
| //verifySelectExpr("nullrows", "id is not distinct from null", 2, 0); |
| verifySelectExpr("nullrows", "id is not distinct from null", 3, 1.0/26); |
| // Bug: All nulls, so NDV should = 1, so Sel should be 1.0/1 |
| //verifySelectExpr("nullrows", "null_str is not distinct from 'foo'", 2, 1.0/1); |
| verifySelectExpr("nullrows", "null_str is not distinct from 'foo'", 3, -1); |
| verifySelectExpr("nullrows", "null_str is not distinct from null", 3, -1); |
| verifySelectExpr("nullrows", "group_str is not distinct from 'foo'", 3, 1.0/6); |
| //verifySelectExpr("nullrows", "group_str is not distinct from null", 2, 1); |
| verifySelectExpr("nullrows", "group_str is not distinct from null", 3, 1.0/6); |
| //verifySelectExpr("nullrows", "some_nulls is not distinct from 'foo'", 3, 1.0/7); |
| verifySelectExpr("nullrows", "some_nulls is not distinct from 'foo'", 3, 1.0/6); |
| |
| // Bug: Sel should default to good old 0.1 |
| verifySelectExpr("manynulls", "id is not distinct from 10", 3, -1); |
| } |
| |
| /** |
| * Test col != const |
| */ |
| @Test |
| public void testNeSelectivity() throws ImpalaException { |
| // Bug: No estimated selectivity for != (IMPALA-8039) |
| //verifySelectExpr("alltypes", "id != 10", 3, 1 - 1.0/7300); |
| verifySelectExpr("alltypes", "id != 10", 3, -1); |
| //verifySelectExpr("alltypes", "bool_col != true", 3, 1 - 1.0/2); |
| verifySelectExpr("alltypes", "bool_col != true", 3, -1); |
| //verifySelectExpr("alltypes", "int_col != 10", 3, 1 - 1.0/10); |
| verifySelectExpr("alltypes", "int_col != 10", 3, -1); |
| |
| //verifySelectExpr("nullrows", "id != 'foo'", 3, 1 - 1.0/26); |
| verifySelectExpr("nullrows", "id != 'foo'", 3, -1); |
| // Bug: All nulls, so NDV should = 1, so Sel should be 1 - 1.0/1 |
| //verifySelectExpr("nullrows", "null_str != 'foo'", 3, 1 - 1.0/1); |
| verifySelectExpr("nullrows", "null_str != 'foo'", 3, -1); |
| //verifySelectExpr("nullrows", "group_str != 'foo'", 3, 1 - 1.0/6); |
| verifySelectExpr("nullrows", "group_str != 'foo'", 3, -1); |
| //verifySelectExpr("nullrows", "some_nulls != 'foo'", 3, 1 - 1.0/7); |
| verifySelectExpr("nullrows", "some_nulls != 'foo'", 3, -1); |
| |
| // Bug: Sel should default to 1 - good old 0.1 |
| verifySelectExpr("manynulls", "id != 10", 3, -1); |
| } |
| |
| /** |
| * Test col IS DISTINCT FROM x |
| * |
| * Sel should be 1 - Sel(col IS NOT DISTINCT FROM x) |
| */ |
| @Test |
| public void testDistinctSelectivity() throws ImpalaException { |
| // BUG: IS DISTINCT has no selectivity |
| //verifySelectExpr("alltypes", "id is distinct from 10", 3, 1 - 1.0/7300); |
| verifySelectExpr("alltypes", "id is distinct from 10", 3, -1); |
| // Bug: does not treat NULL specially |
| // Bug: NDV sould be 2 since IS DISTINCT won't return NULL |
| //verifySelectExpr("alltypes", "id is distinct from null", 2, 1); |
| verifySelectExpr("alltypes", "id is distinct from null", 3, -1); |
| //verifySelectExpr("alltypes", "bool_col is distinct from true", 3, 1 - 1.0/2); |
| verifySelectExpr("alltypes", "bool_col is distinct from true", 3, -1); |
| //verifySelectExpr("alltypes", "bool_col is distinct from null", 2, 1); |
| verifySelectExpr("alltypes", "bool_col is distinct from null", 3, -1); |
| //verifySelectExpr("alltypes", "int_col is distinct from 10", 3, 1 - 1.0/10); |
| verifySelectExpr("alltypes", "int_col is distinct from 10", 3, -1); |
| //verifySelectExpr("alltypes", "int_col is distinct from null", 2, 1); |
| verifySelectExpr("alltypes", "int_col is distinct from null", 3, -1); |
| |
| //verifySelectExpr("nullrows", "id is distinct from 'foo'", 3, 1 - 1.0/26); |
| verifySelectExpr("nullrows", "id is distinct from 'foo'", 3, -1); |
| //verifySelectExpr("nullrows", "id is distinct from null", 2, 1); |
| verifySelectExpr("nullrows", "id is distinct from null", 3, -1); |
| // Bug: All nulls, so NDV should = 1, so Sel should be 1.0/1 |
| //verifySelectExpr("nullrows", "null_str is distinct from 'foo'", 2, 1 - 1.0/1); |
| verifySelectExpr("nullrows", "null_str is distinct from 'foo'", 3, -1); |
| verifySelectExpr("nullrows", "null_str is distinct from null", 3, -1); |
| //verifySelectExpr("nullrows", "group_str is distinct from 'foo'", 3, 1 - 1.0/6); |
| verifySelectExpr("nullrows", "group_str is distinct from 'foo'", 3, -1); |
| //verifySelectExpr("nullrows", "group_str is distinct from null", 2, 0); |
| verifySelectExpr("nullrows", "group_str is distinct from null", 3, -1); |
| //verifySelectExpr("nullrows", "some_nulls is not distinct from 'foo'", 3, 1 - 1.0/7); |
| verifySelectExpr("nullrows", "some_nulls is not distinct from 'foo'", 3, 1.0/6); |
| |
| // Bug: Sel should default to 1 - good old 0.1 |
| verifySelectExpr("manynulls", "id is distinct from 10", 3, -1); |
| } |
| |
| public static final double INEQUALITY_SEL = 0.33; |
| |
| private void verifyInequalitySel(String table, String col, String value) |
| throws ImpalaException { |
| for (String op : new String[] { "<", "<=", ">", ">="}) { |
| // Bug: No estimated selectivity for >, >=, <, <= (IMPALA-7603) |
| //verifySelectExpr(table, col + " " + op + " " + value, 3, INEQUALITY_SEL); |
| verifySelectExpr(table, col + " " + op + " " + value, 3, -1); |
| } |
| } |
| |
| @Test |
| public void testInequalitySelectivity() throws ImpalaException { |
| verifyInequalitySel("alltypes", "id", "10"); |
| verifyInequalitySel("alltypes", "int_col", "10"); |
| |
| verifyInequalitySel("nullrows", "id", "'foo'"); |
| verifyInequalitySel("nullrows", "null_str", "'foo'"); |
| verifyInequalitySel("nullrows", "group_str", "'foo'"); |
| verifyInequalitySel("nullrows", "some_nulls", "'foo'"); |
| |
| // Bug: Sel should default to 1 - good old 0.1 |
| verifyInequalitySel("manynulls", "id", "10"); |
| } |
| |
| /** |
| * Test col IS NULL |
| * Selectivity should be null_count / |table| |
| */ |
| @Test |
| public void testIsNullSelectivity() throws ImpalaException { |
| // Bug: No estimated selectivity for IS NULL (IMPALA-8050) |
| // Should be null count / |table| |
| // Bug: NDV of IS NULL is 3, should be 2 since IS NULL will |
| // never itself return NULL |
| //verifySelectExpr("alltypes", "id is null", 2, 0); |
| verifySelectExpr("alltypes", "id is null", 3, -1); |
| //verifySelectExpr("alltypes", "bool_col is null", 2, 0); |
| verifySelectExpr("alltypes", "bool_col is null", 3, -1); |
| //verifySelectExpr("alltypes", "int_col is null", 2, 0); |
| verifySelectExpr("alltypes", "int_col is null", 3, -1); |
| |
| //verifySelectExpr("nullrows", "id is null", 2, 0); |
| verifySelectExpr("nullrows", "id is null", 3, -1); |
| //verifySelectExpr("nullrows", "null_str is null", 2, 1); |
| verifySelectExpr("nullrows", "null_str is null", 3, 1); |
| //verifySelectExpr("nullrows", "group_str is null", 2, 0); |
| verifySelectExpr("nullrows", "group_str is null", 3, -1); |
| //verifySelectExpr("nullrows", "some_nulls is null", 2, 20.0/26); |
| verifySelectExpr("nullrows", "some_nulls is null", 3, 20.0/26); |
| verifySelectExpr("nullrows", "bool_nulls is not null", 3, 1 - 15.0/26); |
| |
| // Bug: Sel should default to good old 0.1 |
| verifySelectExpr("manynulls", "id is null", 3, -1); |
| } |
| |
| /** |
| * Test col IS NOT NULL |
| * Selectivity should be 1 - null_count / |table| |
| */ |
| @Test |
| public void testNotNullSelectivity() throws ImpalaException { |
| // Bug: No estimated selectivity for IS NOT NULL (IMPALA-8050) |
| // Should be 1 - null count / |table| |
| // Bug: NDV of IS NULL is 3, should be 2 since IS NOT NULL will |
| // never itself return NULL |
| //verifySelectExpr("alltypes", "id is not null", 2, 1); |
| verifySelectExpr("alltypes", "id is null", 3, -1); |
| //verifySelectExpr("alltypes", "bool_col is not null", 2, 1); |
| verifySelectExpr("alltypes", "bool_col is null", 3, -1); |
| //verifySelectExpr("alltypes", "int_col is not null", 2, 1); |
| verifySelectExpr("alltypes", "int_col is not null", 3, -1); |
| |
| //verifySelectExpr("nullrows", "id is not null", 2, 1); |
| verifySelectExpr("nullrows", "id is not null", 3, -1); |
| //verifySelectExpr("nullrows", "null_str is not null", 2, 0); |
| verifySelectExpr("nullrows", "null_str is not null", 3, 0); |
| //verifySelectExpr("nullrows", "group_str is not null", 2, 1); |
| verifySelectExpr("nullrows", "group_str is not null", 3, -1); |
| //verifySelectExpr("nullrows", "some_nulls is not null", 2, 1 - 20.0/26); |
| verifySelectExpr("nullrows", "some_nulls is not null", 3, 1 - 20.0/26); |
| verifySelectExpr("nullrows", "bool_nulls is not null", 3, 1 - 15.0/26); |
| |
| // Bug: Sel should default to good old 0.1 |
| verifySelectExpr("manynulls", "id is not null", 3, -1); |
| } |
| |
| /** |
| * Test col IN (a, b, c) |
| * |
| * The code should check only distinct values, so that |
| * |in| = NDV(in clause) |
| * |
| * Expected selectivity is |in| / |col| |
| * |
| * Where |col| = ndv(col) |
| * |
| * Estimate should be based on the "Containment" assumption: that the |
| * in-clause values are contained in the set of column values |
| */ |
| @Test |
| public void testInSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "id in (1, 2, 3)", 3, 3.0/7300); |
| // Bug: Does not use NDV, just simple value count |
| //verifySelectExpr("alltypes", "id in (1, 2, 3, 2, 3, 1)", 3, 3.0/7300); |
| verifySelectExpr("alltypes", "id in (1, 2, 3, 2, 3, 1)", 3, 6.0/7300); |
| verifySelectExpr("alltypes", "bool_col in (true)", 3, 1.0/2); |
| verifySelectExpr("alltypes", "bool_col in (true, false)", 3, 2.0/2); |
| verifySelectExpr("alltypes", "int_col in (1, 2, 3)", 3, 3.0/10); |
| verifySelectExpr("alltypes", |
| "int_col in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)", 3, 1); |
| |
| verifySelectExpr("nullrows", "id in ('a', 'b', 'c')", 3, 3.0/26); |
| // Bug: Why -1? |
| //verifySelectExpr("nullrows", "null_str in ('a', 'b', 'c')", 3, 1); |
| verifySelectExpr("nullrows", "null_str in ('a', 'b', 'c')", 3, -1); |
| verifySelectExpr("nullrows", "group_str in ('a', 'b', 'c')", 3, 3.0/6); |
| //verifySelectExpr("nullrows", "some_nulls in ('a', 'b', 'c')", 3, 3.0/7); |
| verifySelectExpr("nullrows", "some_nulls in ('a', 'b', 'c')", 3, 3.0/6); |
| |
| // Bug: Sel should default to good old 0.1 |
| verifySelectExpr("manynulls", "id in (1, 3, 3)", 3, -1); |
| } |
| |
| /** |
| * Test col NOT IN (a, b, c) |
| * |
| * Should be 1 = sel(col IN (a, b, c)) |
| */ |
| @Test |
| public void testNotInSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "id not in (1, 2, 3)", 3, 1 - 3.0/7300); |
| // Bug: Does not use NDV, just simple value count |
| //verifySelectExpr("alltypes", "id not in (1, 2, 3, 2, 3, 1)", 3, 1 - 3.0/7300); |
| verifySelectExpr("alltypes", "id not in (1, 2, 3, 2, 3, 1)", 3, 1 - 6.0/7300); |
| verifySelectExpr("alltypes", "bool_col not in (true)", 3, 1 - 1.0/2); |
| verifySelectExpr("alltypes", "bool_col not in (true, false)", 3, 1 - 2.0/2); |
| verifySelectExpr("alltypes", "int_col not in (1, 2, 3)", 3, 1 - 3.0/10); |
| verifySelectExpr("alltypes", |
| "int_col not in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)", 3, 0); |
| |
| verifySelectExpr("nullrows", "id not in ('a', 'b', 'c')", 3, 1 - 3.0/26); |
| // Bug: Why -1? |
| //verifySelectExpr("nullrows", "null_str not in ('a', 'b', 'c')", 3, 1); |
| verifySelectExpr("nullrows", "null_str not in ('a', 'b', 'c')", 3, -1); |
| verifySelectExpr("nullrows", "group_str not in ('a', 'b', 'c')", 3, 1 - 3.0/6); |
| //verifySelectExpr("nullrows", "some_nulls not in ('a', 'b', 'c')", 3, 1 - 3.0/7); |
| verifySelectExpr("nullrows", "some_nulls not in ('a', 'b', 'c')", 3, 1 - 3.0/6); |
| |
| // Bug: Sel should default to 1 - good old 0.1 |
| verifySelectExpr("manynulls", "id not in (1, 3, 3)", 3, -1); |
| } |
| |
| @Test |
| public void testNotSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "not id in (1, 2, 3)", 3, 1 - 3.0/7300); |
| verifySelectExpr("alltypes", "not int_col in (1, 2)", 3, 1 - 2.0/10); |
| verifySelectExpr("alltypes", "not int_col = 10", 3, 1 - 1.0/10); |
| |
| // Bug: Sel should default to 1 - good old 0.1 |
| //verifySelectExpr("manynulls", "not id = 10", 3, 0.9); |
| verifySelectExpr("manynulls", "not id = 10", 3, -1); |
| } |
| |
| @Test |
| public void testAndSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "bool_col = true", 3, 1.0/2); |
| verifySelectExpr("alltypes", "int_col = 10", 3, 1.0/10); |
| // Note: This is NOT the logic used in plan nodes! |
| verifySelectExpr("alltypes", "bool_col = true and int_col = 10", 3, 1.0/2 * 1.0/10); |
| // Bug: should be something like (1/3)^2 |
| //verifySelectExpr("alltypes", "int_col >= 10 and int_col <= 20", 3, 0.11); |
| verifySelectExpr("alltypes", "int_col >= 10 and int_col <= 20", 3, -1); |
| |
| // Bug: Should be a product of two estimates. |
| // But, the -1 from the inequality poisons the whole expression |
| //verifySelectExpr("alltypes", "int_col = 10 AND smallint_col > 20", |
| // 3, 1.0/10 * 0.33); |
| verifySelectExpr("alltypes", "int_col = 10 AND smallint_col > 20", 3, -1); |
| } |
| |
| @Test |
| public void testOrSelectivity() throws ImpalaException { |
| verifySelectExpr("alltypes", "bool_col = true or int_col = 10", |
| 3, 1.0/2 + 1.0/10 - 1.0/2 * 1.0/10); |
| // Chain of OR rewritten to IN |
| verifySelectExpr("alltypes", "int_col = 10 or int_col = 20", 3, 2.0/10); |
| } |
| |
| /** |
| * Test col BETWEEN x and y. Rewritten to |
| * col >= x AND col <= y. Inequality should have an estimate. Since |
| * the expression is an AND, we multipley the two estimates. |
| * So, regardless of NDV and null count, selectivity should be |
| * something like 0.33^2. |
| */ |
| @Test |
| public void testBetweenSelectivity() throws ImpalaException { |
| // Bug: NO selectivity for Between because it is rewritten to |
| // use inequalities, and there no selectivities for those |
| // See IMPALA-8042 |
| //verifySelectExpr("alltypes", "id between 30 and 60", 3, 0.33 * 0.33); |
| verifySelectExpr("alltypes", "id between 30 and 60", 3, -1); |
| //verifySelectExpr("alltypes", "int_col between 30 and 60", 3, 0.33 * 0.33); |
| verifySelectExpr("alltypes", "int_col between 30 and 60", 3, -1); |
| |
| // Should not matter that there are no stats |
| //verifySelectExpr("manynulls", "id between 30 and 60", 3, 0.33 * 0.33); |
| verifySelectExpr("manynulls", "id between 30 and 60", 3, -1); |
| } |
| |
| /** |
| * Test col NOT BETWEEN x and y. Should be 1 - sel(col BETWEEN x and y). |
| */ |
| @Test |
| public void testNotBetweenSelectivity() throws ImpalaException { |
| // Bug: NO selectivity for Not Between because it is rewritten to |
| // use inequalities, and there no selectivities for those |
| //verifySelectExpr("alltypes", "id not between 30 and 60", 3, 1 - 0.33 * 0.33); |
| verifySelectExpr("alltypes", "id not between 30 and 60", 3, -1); |
| //verifySelectExpr("alltypes", "int_col not between 30 and 60", 3, 1 - 0.33 * 0.33); |
| verifySelectExpr("alltypes", "int_col not between 30 and 60", 3, -1); |
| |
| // Should not matter that there are no stats |
| //verifySelectExpr("manynulls", "id not between 30 and 60", 3, 1 - 0.33 * 0.33); |
| verifySelectExpr("manynulls", "id not between 30 and 60", 3, -1); |
| } |
| } |