blob: dfe318c9be5d9d10cf697d3f7af1b2221a677663 [file] [log] [blame]
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.spark.sql.hive.execution
import java.util.{Locale, TimeZone}
import org.scalatest.BeforeAndAfter
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.hive.HiveUtils
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.tags.SlowHiveTest
* Runs the test cases that are included in the hive distribution.
class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
// TODO: bundle in jar files... get from classpath
private lazy val hiveQueryDir = TestHive.getHiveFile(
private val originalTimeZone = TimeZone.getDefault
private val originalLocale = Locale.getDefault
private val originalColumnBatchSize = TestHive.conf.columnBatchSize
private val originalInMemoryPartitionPruning = TestHive.conf.inMemoryPartitionPruning
private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled
private val originalSessionLocalTimeZone = TestHive.conf.sessionLocalTimeZone
def testCases: Seq[(String, File)] = { => f.getName.stripSuffix(".q") -> f)
override def beforeAll() {
// Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*)
// Add Locale setting
// Set a relatively small column batch size for testing purposes
TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, 5)
// Enable in-memory partition pruning for testing purposes
// Ensures that cross joins are enabled so that we can test them
TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true)
// Fix session local timezone to America/Los_Angeles for those timezone sensitive tests
// (timestamp_*)
TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles")
override def afterAll() {
try {
TestHive.setConf(SQLConf.COLUMN_BATCH_SIZE, originalColumnBatchSize)
TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning)
TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone)
// For debugging dump some statistics about how much time was spent in various optimizer rules
} finally {
/** A list of tests deemed out of scope currently and thus completely disregarded. */
override def blackList: Seq[String] = Seq(
// These tests use hooks that are not on the classpath and thus break all subsequent execution.
// Setting a default property does not seem to get reset and thus changes the answer for many
// subsequent tests.
// User/machine specific test answers, breaks the caching mechanism.
// "describe_table",
// DFS commands
// Weird DDL differences result in failures on jenkins.
// This test is totally fine except that it includes wrong queries and expects errors, but
// the error message formats in Hive and Spark SQL differ. Should work around this later.
// We can cast dates like '2015-03-18' to a timestamp and extract the seconds.
// Hive returns null for second('2015-03-18')
// We can cast dates like '2015-03-18' to a timestamp and extract the minutes.
// Hive returns null for minute('2015-03-18')
// Can't run without local map/reduce.
// Hive seems to think 1.0 > NaN = true && 1.0 < NaN = false... which is wrong.
// Tests that seem to never complete on Hive...
// These tests fail and exit the JVM.
// Uses a serde that isn't on the classpath... breaks other tests.
// Avro tests seem to change the output format permanently, thus breaking the answer cache.
// Until we figure out why this is the case, let's just ignore all Avro-related tests.
// Unique joins are weird and will require a lot of hacks (see comments in hive parser).
// Hive seems to get the wrong answer on some outer joins. MySQL agrees with catalyst.
// No support for multi-alias i.e. udf as (e1, e2, e3).
// No support for TestSerDe (not published afaik)
// No support for unpublished test udfs.
// Hive does not support buckets.
// We have our own tests based on these query files.
// Fails in hive with authorization errors.
// Hadoop version specific tests
// No support for case sensitivity in resolution using Hive properties at the moment.
// Flaky test, Hive sometimes returns different set of 10 rows.
// Since we stopped taking the `stringOrError` route, exceptions are thrown from these cases.
// See SPARK-2129 for details.
// Returning the result of a describe state as a JSON object is not supported.
// Hive returns the results of describe as plain text. Comments with multiple lines
// introduce extra lines in the Hive results, which make the result comparison fail.
// Limit clause without an ordering, which causes failure.
// Requires precision decimal support:
// The table src(key INT, value STRING) is not the same as in the Hive unit tests. In Hive it
// is src(key STRING, value STRING), and reflect.q fails in Integer.valueOf, which expects
// the first argument to be passed as STRING type, not INT.
// Sort with Limit clause causes failure.
// timestamp in array, the output format of Hive contains double quotes, while
// Spark SQL doesn't
// It has a bug and it has been fixed by
// (in Hive 0.14 and trunk).
// These tests were broken by the hive client isolation PR.
"nullformatCTAS", // SPARK-7411: need to finish CTAS parser
// The isolated classloader seemed to make some of our test reset mechanisms less robust.
"combine1", // This test changes compression settings in a way that breaks all subsequent tests.
"load_dyn_part14.*", // These work alone but fail when run with other tests...
// The answer is sensitive to the JDK version
// Spark SQL uses Long for TimestampType and loses precision below 1us
// Hive returns string from UTC formatted timestamp, spark returns timestamp type
// Can't compare results that have newlines in them
// Unlike Hive, we do support log base in (0, 1.0], therefore disable this
// Trivial changes to DDL output
// Odd changes to output
// Unsupported underscore syntax.
// Thrift is broken...
// Hive changed ordering of ddl:
// Parser changes in Hive 1.2
// Uses invalid table name
// classpath problems
// The difference between the double numbers generated by Hive and Spark
// can be ignored (e.g., 0.6633880657639323 and 0.6633880657639322)
// Feature removed in HIVE-11145
// Hive returns null rather than NaN when n = 1
// The implementation of GROUPING__ID in Hive is wrong (it does not match the documentation).
// Spark parser treats numerical literals differently: it creates decimals instead of doubles.
// These tests check the VIEW table definition, but Spark handles CREATE VIEW itself and
// generates different View Expanded Text.
// We don't support show create table commands in general
// These tests try to change how a table is bucketed, which we don't support
// These tests try to create a table with bucketed columns, which we don't support
// These tests try to create a table with skewed columns, which we don't support
// This test tries to create a table like with TBLPROPERTIES clause, which we don't support.
// Index commands are not supported
// Macro commands are not supported
// Create partitioned view is not supported
// This uses CONCATENATE, which we don't support
// TOUCH is not supported
// INPUTDRIVER and OUTPUTDRIVER are not supported
// We have converted the useful parts of these tests to tests
// in org.apache.spark.sql.hive.execution.SQLQuerySuite.
// The following fails due to describe extended.
// The following fails due to alter table partitions with predicate.
// The following fails due to truncate table
// We do not support DFS command.
// We have converted the useful parts of these tests to tests
// in org.apache.spark.sql.hive.execution.SQLQuerySuite.
// These tests use EXPLAIN FORMATTED, which is not supported
// This test uses CREATE EXTERNAL TABLE without specifying LOCATION
// [SPARK-16248][SQL] Whitelist the list of Hive fallback functions
// These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
// This test assumes we parse scientific decimals as doubles (we parse them as decimals)
// These tests are duplicates of joinXYZ
// These tests are based on Hive's hash function, which is different from Spark's
* The set of tests that are believed to be working in catalyst. Tests not on whiteList or
* blacklist are implicitly marked as ignored.
override def whiteList: Seq[String] = Seq(
// "udf_array", -- done in array.sql
// "udf_array_contains", -- done in array.sql