testdata/avro_schema_resolution/create_table.sql - impala - Git at Google

 -- Licensed to the Apache Software Foundation (ASF) under one
 -- or more contributor license agreements.  See the NOTICE file
 -- distributed with this work for additional information
 -- regarding copyright ownership.  The ASF licenses this file
 -- to you under the Apache License, Version 2.0 (the
 -- "License"); you may not use this file except in compliance
 -- with the License.  You may obtain a copy of the License at
 --
 --   http://www.apache.org/licenses/LICENSE-2.0
 --
 -- Unless required by applicable law or agreed to in writing,
 -- software distributed under the License is distributed on an
 -- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 -- KIND, either express or implied.  See the License for the
 -- specific language governing permissions and limitations
 -- under the License.

 -- NOTE: Queries in this file have to be executed against Hive and some won't work with
 -- Impala due to different type compatibility rules.

 USE functional_avro_snap;

 DROP TABLE IF EXISTS schema_resolution_test;

 -- Specify the Avro schema in SERDEPROPERTIES instead of TBLPROPERTIES to validate
 -- IMPALA-538. Also, give the table a different column definition (col1, col2) than what
 -- is defined in the Avro schema for testing mismatched table/deserializer schemas.
 CREATE EXTERNAL TABLE schema_resolution_test (col1 string, col2 string)
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
 WITH SERDEPROPERTIES ('avro.schema.literal'='{
 "name": "a",
 "type": "record",
 "fields": [
   {"name":"boolean1", "type":"boolean", "default": true},
   {"name":"int1",     "type":"int",     "default": 1},
   {"name":"long1",    "type":"long",    "default": 1},
   {"name":"float1",   "type":"float",   "default": 1.0},
   {"name":"double1",  "type":"double",  "default": 1.0},
   {"name":"date1",    "type": {"type": "int", "logicalType": "date"}, "default": 1},
   {"name":"string1",  "type":"string",  "default": "default string"},
   {"name":"string2",  "type": ["string", "null"],  "default": ""},
   {"name":"string3",  "type": ["null", "string"],  "default": null}
 ]}')
 STORED AS
 INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
 LOCATION '/test-warehouse/avro_schema_resolution_test/';

 LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records1.avro' OVERWRITE INTO TABLE schema_resolution_test;
 LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records2.avro' INTO TABLE schema_resolution_test;

 -- The following tables are used to test Impala's handling of HIVE-6308 which causes
 -- COMPUTE STATS and Hive's ANALYZE TABLE to fail for Avro tables with mismatched
 -- column definitions and Avro-schema columns. In such cases, COMPUTE STATS is expected to
 -- fail in analysis and not after actually computing stats (IMPALA-867).
 -- TODO: The creation of these tables should migrate into our regular data loading
 -- framework. There are issues with beeline for these CREATE TABLE stmts (NPEs).

 -- No explicit column definitions for an Avro table.
 DROP TABLE IF EXISTS alltypes_no_coldef;
 CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_no_coldef
 ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
 STORED AS
 INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
 LOCATION '/test-warehouse/alltypes_avro_snap'
 TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

 -- Column definition list has one more column than the Avro schema.
 DROP TABLE IF EXISTS alltypes_extra_coldef;
 CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_extra_coldef (
 id int,
 bool_col boolean,
 tinyint_col tinyint,
 smallint_col smallint,
 int_col int,
 bigint_col bigint,
 float_col float,
 double_col double,
 date_string_col string,
 string_col string,
 timestamp_col timestamp,
 extra_col string)
 PARTITIONED BY (year int, month int)
 STORED AS
 INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
 LOCATION '/test-warehouse/alltypes_avro_snap'
 TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

 -- Column definition list is missing 'tinyint_col' and 'timestamp_col' from the Avro schema.
 DROP TABLE IF EXISTS alltypes_missing_coldef;
 CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_missing_coldef (
 id int,
 bool_col boolean,
 smallint_col smallint,
 int_col int,
 bigint_col bigint,
 float_col float,
 double_col double,
 date_string_col string,
 string_col string)
 PARTITIONED BY (year int, month int)
 STORED AS
 INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
 LOCATION '/test-warehouse/alltypes_avro_snap'
 TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

 -- Matching number of columns and column names, but mismatched type (bigint_col is a string).
 DROP TABLE IF EXISTS alltypes_type_mismatch;
 CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_type_mismatch (
 id int,
 bool_col boolean,
 tinyint_col tinyint,
 smallint_col smallint,
 int_col int,
 bigint_col string,
 float_col float,
 double_col double,
 date_string_col string,
 string_col string,
 timestamp_col timestamp)
 PARTITIONED BY (year int, month int)
 STORED AS
 INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
 OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
 LOCATION '/test-warehouse/alltypes_avro_snap'
 TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

 -- IMPALA-2798, create two avro tables with same underlying data location, one table
 -- has an extra column at the end. Validate Impala doesn't use codegen decoding function
 -- if table schema is not the same as file schema.
 DROP TABLE IF EXISTS avro_coldef;
 CREATE EXTERNAL TABLE IF NOT EXISTS avro_coldef (
 bool_col boolean,
 tinyint_col int,
 smallint_col int,
 int_col int,
 bigint_col bigint,
 float_col float,
 double_col double,
 date_string_col string,
 string_col string,
 timestamp_col timestamp)
 PARTITIONED BY (year int, month int)
 STORED AS avro
 LOCATION '/test-warehouse/avro_coldef_snap'
 TBLPROPERTIES ('avro.schema.literal'='{
 "name": "a",
 "type": "record",
 "fields": [
   {"name":"bool_col", "type":"boolean"},
   {"name":"tinyint_col",     "type":"int"},
   {"name":"smallint_col",    "type":"int"},
   {"name":"int_col",   "type":"int"},
   {"name":"bigint_col",  "type":"long"},
   {"name":"float_col",  "type":"float"},
   {"name":"double_col",  "type": "double"},
   {"name":"date_string_col",  "type": "string"},
   {"name":"string_col",  "type": "string"},
   {"name":"timestamp_col",  "type": "long"}
 ]}');

 -- Reload existing partitions from HDFS. Without this, the overwrite will fail to remove
 -- any preexisting data files, which in turn will fail the query.
 MSCK REPAIR TABLE avro_coldef;

 INSERT OVERWRITE TABLE avro_coldef PARTITION(year=2014, month=1)
 SELECT bool_col, tinyint_col, smallint_col, int_col, bigint_col,
 float_col, double_col, date_string_col, string_col, timestamp_col
 FROM (select * from functional.alltypes order by id limit 5) a;

 DROP TABLE IF EXISTS avro_extra_coldef;
 CREATE EXTERNAL TABLE IF NOT EXISTS avro_extra_coldef (
 bool_col boolean,
 tinyint_col int,
 smallint_col int,
 int_col int,
 bigint_col bigint,
 float_col float,
 double_col double,
 date_string_col string,
 string_col string,
 timestamp_col timestamp,
 extra_col string)
 PARTITIONED BY (year int, month int)
 STORED AS avro
 LOCATION '/test-warehouse/avro_coldef_snap'
 TBLPROPERTIES ('avro.schema.literal'='{
 "name": "a",
 "type": "record",
 "fields": [
   {"name":"bool_col", "type":"boolean"},
   {"name":"tinyint_col",     "type":"int"},
   {"name":"smallint_col",    "type":"int"},
   {"name":"int_col",   "type":"int"},
   {"name":"bigint_col",  "type":"long"},
   {"name":"float_col",  "type":"float"},
   {"name":"double_col",  "type": "double"},
   {"name":"date_string_col",  "type": "string"},
   {"name":"string_col",  "type": "string"},
   {"name":"timestamp_col",  "type": "long"},
   {"name":"extra_col",  "type": "string", "default": "null"}
 ]}');

 -- Reload existing partitions from HDFS. Without this, the overwrite will fail to remove
 -- any preexisting data files, which in turn will fail the query.
 MSCK REPAIR TABLE avro_extra_coldef;

 INSERT OVERWRITE TABLE avro_extra_coldef PARTITION(year=2014, month=2)
 SELECT bool_col, tinyint_col, smallint_col, int_col, bigint_col,
 float_col, double_col, date_string_col, string_col,
 timestamp_col, "avro" AS extra_col FROM
 (select * from functional.alltypes order by id limit 5) a;

 -- Reload the partitions for the first table once again. This will make sure that the new
 -- partition from the second insert shows up in the first table, too.
 MSCK REPAIR TABLE avro_coldef;
	-- Licensed to the Apache Software Foundation (ASF) under one
	-- or more contributor license agreements. See the NOTICE file
	-- distributed with this work for additional information
	-- regarding copyright ownership. The ASF licenses this file
	-- to you under the Apache License, Version 2.0 (the
	-- "License"); you may not use this file except in compliance
	-- with the License. You may obtain a copy of the License at
	--
	-- http://www.apache.org/licenses/LICENSE-2.0
	--
	-- Unless required by applicable law or agreed to in writing,
	-- software distributed under the License is distributed on an
	-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	-- KIND, either express or implied. See the License for the
	-- specific language governing permissions and limitations
	-- under the License.

	-- NOTE: Queries in this file have to be executed against Hive and some won't work with
	-- Impala due to different type compatibility rules.

	USE functional_avro_snap;

	DROP TABLE IF EXISTS schema_resolution_test;

	-- Specify the Avro schema in SERDEPROPERTIES instead of TBLPROPERTIES to validate
	-- IMPALA-538. Also, give the table a different column definition (col1, col2) than what
	-- is defined in the Avro schema for testing mismatched table/deserializer schemas.
	CREATE EXTERNAL TABLE schema_resolution_test (col1 string, col2 string)
	ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
	WITH SERDEPROPERTIES ('avro.schema.literal'='{
	"name": "a",
	"type": "record",
	"fields": [
	{"name":"boolean1", "type":"boolean", "default": true},
	{"name":"int1", "type":"int", "default": 1},
	{"name":"long1", "type":"long", "default": 1},
	{"name":"float1", "type":"float", "default": 1.0},
	{"name":"double1", "type":"double", "default": 1.0},
	{"name":"date1", "type": {"type": "int", "logicalType": "date"}, "default": 1},
	{"name":"string1", "type":"string", "default": "default string"},
	{"name":"string2", "type": ["string", "null"], "default": ""},
	{"name":"string3", "type": ["null", "string"], "default": null}
	]}')
	STORED AS
	INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
	OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
	LOCATION '/test-warehouse/avro_schema_resolution_test/';

	LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records1.avro' OVERWRITE INTO TABLE schema_resolution_test;
	LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records2.avro' INTO TABLE schema_resolution_test;

	-- The following tables are used to test Impala's handling of HIVE-6308 which causes
	-- COMPUTE STATS and Hive's ANALYZE TABLE to fail for Avro tables with mismatched
	-- column definitions and Avro-schema columns. In such cases, COMPUTE STATS is expected to
	-- fail in analysis and not after actually computing stats (IMPALA-867).
	-- TODO: The creation of these tables should migrate into our regular data loading
	-- framework. There are issues with beeline for these CREATE TABLE stmts (NPEs).

	-- No explicit column definitions for an Avro table.
	DROP TABLE IF EXISTS alltypes_no_coldef;
	CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_no_coldef
	ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
	STORED AS
	INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
	OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
	LOCATION '/test-warehouse/alltypes_avro_snap'
	TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

	-- Column definition list has one more column than the Avro schema.
	DROP TABLE IF EXISTS alltypes_extra_coldef;
	CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_extra_coldef (
	id int,
	bool_col boolean,
	tinyint_col tinyint,
	smallint_col smallint,
	int_col int,
	bigint_col bigint,
	float_col float,
	double_col double,
	date_string_col string,
	string_col string,
	timestamp_col timestamp,
	extra_col string)
	PARTITIONED BY (year int, month int)
	STORED AS
	INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
	OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
	LOCATION '/test-warehouse/alltypes_avro_snap'
	TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

	-- Column definition list is missing 'tinyint_col' and 'timestamp_col' from the Avro schema.
	DROP TABLE IF EXISTS alltypes_missing_coldef;
	CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_missing_coldef (
	id int,
	bool_col boolean,
	smallint_col smallint,
	int_col int,
	bigint_col bigint,
	float_col float,
	double_col double,
	date_string_col string,
	string_col string)
	PARTITIONED BY (year int, month int)
	STORED AS
	INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
	OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
	LOCATION '/test-warehouse/alltypes_avro_snap'
	TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

	-- Matching number of columns and column names, but mismatched type (bigint_col is a string).
	DROP TABLE IF EXISTS alltypes_type_mismatch;
	CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_type_mismatch (
	id int,
	bool_col boolean,
	tinyint_col tinyint,
	smallint_col smallint,
	int_col int,
	bigint_col string,
	float_col float,
	double_col double,
	date_string_col string,
	string_col string,
	timestamp_col timestamp)
	PARTITIONED BY (year int, month int)
	STORED AS
	INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
	OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
	LOCATION '/test-warehouse/alltypes_avro_snap'
	TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');

	-- IMPALA-2798, create two avro tables with same underlying data location, one table
	-- has an extra column at the end. Validate Impala doesn't use codegen decoding function
	-- if table schema is not the same as file schema.
	DROP TABLE IF EXISTS avro_coldef;
	CREATE EXTERNAL TABLE IF NOT EXISTS avro_coldef (
	bool_col boolean,
	tinyint_col int,
	smallint_col int,
	int_col int,
	bigint_col bigint,
	float_col float,
	double_col double,
	date_string_col string,
	string_col string,
	timestamp_col timestamp)
	PARTITIONED BY (year int, month int)
	STORED AS avro
	LOCATION '/test-warehouse/avro_coldef_snap'
	TBLPROPERTIES ('avro.schema.literal'='{
	"name": "a",
	"type": "record",
	"fields": [
	{"name":"bool_col", "type":"boolean"},
	{"name":"tinyint_col", "type":"int"},
	{"name":"smallint_col", "type":"int"},
	{"name":"int_col", "type":"int"},
	{"name":"bigint_col", "type":"long"},
	{"name":"float_col", "type":"float"},
	{"name":"double_col", "type": "double"},
	{"name":"date_string_col", "type": "string"},
	{"name":"string_col", "type": "string"},
	{"name":"timestamp_col", "type": "long"}
	]}');

	-- Reload existing partitions from HDFS. Without this, the overwrite will fail to remove
	-- any preexisting data files, which in turn will fail the query.
	MSCK REPAIR TABLE avro_coldef;

	INSERT OVERWRITE TABLE avro_coldef PARTITION(year=2014, month=1)
	SELECT bool_col, tinyint_col, smallint_col, int_col, bigint_col,
	float_col, double_col, date_string_col, string_col, timestamp_col
	FROM (select * from functional.alltypes order by id limit 5) a;

	DROP TABLE IF EXISTS avro_extra_coldef;
	CREATE EXTERNAL TABLE IF NOT EXISTS avro_extra_coldef (
	bool_col boolean,
	tinyint_col int,
	smallint_col int,
	int_col int,
	bigint_col bigint,
	float_col float,
	double_col double,
	date_string_col string,
	string_col string,
	timestamp_col timestamp,
	extra_col string)
	PARTITIONED BY (year int, month int)
	STORED AS avro
	LOCATION '/test-warehouse/avro_coldef_snap'
	TBLPROPERTIES ('avro.schema.literal'='{
	"name": "a",
	"type": "record",
	"fields": [
	{"name":"bool_col", "type":"boolean"},
	{"name":"tinyint_col", "type":"int"},
	{"name":"smallint_col", "type":"int"},
	{"name":"int_col", "type":"int"},
	{"name":"bigint_col", "type":"long"},
	{"name":"float_col", "type":"float"},
	{"name":"double_col", "type": "double"},
	{"name":"date_string_col", "type": "string"},
	{"name":"string_col", "type": "string"},
	{"name":"timestamp_col", "type": "long"},
	{"name":"extra_col", "type": "string", "default": "null"}
	]}');

	-- Reload existing partitions from HDFS. Without this, the overwrite will fail to remove
	-- any preexisting data files, which in turn will fail the query.
	MSCK REPAIR TABLE avro_extra_coldef;

	INSERT OVERWRITE TABLE avro_extra_coldef PARTITION(year=2014, month=2)
	SELECT bool_col, tinyint_col, smallint_col, int_col, bigint_col,
	float_col, double_col, date_string_col, string_col,
	timestamp_col, "avro" AS extra_col FROM
	(select * from functional.alltypes order by id limit 5) a;

	-- Reload the partitions for the first table once again. This will make sure that the new
	-- partition from the second insert shows up in the first table, too.
	MSCK REPAIR TABLE avro_coldef;