blob: 1f66fc3bab9884600c1c96134ebedbd060b475f4 [file] [log] [blame]
PREHOOK: query: -- Ensure it works if skewed column is not the first column in the table columns
-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
-- SORT_QUERY_RESULTS
-- test where the skewed values are more than 1 say columns no. 2 and 4 in a table with 5 columns
create table list_bucketing_mul_col (col1 String, col2 String, col3 String, col4 String, col5 string)
partitioned by (ds String, hr String)
skewed by (col2, col4) on (('466','val_466'),('287','val_287'),('82','val_82'))
stored as DIRECTORIES
STORED AS RCFILE
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@list_bucketing_mul_col
POSTHOOK: query: -- Ensure it works if skewed column is not the first column in the table columns
-- INCLUDE_HADOOP_MAJOR_VERSIONS(0.23)
-- SORT_QUERY_RESULTS
-- test where the skewed values are more than 1 say columns no. 2 and 4 in a table with 5 columns
create table list_bucketing_mul_col (col1 String, col2 String, col3 String, col4 String, col5 string)
partitioned by (ds String, hr String)
skewed by (col2, col4) on (('466','val_466'),('287','val_287'),('82','val_82'))
stored as DIRECTORIES
STORED AS RCFILE
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@list_bucketing_mul_col
PREHOOK: query: -- list bucketing DML
explain extended
insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '11')
select 1, key, 1, value, 1 from src
PREHOOK: type: QUERY
POSTHOOK: query: -- list bucketing DML
explain extended
insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '11')
select 1, key, 1, value, 1 from src
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-0 depends on stages: Stage-1
Stage-2 depends on stages: Stage-0
STAGE PLANS:
Stage: Stage-1
Map Reduce
Map Operator Tree:
TableScan
alias: src
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Select Operator
expressions: '1' (type: string), key (type: string), '1' (type: string), value (type: string), '1' (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
File Output Operator
compressed: false
GlobalTableId: 1
#### A masked pattern was here ####
NumFilesPerFileSink: 1
Static Partition Specification: ds=2008-04-08/hr=11/
Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
properties:
bucket_count -1
columns col1,col2,col3,col4,col5
columns.comments
columns.types string:string:string:string:string
#### A masked pattern was here ####
name default.list_bucketing_mul_col
partition_columns ds/hr
partition_columns.types string:string
serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.list_bucketing_mul_col
TotalFiles: 1
GatherStats: true
MultiFileSpray: false
Path -> Alias:
#### A masked pattern was here ####
Path -> Partition:
#### A masked pattern was here ####
Partition
base file name: src
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
bucket_count -1
columns key,value
columns.comments 'default','default'
columns.types string:string
#### A masked pattern was here ####
name default.src
numFiles 1
numRows 500
rawDataSize 5312
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
properties:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"key":"true","value":"true"}}
bucket_count -1
columns key,value
columns.comments 'default','default'
columns.types string:string
#### A masked pattern was here ####
name default.src
numFiles 1
numRows 500
rawDataSize 5312
serialization.ddl struct src { string key, string value}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
totalSize 5812
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.src
name: default.src
Truncated Path -> Alias:
/src [src]
Stage: Stage-0
Move Operator
tables:
partition:
ds 2008-04-08
hr 11
replace: true
#### A masked pattern was here ####
table:
input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
properties:
bucket_count -1
columns col1,col2,col3,col4,col5
columns.comments
columns.types string:string:string:string:string
#### A masked pattern was here ####
name default.list_bucketing_mul_col
partition_columns ds/hr
partition_columns.types string:string
serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.list_bucketing_mul_col
Stage: Stage-2
Stats-Aggr Operator
#### A masked pattern was here ####
PREHOOK: query: insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '11')
select 1, key, 1, value, 1 from src
PREHOOK: type: QUERY
PREHOOK: Input: default@src
PREHOOK: Output: default@list_bucketing_mul_col@ds=2008-04-08/hr=11
POSTHOOK: query: insert overwrite table list_bucketing_mul_col partition (ds = '2008-04-08', hr = '11')
select 1, key, 1, value, 1 from src
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
POSTHOOK: Output: default@list_bucketing_mul_col@ds=2008-04-08/hr=11
POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=11).col1 EXPRESSION []
POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=11).col2 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ]
POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=11).col3 EXPRESSION []
POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=11).col4 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
POSTHOOK: Lineage: list_bucketing_mul_col PARTITION(ds=2008-04-08,hr=11).col5 EXPRESSION []
PREHOOK: query: -- check DML result
show partitions list_bucketing_mul_col
PREHOOK: type: SHOWPARTITIONS
PREHOOK: Input: default@list_bucketing_mul_col
POSTHOOK: query: -- check DML result
show partitions list_bucketing_mul_col
POSTHOOK: type: SHOWPARTITIONS
POSTHOOK: Input: default@list_bucketing_mul_col
ds=2008-04-08/hr=11
PREHOOK: query: desc formatted list_bucketing_mul_col partition (ds='2008-04-08', hr='11')
PREHOOK: type: DESCTABLE
PREHOOK: Input: default@list_bucketing_mul_col
POSTHOOK: query: desc formatted list_bucketing_mul_col partition (ds='2008-04-08', hr='11')
POSTHOOK: type: DESCTABLE
POSTHOOK: Input: default@list_bucketing_mul_col
# col_name data_type comment
col1 string
col2 string
col3 string
col4 string
col5 string
# Partition Information
# col_name data_type comment
ds string
hr string
# Detailed Partition Information
Partition Value: [2008-04-08, 11]
Database: default
Table: list_bucketing_mul_col
#### A masked pattern was here ####
Partition Parameters:
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
numFiles 4
numRows 500
rawDataSize 6312
totalSize 7094
#### A masked pattern was here ####
# Storage Information
SerDe Library: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
InputFormat: org.apache.hadoop.hive.ql.io.RCFileInputFormat
OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
Compressed: No
Num Buckets: -1
Bucket Columns: []
Sort Columns: []
Stored As SubDirectories: Yes
Skewed Columns: [col2, col4]
Skewed Values: [[287, val_287], [466, val_466], [82, val_82]]
#### A masked pattern was here ####
Skewed Value to Truncated Path: {[287, val_287]=/list_bucketing_mul_col/ds=2008-04-08/hr=11/col2=287/col4=val_287, [466, val_466]=/list_bucketing_mul_col/ds=2008-04-08/hr=11/col2=466/col4=val_466, [82, val_82]=/list_bucketing_mul_col/ds=2008-04-08/hr=11/col2=82/col4=val_82}
Storage Desc Params:
serialization.format 1
PREHOOK: query: explain extended
select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "466" and col4 = "val_466"
PREHOOK: type: QUERY
POSTHOOK: query: explain extended
select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "466" and col4 = "val_466"
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-0 is a root stage
STAGE PLANS:
Stage: Stage-0
Fetch Operator
limit: -1
Partition Description:
Partition
input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
partition values:
ds 2008-04-08
hr 11
properties:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
bucket_count -1
columns col1,col2,col3,col4,col5
columns.comments
columns.types string:string:string:string:string
#### A masked pattern was here ####
name default.list_bucketing_mul_col
numFiles 4
numRows 500
partition_columns ds/hr
partition_columns.types string:string
rawDataSize 6312
serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
totalSize 7094
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
properties:
bucket_count -1
columns col1,col2,col3,col4,col5
columns.comments
columns.types string:string:string:string:string
#### A masked pattern was here ####
name default.list_bucketing_mul_col
partition_columns ds/hr
partition_columns.types string:string
serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.list_bucketing_mul_col
name: default.list_bucketing_mul_col
Processor Tree:
TableScan
alias: list_bucketing_mul_col
Statistics: Num rows: 500 Data size: 6312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: ((col2 = '466') and (col4 = 'val_466')) (type: boolean)
Statistics: Num rows: 125 Data size: 1578 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: col1 (type: string), '466' (type: string), col3 (type: string), 'val_466' (type: string), col5 (type: string), '2008-04-08' (type: string), '11' (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
Statistics: Num rows: 125 Data size: 1578 Basic stats: COMPLETE Column stats: NONE
ListSink
PREHOOK: query: select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "466" and col4 = "val_466"
PREHOOK: type: QUERY
PREHOOK: Input: default@list_bucketing_mul_col
PREHOOK: Input: default@list_bucketing_mul_col@ds=2008-04-08/hr=11
#### A masked pattern was here ####
POSTHOOK: query: select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "466" and col4 = "val_466"
POSTHOOK: type: QUERY
POSTHOOK: Input: default@list_bucketing_mul_col
POSTHOOK: Input: default@list_bucketing_mul_col@ds=2008-04-08/hr=11
#### A masked pattern was here ####
1 466 1 val_466 1 2008-04-08 11
1 466 1 val_466 1 2008-04-08 11
1 466 1 val_466 1 2008-04-08 11
PREHOOK: query: explain extended
select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "382" and col4 = "val_382"
PREHOOK: type: QUERY
POSTHOOK: query: explain extended
select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "382" and col4 = "val_382"
POSTHOOK: type: QUERY
STAGE DEPENDENCIES:
Stage-0 is a root stage
STAGE PLANS:
Stage: Stage-0
Fetch Operator
limit: -1
Partition Description:
Partition
input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
partition values:
ds 2008-04-08
hr 11
properties:
COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
bucket_count -1
columns col1,col2,col3,col4,col5
columns.comments
columns.types string:string:string:string:string
#### A masked pattern was here ####
name default.list_bucketing_mul_col
numFiles 4
numRows 500
partition_columns ds/hr
partition_columns.types string:string
rawDataSize 6312
serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
totalSize 7094
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
input format: org.apache.hadoop.hive.ql.io.RCFileInputFormat
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
properties:
bucket_count -1
columns col1,col2,col3,col4,col5
columns.comments
columns.types string:string:string:string:string
#### A masked pattern was here ####
name default.list_bucketing_mul_col
partition_columns ds/hr
partition_columns.types string:string
serialization.ddl struct list_bucketing_mul_col { string col1, string col2, string col3, string col4, string col5}
serialization.format 1
serialization.lib org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
#### A masked pattern was here ####
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
name: default.list_bucketing_mul_col
name: default.list_bucketing_mul_col
Processor Tree:
TableScan
alias: list_bucketing_mul_col
Statistics: Num rows: 500 Data size: 6312 Basic stats: COMPLETE Column stats: NONE
GatherStats: false
Filter Operator
isSamplingPred: false
predicate: ((col2 = '382') and (col4 = 'val_382')) (type: boolean)
Statistics: Num rows: 125 Data size: 1578 Basic stats: COMPLETE Column stats: NONE
Select Operator
expressions: col1 (type: string), '382' (type: string), col3 (type: string), 'val_382' (type: string), col5 (type: string), '2008-04-08' (type: string), '11' (type: string)
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6
Statistics: Num rows: 125 Data size: 1578 Basic stats: COMPLETE Column stats: NONE
ListSink
PREHOOK: query: select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "382" and col4 = "val_382"
PREHOOK: type: QUERY
PREHOOK: Input: default@list_bucketing_mul_col
PREHOOK: Input: default@list_bucketing_mul_col@ds=2008-04-08/hr=11
#### A masked pattern was here ####
POSTHOOK: query: select * from list_bucketing_mul_col
where ds='2008-04-08' and hr='11' and col2 = "382" and col4 = "val_382"
POSTHOOK: type: QUERY
POSTHOOK: Input: default@list_bucketing_mul_col
POSTHOOK: Input: default@list_bucketing_mul_col@ds=2008-04-08/hr=11
#### A masked pattern was here ####
1 382 1 val_382 1 2008-04-08 11
1 382 1 val_382 1 2008-04-08 11
PREHOOK: query: drop table list_bucketing_mul_col
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@list_bucketing_mul_col
PREHOOK: Output: default@list_bucketing_mul_col
POSTHOOK: query: drop table list_bucketing_mul_col
POSTHOOK: type: DROPTABLE
POSTHOOK: Input: default@list_bucketing_mul_col
POSTHOOK: Output: default@list_bucketing_mul_col