blob: 1d6db468fb8d2a006d1b77be8cae4b37d0827827 [file] [log] [blame]
This file was created for:
IMPALA-1881: Maximize data locality when scanning Parquet files with multiple row groups.
IMPALA-2466: Add more tests to the HDFS parquet scanner.
IMPALA-5717: Add tests for HDFS orc scanner.
The table lineitem_multiblock is a single parquet file with:
- A row group size of approximately 12 KB.
- 200 row groups in total.
Assuming a 1 MB HDFS block size, it has:
- 3 blocks of up to 1 MB each.
- Multiple row groups per block.
- Some row groups that span across block boundaries and live on 2 blocks.
This table was created using Hive and has the same table structure and some of the data of
'tpch.lineitem'.
The following commands were used:
create table functional_parquet.lineitem_multiblock like tpch.lineitem
stored as parquet;
set parquet.block.size=4086; # This is to set the row group size
insert into functional_parquet.lineitem_multiblock select * from
tpch.lineitem limit 20000; # We limit to 20000 to keep the size of the table small
'lineitem_sixblocks' was created the same way but with more rows, so that we got more
blocks.
'lineitem_multiblock_one_row_group' was created similarly but with a much higher
'parquet.block.size' so that everything fit in one row group.
The ORC files were created with the following Hive queries:
use functional_orc_def;
set orc.stripe.size=1024;
set orc.compress=ZLIB;
create table lineitem_threeblocks like tpch.lineitem stored as orc;
create table lineitem_sixblocks like tpch.lineitem stored as orc;
insert overwrite table lineitem_threeblocks select * from tpch.lineitem limit 16000;
insert overwrite table lineitem_sixblocks select * from tpch.lineitem limit 30000;
set orc.stripe.size=67108864;
create table lineitem_orc_multiblock_one_stripe like tpch.lineitem stored as orc;
insert overwrite table lineitem_orc_multiblock_one_stripe select * from
tpch.lineitem limit 16000;