blob: 6096fe4573d2eaa78be28e2dd2f36aa6a65e6579 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "Timezone.hh"
#include "orc/Type.hh"
#include "wrap/gtest-wrapper.h"
#include "MockStripeStreams.hh"
#include "OrcTest.hh"
#include "SchemaEvolution.hh"
#include "ConvertColumnReader.hh"
#include "MemoryInputStream.hh"
#include "MemoryOutputStream.hh"
#include <iomanip>
namespace orc {
// Readability alias: the tests below cast boolean columns to this name, which is the same
// type as ByteVectorBatch.
using BooleanVectorBatch = ByteVectorBatch;
// Builds a Reader over `stream`, with `memoryPool` installed as the memory pool in the
// ReaderOptions that are passed to the two-argument createReader overload.
static std::unique_ptr<Reader> createReader(MemoryPool& memoryPool,
                                            std::unique_ptr<InputStream> stream) {
  ReaderOptions readerOptions;
  readerOptions.setMemoryPool(memoryPool);
  auto reader = createReader(std::move(stream), readerOptions);
  return reader;
}
// Writes boolean/int/double/float columns and reads them back as int/boolean/bigint/boolean
// with tight numeric vectors enabled. The same read with tight numeric vectors disabled is
// expected to throw SchemaEvolutionError from next().
TEST(ConvertColumnReader, betweenNumericWithoutOverflows) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<t1:boolean,t2:int,t3:double,t4:float>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<t1:int,t2:boolean,t3:bigint,t4:boolean>"));

  WriterOptions options;
  options.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
  auto& c0 = dynamic_cast<ByteVectorBatch&>(*structBatch.fields[0]);
  auto& c1 = dynamic_cast<IntVectorBatch&>(*structBatch.fields[1]);
  auto& c2 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[2]);
  auto& c3 = dynamic_cast<FloatVectorBatch&>(*structBatch.fields[3]);
  structBatch.numElements = c0.numElements = c1.numElements = c2.numElements = c3.numElements =
      TEST_CASES;

  // The row counter is signed on purpose: (TEST_CASES / 2 - i) is negative for the second half
  // of the rows, and computing it in an unsigned type would rely on wrap-around followed by a
  // narrowing cast to produce the intended negative value.
  for (int i = 0; i < TEST_CASES; i++) {
    size_t idx = static_cast<size_t>(i);
    c0.data[idx] = i % 2 || i % 3 ? true : false;
    c1.data[idx] = (TEST_CASES / 2 - i) * TEST_CASES;
    c2.data[idx] = static_cast<double>(TEST_CASES - i) / (TEST_CASES / 2);
    c3.data[idx] = static_cast<float>(TEST_CASES - i) / (TEST_CASES / 2);
  }
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOpts;
  rowReaderOpts.setReadType(readType);
  rowReaderOpts.setUseTightNumericVector(true);
  auto rowReader = reader->createRowReader(rowReaderOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC0 = dynamic_cast<IntVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC1 = dynamic_cast<ByteVectorBatch&>(*readStructBatch.fields[1]);
  auto& readC2 = dynamic_cast<LongVectorBatch&>(*readStructBatch.fields[2]);
  auto& readC3 = dynamic_cast<ByteVectorBatch&>(*readStructBatch.fields[3]);
  for (int i = 0; i < TEST_CASES; i++) {
    size_t idx = static_cast<size_t>(i);
    EXPECT_EQ(readC0.data[idx], i % 2 || i % 3 ? 1 : 0) << i;
    // Row TEST_CASES / 2 is the only row whose written int value is zero.
    EXPECT_TRUE(readC1.data[idx] == true || i == TEST_CASES / 2) << i;
    EXPECT_EQ(readC2.data[idx],
              i > TEST_CASES / 2 ? 0 : static_cast<int64_t>((TEST_CASES - i) / (TEST_CASES / 2)))
        << i;
    // Rows past the midpoint hold float values below 1; either boolean result is accepted.
    EXPECT_TRUE(readC3.data[idx] == true || i > TEST_CASES / 2) << i;
  }

  // Without tight numeric vectors the same conversion is expected to be rejected.
  rowReaderOpts.setUseTightNumericVector(false);
  rowReader = reader->createRowReader(rowReaderOpts);
  readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_THROW(rowReader->next(*readBatch), SchemaEvolutionError);
}
// Narrowing numeric conversions (double->int, bigint->int, bigint->float) over two rows.
// Row 0 is expected to come back null for both int columns and non-null for the float column;
// row 1 is expected to be non-null everywhere. After throwOnSchemaEvolutionOverflow(true),
// next() is expected to throw SchemaEvolutionError instead.
TEST(ConvertColumnReader, betweenNumricOverflows) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr uint64_t ROW_COUNT = 2;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<t1:double,t2:bigint,t3:bigint>"));
  std::shared_ptr<Type> readType(Type::buildTypeFromString("struct<t1:int,t2:int,t3:float>"));

  // ---- write ----
  WriterOptions writerOpts;
  auto writer = createWriter(*fileType, &memStream, writerOpts);
  auto writeBatch = writer->createRowBatch(ROW_COUNT);
  auto& writeRoot = dynamic_cast<StructVectorBatch&>(*writeBatch);
  auto& srcDouble = dynamic_cast<DoubleVectorBatch&>(*writeRoot.fields[0]);
  auto& srcLongA = dynamic_cast<LongVectorBatch&>(*writeRoot.fields[1]);
  auto& srcLongB = dynamic_cast<LongVectorBatch&>(*writeRoot.fields[2]);

  // Row 0.
  srcDouble.data[0] = 1e35;
  srcLongA.data[0] = (1LL << 31);
  srcLongB.data[0] = (1LL << 62) + 112312;
  // Row 1.
  srcDouble.data[1] = 1e9 + 7;
  srcLongA.data[1] = (1LL << 31) - 1;
  srcLongB.data[1] = (1LL << 20) + 77553;

  srcLongB.numElements = ROW_COUNT;
  srcLongA.numElements = ROW_COUNT;
  srcDouble.numElements = ROW_COUNT;
  writeRoot.numElements = ROW_COUNT;
  writer->add(*writeBatch);
  writer->close();

  // ---- read, overflow reported as null ----
  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions readOpts;
  readOpts.setReadType(readType);
  readOpts.setUseTightNumericVector(true);
  auto rowReader = reader->createRowReader(readOpts);
  auto readBatch = rowReader->createRowBatch(ROW_COUNT);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readRoot = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& dstIntFromDouble = dynamic_cast<IntVectorBatch&>(*readRoot.fields[0]);
  auto& dstIntFromLong = dynamic_cast<IntVectorBatch&>(*readRoot.fields[1]);
  auto& dstFloatFromLong = dynamic_cast<FloatVectorBatch&>(*readRoot.fields[2]);
  EXPECT_EQ(dstIntFromDouble.notNull[0], false);
  EXPECT_EQ(dstIntFromLong.notNull[0], false);
  EXPECT_EQ(dstFloatFromLong.notNull[0], true);
  EXPECT_TRUE(dstIntFromDouble.notNull[1]);
  EXPECT_TRUE(dstIntFromLong.notNull[1]);
  EXPECT_TRUE(dstFloatFromLong.notNull[1]);

  // ---- read, overflow reported as exception ----
  readOpts.throwOnSchemaEvolutionOverflow(true);
  rowReader = reader->createRowReader(readOpts);
  readBatch = rowReader->createRowBatch(ROW_COUNT);
  EXPECT_THROW(rowReader->next(*readBatch), SchemaEvolutionError);
}
// Test for converting from boolean to string/char/varchar.
// Create a file with schema struct<t1:boolean,t2:boolean,t3:boolean,t4:boolean,t5:boolean> and
// write 1024 rows in which t1..t5 are false when the row index is a multiple of 2, 3, 5, 7 and
// 11 respectively, and true otherwise. Read the file with schema
// struct<t1:string,t2:char(6),t3:char(7),t4:varchar(5),t5:varchar(7)> and verify that the values
// are "TRUE" and "FALSE" (followed by trailing spaces for the char columns).
TEST(ConvertColumnReader, booleanToString) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr size_t TEST_CASES = 1024;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(Type::buildTypeFromString(
      "struct<t1:boolean,t2:boolean,t3:boolean,t4:boolean,t5:boolean>"));
  std::shared_ptr<Type> readType(Type::buildTypeFromString(
      "struct<t1:string,t2:char(6),t3:char(7),t4:varchar(5),t5:varchar(7)>"));

  WriterOptions options;
  options.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
  auto& c0 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[0]);
  auto& c1 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[1]);
  auto& c2 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[2]);
  auto& c3 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[3]);
  auto& c4 = dynamic_cast<BooleanVectorBatch&>(*structBatch.fields[4]);
  // Each column is false on multiples of a different small prime so the columns do not all
  // carry the same pattern.
  for (size_t i = 0; i < TEST_CASES; i++) {
    c0.data[i] = static_cast<char>(i % 2 == 0 ? 0 : 1);
    c1.data[i] = static_cast<char>(i % 3 == 0 ? 0 : 1);
    c2.data[i] = static_cast<char>(i % 5 == 0 ? 0 : 1);
    c3.data[i] = static_cast<char>(i % 7 == 0 ? 0 : 1);
    c4.data[i] = static_cast<char>(i % 11 == 0 ? 0 : 1);
  }
  structBatch.numElements = c0.numElements = c1.numElements = c2.numElements = c3.numElements =
      c4.numElements = TEST_CASES;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOpts;
  rowReaderOpts.setReadType(readType);
  rowReaderOpts.setUseTightNumericVector(true);
  auto rowReader = reader->createRowReader(rowReaderOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC0 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC1 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[1]);
  auto& readC2 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[2]);
  auto& readC3 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[3]);
  auto& readC4 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[4]);

  // char(n) values are expected right-padded with spaces to exactly n characters. The padded
  // forms are derived from the declared widths instead of being spelled as literals, so the
  // expectation cannot drift from the schema through a miscounted trailing space.
  auto padTo = [](std::string text, size_t width) {
    if (text.size() < width) {
      text.resize(width, ' ');
    }
    return text;
  };
  const std::string trueText = "TRUE";
  const std::string falseText = "FALSE";
  const std::string trueChar6 = padTo(trueText, 6);
  const std::string falseChar6 = padTo(falseText, 6);
  const std::string trueChar7 = padTo(trueText, 7);
  const std::string falseChar7 = padTo(falseText, 7);

  for (size_t i = 0; i < TEST_CASES; i++) {
    EXPECT_EQ(std::string(readC0.data[i], static_cast<size_t>(readC0.length[i])),
              i % 2 == 0 ? falseText : trueText)
        << i;
    EXPECT_EQ(std::string(readC1.data[i], static_cast<size_t>(readC1.length[i])),
              i % 3 == 0 ? falseChar6 : trueChar6)
        << i;
    EXPECT_EQ(std::string(readC2.data[i], static_cast<size_t>(readC2.length[i])),
              i % 5 == 0 ? falseChar7 : trueChar7)
        << i;
    // varchar values are not padded.
    EXPECT_EQ(std::string(readC3.data[i], static_cast<size_t>(readC3.length[i])),
              i % 7 == 0 ? falseText : trueText)
        << i;
    EXPECT_EQ(std::string(readC4.data[i], static_cast<size_t>(readC4.length[i])),
              i % 11 == 0 ? falseText : trueText)
        << i;
  }
}
// Writes the six numeric types (tinyint, smallint, int, bigint, float, double) three times over
// and reads the three groups back as string, char(1..6) and varchar(1..6). A value whose text
// is longer than the declared char/varchar length is expected to be null; char values that fit
// are expected to be space-padded to the declared length.
TEST(ConvertColumnReader, TestConvertNumericToStringVariant) {
  // Create a memory buffer to hold the ORC file data
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int64_t TEST_CASES = 1024;
  constexpr size_t NUMERIC_KINDS = 6;  // source numeric types per group
  constexpr size_t TARGET_GROUPS = 3;  // string, char(n), varchar(n)
  constexpr size_t COLUMN_COUNT = NUMERIC_KINDS * TARGET_GROUPS;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(Type::buildTypeFromString(
      "struct<t1:tinyint,t2:smallint,t3:int,t4:bigint,t5:float,t6:double,"
      "t7:tinyint,t8:smallint,t9:int,t10:bigint,t11:float,t12:double,"
      "t13:tinyint,t14:smallint,t15:int,t16:bigint,t17:float,t18:double"
      ">"));
  std::shared_ptr<Type> readType(Type::buildTypeFromString(
      "struct<t1:string,t2:string,t3:string,t4:string,t5:string,t6:string,"
      "t7:char(1),t8:char(2),t9:char(3),t10:char(4),t11:char(5),t12:char(6),"
      "t13:varchar(1),t14:varchar(2),t15:varchar(3),t16:varchar(4),t17:varchar(5),t18:varchar(6)"
      ">"));

  WriterOptions options;
  options.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch);
  // Every group receives the same six columns of data.
  for (size_t group = 0; group < TARGET_GROUPS; group++) {
    const size_t base = group * NUMERIC_KINDS;
    auto& col0 = dynamic_cast<ByteVectorBatch&>(*structBatch.fields[base]);
    auto& col1 = dynamic_cast<ShortVectorBatch&>(*structBatch.fields[base + 1]);
    auto& col2 = dynamic_cast<IntVectorBatch&>(*structBatch.fields[base + 2]);
    auto& col3 = dynamic_cast<LongVectorBatch&>(*structBatch.fields[base + 3]);
    auto& col4 = dynamic_cast<FloatVectorBatch&>(*structBatch.fields[base + 4]);
    auto& col5 = dynamic_cast<DoubleVectorBatch&>(*structBatch.fields[base + 5]);
    for (int j = 0; j < TEST_CASES; j++) {
      int flag = j % 2 == 0 ? -1 : 1;
      uint64_t idx = static_cast<uint64_t>(j);
      col0.data[idx] = static_cast<int8_t>(flag * (j % 128));
      col1.data[idx] = static_cast<int16_t>(flag * (j % 32768));
      col2.data[idx] = flag * j;
      col3.data[idx] = flag * j;
      col4.data[idx] = static_cast<float>(flag * j) * 1.234f;
      col5.data[idx] = static_cast<double>(flag * j) * 1.234;
    }
    col0.numElements = col1.numElements = col2.numElements = col3.numElements = col4.numElements =
        col5.numElements = TEST_CASES;
  }
  structBatch.numElements = TEST_CASES;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOpts;
  rowReaderOpts.setReadType(readType);
  rowReaderOpts.setUseTightNumericVector(true);
  auto rowReader = reader->createRowReader(rowReaderOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));
  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);

  // All 18 read columns are string batches; keep them indexable instead of naming each one.
  std::vector<StringVectorBatch*> readColumns;
  readColumns.reserve(COLUMN_COUNT);
  for (size_t col = 0; col < COLUMN_COUNT; col++) {
    readColumns.push_back(&dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[col]));
  }

  for (int j = 0; j < TEST_CASES; j++) {
    const size_t row = static_cast<size_t>(j);
    const int flag = j % 2 == 0 ? -1 : 1;
    // Text form of what was written to each of the six numeric kinds on this row; the
    // expressions mirror the ones used when filling the write batch.
    const std::vector<std::string> written = {
        std::to_string(static_cast<int8_t>(flag * (j % 128))),
        std::to_string(static_cast<int16_t>(flag * (j % 32768))),
        std::to_string(flag * j),
        std::to_string(flag * j),
        std::to_string(static_cast<float>(flag * j) * 1.234f),
        std::to_string(static_cast<double>(flag * j) * 1.234)};

    for (size_t col = 0; col < COLUMN_COUNT; col++) {
      const size_t group = col / NUMERIC_KINDS;  // 0: string, 1: char(n), 2: varchar(n)
      const size_t kind = col % NUMERIC_KINDS;
      const size_t maxLength = kind + 1;  // declared n of char(n)/varchar(n) for this column
      std::string expected = written[kind];
      bool expectNull = false;
      if (group != 0) {
        if (expected.size() > maxLength) {
          expectNull = true;
        } else if (group == 1) {
          expected.resize(maxLength, ' ');
        }
      }

      StringVectorBatch& column = *readColumns[col];
      if (expectNull) {
        EXPECT_EQ(false, column.notNull[row])
            << "column t" << (col + 1) << " row " << row << " " << expected << " "
            << std::string(column.data[row], static_cast<size_t>(column.length[row]));
      } else {
        EXPECT_EQ(true, column.notNull[row]) << "column t" << (col + 1) << " row " << row;
        EXPECT_EQ(expected,
                  std::string(column.data[row], static_cast<size_t>(column.length[row])))
            << "column t" << (col + 1) << " row " << row;
      }
    }
  }
}
// Numeric -> decimal conversion: bigint/double columns are read back as decimal(10,2),
// decimal(10,4) (Decimal64 batches) and decimal(20,3) (Decimal128 batches).
// The first half of the rows holds small values that are expected to convert in every column.
// The second half holds much larger values that are expected to be null in the two
// 10-digit columns and non-null in the two 20-digit columns.
TEST(ConvertColumnReader, TestConvertNumericToDecimal) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  constexpr int HALF = TEST_CASES / 2;

  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<c1:bigint,c2:double,c3:bigint,c4:double>"));
  std::shared_ptr<Type> readType(Type::buildTypeFromString(
      "struct<c1:decimal(10,2),c2:decimal(10,4),c3:decimal(20,3),c4:decimal(20,3)>"));

  // ---- write ----
  WriterOptions writerOpts;
  writerOpts.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, writerOpts);
  auto writeBatch = writer->createRowBatch(TEST_CASES);
  auto* writeRoot = dynamic_cast<StructVectorBatch*>(writeBatch.get());
  auto& srcC1 = dynamic_cast<LongVectorBatch&>(*writeRoot->fields[0]);
  auto& srcC2 = dynamic_cast<DoubleVectorBatch&>(*writeRoot->fields[1]);
  auto& srcC3 = dynamic_cast<LongVectorBatch&>(*writeRoot->fields[2]);
  auto& srcC4 = dynamic_cast<DoubleVectorBatch&>(*writeRoot->fields[3]);
  for (int32_t row = 0; row < TEST_CASES; row++) {
    const size_t idx = static_cast<size_t>(row);
    const int sign = row % 2 ? -1 : 1;
    if (row < HALF) {
      srcC1.data[idx] = row * 12;
      srcC3.data[idx] = row * 16;
      srcC2.data[idx] = sign * (row * 1000 + 0.111111 * (row % 9));
      srcC4.data[idx] = sign * (row * 1000 + 0.111111 * (row % 9));
    } else {
      srcC1.data[idx] = 12345678910LL * row;
      srcC2.data[idx] = 12345678910.1234 * row;
      srcC3.data[idx] = 12345678910LL * row;
      srcC4.data[idx] = (123456.0 * row + (0.1111 * (row % 9))) * sign;
    }
  }
  writeRoot->numElements = srcC1.numElements = srcC2.numElements = srcC3.numElements =
      srcC4.numElements = TEST_CASES;
  writer->add(*writeBatch);
  writer->close();

  // ---- read ----
  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions readOpts;
  readOpts.setUseTightNumericVector(true);
  readOpts.setReadType(readType);
  auto rowReader = reader->createRowReader(readOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readRoot = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& dstC1 = dynamic_cast<Decimal64VectorBatch&>(*readRoot.fields[0]);
  auto& dstC2 = dynamic_cast<Decimal64VectorBatch&>(*readRoot.fields[1]);
  auto& dstC3 = dynamic_cast<Decimal128VectorBatch&>(*readRoot.fields[2]);
  auto& dstC4 = dynamic_cast<Decimal128VectorBatch&>(*readRoot.fields[3]);
  EXPECT_EQ(TEST_CASES, readBatch->numElements);

  for (int row = 0; row < TEST_CASES; row++) {
    const size_t idx = static_cast<size_t>(row);
    const int sign = row % 2 ? -1 : 1;
    if (row < HALF) {
      EXPECT_TRUE(dstC1.notNull[idx]) << row;
      EXPECT_TRUE(dstC2.notNull[idx]) << row;
      EXPECT_TRUE(dstC3.notNull[idx]) << row;
      EXPECT_TRUE(dstC4.notNull[idx]) << row;
      EXPECT_EQ(row * 1200, dstC1.values[idx]) << row;
      EXPECT_EQ(row * 16000, dstC3.values[idx].toLong()) << row;
      // The (row % 9 > 4) term adds one unit in the last decimal place for the larger
      // fractional parts of the written doubles.
      EXPECT_EQ((1LL * row * 10000000 + (1111 * (row % 9) + (row % 9 > 4))) * sign,
                dstC2.values[idx])
          << row;
      EXPECT_EQ((1LL * row * 1000000 + (111 * (row % 9) + (row % 9 > 4))) * sign,
                dstC4.values[idx].toLong())
          << row;
    } else {
      EXPECT_FALSE(dstC1.notNull[idx]) << row;
      EXPECT_FALSE(dstC2.notNull[idx]) << row;
      EXPECT_TRUE(dstC3.notNull[idx]) << row;
      EXPECT_TRUE(dstC4.notNull[idx]) << row;
      EXPECT_EQ(12345678910000LL * row, dstC3.values[idx].toLong()) << row;
      EXPECT_EQ((123456000LL * row + (111 * (row % 9) + (row % 9 > 4))) * sign,
                dstC4.values[idx].toLong())
          << row;
    }
  }
}
// Numeric -> timestamp conversion with the reader timezone set to "Asia/Shanghai".
// bigint/double columns are read as timestamp (c1, c2) and timestamp with local time zone
// (c3, c4). The plain timestamp columns are expected 8 * 3600 seconds below the written value,
// the local-time-zone columns are expected unchanged, and the fractional part of the double
// columns is expected in `nanoseconds` to within one nanosecond.
TEST(ConvertColumnReader, TestConvertNumericToTimestamp) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;

  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<c1:bigint,c2:double,c3:bigint,c4:double>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:timestamp,c2:timestamp,c3:timestamp with local time "
                                "zone,c4:timestamp with local time zone>"));

  // ---- write ----
  WriterOptions writerOpts;
  writerOpts.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, writerOpts);
  auto writeBatch = writer->createRowBatch(TEST_CASES);
  auto* writeRoot = dynamic_cast<StructVectorBatch*>(writeBatch.get());
  auto& srcC1 = dynamic_cast<LongVectorBatch&>(*writeRoot->fields[0]);
  auto& srcC2 = dynamic_cast<DoubleVectorBatch&>(*writeRoot->fields[1]);
  auto& srcC3 = dynamic_cast<LongVectorBatch&>(*writeRoot->fields[2]);
  auto& srcC4 = dynamic_cast<DoubleVectorBatch&>(*writeRoot->fields[3]);
  for (int row = 0; row < TEST_CASES; row++) {
    const size_t idx = static_cast<size_t>(row);
    const bool odd = (row % 2) != 0;
    // Whole seconds: the same integer value goes into the bigint and the double column.
    srcC1.data[idx] = (row - TEST_CASES / 2) * 3600 + row;
    srcC2.data[idx] = (row - TEST_CASES / 2) * 3600 + row;
    srcC3.data[idx] = (row - TEST_CASES / 2) * 3600 + row * row;
    srcC4.data[idx] = (row - TEST_CASES / 2) * 3600 + row * row;
    // Fractional seconds for the double columns only.
    srcC2.data[idx] += odd ? 0.55555 : 0.11111;
    srcC4.data[idx] += odd ? 0.777 : 0.333;
  }
  writeRoot->numElements = srcC1.numElements = srcC2.numElements = srcC3.numElements =
      srcC4.numElements = TEST_CASES;
  writer->add(*writeBatch);
  writer->close();

  // ---- read ----
  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions readOpts;
  readOpts.setTimezoneName("Asia/Shanghai");
  readOpts.setUseTightNumericVector(true);
  readOpts.setReadType(readType);
  auto rowReader = reader->createRowReader(readOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readRoot = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& dstC1 = dynamic_cast<TimestampVectorBatch&>(*readRoot.fields[0]);
  auto& dstC2 = dynamic_cast<TimestampVectorBatch&>(*readRoot.fields[1]);
  auto& dstC3 = dynamic_cast<TimestampVectorBatch&>(*readRoot.fields[2]);
  auto& dstC4 = dynamic_cast<TimestampVectorBatch&>(*readRoot.fields[3]);
  EXPECT_EQ(TEST_CASES, readBatch->numElements);

  for (int row = 0; row < TEST_CASES; row++) {
    const size_t idx = static_cast<size_t>(row);
    EXPECT_TRUE(dstC1.notNull[idx]) << row;
    EXPECT_TRUE(dstC2.notNull[idx]) << row;
    EXPECT_TRUE(dstC3.notNull[idx]) << row;
    EXPECT_TRUE(dstC4.notNull[idx]) << row;

    EXPECT_EQ((row - TEST_CASES / 2) * 3600 + row - 8 * 3600, dstC1.data[idx]) << row;
    EXPECT_EQ((row - TEST_CASES / 2) * 3600 + row - 8 * 3600, dstC2.data[idx]) << row;
    EXPECT_EQ((row - TEST_CASES / 2) * 3600 + row * row, dstC3.data[idx]) << row;
    EXPECT_EQ((row - TEST_CASES / 2) * 3600 + row * row, dstC4.data[idx]) << row;

    EXPECT_EQ(0, dstC1.nanoseconds[idx]) << row;
    EXPECT_EQ(0, dstC3.nanoseconds[idx]) << row;
    // The fraction went through a double, so allow one nanosecond of slack.
    if (row % 2) {
      EXPECT_TRUE(std::abs(555550000 - dstC2.nanoseconds[idx]) <= 1) << row;
      EXPECT_TRUE(std::abs(777000000 - dstC4.nanoseconds[idx]) <= 1) << row;
    } else {
      EXPECT_TRUE(std::abs(111110000 - dstC2.nanoseconds[idx]) <= 1) << row;
      EXPECT_TRUE(std::abs(333000000 - dstC4.nanoseconds[idx]) <= 1) << row;
    }
  }
}
// Decimal -> numeric conversion: decimal(10,2), decimal(10,4), decimal(20,4), decimal(20,4)
// are read back as boolean, smallint, int and double.
// First half of the rows: every column is expected non-null and the values are compared.
// Second half: c2 and c3 carry values built from int16/int32 max plus the row index and are
// expected to come back null; c1 and c4 hold zero and are expected non-null.
TEST(ConvertColumnReader, TestConvertDecimalToNumeric) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  constexpr int HALF = TEST_CASES / 2;

  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(Type::buildTypeFromString(
      "struct<c1:decimal(10,2),c2:decimal(10,4),c3:decimal(20,4),c4:decimal(20,4)>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:boolean,c2:smallint,c3:int,c4:double>"));

  // ---- write ----
  WriterOptions writerOpts;
  writerOpts.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, writerOpts);
  auto writeBatch = writer->createRowBatch(TEST_CASES);
  auto* writeRoot = dynamic_cast<StructVectorBatch*>(writeBatch.get());
  auto& srcC1 = dynamic_cast<Decimal64VectorBatch&>(*writeRoot->fields[0]);
  auto& srcC2 = dynamic_cast<Decimal64VectorBatch&>(*writeRoot->fields[1]);
  auto& srcC3 = dynamic_cast<Decimal128VectorBatch&>(*writeRoot->fields[2]);
  auto& srcC4 = dynamic_cast<Decimal128VectorBatch&>(*writeRoot->fields[3]);
  for (uint32_t i = 0; i < TEST_CASES; i++) {
    if (i < HALF) {
      const int64_t flag = i % 2 ? 1 : -1;
      // Unscaled value i * 10000 + i at scale 4, negated on even rows.
      const int64_t unscaled = flag * (static_cast<int64_t>(i * 10000) + i);
      srcC1.values[i] =
          flag * ((i % 2) ? static_cast<int64_t>(i * 123) : static_cast<int64_t>(0));
      srcC2.values[i] = unscaled;
      srcC3.values[i] = unscaled;
      srcC4.values[i] = unscaled;
    } else {
      srcC1.values[i] = 0;
      srcC2.values[i] = (static_cast<int64_t>(std::numeric_limits<int16_t>::max()) + i) * 10000 + i;
      srcC3.values[i] = (static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + i) * 10000 + i;
      srcC4.values[i] = 0;
    }
  }
  writeRoot->numElements = srcC1.numElements = srcC2.numElements = srcC3.numElements =
      srcC4.numElements = TEST_CASES;
  writer->add(*writeBatch);
  writer->close();

  // ---- read ----
  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions readOpts;
  readOpts.setUseTightNumericVector(true);
  readOpts.setReadType(readType);
  auto rowReader = reader->createRowReader(readOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readRoot = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& dstBool = dynamic_cast<BooleanVectorBatch&>(*readRoot.fields[0]);
  auto& dstShort = dynamic_cast<ShortVectorBatch&>(*readRoot.fields[1]);
  auto& dstInt = dynamic_cast<IntVectorBatch&>(*readRoot.fields[2]);
  auto& dstDouble = dynamic_cast<DoubleVectorBatch&>(*readRoot.fields[3]);
  EXPECT_EQ(TEST_CASES, readBatch->numElements);

  for (int i = 0; i < TEST_CASES; i++) {
    const size_t idx = static_cast<size_t>(i);
    if (i < HALF) {
      EXPECT_TRUE(dstBool.notNull[idx]) << i;
      EXPECT_TRUE(dstShort.notNull[idx]) << i;
      EXPECT_TRUE(dstInt.notNull[idx]) << i;
      EXPECT_TRUE(dstDouble.notNull[idx]) << i;
      const int64_t flag = i % 2 ? 1 : -1;
      EXPECT_EQ((i % 2) ? 1 : 0, dstBool.data[idx]) << i;
      EXPECT_EQ(flag * i, dstShort.data[idx]) << i;
      EXPECT_EQ(flag * i, dstInt.data[idx]) << i;
      EXPECT_DOUBLE_EQ(1.0001 * flag * i, dstDouble.data[idx]) << i;
    } else {
      EXPECT_TRUE(dstBool.notNull[idx]) << i;
      EXPECT_FALSE(dstShort.notNull[idx]) << i;
      EXPECT_FALSE(dstInt.notNull[idx]) << i;
      EXPECT_TRUE(dstDouble.notNull[idx]) << i;
    }
  }
}
// Decimal -> decimal conversion across precision/scale changes:
//   decimal(10,4) -> decimal(9,5), decimal(10,4) -> decimal(20,5),
//   decimal(20,4) -> decimal(10,3), decimal(20,4) -> decimal(19,3).
// First half of the rows: every column is expected non-null and the rescaled values are
// compared. Second half: only the decimal(20,5) column is expected to stay non-null.
TEST(ConvertColumnReader, TestConvertDecimalToDecimal) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  constexpr int HALF = TEST_CASES / 2;

  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(Type::buildTypeFromString(
      "struct<c1:decimal(10,4),c2:decimal(10,4),c3:decimal(20,4),c4:decimal(20,4)>"));
  std::shared_ptr<Type> readType(Type::buildTypeFromString(
      "struct<c1:decimal(9,5),c2:decimal(20,5),c3:decimal(10,3),c4:decimal(19,3)>"));

  // ---- write ----
  WriterOptions writerOpts;
  writerOpts.setUseTightNumericVector(true);
  auto writer = createWriter(*fileType, &memStream, writerOpts);
  auto writeBatch = writer->createRowBatch(TEST_CASES);
  auto* writeRoot = dynamic_cast<StructVectorBatch*>(writeBatch.get());
  auto& srcC1 = dynamic_cast<Decimal64VectorBatch&>(*writeRoot->fields[0]);
  auto& srcC2 = dynamic_cast<Decimal64VectorBatch&>(*writeRoot->fields[1]);
  auto& srcC3 = dynamic_cast<Decimal128VectorBatch&>(*writeRoot->fields[2]);
  auto& srcC4 = dynamic_cast<Decimal128VectorBatch&>(*writeRoot->fields[3]);
  for (uint32_t i = 0; i < TEST_CASES; i++) {
    if (i < HALF) {
      const int64_t flag = i % 2 ? 1 : -1;
      // All four columns receive the same unscaled value at scale 4.
      const int64_t unscaled = flag * (static_cast<int64_t>(i * 10000) + i);
      srcC1.values[i] = unscaled;
      srcC2.values[i] = unscaled;
      srcC3.values[i] = unscaled;
      srcC4.values[i] = unscaled;
    } else {
      srcC1.values[i] = 100000000ll + i;
      srcC2.values[i] = 100000000ll + i;
      srcC3.values[i] = (Int128("100000000000") += i);
      srcC4.values[i] = (Int128("100000000000000000000") += i);
    }
  }
  writeRoot->numElements = srcC1.numElements = srcC2.numElements = srcC3.numElements =
      srcC4.numElements = TEST_CASES;
  writer->add(*writeBatch);
  writer->close();

  // ---- read ----
  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions readOpts;
  readOpts.setUseTightNumericVector(true);
  readOpts.setReadType(readType);
  auto rowReader = reader->createRowReader(readOpts);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readRoot = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& dstC1 = dynamic_cast<Decimal64VectorBatch&>(*readRoot.fields[0]);
  auto& dstC2 = dynamic_cast<Decimal128VectorBatch&>(*readRoot.fields[1]);
  auto& dstC3 = dynamic_cast<Decimal64VectorBatch&>(*readRoot.fields[2]);
  auto& dstC4 = dynamic_cast<Decimal128VectorBatch&>(*readRoot.fields[3]);

  // The read batches must carry the precision/scale of the read type.
  EXPECT_TRUE(9 == dstC1.precision && 5 == dstC1.scale);
  EXPECT_TRUE(20 == dstC2.precision && 5 == dstC2.scale);
  EXPECT_TRUE(10 == dstC3.precision && 3 == dstC3.scale);
  EXPECT_TRUE(19 == dstC4.precision && 3 == dstC4.scale);
  EXPECT_EQ(TEST_CASES, readBatch->numElements);

  for (int i = 0; i < TEST_CASES; i++) {
    const size_t idx = static_cast<size_t>(i);
    if (i < HALF) {
      EXPECT_TRUE(dstC1.notNull[idx]) << i;
      EXPECT_TRUE(dstC2.notNull[idx]) << i;
      EXPECT_TRUE(dstC3.notNull[idx]) << i;
      EXPECT_TRUE(dstC4.notNull[idx]) << i;
      const int64_t flag = i % 2 ? 1 : -1;
      // Scale 4 -> 5 multiplies the unscaled value by ten.
      EXPECT_EQ(dstC1.values[idx], flag * (i * 100000 + i * 10));
      EXPECT_EQ(dstC2.values[idx].toLong(), flag * (i * 100000 + i * 10));
      // Scale 4 -> 3 drops one digit; (i + 5) / 10 is the rounded last digit group.
      EXPECT_EQ(dstC3.values[idx], flag * (i * 1000 + (i + 5) / 10));
      EXPECT_EQ(dstC4.values[idx].toLong(), flag * (i * 1000 + (i + 5) / 10));
    } else {
      EXPECT_FALSE(dstC1.notNull[idx]) << i;
      EXPECT_TRUE(dstC2.notNull[idx]) << i;
      EXPECT_FALSE(dstC3.notNull[idx]) << i;
      EXPECT_FALSE(dstC4.notNull[idx]) << i;
    }
  }
}
// Decimal -> timestamp conversion with different writer ("America/New_York") and reader
// ("Australia/Sydney") timezones. c1 (decimal(14,4)) is read as timestamp and c2
// (decimal(25,10)) as timestamp with local time zone. The written decimals are whole seconds
// derived from generated date strings plus a fixed fractional part, which is expected back in
// `nanoseconds`.
TEST(ConvertColumnReader, TestConvertDecimalToTimestamp) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  std::string writerTimezoneName = "America/New_York";
  std::string readerTimezoneName = "Australia/Sydney";
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<c1:decimal(14,4),c2:decimal(25,10)>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:timestamp,c2:timestamp with local time zone>"));

  WriterOptions options;
  options.setUseTightNumericVector(true);
  options.setTimezoneName(writerTimezoneName);
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
  auto& c1 = dynamic_cast<Decimal64VectorBatch&>(*structBatch->fields[0]);
  auto& c2 = dynamic_cast<Decimal128VectorBatch&>(*structBatch->fields[1]);

  // Parses "YYYY-MM-DD HH:MM:SS", turns it into seconds with timegm, and passes the result
  // through the given timezone's convertFromUTC.
  auto convertToSeconds = [](const Timezone& writerTimezone, const std::string& date) {
    // Value-initialised: strptime only fills the fields named by the format string.
    tm timeStruct{};
    if (strptime(date.c_str(), "%Y-%m-%d %H:%M:%S", &timeStruct) == nullptr) {
      throw TimezoneError("bad time " + date);
    }
    return writerTimezone.convertFromUTC(timegm(&timeStruct));
  };

  // One date per row: the 27th of consecutive months starting in January 1960.
  std::vector<std::string> timeStrings;
  timeStrings.reserve(static_cast<size_t>(TEST_CASES));
  for (int i = 0; i < TEST_CASES; i++) {
    int64_t year = 1960 + (i / 12);
    int64_t month = i % 12 + 1;
    int64_t day = 27;
    std::string others = "23:45:56";
    std::stringstream ss;
    ss << year << "-";
    ss << std::setfill('0') << std::setw(2) << month << "-" << day << " " << others;
    timeStrings.push_back(ss.str());
  }

  // Timezone lookups are hoisted out of the per-row loops.
  const Timezone& gmtTimezone = getTimezoneByName("GMT");
  const Timezone& writerTimezone = getTimezoneByName(writerTimezoneName);
  const Timezone& readerTimezone = getTimezoneByName(readerTimezoneName);

  // ts[0]: seconds computed with the GMT timezone, ts[1]: with the writer timezone.
  std::vector<int64_t> ts[2];
  for (auto& time : timeStrings) {
    ts[0].emplace_back(convertToSeconds(gmtTimezone, time));
    ts[1].emplace_back(convertToSeconds(writerTimezone, time));
  }

  for (int i = 0; i < TEST_CASES; i++) {
    size_t idx = static_cast<size_t>(i);
    // c1: seconds at scale 4 with fractional digits .1234
    c1.values[idx] = ts[0][idx] * 10000 + 1234;
    // c2: seconds at scale 10 with fractional digits .1234567895
    bool overflow = false;
    Int128 scaled = scaleUpInt128ByPowerOfTen(Int128(ts[1][idx]), 10, overflow);
    // A gtest assertion rather than assert(): it is still checked when NDEBUG is defined.
    ASSERT_FALSE(overflow) << i;
    scaled += Int128("1234567895");
    c2.values[idx] = scaled;
  }
  structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOptions;
  rowReaderOptions.setUseTightNumericVector(true);
  rowReaderOptions.setReadType(readType);
  rowReaderOptions.setTimezoneName(readerTimezoneName);
  auto rowReader = reader->createRowReader(rowReaderOptions);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC1 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC2 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[1]);
  EXPECT_EQ(TEST_CASES, readBatch->numElements);
  for (int i = 0; i < TEST_CASES; i++) {
    size_t idx = static_cast<size_t>(i);
    EXPECT_TRUE(readC1.notNull[idx]) << i;
    EXPECT_TRUE(readC2.notNull[idx]) << i;
    EXPECT_EQ(readerTimezone.convertToUTC(readC1.data[idx]), ts[0][idx]) << timeStrings[idx];
    EXPECT_TRUE(readC1.nanoseconds[idx] == 123400000) << timeStrings[idx];
    EXPECT_EQ(readC2.data[idx], ts[1][idx]) << timeStrings[idx];
    // The tenth fractional digit (5) does not fit in nanoseconds; the expected result differs
    // by one nanosecond depending on the sign of the seconds value.
    if (readC2.data[idx] < 0) {
      EXPECT_EQ(readC2.nanoseconds[idx], 123456790) << timeStrings[idx];
    } else {
      EXPECT_EQ(readC2.nanoseconds[idx], 123456789) << timeStrings[idx];
    }
  }
}
// Writes three decimal(14,3) columns and reads them back as char(5), varchar(5) and string.
// Row i stores i.123, i.456 and i.789. The expectations below require the char(5) and varchar(5)
// columns to hold only the first five characters of the decimal text, while the string column
// holds the complete text.
TEST(ConvertColumnReader, TestConvertDecimalToStringVariant) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<c1:decimal(14,3),c2:decimal(14,3),c3:decimal(14,3)>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:char(5),c2:varchar(5),c3:string>"));
  WriterOptions options;
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
  auto& c1 = dynamic_cast<Decimal64VectorBatch&>(*structBatch->fields[0]);
  auto& c2 = dynamic_cast<Decimal64VectorBatch&>(*structBatch->fields[1]);
  auto& c3 = dynamic_cast<Decimal64VectorBatch&>(*structBatch->fields[2]);

  // Unscaled values for scale 3: i * 1000 + 123 is the decimal i.123, and so on.
  for (int i = 0; i < TEST_CASES; i++) {
    c1.values[i] = i * 1000 + 123;
    c2.values[i] = i * 1000 + 456;
    c3.values[i] = i * 1000 + 789;
  }
  // Row count and null flag are set on all three columns (c3 included) so that every child
  // agrees with the enclosing struct.
  structBatch->numElements = c1.numElements = c2.numElements = c3.numElements = TEST_CASES;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = c3.hasNulls = false;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOptions;
  rowReaderOptions.setUseTightNumericVector(true);
  rowReaderOptions.setReadType(readType);
  auto rowReader = reader->createRowReader(rowReaderOptions);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));
  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC1 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC2 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[1]);
  auto& readC3 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[2]);

  for (int i = 0; i < TEST_CASES; i++) {
    // The integer part grows from one to four digits, so fewer fraction digits fit into the
    // five-character columns as i gets larger.
    if (i < 10) {
      EXPECT_EQ(std::to_string(i) + ".123", std::string(readC1.data[i], readC1.length[i]));
      EXPECT_EQ(std::to_string(i) + ".456", std::string(readC2.data[i], readC2.length[i]));
    } else if (i < 100) {
      EXPECT_EQ(std::to_string(i) + ".12", std::string(readC1.data[i], readC1.length[i]));
      EXPECT_EQ(std::to_string(i) + ".45", std::string(readC2.data[i], readC2.length[i]));
    } else if (i < 1000) {
      EXPECT_EQ(std::to_string(i) + ".1", std::string(readC1.data[i], readC1.length[i]));
      EXPECT_EQ(std::to_string(i) + ".4", std::string(readC2.data[i], readC2.length[i]));
    } else {
      EXPECT_EQ(std::to_string(i) + ".", std::string(readC1.data[i], readC1.length[i]));
      EXPECT_EQ(std::to_string(i) + ".", std::string(readC2.data[i], readC2.length[i]));
    }
    // The unbounded string column always carries the full decimal text.
    EXPECT_EQ(std::to_string(i) + ".789", std::string(readC3.data[i], readC3.length[i]));
  }
}
// Reads char(25)/varchar(25)/string columns back as boolean/int/float.
// Row 0 is written as NULL. Rows 1-3 hold text the expectations below treat as convertible;
// rows 4-5 (a number with too many digits and the word "error") are expected to come back as
// NULL in every column.
TEST(ConvertColumnReader, TestConvertStringVariantToNumeric) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 6;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<c1:char(25),c2:varchar(25),c3:string>"));
  std::shared_ptr<Type> readType(Type::buildTypeFromString("struct<c1:boolean,c2:int,c3:float>"));

  WriterOptions options;
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get());

  // Source text per column. Index 0 is only a placeholder: row 0 is written as NULL and its
  // text is never handed to the batch.
  std::vector<std::string> charText{"", "123456", "0", "-1234567890", "999999999999999999999999",
                                    "error"};
  std::vector<std::string> varcharText{"", "123456", "0", "-1234567890",
                                       "999999999999999999999999", "error"};
  std::vector<std::string> stringText{
      "", "123456", "-0.0", "-123456789.0123", "1000000000000000000000000000000000000000",
      "error"};

  // Marks row 0 of one string column as NULL and points the remaining rows at `text`.
  auto fillColumn = [&](size_t fieldIndex, std::vector<std::string>& text) {
    auto& column = dynamic_cast<StringVectorBatch&>(*structBatch->fields[fieldIndex]);
    column.notNull[0] = false;
    for (int row = 1; row < TEST_CASES; ++row) {
      column.data[row] = text[row].data();
      column.length[row] = text[row].length();
      column.notNull[row] = true;
    }
    column.numElements = TEST_CASES;
    column.hasNulls = true;
  };
  fillColumn(0, charText);
  fillColumn(1, varcharText);
  fillColumn(2, stringText);
  structBatch->numElements = TEST_CASES;
  structBatch->hasNulls = true;
  writer->add(*batch);
  writer->close();

  auto pool = getDefaultPool();
  auto reader = createReader(
      *pool, std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength()));
  RowReaderOptions rowReaderOptions;
  rowReaderOptions.setUseTightNumericVector(true);
  rowReaderOptions.setReadType(readType);
  auto rowReader = reader->createRowReader(rowReaderOptions);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));

  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC1 = dynamic_cast<BooleanVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC2 = dynamic_cast<IntVectorBatch&>(*readStructBatch.fields[1]);
  auto& readC3 = dynamic_cast<FloatVectorBatch&>(*readStructBatch.fields[2]);

  // Row 0 was NULL on write and must stay NULL.
  EXPECT_FALSE(readC1.notNull[0]);
  EXPECT_FALSE(readC2.notNull[0]);
  EXPECT_FALSE(readC3.notNull[0]);
  // Rows 1-3 are expected to convert in every column.
  for (int row = 1; row <= 3; ++row) {
    EXPECT_TRUE(readC1.notNull[row]);
    EXPECT_TRUE(readC2.notNull[row]);
    EXPECT_TRUE(readC3.notNull[row]);
  }
  // The remaining rows are expected to be turned into NULL in every column.
  for (int row = 4; row < TEST_CASES; ++row) {
    EXPECT_FALSE(readC1.notNull[row]) << row;
    EXPECT_FALSE(readC2.notNull[row]) << row;
    EXPECT_FALSE(readC3.notNull[row]) << row;
  }

  // c1 (boolean): "123456" and "-1234567890" read as 1, "0" reads as 0.
  EXPECT_EQ(readC1.data[1], 1);
  EXPECT_EQ(readC1.data[2], 0);
  EXPECT_EQ(readC1.data[3], 1);
  // c2 (int): the text is read as its integer value.
  EXPECT_EQ(readC2.data[1], 123456);
  EXPECT_EQ(readC2.data[2], 0);
  EXPECT_EQ(readC2.data[3], -1234567890);
  // c3 (float): compared with float tolerance.
  EXPECT_FLOAT_EQ(readC3.data[1], 123456);
  EXPECT_FLOAT_EQ(readC3.data[2], -0.0);
  EXPECT_FLOAT_EQ(readC3.data[3], -123456789.0123);
}
// Converts between the string variants: char(5) -> string, varchar(5) -> char(4) and
// string -> varchar(4).
// Row 0 is NULL. Rows 1-3 hold "12345", "1" and "1234" in every column. Per the expectations
// below, values longer than the target length are cut to it, and values that pass through a
// char(N) type carry trailing spaces up to N characters.
TEST(ConvertColumnReader, TestConvertStringVariant) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 4;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(
      Type::buildTypeFromString("struct<c1:char(5),c2:varchar(5),c3:string>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:string,c2:char(4),c3:varchar(4)>"));
  WriterOptions options;
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
  auto& c1 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[0]);
  auto& c2 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[1]);
  auto& c3 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[2]);

  // Index 0 is a placeholder; row 0 is written as NULL.
  std::vector<std::string> raw1{"", "12345", "1", "1234"};
  std::vector<std::string> raw2{"", "12345", "1", "1234"};
  std::vector<std::string> raw3{"", "12345", "1", "1234"};
  c1.notNull[0] = c2.notNull[0] = c3.notNull[0] = false;
  for (int i = 1; i < TEST_CASES; i++) {
    c1.data[i] = raw1[i].data();
    c1.length[i] = raw1[i].length();
    c1.notNull[i] = true;
    c2.data[i] = raw2[i].data();
    c2.length[i] = raw2[i].length();
    c2.notNull[i] = true;
    c3.data[i] = raw3[i].data();
    c3.length[i] = raw3[i].length();
    c3.notNull[i] = true;
  }
  structBatch->numElements = c1.numElements = c2.numElements = c3.numElements = TEST_CASES;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = c3.hasNulls = true;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOptions;
  rowReaderOptions.setUseTightNumericVector(true);
  rowReaderOptions.setReadType(readType);
  auto rowReader = reader->createRowReader(rowReaderOptions);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));
  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC1 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC2 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[1]);
  auto& readC3 = dynamic_cast<StringVectorBatch&>(*readStructBatch.fields[2]);

  EXPECT_FALSE(readC1.notNull[0]);
  EXPECT_FALSE(readC2.notNull[0]);
  EXPECT_FALSE(readC3.notNull[0]);
  for (int i = 1; i < TEST_CASES; i++) {
    EXPECT_TRUE(readC1.notNull[i]);
    EXPECT_TRUE(readC2.notNull[i]);
    EXPECT_TRUE(readC3.notNull[i]);
  }

  // Row 1 ("12345"): fits char(5) exactly; cut to four characters for char(4) and varchar(4).
  EXPECT_EQ(std::string(readC1.data[1], readC1.length[1]), "12345");
  EXPECT_EQ(std::string(readC2.data[1], readC2.length[1]), "1234");
  EXPECT_EQ(std::string(readC3.data[1], readC3.length[1]), "1234");

  // Row 2 ("1"): the padding is built explicitly so the expected widths are unmistakable.
  // Row 3 shows a char(5) value arriving five characters wide ("1234" plus one space), so the
  // one-character value needs four trailing spaces in c1 and three when read as char(4) in c2.
  // The varchar(4) column c3 is not padded.
  EXPECT_EQ(std::string(readC1.data[2], readC1.length[2]), "1" + std::string(4, ' '));
  EXPECT_EQ(std::string(readC2.data[2], readC2.length[2]), "1" + std::string(3, ' '));
  EXPECT_EQ(std::string(readC3.data[2], readC3.length[2]), "1");

  // Row 3 ("1234"): one padding space from char(5); already four characters for the others.
  EXPECT_EQ(std::string(readC1.data[3], readC1.length[3]), "1234 ");
  EXPECT_EQ(std::string(readC2.data[3], readC2.length[3]), "1234");
  EXPECT_EQ(std::string(readC3.data[3], readC3.length[3]), "1234");
}
// Converts a day count relative to 1970-01-01 into a (year, month, day) triple of the civil
// calendar; month is in [1, 12] and day in [1, 31].
// Precondition: z lies in [numeric_limits<Int>::min(), numeric_limits<Int>::max() - 719468].
// The computation works on 400-year eras of 146097 days whose years start on March 1st, which
// places the leap day at the very end of a year.
template <class Int>
constexpr std::tuple<int, unsigned, unsigned> civil_from_days(Int z) noexcept {
  static_assert(std::numeric_limits<unsigned>::digits >= 18,
                "This algorithm has not been ported to a 16 bit unsigned integer");
  static_assert(std::numeric_limits<Int>::digits >= 20,
                "This algorithm has not been ported to a 16 bit signed integer");
  // Rebase the day count so that day 0 is 0000-03-01.
  const Int shifted = z + 719468;
  // Floor division by the era length, also for negative day counts.
  const Int era = (shifted >= 0 ? shifted : shifted - 146096) / 146097;
  const unsigned dayOfEra = static_cast<unsigned>(shifted - era * 146097);  // [0, 146096]
  const unsigned yearOfEra =
      (dayOfEra - dayOfEra / 1460 + dayOfEra / 36524 - dayOfEra / 146096) / 365;  // [0, 399]
  const unsigned dayOfYear =
      dayOfEra - (365 * yearOfEra + yearOfEra / 4 - yearOfEra / 100);  // [0, 365]
  const unsigned monthFromMarch = (5 * dayOfYear + 2) / 153;             // [0, 11]
  const unsigned dayOfMonth = dayOfYear - (153 * monthFromMarch + 2) / 5 + 1;  // [1, 31]
  const unsigned month = monthFromMarch < 10 ? monthFromMarch + 3 : monthFromMarch - 9;  // [1, 12]
  // January and February belong to the next civil year of the March-based year.
  const Int marchBasedYear = static_cast<Int>(yearOfEra) + era * 400;
  const Int civilYear = month <= 2 ? marchBasedYear + 1 : marchBasedYear;
  return std::tuple<int, unsigned, unsigned>(civilYear, month, dayOfMonth);
}
// Formats a timestamp as "YYYY-MM-DD HH:MM:SS[.fraction] <zoneName>" for comparisons in tests.
//   seconds  - timestamp seconds; passed through Timezone::convertToUTC() of the named zone
//              before being split into calendar fields.
//   nanos    - sub-second part, assumed to lie in [0, 999999999] (TODO confirm against callers).
//              Zero omits the fraction entirely. Otherwise trailing zeros are dropped while
//              leading zeros are kept, so 1000000 ns is printed as ".001" and not ".1".
//   zoneName - name handed to getTimezoneByName() and appended verbatim to the result.
static std::string timestampToString(int64_t seconds, int64_t nanos,
                                     const std::string& zoneName) {
  const auto& timezone = getTimezoneByName(zoneName);
  const int64_t adjusted = static_cast<int64_t>(timezone.convertToUTC(seconds));

  // Floor division: instants before 1970 map to the previous day with a non-negative
  // second-of-day.
  constexpr int64_t SECONDS_PER_DAY = 3600 * 24;
  int64_t day = adjusted / SECONDS_PER_DAY;
  int64_t secondOfDay = adjusted % SECONDS_PER_DAY;
  if (secondOfDay < 0) {
    secondOfDay += SECONDS_PER_DAY;
    --day;
  }
  const auto [year, month, dayOfMonth] = civil_from_days(day);

  // Every argument is given exactly the type its conversion specifier expects, so the call does
  // not depend on the platform's width of time_t or long.
  char buffer[100];
  std::snprintf(buffer, sizeof(buffer), "%04d-%02u-%02u %02d:%02d:%02d", year, month, dayOfMonth,
                static_cast<int>(secondOfDay / 3600), static_cast<int>(secondOfDay % 3600 / 60),
                static_cast<int>(secondOfDay % 60));
  std::string result(buffer);

  if (nanos != 0) {
    // Nine fraction digits minus the stripped trailing zeros; zero-padding to that width keeps
    // the leading zeros of the fraction.
    int width = 9;
    while (nanos % 10 == 0) {
      nanos /= 10;
      --width;
    }
    char fraction[32];
    std::snprintf(fraction, sizeof(fraction), "%0*lld", width, static_cast<long long>(nanos));
    result += ".";
    result += fraction;
  }
  result += " " + zoneName;
  return result;
}
// Reads string columns back as timestamp (c1) and timestamp with local time zone (c2).
// Three batches are written to one file:
//   1. TEST_CASES rows "YYYY-MM-27 12:34:56.789" (c2 additionally carries the writer time zone),
//   2. TEST_CASES rows with the same dates but without a fraction,
//   3. three rows of malformed or NULL input.
// The first reader throws on conversion problems and only consumes the 2 * TEST_CASES well-formed
// rows. The second reader has throwing disabled and is expected to return NULL for every row of
// the third batch.
TEST(ConvertColumnReader, TestConvertStringVariantToTimestamp) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  const std::string writerTimezone = "America/New_York";
  const std::string readerTimezone = "Australia/Sydney";
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(Type::buildTypeFromString("struct<c1:string,c2:string>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:timestamp,c2:timestamp with local time zone>"));
  WriterOptions options;
  options.setTimezoneName(writerTimezone);
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
  auto& c1 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[0]);
  auto& c2 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[1]);

  // The batch only stores pointers into these vectors. Capacity is reserved up front so that no
  // reallocation moves the strings while rows are appended.
  std::vector<std::string> raw1, raw2;
  raw1.reserve(TEST_CASES * 3);
  raw2.reserve(TEST_CASES * 3);

  // Points one cell of a string column at text owned by raw1/raw2.
  auto pointAt = [](StringVectorBatch& column, int row, const std::string& stored) {
    column.data[row] = const_cast<char*>(stored.c_str());
    column.length[row] = stored.length();
  };
  // Appends `text` to raw1 and `text` plus the writer time zone to raw2, then binds row `row` of
  // c1 and c2 to the two new strings.
  auto stageRow = [&](int row, const char* text, int length) {
    raw1.emplace_back(text, length);
    raw2.push_back(raw1.back() + " " + writerTimezone);
    pointAt(c1, row, raw1.back());
    pointAt(c2, row, raw2.back());
  };

  // Batch 1: one timestamp per month starting in January 1960, with a millisecond fraction.
  for (int i = 0; i < TEST_CASES; i++) {
    char buff[100];
    auto size = ::snprintf(buff, sizeof(buff), "%04d-%02d-27 12:34:56.789", 1960 + (i / 12),
                           (i % 12) + 1);
    stageRow(i, buff, size);
  }
  structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false;
  writer->add(*batch);

  // Batch 2: the same dates without a fraction.
  for (int i = 0; i < TEST_CASES; i++) {
    char buff[100];
    auto size =
        ::snprintf(buff, sizeof(buff), "%04d-%02d-27 12:34:56", 1960 + (i / 12), (i % 12) + 1);
    stageRow(i, buff, size);
  }
  structBatch->numElements = c1.numElements = c2.numElements = TEST_CASES;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false;
  writer->add(*batch);

  // Batch 3: input that is expected to read back as NULL when throwing is disabled.
  {
    // Row 0: '?' instead of '-' in the c1 date; c2 ends in "tz/error" (presumably not a known
    // time zone name).
    raw1.push_back("2024?11-14 00:01:02");
    raw2.push_back("2024-01-02 03:04:05.678 tz/error");
    pointAt(c1, 0, raw1.back());
    pointAt(c2, 0, raw2.back());
    // Row 1: NULL on write.
    c1.notNull[1] = false;
    c2.notNull[1] = false;
    // Row 2: "-1" as the c1 fraction; c2 has no time zone suffix at all.
    raw1.push_back("2024-12-14 00:01:02.-1");
    raw2.push_back("2024-01-02 03:04:05.678");
    pointAt(c1, 2, raw1.back());
    pointAt(c2, 2, raw2.back());
  }
  structBatch->numElements = c1.numElements = c2.numElements = 3;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = true;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOptions;
  rowReaderOptions.setUseTightNumericVector(true);
  rowReaderOptions.setReadType(readType);
  rowReaderOptions.setTimezoneName(readerTimezone);
  rowReaderOptions.throwOnSchemaEvolutionOverflow(true);
  auto rowReader = reader->createRowReader(rowReaderOptions);
  auto readBatch = rowReader->createRowBatch(TEST_CASES * 2);
  EXPECT_EQ(true, rowReader->next(*readBatch));
  // The loop below inspects 2 * TEST_CASES rows, so make sure that many were actually read.
  EXPECT_EQ(TEST_CASES * 2, readBatch->numElements);
  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC1 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[0]);
  auto& readC2 = dynamic_cast<TimestampVectorBatch&>(*readStructBatch.fields[1]);
  for (int i = 0; i < TEST_CASES * 2; i++) {
    EXPECT_TRUE(readC1.notNull[i]);
    EXPECT_TRUE(readC2.notNull[i]);
    // c1 must print as the original text in the reader zone; c2 must reproduce the original
    // text including the writer zone name.
    EXPECT_EQ(raw1[i] + " " + readerTimezone,
              timestampToString(readC1.data[i], readC1.nanoseconds[i], readerTimezone));
    EXPECT_EQ(raw2[i], timestampToString(readC2.data[i], readC2.nanoseconds[i], writerTimezone));
  }

  // Second pass without throwing: the first next() consumes the well-formed rows again, the
  // second one returns the three rows of batch 3, all of which are expected to be NULL.
  rowReaderOptions.throwOnSchemaEvolutionOverflow(false);
  rowReader = reader->createRowReader(rowReaderOptions);
  EXPECT_EQ(true, rowReader->next(*readBatch));
  EXPECT_EQ(true, rowReader->next(*readBatch));
  EXPECT_EQ(3, readBatch->numElements);
  EXPECT_FALSE(readC1.notNull[0]);
  EXPECT_FALSE(readC2.notNull[0]);
  EXPECT_FALSE(readC1.notNull[1]);
  EXPECT_FALSE(readC2.notNull[1]);
  EXPECT_FALSE(readC1.notNull[2]);
  EXPECT_FALSE(readC2.notNull[2]);
}
// Reads two string columns holding the same text back as decimal(10,5) (Decimal64 batch) and
// decimal(25,10) (Decimal128 batch).
// The table below lists, per row, the source text, whether each target column is expected to be
// non-NULL after conversion, and the expected unscaled value for each target.
TEST(ConvertColumnReader, TestConvertStringVariantToDecimal) {
  constexpr int DEFAULT_MEM_STREAM_SIZE = 10 * 1024 * 1024;
  constexpr int TEST_CASES = 1024;
  MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE);
  std::unique_ptr<Type> fileType(Type::buildTypeFromString("struct<c1:string,c2:string>"));
  std::shared_ptr<Type> readType(
      Type::buildTypeFromString("struct<c1:decimal(10,5),c2:decimal(25,10)>"));
  WriterOptions options;
  auto writer = createWriter(*fileType, &memStream, options);
  auto batch = writer->createRowBatch(TEST_CASES);
  auto structBatch = dynamic_cast<StructVectorBatch*>(batch.get());
  auto& c1 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[0]);
  auto& c2 = dynamic_cast<StringVectorBatch&>(*structBatch->fields[1]);

  // <source_string, not_null_as_decimal64, not_null_as_decimal128, expected_int64, expected_int128>
  // A flag of true means the converted cell must be non-NULL and equal to the matching expected
  // value; false means the cell must be NULL and the expected value is an unused placeholder.
  const std::vector<std::tuple<std::string, bool, bool, int64_t, Int128>> rawDataAndExpected = {
      /* 0 */ {"123456789012345678901234567890123456789", false, false, int64_t(), Int128()},
      /* 1 */ {"123456789012345678901234567890.1234567890", false, false, int64_t(), Int128()},
      /* 2 */ {"-123456789012345678901234567890.1234567890", false, false, int64_t(), Int128()},
      /* 3 */ {"-foo.bar", false, false, int64_t(), Int128()},
      /* 4 */ {"-foo.123", false, false, int64_t(), Int128()},
      /* 5 */ {"-123.foo", false, false, int64_t(), Int128()},
      /* 6 */ {"-123foo.123", false, false, int64_t(), Int128()},
      /* 7 */ {"-123.123foo", false, false, int64_t(), Int128()},
      /* 8 */ {"-.", false, false, int64_t(), Int128()},
      /* 9 */ {"-", false, false, int64_t(), Int128()},
      /* 10 */ {".", false, false, int64_t(), Int128()},
      /* 11 */ {"", false, false, int64_t(), Int128()},
      /* 12 */ {".12345", false, false, int64_t(), Int128()},
      /* 13 */ {"12345.", false, false, int64_t(), Int128()},
      /* 14 */ {"-1", true, true, -100000LL, Int128("-10000000000")},
      /* 15 */ {"-1.0", true, true, -100000LL, Int128("-10000000000")},
      /* 16 */ {"1", true, true, 100000, Int128("10000000000")},
      /* 17 */ {"1.0", true, true, 100000, Int128("10000000000")},
      /* 18 */ {"12345", true, true, 1234500000, Int128("123450000000000")},
      /* 19 */ {"12345.12345", true, true, 1234512345LL, Int128("123451234500000")},
      /* 20 */ {"-12345.12345", true, true, -1234512345LL, Int128("-123451234500000")},
      /* 21 */ {"1234567890", false, true, int64_t(), Int128("12345678900000000000")},
      /* 22 */ {"-1234567890", false, true, int64_t(), Int128("-12345678900000000000")},
      /* 23 */ {"1234567890.123", false, true, int64_t(), Int128("12345678901230000000")},
      /* 24 */ {"-1234567890.1234567", false, true, int64_t(), Int128("-12345678901234567000")},
      /* 25 */ {"1234567890123.12345", false, true, int64_t(), Int128("12345678901231234500000")},
      /* 26 */
      {"-1234567890123.12345678901", false, true, int64_t(), Int128("-12345678901231234567890")}};
  const size_t rowCount = rawDataAndExpected.size();

  for (size_t i = 0; i < rowCount; i++) {
    // Both columns point at the same source text; it is owned by the table and outlives the
    // writer calls below.
    const std::string& text = std::get<0>(rawDataAndExpected[i]);
    c1.data[i] = c2.data[i] = const_cast<char*>(text.c_str());
    c1.length[i] = c2.length[i] = text.length();
  }
  structBatch->numElements = c1.numElements = c2.numElements = rowCount;
  structBatch->hasNulls = c1.hasNulls = c2.hasNulls = false;
  writer->add(*batch);
  writer->close();

  auto inStream = std::make_unique<MemoryInputStream>(memStream.getData(), memStream.getLength());
  auto pool = getDefaultPool();
  auto reader = createReader(*pool, std::move(inStream));
  RowReaderOptions rowReaderOptions;
  rowReaderOptions.setUseTightNumericVector(true);
  rowReaderOptions.setReadType(readType);
  auto rowReader = reader->createRowReader(rowReaderOptions);
  auto readBatch = rowReader->createRowBatch(TEST_CASES);
  EXPECT_EQ(true, rowReader->next(*readBatch));
  auto& readStructBatch = dynamic_cast<StructVectorBatch&>(*readBatch);
  auto& readC1 = dynamic_cast<Decimal64VectorBatch&>(*readStructBatch.fields[0]);
  auto& readC2 = dynamic_cast<Decimal128VectorBatch&>(*readStructBatch.fields[1]);

  // Fatal on mismatch: the loop below indexes the table and the batch with the same row number.
  ASSERT_EQ(readBatch->numElements, rowCount);
  for (size_t i = 0; i < rowCount; i++) {
    const bool expectedNotNull1 = std::get<1>(rawDataAndExpected[i]);
    const bool expectedNotNull2 = std::get<2>(rawDataAndExpected[i]);
    EXPECT_EQ(expectedNotNull1, readC1.notNull[i]) << i;
    EXPECT_EQ(expectedNotNull2, readC2.notNull[i]) << i;
    // Values are only meaningful for cells that are expected to be non-NULL.
    if (expectedNotNull1) {
      EXPECT_EQ(std::get<3>(rawDataAndExpected[i]), readC1.values[i]) << i;
    }
    if (expectedNotNull2) {
      EXPECT_EQ(std::get<4>(rawDataAndExpected[i]), readC2.values[i]) << i;
    }
  }
}
} // namespace orc