blob: 896d1db3d7d3bb8dd5a92c3f66ec4063f386b860 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "storage/format/orc/column-printer.h"
#include <time.h>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <typeinfo>
#ifdef __clang__
#pragma clang diagnostic ignored "-Wformat-security"
#endif
namespace orc {
// Appends the single character `ch` to the end of `file`.
static void writeChar(std::string& file, char ch) {  // NOLINT
  file.push_back(ch);
}
// Appends the NUL-terminated C string `ptr` to the end of `file`.
// The terminating NUL itself is not copied.
void writeString(std::string& file, const char* ptr) {  // NOLINT
  file.append(ptr);
}
// Binds the printer to the output string `_buffer` and starts out with no
// null information (hasNulls == false, notNull == nullptr) until reset() is
// called with a batch.
ColumnPrinter::ColumnPrinter(std::string& _buffer)  // NOLINT
    : buffer(_buffer) {
  hasNulls = false;
  notNull = nullptr;
}
// Nothing to release at this level.
ColumnPrinter::~ColumnPrinter() = default;
// Captures the null information of `batch`: remembers whether the batch has
// nulls and, only in that case, points `notNull` at the batch's not-null
// array. Without nulls `notNull` is cleared to nullptr.
void ColumnPrinter::reset(const ColumnVectorBatch& batch) {
  hasNulls = batch.hasNulls;
  notNull = hasNulls ? batch.notNull.data() : nullptr;
}
std::unique_ptr<ColumnPrinter> createColumnPrinter(
std::string& buffer, // NOLINT
const Type* type) {
ColumnPrinter* result = nullptr;
if (type == nullptr) {
result = new VoidColumnPrinter(buffer);
} else {
switch (static_cast<int64_t>(type->getKind())) {
case BOOLEAN:
result = new BooleanColumnPrinter(buffer);
break;
case SHORT:
result = new ShortColumnPrinter(buffer);
break;
case INT:
result = new IntColumnPrinter(buffer);
break;
case BYTE:
case LONG:
result = new LongColumnPrinter(buffer);
break;
case FLOAT:
result = new FloatColumnPrinter(buffer, *type);
break;
case DOUBLE:
result = new DoubleColumnPrinter(buffer, *type);
break;
case STRING:
case VARCHAR:
case CHAR:
result = new StringColumnPrinter(buffer);
break;
case BINARY:
result = new BinaryColumnPrinter(buffer);
break;
case TIMESTAMP:
result = new TimestampColumnPrinter(buffer);
break;
case LIST:
result = new ListColumnPrinter(buffer, *type);
break;
case MAP:
result = new MapColumnPrinter(buffer, *type);
break;
case STRUCT:
result = new StructColumnPrinter(buffer, *type);
break;
case DECIMAL:
if (type->getPrecision() == 0 || type->getPrecision() > 18) {
result = new Decimal128ColumnPrinter(buffer);
} else {
result = new Decimal64ColumnPrinter(buffer);
}
break;
case DATE:
result = new DateColumnPrinter(buffer);
break;
case TIME:
result = new TimeColumnPrinter(buffer);
break;
case UNION:
result = new UnionColumnPrinter(buffer, *type);
break;
default:
LOG_ERROR(ERRCODE_INTERNAL_ERROR, "unknown batch type");
}
}
return std::unique_ptr<ColumnPrinter>(result);
}
// Printer used when no type is available; it only needs the output buffer.
VoidColumnPrinter::VoidColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer) {}
// Intentionally ignores the batch: this printer reads no column data.
void VoidColumnPrinter::reset(const ColumnVectorBatch& /*batch*/) {}
// Every row is printed as the literal `null`, regardless of the row id.
void VoidColumnPrinter::printRow(uint64_t /*rowId*/) {
  writeString(buffer, "null");
}
// Creates a float printer writing into `buffer`. The type argument is
// accepted but not used; the data pointer stays null until reset().
FloatColumnPrinter::FloatColumnPrinter(std::string& buffer,
                                       const Type& /*type*/)  // NOLINT
    : ColumnPrinter(buffer), data(nullptr) {}
// Rebinds the printer to `batch`. The reference dynamic_cast fails with
// std::bad_cast if `batch` is not a FloatVectorBatch.
void FloatColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& floatBatch = dynamic_cast<const FloatVectorBatch&>(batch);
  data = floatBatch.data.data();
}
// Prints row `rowId` as `null`, or as the value formatted with "%.7g".
void FloatColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  char text[64];
  snprintf(text, sizeof(text), "%.7g", data[rowId]);
  writeString(buffer, text);
}
// Creates a double printer writing into `buffer`. The type argument is
// accepted but not used; the data pointer stays null until reset().
DoubleColumnPrinter::DoubleColumnPrinter(std::string& buffer,
                                         const Type& /*type*/)  // NOLINT
    : ColumnPrinter(buffer), data(nullptr) {}
// Rebinds the printer to `batch`. The reference dynamic_cast fails with
// std::bad_cast if `batch` is not a DoubleVectorBatch.
void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& doubleBatch = dynamic_cast<const DoubleVectorBatch&>(batch);
  data = doubleBatch.data.data();
}
// Prints row `rowId` as `null`, or as the value formatted with "%.14g".
void DoubleColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  char text[64];
  snprintf(text, sizeof(text), "%.14g", data[rowId]);
  writeString(buffer, text);
}
// Starts with no data bound and a scale of zero; reset() supplies both.
Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), data(nullptr), scale(0) {}
// Rebinds the printer to `batch`, picking up both the value array and the
// batch's scale. Fails with std::bad_cast if `batch` is not a
// Decimal64VectorBatch.
void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& decimalBatch = dynamic_cast<const Decimal64VectorBatch&>(batch);
  data = decimalBatch.values.data();
  scale = decimalBatch.scale;
}
// Renders the unscaled integer `value` as a plain decimal string with
// `scale` digits after the decimal point, i.e. value * 10^(-scale).
//
//   toDecimalString(12345, 2) == "123.45"
//   toDecimalString(-7, 3)    == "-0.007"
//   toDecimalString(42, 0)    == "42"
//
// A negative scale appends -scale trailing zeros (zero itself stays "0")
// instead of failing inside substr().
std::string toDecimalString(int64_t value, int32_t scale) {
  const bool negative = value < 0;
  // Negate in unsigned arithmetic: -INT64_MIN does not fit in int64_t, so
  // negating the signed value would overflow.
  const uint64_t magnitude =
      negative ? uint64_t{0} - static_cast<uint64_t>(value)
               : static_cast<uint64_t>(value);
  const std::string digits = std::to_string(magnitude);

  std::string result;
  if (negative) {
    result += '-';
  }

  if (scale <= 0) {
    result += digits;
    if (scale < 0 && magnitude != 0) {
      // Widen before negating so that INT32_MIN is handled as well.
      result.append(static_cast<size_t>(-static_cast<int64_t>(scale)), '0');
    }
    return result;
  }

  const size_t fractionDigits = static_cast<size_t>(scale);
  if (digits.length() > fractionDigits) {
    // Enough digits for a non-empty integer part: split at the point.
    const size_t pointPos = digits.length() - fractionDigits;
    result.append(digits, 0, pointPos);
    result += '.';
    result.append(digits, pointPos, fractionDigits);
  } else {
    // Pure fraction: left-pad the digits with zeros up to `scale` places.
    result += "0.";
    result.append(fractionDigits - digits.length(), '0');
    result += digits;
  }
  return result;
}
// Prints row `rowId` as `null`, or as the unquoted decimal text produced by
// toDecimalString() for the stored value and the batch scale.
void Decimal64ColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const std::string text = toDecimalString(data[rowId], scale);
  writeString(buffer, text.c_str());
}
// Starts with no data bound and a scale of zero; reset() supplies both.
Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), data(nullptr), scale(0) {}
// Rebinds the printer to `batch`, picking up both the value array and the
// batch's scale. Fails with std::bad_cast if `batch` is not a
// Decimal128VectorBatch.
void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& decimalBatch = dynamic_cast<const Decimal128VectorBatch&>(batch);
  data = decimalBatch.values.data();
  scale = decimalBatch.scale;
}
// Prints row `rowId` as `null`, or as the text returned by the value's own
// toDecimalString(scale).
void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const auto text = data[rowId].toDecimalString(scale);
  writeString(buffer, text.c_str());
}
// Starts with neither the value pointers nor the lengths bound; reset()
// supplies both.
StringColumnPrinter::StringColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), start(nullptr), length(nullptr) {}
// Rebinds the printer to `batch`, taking its per-row start pointers and
// lengths. Fails with std::bad_cast if `batch` is not a BytesVectorBatch.
void StringColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& bytesBatch = dynamic_cast<const BytesVectorBatch&>(batch);
  start = bytesBatch.data.data();
  length = bytesBatch.length.data();
}
// Prints row `rowId` as `null`, or as a double-quoted string with JSON-style
// escaping.
//
// Backslash, double quote, \b, \f, \n, \r and \t use their two-character
// escapes. Every other byte below 0x20 (including an embedded NUL) is written
// as \u00XX, because raw control characters are not allowed inside a JSON
// string. All remaining bytes, including bytes >= 0x80, are copied unchanged.
void StringColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  writeChar(buffer, '"');
  for (int64_t i = 0; i < length[rowId]; ++i) {
    const char ch = static_cast<char>(start[rowId][i]);
    switch (ch) {
      case '\\':
        writeString(buffer, "\\\\");
        break;
      case '\b':
        writeString(buffer, "\\b");
        break;
      case '\f':
        writeString(buffer, "\\f");
        break;
      case '\n':
        writeString(buffer, "\\n");
        break;
      case '\r':
        writeString(buffer, "\\r");
        break;
      case '\t':
        writeString(buffer, "\\t");
        break;
      case '"':
        writeString(buffer, "\\\"");
        break;
      default: {
        // Compare as unsigned so bytes >= 0x80 are not mistaken for
        // control characters when char is signed.
        const unsigned char byte = static_cast<unsigned char>(ch);
        if (byte < 0x20) {
          char escaped[8];
          snprintf(escaped, sizeof(escaped), "\\u%04x",
                   static_cast<unsigned int>(byte));
          writeString(buffer, escaped);
        } else {
          writeChar(buffer, ch);
        }
        break;
      }
    }
  }
  writeChar(buffer, '"');
}
// Builds the list printer together with a nested printer for the element
// type, which is subtype 0 of `type`. Both write into the same buffer.
ListColumnPrinter::ListColumnPrinter(std::string& buffer, const Type& type)
    : ColumnPrinter(buffer), offsets(nullptr) {
  const Type* elementType = type.getSubtype(0);
  elementPrinter = createColumnPrinter(buffer, elementType);
}
// Rebinds the printer to `batch` and forwards the batch's element vector to
// the nested element printer. Fails with std::bad_cast if `batch` is not a
// ListVectorBatch.
void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& listBatch = dynamic_cast<const ListVectorBatch&>(batch);
  offsets = listBatch.offsets.data();
  elementPrinter->reset(*listBatch.elements);
}
// Prints row `rowId` as `null`, or as `[e0, e1, ...]`. The elements of the
// row are the child rows offsets[rowId] .. offsets[rowId + 1] - 1, each
// rendered by the element printer.
void ListColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const auto first = offsets[rowId];
  const auto last = offsets[rowId + 1];
  writeChar(buffer, '[');
  for (int64_t child = first; child < last; ++child) {
    if (child != first) {
      writeString(buffer, ", ");
    }
    elementPrinter->printRow(static_cast<uint64_t>(child));
  }
  writeChar(buffer, ']');
}
// Builds the map printer together with nested printers for the key type
// (subtype 0) and the value type (subtype 1). All write into the same buffer.
MapColumnPrinter::MapColumnPrinter(std::string& buffer, const Type& type)
    : ColumnPrinter(buffer), offsets(nullptr) {
  const Type* keyType = type.getSubtype(0);
  keyPrinter = createColumnPrinter(buffer, keyType);
  const Type* valueType = type.getSubtype(1);
  elementPrinter = createColumnPrinter(buffer, valueType);
}
void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch);
offsets = myBatch.offsets.data();
keyPrinter->reset(*myBatch.keys);
elementPrinter->reset(*myBatch.elements);
}
// Prints row `rowId` as `null`, or as a list of entries of the form
// `[{"key": K, "value": V}, ...]`. The entries of the row are the child rows
// offsets[rowId] .. offsets[rowId + 1] - 1.
void MapColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const auto first = offsets[rowId];
  const auto last = offsets[rowId + 1];
  writeChar(buffer, '[');
  for (int64_t entry = first; entry < last; ++entry) {
    if (entry != first) {
      writeString(buffer, ", ");
    }
    const uint64_t childRow = static_cast<uint64_t>(entry);
    writeString(buffer, "{\"key\": ");
    keyPrinter->printRow(childRow);
    writeString(buffer, ", \"value\": ");
    elementPrinter->printRow(childRow);
    writeChar(buffer, '}');
  }
  writeChar(buffer, ']');
}
// Builds one nested printer per subtype of `type`, all writing into the same
// buffer. The printers are stored as raw owning pointers in `fieldPrinter`
// and deleted by the destructor.
//
// If creating or storing a child throws part-way through, the destructor of
// this partially constructed object will not run. The children created so
// far are therefore deleted here before the exception is rethrown.
UnionColumnPrinter::UnionColumnPrinter(std::string& buffer, const Type& type)
    : ColumnPrinter(buffer), tags(nullptr), offsets(nullptr) {
  try {
    for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
      std::unique_ptr<ColumnPrinter> child =
          createColumnPrinter(buffer, type.getSubtype(i));
      // Give up ownership only after push_back has succeeded.
      fieldPrinter.push_back(child.get());
      static_cast<void>(child.release());
    }
  } catch (...) {
    for (size_t i = 0; i < fieldPrinter.size(); ++i) {
      delete fieldPrinter[i];
    }
    throw;
  }
}
// Deletes the child printers whose ownership the constructor took over.
UnionColumnPrinter::~UnionColumnPrinter() {
  for (auto printer : fieldPrinter) {
    delete printer;
  }
}
void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const UnionVectorBatch& unionBatch =
dynamic_cast<const UnionVectorBatch&>(batch);
tags = unionBatch.tags.data();
offsets = unionBatch.offsets.data();
for (size_t i = 0; i < fieldPrinter.size(); ++i) {
fieldPrinter[i]->reset(*(unionBatch.children[i]));
}
}
// Prints row `rowId` as `null`, or as `{"tag": T, "value": V}`. T is the
// row's tag, and V is row offsets[rowId] of the child selected by that tag,
// rendered by that child's printer.
void UnionColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  writeString(buffer, "{\"tag\": ");
  char numBuffer[64];
  // "%lld" requires a long long argument; int64_t is plain long on LP64
  // platforms, so convert explicitly to keep the varargs call well-defined.
  snprintf(numBuffer, sizeof(numBuffer), "%lld",
           static_cast<long long>(tags[rowId]));
  writeString(buffer, numBuffer);
  writeString(buffer, ", \"value\": ");
  fieldPrinter[tags[rowId]]->printRow(offsets[rowId]);
  writeChar(buffer, '}');
}
// For each subtype of `type`, records the field name and the subtype's
// toString() text, and builds a nested printer writing into the same buffer.
// The printers are stored as raw owning pointers in `fieldPrinter` and
// deleted by the destructor.
//
// If anything throws part-way through, the destructor of this partially
// constructed object will not run. The children created so far are therefore
// deleted here before the exception is rethrown.
StructColumnPrinter::StructColumnPrinter(std::string& buffer,
                                         const Type& type)  // NOLINT
    : ColumnPrinter(buffer) {
  try {
    for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
      fieldNames.push_back(type.getFieldName(i));
      fieldTypes.push_back(type.getSubtype(i)->toString());
      std::unique_ptr<ColumnPrinter> child =
          createColumnPrinter(buffer, type.getSubtype(i));
      // Give up ownership only after push_back has succeeded.
      fieldPrinter.push_back(child.get());
      static_cast<void>(child.release());
    }
  } catch (...) {
    for (size_t i = 0; i < fieldPrinter.size(); ++i) {
      delete fieldPrinter[i];
    }
    throw;
  }
}
// Deletes the child printers whose ownership the constructor took over.
StructColumnPrinter::~StructColumnPrinter() {
  for (auto printer : fieldPrinter) {
    delete printer;
  }
}
void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const StructVectorBatch& structBatch =
dynamic_cast<const StructVectorBatch&>(batch);
for (size_t i = 0; i < fieldPrinter.size(); ++i) {
fieldPrinter[i]->reset(*(structBatch.fields[i]));
}
}
// Prints row `rowId` as `null`, or as `{"name-type": value, ...}` with one
// entry per field. The key is the field name and the field type text joined
// by '-', and the value is the same row rendered by that field's printer.
void StructColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  writeChar(buffer, '{');
  for (size_t field = 0; field < fieldPrinter.size(); ++field) {
    if (field > 0) {
      writeString(buffer, ", ");
    }
    writeChar(buffer, '"');
    writeString(buffer, fieldNames[field].c_str());
    writeChar(buffer, '-');
    writeString(buffer, fieldTypes[field].c_str());
    writeString(buffer, "\": ");
    fieldPrinter[field]->printRow(rowId);
  }
  writeChar(buffer, '}');
}
// Starts with no data bound; reset() supplies the value array.
DateColumnPrinter::DateColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), data(nullptr) {}
// Prints row `rowId` as `null`, or as a double-quoted "YYYY-MM-DD" date.
// The stored value is multiplied by 24 * 60 * 60 to obtain a time_t and
// broken down in UTC with gmtime_r().
//
// If the value cannot be converted or formatted, an empty quoted string is
// written instead of unspecified buffer contents.
void DateColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const time_t timeValue = data[rowId] * 24 * 60 * 60;
  struct tm tmValue;
  // Room for years of more than four digits: with only 11 bytes strftime()
  // would return 0 and leave the buffer contents unspecified.
  char timeBuffer[32];
  if (gmtime_r(&timeValue, &tmValue) == nullptr ||
      strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue) == 0) {
    timeBuffer[0] = '\0';
  }
  writeChar(buffer, '"');
  writeString(buffer, timeBuffer);
  writeChar(buffer, '"');
}
// Rebinds the printer to `batch`. Fails with std::bad_cast if `batch` is not
// a LongVectorBatch.
void DateColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& longBatch = dynamic_cast<const LongVectorBatch&>(batch);
  data = longBatch.data.data();
}
// Starts with no data bound; reset() supplies the value array.
TimeColumnPrinter::TimeColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), data(nullptr) {}
// Prints row `rowId` as `null`, or as a double-quoted "YYYY-MM-DD" string.
// The stored value is multiplied by 24 * 60 * 60 to obtain a time_t and
// broken down in UTC with gmtime_r().
//
// NOTE(review): this treats a TIME value exactly like a DATE (a day count
// rendered as a calendar date). Confirm the unit of TIME values and whether
// a time-of-day format was intended.
//
// If the value cannot be converted or formatted, an empty quoted string is
// written instead of unspecified buffer contents.
void TimeColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const time_t timeValue = data[rowId] * 24 * 60 * 60;
  struct tm tmValue;
  // Room for years of more than four digits: with only 11 bytes strftime()
  // would return 0 and leave the buffer contents unspecified.
  char timeBuffer[32];
  if (gmtime_r(&timeValue, &tmValue) == nullptr ||
      strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue) == 0) {
    timeBuffer[0] = '\0';
  }
  writeChar(buffer, '"');
  writeString(buffer, timeBuffer);
  writeChar(buffer, '"');
}
// Rebinds the printer to `batch`. Fails with std::bad_cast if `batch` is not
// a LongVectorBatch.
void TimeColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& longBatch = dynamic_cast<const LongVectorBatch&>(batch);
  data = longBatch.data.data();
}
// Starts with no data bound; reset() supplies the value array.
BooleanColumnPrinter::BooleanColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), data(nullptr) {}
// Prints row `rowId` as `null`, `true` (non-zero value) or `false` (zero).
void BooleanColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  const char* literal = data[rowId] ? "true" : "false";
  writeString(buffer, literal);
}
// Rebinds the printer to `batch`. Fails with std::bad_cast if `batch` is not
// a LongVectorBatch.
void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& longBatch = dynamic_cast<const LongVectorBatch&>(batch);
  data = longBatch.data.data();
}
// Starts with neither the value pointers nor the lengths bound; reset()
// supplies both.
BinaryColumnPrinter::BinaryColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), start(nullptr), length(nullptr) {}
// Prints row `rowId` as `null`, or as a list of its bytes in decimal, e.g.
// `[0, 255, 16]`. Each byte is masked with 0xff so it prints as 0..255.
void BinaryColumnPrinter::printRow(uint64_t rowId) {
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  writeChar(buffer, '[');
  for (int64_t pos = 0; pos < length[rowId]; ++pos) {
    if (pos > 0) {
      writeString(buffer, ", ");
    }
    const int byteValue = static_cast<int>(start[rowId][pos]) & 0xff;
    char text[64];
    snprintf(text, sizeof(text), "%d", byteValue);
    writeString(buffer, text);
  }
  writeChar(buffer, ']');
}
// Rebinds the printer to `batch`, taking its per-row start pointers and
// lengths. Fails with std::bad_cast if `batch` is not a BytesVectorBatch.
void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) {
  ColumnPrinter::reset(batch);
  const auto& bytesBatch = dynamic_cast<const BytesVectorBatch&>(batch);
  start = bytesBatch.data.data();
  length = bytesBatch.length.data();
}
// Starts with neither the seconds nor the nanoseconds array bound; reset()
// supplies both.
TimestampColumnPrinter::TimestampColumnPrinter(std::string& buffer)
    : ColumnPrinter(buffer), seconds(nullptr), nanoseconds(nullptr) {}
// Prints row `rowId` as `null`, or as a double-quoted
// "YYYY-MM-DD HH:MM:SS.fraction" timestamp. The seconds value is broken down
// in UTC with gmtime_r().
//
// The fraction is the nanoseconds value with trailing zeros removed and
// zero-padded on the left to (9 - removed zeros) digits; a nanoseconds value
// of 0 prints as a single "0".
//
// If the seconds value cannot be converted or formatted, the date/time part
// is left empty instead of printing unspecified buffer contents.
void TimestampColumnPrinter::printRow(uint64_t rowId) {
  const int64_t NANO_DIGITS = 9;
  if (hasNulls && !notNull[rowId]) {
    writeString(buffer, "null");
    return;
  }
  int64_t nanos = nanoseconds[rowId];
  time_t secs = static_cast<time_t>(seconds[rowId]);
  struct tm tmValue;
  // Room for years of more than four digits: with only 20 bytes strftime()
  // would return 0 and leave the buffer contents unspecified.
  char timeBuffer[40];
  if (gmtime_r(&secs, &tmValue) == nullptr ||
      strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S",
               &tmValue) == 0) {
    timeBuffer[0] = '\0';
  }
  writeChar(buffer, '"');
  writeString(buffer, timeBuffer);
  writeChar(buffer, '.');
  // Remove trailing zeros off the back of the nanos value.
  int64_t zeroDigits = 0;
  if (nanos == 0) {
    // Keep exactly one digit so that ".0" is printed.
    zeroDigits = 8;
  } else {
    while (nanos % 10 == 0) {
      nanos /= 10;
      zeroDigits += 1;
    }
  }
  char numBuffer[64];
  // "%lld" requires a long long argument; int64_t is plain long on LP64
  // platforms, so convert explicitly. The trailing \" closes the quote.
  snprintf(numBuffer, sizeof(numBuffer), "%0*lld\"",
           static_cast<int>(NANO_DIGITS - zeroDigits),
           static_cast<long long>(nanos));
  writeString(buffer, numBuffer);
}
void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
ColumnPrinter::reset(batch);
const TimestampVectorBatch& ts =
dynamic_cast<const TimestampVectorBatch&>(batch);
seconds = ts.data.data();
nanoseconds = ts.nanoseconds.data();
}
} // namespace orc