// blob: 37b987270897b3ae80640728bff746220baac327 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <iostream>
#include <sstream>
#include <string>
#include "storage/format/orc/exceptions.h"
#include "storage/format/orc/type-impl.h"
#include "storage/format/orc/vector.h"
namespace orc {
// Out-of-line definition of the Type destructor; it performs no work of its
// own.
Type::~Type() = default;
/**
 * Construct a type node that carries only a kind.
 *
 * Maximum length, precision and scale are zeroed. The node starts with no
 * parent and no children, and both column ids are set to -1;
 * ensureIdAssigned() treats columnId == -1 as "not assigned yet".
 *
 * @param _kind the kind stored in this node
 */
TypeImpl::TypeImpl(ORCTypeKind _kind) {
  // What the node is.
  kind = _kind;

  // Size attributes: none supplied by this overload.
  scale = 0;
  precision = 0;
  maxLength = 0;

  // Tree linkage: detached and childless.
  subtypeCount = 0;
  parent = nullptr;

  // Column ids are filled in later.
  maximumColumnId = -1;
  columnId = -1;
}
/**
 * Construct a type node that carries a kind and a maximum length.
 *
 * Precision and scale are zeroed. The node starts with no parent and no
 * children, and both column ids are set to -1; ensureIdAssigned() treats
 * columnId == -1 as "not assigned yet".
 *
 * @param _kind the kind stored in this node
 * @param _maxLength the value later returned by getMaximumLength()
 */
TypeImpl::TypeImpl(ORCTypeKind _kind, uint64_t _maxLength) {
  // What the node is, and its length bound.
  kind = _kind;
  maxLength = _maxLength;

  // Decimal attributes are not supplied by this overload.
  scale = 0;
  precision = 0;

  // Tree linkage: detached and childless.
  subtypeCount = 0;
  parent = nullptr;

  // Column ids are filled in later.
  maximumColumnId = -1;
  columnId = -1;
}
/**
 * Construct a type node that carries a kind plus precision and scale.
 *
 * Maximum length is zeroed. The node starts with no parent and no children,
 * and both column ids are set to -1; ensureIdAssigned() treats
 * columnId == -1 as "not assigned yet".
 *
 * @param _kind the kind stored in this node
 * @param _precision the value later returned by getPrecision()
 * @param _scale the value later returned by getScale()
 */
TypeImpl::TypeImpl(ORCTypeKind _kind, uint64_t _precision, uint64_t _scale) {
  // What the node is, and its numeric attributes.
  kind = _kind;
  precision = _precision;
  scale = _scale;

  // A length bound is not supplied by this overload.
  maxLength = 0;

  // Tree linkage: detached and childless.
  subtypeCount = 0;
  parent = nullptr;

  // Column ids are filled in later.
  maximumColumnId = -1;
  columnId = -1;
}
uint64_t TypeImpl::assignIds(uint64_t root) const {
columnId = static_cast<int64_t>(root);
uint64_t current = root + 1;
for (uint64_t i = 0; i < subtypeCount; ++i) {
current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current);
}
maximumColumnId = static_cast<int64_t>(current) - 1;
return current;
}
// Deletes every child pointer held in subTypes; this node owns its children.
TypeImpl::~TypeImpl() {
  for (Type* child : subTypes) {
    delete child;
  }
}
void TypeImpl::ensureIdAssigned() const {
if (columnId == -1) {
const TypeImpl* root = this;
while (root->parent != nullptr) {
root = root->parent;
}
root->assignIds(0);
}
}
// Returns this node's column id, assigning ids to the tree first if needed.
uint64_t TypeImpl::getColumnId() const {
  ensureIdAssigned();
  const auto id = columnId;
  return static_cast<uint64_t>(id);
}
// Returns the stored maximum column id for this node, assigning ids to the
// tree first if needed.
uint64_t TypeImpl::getMaximumColumnId() const {
  ensureIdAssigned();
  const auto lastId = maximumColumnId;
  return static_cast<uint64_t>(lastId);
}
// Returns the kind this node was constructed with.
ORCTypeKind TypeImpl::getKind() const {
  return kind;
}
// Returns the number of child slots added so far.
uint64_t TypeImpl::getSubtypeCount() const {
  return subtypeCount;
}
// Returns the child stored at slot i (may be null). The index is not
// range-checked here.
const Type* TypeImpl::getSubtype(uint64_t i) const {
  const Type* child = subTypes[i];
  return child;
}
// Returns a reference to the i-th recorded field name. The index is not
// range-checked here.
const std::string& TypeImpl::getFieldName(uint64_t i) const {
  const std::string& name = fieldNames[i];
  return name;
}
// Returns the stored maximum length (0 unless given at construction).
uint64_t TypeImpl::getMaximumLength() const {
  return maxLength;
}
// Returns the stored precision (0 unless given at construction).
uint64_t TypeImpl::getPrecision() const {
  return precision;
}
// Returns the stored scale (0 unless given at construction).
uint64_t TypeImpl::getScale() const {
  return scale;
}
/**
 * Overwrite this node's column id and maximum column id with explicit
 * values. Children are not touched.
 *
 * @param _columnId the id to store for this node
 * @param _maxColumnId the maximum column id to store for this node
 */
void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) {
  const int64_t lastId = static_cast<int64_t>(_maxColumnId);
  const int64_t ownId = static_cast<int64_t>(_columnId);
  maximumColumnId = lastId;
  columnId = ownId;
}
void TypeImpl::addChildType(std::unique_ptr<Type> childType) {
TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release());
subTypes.push_back(child);
if (child != nullptr) {
child->parent = this;
}
subtypeCount += 1;
}
/**
 * Append a named child: the child is added first, then its name is recorded.
 *
 * @param fieldName the name recorded for the new child
 * @param fieldType the child to adopt
 * @return this node, so calls can be chained
 */
Type* TypeImpl::addStructField(const std::string& fieldName,
                               std::unique_ptr<Type> fieldType) {
  this->addChildType(std::move(fieldType));
  this->fieldNames.emplace_back(fieldName);
  return this;
}
/**
 * Append an unnamed child.
 *
 * @param fieldType the child to adopt
 * @return this node, so calls can be chained
 */
Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) {
  this->addChildType(std::move(fieldType));
  return this;
}
std::string TypeImpl::toString() const {
switch (static_cast<int64_t>(kind)) {
case BOOLEAN:
return "boolean";
case BYTE:
return "tinyint";
case SHORT:
return "smallint";
case INT:
return "int";
case LONG:
return "bigint";
case FLOAT:
return "float";
case DOUBLE:
return "double";
case STRING:
return "string";
case BINARY:
return "binary";
case TIMESTAMP:
return "timestamp";
case LIST:
return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">";
case MAP:
return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," +
(subTypes[1] ? subTypes[1]->toString() : "void") + ">";
case STRUCT: {
std::string result = "struct<";
for (size_t i = 0; i < subTypes.size(); ++i) {
if (i != 0) {
result += ",";
}
result += fieldNames[i];
result += ":";
result += subTypes[i]->toString();
}
result += ">";
return result;
}
case UNION: {
std::string result = "uniontype<";
for (size_t i = 0; i < subTypes.size(); ++i) {
if (i != 0) {
result += ",";
}
result += subTypes[i]->toString();
}
result += ">";
return result;
}
case DECIMAL: {
std::stringstream result;
result << "decimal(" << precision << "," << scale << ")";
return result.str();
}
case DATE:
return "date";
case TIME:
return "time";
case VARCHAR: {
std::stringstream result;
result << "varchar(" << maxLength << ")";
return result.str();
}
case CHAR: {
std::stringstream result;
result << "char(" << maxLength << ")";
return result.str();
}
default:
LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type");
}
}
/**
 * Create a column vector batch matching this type.
 *
 * Primitive kinds map to one batch class each. CHAR and VARCHAR also pass
 * the maximum length. DECIMAL uses Decimal128VectorBatch when the precision
 * is 0 or above 18, and Decimal64VectorBatch otherwise. STRUCT, LIST, MAP
 * and UNION build their child batches recursively with the same capacity
 * and pool.
 *
 * Composite batches are owned by a smart pointer from the moment they are
 * allocated, so a failure while building a child no longer leaks the
 * partially built parent. Child batches are handed over only after they have
 * been stored. LIST and MAP skip child slots that are missing as well as
 * ones that are null.
 *
 * @param capacity passed to every batch constructor
 * @param pool passed to every batch constructor
 * @return the new batch; for an unsupported kind LOG_ERROR is invoked
 *         (presumably it does not return -- TODO confirm) and an empty
 *         pointer is returned if it does
 */
std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(
    uint64_t capacity, dbcommon::MemoryPool& pool) const {
  switch (kind) {
    case BYTE:
      return std::unique_ptr<ColumnVectorBatch>(
          new ByteVectorBatch(capacity, pool));
    case INT:
      return std::unique_ptr<ColumnVectorBatch>(
          new IntVectorBatch(capacity, pool));
    case SHORT:
      return std::unique_ptr<ColumnVectorBatch>(
          new ShortVectorBatch(capacity, pool));
    case LONG:
      return std::unique_ptr<ColumnVectorBatch>(
          new LongVectorBatch(capacity, pool));
    case FLOAT:
      return std::unique_ptr<ColumnVectorBatch>(
          new FloatVectorBatch(capacity, pool));
    case DOUBLE:
      return std::unique_ptr<ColumnVectorBatch>(
          new DoubleVectorBatch(capacity, pool));
    case STRING:
      return std::unique_ptr<ColumnVectorBatch>(
          new StringVectorBatch(capacity, pool));
    case BINARY:
      return std::unique_ptr<ColumnVectorBatch>(
          new BinaryVectorBatch(capacity, pool));
    case CHAR:
      return std::unique_ptr<ColumnVectorBatch>(
          new BlankPaddedCharVectorBatch(capacity, pool, maxLength));
    case VARCHAR:
      return std::unique_ptr<ColumnVectorBatch>(
          new VaryingCharVectorBatch(capacity, pool, maxLength));
    case BOOLEAN:
      return std::unique_ptr<ColumnVectorBatch>(
          new BooleanVectorBatch(capacity, pool));
    case DATE:
      return std::unique_ptr<ColumnVectorBatch>(
          new DateVectorBatch(capacity, pool));
    case TIME:
      return std::unique_ptr<ColumnVectorBatch>(
          new TimeVectorBatch(capacity, pool));
    case TIMESTAMP:
      return std::unique_ptr<ColumnVectorBatch>(
          new TimestampVectorBatch(capacity, pool));
    case STRUCT: {
      StructVectorBatch* batch = new StructVectorBatch(capacity, pool);
      std::unique_ptr<ColumnVectorBatch> owner(batch);
      for (uint64_t i = 0; i < getSubtypeCount(); ++i) {
        std::unique_ptr<ColumnVectorBatch> field =
            getSubtype(i)->createRowBatch(capacity, pool);
        // Store first, release second: nothing leaks if push_back throws.
        batch->fields.push_back(field.get());
        field.release();
      }
      return owner;
    }
    case LIST: {
      ListVectorBatch* batch = new ListVectorBatch(capacity, pool);
      std::unique_ptr<ColumnVectorBatch> owner(batch);
      if (getSubtypeCount() > 0 && getSubtype(0) != nullptr) {
        batch->elements = getSubtype(0)->createRowBatch(capacity, pool);
      }
      return owner;
    }
    case MAP: {
      MapVectorBatch* batch = new MapVectorBatch(capacity, pool);
      std::unique_ptr<ColumnVectorBatch> owner(batch);
      if (getSubtypeCount() > 0 && getSubtype(0) != nullptr) {
        batch->keys = getSubtype(0)->createRowBatch(capacity, pool);
      }
      if (getSubtypeCount() > 1 && getSubtype(1) != nullptr) {
        batch->elements = getSubtype(1)->createRowBatch(capacity, pool);
      }
      return owner;
    }
    case DECIMAL: {
      if (getPrecision() == 0 || getPrecision() > 18) {
        return std::unique_ptr<ColumnVectorBatch>(
            new Decimal128VectorBatch(capacity, pool));
      } else {
        return std::unique_ptr<ColumnVectorBatch>(
            new Decimal64VectorBatch(capacity, pool));
      }
    }
    case UNION: {
      UnionVectorBatch* batch = new UnionVectorBatch(capacity, pool);
      std::unique_ptr<ColumnVectorBatch> owner(batch);
      for (uint64_t i = 0; i < getSubtypeCount(); ++i) {
        std::unique_ptr<ColumnVectorBatch> child =
            getSubtype(i)->createRowBatch(capacity, pool);
        // Store first, release second: nothing leaks if push_back throws.
        batch->children.push_back(child.get());
        child.release();
      }
      return owner;
    }
    default:
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "not supported yet");
  }
  // Only reached if LOG_ERROR returns; avoids falling off the end of a
  // value-returning function.
  return std::unique_ptr<ColumnVectorBatch>();
}
// Creates a childless type node of the given kind.
std::unique_ptr<Type> createPrimitiveType(ORCTypeKind kind) {
  std::unique_ptr<Type> node(new TypeImpl(kind));
  return node;
}
// Creates a type node of the given kind that also records a maximum length.
std::unique_ptr<Type> createCharType(ORCTypeKind kind, uint64_t maxLength) {
  std::unique_ptr<Type> node(new TypeImpl(kind, maxLength));
  return node;
}
// Creates a DECIMAL type node with the given precision and scale.
std::unique_ptr<Type> createDecimalType(uint64_t precision, uint64_t scale) {
  std::unique_ptr<Type> node(new TypeImpl(DECIMAL, precision, scale));
  return node;
}
// Creates an empty STRUCT type node; fields are added by the caller.
std::unique_ptr<Type> createStructType() {
  std::unique_ptr<Type> node(new TypeImpl(STRUCT));
  return node;
}
std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) {
TypeImpl* result = new TypeImpl(LIST);
result->addChildType(std::move(elements));
return std::unique_ptr<Type>(result);
}
std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
std::unique_ptr<Type> value) {
TypeImpl* result = new TypeImpl(MAP);
result->addChildType(std::move(key));
result->addChildType(std::move(value));
return std::unique_ptr<Type>(result);
}
// Creates an empty UNION type node; children are added by the caller.
std::unique_ptr<Type> createUnionType() {
  std::unique_ptr<Type> node(new TypeImpl(UNION));
  return node;
}
// Forward declaration only: takes a protobuf message and returns a string.
// The definition is not part of the code visible here.
std::string printProtobufMessage(const google::protobuf::Message& message);
/**
 * Build a Type tree from a protobuf type description.
 *
 * Primitive kinds become a childless node; CHAR and VARCHAR also carry the
 * maximum length; DECIMAL carries precision and scale. LIST, MAP, UNION and
 * STRUCT convert each referenced subtype recursively, looking the subtype up
 * in `footer` by the index stored in `type`. STRUCT children are added
 * together with their field names.
 *
 * The subtype indexes and the field-name count come from file metadata, so
 * they are validated before use. A composite node is owned by a smart
 * pointer while its children are converted, so it is not leaked if a nested
 * conversion fails.
 *
 * @param type the protobuf description of the node to convert
 * @param footer the footer holding the full list of type descriptions
 * @return the converted type; for invalid input LOG_ERROR is invoked
 *         (presumably it does not return -- TODO confirm)
 */
std::unique_ptr<Type> convertType(const proto::Type& type,
                                  const proto::Footer& footer) {
  // Converts the i-th subtype referenced by `type`, checking that the
  // reference points inside the footer's type list.
  auto convertChild = [&type, &footer](int i) -> std::unique_ptr<Type> {
    const int64_t childIndex = static_cast<int64_t>(type.subtypes(i));
    if (childIndex < 0 || childIndex >= footer.types_size()) {
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Subtype index out of range");
    }
    return convertType(footer.types(static_cast<int>(childIndex)), footer);
  };

  switch (static_cast<int64_t>(type.kind())) {
    case proto::Type_Kind_BOOLEAN:
    case proto::Type_Kind_BYTE:
    case proto::Type_Kind_SHORT:
    case proto::Type_Kind_INT:
    case proto::Type_Kind_LONG:
    case proto::Type_Kind_FLOAT:
    case proto::Type_Kind_DOUBLE:
    case proto::Type_Kind_STRING:
    case proto::Type_Kind_BINARY:
    case proto::Type_Kind_TIMESTAMP:
    case proto::Type_Kind_DATE:
    case proto::Type_Kind_TIME:
      return std::unique_ptr<Type>(
          new TypeImpl(static_cast<ORCTypeKind>(type.kind())));
    case proto::Type_Kind_CHAR:
    case proto::Type_Kind_VARCHAR:
      return std::unique_ptr<Type>(new TypeImpl(
          static_cast<ORCTypeKind>(type.kind()), type.maximumlength()));
    case proto::Type_Kind_DECIMAL:
      return std::unique_ptr<Type>(
          new TypeImpl(DECIMAL, type.precision(), type.scale()));
    case proto::Type_Kind_LIST:
    case proto::Type_Kind_MAP:
    case proto::Type_Kind_UNION: {
      TypeImpl* impl = new TypeImpl(static_cast<ORCTypeKind>(type.kind()));
      std::unique_ptr<Type> result(impl);
      for (int i = 0; i < type.subtypes_size(); ++i) {
        impl->addUnionChild(convertChild(i));
      }
      return result;
    }
    case proto::Type_Kind_STRUCT: {
      // Every subtype needs a name; fieldnames(i) is read for each one.
      if (type.fieldnames_size() < type.subtypes_size()) {
        LOG_ERROR(ERRCODE_INTERNAL_ERROR,
                  "Struct type has fewer field names than subtypes");
      }
      TypeImpl* impl = new TypeImpl(STRUCT);
      std::unique_ptr<Type> result(impl);
      for (int i = 0; i < type.subtypes_size(); ++i) {
        impl->addStructField(type.fieldnames(i), convertChild(i));
      }
      return result;
    }
    default:
      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type kind");
  }
  // Only reached if LOG_ERROR returns; avoids falling off the end of a
  // value-returning function.
  return std::unique_ptr<Type>();
}
/**
* Build a clone of the file type, projecting columns from the selected
* vector. This routine assumes that the parent of any selected column
* is also selected. The column ids are copied from the fileType.
* @param fileType the type in the file
* @param selected is each column by id selected
* @return a clone of the fileType filtered by the selection array
*/
std::unique_ptr<Type> buildSelectedType(const Type* fileType,
const std::vector<bool>& selected) {
if (fileType == nullptr || !selected[fileType->getColumnId()]) {
return std::unique_ptr<Type>();
}
TypeImpl* result;
switch (static_cast<int32_t>(fileType->getKind())) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case STRING:
case BINARY:
case TIMESTAMP:
case DATE:
case TIME:
result = new TypeImpl(fileType->getKind());
break;
case DECIMAL:
result = new TypeImpl(fileType->getKind(), fileType->getPrecision(),
fileType->getScale());
break;
case VARCHAR:
case CHAR:
result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength());
break;
case LIST:
result = new TypeImpl(fileType->getKind());
result->addChildType(
buildSelectedType(fileType->getSubtype(0), selected));
break;
case MAP:
result = new TypeImpl(fileType->getKind());
result->addChildType(
buildSelectedType(fileType->getSubtype(0), selected));
result->addChildType(
buildSelectedType(fileType->getSubtype(1), selected));
break;
case STRUCT: {
result = new TypeImpl(fileType->getKind());
for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) {
std::unique_ptr<Type> childType =
buildSelectedType(fileType->getSubtype(child), selected);
if (childType.get() != nullptr) {
result->addStructField(fileType->getFieldName(child),
std::move(childType));
}
}
break;
}
case UNION: {
result = new TypeImpl(fileType->getKind());
for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) {
std::unique_ptr<Type> childType =
buildSelectedType(fileType->getSubtype(child), selected);
if (childType.get() != nullptr) {
result->addUnionChild(std::move(childType));
}
}
break;
}
default:
LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type kind");
}
result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
return std::unique_ptr<Type>(result);
}
} // namespace orc