| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #include <iostream> |
| #include <sstream> |
| #include <string> |
| |
| #include "storage/format/orc/exceptions.h" |
| #include "storage/format/orc/type-impl.h" |
| #include "storage/format/orc/vector.h" |
| |
| namespace orc { |
| Type::~Type() { |
| // PASS |
| } |
| |
| TypeImpl::TypeImpl(ORCTypeKind _kind) { |
| parent = nullptr; |
| columnId = -1; |
| maximumColumnId = -1; |
| kind = _kind; |
| maxLength = 0; |
| precision = 0; |
| scale = 0; |
| subtypeCount = 0; |
| } |
| |
| TypeImpl::TypeImpl(ORCTypeKind _kind, uint64_t _maxLength) { |
| parent = nullptr; |
| columnId = -1; |
| maximumColumnId = -1; |
| kind = _kind; |
| maxLength = _maxLength; |
| precision = 0; |
| scale = 0; |
| subtypeCount = 0; |
| } |
| |
| TypeImpl::TypeImpl(ORCTypeKind _kind, uint64_t _precision, uint64_t _scale) { |
| parent = nullptr; |
| columnId = -1; |
| maximumColumnId = -1; |
| kind = _kind; |
| maxLength = 0; |
| precision = _precision; |
| scale = _scale; |
| subtypeCount = 0; |
| } |
| |
| uint64_t TypeImpl::assignIds(uint64_t root) const { |
| columnId = static_cast<int64_t>(root); |
| uint64_t current = root + 1; |
| for (uint64_t i = 0; i < subtypeCount; ++i) { |
| current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current); |
| } |
| maximumColumnId = static_cast<int64_t>(current) - 1; |
| return current; |
| } |
| |
| TypeImpl::~TypeImpl() { |
| for (std::vector<Type*>::iterator it = subTypes.begin(); it != subTypes.end(); |
| it++) { |
| delete (*it); |
| } |
| } |
| |
| void TypeImpl::ensureIdAssigned() const { |
| if (columnId == -1) { |
| const TypeImpl* root = this; |
| while (root->parent != nullptr) { |
| root = root->parent; |
| } |
| root->assignIds(0); |
| } |
| } |
| |
| uint64_t TypeImpl::getColumnId() const { |
| ensureIdAssigned(); |
| return static_cast<uint64_t>(columnId); |
| } |
| |
| uint64_t TypeImpl::getMaximumColumnId() const { |
| ensureIdAssigned(); |
| return static_cast<uint64_t>(maximumColumnId); |
| } |
| |
| ORCTypeKind TypeImpl::getKind() const { return kind; } |
| |
| uint64_t TypeImpl::getSubtypeCount() const { return subtypeCount; } |
| |
| const Type* TypeImpl::getSubtype(uint64_t i) const { return subTypes[i]; } |
| |
| const std::string& TypeImpl::getFieldName(uint64_t i) const { |
| return fieldNames[i]; |
| } |
| |
| uint64_t TypeImpl::getMaximumLength() const { return maxLength; } |
| |
| uint64_t TypeImpl::getPrecision() const { return precision; } |
| |
| uint64_t TypeImpl::getScale() const { return scale; } |
| |
| void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) { |
| columnId = static_cast<int64_t>(_columnId); |
| maximumColumnId = static_cast<int64_t>(_maxColumnId); |
| } |
| |
| void TypeImpl::addChildType(std::unique_ptr<Type> childType) { |
| TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release()); |
| subTypes.push_back(child); |
| if (child != nullptr) { |
| child->parent = this; |
| } |
| subtypeCount += 1; |
| } |
| |
| Type* TypeImpl::addStructField(const std::string& fieldName, |
| std::unique_ptr<Type> fieldType) { |
| addChildType(std::move(fieldType)); |
| fieldNames.push_back(fieldName); |
| return this; |
| } |
| |
| Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) { |
| addChildType(std::move(fieldType)); |
| return this; |
| } |
| |
| std::string TypeImpl::toString() const { |
| switch (static_cast<int64_t>(kind)) { |
| case BOOLEAN: |
| return "boolean"; |
| case BYTE: |
| return "tinyint"; |
| case SHORT: |
| return "smallint"; |
| case INT: |
| return "int"; |
| case LONG: |
| return "bigint"; |
| case FLOAT: |
| return "float"; |
| case DOUBLE: |
| return "double"; |
| case STRING: |
| return "string"; |
| case BINARY: |
| return "binary"; |
| case TIMESTAMP: |
| return "timestamp"; |
| case LIST: |
| return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; |
| case MAP: |
| return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," + |
| (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; |
| case STRUCT: { |
| std::string result = "struct<"; |
| for (size_t i = 0; i < subTypes.size(); ++i) { |
| if (i != 0) { |
| result += ","; |
| } |
| result += fieldNames[i]; |
| result += ":"; |
| result += subTypes[i]->toString(); |
| } |
| result += ">"; |
| return result; |
| } |
| case UNION: { |
| std::string result = "uniontype<"; |
| for (size_t i = 0; i < subTypes.size(); ++i) { |
| if (i != 0) { |
| result += ","; |
| } |
| result += subTypes[i]->toString(); |
| } |
| result += ">"; |
| return result; |
| } |
| case DECIMAL: { |
| std::stringstream result; |
| result << "decimal(" << precision << "," << scale << ")"; |
| return result.str(); |
| } |
| case DATE: |
| return "date"; |
| case TIME: |
| return "time"; |
| case VARCHAR: { |
| std::stringstream result; |
| result << "varchar(" << maxLength << ")"; |
| return result.str(); |
| } |
| case CHAR: { |
| std::stringstream result; |
| result << "char(" << maxLength << ")"; |
| return result.str(); |
| } |
| default: |
| LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type"); |
| } |
| } |
| |
| std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch( |
| uint64_t capacity, dbcommon::MemoryPool& pool) const { |
| switch (kind) { |
| case BYTE: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new ByteVectorBatch(capacity, pool)); |
| |
| case INT: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new IntVectorBatch(capacity, pool)); |
| |
| case SHORT: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new ShortVectorBatch(capacity, pool)); |
| |
| case LONG: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new LongVectorBatch(capacity, pool)); |
| |
| case FLOAT: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new FloatVectorBatch(capacity, pool)); |
| |
| case DOUBLE: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new DoubleVectorBatch(capacity, pool)); |
| |
| case STRING: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new StringVectorBatch(capacity, pool)); |
| case BINARY: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new BinaryVectorBatch(capacity, pool)); |
| case CHAR: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new BlankPaddedCharVectorBatch(capacity, pool, maxLength)); |
| case VARCHAR: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new VaryingCharVectorBatch(capacity, pool, maxLength)); |
| |
| case BOOLEAN: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new BooleanVectorBatch(capacity, pool)); |
| |
| case DATE: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new DateVectorBatch(capacity, pool)); |
| |
| case TIME: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new TimeVectorBatch(capacity, pool)); |
| |
| case TIMESTAMP: |
| return std::unique_ptr<ColumnVectorBatch>( |
| new TimestampVectorBatch(capacity, pool)); |
| |
| case STRUCT: { |
| StructVectorBatch* result = new StructVectorBatch(capacity, pool); |
| for (uint64_t i = 0; i < getSubtypeCount(); ++i) { |
| result->fields.push_back( |
| getSubtype(i)->createRowBatch(capacity, pool).release()); |
| } |
| return std::unique_ptr<ColumnVectorBatch>(result); |
| } |
| |
| case LIST: { |
| ListVectorBatch* result = new ListVectorBatch(capacity, pool); |
| if (getSubtype(0) != nullptr) { |
| result->elements = getSubtype(0)->createRowBatch(capacity, pool); |
| } |
| return std::unique_ptr<ColumnVectorBatch>(result); |
| } |
| |
| case MAP: { |
| MapVectorBatch* result = new MapVectorBatch(capacity, pool); |
| if (getSubtype(0) != nullptr) { |
| result->keys = getSubtype(0)->createRowBatch(capacity, pool); |
| } |
| if (getSubtype(1) != nullptr) { |
| result->elements = getSubtype(1)->createRowBatch(capacity, pool); |
| } |
| return std::unique_ptr<ColumnVectorBatch>(result); |
| } |
| |
| case DECIMAL: { |
| if (getPrecision() == 0 || getPrecision() > 18) { |
| return std::unique_ptr<ColumnVectorBatch>( |
| new Decimal128VectorBatch(capacity, pool)); |
| } else { |
| return std::unique_ptr<ColumnVectorBatch>( |
| new Decimal64VectorBatch(capacity, pool)); |
| } |
| } |
| |
| case UNION: { |
| UnionVectorBatch* result = new UnionVectorBatch(capacity, pool); |
| for (uint64_t i = 0; i < getSubtypeCount(); ++i) { |
| result->children.push_back( |
| getSubtype(i)->createRowBatch(capacity, pool).release()); |
| } |
| return std::unique_ptr<ColumnVectorBatch>(result); |
| } |
| |
| default: |
| LOG_ERROR(ERRCODE_INTERNAL_ERROR, "not supported yet"); |
| } |
| } |
| |
| std::unique_ptr<Type> createPrimitiveType(ORCTypeKind kind) { |
| return std::unique_ptr<Type>(new TypeImpl(kind)); |
| } |
| |
| std::unique_ptr<Type> createCharType(ORCTypeKind kind, uint64_t maxLength) { |
| return std::unique_ptr<Type>(new TypeImpl(kind, maxLength)); |
| } |
| |
| std::unique_ptr<Type> createDecimalType(uint64_t precision, uint64_t scale) { |
| return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); |
| } |
| |
| std::unique_ptr<Type> createStructType() { |
| return std::unique_ptr<Type>(new TypeImpl(STRUCT)); |
| } |
| |
| std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) { |
| TypeImpl* result = new TypeImpl(LIST); |
| result->addChildType(std::move(elements)); |
| return std::unique_ptr<Type>(result); |
| } |
| |
| std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, |
| std::unique_ptr<Type> value) { |
| TypeImpl* result = new TypeImpl(MAP); |
| result->addChildType(std::move(key)); |
| result->addChildType(std::move(value)); |
| return std::unique_ptr<Type>(result); |
| } |
| |
| std::unique_ptr<Type> createUnionType() { |
| return std::unique_ptr<Type>(new TypeImpl(UNION)); |
| } |
| |
| std::string printProtobufMessage(const google::protobuf::Message& message); |
| |
| std::unique_ptr<Type> convertType(const proto::Type& type, |
| const proto::Footer& footer) { |
| switch (static_cast<int64_t>(type.kind())) { |
| case proto::Type_Kind_BOOLEAN: |
| case proto::Type_Kind_BYTE: |
| case proto::Type_Kind_SHORT: |
| case proto::Type_Kind_INT: |
| case proto::Type_Kind_LONG: |
| case proto::Type_Kind_FLOAT: |
| case proto::Type_Kind_DOUBLE: |
| case proto::Type_Kind_STRING: |
| case proto::Type_Kind_BINARY: |
| case proto::Type_Kind_TIMESTAMP: |
| case proto::Type_Kind_DATE: |
| case proto::Type_Kind_TIME: |
| return std::unique_ptr<Type>( |
| new TypeImpl(static_cast<ORCTypeKind>(type.kind()))); |
| |
| case proto::Type_Kind_CHAR: |
| case proto::Type_Kind_VARCHAR: |
| return std::unique_ptr<Type>(new TypeImpl( |
| static_cast<ORCTypeKind>(type.kind()), type.maximumlength())); |
| |
| case proto::Type_Kind_DECIMAL: |
| return std::unique_ptr<Type>( |
| new TypeImpl(DECIMAL, type.precision(), type.scale())); |
| |
| case proto::Type_Kind_LIST: |
| case proto::Type_Kind_MAP: |
| case proto::Type_Kind_UNION: { |
| TypeImpl* result = new TypeImpl(static_cast<ORCTypeKind>(type.kind())); |
| for (int i = 0; i < type.subtypes_size(); ++i) { |
| result->addUnionChild(convertType( |
| footer.types(static_cast<int>(type.subtypes(i))), footer)); |
| } |
| return std::unique_ptr<Type>(result); |
| } |
| |
| case proto::Type_Kind_STRUCT: { |
| TypeImpl* result = new TypeImpl(STRUCT); |
| uint64_t size = static_cast<uint64_t>(type.subtypes_size()); |
| std::vector<Type*> typeList(size); |
| std::vector<std::string> fieldList(size); |
| for (int i = 0; i < type.subtypes_size(); ++i) { |
| result->addStructField( |
| type.fieldnames(i), |
| convertType(footer.types(static_cast<int>(type.subtypes(i))), |
| footer)); |
| } |
| return std::unique_ptr<Type>(result); |
| } |
| default: |
| LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type kind"); |
| } |
| } |
| |
| /** |
| * Build a clone of the file type, projecting columns from the selected |
| * vector. This routine assumes that the parent of any selected column |
| * is also selected. The column ids are copied from the fileType. |
| * @param fileType the type in the file |
| * @param selected is each column by id selected |
| * @return a clone of the fileType filtered by the selection array |
| */ |
| std::unique_ptr<Type> buildSelectedType(const Type* fileType, |
| const std::vector<bool>& selected) { |
| if (fileType == nullptr || !selected[fileType->getColumnId()]) { |
| return std::unique_ptr<Type>(); |
| } |
| |
| TypeImpl* result; |
| switch (static_cast<int32_t>(fileType->getKind())) { |
| case BOOLEAN: |
| case BYTE: |
| case SHORT: |
| case INT: |
| case LONG: |
| case FLOAT: |
| case DOUBLE: |
| case STRING: |
| case BINARY: |
| case TIMESTAMP: |
| case DATE: |
| case TIME: |
| result = new TypeImpl(fileType->getKind()); |
| break; |
| |
| case DECIMAL: |
| result = new TypeImpl(fileType->getKind(), fileType->getPrecision(), |
| fileType->getScale()); |
| break; |
| |
| case VARCHAR: |
| case CHAR: |
| result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength()); |
| break; |
| |
| case LIST: |
| result = new TypeImpl(fileType->getKind()); |
| result->addChildType( |
| buildSelectedType(fileType->getSubtype(0), selected)); |
| break; |
| |
| case MAP: |
| result = new TypeImpl(fileType->getKind()); |
| result->addChildType( |
| buildSelectedType(fileType->getSubtype(0), selected)); |
| result->addChildType( |
| buildSelectedType(fileType->getSubtype(1), selected)); |
| break; |
| |
| case STRUCT: { |
| result = new TypeImpl(fileType->getKind()); |
| for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) { |
| std::unique_ptr<Type> childType = |
| buildSelectedType(fileType->getSubtype(child), selected); |
| if (childType.get() != nullptr) { |
| result->addStructField(fileType->getFieldName(child), |
| std::move(childType)); |
| } |
| } |
| break; |
| } |
| |
| case UNION: { |
| result = new TypeImpl(fileType->getKind()); |
| for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) { |
| std::unique_ptr<Type> childType = |
| buildSelectedType(fileType->getSubtype(child), selected); |
| if (childType.get() != nullptr) { |
| result->addUnionChild(std::move(childType)); |
| } |
| } |
| break; |
| } |
| |
| default: |
| LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type kind"); |
| } |
| result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); |
| return std::unique_ptr<Type>(result); |
| } |
| |
| } // namespace orc |