PARQUET-1990: [C++] Refuse to write ConvertedType::NA
ConvertedType::NA corresponds to an invalid converted type that was once added to the Parquet spec:
https://github.com/apache/parquet-format/pull/45
but then quickly removed in favour of the Null logical type:
https://github.com/apache/parquet-format/pull/51
Unfortunately, Parquet C++ could still in some cases emit the unofficial converted type.
Also remove the confusingly-named LogicalType::Unknown, while "UNKNOWN" in the Thrift specification points to LogicalType::Null.
Closes #9863 from pitrou/PARQUET-1990-invalid-converted-type
Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 1b921d5..dfd4bd8 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -87,11 +87,15 @@
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
<< TypeToString(descr->physical_type());
- if (descr->converted_type() != ConvertedType::NONE) {
- stream << "/" << ConvertedTypeToString(descr->converted_type());
+ const auto& logical_type = descr->logical_type();
+ if (!logical_type->is_none()) {
+ stream << " / " << logical_type->ToString();
}
- if (descr->converted_type() == ConvertedType::DECIMAL) {
- stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+ if (descr->converted_type() != ConvertedType::NONE) {
+ stream << " / " << ConvertedTypeToString(descr->converted_type());
+ if (descr->converted_type() == ConvertedType::DECIMAL) {
+ stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+ }
}
stream << ")" << std::endl;
}
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 0171034..7c2f9d7 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -317,7 +317,7 @@
Number of Real Columns: 2
Number of Columns: 2
Number of Selected Columns: 2
-Column 0: a.list.element.list.element.list.element (BYTE_ARRAY/UTF8)
+Column 0: a.list.element.list.element.list.element (BYTE_ARRAY / String / UTF8)
Column 1: b (INT32)
--- Row Group: 0 ---
--- Total Bytes: 155 ---
diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc
index f2a99c8..bfb295f 100644
--- a/cpp/src/parquet/schema.cc
+++ b/cpp/src/parquet/schema.cc
@@ -448,7 +448,7 @@
LogicalType::FromThrift(element->logicalType),
LoadEnumSafe(&element->type), element->type_length, field_id));
} else if (element->__isset.converted_type) {
- // legacy writer with logical type present
+ // legacy writer with converted type present
primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
element->name, LoadEnumSafe(&element->repetition_type),
LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
@@ -500,7 +500,16 @@
element->__set_name(name_);
element->__set_repetition_type(ToThrift(repetition_));
if (converted_type_ != ConvertedType::NONE) {
- element->__set_converted_type(ToThrift(converted_type_));
+ if (converted_type_ != ConvertedType::NA) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ } else {
+ // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
+ // Never emit it (see PARQUET-1990 for discussion).
+ if (!logical_type_ || !logical_type_->is_null()) {
+ throw ParquetException(
+ "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
+ }
+ }
}
if (field_id_ >= 0) {
element->__set_field_id(field_id_);
diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc
index c095e7a..43760d3 100644
--- a/cpp/src/parquet/schema_test.cc
+++ b/cpp/src/parquet/schema_test.cc
@@ -1101,12 +1101,12 @@
ASSERT_TRUE(reconstructed->is_valid());
ASSERT_TRUE(reconstructed->Equals(*original));
- // Unknown
- original = LogicalType::Unknown();
+ // Undefined
+ original = UndefinedLogicalType::Make();
ASSERT_TRUE(original->is_invalid());
ASSERT_FALSE(original->is_valid());
converted_type = original->ToConvertedType(&converted_decimal_metadata);
- ASSERT_EQ(converted_type, ConvertedType::NA);
+ ASSERT_EQ(converted_type, ConvertedType::UNDEFINED);
ASSERT_FALSE(converted_decimal_metadata.isset);
ASSERT_TRUE(original->is_compatible(converted_type, converted_decimal_metadata));
ASSERT_TRUE(original->is_compatible(converted_type));
@@ -1243,7 +1243,6 @@
{BSONLogicalType::Make(), false, true, true},
{UUIDLogicalType::Make(), false, true, true},
{NoLogicalType::Make(), false, false, true},
- {UnknownLogicalType::Make(), false, false, false},
};
for (const ExpectedProperties& c : cases) {
@@ -1339,7 +1338,7 @@
}
std::vector<std::shared_ptr<const LogicalType>> any_type_cases = {
- LogicalType::Null(), LogicalType::None(), LogicalType::Unknown()};
+ LogicalType::Null(), LogicalType::None(), UndefinedLogicalType::Make()};
for (auto c : any_type_cases) {
ConfirmAnyPrimitiveTypeApplicability(c);
@@ -1453,7 +1452,7 @@
};
std::vector<ExpectedRepresentation> cases = {
- {LogicalType::Unknown(), "Unknown", R"({"Type": "Unknown"})"},
+ {UndefinedLogicalType::Make(), "Undefined", R"({"Type": "Undefined"})"},
{LogicalType::String(), "String", R"({"Type": "String"})"},
{LogicalType::Map(), "Map", R"({"Type": "Map"})"},
{LogicalType::List(), "List", R"({"Type": "List"})"},
@@ -1550,7 +1549,6 @@
};
std::vector<ExpectedSortOrder> cases = {
- {LogicalType::Unknown(), SortOrder::UNKNOWN},
{LogicalType::String(), SortOrder::UNSIGNED},
{LogicalType::Map(), SortOrder::UNKNOWN},
{LogicalType::List(), SortOrder::UNKNOWN},
@@ -1888,8 +1886,6 @@
{"uuid", LogicalType::UUID(), Type::FIXED_LEN_BYTE_ARRAY, 16, false,
ConvertedType::NA, true, [this]() { return element_->logicalType.__isset.UUID; }},
{"none", LogicalType::None(), Type::INT64, -1, false, ConvertedType::NA, false,
- check_nothing},
- {"unknown", LogicalType::Unknown(), Type::INT64, -1, true, ConvertedType::NA, false,
check_nothing}};
for (const SchemaElementConstructionArguments& c : cases) {
diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index ba3fefd..c9e0269 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -256,6 +256,9 @@
static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
// item 0 is NONE
DCHECK_NE(type, ConvertedType::NONE);
+ // it is forbidden to emit "NA" (PARQUET-1990)
+ DCHECK_NE(type, ConvertedType::NA);
+ DCHECK_NE(type, ConvertedType::UNDEFINED);
return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
}
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index 3d3e7c7..4e5bcee 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -366,13 +366,14 @@
return JSONLogicalType::Make();
case ConvertedType::BSON:
return BSONLogicalType::Make();
+ case ConvertedType::NA:
+ return NullLogicalType::Make();
case ConvertedType::NONE:
return NoLogicalType::Make();
- case ConvertedType::NA:
case ConvertedType::UNDEFINED:
- return UnknownLogicalType::Make();
+ return UndefinedLogicalType::Make();
}
- return UnknownLogicalType::Make();
+ return UndefinedLogicalType::Make();
}
std::shared_ptr<const LogicalType> LogicalType::FromThrift(
@@ -483,10 +484,6 @@
std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
-std::shared_ptr<const LogicalType> LogicalType::Unknown() {
- return UnknownLogicalType::Make();
-}
-
/*
* The logical type implementation classes are built in four layers: (1) the base
* layer, which establishes the interface and provides generally reusable implementations
@@ -516,7 +513,7 @@
virtual std::string ToString() const = 0;
virtual bool is_serialized() const {
- return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNKNOWN);
+ return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
}
virtual std::string ToJSON() const {
@@ -567,14 +564,14 @@
class BSON;
class UUID;
class No;
- class Unknown;
+ class Undefined;
protected:
Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
Impl() = default;
private:
- LogicalType::Type::type type_ = LogicalType::Type::UNKNOWN;
+ LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
SortOrder::type order_ = SortOrder::UNKNOWN;
};
@@ -636,7 +633,9 @@
bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
-bool LogicalType::is_valid() const { return impl_->type() != LogicalType::Type::UNKNOWN; }
+bool LogicalType::is_valid() const {
+ return impl_->type() != LogicalType::Type::UNDEFINED;
+}
bool LogicalType::is_invalid() const { return !is_valid(); }
bool LogicalType::is_nested() const {
return (impl_->type() == LogicalType::Type::LIST) ||
@@ -1555,19 +1554,19 @@
GENERATE_MAKE(No)
-class LogicalType::Impl::Unknown final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::UniversalApplicable {
+class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
public:
- friend class UnknownLogicalType;
+ friend class UndefinedLogicalType;
- OVERRIDE_TOSTRING(Unknown)
+ OVERRIDE_TOSTRING(Undefined)
private:
- Unknown()
- : LogicalType::Impl(LogicalType::Type::UNKNOWN, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::NA) {}
+ Undefined()
+ : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
};
-GENERATE_MAKE(Unknown)
+GENERATE_MAKE(Undefined)
} // namespace parquet
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index 953da83..f3d3abf 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -71,7 +71,7 @@
// Mirrors parquet::ConvertedType
struct ConvertedType {
enum type {
- NONE,
+ NONE, // Not a real converted type, but means no converted type is specified
UTF8,
MAP,
MAP_KEY_VALUE,
@@ -94,9 +94,12 @@
JSON,
BSON,
INTERVAL,
+ // DEPRECATED INVALID ConvertedType for all-null data.
+ // Only useful for reading legacy files written out by interim Parquet C++ releases.
+ // For writing, always emit LogicalType::Null instead.
+ // See PARQUET-1990.
NA = 25,
- // Should always be last element.
- UNDEFINED = 26
+ UNDEFINED = 26 // Not a real converted type; should always be last element
};
};
@@ -140,7 +143,7 @@
public:
struct Type {
enum type {
- UNKNOWN = 0,
+ UNDEFINED = 0, // Not a real logical type
STRING = 1,
MAP,
LIST,
@@ -151,11 +154,11 @@
TIMESTAMP,
INTERVAL,
INT,
- NIL, // Thrift NullType
+ NIL, // Thrift NullType: annotates data that is always null
JSON,
BSON,
UUID,
- NONE
+ NONE // Not a real logical type; should always be last element
};
};
@@ -199,12 +202,18 @@
static std::shared_ptr<const LogicalType> Interval();
static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
+
+ /// \brief Create a logical type for data that's always null
+ ///
+ /// Any physical type can be annotated with this logical type.
static std::shared_ptr<const LogicalType> Null();
+
static std::shared_ptr<const LogicalType> JSON();
static std::shared_ptr<const LogicalType> BSON();
static std::shared_ptr<const LogicalType> UUID();
+
+ /// \brief Create a placeholder for when no logical type is specified
static std::shared_ptr<const LogicalType> None();
- static std::shared_ptr<const LogicalType> Unknown();
/// \brief Return true if this logical type is consistent with the given underlying
/// physical type.
@@ -434,13 +443,13 @@
NoLogicalType() = default;
};
-/// \brief Allowed for any type.
-class PARQUET_EXPORT UnknownLogicalType : public LogicalType {
+// Internal API, for unrecognized logical types
+class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();
private:
- UnknownLogicalType() = default;
+ UndefinedLogicalType() = default;
};
// Data encodings. Mirrors parquet::Encoding
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index e97f324..8fa1c85 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -54,7 +54,7 @@
ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"
enum ParquetLogicalTypeId" parquet::LogicalType::Type::type":
- ParquetLogicalType_UNKNOWN" parquet::LogicalType::Type::UNKNOWN"
+ ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED"
ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING"
ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP"
ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST"
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index ce16acc..67c1c5a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -809,7 +809,7 @@
cdef logical_type_name_from_enum(ParquetLogicalTypeId type_):
return {
- ParquetLogicalType_UNKNOWN: 'UNKNOWN',
+ ParquetLogicalType_UNDEFINED: 'UNDEFINED',
ParquetLogicalType_STRING: 'STRING',
ParquetLogicalType_MAP: 'MAP',
ParquetLogicalType_LIST: 'LIST',