PARQUET-1990: [C++] Refuse to write ConvertedType::NA

ConvertedType::NA corresponds to an invalid converted type that was once added to the Parquet spec:
https://github.com/apache/parquet-format/pull/45
but then quickly removed in favour of the Null logical type:
https://github.com/apache/parquet-format/pull/51

Unfortunately, Parquet C++ could still in some cases emit the unofficial converted type.

Also remove the confusingly-named LogicalType::Unknown, while "UNKNOWN" in the Thrift specification points to LogicalType::Null.

Closes #9863 from pitrou/PARQUET-1990-invalid-converted-type

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 1b921d5..dfd4bd8 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -87,11 +87,15 @@
     const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
     stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
            << TypeToString(descr->physical_type());
-    if (descr->converted_type() != ConvertedType::NONE) {
-      stream << "/" << ConvertedTypeToString(descr->converted_type());
+    const auto& logical_type = descr->logical_type();
+    if (!logical_type->is_none()) {
+      stream << " / " << logical_type->ToString();
     }
-    if (descr->converted_type() == ConvertedType::DECIMAL) {
-      stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+    if (descr->converted_type() != ConvertedType::NONE) {
+      stream << " / " << ConvertedTypeToString(descr->converted_type());
+      if (descr->converted_type() == ConvertedType::DECIMAL) {
+        stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+      }
     }
     stream << ")" << std::endl;
   }
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index 0171034..7c2f9d7 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -317,7 +317,7 @@
 Number of Real Columns: 2
 Number of Columns: 2
 Number of Selected Columns: 2
-Column 0: a.list.element.list.element.list.element (BYTE_ARRAY/UTF8)
+Column 0: a.list.element.list.element.list.element (BYTE_ARRAY / String / UTF8)
 Column 1: b (INT32)
 --- Row Group: 0 ---
 --- Total Bytes: 155 ---
diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc
index f2a99c8..bfb295f 100644
--- a/cpp/src/parquet/schema.cc
+++ b/cpp/src/parquet/schema.cc
@@ -448,7 +448,7 @@
                           LogicalType::FromThrift(element->logicalType),
                           LoadEnumSafe(&element->type), element->type_length, field_id));
   } else if (element->__isset.converted_type) {
-    // legacy writer with logical type present
+    // legacy writer with converted type present
     primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
         element->name, LoadEnumSafe(&element->repetition_type),
         LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
@@ -500,7 +500,16 @@
   element->__set_name(name_);
   element->__set_repetition_type(ToThrift(repetition_));
   if (converted_type_ != ConvertedType::NONE) {
-    element->__set_converted_type(ToThrift(converted_type_));
+    if (converted_type_ != ConvertedType::NA) {
+      element->__set_converted_type(ToThrift(converted_type_));
+    } else {
+      // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
+      // Never emit it (see PARQUET-1990 for discussion).
+      if (!logical_type_ || !logical_type_->is_null()) {
+        throw ParquetException(
+            "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
+      }
+    }
   }
   if (field_id_ >= 0) {
     element->__set_field_id(field_id_);
diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc
index c095e7a..43760d3 100644
--- a/cpp/src/parquet/schema_test.cc
+++ b/cpp/src/parquet/schema_test.cc
@@ -1101,12 +1101,12 @@
   ASSERT_TRUE(reconstructed->is_valid());
   ASSERT_TRUE(reconstructed->Equals(*original));
 
-  // Unknown
-  original = LogicalType::Unknown();
+  // Undefined
+  original = UndefinedLogicalType::Make();
   ASSERT_TRUE(original->is_invalid());
   ASSERT_FALSE(original->is_valid());
   converted_type = original->ToConvertedType(&converted_decimal_metadata);
-  ASSERT_EQ(converted_type, ConvertedType::NA);
+  ASSERT_EQ(converted_type, ConvertedType::UNDEFINED);
   ASSERT_FALSE(converted_decimal_metadata.isset);
   ASSERT_TRUE(original->is_compatible(converted_type, converted_decimal_metadata));
   ASSERT_TRUE(original->is_compatible(converted_type));
@@ -1243,7 +1243,6 @@
       {BSONLogicalType::Make(), false, true, true},
       {UUIDLogicalType::Make(), false, true, true},
       {NoLogicalType::Make(), false, false, true},
-      {UnknownLogicalType::Make(), false, false, false},
   };
 
   for (const ExpectedProperties& c : cases) {
@@ -1339,7 +1338,7 @@
   }
 
   std::vector<std::shared_ptr<const LogicalType>> any_type_cases = {
-      LogicalType::Null(), LogicalType::None(), LogicalType::Unknown()};
+      LogicalType::Null(), LogicalType::None(), UndefinedLogicalType::Make()};
 
   for (auto c : any_type_cases) {
     ConfirmAnyPrimitiveTypeApplicability(c);
@@ -1453,7 +1452,7 @@
   };
 
   std::vector<ExpectedRepresentation> cases = {
-      {LogicalType::Unknown(), "Unknown", R"({"Type": "Unknown"})"},
+      {UndefinedLogicalType::Make(), "Undefined", R"({"Type": "Undefined"})"},
       {LogicalType::String(), "String", R"({"Type": "String"})"},
       {LogicalType::Map(), "Map", R"({"Type": "Map"})"},
       {LogicalType::List(), "List", R"({"Type": "List"})"},
@@ -1550,7 +1549,6 @@
   };
 
   std::vector<ExpectedSortOrder> cases = {
-      {LogicalType::Unknown(), SortOrder::UNKNOWN},
       {LogicalType::String(), SortOrder::UNSIGNED},
       {LogicalType::Map(), SortOrder::UNKNOWN},
       {LogicalType::List(), SortOrder::UNKNOWN},
@@ -1888,8 +1886,6 @@
       {"uuid", LogicalType::UUID(), Type::FIXED_LEN_BYTE_ARRAY, 16, false,
        ConvertedType::NA, true, [this]() { return element_->logicalType.__isset.UUID; }},
       {"none", LogicalType::None(), Type::INT64, -1, false, ConvertedType::NA, false,
-       check_nothing},
-      {"unknown", LogicalType::Unknown(), Type::INT64, -1, true, ConvertedType::NA, false,
        check_nothing}};
 
   for (const SchemaElementConstructionArguments& c : cases) {
diff --git a/cpp/src/parquet/thrift_internal.h b/cpp/src/parquet/thrift_internal.h
index ba3fefd..c9e0269 100644
--- a/cpp/src/parquet/thrift_internal.h
+++ b/cpp/src/parquet/thrift_internal.h
@@ -256,6 +256,9 @@
 static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
   // item 0 is NONE
   DCHECK_NE(type, ConvertedType::NONE);
+  // it is forbidden to emit "NA" (PARQUET-1990)
+  DCHECK_NE(type, ConvertedType::NA);
+  DCHECK_NE(type, ConvertedType::UNDEFINED);
   return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
 }
 
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index 3d3e7c7..4e5bcee 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -366,13 +366,14 @@
       return JSONLogicalType::Make();
     case ConvertedType::BSON:
       return BSONLogicalType::Make();
+    case ConvertedType::NA:
+      return NullLogicalType::Make();
     case ConvertedType::NONE:
       return NoLogicalType::Make();
-    case ConvertedType::NA:
     case ConvertedType::UNDEFINED:
-      return UnknownLogicalType::Make();
+      return UndefinedLogicalType::Make();
   }
-  return UnknownLogicalType::Make();
+  return UndefinedLogicalType::Make();
 }
 
 std::shared_ptr<const LogicalType> LogicalType::FromThrift(
@@ -483,10 +484,6 @@
 
 std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
 
-std::shared_ptr<const LogicalType> LogicalType::Unknown() {
-  return UnknownLogicalType::Make();
-}
-
 /*
  * The logical type implementation classes are built in four layers: (1) the base
  * layer, which establishes the interface and provides generally reusable implementations
@@ -516,7 +513,7 @@
   virtual std::string ToString() const = 0;
 
   virtual bool is_serialized() const {
-    return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNKNOWN);
+    return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
   }
 
   virtual std::string ToJSON() const {
@@ -567,14 +564,14 @@
   class BSON;
   class UUID;
   class No;
-  class Unknown;
+  class Undefined;
 
  protected:
   Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
   Impl() = default;
 
  private:
-  LogicalType::Type::type type_ = LogicalType::Type::UNKNOWN;
+  LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
   SortOrder::type order_ = SortOrder::UNKNOWN;
 };
 
@@ -636,7 +633,9 @@
 bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
 bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
 bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
-bool LogicalType::is_valid() const { return impl_->type() != LogicalType::Type::UNKNOWN; }
+bool LogicalType::is_valid() const {
+  return impl_->type() != LogicalType::Type::UNDEFINED;
+}
 bool LogicalType::is_invalid() const { return !is_valid(); }
 bool LogicalType::is_nested() const {
   return (impl_->type() == LogicalType::Type::LIST) ||
@@ -1555,19 +1554,19 @@
 
 GENERATE_MAKE(No)
 
-class LogicalType::Impl::Unknown final : public LogicalType::Impl::SimpleCompatible,
-                                         public LogicalType::Impl::UniversalApplicable {
+class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
+                                           public LogicalType::Impl::UniversalApplicable {
  public:
-  friend class UnknownLogicalType;
+  friend class UndefinedLogicalType;
 
-  OVERRIDE_TOSTRING(Unknown)
+  OVERRIDE_TOSTRING(Undefined)
 
  private:
-  Unknown()
-      : LogicalType::Impl(LogicalType::Type::UNKNOWN, SortOrder::UNKNOWN),
-        LogicalType::Impl::SimpleCompatible(ConvertedType::NA) {}
+  Undefined()
+      : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
+        LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
 };
 
-GENERATE_MAKE(Unknown)
+GENERATE_MAKE(Undefined)
 
 }  // namespace parquet
diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h
index 953da83..f3d3abf 100644
--- a/cpp/src/parquet/types.h
+++ b/cpp/src/parquet/types.h
@@ -71,7 +71,7 @@
 // Mirrors parquet::ConvertedType
 struct ConvertedType {
   enum type {
-    NONE,
+    NONE,  // Not a real converted type, but means no converted type is specified
     UTF8,
     MAP,
     MAP_KEY_VALUE,
@@ -94,9 +94,12 @@
     JSON,
     BSON,
     INTERVAL,
+    // DEPRECATED INVALID ConvertedType for all-null data.
+    // Only useful for reading legacy files written out by interim Parquet C++ releases.
+    // For writing, always emit LogicalType::Null instead.
+    // See PARQUET-1990.
     NA = 25,
-    // Should always be last element.
-    UNDEFINED = 26
+    UNDEFINED = 26  // Not a real converted type; should always be last element
   };
 };
 
@@ -140,7 +143,7 @@
  public:
   struct Type {
     enum type {
-      UNKNOWN = 0,
+      UNDEFINED = 0,  // Not a real logical type
       STRING = 1,
       MAP,
       LIST,
@@ -151,11 +154,11 @@
       TIMESTAMP,
       INTERVAL,
       INT,
-      NIL,  // Thrift NullType
+      NIL,  // Thrift NullType: annotates data that is always null
       JSON,
       BSON,
       UUID,
-      NONE
+      NONE  // Not a real logical type; should always be last element
     };
   };
 
@@ -199,12 +202,18 @@
 
   static std::shared_ptr<const LogicalType> Interval();
   static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
+
+  /// \brief Create a logical type for data that's always null
+  ///
+  /// Any physical type can be annotated with this logical type.
   static std::shared_ptr<const LogicalType> Null();
+
   static std::shared_ptr<const LogicalType> JSON();
   static std::shared_ptr<const LogicalType> BSON();
   static std::shared_ptr<const LogicalType> UUID();
+
+  /// \brief Create a placeholder for when no logical type is specified
   static std::shared_ptr<const LogicalType> None();
-  static std::shared_ptr<const LogicalType> Unknown();
 
   /// \brief Return true if this logical type is consistent with the given underlying
   /// physical type.
@@ -434,13 +443,13 @@
   NoLogicalType() = default;
 };
 
-/// \brief Allowed for any type.
-class PARQUET_EXPORT UnknownLogicalType : public LogicalType {
+// Internal API, for unrecognized logical types
+class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
  public:
   static std::shared_ptr<const LogicalType> Make();
 
  private:
-  UnknownLogicalType() = default;
+  UndefinedLogicalType() = default;
 };
 
 // Data encodings. Mirrors parquet::Encoding
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index e97f324..8fa1c85 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -54,7 +54,7 @@
         ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"
 
     enum ParquetLogicalTypeId" parquet::LogicalType::Type::type":
-        ParquetLogicalType_UNKNOWN" parquet::LogicalType::Type::UNKNOWN"
+        ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED"
         ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING"
         ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP"
         ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST"
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index ce16acc..67c1c5a 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -809,7 +809,7 @@
 
 cdef logical_type_name_from_enum(ParquetLogicalTypeId type_):
     return {
-        ParquetLogicalType_UNKNOWN: 'UNKNOWN',
+        ParquetLogicalType_UNDEFINED: 'UNDEFINED',
         ParquetLogicalType_STRING: 'STRING',
         ParquetLogicalType_MAP: 'MAP',
         ParquetLogicalType_LIST: 'LIST',