PARQUET-1357: FormatStatValue truncates binary statistics on zero character

Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>

Closes #479 from xhochy/PARQUET-1357 and squashes the following commits:

4135976 [Korn, Uwe] Add macro for deprecations
e0e4946 [Korn, Uwe] Update to clang-format 6.0
0073b56 [Korn, Uwe] PARQUET-1357: FormatStatValue truncates binary statistics on zero character
diff --git a/cmake_modules/FindClangTools.cmake b/cmake_modules/FindClangTools.cmake
index e9221ff..215a5cd 100644
--- a/cmake_modules/FindClangTools.cmake
+++ b/cmake_modules/FindClangTools.cmake
@@ -30,6 +30,12 @@
 #  CLANG_FORMAT_BIN, The path to the clang format binary
 #  CLANG_TIDY_FOUND, Whether clang format was found
 
+if (DEFINED ENV{HOMEBREW_PREFIX})
+  set(HOMEBREW_PREFIX "${ENV{HOMEBREW_PREFIX}")
+else()
+  set(HOMEBREW_PREFIX "/usr/local")
+endif()
+
 find_program(CLANG_TIDY_BIN
   NAMES clang-tidy-4.0
   clang-tidy-3.9
@@ -37,7 +43,7 @@
   clang-tidy-3.7
   clang-tidy-3.6
   clang-tidy
-  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
         NO_DEFAULT_PATH
 )
 
@@ -55,7 +61,7 @@
       PATHS
             ${ClangTools_PATH}
             $ENV{CLANG_TOOLS_PATH}
-            /usr/local/bin /usr/bin
+            /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
             NO_DEFAULT_PATH
     )
 
@@ -67,16 +73,26 @@
         if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0")
             find_program(CLANG_FORMAT_BIN
               NAMES clang-format
-              PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin
+              PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin"
                     NO_DEFAULT_PATH
             )
         else()
             find_program(CLANG_FORMAT_BIN
               NAMES clang-format
-              PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin
+              PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_VERSION}/bin"
                     NO_DEFAULT_PATH
             )
         endif()
+
+        if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")
+          # binary was still not found, look into Cellar
+          file(GLOB CLANG_FORMAT_PATH "${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.*")
+          find_program(CLANG_FORMAT_BIN
+            NAMES clang-format
+            PATHS "${CLANG_FORMAT_PATH}/bin"
+                  NO_DEFAULT_PATH
+          )
+        endif()
     endif()
 else()
     find_program(CLANG_FORMAT_BIN
@@ -86,7 +102,7 @@
       clang-format-3.7
       clang-format-3.6
       clang-format
-      PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+      PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
             NO_DEFAULT_PATH
     )
 endif()
diff --git a/src/parquet/printer.cc b/src/parquet/printer.cc
index 88b5528..3f18a5c 100644
--- a/src/parquet/printer.cc
+++ b/src/parquet/printer.cc
@@ -84,8 +84,8 @@
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
         stream << ", Null Values: " << stats->null_count()
                << ", Distinct Values: " << stats->distinct_count() << std::endl
-               << "  Max: " << FormatStatValue(descr->physical_type(), max.c_str())
-               << ", Min: " << FormatStatValue(descr->physical_type(), min.c_str());
+               << "  Max: " << FormatStatValue(descr->physical_type(), max)
+               << ", Min: " << FormatStatValue(descr->physical_type(), min);
       } else {
         stream << "  Statistics Not Set";
       }
@@ -207,9 +207,8 @@
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
         stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
                << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
-               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str())
-               << "\", "
-               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str())
+               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
+               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
                << "\" },";
       } else {
         stream << "\"False\",";
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
index 4e75982..6b184e3 100644
--- a/src/parquet/types-test.cc
+++ b/src/parquet/types-test.cc
@@ -62,54 +62,81 @@
 }
 
 TEST(TypePrinter, StatisticsTypes) {
+#if !(defined(_WIN32) || defined(__CYGWIN__))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
   std::string smin;
   std::string smax;
   int32_t int_min = 1024;
   int32_t int_max = 2048;
   smin = std::string(reinterpret_cast<char*>(&int_min), sizeof(int32_t));
   smax = std::string(reinterpret_cast<char*>(&int_max), sizeof(int32_t));
+  ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str());
   ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin.c_str()).c_str());
+  ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str());
   ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax.c_str()).c_str());
 
   int64_t int64_min = 10240000000000;
   int64_t int64_max = 20480000000000;
   smin = std::string(reinterpret_cast<char*>(&int64_min), sizeof(int64_t));
   smax = std::string(reinterpret_cast<char*>(&int64_max), sizeof(int64_t));
+  ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str());
   ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin.c_str()).c_str());
+  ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str());
   ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax.c_str()).c_str());
 
   float float_min = 1.024f;
   float float_max = 2.048f;
   smin = std::string(reinterpret_cast<char*>(&float_min), sizeof(float));
   smax = std::string(reinterpret_cast<char*>(&float_max), sizeof(float));
+  ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin).c_str());
   ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin.c_str()).c_str());
+  ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax).c_str());
   ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax.c_str()).c_str());
 
   double double_min = 1.0245;
   double double_max = 2.0489;
   smin = std::string(reinterpret_cast<char*>(&double_min), sizeof(double));
   smax = std::string(reinterpret_cast<char*>(&double_max), sizeof(double));
+  ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str());
   ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin.c_str()).c_str());
+  ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str());
   ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax.c_str()).c_str());
 
   Int96 Int96_min = {{1024, 2048, 4096}};
   Int96 Int96_max = {{2048, 4096, 8192}};
   smin = std::string(reinterpret_cast<char*>(&Int96_min), sizeof(Int96));
   smax = std::string(reinterpret_cast<char*>(&Int96_max), sizeof(Int96));
+  ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str());
   ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin.c_str()).c_str());
+  ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax).c_str());
   ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax.c_str()).c_str());
 
   smin = std::string("abcdef");
   smax = std::string("ijklmnop");
+  ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin).c_str());
   ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin.c_str()).c_str());
+  ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax).c_str());
   ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax.c_str()).c_str());
 
+  // PARQUET-1357: FormatStatValue truncates binary statistics on zero character
+  smax.push_back('\0');
+  ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax));
+  // This fails, thus the call to FormatStatValue(.., const char*) was deprecated.
+  // ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax.c_str()));
+
   smin = std::string("abcdefgh");
   smax = std::string("ijklmnop");
+  ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin).c_str());
   ASSERT_STREQ("abcdefgh",
                FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin.c_str()).c_str());
+  ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax).c_str());
   ASSERT_STREQ("ijklmnop",
                FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax.c_str()).c_str());
+#if !(defined(_WIN32) || defined(__CYGWIN__))
+#pragma GCC diagnostic pop
+#endif
 }
 
 }  // namespace parquet
diff --git a/src/parquet/types.cc b/src/parquet/types.cc
index 79bc5d1..3120963 100644
--- a/src/parquet/types.cc
+++ b/src/parquet/types.cc
@@ -24,6 +24,41 @@
 
 namespace parquet {
 
+std::string FormatStatValue(Type::type parquet_type, const std::string& val) {
+  std::stringstream result;
+  switch (parquet_type) {
+    case Type::BOOLEAN:
+      result << reinterpret_cast<const bool*>(val.c_str())[0];
+      break;
+    case Type::INT32:
+      result << reinterpret_cast<const int32_t*>(val.c_str())[0];
+      break;
+    case Type::INT64:
+      result << reinterpret_cast<const int64_t*>(val.c_str())[0];
+      break;
+    case Type::DOUBLE:
+      result << reinterpret_cast<const double*>(val.c_str())[0];
+      break;
+    case Type::FLOAT:
+      result << reinterpret_cast<const float*>(val.c_str())[0];
+      break;
+    case Type::INT96: {
+      auto const i32_val = reinterpret_cast<const int32_t*>(val.c_str());
+      result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+      break;
+    }
+    case Type::BYTE_ARRAY: {
+      return val;
+    }
+    case Type::FIXED_LEN_BYTE_ARRAY: {
+      return val;
+    }
+    default:
+      break;
+  }
+  return result.str();
+}
+
 std::string FormatStatValue(Type::type parquet_type, const char* val) {
   std::stringstream result;
   switch (parquet_type) {
diff --git a/src/parquet/types.h b/src/parquet/types.h
index 04cfc4b..0f4cfc2 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -27,6 +27,7 @@
 
 #include "arrow/util/macros.h"
 
+#include "parquet/util/macros.h"
 #include "parquet/util/visibility.h"
 
 namespace parquet {
@@ -292,6 +293,11 @@
 
 PARQUET_EXPORT std::string TypeToString(Type::type t);
 
+PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
+                                           const std::string& val);
+
+/// \deprecated Since 1.5.0
+PARQUET_DEPRECATED("Use std::string instead of char* as input")
 PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);
 
 PARQUET_EXPORT int GetTypeByteSize(Type::type t);
diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h
index 0d172b1..c28b2fa 100644
--- a/src/parquet/util/macros.h
+++ b/src/parquet/util/macros.h
@@ -68,4 +68,19 @@
 #define FRIEND_TEST(test_case_name, test_name) \
   friend class test_case_name##_##test_name##_Test
 
+// clang-format off
+// [[deprecated]] is only available in C++14, use this for the time being
+// This macro takes an optional deprecation message
+#if __cplusplus <= 201103L
+# ifdef __GNUC__
+#  define PARQUET_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__)))
+# elif defined(_MSC_VER)
+#  define PARQUET_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__))
+# else
+#  define PARQUET_DEPRECATED(...)
+# endif
+#else
+#  define PARQUET_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
+#endif
+
 #endif  // PARQUET_UTIL_MACROS_H