PARQUET-1357: FormatStatValue truncates binary statistics on zero character
Author: Korn, Uwe <Uwe.Korn@blue-yonder.com>
Closes #479 from xhochy/PARQUET-1357 and squashes the following commits:
4135976 [Korn, Uwe] Add macro for deprecations
e0e4946 [Korn, Uwe] Update to clang-format 6.0
0073b56 [Korn, Uwe] PARQUET-1357: FormatStatValue truncates binary statistics on zero character
diff --git a/cmake_modules/FindClangTools.cmake b/cmake_modules/FindClangTools.cmake
index e9221ff..215a5cd 100644
--- a/cmake_modules/FindClangTools.cmake
+++ b/cmake_modules/FindClangTools.cmake
@@ -30,6 +30,12 @@
# CLANG_FORMAT_BIN, The path to the clang format binary
# CLANG_TIDY_FOUND, Whether clang format was found
+if (DEFINED ENV{HOMEBREW_PREFIX})
+ set(HOMEBREW_PREFIX "${ENV{HOMEBREW_PREFIX}")
+else()
+ set(HOMEBREW_PREFIX "/usr/local")
+endif()
+
find_program(CLANG_TIDY_BIN
NAMES clang-tidy-4.0
clang-tidy-3.9
@@ -37,7 +43,7 @@
clang-tidy-3.7
clang-tidy-3.6
clang-tidy
- PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+ PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
NO_DEFAULT_PATH
)
@@ -55,7 +61,7 @@
PATHS
${ClangTools_PATH}
$ENV{CLANG_TOOLS_PATH}
- /usr/local/bin /usr/bin
+ /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
NO_DEFAULT_PATH
)
@@ -67,16 +73,26 @@
if ("${CLANG_FORMAT_MINOR_VERSION}" STREQUAL "0")
find_program(CLANG_FORMAT_BIN
NAMES clang-format
- PATHS /usr/local/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin
+ PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_MAJOR_VERSION}/bin"
NO_DEFAULT_PATH
)
else()
find_program(CLANG_FORMAT_BIN
NAMES clang-format
- PATHS /usr/local/opt/llvm@${CLANG_FORMAT_VERSION}/bin
+ PATHS "${HOMEBREW_PREFIX}/opt/llvm@${CLANG_FORMAT_VERSION}/bin"
NO_DEFAULT_PATH
)
endif()
+
+ if ("${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND")
+ # binary was still not found, look into Cellar
+ file(GLOB CLANG_FORMAT_PATH "${HOMEBREW_PREFIX}/Cellar/llvm/${CLANG_FORMAT_VERSION}.*")
+ find_program(CLANG_FORMAT_BIN
+ NAMES clang-format
+ PATHS "${CLANG_FORMAT_PATH}/bin"
+ NO_DEFAULT_PATH
+ )
+ endif()
endif()
else()
find_program(CLANG_FORMAT_BIN
@@ -86,7 +102,7 @@
clang-format-3.7
clang-format-3.6
clang-format
- PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
+ PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin "${HOMEBREW_PREFIX}/bin"
NO_DEFAULT_PATH
)
endif()
diff --git a/src/parquet/printer.cc b/src/parquet/printer.cc
index 88b5528..3f18a5c 100644
--- a/src/parquet/printer.cc
+++ b/src/parquet/printer.cc
@@ -84,8 +84,8 @@
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << ", Null Values: " << stats->null_count()
<< ", Distinct Values: " << stats->distinct_count() << std::endl
- << " Max: " << FormatStatValue(descr->physical_type(), max.c_str())
- << ", Min: " << FormatStatValue(descr->physical_type(), min.c_str());
+ << " Max: " << FormatStatValue(descr->physical_type(), max)
+ << ", Min: " << FormatStatValue(descr->physical_type(), min);
} else {
stream << " Statistics Not Set";
}
@@ -207,9 +207,8 @@
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
<< "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
- << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max.c_str())
- << "\", "
- << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min.c_str())
+ << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
+ << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
<< "\" },";
} else {
stream << "\"False\",";
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
index 4e75982..6b184e3 100644
--- a/src/parquet/types-test.cc
+++ b/src/parquet/types-test.cc
@@ -62,54 +62,81 @@
}
TEST(TypePrinter, StatisticsTypes) {
+#if !(defined(_WIN32) || defined(__CYGWIN__))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
std::string smin;
std::string smax;
int32_t int_min = 1024;
int32_t int_max = 2048;
smin = std::string(reinterpret_cast<char*>(&int_min), sizeof(int32_t));
smax = std::string(reinterpret_cast<char*>(&int_max), sizeof(int32_t));
+ ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin).c_str());
ASSERT_STREQ("1024", FormatStatValue(Type::INT32, smin.c_str()).c_str());
+ ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax).c_str());
ASSERT_STREQ("2048", FormatStatValue(Type::INT32, smax.c_str()).c_str());
int64_t int64_min = 10240000000000;
int64_t int64_max = 20480000000000;
smin = std::string(reinterpret_cast<char*>(&int64_min), sizeof(int64_t));
smax = std::string(reinterpret_cast<char*>(&int64_max), sizeof(int64_t));
+ ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin).c_str());
ASSERT_STREQ("10240000000000", FormatStatValue(Type::INT64, smin.c_str()).c_str());
+ ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax).c_str());
ASSERT_STREQ("20480000000000", FormatStatValue(Type::INT64, smax.c_str()).c_str());
float float_min = 1.024f;
float float_max = 2.048f;
smin = std::string(reinterpret_cast<char*>(&float_min), sizeof(float));
smax = std::string(reinterpret_cast<char*>(&float_max), sizeof(float));
+ ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin).c_str());
ASSERT_STREQ("1.024", FormatStatValue(Type::FLOAT, smin.c_str()).c_str());
+ ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax).c_str());
ASSERT_STREQ("2.048", FormatStatValue(Type::FLOAT, smax.c_str()).c_str());
double double_min = 1.0245;
double double_max = 2.0489;
smin = std::string(reinterpret_cast<char*>(&double_min), sizeof(double));
smax = std::string(reinterpret_cast<char*>(&double_max), sizeof(double));
+ ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin).c_str());
ASSERT_STREQ("1.0245", FormatStatValue(Type::DOUBLE, smin.c_str()).c_str());
+ ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax).c_str());
ASSERT_STREQ("2.0489", FormatStatValue(Type::DOUBLE, smax.c_str()).c_str());
Int96 Int96_min = {{1024, 2048, 4096}};
Int96 Int96_max = {{2048, 4096, 8192}};
smin = std::string(reinterpret_cast<char*>(&Int96_min), sizeof(Int96));
smax = std::string(reinterpret_cast<char*>(&Int96_max), sizeof(Int96));
+ ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin).c_str());
ASSERT_STREQ("1024 2048 4096", FormatStatValue(Type::INT96, smin.c_str()).c_str());
+ ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax).c_str());
ASSERT_STREQ("2048 4096 8192", FormatStatValue(Type::INT96, smax.c_str()).c_str());
smin = std::string("abcdef");
smax = std::string("ijklmnop");
+ ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin).c_str());
ASSERT_STREQ("abcdef", FormatStatValue(Type::BYTE_ARRAY, smin.c_str()).c_str());
+ ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax).c_str());
ASSERT_STREQ("ijklmnop", FormatStatValue(Type::BYTE_ARRAY, smax.c_str()).c_str());
+ // PARQUET-1357: FormatStatValue truncates binary statistics on zero character
+ smax.push_back('\0');
+ ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax));
+ // This fails, thus the call to FormatStatValue(.., const char*) was deprecated.
+ // ASSERT_EQ(smax, FormatStatValue(Type::BYTE_ARRAY, smax.c_str()));
+
smin = std::string("abcdefgh");
smax = std::string("ijklmnop");
+ ASSERT_STREQ("abcdefgh", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin).c_str());
ASSERT_STREQ("abcdefgh",
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smin.c_str()).c_str());
+ ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax).c_str());
ASSERT_STREQ("ijklmnop",
FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax.c_str()).c_str());
+#if !(defined(_WIN32) || defined(__CYGWIN__))
+#pragma GCC diagnostic pop
+#endif
}
} // namespace parquet
diff --git a/src/parquet/types.cc b/src/parquet/types.cc
index 79bc5d1..3120963 100644
--- a/src/parquet/types.cc
+++ b/src/parquet/types.cc
@@ -24,6 +24,41 @@
namespace parquet {
+std::string FormatStatValue(Type::type parquet_type, const std::string& val) {
+ std::stringstream result;
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ result << reinterpret_cast<const bool*>(val.c_str())[0];
+ break;
+ case Type::INT32:
+ result << reinterpret_cast<const int32_t*>(val.c_str())[0];
+ break;
+ case Type::INT64:
+ result << reinterpret_cast<const int64_t*>(val.c_str())[0];
+ break;
+ case Type::DOUBLE:
+ result << reinterpret_cast<const double*>(val.c_str())[0];
+ break;
+ case Type::FLOAT:
+ result << reinterpret_cast<const float*>(val.c_str())[0];
+ break;
+ case Type::INT96: {
+ auto const i32_val = reinterpret_cast<const int32_t*>(val.c_str());
+ result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+ break;
+ }
+ case Type::BYTE_ARRAY: {
+ return val;
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY: {
+ return val;
+ }
+ default:
+ break;
+ }
+ return result.str();
+}
+
std::string FormatStatValue(Type::type parquet_type, const char* val) {
std::stringstream result;
switch (parquet_type) {
diff --git a/src/parquet/types.h b/src/parquet/types.h
index 04cfc4b..0f4cfc2 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -27,6 +27,7 @@
#include "arrow/util/macros.h"
+#include "parquet/util/macros.h"
#include "parquet/util/visibility.h"
namespace parquet {
@@ -292,6 +293,11 @@
PARQUET_EXPORT std::string TypeToString(Type::type t);
+PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
+ const std::string& val);
+
+/// \deprecated Since 1.5.0
+PARQUET_DEPRECATED("Use std::string instead of char* as input")
PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val);
PARQUET_EXPORT int GetTypeByteSize(Type::type t);
diff --git a/src/parquet/util/macros.h b/src/parquet/util/macros.h
index 0d172b1..c28b2fa 100644
--- a/src/parquet/util/macros.h
+++ b/src/parquet/util/macros.h
@@ -68,4 +68,19 @@
#define FRIEND_TEST(test_case_name, test_name) \
friend class test_case_name##_##test_name##_Test
+// clang-format off
+// [[deprecated]] is only available in C++14, use this for the time being
+// This macro takes an optional deprecation message
+#if __cplusplus <= 201103L
+# ifdef __GNUC__
+# define PARQUET_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__)))
+# elif defined(_MSC_VER)
+# define PARQUET_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__))
+# else
+# define PARQUET_DEPRECATED(...)
+# endif
+#else
+# define PARQUET_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
+#endif
+
#endif // PARQUET_UTIL_MACROS_H