ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions
We were already testing the chunked behavior, but we did not exercise the off-by-one error edge case.
Author: Wes McKinney <wes.mckinney@twosigma.com>
Closes #1740 from wesm/ARROW-2227 and squashes the following commits:
fd1ab178 <Wes McKinney> Move constexpr to builder.h, add constexpr for max # of list elements
ca5ecc0f <Wes McKinney> Fix off-by-one error in chunked binary conversions
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index ef4e7fd..aa9f3ce 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1236,7 +1236,7 @@
Status ListBuilder::AppendNextOffset() {
int64_t num_values = value_builder_->length();
- if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) {
+ if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) {
std::stringstream ss;
ss << "ListArray cannot contain more then INT32_MAX - 1 child elements,"
<< " have " << num_values;
@@ -1252,14 +1252,14 @@
}
Status ListBuilder::Init(int64_t elements) {
- DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+ DCHECK_LE(elements, kListMaximumElements);
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
}
Status ListBuilder::Resize(int64_t capacity) {
- DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+ DCHECK_LE(capacity, kListMaximumElements);
// one more then requested for offsets
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
return ArrayBuilder::Resize(capacity);
@@ -1303,14 +1303,14 @@
BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {}
Status BinaryBuilder::Init(int64_t elements) {
- DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+ DCHECK_LE(elements, kListMaximumElements);
RETURN_NOT_OK(ArrayBuilder::Init(elements));
// one more then requested for offsets
return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
}
Status BinaryBuilder::Resize(int64_t capacity) {
- DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+ DCHECK_LE(capacity, kListMaximumElements);
// one more then requested for offsets
RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
return ArrayBuilder::Resize(capacity);
@@ -1318,7 +1318,7 @@
Status BinaryBuilder::ReserveData(int64_t elements) {
if (value_data_length() + elements > value_data_capacity()) {
- if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
+ if (value_data_length() + elements > kBinaryMemoryLimit) {
return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary");
}
RETURN_NOT_OK(value_data_builder_.Reserve(elements));
@@ -1328,9 +1328,9 @@
Status BinaryBuilder::AppendNextOffset() {
const int64_t num_bytes = value_data_builder_.length();
- if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
+ if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
std::stringstream ss;
- ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " bytes, have "
+ ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have "
<< num_bytes;
return Status::Invalid(ss.str());
}
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index dabfb75..cdcee80 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -41,13 +41,16 @@
class Array;
class Decimal128;
+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
+constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
+
namespace internal {
struct ArrayData;
} // namespace internal
-static constexpr int64_t kMinBuilderCapacity = 1 << 5;
+constexpr int64_t kMinBuilderCapacity = 1 << 5;
/// Base class for all data array builders.
//
@@ -702,8 +705,6 @@
TypedBufferBuilder<int32_t> offsets_builder_;
TypedBufferBuilder<uint8_t> value_data_builder_;
- static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1;
-
Status AppendNextOffset();
void Reset();
};
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 4d91e53..71bf69f 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -60,8 +60,6 @@
using internal::NumPyTypeSize;
-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
-
// ----------------------------------------------------------------------
// Conversion utilities
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index dff4e10..d448de0 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1048,9 +1048,12 @@
@pytest.mark.large_memory
def test_bytes_exceed_2gb(self):
- val = 'x' * (1 << 20)
+ v1 = b'x' * 100000000
+ v2 = b'x' * 147483646
+
+ # ARROW-2227, hit exactly 2GB on the nose
df = pd.DataFrame({
- 'strings': np.array([val] * 4000, dtype=object)
+ 'strings': [v1] * 20 + [v2] + ['x'] * 20
})
arr = pa.array(df['strings'])
assert isinstance(arr, pa.ChunkedArray)