# blob: 8a077a22e5055a8eb927893f9eec8d2b060fc13b [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from collections import namedtuple
from collections.abc import Sequence
import datetime
import decimal
import enum
from functools import lru_cache, partial
import itertools
import math
import operator
import struct
import sys
import warnings
import gdb
from gdb.types import get_basic_type
# Refuse to load under a Python 2 interpreter.
assert sys.version_info.major >= 3, "Arrow GDB extension needs Python 3+"
# gdb API docs at https://sourceware.org/gdb/onlinedocs/gdb/Python-API.html#Python-API
# Names of the type ids, in enumeration order: the position of a name in
# this list is its numeric value (numbering starts at 0).
_type_ids = """
    NA BOOL UINT8 INT8 UINT16 INT16 UINT32 INT32
    UINT64 INT64 HALF_FLOAT FLOAT DOUBLE STRING BINARY
    FIXED_SIZE_BINARY DATE32 DATE64 TIMESTAMP TIME32 TIME64
    INTERVAL_MONTHS INTERVAL_DAY_TIME DECIMAL128 DECIMAL256
    LIST STRUCT SPARSE_UNION DENSE_UNION DICTIONARY MAP
    EXTENSION FIXED_SIZE_LIST DURATION LARGE_STRING
    LARGE_BINARY LARGE_LIST INTERVAL_MONTH_DAY_NANO
""".split()

# Mirror the C++ Type::type enum
Type = enum.IntEnum('Type', _type_ids, start=0)

# Mirror the C++ TimeUnit::type enum
TimeUnit = enum.IntEnum(
    'TimeUnit',
    ['SECOND', 'MILLI', 'MICRO', 'NANO'],
    start=0,
)
# Map a primitive type id to the `struct` format code(s) describing one
# value of that type. Multi-field values use several codes.
type_id_to_struct_code = {
    # Signed integers
    Type.INT8: 'b',
    Type.INT16: 'h',
    Type.INT32: 'i',
    Type.INT64: 'q',
    # Unsigned integers
    Type.UINT8: 'B',
    Type.UINT16: 'H',
    Type.UINT32: 'I',
    Type.UINT64: 'Q',
    # Floating point
    Type.HALF_FLOAT: 'e',
    Type.FLOAT: 'f',
    Type.DOUBLE: 'd',
    # Temporal types
    Type.DATE32: 'i',
    Type.DATE64: 'q',
    Type.TIME32: 'i',
    Type.TIME64: 'q',
    Type.INTERVAL_DAY_TIME: 'ii',
    Type.INTERVAL_MONTHS: 'i',
    Type.INTERVAL_MONTH_DAY_NANO: 'iiq',
    Type.DURATION: 'q',
    Type.TIMESTAMP: 'q',
}
# Per-unit formatting traits: how many units make up one second, and how
# many fractional digits are shown after the seconds.
TimeUnitTraits = namedtuple('TimeUnitTraits',
                            ['multiplier', 'fractional_digits'])

time_unit_traits = {
    TimeUnit.SECOND: TimeUnitTraits(multiplier=1, fractional_digits=0),
    TimeUnit.MILLI: TimeUnitTraits(multiplier=10 ** 3, fractional_digits=3),
    TimeUnit.MICRO: TimeUnitTraits(multiplier=10 ** 6, fractional_digits=6),
    TimeUnit.NANO: TimeUnitTraits(multiplier=10 ** 9, fractional_digits=9),
}
def identity(v):
    """
    Return the argument unchanged.
    """
    return v
def has_null_bitmap(type_id):
    """
    Return False for the NA and union type ids, True for any other type id.
    """
    types_without_bitmap = (Type.NA, Type.SPARSE_UNION, Type.DENSE_UNION)
    return not (type_id in types_without_bitmap)
@lru_cache()
def byte_order():
    """
    Get the target program (not the GDB host's) endianness.

    Returns 'big' or 'little', parsed from GDB's "show endian" output.
    If neither word appears in the output, a warning is emitted and the
    host byte order is returned. The result is cached after the first call.
    """
    response = gdb.execute("show endian", to_string=True).strip()
    for order in ('big', 'little'):
        if order in response:
            return order
    warnings.warn('Could not determine target endianness '
                  f'from GDB\'s response:\n"""{response}"""')
    # Fall back to host endianness
    return sys.byteorder
def for_evaluation(val, ty=None):
    """
    Return a parsable form of gdb.Value `val`, optionally with gdb.Type `ty`.

    When `ty` is omitted, the basic type of `val` is used.
    Raises ValueError if `val` is not a pointer and has no address.
    """
    ty = get_basic_type(val.type) if ty is None else ty
    typename = str(ty)  # `ty.name` is sometimes None...
    # ARROW-15652: expressions evaluated by GDB are evaluated in the
    # scope of the C++ namespace of the currently selected frame.
    # When inside a Parquet frame, `arrow::<some type>` would be looked
    # up as `parquet::arrow::<some type>` and fail.
    # Therefore, force the lookup to happen in the global namespace scope.
    needs_global_scope = '::' in typename and not typename.startswith('::')
    if needs_global_scope:
        typename = f"::{typename}"
    if ty.code == gdb.TYPE_CODE_PTR:
        # It's already a pointer, can represent it directly
        return f"(({typename}) ({val}))"
    address = val.address
    if address is None:
        raise ValueError(f"Cannot further evaluate rvalue: {val}")
    return f"(* ({typename}*) ({address}))"
def is_char_star(ty):
    """
    Tell whether gdb.Type `ty` is a pointer to a char-like type.
    """
    ty = get_basic_type(ty)
    if ty.code != gdb.TYPE_CODE_PTR:
        return False
    # Note that "const char*" can have TYPE_CODE_INT as target type...
    target_code = get_basic_type(ty.target()).code
    return target_code in (gdb.TYPE_CODE_CHAR, gdb.TYPE_CODE_INT)
def deref(val):
    """
    Dereference a raw or smart pointer.

    Raw pointers are dereferenced directly. Types whose name starts with
    'std::' and contains "shared" or "unique" are handled through
    SharedPtr and UniquePtr respectively.

    Raises TypeError for any other type.
    """
    ty = get_basic_type(val.type)
    if ty.code == gdb.TYPE_CODE_PTR:
        return val.dereference()
    # `ty.name` is sometimes None; fall back on the string form of the type
    # so that an unsupported type raises TypeError, not AttributeError.
    typename = ty.name if ty.name is not None else str(ty)
    if typename.startswith('std::'):
        if "shared" in typename:
            return SharedPtr(val).value
        if "unique" in typename:
            return UniquePtr(val).value
    raise TypeError(f"Cannot dereference value of type '{typename}'")
# Translation table (code point -> replacement text) for str.translate().
# Characters with a conventional escape sequence come first...
_string_literal_mapping = {
    ord('\\'): r'\\',
    ord('\n'): r'\n',
    ord('\r'): r'\r',
    ord('\t'): r'\t',
    ord('"'): r'\"',
}
# ... and every remaining control character below 32 gets a hex escape.
for c in range(32):
    _string_literal_mapping.setdefault(c, f"\\x{c:02x}")
def string_literal(s):
    """
    Format a Python string or gdb.Value for display as a literal.

    The text is escaped and wrapped in double quotes. Text longer than
    50 characters is cut to 50 and followed by a " [continued]" marker.
    """
    max_len = 50
    if isinstance(s, gdb.Value):
        s = s.string()
    truncated = len(s) > max_len
    escaped = s[:max_len].translate(_string_literal_mapping)
    closing = '" [continued]' if truncated else '"'
    return '"' + escaped + closing
def bytes_literal(val, size=None):
    """
    Format a gdb.Value for display as a literal containing possibly
    unprintable characters.

    If `size` is None, no explicit length is passed to lazy_string(), which
    then uses its own default. Otherwise `size` is passed as the length.
    """
    if size is None:
        # lazy_string() requires an integer length: passing None would
        # raise TypeError, so omit the argument altogether.
        lazy = val.lazy_string()
    else:
        lazy = val.lazy_string(length=size)
    return lazy.value()
def utf8_literal(val, size=None):
    """
    Format a gdb.Value for display as a utf-8 literal.

    With `size` None the string is decoded without an explicit length;
    a `size` of 0 yields an empty literal without reading the value.
    """
    decode_options = {'encoding': 'utf8', 'errors': 'backslashreplace'}
    if size is None:
        text = val.string(**decode_options)
    elif size != 0:
        text = val.string(length=size, **decode_options)
    else:
        text = ""
    return string_literal(text)
def half_float_value(val):
    """
    Return a Python float decoded from the two bytes of inferior memory
    found at the address of gdb.Value `val`, read as a half-float.
    """
    inferior = gdb.selected_inferior()
    raw = inferior.read_memory(val.address, 2)
    (result,) = struct.unpack("e", raw)
    return result
def load_atomic(val):
    """
    Load a std::atomic<T>'s value.
    """
    # XXX This assumes std::atomic<T> has the same layout as a raw T.
    raw_ptr_type = val.type.template_argument(0).pointer()
    return val.address.reinterpret_cast(raw_ptr_type).dereference()
def load_null_count(val):
    """
    Load a null count from a gdb.Value of an integer (either atomic or not).
    """
    is_plain_integer = get_basic_type(val.type).code == gdb.TYPE_CODE_INT
    # Anything that is not a plain integer is treated as a std::atomic<T>
    return val if is_plain_integer else load_atomic(val)
def format_null_count(val):
    """
    Format a null count value.

    `val` is either a Python int or a value that load_null_count() accepts.
    A count of -1 is reported as "unknown null count".
    """
    if isinstance(val, int):
        # Already a Python integer: use it as is.
        null_count = val
    else:
        null_count = int(load_null_count(val))
    if null_count == -1:
        return "unknown null count"
    return f"null count {null_count}"
def short_time_unit(val):
    """
    Return the abbreviation ('s', 'ms', 'us' or 'ns') at index `int(val)`.
    """
    abbreviations = ('s', 'ms', 'us', 'ns')
    return abbreviations[int(val)]
def format_month_interval(val):
    """
    Format a MonthInterval value as "<months>M".
    """
    months = int(val)
    return "{}M".format(months)
def format_days_milliseconds(days, milliseconds):
    """
    Format a (days, milliseconds) pair as "<days>d<milliseconds>ms".
    """
    return "{}d{}ms".format(days, milliseconds)
def format_months_days_nanos(months, days, nanos):
    """
    Format a (months, days, nanos) triple as "<months>M<days>d<nanos>ns".
    """
    return "{}M{}d{}ns".format(months, days, nanos)
# Proleptic Gregorian ordinal of 1970-01-01, the origin of date values.
_date_base = datetime.date(1970, 1, 1).toordinal()


def format_date32(val):
    """
    Format a date32 value.

    The value is a number of days relative to 1970-01-01. The result is
    "<val>d [<ISO date>]", or "<val>d [year <= 0]" / "<val>d [year > 9999]"
    when the date falls outside what datetime.date can represent.
    """
    val = int(val)
    ordinal = val + _date_base
    try:
        decoded = datetime.date.fromordinal(ordinal)
    except (ValueError, OverflowError):
        # ValueError: ordinal outside years 1..9999.
        # OverflowError: ordinal does not even fit in a C int.
        reason = "year <= 0" if ordinal < 1 else "year > 9999"
        return f"{val}d [{reason}]"
    return f"{val}d [{decoded}]"
def format_date64(val):
    """
    Format a date64 value.

    The value is a number of milliseconds relative to 1970-01-01 and is
    expected to be a whole number of days. The result is
    "<val>ms [<ISO date>]", or a bracketed explanation when the value is
    not a multiple of 86400000 or falls outside what datetime.date can
    represent.
    """
    val = int(val)
    days, remainder = divmod(val, 86400 * 1000)
    if remainder:
        return f"{val}ms [non-multiple of 86400000]"
    try:
        # Date arithmetic raises OverflowError when the day count or the
        # resulting date is out of range.
        decoded = datetime.date(1970, 1, 1) + datetime.timedelta(days=days)
    except (ValueError, OverflowError):
        reason = "year <= 0" if days < 0 else "year > 9999"
        return f"{val}ms [{reason}]"
    return f"{val}ms [{decoded}]"
def format_timestamp(val, unit):
    """
    Format a timestamp value.

    `val` is a count of `unit`s (a TimeUnit value) relative to
    1970-01-01 00:00:00. The result is "<val><unit> [<date time>]", where
    the bracketed part reads "too large to represent" if the instant
    cannot be expressed as a datetime.datetime.
    """
    val = int(val)
    unit = int(unit)
    short_unit = short_time_unit(unit)
    traits = time_unit_traits[unit]
    seconds, subseconds = divmod(val, traits.multiplier)
    try:
        # Plain datetime arithmetic does not depend on the platform's time
        # functions, and signals an out-of-range result with OverflowError.
        dt = (datetime.datetime(1970, 1, 1) +
              datetime.timedelta(seconds=seconds))
    except (ValueError, OverflowError, OSError):
        # value out of range for datetime.datetime
        pretty = "too large to represent"
    else:
        pretty = dt.isoformat().replace('T', ' ')
        if traits.fractional_digits > 0:
            pretty += f".{subseconds:0{traits.fractional_digits}d}"
    return f"{val}{short_unit} [{pretty}]"
def cast_to_concrete(val, ty):
    """
    Reinterpret gdb.Value `val` as being of gdb.Type `ty`, going through
    references to both, and return the referenced value.
    """
    as_reference = val.reference_value()
    concrete_reference = as_reference.reinterpret_cast(ty.reference())
    return concrete_reference.referenced_value()
def scalar_class_from_type(name):
    """
    Given a DataTypeClass class name (such as "BooleanType"), return the
    corresponding Scalar class name.
    """
    suffix = "Type"
    assert name.endswith(suffix)
    stem = name[:-len(suffix)]
    return stem + "Scalar"
def array_class_from_type(name):
    """
    Given a DataTypeClass class name (such as "BooleanType"), return the
    corresponding Array class name.
    """
    suffix = "Type"
    assert name.endswith(suffix)
    stem = name[:-len(suffix)]
    return stem + "Array"
class CString:
    """
    A `const char*` or similar value.
    """

    def __init__(self, val):
        self.val = val

    def __bool__(self):
        """
        True if the pointer is non-null and its first character is
        non-zero, i.e. the string is non-empty.
        """
        data = self.data
        # The null check comes first so that a null pointer is never indexed
        return int(data) != 0 and int(data[0]) != 0

    @property
    def data(self):
        """
        The wrapped pointer value.
        """
        return self.val

    def bytes_literal(self):
        """
        The value of a lazy string built from the pointer.
        """
        return self.val.lazy_string().value()

    def string_literal(self):
        """
        The string formatted by the module-level string_literal().
        """
        # XXX use lazy_string() as well?
        return string_literal(self.val)

    def string(self):
        """
        The pointed-to string, as returned by the value's string() method.
        """
        return self.val.string()

    def __format__(self, fmt):
        # The format spec is ignored
        return str(self.bytes_literal())
# NOTE: gdb.parse_and_eval() is *slow* and calling it multiple times
# may add noticeable latencies. For standard C++ classes, we therefore
# try to fetch their properties from libstdc++ internals (which hopefully
# are stable), before falling back on calling the public API methods.
class SharedPtr:
    """
    A `std::shared_ptr<T>` value.
    """

    def __init__(self, val):
        self.val = val
        try:
            # libstdc++ internals
            raw_pointer = val['_M_ptr']
        except gdb.error:
            # fallback for other C++ standard libraries
            expression = f"{for_evaluation(val)}.get()"
            raw_pointer = gdb.parse_and_eval(expression)
        self._ptr = raw_pointer

    def get(self):
        """
        Return the underlying pointer (a T*).
        """
        return self._ptr

    @property
    def value(self):
        """
        The underlying value (a T).
        """
        return self.get().dereference()
class UniquePtr:
    """
    A `std::unique_ptr<T>` value.
    """

    def __init__(self, val):
        self.val = val
        ty = self.val.type.template_argument(0)
        # XXX This assumes that the embedded T* pointer lies at the start
        # of std::unique_ptr<T>.
        slot = self.val.address.reinterpret_cast(ty.pointer().pointer())
        # `slot` is a T** addressing the embedded pointer: load the T*
        # itself so that get() and value return what they document.
        self._ptr = slot.dereference()

    def get(self):
        """
        Return the underlying pointer (a T*).
        """
        return self._ptr

    @property
    def value(self):
        """
        The underlying value (a T).
        """
        return self._ptr.dereference()
class Variant:
    """
    A arrow::util::Variant<...>.
    """

    def __init__(self, val):
        self.val = val
        self.index = int(self.val['index_'])
        # The active alternative's type is the template argument found at
        # position `index`.
        try:
            value_type = self.val.type.template_argument(self.index)
        except RuntimeError:
            # Index out of bounds
            value_type = None
        self.value_type = value_type

    @property
    def value(self):
        """
        The stored value seen as `value_type`, or None if either the type
        or the address of the variant is unavailable.
        """
        if self.value_type is None:
            return None
        address = self.val.address
        if address is None:
            return None
        typed_pointer = address.reinterpret_cast(self.value_type.pointer())
        return typed_pointer.dereference()
class StdString:
    """
    A `std::string` (or possibly `string_view`) value.
    """

    def __init__(self, val):
        self.val = val
        try:
            # libstdc++ internals
            data = val['_M_dataplus']['_M_p']
            size = val['_M_string_length']
        except gdb.error:
            # fallback for other C++ standard libraries
            data = gdb.parse_and_eval(f"{for_evaluation(val)}.c_str()")
            size = gdb.parse_and_eval(f"{for_evaluation(val)}.size()")
        self._data = data
        self._size = size

    def __bool__(self):
        # True when the size is non-zero
        return self._size != 0

    @property
    def data(self):
        """
        The pointer to the character data.
        """
        return self._data

    @property
    def size(self):
        """
        The string size.
        """
        return self._size

    def bytes_literal(self):
        """
        The value of a lazy string of `size` characters starting at `data`.
        """
        lazy = self._data.lazy_string(length=self._size)
        return lazy.value()

    def string_literal(self):
        """
        The string formatted by the module-level string_literal().
        """
        # XXX use lazy_string() as well?
        return string_literal(self._data)

    def string(self):
        """
        The character data, as returned by its string() method.
        """
        return self._data.string()

    def __format__(self, fmt):
        # The format spec is ignored
        return str(self.bytes_literal())
class StdVector(Sequence):
    """
    A `std::vector<T>` value.
    """

    def __init__(self, val):
        self.val = val
        try:
            # libstdc++ internals
            impl = self.val['_M_impl']
            self._data = impl['_M_start']
            self._size = int(impl['_M_finish'] - self._data)
        except gdb.error:
            # fallback for other C++ standard libraries
            # The data pointer must remain a gdb.Value (not a Python int),
            # as it is indexed and re-evaluated by the methods below.
            self._data = gdb.parse_and_eval(
                f"{for_evaluation(self.val)}.data()")
            self._size = int(gdb.parse_and_eval(
                f"{for_evaluation(self.val)}.size()"))

    def _check_index(self, index):
        """
        Raise IndexError unless 0 <= index < size.
        """
        if index < 0 or index >= self._size:
            raise IndexError(
                f"Index {index} out of bounds (should be in [0, {self._size - 1}])")

    def __len__(self):
        return self._size

    def __getitem__(self, index):
        self._check_index(index)
        return self._data[index]

    def eval_at(self, index, eval_format):
        """
        Run `eval_format` with the value at `index`.
        For example, if `eval_format` is "{}.get()", this will evaluate
        "{self[index]}.get()".
        """
        self._check_index(index)
        return gdb.parse_and_eval(
            eval_format.format(for_evaluation(self._data[index])))

    def iter_eval(self, eval_format):
        """
        Yield the result of evaluating `eval_format` on each element in turn.
        """
        data_eval = for_evaluation(self._data)
        for i in range(self._size):
            yield gdb.parse_and_eval(
                eval_format.format(f"{data_eval}[{i}]"))

    @property
    def size(self):
        """
        The number of elements.
        """
        return self._size
class StdPtrVector(StdVector):
    """
    A StdVector whose elements are dereferenced on access.
    """

    def __getitem__(self, index):
        pointer = super().__getitem__(index)
        return deref(pointer)
class FieldVector(StdVector):
    """
    A StdVector whose elements are dereferenced and wrapped in Field.
    """

    def __getitem__(self, index):
        """
        Dereference the Field object at this index.
        """
        pointer = super().__getitem__(index)
        return Field(deref(pointer))

    def __str__(self):
        parts = []
        for i in range(len(self)):
            parts.append(str(self[i]))
        return "{" + ", ".join(parts) + "}"
class Field:
    """
    A arrow::Field value.
    """

    def __init__(self, val):
        self.val = val

    @property
    def name(self):
        """
        The `name_` member, wrapped in a StdString.
        """
        raw_name = self.val['name_']
        return StdString(raw_name)

    @property
    def type(self):
        """
        The value that the `type_` member points to.
        """
        type_pointer = self.val['type_']
        return deref(type_pointer)

    @property
    def nullable(self):
        """
        The `nullable_` member, as a Python bool.
        """
        return bool(self.val['nullable_'])

    def __str__(self):
        return str(self.val)
class FieldPtr(Field):
    """
    A std::shared_ptr<arrow::Field> value.
    """

    def __init__(self, val):
        # Wrap the pointed-to field rather than the pointer itself
        pointee = deref(val)
        super().__init__(pointee)
class Buffer:
    """
    A arrow::Buffer value.
    """

    def __init__(self, val):
        self.val = val
        self.size = int(val['size_'])

    @property
    def data(self):
        """
        The `data_` member.
        """
        return self.val['data_']

    def bytes_literal(self):
        """
        The buffer contents as the value of a lazy string, or '""' if the
        buffer is empty.
        """
        if self.size > 0:
            return self.val['data_'].lazy_string(length=self.size).value()
        else:
            return '""'

    def bytes_view(self, offset=0, length=None):
        """
        Return a view over the bytes of this buffer.

        The view starts `offset` bytes into the buffer and spans `length`
        bytes. If `length` is None or extends past the end of the buffer,
        the view stops at the end of the buffer.
        """
        # Never read past the end of the buffer
        available = self.size - offset
        if length is None or length > available:
            length = available
        if length > 0:
            mem = gdb.selected_inferior().read_memory(
                self.val['data_'] + offset, length)
        else:
            mem = memoryview(b"")
        # Read individual bytes as unsigned integers rather than
        # Python bytes objects
        return mem.cast('B')

    view = bytes_view
class BufferPtr:
    """
    A arrow::Buffer* value (possibly null).
    """

    def __init__(self, val):
        self.val = val
        is_null = int(self.val) == 0
        # `buf` stays None for a null pointer
        self.buf = None if is_null else Buffer(val.dereference())

    @property
    def data(self):
        """
        The buffer's data, or None for a null pointer.
        """
        return None if self.buf is None else self.buf.data

    @property
    def size(self):
        """
        The buffer's size, or None for a null pointer.
        """
        return None if self.buf is None else self.buf.size

    def bytes_literal(self):
        """
        The buffer's bytes literal, or None for a null pointer.
        """
        return None if self.buf is None else self.buf.bytes_literal()
class TypedBuffer(Buffer):
    """
    A buffer containing values of a given a struct format code.
    """
    # Sentinel used in place of a struct format code for boolean buffers
    _boolean_format = object()

    def __init__(self, val, mem_format):
        super().__init__(val)
        self.mem_format = mem_format
        # `byte_width` is only defined for non-boolean buffers
        if not self.is_boolean:
            self.byte_width = struct.calcsize('=' + self.mem_format)

    @classmethod
    def from_type_id(cls, val, type_id):
        """
        Create a typed buffer whose format is derived from `type_id`.
        """
        assert isinstance(type_id, int)
        mem_format = (cls._boolean_format if type_id == Type.BOOL
                      else type_id_to_struct_code[type_id])
        return cls(val, mem_format)

    def view(self, offset=0, length=None):
        """
        Return a view over the primitive values in this buffer.
        The optional `offset` and `length` are expressed in primitive values,
        not bytes.
        """
        if self.is_boolean:
            return Bitmap.from_buffer(self, offset, length)
        width = self.byte_width
        byte_length = None if length is None else length * width
        mem = self.bytes_view(offset * width, byte_length)
        return TypedView(mem, self.mem_format)

    @property
    def is_boolean(self):
        """
        Whether this buffer uses the boolean sentinel format.
        """
        return self.mem_format is self._boolean_format
class TypedView(Sequence):
    """
    View a bytes-compatible object as a sequence of objects described
    by a struct format code.

    Each item is the tuple returned by struct.unpack() for one element.
    """

    def __init__(self, mem, mem_format):
        assert isinstance(mem, memoryview)
        self.mem = mem
        self.mem_format = mem_format
        self.byte_width = struct.calcsize('=' + mem_format)
        # Trailing bytes that do not make up a whole element are ignored
        self.length = mem.nbytes // self.byte_width

    def _check_index(self, index):
        """
        Raise IndexError unless 0 <= index < length.
        """
        if not 0 <= index < self.length:
            raise IndexError(
                f"Index {index} out of bounds for typed view "
                f"of length {self.length}")

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        self._check_index(index)
        w = self.byte_width
        # Cannot use memoryview.cast() because the 'e' format for half-floats
        # is poorly supported.
        mem = self.mem[index * w:(index + 1) * w]
        return struct.unpack('=' + self.mem_format, mem)
class Bitmap(Sequence):
    """
    View a bytes-compatible object as a sequence of bools.

    Bit `i` of the bitmap is bit `(offset + i) % 8` (counting from the
    least significant bit) of byte `(offset + i) // 8` of the view.
    """
    _masks = [0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80]

    def __init__(self, view, offset, length):
        self.view = view
        self.offset = offset
        self.length = length

    def _check_index(self, index):
        """
        Raise IndexError unless 0 <= index < length.
        """
        if not 0 <= index < self.length:
            raise IndexError("Wrong index for bitmap")

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        self._check_index(index)
        index += self.offset
        byte_index, bit_index = divmod(index, 8)
        byte = self.view[byte_index]
        return byte & self._masks[bit_index] != 0

    @staticmethod
    def _byte_span(offset, length):
        """
        Return (byte_offset, bit_offset, byte_length) for the bytes that
        hold `length` bits starting at bit `offset`.
        """
        byte_offset, bit_offset = divmod(offset, 8)
        # Index of the first byte past the last bit, rounded up
        end_byte = (offset + length + 7) // 8
        return byte_offset, bit_offset, end_byte - byte_offset

    @classmethod
    def from_buffer(cls, buf, offset, length):
        """
        Create a bitmap of `length` bits starting at bit `offset` of
        Buffer `buf`. If `length` is None, the bitmap extends to the end
        of the buffer.
        """
        assert isinstance(buf, Buffer)
        if length is None:
            length = max(buf.size * 8 - offset, 0)
        byte_offset, bit_offset, byte_length = cls._byte_span(offset, length)
        return cls(buf.bytes_view(byte_offset, byte_length),
                   bit_offset, length)
class MappedView(Sequence):
    """
    A sequence that applies `func` to each item of `view` on access.
    """

    def __init__(self, func, view):
        self.func = func
        self.view = view

    def __len__(self):
        return len(self.view)

    def __getitem__(self, index):
        item = self.view[index]
        return self.func(item)
class StarMappedView(Sequence):
    """
    A sequence that calls `func` with each item of `view` unpacked as
    positional arguments on access.
    """

    def __init__(self, func, view):
        self.func = func
        self.view = view

    def __len__(self):
        return len(self.view)

    def __getitem__(self, index):
        args = self.view[index]
        return self.func(*args)
class NullBitmap(Bitmap):
    """
    A Bitmap whose underlying view may be None, in which case every
    in-range bit reads as True.
    """

    def __getitem__(self, index):
        self._check_index(index)
        if self.view is not None:
            return super().__getitem__(index)
        return True

    @classmethod
    def from_buffer(cls, buf, offset, length):
        """
        Create a null bitmap from a Buffer (or None if missing,
        in which case all values are True).
        """
        if buf is not None:
            return super().from_buffer(buf, offset, length)
        return cls(buf, offset, length)
KeyValue = namedtuple('KeyValue', ['key', 'value'])


class Metadata(Sequence):
    """
    A arrow::KeyValueMetadata value.

    Items are KeyValue pairs of StdString objects.
    """

    def __init__(self, val):
        self.val = val
        self.keys = StdVector(self.val['keys_'])
        self.values = StdVector(self.val['values_'])

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, i):
        key = StdString(self.keys[i])
        value = StdString(self.values[i])
        return KeyValue(key, value)
class MetadataPtr(Sequence):
    """
    A shared_ptr<arrow::KeyValueMetadata> value, possibly null.

    A null pointer behaves as an empty sequence.
    """

    def __init__(self, val):
        self.ptr = SharedPtr(val).get()
        self.is_null = int(self.ptr) == 0
        if self.is_null:
            self.md = None
        else:
            self.md = Metadata(self.ptr.dereference())

    def __len__(self):
        if self.is_null:
            return 0
        return len(self.md)

    def __getitem__(self, i):
        if not self.is_null:
            return self.md[i]
        raise IndexError
# `struct_format_le` lists the struct codes of the decimal's words from
# least to most significant; only the last word is signed.
DecimalTraits = namedtuple('DecimalTraits',
                           ['bit_width', 'struct_format_le'])

decimal_traits = {
    128: DecimalTraits(bit_width=128, struct_format_le='Qq'),
    256: DecimalTraits(bit_width=256, struct_format_le='QQQq'),
}
class BaseDecimal:
    """
    Base class for arrow::BasicDecimal{128,256...} values.

    Subclasses provide a `traits` attribute giving the bit width and the
    struct format of the decimal's words.
    """

    def __init__(self, address):
        self.address = address

    @classmethod
    def from_value(cls, val):
        """
        Create a decimal from a gdb.Value representing the corresponding
        arrow::BasicDecimal{128,256...}.
        """
        storage = val['array_']
        return cls(storage.address)

    @classmethod
    def from_address(cls, address):
        """
        Create a decimal from a gdb.Value representing the address of the
        raw decimal storage.
        """
        return cls(address)

    @property
    def words(self):
        """
        The decimal words, from least to most significant.
        """
        nbytes = self.traits.bit_width // 8
        mem = gdb.selected_inferior().read_memory(self.address, nbytes)
        big_endian = byte_order() == 'big'
        fmt = self.traits.struct_format_le
        if big_endian:
            # Words are laid out most significant first: unpack them in
            # that order, then flip the result.
            fmt = fmt[::-1]
        unpacked = struct.unpack(f"={fmt}", mem)
        return unpacked[::-1] if big_endian else unpacked

    def __int__(self):
        """
        The underlying bigint value.
        """
        words = self.words
        bits_per_word = self.traits.bit_width // len(words)
        result = 0
        # Accumulate starting from the most significant word
        for word in words[::-1]:
            result = (result << bits_per_word) + word
        return result

    def format(self, precision, scale):
        """
        Format as a decimal number with the given precision and scale.
        """
        unscaled = int(self)
        with decimal.localcontext() as ctx:
            ctx.prec = precision
            ctx.capitals = False
            scaled = decimal.Decimal(unscaled).scaleb(-scale)
            return str(scaled)
class Decimal128(BaseDecimal):
    """
    A 128-bit decimal.
    """
    traits = decimal_traits[128]
class Decimal256(BaseDecimal):
    """
    A 256-bit decimal.
    """
    traits = decimal_traits[256]
# Decimal class for each supported bit width
decimal_bits_to_class = {128: Decimal128, 256: Decimal256}

# Same classes, keyed by type class name ("Decimal128Type", ...)
decimal_type_to_class = {
    f"Decimal{bit_width}Type": decimal_class
    for bit_width, decimal_class in decimal_bits_to_class.items()
}
class ExtensionType:
    """
    A arrow::ExtensionType.
    """

    def __init__(self, val):
        self.val = val

    @property
    def storage_type(self):
        """
        The value that the `storage_type_` member points to.
        """
        storage_pointer = self.val['storage_type_']
        return deref(storage_pointer)

    def to_string(self):
        """
        The result of calling ToString().
        """
        expression = f"{for_evaluation(self.val)}.ToString()"
        return StdString(gdb.parse_and_eval(expression))
class Schema:
    """
    A arrow::Schema.

    The fields and metadata are read from the object that the `impl_`
    member points to.
    """

    def __init__(self, val):
        self.val = val
        impl = deref(self.val['impl_'])
        self.fields = FieldVector(impl['fields_'])
        self.metadata = MetadataPtr(impl['metadata_'])
class RecordBatch:
    """
    A arrow::RecordBatch.
    """

    def __init__(self, val):
        # XXX this relies on RecordBatch always being a SimpleRecordBatch
        # under the hood. What if users create their own RecordBatch
        # implementation?
        concrete_type = gdb.lookup_type("arrow::SimpleRecordBatch")
        self.val = cast_to_concrete(val, concrete_type)
        self.schema = Schema(deref(self.val['schema_']))
        self.columns = StdPtrVector(self.val['columns_'])

    @property
    def num_rows(self):
        """
        The `num_rows_` member.
        """
        return self.val['num_rows_']
class Table:
    """
    A arrow::Table.
    """

    def __init__(self, val):
        # XXX this relies on Table always being a SimpleTable under the hood.
        # What if users create their own Table implementation?
        concrete_type = gdb.lookup_type("arrow::SimpleTable")
        self.val = cast_to_concrete(val, concrete_type)
        self.schema = Schema(deref(self.val['schema_']))
        self.columns = StdPtrVector(self.val['columns_'])

    @property
    def num_rows(self):
        """
        The `num_rows_` member.
        """
        return self.val['num_rows_']
# Map a type class name to the short name displayed after "arrow::"
type_reprs = dict(
    NullType='null',
    BooleanType='boolean',
    UInt8Type='uint8',
    Int8Type='int8',
    UInt16Type='uint16',
    Int16Type='int16',
    UInt32Type='uint32',
    Int32Type='int32',
    UInt64Type='uint64',
    Int64Type='int64',
    HalfFloatType='float16',
    FloatType='float32',
    DoubleType='float64',
    Date32Type='date32',
    Date64Type='date64',
    Time32Type='time32',
    Time64Type='time64',
    TimestampType='timestamp',
    MonthIntervalType='month_interval',
    DayTimeIntervalType='day_time_interval',
    MonthDayNanoIntervalType='month_day_nano_interval',
    DurationType='duration',
    Decimal128Type='decimal128',
    Decimal256Type='decimal256',
    StringType='utf8',
    LargeStringType='large_utf8',
    BinaryType='binary',
    LargeBinaryType='large_binary',
    FixedSizeBinaryType='fixed_size_binary',
    ListType='list',
    LargeListType='large_list',
    FixedSizeListType='fixed_size_list',
    MapType='map',
    StructType='struct_',
    SparseUnionType='sparse_union',
    DenseUnionType='dense_union',
    DictionaryType='dictionary',
)
class TypePrinter:
    """
    Pretty-printer for arrow::DataTypeClass and subclasses.
    """

    def __init__(self, name, val):
        self.name = name
        # Cast to concrete type class to access all derived methods
        # and properties.
        concrete_type = gdb.lookup_type(f"arrow::{name}")
        self.type = concrete_type
        self.val = cast_to_concrete(val, concrete_type)

    @property
    def fields(self):
        """
        The `children_` member, as a FieldVector.
        """
        return FieldVector(self.val['children_'])

    def _format_type(self):
        # Fall back on the class name if no short name is registered
        short_name = type_reprs.get(self.name, self.name)
        return f"arrow::{short_name}"

    def _for_evaluation(self):
        return for_evaluation(self.val, self.type)
class PrimitiveTypePrinter(TypePrinter):
    """
    Pretty-printer for non-parametric types.
    """

    def to_string(self):
        type_name = self._format_type()
        return f"{type_name}()"
class TimeTypePrinter(TypePrinter):
    """
    Pretty-printer for time and duration types.
    """

    def _get_unit(self):
        return self.val['unit_']

    def to_string(self):
        type_name = self._format_type()
        unit = self._get_unit()
        return f"{type_name}({unit})"
class TimestampTypePrinter(TimeTypePrinter):
    """
    Pretty-printer for timestamp types.
    """

    def to_string(self):
        tz = StdString(self.val['timezone_'])
        type_name = self._format_type()
        unit = self._get_unit()
        # The timezone is only displayed when non-empty
        if not tz:
            return f'{type_name}({unit})'
        return f'{type_name}({unit}, {tz})'
class FixedSizeBinaryTypePrinter(TypePrinter):
    """
    Pretty-printer for fixed-size binary types.
    """

    def to_string(self):
        byte_width = int(self.val['byte_width_'])
        type_name = self._format_type()
        return f"{type_name}({byte_width})"
class DecimalTypePrinter(TypePrinter):
    """
    Pretty-printer for decimal types.
    """

    def to_string(self):
        precision, scale = (int(self.val[member])
                            for member in ('precision_', 'scale_'))
        return f"{self._format_type()}({precision}, {scale})"
class ListTypePrinter(TypePrinter):
    """
    Pretty-printer for list types.
    """

    def _get_value_type(self):
        """
        The type of the single child field, or None if the number of
        child fields is not exactly one.
        """
        fields = self.fields
        return fields[0].type if len(fields) == 1 else None

    def to_string(self):
        child = self._get_value_type()
        if child is not None:
            return f"{self._format_type()}({child})"
        return f"{self._format_type()}<uninitialized or corrupt>"
class FixedSizeListTypePrinter(ListTypePrinter):
    """
    Pretty-printer for fixed-size list type.
    """

    def to_string(self):
        type_name = self._format_type()
        child = self._get_value_type()
        if child is None:
            return f"{type_name}<uninitialized or corrupt>"
        list_size = int(self.val['list_size_'])
        return f"{type_name}({child}, {list_size})"
class MapTypePrinter(ListTypePrinter):
    """
    Pretty-printer for map types.
    """

    def to_string(self):
        type_name = self._format_type()
        corrupt = f"{type_name}<uninitialized or corrupt>"
        struct_type = self._get_value_type()
        if struct_type is None:
            return corrupt
        # The value type is expected to have exactly two children:
        # the key field and the item field.
        struct_children = FieldVector(struct_type['children_'])
        if len(struct_children) != 2:
            return corrupt
        key_type = struct_children[0].type
        item_type = struct_children[1].type
        keys_sorted = self.val['keys_sorted_']
        return (f"{type_name}({key_type}, {item_type}, "
                f"keys_sorted={keys_sorted})")
class DictionaryTypePrinter(TypePrinter):
    """
    Pretty-printer for dictionary types.
    """

    def to_string(self):
        type_name = self._format_type()
        index_type = deref(self.val['index_type_'])
        value_type = deref(self.val['value_type_'])
        ordered = self.val['ordered_']
        arguments = f"{index_type}, {value_type}, ordered={ordered}"
        return f"{type_name}({arguments})"
class StructTypePrinter(TypePrinter):
    """
    Pretty-printer for struct types.
    """

    def to_string(self):
        type_name = self._format_type()
        fields = self.fields
        return f"{type_name}({fields})"
class UnionTypePrinter(TypePrinter):
    """
    Pretty-printer for union types.
    """

    def to_string(self):
        # Type codes are cast to `int` before being displayed
        codes = [str(code.cast(gdb.lookup_type('int')))
                 for code in StdVector(self.val['type_codes_'])]
        type_codes = "{" + ", ".join(codes) + "}"
        return f"{self._format_type()}(fields={self.fields}, type_codes={type_codes})"
class ExtensionTypePrinter(TypePrinter):
    """
    Pretty-printer for extension types.
    """

    def to_string(self):
        ext_type = ExtensionType(self.val)
        type_name = self._format_type()
        description = ext_type.to_string().string_literal()
        storage_type = ext_type.storage_type
        return (f"{type_name} {description} "
                f"with storage type {storage_type}")
class ScalarPrinter:
    """
    Pretty-printer for arrow::Scalar and subclasses.

    Instantiating this class returns an instance of the printer class
    registered for the scalar's type id.
    """

    def __new__(cls, val):
        # Lookup actual (derived) class to instantiate
        type_id = int(deref(val['type'])['id_'])
        type_class = lookup_type_class(type_id)
        if type_class is not None:
            cls = type_class.scalar_printer
            assert issubclass(cls, ScalarPrinter)
        printer = object.__new__(cls)
        printer.type_class = type_class
        printer.type_name = type_class.name
        printer.name = scalar_class_from_type(printer.type_name)
        printer.type_id = type_id
        # Cast to concrete Scalar class to access derived attributes.
        concrete_type = gdb.lookup_type(f"arrow::{printer.name}")
        printer.val = cast_to_concrete(val, concrete_type)
        printer.is_valid = bool(printer.val['is_valid'])
        return printer

    @property
    def type(self):
        """
        The concrete DataTypeClass instance.
        """
        concrete_type = gdb.lookup_type(f"arrow::{self.type_name}")
        generic_type = deref(self.val['type'])
        return cast_to_concrete(generic_type, concrete_type)

    def _format_type(self):
        return f"arrow::{self.name}"

    def _format_null(self):
        # Only parametric types have their type spelled out
        if not self.type_class.is_parametric:
            return f"{self._format_type()} of null value"
        return f"{self._format_type()} of type {self.type}, null value"

    def _for_evaluation(self):
        return for_evaluation(self.val)
class NullScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::NullScalar.
    """

    def to_string(self):
        # Only the class name is displayed
        type_name = self._format_type()
        return type_name
class NumericScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for numeric Arrow scalars.
    """

    def to_string(self):
        if not self.is_valid:
            return self._format_null()
        value = self.val['value']
        type_name = self.type_name
        if type_name == "HalfFloatType":
            # Display the decoded float followed by the raw stored value
            decoded = half_float_value(value)
            return (f"{self._format_type()} "
                    f"of value {decoded} [{value}]")
        if type_name in ("UInt8Type", "Int8Type"):
            # 8-bit values are cast to `int` before being displayed
            value = value.cast(gdb.lookup_type('int'))
        return f"{self._format_type()} of value {value}"
class TimeScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for Arrow time-like scalars.
    """

    def to_string(self):
        unit = short_time_unit(self.type['unit_'])
        type_name = self._format_type()
        if self.is_valid:
            value = self.val['value']
            return f"{type_name} of value {value}{unit}"
        return f"{type_name} of null value [{unit}]"
class Date32ScalarPrinter(TimeScalarPrinter):
    """
    Pretty-printer for arrow::Date32Scalar.
    """

    def to_string(self):
        if not self.is_valid:
            return self._format_null()
        formatted = format_date32(self.val['value'])
        return f"{self._format_type()} of value {formatted}"
class Date64ScalarPrinter(TimeScalarPrinter):
    """
    Pretty-printer for arrow::Date64Scalar.
    """

    def to_string(self):
        if not self.is_valid:
            return self._format_null()
        formatted = format_date64(self.val['value'])
        return f"{self._format_type()} of value {formatted}"
class TimestampScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::TimestampScalar.
    """

    def to_string(self):
        unit = short_time_unit(self.type['unit_'])
        timezone = StdString(self.type['timezone_'])
        if timezone.size != 0:
            tz = timezone.string_literal()
        else:
            tz = "no timezone"
        type_name = self._format_type()
        if self.is_valid:
            value = self.val['value']
            return f"{type_name} of value {value}{unit} [{tz}]"
        return f"{type_name} of null value [{unit}, {tz}]"
class MonthIntervalScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::MonthIntervalScalar.
    """

    def to_string(self):
        if not self.is_valid:
            return self._format_null()
        formatted = format_month_interval(self.val['value'])
        return f"{self._format_type()} of value {formatted}"
class DecimalScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::DecimalScalar and subclasses.
    """

    @property
    def decimal_class(self):
        """
        The decimal class registered for this scalar's type name.
        """
        return decimal_type_to_class[self.type_name]

    def to_string(self):
        ty = self.type
        precision = int(ty['precision_'])
        scale = int(ty['scale_'])
        suffix = f"[precision={precision}, scale={scale}]"
        type_name = self._format_type()
        if not self.is_valid:
            return f"{type_name} of null value {suffix}"
        decoded = self.decimal_class.from_value(self.val['value'])
        value = decoded.format(precision, scale)
        return f"{type_name} of value {value} {suffix}"
class BaseBinaryScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::BaseBinaryScalar and subclasses.
    """

    def _format_buf(self, bufptr):
        # Types with 'String' in their name are displayed as utf-8 text,
        # the others as bytes.
        if 'String' not in self.type_name:
            return bufptr.bytes_literal()
        return utf8_literal(bufptr.data, bufptr.size)

    def to_string(self):
        if not self.is_valid:
            return self._format_null()
        bufptr = BufferPtr(SharedPtr(self.val['value']).get())
        size = bufptr.size
        type_name = self._format_type()
        if size is None:
            # Null buffer pointer
            return f"{type_name} of value <unallocated>"
        return (f"{type_name} of size {size}, "
                f"value {self._format_buf(bufptr)}")
class FixedSizeBinaryScalarPrinter(BaseBinaryScalarPrinter):
    """
    Pretty-printer for arrow::FixedSizeBinaryScalar.
    """

    def to_string(self):
        size = self.type['byte_width_']
        prefix = f"{self._format_type()} of size {size}, "
        if not self.is_valid:
            return prefix + "null value"
        bufptr = BufferPtr(SharedPtr(self.val['value']).get())
        if bufptr.data is None:
            # Null buffer pointer
            return prefix + "<unallocated>"
        return prefix + f"value {self._format_buf(bufptr)}"
class DictionaryScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::DictionaryScalar.
    """

    def to_string(self):
        if not self.is_valid:
            return self._format_null()
        stored = self.val['value']
        index = deref(stored['index'])
        dictionary = deref(stored['dictionary'])
        return (f"{self._format_type()} of index {index}, "
                f"dictionary {dictionary}")
class BaseListScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::BaseListScalar and its subclasses.
    """

    def to_string(self):
        if self.is_valid:
            # The dereferenced 'value' member is formatted by gdb itself
            items = deref(self.val['value'])
            return f"{self._format_type()} of value {items}"
        return self._format_null()
class StructScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::StructScalar.

    Children are emitted as alternating ("name", ...) / ("value", ...)
    pairs, matching the 'map' display hint.
    """

    def display_hint(self):
        return 'map'

    def children(self):
        field_ptrs = StdVector(self.type['children_'])
        value_ptrs = StdVector(self.val['value'])
        for field_ptr, value_ptr in zip(field_ptrs, value_ptrs):
            field_name = StdString(deref(field_ptr)['name_'])
            yield ("name", field_name.string_literal())
            yield ("value", deref(value_ptr))

    def to_string(self):
        if self.is_valid:
            return f"{self._format_type()}"
        return self._format_null()
class UnionScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::UnionScalar and its subclasses.
    """

    def to_string(self):
        # Cast the type code to a plain int before formatting it
        raw_code = self.val['type_code']
        type_code = raw_code.cast(gdb.lookup_type('int'))
        if self.is_valid:
            value = deref(self.val['value'])
            head = f"{self._format_type()} of type code {type_code}, "
            return head + f"value {value}"
        head = f"{self._format_type()} of type {self.type}, "
        return head + f"type code {type_code}, null value"
class MapScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::MapScalar.

    The value is summarized through an ArrayDataPrinter built on the
    underlying array's data.
    """

    def to_string(self):
        if self.is_valid:
            array_data = deref(deref(self.val['value'])['data_'])
            summary = ArrayDataPrinter("arrow::ArrayData", array_data)
            head = f"{self._format_type()} of type {self.type}, "
            return head + f"value {summary._format_contents()}"
        return self._format_null()
class ExtensionScalarPrinter(ScalarPrinter):
    """
    Pretty-printer for arrow::ExtensionScalar.

    The extension type's own string representation is included in the
    output, whether the scalar is null or not.
    """

    def to_string(self):
        ext_type = ExtensionType(self.type)
        valid = self.is_valid
        value = deref(self.val['value']) if valid else None
        head = (f"{self._format_type()} of type "
                f"{ext_type.to_string().string_literal()}")
        if valid:
            return f"{head}, value {value}"
        return f"{head}, null value"
class ArrayDataPrinter:
    """
    Pretty-printer for arrow::ArrayData.

    Instantiation dispatches on the array's type id: __new__ returns an
    instance of the ArrayDataPrinter subclass registered for that type id
    (if any), so callers always write ``ArrayDataPrinter(name, val)``.
    """

    def __new__(cls, name, val):
        # Lookup actual (derived) class to instantiate
        type_id = int(deref(val['type'])['id_'])
        type_class = lookup_type_class(type_id)
        if type_class is not None:
            cls = type_class.array_data_printer
            assert issubclass(cls, ArrayDataPrinter)
        self = object.__new__(cls)
        self.name = name
        self.val = val
        self.type_class = type_class
        # An unknown type id (for example when looking at uninitialized or
        # corrupt memory) has no type class: keep the printer usable
        # instead of failing with AttributeError on `None.name`.
        self.type_name = None if type_class is None else type_class.name
        self.type_id = type_id
        self.offset = int(self.val['offset'])
        self.length = int(self.val['length'])
        return self

    @property
    def type(self):
        """
        The array's data type, cast to its concrete arrow::<type_name>
        class when the type id is known; otherwise the value as declared.
        """
        declared = deref(self.val['type'])
        if self.type_name is None:
            return declared
        concrete_type = gdb.lookup_type(f"arrow::{self.type_name}")
        return cast_to_concrete(declared, concrete_type)

    def _format_contents(self):
        """
        Return a one-line summary: length, offset and null count.
        """
        return (f"length {self.length}, "
                f"offset {self.offset}, "
                f"{format_null_count(self.val['null_count'])}")

    def _buffer(self, index, type_id=None):
        """
        Return a wrapper around the buffer at the given index, or None
        if the buffer pointer is null.

        A TypedBuffer is returned when `type_id` is given, a plain Buffer
        otherwise.
        """
        buffers = StdVector(self.val['buffers'])
        bufptr = SharedPtr(buffers[index]).get()
        if int(bufptr) == 0:
            return None
        if type_id is not None:
            return TypedBuffer.from_type_id(bufptr.dereference(), type_id)
        else:
            return Buffer(bufptr.dereference())

    def _buffer_values(self, index, type_id, length=None):
        """
        Return a typed view of values in the buffer with the given index.

        Values are returned as tuples since some types may decode to
        multiple values (for example day_time_interval).
        None is returned if the buffer pointer is null.
        """
        buf = self._buffer(index, type_id)
        if buf is None:
            return None
        if length is None:
            length = self.length
        return buf.view(self.offset, length)

    def _unpacked_buffer_values(self, index, type_id, length=None):
        """
        Like _buffer_values(), but assumes values are 1-tuples
        and returns them unpacked.
        """
        return StarMappedView(identity,
                              self._buffer_values(index, type_id, length))

    def _null_bitmap(self):
        # Buffer 0 is only consulted for types that have a null bitmap
        buf = self._buffer(0) if has_null_bitmap(self.type_id) else None
        return NullBitmap.from_buffer(buf, self.offset, self.length)

    def _null_child(self, i):
        # (name, value) pair for a null element at index i
        return str(i), "null"

    def _valid_child(self, i, value):
        # (name, value) pair for a non-null element at index i
        return str(i), value

    def display_hint(self):
        return None

    def children(self):
        # The generic printer shows no elements; subclasses override this
        return ()

    def to_string(self):
        ty = self.type
        return (f"{self.name} of type {ty}, "
                f"{self._format_contents()}")
class NumericArrayDataPrinter(ArrayDataPrinter):
    """
    ArrayDataPrinter specialization for numeric data types.

    Subclasses customize element rendering by overriding _format_value.
    """
    _format_value = staticmethod(identity)

    def _values_view(self):
        # Buffer 1 is read as values of this array's type id
        formatter = self._format_value
        return StarMappedView(formatter,
                              self._buffer_values(1, self.type_id))

    def display_hint(self):
        return "array"

    def children(self):
        if self.length == 0:
            return
        formatted = self._values_view()
        validity = self._null_bitmap()
        pairs = zip(validity, formatted)
        for position, (is_valid, item) in enumerate(pairs):
            if not is_valid:
                yield self._null_child(position)
                continue
            yield self._valid_child(position, str(item))
class BooleanArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for boolean.
    """

    def _format_value(self, v):
        # Lower-cased string form of the value
        text = str(v)
        return text.lower()

    def _values_view(self):
        # Uses MappedView: the formatter receives each item as-is
        raw_values = self._buffer_values(1, self.type_id)
        return MappedView(self._format_value, raw_values)
class Date32ArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for date32.

    Elements are rendered with format_date32().
    """
    _format_value = staticmethod(format_date32)
class Date64ArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for date64.

    Elements are rendered with format_date64().
    """
    _format_value = staticmethod(format_date64)
class TimeArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for time32 and time64.

    Each element is shown as its raw value followed by the short name of
    the type's time unit.
    """

    def __init__(self, name, val):
        # `name` and `val` were already stored on the instance by
        # ArrayDataPrinter.__new__; only unit information is added here.
        unit = self.type['unit_']
        self.unit = unit
        self.unit_string = short_time_unit(unit)

    def _format_value(self, val):
        return f"{val}{self.unit_string}"
class TimestampArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for timestamp.

    Elements are rendered with format_timestamp() using the type's unit.
    """

    def __init__(self, name, val):
        # `name` and `val` were already stored on the instance by
        # ArrayDataPrinter.__new__; only the time unit is added here.
        timestamp_type = self.type
        self.unit = timestamp_type['unit_']

    def _format_value(self, val):
        unit = self.unit
        return format_timestamp(val, unit)
class MonthIntervalArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for month_interval.

    Elements are rendered with format_month_interval().
    """
    _format_value = staticmethod(format_month_interval)
class DayTimeIntervalArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for day_time_interval.

    Elements are rendered with format_days_milliseconds().
    """
    _format_value = staticmethod(format_days_milliseconds)
class MonthDayNanoIntervalArrayDataPrinter(NumericArrayDataPrinter):
    """
    ArrayDataPrinter specialization for month_day_nano_interval.

    Elements are rendered with format_months_days_nanos().
    """
    _format_value = staticmethod(format_months_days_nanos)
class DecimalArrayDataPrinter(ArrayDataPrinter):
    """
    ArrayDataPrinter specialization for decimals.

    Elements are decoded one by one from buffer 1, stepping by the byte
    width of the decimal class.
    """

    def __init__(self, name, val):
        # `name` and `val` were already stored on the instance by
        # ArrayDataPrinter.__new__; only decimal parameters are added here.
        decimal_type = self.type
        self.precision = int(decimal_type['precision_'])
        self.scale = int(decimal_type['scale_'])
        self.decimal_class = decimal_type_to_class[self.type_name]
        self.byte_width = self.decimal_class.traits.bit_width // 8

    def display_hint(self):
        return "array"

    def children(self):
        if self.length == 0:
            return
        validity = self._null_bitmap()
        # Start at the first element visible through the array's offset
        cursor = self._buffer(1).data + self.offset * self.byte_width
        for position, is_valid in enumerate(validity):
            if not is_valid:
                yield self._null_child(position)
            else:
                decoded = self.decimal_class.from_address(cursor)
                text = decoded.format(self.precision, self.scale)
                yield self._valid_child(position, text)
            cursor += self.byte_width
class FixedSizeBinaryArrayDataPrinter(ArrayDataPrinter):
    """
    ArrayDataPrinter specialization for fixed_size_binary.

    Elements are read from buffer 1, each one `byte_width` bytes long.
    """

    def __init__(self, name, val):
        # `name` and `val` were already stored on the instance by
        # ArrayDataPrinter.__new__; only the byte width is added here.
        binary_type = self.type
        self.byte_width = binary_type['byte_width_']

    def display_hint(self):
        return "array"

    def children(self):
        if self.length == 0:
            return
        validity = self._null_bitmap()
        # Start at the first element visible through the array's offset
        cursor = self._buffer(1).data + self.offset * self.byte_width
        for position, is_valid in enumerate(validity):
            if not is_valid:
                yield self._null_child(position)
            elif self.byte_width:
                literal = bytes_literal(cursor, self.byte_width)
                yield self._valid_child(position, literal)
            else:
                # Zero-width elements are shown as an empty literal
                yield self._valid_child(position, '""')
            cursor += self.byte_width
class BinaryArrayDataPrinter(ArrayDataPrinter):
    """
    ArrayDataPrinter specialization for variable-sized binary.

    Offsets are read from buffer 1 (64-bit for the "large" types, 32-bit
    otherwise) and element bytes from buffer 2.
    """

    def __init__(self, name, val):
        # `name` and `val` were already stored on the instance by
        # ArrayDataPrinter.__new__; only type-derived flags are added here.
        type_id = self.type_id
        self.is_large = type_id in (Type.LARGE_BINARY, Type.LARGE_STRING)
        self.is_utf8 = type_id in (Type.STRING, Type.LARGE_STRING)
        if self.is_utf8:
            self.format_string = utf8_literal
        else:
            self.format_string = bytes_literal

    def display_hint(self):
        return "array"

    def children(self):
        if self.length == 0:
            return
        validity = self._null_bitmap()
        offset_type = Type.INT64 if self.is_large else Type.INT32
        # One more offset than elements: element i spans
        # offsets[i]..offsets[i + 1]
        offsets = self._unpacked_buffer_values(
            1, offset_type, length=self.length + 1)
        data = self._buffer(2).data
        for position, is_valid in enumerate(validity):
            if not is_valid:
                yield self._null_child(position)
                continue
            begin = offsets[position]
            nbytes = offsets[position + 1] - begin
            if nbytes:
                literal = self.format_string(data + begin, nbytes)
                yield self._valid_child(position, literal)
            else:
                yield self._valid_child(position, '""')
class ArrayPrinter:
    """
    Pretty-printer for arrow::Array and its subclasses.

    All the work is delegated to an ArrayDataPrinter built on the array's
    `data_` member.
    """

    def __init__(self, val):
        array_data = deref(val['data_'])
        self.data_printer = ArrayDataPrinter("arrow::ArrayData", array_data)
        self.name = array_class_from_type(self.data_printer.type_name)

    def _format_contents(self):
        return self.data_printer._format_contents()

    def to_string(self):
        delegate = self.data_printer
        if not delegate.type_class.is_parametric:
            return f"arrow::{self.name} of {self._format_contents()}"
        # Only parametric types are spelled out in the description
        ty = delegate.type
        return f"arrow::{self.name} of type {ty}, {self._format_contents()}"

    def display_hint(self):
        return self.data_printer.display_hint()

    def children(self):
        return self.data_printer.children()
class ChunkedArrayPrinter:
    """
    Pretty-printer for arrow::ChunkedArray.

    Each child is the one-line contents summary of one chunk.
    """

    def __init__(self, name, val):
        self.name = name
        self.val = val
        self.chunks = StdVector(val['chunks_'])

    def display_hint(self):
        return "array"

    def children(self):
        for position, chunk_ptr in enumerate(self.chunks):
            chunk_printer = ArrayPrinter(deref(chunk_ptr))
            yield str(position), chunk_printer._format_contents()

    def to_string(self):
        ty = deref(self.val['type_'])
        head = f"{self.name} of type {ty}, length {self.val['length_']}, "
        nulls = f"{format_null_count(self.val['null_count_'])} "
        return head + nulls + f"with {len(self.chunks)} chunks"
# The classes below describe, for each family of Arrow data types, which
# printer classes to use:
# - `type_printer`: printer class for the data type itself
# - `scalar_printer`: printer class for scalars of this type
# - `array_data_printer`: printer class for ArrayData of this type
# - `is_parametric`: whether ArrayPrinter spells out the type in its
#   description
# Instances are created by lookup_type_class() with the name of the
# corresponding C++ type class.

class DataTypeClass:
    # Types without a dedicated specialization use the generic printer
    array_data_printer = ArrayDataPrinter

    def __init__(self, name):
        # Name of the C++ type class (without the "arrow::" prefix)
        self.name = name


class NullTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = NullScalarPrinter


class NumericTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = NumericScalarPrinter
    array_data_printer = NumericArrayDataPrinter


class BooleanTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = NumericScalarPrinter
    array_data_printer = BooleanArrayDataPrinter


class Date32TypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = Date32ScalarPrinter
    array_data_printer = Date32ArrayDataPrinter


class Date64TypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = Date64ScalarPrinter
    array_data_printer = Date64ArrayDataPrinter


class TimeTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = TimeTypePrinter
    scalar_printer = TimeScalarPrinter
    array_data_printer = TimeArrayDataPrinter


class TimestampTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = TimestampTypePrinter
    scalar_printer = TimestampScalarPrinter
    array_data_printer = TimestampArrayDataPrinter


# Durations reuse the time printers
class DurationTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = TimeTypePrinter
    scalar_printer = TimeScalarPrinter
    array_data_printer = TimeArrayDataPrinter


class MonthIntervalTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = MonthIntervalScalarPrinter
    array_data_printer = MonthIntervalArrayDataPrinter


class DayTimeIntervalTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = NumericScalarPrinter
    array_data_printer = DayTimeIntervalArrayDataPrinter


class MonthDayNanoIntervalTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = NumericScalarPrinter
    array_data_printer = MonthDayNanoIntervalArrayDataPrinter


class DecimalTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = DecimalTypePrinter
    scalar_printer = DecimalScalarPrinter
    array_data_printer = DecimalArrayDataPrinter


class BaseBinaryTypeClass(DataTypeClass):
    is_parametric = False
    type_printer = PrimitiveTypePrinter
    scalar_printer = BaseBinaryScalarPrinter
    array_data_printer = BinaryArrayDataPrinter


class FixedSizeBinaryTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = FixedSizeBinaryTypePrinter
    scalar_printer = FixedSizeBinaryScalarPrinter
    array_data_printer = FixedSizeBinaryArrayDataPrinter


# The remaining type classes define no `array_data_printer` and therefore
# inherit the generic ArrayDataPrinter from DataTypeClass.

class BaseListTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = ListTypePrinter
    scalar_printer = BaseListScalarPrinter


class FixedSizeListTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = FixedSizeListTypePrinter
    scalar_printer = BaseListScalarPrinter


class MapTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = MapTypePrinter
    scalar_printer = MapScalarPrinter


class StructTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = StructTypePrinter
    scalar_printer = StructScalarPrinter


class UnionTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = UnionTypePrinter
    scalar_printer = UnionScalarPrinter


class DictionaryTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = DictionaryTypePrinter
    scalar_printer = DictionaryScalarPrinter


class ExtensionTypeClass(DataTypeClass):
    is_parametric = True
    type_printer = ExtensionTypePrinter
    scalar_printer = ExtensionScalarPrinter
# `factory` is the DataTypeClass subclass to instantiate and `name` the
# name of the C++ type class passed to it (see lookup_type_class()).
DataTypeTraits = namedtuple('DataTypeTraits', ('factory', 'name'))

# Maps each Type id to the traits used to build its type class.
type_traits_by_id = {
    Type.NA: DataTypeTraits(NullTypeClass, 'NullType'),
    Type.BOOL: DataTypeTraits(BooleanTypeClass, 'BooleanType'),
    Type.UINT8: DataTypeTraits(NumericTypeClass, 'UInt8Type'),
    Type.INT8: DataTypeTraits(NumericTypeClass, 'Int8Type'),
    Type.UINT16: DataTypeTraits(NumericTypeClass, 'UInt16Type'),
    Type.INT16: DataTypeTraits(NumericTypeClass, 'Int16Type'),
    Type.UINT32: DataTypeTraits(NumericTypeClass, 'UInt32Type'),
    Type.INT32: DataTypeTraits(NumericTypeClass, 'Int32Type'),
    Type.UINT64: DataTypeTraits(NumericTypeClass, 'UInt64Type'),
    Type.INT64: DataTypeTraits(NumericTypeClass, 'Int64Type'),
    Type.HALF_FLOAT: DataTypeTraits(NumericTypeClass, 'HalfFloatType'),
    Type.FLOAT: DataTypeTraits(NumericTypeClass, 'FloatType'),
    Type.DOUBLE: DataTypeTraits(NumericTypeClass, 'DoubleType'),
    Type.STRING: DataTypeTraits(BaseBinaryTypeClass, 'StringType'),
    Type.BINARY: DataTypeTraits(BaseBinaryTypeClass, 'BinaryType'),
    Type.LARGE_STRING: DataTypeTraits(BaseBinaryTypeClass, 'LargeStringType'),
    Type.LARGE_BINARY: DataTypeTraits(BaseBinaryTypeClass, 'LargeBinaryType'),
    Type.FIXED_SIZE_BINARY: DataTypeTraits(FixedSizeBinaryTypeClass,
                                           'FixedSizeBinaryType'),
    Type.DATE32: DataTypeTraits(Date32TypeClass, 'Date32Type'),
    Type.DATE64: DataTypeTraits(Date64TypeClass, 'Date64Type'),
    Type.TIMESTAMP: DataTypeTraits(TimestampTypeClass, 'TimestampType'),
    Type.TIME32: DataTypeTraits(TimeTypeClass, 'Time32Type'),
    Type.TIME64: DataTypeTraits(TimeTypeClass, 'Time64Type'),
    Type.DURATION: DataTypeTraits(DurationTypeClass, 'DurationType'),
    Type.INTERVAL_MONTHS: DataTypeTraits(MonthIntervalTypeClass,
                                         'MonthIntervalType'),
    Type.INTERVAL_DAY_TIME: DataTypeTraits(DayTimeIntervalTypeClass,
                                           'DayTimeIntervalType'),
    Type.INTERVAL_MONTH_DAY_NANO: DataTypeTraits(MonthDayNanoIntervalTypeClass,
                                                 'MonthDayNanoIntervalType'),
    Type.DECIMAL128: DataTypeTraits(DecimalTypeClass, 'Decimal128Type'),
    Type.DECIMAL256: DataTypeTraits(DecimalTypeClass, 'Decimal256Type'),
    Type.LIST: DataTypeTraits(BaseListTypeClass, 'ListType'),
    Type.LARGE_LIST: DataTypeTraits(BaseListTypeClass, 'LargeListType'),
    Type.FIXED_SIZE_LIST: DataTypeTraits(FixedSizeListTypeClass,
                                         'FixedSizeListType'),
    Type.MAP: DataTypeTraits(MapTypeClass, 'MapType'),
    Type.STRUCT: DataTypeTraits(StructTypeClass, 'StructType'),
    Type.SPARSE_UNION: DataTypeTraits(UnionTypeClass, 'SparseUnionType'),
    Type.DENSE_UNION: DataTypeTraits(UnionTypeClass, 'DenseUnionType'),
    Type.DICTIONARY: DataTypeTraits(DictionaryTypeClass, 'DictionaryType'),
    Type.EXTENSION: DataTypeTraits(ExtensionTypeClass, 'ExtensionType'),
}

# Derived from the number of entries above: this equals the highest type
# id only as long as the table covers every id from 0 without gaps.
max_type_id = len(type_traits_by_id) - 1
def lookup_type_class(type_id):
    """
    Return a new DataTypeClass instance for the given type id,
    or None if the type id is not registered in `type_traits_by_id`.
    """
    entry = type_traits_by_id.get(type_id)
    return None if entry is None else entry.factory(entry.name)
class StatusPrinter:
    """
    Pretty-printer for arrow::Status.

    A null state pointer is shown as OK; otherwise the status code name,
    the message and the optional detail are shown.
    """
    _status_codes_by_id = {
        0: 'OK',
        1: 'OutOfMemory',
        2: 'KeyError',
        3: 'TypeError',
        4: 'Invalid',
        5: 'IOError',
        6: 'CapacityError',
        7: 'IndexError',
        8: 'Cancelled',
        9: 'UnknownError',
        10: 'NotImplemented',
        11: 'SerializationError',
        13: 'RError',
        40: 'CodeGenError',
        41: 'ExpressionValidationError',
        42: 'ExecutionError',
        45: 'AlreadyExists',
    }

    def __init__(self, name, val):
        self.val = val

    def _format_detail(self, state):
        """
        Return the formatted status detail, or None if there is none.
        """
        detail_ptr = SharedPtr(state['detail']).get()
        if int(detail_ptr) == 0:
            return None

        def call_on_detail(call):
            # Evaluate a method call on the detail object in the debuggee
            return CString(gdb.parse_and_eval(
                f"{for_evaluation(detail_ptr)}->{call}"))

        detail_id = call_on_detail("type_id()")
        # Cannot use StdString as ToString() returns a rvalue
        detail_msg = call_on_detail("ToString().c_str()")
        return f"[{detail_id.string()}] {detail_msg.string_literal()}"

    def _format_error(self, state):
        """
        Return the description of a non-OK status state.
        """
        code = int(state['code'])
        codename = self._status_codes_by_id.get(code)
        if codename is None:
            pieces = [f"arrow::Status(<unknown code {code}>, "]
        else:
            pieces = [f"arrow::Status::{codename}("]
        pieces.append(StdString(state['msg']).string_literal())
        detail_msg = self._format_detail(state)
        if detail_msg is not None:
            pieces.append(f", detail={detail_msg}")
        pieces.append(")")
        return "".join(pieces)

    def to_string(self):
        state_ptr = self.val['state_']
        if int(state_ptr) != 0:
            return self._format_error(state_ptr.dereference())
        return "arrow::Status::OK()"
class ResultPrinter(StatusPrinter):
    """
    Pretty-printer for arrow::Result<T>.

    Shows either the error status or the stored value of type T.
    """

    def to_string(self):
        data_type = self.val.type.template_argument(0)
        state_ptr = self.val['status_']['state_']
        if int(state_ptr) == 0:
            # No error state: reinterpret the storage as a T
            data_ptr = self.val['storage_']['data_'].address
            assert data_ptr
            typed_ptr = data_ptr.reinterpret_cast(data_type.pointer())
            inner = typed_ptr.dereference()
        else:
            inner = self._format_error(state_ptr)
        return f"arrow::Result<{data_type}>({inner})"
class StringViewPrinter:
    """
    Pretty-printer for arrow::util::string_view.

    The contents are shown as a bytes literal, unless the view is empty.
    """

    def __init__(self, name, val):
        self.val = val

    def to_string(self):
        size = int(self.val['size_'])
        if size != 0:
            data = bytes_literal(self.val['data_'], size)
            return f"arrow::util::string_view of size {size}, {data}"
        return "arrow::util::string_view of size 0"
class OptionalPrinter:
    """
    Pretty-printer for arrow::util::optional.

    Shows either "nullopt" or the contained value.
    """

    def __init__(self, name, val):
        self.val = val

    def to_string(self):
        data_type = self.val.type.template_argument(0)
        # XXX We rely on internal details of our vendored optional<T>
        # implementation, as inlined methods may not be callable from gdb.
        if self.val['has_value_']:
            data_ptr = self.val['contained']['data'].address
            assert data_ptr
            typed_ptr = data_ptr.reinterpret_cast(data_type.pointer())
            inner = typed_ptr.dereference()
        else:
            inner = "nullopt"
        return f"arrow::util::optional<{data_type}>({inner})"
class VariantPrinter:
    """
    Pretty-printer for arrow::util::Variant.

    Shows the active index, the actual value type and, when available,
    the value itself.
    """

    def __init__(self, name, val):
        self.val = val
        self.variant = Variant(val)

    def to_string(self):
        variant = self.variant
        if variant.value_type is None:
            return "arrow::util::Variant (uninitialized or corrupt)"
        type_desc = (f"arrow::util::Variant of index {variant.index} "
                     f"(actual type {variant.value_type})")
        value = variant.value
        if value is not None:
            return f"{type_desc}, value {value}"
        return f"{type_desc}, unavailable value"
class FieldPrinter:
    """
    Pretty-printer for arrow::Field.

    Nullability is only mentioned for non-nullable fields.
    """

    def __init__(self, name, val):
        self.val = val

    def to_string(self):
        field = Field(self.val)
        if not field.nullable:
            return (f'arrow::field({field.name}, {field.type}, '
                    f'nullable=false)')
        return f'arrow::field({field.name}, {field.type})'
class MetadataPrinter:
    """
    Pretty-printer for arrow::KeyValueMetadata.

    Keys and values are shown as bytes literals, as alternating
    ("key", ...) / ("value", ...) children.
    """

    def __init__(self, name, val):
        self.val = val
        self.metadata = Metadata(val)

    def display_hint(self):
        return 'map'

    def children(self):
        for key, value in self.metadata:
            yield ("key", key.bytes_literal())
            yield ("value", value.bytes_literal())

    def to_string(self):
        count = len(self.metadata)
        return f"arrow::KeyValueMetadata of size {count}"
class SchemaPrinter:
    """
    Pretty-printer for arrow::Schema.

    Children are alternating ("name", ...) / ("type", ...) pairs, one pair
    per field.
    """

    def __init__(self, name, val):
        self.val = val
        self.schema = Schema(val)
        # TODO endianness

    def display_hint(self):
        return 'map'

    def children(self):
        for schema_field in self.schema.fields:
            yield ("name", schema_field.name.string_literal())
            yield ("type", schema_field.type)

    def to_string(self):
        num_fields = len(self.schema.fields)
        md_items = len(self.schema.metadata)
        summary = f"arrow::Schema with {num_fields} fields"
        if md_items > 0:
            # Metadata is only mentioned when present
            summary += f" and {md_items} metadata items"
        return summary
class BaseColumnarPrinter:
    """
    Common implementation for printers of columnar containers.

    `columnar` must expose `schema`, `columns` and `num_rows`; `name` is
    the class name shown after the "arrow::" prefix.
    """

    def __init__(self, name, val, columnar):
        self.name = name
        self.val = val
        self.columnar = columnar
        self.schema = columnar.schema

    def display_hint(self):
        return 'map'

    def children(self):
        pairs = zip(self.schema.fields, self.columnar.columns)
        for schema_field, column in pairs:
            yield ("name", schema_field.name.string_literal())
            yield ("value", column)

    def to_string(self):
        num_fields = len(self.schema.fields)
        num_rows = self.columnar.num_rows
        md_items = len(self.schema.metadata)
        summary = (f"arrow::{self.name} with {num_fields} columns, "
                   f"{num_rows} rows")
        if md_items > 0:
            # Metadata is only mentioned when present
            summary += f", {md_items} metadata items"
        return summary
class RecordBatchPrinter(BaseColumnarPrinter):
    """
    Pretty-printer for arrow::RecordBatch.
    """

    def __init__(self, name, val):
        # The displayed name is fixed, whatever name was matched
        batch = RecordBatch(val)
        super().__init__("RecordBatch", val, batch)
class TablePrinter(BaseColumnarPrinter):
    """
    Pretty-printer for arrow::Table.
    """

    def __init__(self, name, val):
        # The displayed name is fixed, whatever name was matched
        table = Table(val)
        super().__init__("Table", val, table)
class DatumPrinter:
    """
    Pretty-printer for arrow::Datum.

    The Datum's `value` member is decoded as a Variant.
    """

    def __init__(self, name, val):
        self.val = val
        self.variant = Variant(val['value'])

    def to_string(self):
        variant = self.variant
        if variant.index == 0:
            # Datum::NONE
            return "arrow::Datum (empty)"
        if variant.value_type is None:
            return "arrow::Datum (uninitialized or corrupt?)"
        # All non-empty Datums contain a shared_ptr<T>
        pointee = deref(variant.value)
        return f"arrow::Datum of value {pointee}"
class BufferPrinter:
    """
    Pretty-printer for arrow::Buffer and its subclasses.

    The buffer contents are only shown for non-empty CPU buffers.
    """

    def __init__(self, name, val):
        self.name = name
        self.val = val

    def to_string(self):
        mutable = 'mutable' if bool(self.val['is_mutable_']) else 'read-only'
        size = int(self.val['size_'])
        summary = f"arrow::{self.name} of size {size}, {mutable}"
        if size == 0:
            return summary
        if not self.val['is_cpu_']:
            return f"{summary}, not on CPU"
        data = bytes_literal(self.val['data_'], size)
        return f"{summary}, {data}"
class DayMillisecondsPrinter:
    """
    Pretty-printer for arrow::DayTimeIntervalType::DayMilliseconds.
    """

    def __init__(self, name, val):
        self.val = val

    def to_string(self):
        days = self.val['days']
        milliseconds = self.val['milliseconds']
        return format_days_milliseconds(days, milliseconds)
class MonthDayNanosPrinter:
    """
    Pretty-printer for arrow::MonthDayNanoIntervalType::MonthDayNanos.
    """

    def __init__(self, name, val):
        self.val = val

    def to_string(self):
        months = self.val['months']
        days = self.val['days']
        nanoseconds = self.val['nanoseconds']
        return format_months_days_nanos(months, days, nanoseconds)
class DecimalPrinter:
    """
    Pretty-printer for Arrow decimal values.

    `bit_width` selects the decoding class through `decimal_bits_to_class`.
    """

    def __init__(self, bit_width, name, val):
        self.name = name
        self.val = val
        self.bit_width = bit_width

    def to_string(self):
        decoder = decimal_bits_to_class[self.bit_width]
        dec = decoder.from_value(self.val)
        return f"{self.name}({int(dec)})"
# Printers looked up by exact type name, after typedefs and template
# parameters have been stripped (see arrow_pretty_print()).
# Each value is called as `printer(name, val)`.
printers = {
    "arrow::ArrayData": ArrayDataPrinter,
    "arrow::BasicDecimal128": partial(DecimalPrinter, 128),
    "arrow::BasicDecimal256": partial(DecimalPrinter, 256),
    "arrow::ChunkedArray": ChunkedArrayPrinter,
    "arrow::Datum": DatumPrinter,
    "arrow::DayTimeIntervalType::DayMilliseconds": DayMillisecondsPrinter,
    "arrow::Decimal128": partial(DecimalPrinter, 128),
    "arrow::Decimal256": partial(DecimalPrinter, 256),
    "arrow::MonthDayNanoIntervalType::MonthDayNanos": MonthDayNanosPrinter,
    "arrow::Field": FieldPrinter,
    "arrow::KeyValueMetadata": MetadataPrinter,
    "arrow::RecordBatch": RecordBatchPrinter,
    "arrow::Result": ResultPrinter,
    "arrow::Schema": SchemaPrinter,
    "arrow::SimpleRecordBatch": RecordBatchPrinter,
    "arrow::SimpleTable": TablePrinter,
    "arrow::Status": StatusPrinter,
    "arrow::Table": TablePrinter,
    "arrow::util::optional": OptionalPrinter,
    "arrow::util::string_view": StringViewPrinter,
    "arrow::util::Variant": VariantPrinter,
    "nonstd::optional_lite::optional": OptionalPrinter,
    "nonstd::sv_lite::basic_string_view": StringViewPrinter,
}
def arrow_pretty_print(val):
    """
    Return a pretty-printer for the given value, or None if the value
    is not handled by this extension.

    Exact type names are looked up in `printers` first; other "arrow::"
    types are then dispatched on the suffix of their name.
    """
    type_name = val.type.strip_typedefs().name
    if type_name is None:
        return None
    # Remove template parameters
    type_name = type_name.partition('<')[0]
    factory = printers.get(type_name)
    if factory is not None:
        return factory(type_name, val)

    prefix = "arrow::"
    if not type_name.startswith(prefix):
        return None
    arrow_name = type_name[len(prefix):]

    if arrow_name.endswith("Buffer"):
        try:
            val['data_']
        except Exception:
            # Not a Buffer?
            return None
        return BufferPrinter(arrow_name, val)

    if arrow_name.endswith("Type"):
        # Look up dynamic type, as it may be hidden behind a DataTypeClass
        # pointer or reference.
        try:
            type_id = int(val['id_'])
        except Exception:
            # Not a DataTypeClass?
            return None
        type_class = lookup_type_class(type_id)
        if type_class is None:
            return None
        return type_class.type_printer(type_class.name, val)

    if arrow_name.endswith("Array"):
        return ArrayPrinter(val)

    if arrow_name.endswith("Scalar"):
        try:
            val['is_valid']
        except Exception:
            # Not a Scalar?
            return None
        return ScalarPrinter(val)

    return None
def main():
    """
    Register arrow_pretty_print with gdb.
    """
    # This pattern allows for two modes of use:
    # - manual loading using `source gdb-arrow.py`: current_objfile()
    #   will be None;
    # - automatic loading from the GDB `scripts-directory`: current_objfile()
    #   will be tied to the inferior being debugged.
    current = gdb.current_objfile()
    target = gdb if current is None else current
    target.pretty_printers.append(arrow_pretty_print)


main()