blob: 5f962901c36973f6e09ba5c2a1cc65c1fcf46f74 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Lookup table from Arrow type id to the NumPy dtype used on the pandas side.
# The entries are imprecise because the type (in pandas 0.x) depends on the
# presence of nulls
cdef dict _pandas_type_map = {
    # Null type is stored as NaNs
    _Type_NA: np.float64,
    _Type_BOOL: np.bool_,
    # Signed integers
    _Type_INT8: np.int8,
    _Type_INT16: np.int16,
    _Type_INT32: np.int32,
    _Type_INT64: np.int64,
    # Unsigned integers
    _Type_UINT8: np.uint8,
    _Type_UINT16: np.uint16,
    _Type_UINT32: np.uint32,
    _Type_UINT64: np.uint64,
    # Floating point
    _Type_HALF_FLOAT: np.float16,
    _Type_FLOAT: np.float32,
    _Type_DOUBLE: np.float64,
    # Dates and timestamps all map to nanosecond datetime64
    _Type_DATE32: np.dtype('datetime64[ns]'),
    _Type_DATE64: np.dtype('datetime64[ns]'),
    _Type_TIMESTAMP: np.dtype('datetime64[ns]'),
    # Everything below is boxed into Python objects
    _Type_BINARY: np.object_,
    _Type_FIXED_SIZE_BINARY: np.object_,
    _Type_STRING: np.object_,
    _Type_LIST: np.object_,
    _Type_DECIMAL: np.object_,
}
cdef class DataType:
    """
    Common base class of the Apache Arrow data type wrappers.

    Every instance holds a C++ arrow::DataType.
    """

    def __cinit__(self):
        pass

    cdef void init(self, const shared_ptr[CDataType]& type):
        # Keep the shared_ptr alongside the raw pointer read by the methods
        self.sp_type = type
        self.type = type.get()

    property id:

        def __get__(self):
            return self.type.id()

    def __str__(self):
        if self.type is not NULL:
            return frombytes(self.type.ToString())
        # No wrapped type: the object was created through the bare
        # constructor and init() was never called
        raise TypeError(
            '{} is incomplete. The correct way to construct types is '
            'through public API functions named '
            'pyarrow.int64, pyarrow.list_, etc.'.format(
                type(self).__name__
            )
        )

    def __hash__(self):
        return hash(str(self))

    def __reduce__(self):
        # Pickle as (class, no constructor args, state); the state is
        # applied afterwards through __setstate__
        state = self.__getstate__()
        return self.__class__, (), state

    def __getstate__(self):
        return (str(self),)

    def __setstate__(self, state):
        # The stored string is resolved back to a type via the alias table
        cdef DataType restored = type_for_alias(state[0])
        self.init(restored.sp_type)

    def __repr__(self):
        return '{0.__class__.__name__}({0})'.format(self)

    def __richcmp__(DataType self, object other, int op):
        # Only == and != are meaningful for types
        if op != cp.Py_EQ and op != cp.Py_NE:
            raise TypeError('Invalid comparison')
        same = self.equals(other)
        return same if op == cp.Py_EQ else not same

    def equals(self, other):
        """
        Check whether this type is equivalent to `other`

        Parameters
        ----------
        other : DataType or string convertible to DataType

        Returns
        -------
        is_equal : boolean

        Raises
        ------
        TypeError
            If `other` is neither a DataType nor a string
        """
        cdef DataType other_type

        if isinstance(other, DataType):
            other_type = other
        elif isinstance(other, six.string_types):
            other_type = type_for_alias(other)
        else:
            raise TypeError(other)

        return self.type.Equals(deref(other_type.type))

    def to_pandas_dtype(self):
        """
        Return the NumPy dtype that would be used for storing this

        Raises
        ------
        NotImplementedError
            If the type id has no entry in the pandas type map
        """
        cdef Type type_id = self.type.id()

        dtype = _pandas_type_map.get(type_id)
        if dtype is None:
            raise NotImplementedError(str(self))
        return dtype
cdef class DictionaryType(DataType):
    """
    Wrapper around a C++ arrow::DictionaryType
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        # Typed view of the same pointer, used by the properties below
        self.dict_type = <const CDictionaryType*> type.get()

    property ordered:

        def __get__(self):
            return self.dict_type.ordered()
cdef class ListType(DataType):
    """
    Wrapper around a C++ arrow::ListType
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        self.list_type = <const CListType*> type.get()

    def __getstate__(self):
        # State is (name of the value field, wrapped value type)
        cdef CField* value_field = self.list_type.value_field().get()
        return value_field.name(), self.value_type

    def __setstate__(self, state):
        # Rebuild an equivalent list type from the pickled name and type
        value_name, value_type = state[0], state[1]
        cdef DataType rebuilt = list_(field(value_name, value_type))
        self.init(rebuilt.sp_type)

    property value_type:

        def __get__(self):
            return pyarrow_wrap_data_type(self.list_type.value_type())
cdef class StructType(DataType):
    """
    Wrapper around a C++ arrow::StructType
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)

    def __getitem__(self, i):
        """
        Return the child field at position i

        Raises
        ------
        IndexError
            If i is negative or not smaller than num_children
        """
        if i < 0 or i >= self.num_children:
            raise IndexError(i)

        return pyarrow_wrap_field(self.type.child(i))

    property num_children:

        def __get__(self):
            return self.type.num_children()

    def __getstate__(self):
        # The pickled state is simply the list of child fields
        return [self[i] for i in range(self.num_children)]

    def __setstate__(self, state):
        # Rebuild an equivalent struct type from the pickled child fields
        cdef DataType reconstituted = struct(state)
        self.init(reconstituted.sp_type)
cdef class UnionType(DataType):
    """
    Wrapper around a C++ arrow::UnionType
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        # Wrap the data type of every child once, so __getitem__ is a plain
        # list lookup
        self.child_types = [
            pyarrow_wrap_data_type(type.get().child(pos).get().type())
            for pos in range(self.num_children)]

    property num_children:

        def __get__(self):
            return self.type.num_children()

    property mode:

        def __get__(self):
            cdef CUnionType* union_type = <CUnionType*> self.sp_type.get()
            return union_type.mode()

    def __getitem__(self, i):
        """
        Return the data type of the child at position i
        """
        return self.child_types[i]

    def __getstate__(self):
        # State is (list of child fields, union mode)
        child_fields = [pyarrow_wrap_field(self.type.child(pos))
                        for pos in range(self.num_children)]
        return child_fields, self.mode

    def __setstate__(self, state):
        # state unpacks into the (children_fields, mode) arguments of union()
        cdef DataType rebuilt = union(*state)
        self.init(rebuilt.sp_type)
cdef class TimestampType(DataType):
    """
    Wrapper around a C++ arrow::TimestampType
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        self.ts_type = <const CTimestampType*> type.get()

    def __getstate__(self):
        # The inherited state is str(self), which __setstate__ resolves
        # through type_for_alias; the alias table only holds the time zone
        # naive timestamp strings, so keep unit and tz explicitly instead
        return self.unit, self.tz

    def __setstate__(self, state):
        # Rebuild an equivalent timestamp type from the pickled (unit, tz)
        cdef DataType reconstituted = timestamp(state[0], tz=state[1])
        self.init(reconstituted.sp_type)

    property unit:

        def __get__(self):
            return timeunit_to_string(self.ts_type.unit())

    property tz:

        def __get__(self):
            # An empty time zone string is reported as None
            if self.ts_type.timezone().size() > 0:
                return frombytes(self.ts_type.timezone())
            else:
                return None

    def to_pandas_dtype(self):
        """
        Return the NumPy dtype that would be used for storing this
        """
        if self.tz is None:
            return _pandas_type_map[_Type_TIMESTAMP]
        else:
            # Return DatetimeTZ
            return pdcompat.make_datetimetz(self.tz)
cdef class Time32Type(DataType):
    """
    Wrapper around a C++ arrow::Time32Type
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        self.time_type = <const CTime32Type*> type.get()

    property unit:

        def __get__(self):
            # Unit is exposed as its string form
            return timeunit_to_string(self.time_type.unit())
cdef class Time64Type(DataType):
    """
    Wrapper around a C++ arrow::Time64Type
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        self.time_type = <const CTime64Type*> type.get()

    property unit:

        def __get__(self):
            # Unit is exposed as its string form
            return timeunit_to_string(self.time_type.unit())
cdef class FixedSizeBinaryType(DataType):
    """
    Wrapper around a C++ arrow::FixedSizeBinaryType
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        DataType.init(self, type)
        self.fixed_size_binary_type = (
            <const CFixedSizeBinaryType*> type.get())

    def __getstate__(self):
        # The byte width alone is the pickled state
        return self.byte_width

    def __setstate__(self, state):
        # binary(width) rebuilds a type from the pickled byte width
        cdef DataType rebuilt = binary(state)
        self.init(rebuilt.sp_type)

    property byte_width:

        def __get__(self):
            return self.fixed_size_binary_type.byte_width()
cdef class Decimal128Type(FixedSizeBinaryType):
    """
    Wrapper around a C++ arrow::Decimal128Type
    """

    cdef void init(self, const shared_ptr[CDataType]& type):
        FixedSizeBinaryType.init(self, type)
        self.decimal128_type = <const CDecimal128Type*> type.get()

    def __getstate__(self):
        # State is the (precision, scale) pair
        return self.precision, self.scale

    def __setstate__(self, state):
        # state unpacks into the (precision, scale) arguments of decimal128()
        cdef DataType rebuilt = decimal128(*state)
        self.init(rebuilt.sp_type)

    property precision:

        def __get__(self):
            return self.decimal128_type.precision()

    property scale:

        def __get__(self):
            return self.decimal128_type.scale()
cdef class Field:
    """
    Represents a named field, with a data type, nullability, and optional
    metadata

    Notes
    -----
    Do not use this class's constructor directly; use pyarrow.field
    """

    def __cinit__(self):
        pass

    cdef void init(self, const shared_ptr[CField]& field):
        # Keep the shared_ptr, the raw pointer and the wrapped data type
        self.sp_field = field
        self.field = field.get()
        self.type = pyarrow_wrap_data_type(field.get().type())

    def equals(self, Field other):
        """
        Test if this field is equal to the other

        Parameters
        ----------
        other : pyarrow.Field

        Returns
        -------
        is_equal : boolean
            False when `other` is None
        """
        # A typed argument still admits None, and reading a C attribute of
        # None is not checked, so bail out before dereferencing
        if other is None:
            return False
        return self.field.Equals(deref(other.field))

    def __richcmp__(Field self, Field other, int op):
        # Only == and != are supported; equals() copes with other being None
        if op == cp.Py_EQ:
            return self.equals(other)
        elif op == cp.Py_NE:
            return not self.equals(other)
        else:
            raise TypeError('Invalid comparison')

    def __reduce__(self):
        # Pickle as (class, no constructor args, state)
        return Field, (), self.__getstate__()

    def __getstate__(self):
        return (self.name, self.type, self.metadata)

    def __setstate__(self, state):
        # state is the (name, type, metadata) triple from __getstate__
        cdef Field reconstituted = field(state[0], state[1], metadata=state[2])
        self.init(reconstituted.sp_field)

    def __str__(self):
        self._check_null()
        return 'pyarrow.Field<{0}>'.format(frombytes(self.field.ToString()))

    def __repr__(self):
        return self.__str__()

    property nullable:

        def __get__(self):
            self._check_null()
            return self.field.nullable()

    property name:

        def __get__(self):
            self._check_null()
            return frombytes(self.field.name())

    property metadata:

        def __get__(self):
            self._check_null()
            cdef shared_ptr[const CKeyValueMetadata] metadata = (
                self.field.metadata())
            return box_metadata(metadata.get())

    def _check_null(self):
        """
        Raise ReferenceError if no C++ field is attached to this object
        """
        if self.field == NULL:
            raise ReferenceError(
                'Field not initialized (references NULL pointer)')

    def add_metadata(self, dict metadata):
        """
        Add metadata as dict of string keys and values to Field

        Parameters
        ----------
        metadata : dict
            Keys and values must be string-like / coercible to bytes

        Returns
        -------
        field : pyarrow.Field
        """
        cdef shared_ptr[CKeyValueMetadata] c_meta
        convert_metadata(metadata, &c_meta)

        cdef shared_ptr[CField] new_field
        with nogil:
            new_field = self.field.AddMetadata(c_meta)

        return pyarrow_wrap_field(new_field)

    def remove_metadata(self):
        """
        Create new field without metadata, if any

        Returns
        -------
        field : pyarrow.Field
        """
        cdef shared_ptr[CField] new_field
        with nogil:
            new_field = self.field.RemoveMetadata()
        return pyarrow_wrap_field(new_field)
cdef class Schema:
    """
    Wrapper around a C++ arrow::Schema: an ordered collection of fields
    with optional key/value metadata
    """

    def __cinit__(self):
        pass

    def __len__(self):
        return self.schema.num_fields()

    def __getitem__(self, int i):
        """
        Return the field at position i; negative positions count from the end

        Raises
        ------
        IndexError
            If i is outside [-len(self), len(self))
        """
        cdef:
            Field result = Field()
            int num_fields = self.schema.num_fields()
            int index

        if not -num_fields <= i < num_fields:
            raise IndexError(
                'Schema field index {:d} is out of range'.format(i)
            )

        index = i if i >= 0 else num_fields + i

        # Field.init also wraps and stores the field's data type
        result.init(self.schema.field(index))
        return result

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def _check_null(self):
        """
        Raise ReferenceError if no C++ schema is attached to this object
        """
        if self.schema == NULL:
            raise ReferenceError(
                'Schema not initialized (references NULL pointer)')

    cdef void init(self, const vector[shared_ptr[CField]]& fields):
        # Build a new C++ schema and hand ownership to the shared_ptr
        self.schema = new CSchema(fields)
        self.sp_schema.reset(self.schema)

    cdef void init_schema(self, const shared_ptr[CSchema]& schema):
        # Share an already existing C++ schema
        self.schema = schema.get()
        self.sp_schema = schema

    def __reduce__(self):
        # Pickle as (class, no constructor args, state)
        return Schema, (), self.__getstate__()

    def __getstate__(self):
        return (list(self), self.metadata)

    def __setstate__(self, state):
        # state is the (fields, metadata) pair from __getstate__
        cdef Schema reconstituted = schema(state[0], metadata=state[1])
        self.init_schema(reconstituted.sp_schema)

    property names:

        def __get__(self):
            cdef int i
            result = []
            for i in range(self.schema.num_fields()):
                name = frombytes(self.schema.field(i).get().name())
                result.append(name)
            return result

    property metadata:

        def __get__(self):
            self._check_null()
            cdef shared_ptr[const CKeyValueMetadata] metadata = (
                self.schema.metadata())
            return box_metadata(metadata.get())

    def __richcmp__(self, other, int op):
        # Only == and != are supported
        if op == cp.Py_EQ:
            return self.equals(other)
        elif op == cp.Py_NE:
            return not self.equals(other)
        else:
            raise TypeError('Invalid comparison')

    def equals(self, other):
        """
        Test if this schema is equal to the other

        Parameters
        ----------
        other : pyarrow.Schema

        Returns
        -------
        is_equal : boolean
            False when `other` is None

        Raises
        ------
        TypeError
            If `other` is neither None nor a Schema
        """
        cdef Schema _other

        # None passes the typed assignment below, and reading a C attribute
        # of None is not checked, so handle it before dereferencing
        if other is None:
            return False

        _other = other
        return self.sp_schema.get().Equals(deref(_other.schema))

    def field_by_name(self, name):
        """
        Access a field by its name rather than the column index.

        Parameters
        ----------
        name: str

        Returns
        -------
        field: pyarrow.Field
        """
        return pyarrow_wrap_field(self.schema.GetFieldByName(tobytes(name)))

    def get_field_index(self, name):
        """
        Return the index reported by the C++ schema for the field `name`

        Parameters
        ----------
        name: str

        Returns
        -------
        index: int
        """
        return self.schema.GetFieldIndex(tobytes(name))

    def append(self, Field field):
        """
        Append a field at the end of the schema.

        Parameters
        ----------
        field: Field

        Returns
        -------
        schema: Schema
        """
        return self.insert(self.schema.num_fields(), field)

    def insert(self, int i, Field field):
        """
        Add a field at position i to the schema.

        Parameters
        ----------
        i: int
        field: Field

        Returns
        -------
        schema: Schema
        """
        cdef:
            shared_ptr[CSchema] new_schema
            shared_ptr[CField] c_field

        c_field = field.sp_field

        with nogil:
            check_status(self.schema.AddField(i, c_field, &new_schema))

        return pyarrow_wrap_schema(new_schema)

    def remove(self, int i):
        """
        Remove the field at index i from the schema.

        Parameters
        ----------
        i: int

        Returns
        -------
        schema: Schema
        """
        cdef shared_ptr[CSchema] new_schema

        with nogil:
            check_status(self.schema.RemoveField(i, &new_schema))

        return pyarrow_wrap_schema(new_schema)

    def add_metadata(self, dict metadata):
        """
        Add metadata as dict of string keys and values to Schema

        Parameters
        ----------
        metadata : dict
            Keys and values must be string-like / coercible to bytes

        Returns
        -------
        schema : pyarrow.Schema
        """
        cdef shared_ptr[CKeyValueMetadata] c_meta
        convert_metadata(metadata, &c_meta)

        cdef shared_ptr[CSchema] new_schema
        with nogil:
            new_schema = self.schema.AddMetadata(c_meta)

        return pyarrow_wrap_schema(new_schema)

    def serialize(self, memory_pool=None):
        """
        Write Schema to Buffer as encapsulated IPC message

        Parameters
        ----------
        memory_pool : MemoryPool, default None
            Uses default memory pool if not specified

        Returns
        -------
        serialized : Buffer
        """
        cdef:
            shared_ptr[CBuffer] buffer
            CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)

        with nogil:
            check_status(SerializeSchema(deref(self.schema),
                                         pool, &buffer))
        return pyarrow_wrap_buffer(buffer)

    def remove_metadata(self):
        """
        Create new schema without metadata, if any

        Returns
        -------
        schema : pyarrow.Schema
        """
        cdef shared_ptr[CSchema] new_schema
        with nogil:
            new_schema = self.schema.RemoveMetadata()
        return pyarrow_wrap_schema(new_schema)

    def __str__(self):
        self._check_null()

        cdef:
            PrettyPrintOptions options
            c_string result

        options.indent = 0

        with nogil:
            check_status(PrettyPrint(deref(self.schema), options, &result))

        printed = frombytes(result)

        # Append the metadata as a separate, pretty-printed section
        if self.metadata is not None:
            import pprint
            metadata_formatted = pprint.pformat(self.metadata)
            printed += '\nmetadata\n--------\n' + metadata_formatted

        return printed

    def __repr__(self):
        return self.__str__()
cdef dict box_metadata(const CKeyValueMetadata* metadata):
    # Convert C++ key/value metadata into a Python dict; a null pointer
    # yields None
    cdef unordered_map[c_string, c_string] pairs
    if metadata == nullptr:
        return None
    metadata.ToUnorderedMap(&pairs)
    return pairs
# Wrapper instances already created by primitive_type, keyed by type id
cdef dict _type_cache = {}


cdef DataType primitive_type(Type type):
    # Return the wrapper for the given type id, creating and caching it on
    # first use
    cdef DataType out

    cached = _type_cache.get(type)
    if cached is not None:
        return cached

    out = DataType()
    out.init(GetPrimitiveType(type))
    _type_cache[type] = out
    return out
# -----------------------------------------------------------
# Type factory functions
cdef int convert_metadata(dict metadata,
                          shared_ptr[CKeyValueMetadata]* out) except -1:
    # Copy every key/value pair of a Python dict into a fresh C++
    # KeyValueMetadata and store it in `out`; always returns 0
    cdef:
        shared_ptr[CKeyValueMetadata] c_meta = (
            make_shared[CKeyValueMetadata]())
        c_string c_key, c_value

    for py_key, py_value in metadata.items():
        # Both sides go through tobytes before being appended
        c_key = tobytes(py_key)
        c_value = tobytes(py_value)
        c_meta.get().Append(c_key, c_value)

    out[0] = c_meta
    return 0
def field(name, type, bint nullable=True, dict metadata=None):
    """
    Build a new pyarrow.Field

    Parameters
    ----------
    name : string or bytes
    type : pyarrow.DataType
        A string is also accepted and resolved through type_for_alias
    nullable : boolean, default True
    metadata : dict, default None
        Keys and values must be coercible to bytes

    Returns
    -------
    field : pyarrow.Field
    """
    cdef:
        shared_ptr[CKeyValueMetadata] c_meta
        Field out = Field()
        DataType data_type

    # Without metadata, c_meta stays a default-constructed shared_ptr
    if metadata is not None:
        convert_metadata(metadata, &c_meta)

    data_type = _as_type(type)

    out.sp_field.reset(new CField(tobytes(name), data_type.sp_type,
                                  nullable == 1, c_meta))
    out.field = out.sp_field.get()
    out.type = data_type
    return out
cdef _as_type(type):
    # Accept either a DataType (returned unchanged) or a string alias;
    # anything else is a TypeError
    if isinstance(type, DataType):
        return type
    if isinstance(type, six.string_types):
        return type_for_alias(type)
    raise TypeError(type)
# Type ids grouped under the name PRIMITIVE_TYPES
cdef set PRIMITIVE_TYPES = {
    _Type_NA,
    _Type_BOOL,
    _Type_UINT8,
    _Type_INT8,
    _Type_UINT16,
    _Type_INT16,
    _Type_UINT32,
    _Type_INT32,
    _Type_UINT64,
    _Type_INT64,
    _Type_TIMESTAMP,
    _Type_DATE32,
    _Type_TIME32,
    _Type_TIME64,
    _Type_DATE64,
    _Type_HALF_FLOAT,
    _Type_FLOAT,
    _Type_DOUBLE,
}
def null():
    """
    Return the null data type
    """
    return primitive_type(_Type_NA)
def bool_():
    """
    Return the boolean data type
    """
    return primitive_type(_Type_BOOL)
def uint8():
    """
    Create instance of unsigned uint8 type
    """
    return primitive_type(_Type_UINT8)
def int8():
    """
    Return the signed 8-bit integer data type
    """
    return primitive_type(_Type_INT8)
def uint16():
    """
    Return the unsigned 16-bit integer data type
    """
    return primitive_type(_Type_UINT16)
def int16():
    """
    Return the signed 16-bit integer data type
    """
    return primitive_type(_Type_INT16)
def uint32():
    """
    Return the unsigned 32-bit integer data type
    """
    return primitive_type(_Type_UINT32)
def int32():
    """
    Return the signed 32-bit integer data type
    """
    return primitive_type(_Type_INT32)
def uint64():
    """
    Return the unsigned 64-bit integer data type
    """
    return primitive_type(_Type_UINT64)
def int64():
    """
    Return the signed 64-bit integer data type
    """
    return primitive_type(_Type_INT64)
# Time zone naive timestamp types already created, keyed by time unit
cdef dict _timestamp_type_cache = {}
# time32 / time64 types already created, keyed by time unit
cdef dict _time_type_cache = {}
cdef timeunit_to_string(TimeUnit unit):
    # Translate a TimeUnit value into its short string form; any other
    # value yields None
    if unit == TimeUnit_SECOND:
        return 's'
    if unit == TimeUnit_MILLI:
        return 'ms'
    if unit == TimeUnit_MICRO:
        return 'us'
    if unit == TimeUnit_NANO:
        return 'ns'
    return None
def timestamp(unit, tz=None):
    """
    Create instance of timestamp type with resolution and optional time zone

    Parameters
    ----------
    unit : string
        one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns'
        [nanosecond]
    tz : string, default None
        Time zone name. None indicates time zone naive. A non-string value
        is replaced by its `zone` attribute

    Examples
    --------
    ::

        t1 = pa.timestamp('us')
        t2 = pa.timestamp('s', tz='America/New_York')

    Returns
    -------
    timestamp_type : TimestampType

    Raises
    ------
    ValueError
        If `unit` is not one of the supported strings
    """
    cdef:
        TimeUnit unit_code
        c_string c_timezone
        TimestampType out

    if unit == "s":
        unit_code = TimeUnit_SECOND
    elif unit == 'ms':
        unit_code = TimeUnit_MILLI
    elif unit == 'us':
        unit_code = TimeUnit_MICRO
    elif unit == 'ns':
        unit_code = TimeUnit_NANO
    else:
        raise ValueError('Invalid TimeUnit string')

    if tz is None:
        # Time zone naive types are shared per unit: look in the cache
        # before building a new wrapper
        if unit_code in _timestamp_type_cache:
            return _timestamp_type_cache[unit_code]
        out = TimestampType()
        out.init(ctimestamp(unit_code))
        _timestamp_type_cache[unit_code] = out
        return out

    # Time zone aware types are not cached
    if not isinstance(tz, six.string_types):
        tz = tz.zone

    c_timezone = tobytes(tz)
    out = TimestampType()
    out.init(ctimestamp(unit_code, c_timezone))
    return out
def time32(unit):
    """
    Create instance of 32-bit time (time of day) type with unit resolution

    Parameters
    ----------
    unit : string
        one of 's' [second], or 'ms' [millisecond]

    Examples
    --------
    ::

        t1 = pa.time32('s')
        t2 = pa.time32('ms')

    Raises
    ------
    ValueError
        If `unit` is neither 's' nor 'ms'
    """
    cdef:
        TimeUnit unit_code
        Time32Type out

    if unit == "s":
        unit_code = TimeUnit_SECOND
    elif unit == 'ms':
        unit_code = TimeUnit_MILLI
    else:
        raise ValueError('Invalid TimeUnit for time32: {}'.format(unit))

    # One shared instance per unit
    if unit_code in _time_type_cache:
        return _time_type_cache[unit_code]

    out = Time32Type()
    out.init(ctime32(unit_code))
    _time_type_cache[unit_code] = out
    return out
def time64(unit):
    """
    Create instance of 64-bit time (time of day) type with unit resolution

    Parameters
    ----------
    unit : string
        one of 'us' [microsecond], or 'ns' [nanosecond]

    Examples
    --------
    ::

        t1 = pa.time64('us')
        t2 = pa.time64('ns')

    Raises
    ------
    ValueError
        If `unit` is neither 'us' nor 'ns'
    """
    cdef:
        TimeUnit unit_code
        Time64Type out

    if unit == "us":
        unit_code = TimeUnit_MICRO
    elif unit == 'ns':
        unit_code = TimeUnit_NANO
    else:
        raise ValueError('Invalid TimeUnit for time64: {}'.format(unit))

    # One shared instance per unit
    if unit_code in _time_type_cache:
        return _time_type_cache[unit_code]

    out = Time64Type()
    out.init(ctime64(unit_code))
    _time_type_cache[unit_code] = out
    return out
def date32():
    """
    Return the 32-bit date type (days since UNIX epoch 1970-01-01)
    """
    return primitive_type(_Type_DATE32)
def date64():
    """
    Return the 64-bit date type (milliseconds since UNIX epoch 1970-01-01)
    """
    return primitive_type(_Type_DATE64)
def float16():
    """
    Return the half-precision floating point type
    """
    return primitive_type(_Type_HALF_FLOAT)
def float32():
    """
    Return the single-precision floating point type
    """
    return primitive_type(_Type_FLOAT)
def float64():
    """
    Return the double-precision floating point type
    """
    return primitive_type(_Type_DOUBLE)
cpdef DataType decimal128(int precision, int scale=0):
    """
    Build a 128-bit wide decimal type from a precision and a scale

    Parameters
    ----------
    precision : int
    scale : int, default 0

    Returns
    -------
    decimal_type : Decimal128Type
    """
    cdef shared_ptr[CDataType] c_type
    c_type.reset(new CDecimal128Type(precision, scale))
    return pyarrow_wrap_data_type(c_type)
def string():
    """
    Return the UTF8 variable-length string type
    """
    return primitive_type(_Type_STRING)
def binary(int length=-1):
    """
    Return a variable-length or fixed-size binary type

    Parameters
    ----------
    length : int, optional, default -1
        With length == -1 the variable length binary type is returned. Any
        other value is used as the width of a fixed size binary type.
    """
    cdef shared_ptr[CDataType] c_type

    if length != -1:
        c_type.reset(new CFixedSizeBinaryType(length))
        return pyarrow_wrap_data_type(c_type)

    return primitive_type(_Type_BINARY)
cpdef ListType list_(value_type):
    """
    Create ListType instance from child data type or field

    Parameters
    ----------
    value_type : DataType or Field

    Returns
    -------
    list_type : DataType

    Raises
    ------
    ValueError
        If `value_type` is neither a DataType nor a Field
    """
    cdef:
        shared_ptr[CDataType] list_type
        ListType out

    # CListType can be constructed from either a value type or a value field
    if isinstance(value_type, DataType):
        list_type.reset(new CListType((<DataType> value_type).sp_type))
    elif isinstance(value_type, Field):
        list_type.reset(new CListType((<Field> value_type).sp_field))
    else:
        raise ValueError('List requires DataType or Field')

    out = ListType()
    out.init(list_type)
    return out
cpdef DictionaryType dictionary(DataType index_type, Array dict_values,
                                bint ordered=False):
    """
    Dictionary (categorical, or simply encoded) type

    Parameters
    ----------
    index_type : DataType
    dict_values : Array
        The dictionary values
    ordered : boolean, default False

    Returns
    -------
    type : DictionaryType
    """
    cdef DictionaryType out = DictionaryType()
    cdef shared_ptr[CDataType] dict_type
    dict_type.reset(new CDictionaryType(index_type.sp_type,
                                        dict_values.sp_array,
                                        ordered == 1))
    out.init(dict_type)
    return out
def struct(fields):
    """
    Build a struct type out of a sequence of fields

    Parameters
    ----------
    fields : sequence of Field values

    Examples
    --------
    ::

        import pyarrow as pa
        fields = [
            pa.field('f1', pa.int32()),
            pa.field('f2', pa.string())
        ]
        struct_type = pa.struct(fields)

    Returns
    -------
    type : DataType
    """
    cdef:
        Field child
        vector[shared_ptr[CField]] c_fields
        shared_ptr[CDataType] c_type

    # Collect the C++ field of every child, in order
    for child in fields:
        c_fields.push_back(child.sp_field)

    c_type.reset(new CStructType(c_fields))
    return pyarrow_wrap_data_type(c_type)
def union(children_fields, mode):
    """
    Build a union type out of its children fields.

    Parameters
    ----------
    children_fields : sequence of Field values
        The type code of each child is its position in this sequence
    mode : union mode
        UnionMode_SPARSE gives a sparse union; any other value gives a
        dense union

    Returns
    -------
    type : DataType
    """
    cdef:
        Field child
        vector[shared_ptr[CField]] c_fields
        vector[uint8_t] type_codes
        shared_ptr[CDataType] c_type
        int position

    for position, child in enumerate(children_fields):
        type_codes.push_back(position)
        c_fields.push_back(child.sp_field)

    if mode == UnionMode_SPARSE:
        c_type.reset(new CUnionType(c_fields, type_codes,
                                    _UnionMode_SPARSE))
    else:
        c_type.reset(new CUnionType(c_fields, type_codes,
                                    _UnionMode_DENSE))

    return pyarrow_wrap_data_type(c_type)
# String aliases accepted by type_for_alias. Values are either factory
# functions (called on lookup) or ready-made DataType instances.
#
# DataType.__getstate__ pickles str(self) and __setstate__ resolves it
# through this table, so the names produced by the C++ ToString() for the
# primitive types ('bool', 'halffloat', 'float', 'double', ...) must be
# present here as well.
cdef dict _type_aliases = {
    'null': null,
    'bool': bool_,
    'boolean': bool_,
    'i1': int8,
    'int8': int8,
    'i2': int16,
    'int16': int16,
    'i4': int32,
    'int32': int32,
    'i8': int64,
    'int64': int64,
    'u1': uint8,
    'uint8': uint8,
    'u2': uint16,
    'uint16': uint16,
    'u4': uint32,
    'uint32': uint32,
    'u8': uint64,
    'uint64': uint64,
    'f2': float16,
    'halffloat': float16,
    'float16': float16,
    'f4': float32,
    'float': float32,
    'float32': float32,
    'f8': float64,
    'double': float64,
    'float64': float64,
    'string': string,
    'str': string,
    'utf8': string,
    'binary': binary,
    'date32': date32,
    'date64': date64,
    'date32[day]': date32,
    'date64[ms]': date64,
    'time32[s]': time32('s'),
    'time32[ms]': time32('ms'),
    'time64[us]': time64('us'),
    'time64[ns]': time64('ns'),
    'timestamp[s]': timestamp('s'),
    'timestamp[ms]': timestamp('ms'),
    'timestamp[us]': timestamp('us'),
    'timestamp[ns]': timestamp('ns'),
}
def type_for_alias(name):
    """
    Look up the DataType registered under a string alias

    The lookup is case-insensitive.

    Returns
    -------
    type : DataType

    Raises
    ------
    ValueError
        If no alias with that name exists
    """
    name = name.lower()
    if name not in _type_aliases:
        raise ValueError('No type alias for {0}'.format(name))

    alias = _type_aliases[name]
    # Entries are either DataType instances or factories producing one
    return alias if isinstance(alias, DataType) else alias()
def schema(fields, dict metadata=None):
    """
    Construct pyarrow.Schema from collection of fields

    Parameters
    ----------
    fields : list or iterable
        The pyarrow.Field instances making up the schema, in order
    metadata : dict, default None
        Keys and values must be coercible to bytes

    Returns
    -------
    schema : pyarrow.Schema
    """
    cdef:
        shared_ptr[CKeyValueMetadata] c_meta
        shared_ptr[CSchema] c_schema
        Schema result
        Field field
        vector[shared_ptr[CField]] c_fields

    for field in fields:
        c_fields.push_back(field.sp_field)

    # Without metadata, c_meta stays a default-constructed shared_ptr
    if metadata is not None:
        convert_metadata(metadata, &c_meta)

    c_schema.reset(new CSchema(c_fields, c_meta))
    result = Schema()
    result.init_schema(c_schema)
    return result
def from_numpy_dtype(object dtype):
    """
    Convert NumPy dtype to pyarrow.DataType

    Parameters
    ----------
    dtype : object
        Anything accepted by the numpy.dtype constructor

    Returns
    -------
    type : DataType
    """
    cdef shared_ptr[CDataType] c_type
    dtype = np.dtype(dtype)
    # NumPyDtypeToArrow receives a Python object, so the call is made with
    # the GIL held rather than inside a nogil section
    check_status(NumPyDtypeToArrow(dtype, &c_type))

    return pyarrow_wrap_data_type(c_type)