python/kudu/schema.pyx - kudu - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # distutils: language = c++
 # cython: embedsignature = True

 include "config.pxi"

 from cython.operator cimport dereference as deref

 from kudu.compat import tobytes, frombytes
 from kudu.schema cimport *
 from kudu.errors cimport check_status
 from kudu.client cimport PartialRow
 from kudu.util import get_decimal_scale, to_unixtime_micros, to_unscaled_decimal, \
     unix_epoch_days_to_date, date_to_unix_epoch_days
 from errors import KuduException

 import six

 from . import util

 BOOL = KUDU_BOOL
 STRING = KUDU_STRING
 VARCHAR = KUDU_VARCHAR

 INT8 = KUDU_INT8
 INT16 = KUDU_INT16
 INT32 = KUDU_INT32
 INT64 = KUDU_INT64

 FLOAT = KUDU_FLOAT
 DOUBLE = KUDU_DOUBLE

 UNIXTIME_MICROS = KUDU_UNIXTIME_MICROS
 DATE = KUDU_DATE
 BINARY = KUDU_BINARY

 DECIMAL = KUDU_DECIMAL

 cdef dict _reverse_dict(d):
     return dict((v, k) for k, v in d.items())


 # CompressionType enums
 COMPRESSION_DEFAULT = CompressionType_DEFAULT
 COMPRESSION_NONE = CompressionType_NONE
 COMPRESSION_SNAPPY = CompressionType_SNAPPY
 COMPRESSION_LZ4 = CompressionType_LZ4
 COMPRESSION_ZLIB = CompressionType_ZLIB

 cdef dict _compression_types = {
     'default': COMPRESSION_DEFAULT,
     'none': COMPRESSION_NONE,
     'snappy': COMPRESSION_SNAPPY,
     'lz4': COMPRESSION_LZ4,
     'zlib': COMPRESSION_ZLIB,
 }

 cdef dict _compression_type_to_name = _reverse_dict(_compression_types)


 # EncodingType enums
 ENCODING_AUTO = EncodingType_AUTO
 ENCODING_PLAIN = EncodingType_PLAIN
 ENCODING_PREFIX = EncodingType_PREFIX
 ENCODING_BIT_SHUFFLE = EncodingType_BIT_SHUFFLE
 ENCODING_RLE = EncodingType_RLE
 ENCODING_DICT = EncodingType_DICT

 cdef dict _encoding_types = {
     'auto': ENCODING_AUTO,
     'plain': ENCODING_PLAIN,
     'prefix': ENCODING_PREFIX,
     'bitshuffle': ENCODING_BIT_SHUFFLE,
     'rle': ENCODING_RLE,
     'dict': ENCODING_DICT,
 }

 cdef dict _encoding_type_to_name = _reverse_dict(_encoding_types)


 cdef class KuduType(object):

     """
     Usability wrapper for Kudu data type enum
     """

     def __cinit__(self, DataType type):
         self.type = type

     property name:

         def __get__(self):
             return _type_names[self.type]

     def __repr__(self):
         return 'KuduType({0})'.format(self.name)

     def new_value(self, value):
         return KuduValue(self, value)


 int8 = KuduType(KUDU_INT8)
 int16 = KuduType(KUDU_INT16)
 int32 = KuduType(KUDU_INT32)
 int64 = KuduType(KUDU_INT64)
 string_ = KuduType(KUDU_STRING)
 bool_ = KuduType(KUDU_BOOL)
 float_ = KuduType(KUDU_FLOAT)
 double_ = KuduType(KUDU_DOUBLE)
 binary = KuduType(KUDU_BINARY)
 unixtime_micros = KuduType(KUDU_UNIXTIME_MICROS)
 decimal = KuduType(KUDU_DECIMAL)
 varchar = KuduType(KUDU_VARCHAR)
 date = KuduType(KUDU_DATE)


 cdef dict _type_names = {
     INT8: 'int8',
     INT16: 'int16',
     INT32: 'int32',
     INT64: 'int64',
     STRING: 'string',
     BOOL: 'bool',
     FLOAT: 'float',
     DOUBLE: 'double',
     BINARY: 'binary',
     UNIXTIME_MICROS: 'unixtime_micros',
     DECIMAL: 'decimal',
     VARCHAR: 'varchar',
     DATE: 'date'
 }


 cdef dict _type_name_to_number = _reverse_dict(_type_names)

 cdef dict _type_to_obj = {
     INT8: int8,
     INT16: int16,
     INT32: int32,
     INT64: int64,
     STRING: string_,
     BOOL: bool_,
     FLOAT: float_,
     DOUBLE: double_,
     BINARY: binary,
     UNIXTIME_MICROS: unixtime_micros,
     DECIMAL: decimal,
     VARCHAR: varchar,
     DATE: date
 }


 cdef KuduType to_data_type(object obj):
     if isinstance(obj, KuduType):
         return obj
     elif isinstance(obj, six.string_types):
         return _type_to_obj[_type_name_to_number[obj]]
     elif obj in _type_to_obj:
         return _type_to_obj[obj]
     else:
         raise ValueError('Invalid type: {0}'.format(obj))

 cdef cppclass KuduColumnTypeAttributes:
         KuduColumnTypeAttributes()
         KuduColumnTypeAttributes(const KuduColumnTypeAttributes& other)
         KuduColumnTypeAttributes(int8_t precision, int8_t scale)
         KuduColumnTypeAttributes(uint16_t length)

         int8_t precision()
         int8_t scale()
         uint16_t length()

         c_bool Equals(KuduColumnTypeAttributes& other)
         void CopyFrom(KuduColumnTypeAttributes& other)

 cdef class ColumnTypeAttributes:
     """
     Wraps a Kudu client ColumnTypeAttributes object.
     """
     def __cinit__(self):
         self.type_attributes = NULL

     def __dealloc__(self):
         if self.type_attributes is not NULL:
             del self.type_attributes

     property precision:
         def __get__(self):
             return self.type_attributes.precision()

     property scale:
         def __get__(self):
             return self.type_attributes.scale()

     property length:
         def __get__(self):
             return self.type_attributes.length()

     def __repr__(self):
         return ('ColumnTypeAttributes(precision=%s, scale=%s, length=%s)'
                 % (self.type_attributes.precision(),
                    self.type_attributes.scale(),
                    self.type_attributes.length()))

 cdef class ColumnSchema:
     """
     Wraps a Kudu client ColumnSchema object. Use schema.at(i) or schema[i] to
     construct one.
     """

     def __cinit__(self):
         self.schema = NULL
         self._type = None

     def __dealloc__(self):
         if self.schema is not NULL:
             del self.schema

     property name:
         def __get__(self):
             return frombytes(self.schema.name())

     property type:
         def __get__(self):
             if self._type is None:
                 self._type = _type_to_obj[self.schema.type()]
             return self._type

     property nullable:
         def __get__(self):
             return self.schema.is_nullable()

     property mutable:
         def __get__(self):
             return not self.schema.is_immutable()

     property type_attributes:
         def __get__(self):
             cdef ColumnTypeAttributes result = ColumnTypeAttributes()
             result.type_attributes = new KuduColumnTypeAttributes(self.schema.type_attributes())
             return result

     property comment:
         def __get__(self):
             return frombytes(self.schema.comment())

     def equals(self, other):
         if not isinstance(other, ColumnSchema):
             return False
         return self.schema.Equals(deref((<ColumnSchema> other).schema))

     def __repr__(self):
         return ('ColumnSchema(name=%s, type=%s, nullable=%s)'
                 % (self.name, self.type.name,
                    self.nullable))


 #----------------------------------------------------------------------

 cdef class ColumnSpec:

     """
     Helper class for configuring a column's settings while using the
     SchemaBuilder.
     """

     def type(self, type_):
         self._type = to_data_type(type_)
         self.spec.Type(self._type.type)
         return self

     def default(self, value):
         """
         Set a default value for the column
         """
         cdef:
             KuduValue kval

         if not self._type:
             raise ValueError("You must set the Column type before setting " +
                              "the default value.")
         else:
             kval = self._type.new_value(value)
             self.spec.Default(kval._value)
         return self

     def remove_default(self):
         """
         Remove a default value set.
         """
         self.spec.RemoveDefault()
         return self

     def compression(self, compression):
         """
         Set the compression type

         Parameters
         ----------
         compression : string or int
           One of {'default', 'none', 'snappy', 'lz4', 'zlib'}
           Or see kudu.COMPRESSION_* constants

         Returns
         -------
         self
         """
         cdef CompressionType type
         if isinstance(compression, int):
             # todo: validation
             type = <CompressionType> compression
         else:
             if compression is None:
                 type = CompressionType_NONE
             else:
                 try:
                     type = _compression_types[compression.lower()]
                 except KeyError:
                     raise ValueError('Invalid compression type: {0}'
                                      .format(compression))

         self.spec.Compression(type)
         return self

     def encoding(self, encoding):
         """
         Set the encoding type

         Parameters
         ----------
         encoding : string or int
           One of {'auto', 'plain', 'prefix', 'bitshuffle', 'rle', 'dict'}
           Or see kudu.ENCODING_* constants

         Returns
         -------
         self
         """
         cdef EncodingType type
         if isinstance(encoding, six.string_types):
             try:
                 type = _encoding_types[encoding.lower()]
             except KeyError:
                 raise ValueError('Invalid encoding type: {0}'
                                  .format(encoding))
         else:
             # todo: validation
             type = <EncodingType> encoding

         self.spec.Encoding(type)
         return self

     def precision(self, precision):
         """
         Set the precision for the column.

         Clients must specify a precision for decimal columns. Precision is the total
         number of digits that can be represented by the column, regardless of the
         location of the decimal point. For example, representing integer values up to 9999,
         and fractional values up to 99.99, both require a precision of 4. You can also
         represent corresponding negative values, without any change in the precision.
         For example, the range -9999 to 9999 still only requires a precision of 4.

         The precision must be between 1 and 38.

         Returns
         -------
         self
         """
         self.spec.Precision(precision)
         return self

     def scale(self, scale):
         """
         Set the scale for the column.

         Clients can specify a scale for decimal columns. Scale represents the number
         of fractional digits. This value must be less than or equal to precision.
         A scale of 0 produces integral values, with no fractional part. If precision
         and scale are equal, all the digits come after the decimal point, making all
         the values between 0.9999 and -0.9999.

         The scale must be greater than 0 and less than the column's precision.
         If no scale is provided a default scale of 0 is used.

         Returns
         -------
         self
         """
         self.spec.Scale(scale)
         return self

     def length(self, length):
         """
         Set the length for the column.

         Clients can specify a length for varchar columns. Length is the maximum
         length in characters (UTF-8) of the string that the varchar can hold.

         The length must be between 1 and 65,535 (inclusive).

         Returns
         -------
         self
         """
         self.spec.Length(length)
         return self

     def primary_key(self):
         """
         Make this column a primary key. If you use this method, it will be the
         only primary key. Otherwise see set_primary_keys method on
         SchemaBuilder.

         Returns
         -------
         self
         """
         self.spec.PrimaryKey()
         return self

     def non_unique_primary_key(self):
         """
         Make this column a non-unique primary key.

         This may only be used to set non-composite non-unique primary keys. If a composite
         key is desired, use set_non_unique_primary_keys(). This may not be
         used in conjunction with set_non_unique_primary_keys().

         Notes
         -----
         Non-unique primary keys may not be changed after a table is created.

         By specifying non-unique primary key, an auto incrementing column is created
         automatically. They form together the effective primary key. The auto incrementing field
         is populated on the server side, it must not be specified during insertion. All subsequent
         operations like scans will contain the auto incrementing column by default. If one wants to
         omit the auto incrementing column, it can be accomplished through existing projection
         methods.

         A call to primary_key() or non_unique_primary_key() overrides any previous call
         to these two methods.

         Returns
         -------
         self
         """
         self.spec.NonUniquePrimaryKey()
         return self

     def nullable(self, bint is_nullable=True):
         """
         Set nullable (True) or not nullable (False)

         Parameters
         ----------
         is_nullable : boolean, default True

         Returns
         -------
         self
         """
         if is_nullable:
             self.spec.Nullable()
         else:
             self.spec.NotNull()
         return self

     def mutable(self, bint is_mutable=True):
         """
         Set mutable (True) or immutable (False)

         Parameters
         ----------
         is_mutable : boolean, default True

         Returns
         -------
         self
         """
         if is_mutable:
             self.spec.Mutable()
         else:
             self.spec.Immutable()
         return self

     def rename(self, new_name):
         """
         Change the column name.
         """
         self.spec.RenameTo(tobytes(new_name))
         return self

     def comment(self, comment):
         """
         The comment for the column.

         Parameters
         ----------
         comment : str

         Retruns
         -------
         self : ColumnSpec
         """
         self.spec.Comment(tobytes(comment))
         return self

     def block_size(self, block_size):
         """
         Set the target block size for the column.

         This is the number of bytes of user data packed per block on disk, and
         represents the unit of IO when reading the column. Larger values may
         improve scan performance, particularly on spinning media. Smaller
         values may improve random access performance, particularly for
         workloads that have high cache hit rates or operate on fast storage
         such as SSD.

         Parameters
         ----------
         block_size : int
           Block size (in bytes) to use.

         Returns
         -------
         self : ColumnSpec
         """
         self.spec.BlockSize(block_size)
         return self


 cdef class SchemaBuilder:

     def copy_column(self, colschema):
         """
         Add a new column to the schema by copying it from an existing one.
         Returns a ColumnSpec object for further configuration and use in a
         fluid programming style. This method allows the SchemaBuilder to be
         more easily used to build a new Schema from an existing one.

         Parameters
         ----------
         colschema : ColumnSchema

         Examples
         --------
         for col in scanner.get_projection_schema():
             builder.copy_column(col).compression('lz4')
         builder.set_primary_keys(['key'])

         Returns
         -------
         spec : ColumnSpec
         """
         return self.add_column(colschema.name,
                                colschema.type,
                                colschema.nullable)

     def add_column(self, name, type_=None, nullable=None, compression=None, encoding=None,
                    primary_key=False, non_unique_primary_key=False, block_size=None, default=None,
                    precision=None, scale=None, length=None, comment=None):
         """
         Add a new column to the schema. Returns a ColumnSpec object for further
         configuration and use in a fluid programming style.

         Parameters
         ----------
         name : string
         type_ : string or KuduType
           Data type e.g. 'int32' or kudu.int32
         nullable : boolean, default None
           New columns are nullable by default. Set boolean value for explicit
           nullable / not-nullable
         compression : string or int
           One of {'default', 'none', 'snappy', 'lz4', 'zlib'}
           Or see kudu.COMPRESSION_* constants
         encoding : string or int
           One of {'auto', 'plain', 'prefix', 'bitshuffle', 'rle', 'dict'}
           Or see kudu.ENCODING_* constants
         primary_key : boolean, default False
           Use this column as the table primary key
         non_unique_primary_key : boolean, default False
           Use this column as the table non-unique primary key
         block_size : int, optional
           Block size (in bytes) to use for the target column.
         default : obj
           Use this to set the column default value
         precision : int
           Use this precision for the decimal column
         scale : int
           Use this scale for the decimal column
         length : int
           Use this length for the varchar column
         comment : string, default ''
           Comment of the column schema

         Examples
         --------
         (builder.add_column('foo')
          .nullable(True)
          .compression('lz4'))

         Returns
         -------
         spec : ColumnSpec
         """
         cdef:
             ColumnSpec result = ColumnSpec()
             string c_name = tobytes(name)

         result.spec = self.builder.AddColumn(c_name)

         if type_ is not None:
             result.type(type_)

         if nullable is not None:
             result.nullable(nullable)

         if compression is not None:
             result.compression(compression)

         if encoding is not None:
             result.encoding(encoding)

         if precision is not None:
             result.precision(precision)

         if scale is not None:
             result.scale(scale)

         if length is not None:
             result.length(length)

         if comment is not None:
             result.comment(comment)

         if primary_key:
             result.primary_key()

         if non_unique_primary_key:
             result.non_unique_primary_key()

         if block_size:
             result.block_size(block_size)

         if default:
             result.default(default)

         return result

     def set_primary_keys(self, key_names):
         """
         Set indicated columns (by name) to be the primary keys of the table
         schema

         Parameters
         ----------
         key_names : list of Python strings

         Returns
         -------
         None
         """
         cdef:
             vector[string] key_col_names

         for name in key_names:
             key_col_names.push_back(tobytes(name))

         self.builder.SetPrimaryKey(key_col_names)

     def set_non_unique_primary_keys(self, key_names):
         """
         Set the non-unique primary key of the new Schema based on the given column names.

         This may be used to specify a compound non-unique primary key.

         Notes
         -----
         Non-unique primary keys may not be changed after a table is created.

         By specifying non-unique primary key, an auto incrementing column is created
         automatically. They form together the effective primary key. The auto incrementing field
         is populated on the server side, it must not be specified during insertion. All subsequent
         operations like scans will contain the auto incrementing column by default. If one wants to
         omit the auto incrementing column, it can be accomplished through existing projection
         methods.

         A call to primary_key() or non_unique_primary_key() overrides any previous call
         to these two methods.

         Parameters
         ----------
         key_names: list of Python strings

         Returns
         -------
         self

         """
         cdef:
             vector[string] key_col_names

         for name in key_names:
             key_col_names.push_back(tobytes(name))

         self.builder.SetNonUniquePrimaryKey(key_col_names)

     def build(self):
         """
         Creates an immutable Schema object after the user has finished adding
         and onfiguring columns

         Returns
         -------
         schema : Schema
         """
         cdef Schema result = Schema()
         cdef KuduSchema* schema = new KuduSchema()
         check_status(self.builder.Build(schema))

         result.schema = schema
         return result


 cdef class Schema:

     """
     Container for a Kudu table schema. Obtain from Table instances or create
     new ones using kudu.SchemaBuilder
     """

     def __cinit__(self):
         # Users should not call this directly
         self.schema = NULL
         self.own_schema = 1
         self._col_mapping.clear()
         self._mapping_initialized = 0

     def __dealloc__(self):
         if self.schema is not NULL and self.own_schema:
             del self.schema

     property names:

         def __get__(self):
             result = []
             for i in range(self.schema.num_columns()):
                 name = frombytes(self.schema.Column(i).name())
                 result.append(name)

             return result

     def __repr__(self):
         # Got to be careful with huge schemas, maybe some kind of summary repr
         # when more than 20-30 columns?
         buf = six.StringIO()

         col_names = self.names
         space = 2 + max(len(x) for x in col_names)

         for i in range(len(self)):
             col = self.at(i)
             not_null = '' if col.nullable else ' NOT NULL'

             buf.write('\n{0}{1}{2}'
                       .format(col.name.ljust(space),
                               col.type.name, not_null))

         pk_string = ', '.join(col_names[i] for i in self.primary_key_indices())
         buf.write('\nPRIMARY KEY ({0})'.format(pk_string))

         return "kudu.Schema {{{0}\n}}".format(util.indent(buf.getvalue(), 2))

     def __len__(self):
         return self.schema.num_columns()

     def __getitem__(self, key):
         if isinstance(key, six.string_types):
             key = self.get_loc(key)

         if key < 0:
             key += len(self)
         return self.at(key)

     def equals(self, Schema other):
         """
         Returns True if the table schemas are equal
         """
         return self.schema.Equals(deref(other.schema))

     cdef int get_loc(self, name) except -1:
         if not self._mapping_initialized:
             for i in range(self.schema.num_columns()):
                 self._col_mapping[self.schema.Column(i).name()] = i
             self._mapping_initialized = 1

         name = tobytes(name)

         # TODO: std::map is slightly verbose and inefficient here (O(lg n)
         # lookups), may consider replacing with a better / different hash table
         # should it become a performance bottleneck
         cdef map[string, int].iterator it = self._col_mapping.find(name)
         if it == self._col_mapping.end():
             raise KeyError(name)
         return self._col_mapping[name]

     def at(self, size_t i):
         """
         Return the ColumnSchema for a column index. Analogous to schema[i].

         Returns
         -------
         col_schema : ColumnSchema
         """
         cdef ColumnSchema result = ColumnSchema()

         if i < 0 or i >= self.schema.num_columns():
             raise IndexError('Column index {0} is not in range'
                              .format(i))

         result.schema = new KuduColumnSchema(self.schema.Column(i))

         return result

     def new_row(self, record=None):
         """
         Create a new row corresponding to this schema. If a record is provided,
         a PartialRow will be initialized with values from the input record.
         The record can be in the form of a tuple, dict, or list. Dictionary
         keys can be either column names, indexes, or a mix of both names and
         indexes.

         Parameters
         ----------
         record : tuple/list/dict

         Returns
         -------
         row : PartialRow
         """
         result = PartialRow(self)
         result.row = self.schema.NewRow()

         if record:
             result.from_record(record)

         return result

     def primary_key_indices(self):
         """
         Return the indices of the columns used as primary keys

         Returns
         -------
         key_indices : list[int]
         """
         cdef:
             vector[int] indices
             size_t i

         self.schema.GetPrimaryKeyColumnIndexes(&indices)

         result = []
         for i in range(indices.size()):
             result.append(indices[i])
         return result

     def primary_keys(self):
         """
         Return the names of the columns used as primary keys

         Returns
         -------
         key_names : list[str]
         """
         indices = self.primary_key_indices()
         return [self.at(i).name for i in indices]

     @staticmethod
     def get_auto_incrementing_column_name():
         """
         Utility function to return the actual name of the auto-incrementing column.

         Returns
         -------
         auto-incrementing column name: str
         """
         col_name = KuduSchema.GetAutoIncrementingColumnName()
         return frombytes(col_name)

 cdef class KuduValue:

     def __cinit__(self, KuduType type_, value):
         cdef:
             Slice slc

         if (type_.name[:3] == 'int'):
             self._value = C_KuduValue.FromInt(value)
         elif (type_.name in ['string', 'binary', 'varchar']):
             if isinstance(value, unicode):
                 value = value.encode('utf8')

             slc = Slice(<char*> value, len(value))
             self._value = C_KuduValue.CopyString(slc)
         elif (type_.name == 'bool'):
             self._value = C_KuduValue.FromBool(value)
         elif (type_.name == 'float'):
             self._value = C_KuduValue.FromFloat(value)
         elif (type_.name == 'double'):
             self._value = C_KuduValue.FromDouble(value)
         elif (type_.name == 'unixtime_micros'):
             value = to_unixtime_micros(value)
             self._value = C_KuduValue.FromInt(value)
         elif (type_.name == 'date'):
             val = date_to_unix_epoch_days(value)
             self._value = C_KuduValue.FromInt(val)
         elif (type_.name == 'decimal'):
             IF PYKUDU_INT128_SUPPORTED == 1:
                 scale = get_decimal_scale(value)
                 value = to_unscaled_decimal(value)
                 self._value = C_KuduValue.FromDecimal(value, scale)
             ELSE:
                 raise KuduException("The decimal type is not supported when GCC version is < 4.6.0" % self)
         else:
             raise TypeError("Cannot initialize KuduValue for kudu type <{0}>"
                             .format(type_.name))

     def __dealloc__(self):
         # We don't own this.
         pass