blob: e4de82d2b1f69109d7e77b35dcb46ab620d02d2e [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from _datasketches import PyObjectSerDe
import struct
# This file provides several Python SerDe implementation examples.
#
# Each implementation must extend the PyObjectSerDe class and define
# three methods:
# * get_size(item) returns an int of the number of bytes needed to
# serialize the given item
# * to_bytes(item) returns a bytes object representing a serialized
# version of the given item
# * from_bytes(data, offset) takes a bytes object (data) and an offset
# indicating where in the data array to start reading. The method
# returns a tuple with the newly reconstructed object and the
# total number of bytes beyond the offset read from the input data.
# Implements a simple string-encoding scheme where a string is
# written as <num_bytes> <string_contents>, with no null termination.
# <num_bytes> is a 4-byte little-endian unsigned integer and
# <string_contents> is the UTF-8 encoding of the string.
# This format allows pre-allocating each string, at the cost of
# additional storage. Using this format, the serialized string consumes
# 4 + len(item.encode('utf-8')) bytes, which is 4 + len(item) for
# pure-ASCII strings.
class PyStringsSerDe(PyObjectSerDe):
    """Length-prefixed UTF-8 string SerDe.

    Wire format: a 4-byte little-endian unsigned byte count followed by
    that many bytes of UTF-8 data. For ASCII-only strings the output is
    byte-for-byte what the previous ord()-based encoder produced.
    """

    def get_size(self, item) -> int:
        """Return the number of bytes to_bytes(item) will produce."""
        # Count encoded bytes, not characters: non-ASCII characters take
        # more than one byte in UTF-8.
        return 4 + len(item.encode('utf-8'))

    def to_bytes(self, item: str) -> bytes:
        """Serialize item as <4-byte length><UTF-8 payload>.

        Raises OverflowError if the payload does not fit a 4-byte length.
        """
        payload = item.encode('utf-8')
        return len(payload).to_bytes(4, 'little') + payload

    def from_bytes(self, data: bytes, offset: int) -> tuple:
        """Read one string from data starting at offset.

        Returns a tuple (string, bytes_consumed) where bytes_consumed
        counts the 4-byte length prefix plus the payload.

        Raises IndexError if the length prefix or the payload would run
        past the end of data.
        """
        payload_start = offset + 4
        if payload_start > len(data):
            raise IndexError(
                f'buffer too short to read a 4-byte length at offset {offset}')
        # Read all four prefix bytes; the value is unsigned, so it can
        # never be negative.
        num_chars = int.from_bytes(data[offset:payload_start], 'little')
        payload_end = payload_start + num_chars
        if payload_end > len(data):
            raise IndexError(f'num_chars read must be non-negative and not larger than the buffer. Found {num_chars}')
        text = data[payload_start:payload_end].decode('utf-8')
        return (text, 4 + num_chars)
# Implements an integer encoding scheme where each integer is written
# as a 32-bit (4 byte) little-endian value.
class PyIntsSerDe(PyObjectSerDe):
    """SerDe for integers stored as 4-byte little-endian signed values."""

    # Pre-compiled codec: '<i' is a little-endian signed 32-bit integer.
    _CODEC = struct.Struct('<i')

    def get_size(self, item):
        """Return the fixed serialized width in bytes (always 4)."""
        return self._CODEC.size

    def to_bytes(self, item):
        """Pack item into its 4-byte representation."""
        return self._CODEC.pack(item)

    def from_bytes(self, data: bytes, offset: int):
        """Unpack one integer at offset; return (value, bytes_consumed)."""
        (value,) = self._CODEC.unpack_from(data, offset)
        return (value, self._CODEC.size)
# Implements an integer encoding scheme where each integer is written
# as a 64-bit (8 byte) little-endian value.
class PyLongsSerDe(PyObjectSerDe):
    """SerDe for integers stored as 8-byte little-endian signed values."""

    # With an explicit byte-order prefix, struct uses standard sizes:
    # 'l' is only 4 bytes there, so 'q' (8 bytes) is needed to match the
    # 8-byte size reported by get_size() and from_bytes().
    _FORMAT = '<q'

    def get_size(self, item):
        """Return the fixed serialized width in bytes (always 8)."""
        return int(8)

    def to_bytes(self, item):
        """Pack item into its 8-byte little-endian representation.

        Raises struct.error if item does not fit a signed 64-bit integer.
        """
        return struct.pack(self._FORMAT, item)

    def from_bytes(self, data: bytes, offset: int):
        """Unpack one 8-byte integer at offset.

        Returns a tuple (value, 8), where 8 is the number of bytes read.
        Raises struct.error if fewer than 8 bytes remain at offset.
        """
        val = struct.unpack_from(self._FORMAT, data, offset)[0]
        return (val, 8)
# Implements a floating point encoding scheme where each value is written
# as a 32-bit floating point value.
class PyFloatsSerDe(PyObjectSerDe):
    """SerDe for floating point values stored as 4-byte little-endian floats."""

    # Pre-compiled codec: '<f' is a little-endian 32-bit float.
    _CODEC = struct.Struct('<f')

    def get_size(self, item):
        """Return the fixed serialized width in bytes (always 4)."""
        return self._CODEC.size

    def to_bytes(self, item):
        """Pack item into its 4-byte representation."""
        return self._CODEC.pack(item)

    def from_bytes(self, data: bytes, offset: int):
        """Unpack one float at offset; return (value, bytes_consumed)."""
        (value,) = self._CODEC.unpack_from(data, offset)
        return (value, self._CODEC.size)
# Implements a floating point encoding scheme where each value is written
# as a 64-bit floating point value.
class PyDoublesSerDe(PyObjectSerDe):
    """SerDe for floating point values stored as 8-byte little-endian doubles."""

    # Pre-compiled codec: '<d' is a little-endian 64-bit float.
    _CODEC = struct.Struct('<d')

    def get_size(self, item):
        """Return the fixed serialized width in bytes (always 8)."""
        return self._CODEC.size

    def to_bytes(self, item):
        """Pack item into its 8-byte representation."""
        return self._CODEC.pack(item)

    def from_bytes(self, data: bytes, offset: int):
        """Unpack one double at offset; return (value, bytes_consumed)."""
        (value,) = self._CODEC.unpack_from(data, offset)
        return (value, self._CODEC.size)