# python/datasketches/PySerDe.py - datasketches-cpp - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 from _datasketches import PyObjectSerDe

 import struct

 # This file provides several Python SerDe implementation examples.
 #
 # Each implementation must extend the PyObjectSerDe class and define
 # three methods:
 #   * get_size(item) returns an int of the number of bytes needed to
 #     serialize the given item
 #   * to_bytes(item) returns a bytes object representing a serialized
 #     version of the given item
 #   * from_bytes(data, offset) takes a bytes object (data) and an offset
 #     indicating where in the data array to start reading. The method
 #     returns a tuple containing the newly reconstructed object and the
 #     number of bytes consumed from the input data, counted from offset.

 # Implements a simple string-encoding scheme where a string is
 # written as <num_bytes> <string_contents>, with no null termination.
 # This format allows pre-allocating each string, at the cost of
 # additional storage. Using this format, the serialized string consumes
 # 4 + len(item) bytes.
 class PyStringsSerDe(PyObjectSerDe):
   """SerDe for str items stored as <num_bytes> <string_contents>.

   The length prefix is a 4-byte little-endian unsigned integer holding
   the number of UTF-8 encoded bytes that follow; there is no null
   terminator. For pure-ASCII strings the serialized size is therefore
   4 + len(item).
   """

   def get_size(self, item):
     """Return the number of bytes that to_bytes(item) produces."""
     # Count encoded bytes rather than characters so the reported size
     # agrees with to_bytes() for non-ASCII strings as well.
     return 4 + len(item.encode('utf-8'))

   def to_bytes(self, item: str):
     """Serialize item as a 4-byte length prefix plus its UTF-8 bytes."""
     # Encode with the same codec from_bytes() decodes with; writing raw
     # ord() values would fail above U+00FF and would not round-trip for
     # code points 128-255.
     payload = item.encode('utf-8')
     return len(payload).to_bytes(4, 'little') + payload

   def from_bytes(self, data: bytes, offset: int):
     """Deserialize one string starting at data[offset].

     Returns a tuple (string, bytes_read), where bytes_read counts the
     4-byte length prefix plus the string contents.

     Raises IndexError if offset is negative, or if the length prefix or
     the string contents extend past the end of data.
     """
     start = offset + 4
     if offset < 0 or start > len(data):
       raise IndexError(f'Cannot read a 4-byte length prefix at offset {offset} from a buffer of {len(data)} bytes')
     # The prefix occupies all 4 bytes: data[offset:offset+4].
     num_bytes = int.from_bytes(data[offset:start], 'little')
     end = start + num_bytes
     if end > len(data):
       raise IndexError(f'String length read must not extend past the end of the buffer. Found {num_bytes}')
     text = data[start:end].decode('utf-8')
     return (text, 4 + num_bytes)

 # Implements an integer encoding scheme where each integer is written
 # as a 32-bit (4 byte) little-endian value.
 class PyIntsSerDe(PyObjectSerDe):
   """SerDe that stores each integer as a 4-byte little-endian value."""

   def get_size(self, item):
     """Return the serialized size in bytes; always 4, whatever the item."""
     return 4

   def to_bytes(self, item):
     """Pack item with the struct format '<i' and return the bytes."""
     encoded = struct.pack('<i', item)
     return encoded

   def from_bytes(self, data: bytes, offset: int):
     """Unpack one '<i' value at data[offset]; return (value, 4)."""
     # unpack_from yields a 1-tuple for a single-field format.
     (value,) = struct.unpack_from('<i', data, offset)
     return (value, 4)


 # Implements an integer encoding scheme where each integer is written
 # as a 64-bit (8 byte) little-endian value.
 class PyLongsSerDe(PyObjectSerDe):
   """SerDe that stores each integer as an 8-byte little-endian value."""

   def get_size(self, item):
     """Return the serialized size in bytes; always 8, whatever the item."""
     return 8

   def to_bytes(self, item):
     """Pack item as a signed 64-bit little-endian integer (8 bytes).

     Raises struct.error if item does not fit in a signed 64-bit integer.
     """
     # With the '<' prefix struct uses standard sizes, where 'l' is only
     # 4 bytes; 'q' is the 8-byte format that matches get_size().
     return struct.pack('<q', item)

   def from_bytes(self, data: bytes, offset: int):
     """Unpack one 8-byte value at data[offset]; return (value, 8).

     Raises struct.error if fewer than 8 bytes are available at offset.
     """
     (value,) = struct.unpack_from('<q', data, offset)
     return (value, 8)


 # Implements a floating point encoding scheme where each value is written
 # as a 32-bit floating point value.
 class PyFloatsSerDe(PyObjectSerDe):
   """SerDe that stores each value as a 4-byte little-endian float."""

   def get_size(self, item):
     """Return the serialized size in bytes; always 4, whatever the item."""
     return 4

   def to_bytes(self, item):
     """Pack item with the struct format '<f' and return the bytes."""
     encoded = struct.pack('<f', item)
     return encoded

   def from_bytes(self, data: bytes, offset: int):
     """Unpack one '<f' value at data[offset]; return (value, 4)."""
     # unpack_from yields a 1-tuple for a single-field format.
     (value,) = struct.unpack_from('<f', data, offset)
     return (value, 4)


 # Implements a floating point encoding scheme where each value is written
 # as a 64-bit floating point value.
 class PyDoublesSerDe(PyObjectSerDe):
   """SerDe that stores each value as an 8-byte little-endian float."""

   def get_size(self, item):
     """Return the serialized size in bytes; always 8, whatever the item."""
     return 8

   def to_bytes(self, item):
     """Pack item with the struct format '<d' and return the bytes."""
     encoded = struct.pack('<d', item)
     return encoded

   def from_bytes(self, data: bytes, offset: int):
     """Unpack one '<d' value at data[offset]; return (value, 8)."""
     # unpack_from yields a 1-tuple for a single-field format.
     (value,) = struct.unpack_from('<d', data, offset)
     return (value, 8)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	from _datasketches import PyObjectSerDe

	import struct

	# This file provides several Python SerDe implementation examples.
	#
	# Each implementation must extend the PyObjectSerDe class and define
	# three methods:
	# * get_size(item) returns an int of the number of bytes needed to
	# serialize the given item
	# * to_bytes(item) returns a bytes object representing a serialized
	# version of the given item
	# * from_bytes(data, offset) takes a bytes object (data) and an offset
	# indicating where in the data array to start reading. The method
	# returns a tuple containing the newly reconstructed object and the
	# number of bytes consumed from the input data, counted from offset.

	# Implements a simple string-encoding scheme where a string is
	# written as <num_bytes> <string_contents>, with no null termination.
	# This format allows pre-allocating each string, at the cost of
	# additional storage. Using this format, the serialized string consumes
	# 4 + len(item) bytes.
	class PyStringsSerDe(PyObjectSerDe):
	  """SerDe for str items stored as <num_bytes> <string_contents>.

	  The length prefix is a 4-byte little-endian unsigned integer holding
	  the number of UTF-8 encoded bytes that follow; there is no null
	  terminator. For pure-ASCII strings the serialized size is therefore
	  4 + len(item).
	  """

	  def get_size(self, item):
	    """Return the number of bytes that to_bytes(item) produces."""
	    # Count encoded bytes rather than characters so the reported size
	    # agrees with to_bytes() for non-ASCII strings as well.
	    return 4 + len(item.encode('utf-8'))

	  def to_bytes(self, item: str):
	    """Serialize item as a 4-byte length prefix plus its UTF-8 bytes."""
	    # Encode with the same codec from_bytes() decodes with; writing raw
	    # ord() values would fail above U+00FF and would not round-trip for
	    # code points 128-255.
	    payload = item.encode('utf-8')
	    return len(payload).to_bytes(4, 'little') + payload

	  def from_bytes(self, data: bytes, offset: int):
	    """Deserialize one string starting at data[offset].

	    Returns a tuple (string, bytes_read), where bytes_read counts the
	    4-byte length prefix plus the string contents.

	    Raises IndexError if offset is negative, or if the length prefix or
	    the string contents extend past the end of data.
	    """
	    start = offset + 4
	    if offset < 0 or start > len(data):
	      raise IndexError(f'Cannot read a 4-byte length prefix at offset {offset} from a buffer of {len(data)} bytes')
	    # The prefix occupies all 4 bytes: data[offset:offset+4].
	    num_bytes = int.from_bytes(data[offset:start], 'little')
	    end = start + num_bytes
	    if end > len(data):
	      raise IndexError(f'String length read must not extend past the end of the buffer. Found {num_bytes}')
	    text = data[start:end].decode('utf-8')
	    return (text, 4 + num_bytes)

	# Implements an integer encoding scheme where each integer is written
	# as a 32-bit (4 byte) little-endian value.
	class PyIntsSerDe(PyObjectSerDe):
	  """SerDe that stores each integer as a 4-byte little-endian value."""

	  def get_size(self, item):
	    """Return the serialized size in bytes; always 4, whatever the item."""
	    return 4

	  def to_bytes(self, item):
	    """Pack item with the struct format '<i' and return the bytes."""
	    encoded = struct.pack('<i', item)
	    return encoded

	  def from_bytes(self, data: bytes, offset: int):
	    """Unpack one '<i' value at data[offset]; return (value, 4)."""
	    # unpack_from yields a 1-tuple for a single-field format.
	    (value,) = struct.unpack_from('<i', data, offset)
	    return (value, 4)


	# Implements an integer encoding scheme where each integer is written
	# as a 64-bit (8 byte) little-endian value.
	class PyLongsSerDe(PyObjectSerDe):
	  """SerDe that stores each integer as an 8-byte little-endian value."""

	  def get_size(self, item):
	    """Return the serialized size in bytes; always 8, whatever the item."""
	    return 8

	  def to_bytes(self, item):
	    """Pack item as a signed 64-bit little-endian integer (8 bytes).

	    Raises struct.error if item does not fit in a signed 64-bit integer.
	    """
	    # With the '<' prefix struct uses standard sizes, where 'l' is only
	    # 4 bytes; 'q' is the 8-byte format that matches get_size().
	    return struct.pack('<q', item)

	  def from_bytes(self, data: bytes, offset: int):
	    """Unpack one 8-byte value at data[offset]; return (value, 8).

	    Raises struct.error if fewer than 8 bytes are available at offset.
	    """
	    (value,) = struct.unpack_from('<q', data, offset)
	    return (value, 8)


	# Implements a floating point encoding scheme where each value is written
	# as a 32-bit floating point value.
	class PyFloatsSerDe(PyObjectSerDe):
	  """SerDe that stores each value as a 4-byte little-endian float."""

	  def get_size(self, item):
	    """Return the serialized size in bytes; always 4, whatever the item."""
	    return 4

	  def to_bytes(self, item):
	    """Pack item with the struct format '<f' and return the bytes."""
	    encoded = struct.pack('<f', item)
	    return encoded

	  def from_bytes(self, data: bytes, offset: int):
	    """Unpack one '<f' value at data[offset]; return (value, 4)."""
	    # unpack_from yields a 1-tuple for a single-field format.
	    (value,) = struct.unpack_from('<f', data, offset)
	    return (value, 4)


	# Implements a floating point encoding scheme where each value is written
	# as a 64-bit floating point value.
	class PyDoublesSerDe(PyObjectSerDe):
	  """SerDe that stores each value as an 8-byte little-endian float."""

	  def get_size(self, item):
	    """Return the serialized size in bytes; always 8, whatever the item."""
	    return 8

	  def to_bytes(self, item):
	    """Pack item with the struct format '<d' and return the bytes."""
	    encoded = struct.pack('<d', item)
	    return encoded

	  def from_bytes(self, data: bytes, offset: int):
	    """Unpack one '<d' value at data[offset]; return (value, 8)."""
	    # unpack_from yields a 1-tuple for a single-field format.
	    (value,) = struct.unpack_from('<d', data, offset)
	    return (value, 8)