| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| |
| import base64 |
| import decimal |
| import datetime |
| import json |
| import struct |
| from array import array |
| from typing import Any, Callable, Dict, List, NamedTuple, Tuple |
| from pyspark.errors import PySparkValueError |
| from zoneinfo import ZoneInfo |
| |
| |
| class VariantUtils: |
| """ |
| A utility class for VariantVal. |
| |
| Adapted from library at: org.apache.spark.types.variant.VariantUtil |
| """ |
| |
| BASIC_TYPE_BITS = 2 |
| BASIC_TYPE_MASK = 0x3 |
| TYPE_INFO_MASK = 0x3F |
| # The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`. |
| MAX_SHORT_STR_SIZE = 0x3F |
| |
    # Below are all possible basic type values.
| # Primitive value. The type info value must be one of the values in the below section. |
| PRIMITIVE = 0 |
| # Short string value. The type info value is the string size, which must be in `[0, |
| # MAX_SHORT_STR_SIZE]`. |
| # The string content bytes directly follow the header byte. |
| SHORT_STR = 1 |
| # Object value. The content contains a size, a list of field ids, a list of field offsets, and |
| # the actual field data. The length of the id list is `size`, while the length of the offset |
    # list is `size + 1`, where the last offset represents the total size of the field data. The
| # fields in an object must be sorted by the field name in alphabetical order. Duplicate field |
| # names in one object are not allowed. |
| # We use 5 bits in the type info to specify the integer type of the object header: it should |
| # be 0_b4_b3b2_b1b0 (MSB is 0), where: |
| # - b4 specifies the type of size. When it is 0/1, `size` is a little-endian 1/4-byte |
| # unsigned integer. |
    # - b3b2/b1b0 specifies the integer type of id and offset. When the 2 bits are 0/1/2/3, the
    # list contains 1/2/3/4-byte little-endian unsigned integers.
| OBJECT = 2 |
| # Array value. The content contains a size, a list of field offsets, and the actual element |
| # data. It is similar to an object without the id list. The length of the offset list |
    # is `size + 1`, where the last offset represents the total size of the element data.
| # Its type info should be: 000_b2_b1b0: |
| # - b2 specifies the type of size. |
| # - b1b0 specifies the integer type of offset. |
| ARRAY = 3 |
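
    # Illustrative decode (see _get_type_info below): an object header byte 0x42 == 0b01000010
    # gives basic_type == 0x42 & 0x3 == 2 (OBJECT) and type_info == (0x42 >> 2) & 0x3F ==
    # 0b10000, i.e. b4 = 1 (4-byte size) with 1-byte field ids and 1-byte offsets.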
| |
    # Below are all possible type info values for `PRIMITIVE`.
| # JSON Null value. Empty content. |
| NULL = 0 |
| # True value. Empty content. |
| TRUE = 1 |
| # False value. Empty content. |
| FALSE = 2 |
| # 1-byte little-endian signed integer. |
| INT1 = 3 |
| # 2-byte little-endian signed integer. |
| INT2 = 4 |
| # 4-byte little-endian signed integer. |
| INT4 = 5 |
    # 8-byte little-endian signed integer.
| INT8 = 6 |
| # 8-byte IEEE double. |
| DOUBLE = 7 |
| # 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer. |
| DECIMAL4 = 8 |
| # 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer. |
| DECIMAL8 = 9 |
| # 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer. |
| DECIMAL16 = 10 |
| # Date value. Content is 4-byte little-endian signed integer that represents the number of days |
| # from the Unix epoch. |
| DATE = 11 |
| # Timestamp value. Content is 8-byte little-endian signed integer that represents the number of |
| # microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. This is a timezone-aware |
| # field and when reading into a Python datetime object defaults to the UTC timezone. |
| TIMESTAMP = 12 |
| # Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted |
| # as if the local time zone is UTC. |
| TIMESTAMP_NTZ = 13 |
| # 4-byte IEEE float. |
| FLOAT = 14 |
| # Binary value. The content is (4-byte little-endian unsigned integer representing the binary |
| # size) + (size bytes of binary content). |
| BINARY = 15 |
| # Long string value. The content is (4-byte little-endian unsigned integer representing the |
| # string size) + (size bytes of string content). |
| LONG_STR = 16 |
| |
| VERSION = 1 |
| # The lower 4 bits of the first metadata byte contain the version. |
| VERSION_MASK = 0x0F |
| |
| U8_MAX = 0xFF |
| U16_MAX = 0xFFFF |
| U24_MAX = 0xFFFFFF |
| U24_SIZE = 3 |
| U32_SIZE = 4 |
| |
| I8_MAX = 0x7F |
| I8_MIN = -0x80 |
| I16_MAX = 0x7FFF |
| I16_MIN = -0x8000 |
| I32_MAX = 0x7FFFFFFF |
| I32_MIN = -0x80000000 |
| I64_MAX = 0x7FFFFFFFFFFFFFFF |
| I64_MIN = -0x8000000000000000 |
| |
| EPOCH = datetime.datetime( |
| year=1970, month=1, day=1, hour=0, minute=0, second=0, tzinfo=datetime.timezone.utc |
| ) |
| EPOCH_NTZ = datetime.datetime(year=1970, month=1, day=1, hour=0, minute=0, second=0) |
| |
| MAX_DECIMAL4_PRECISION = 9 |
| MAX_DECIMAL4_VALUE = 10**MAX_DECIMAL4_PRECISION |
| MAX_DECIMAL8_PRECISION = 18 |
| MAX_DECIMAL8_VALUE = 10**MAX_DECIMAL8_PRECISION |
| MAX_DECIMAL16_PRECISION = 38 |
| MAX_DECIMAL16_VALUE = 10**MAX_DECIMAL16_PRECISION |
| |
| @classmethod |
| def to_json(cls, value: bytes, metadata: bytes, zone_id: str = "UTC") -> str: |
| """ |
| Convert the VariantVal to a JSON string. The `zone_id` parameter denotes the time zone that |
        timestamp fields should be rendered in. It defaults to "UTC". The list of valid zone IDs
        can be found by importing the `zoneinfo` module and running
        `zoneinfo.available_timezones()`.
| :return: JSON string |
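
        Example (illustrative round-trip using `parse_json` from this module):

        >>> v, m = VariantUtils.parse_json('{"a": 1}')
        >>> VariantUtils.to_json(v, m)
        '{"a":1}'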
| """ |
| return cls._to_json(value, metadata, 0, zone_id) |
| |
| @classmethod |
    def to_python(cls, value: bytes, metadata: bytes) -> Any:
| """ |
        Convert the VariantVal to a nested structure of standard Python objects.
| :return: Python representation of the Variant nested structure |
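
        Example (illustrative):

        >>> v, m = VariantUtils.parse_json('[1, "two", null]')
        >>> VariantUtils.to_python(v, m)
        [1, 'two', None]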
| """ |
| return cls._to_python(value, metadata, 0) |
| |
| @classmethod |
| def parse_json(cls, json_str: str) -> Tuple[bytes, bytes]: |
| """ |
| Parses the JSON string and creates the Variant binary (value, metadata) |
| :return: tuple of 2 binary values (value, metadata) |
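
        Example (illustrative; byte values traced from `VariantBuilder` in this module):

        >>> value, metadata = VariantUtils.parse_json("1")
        >>> (value.hex(), metadata.hex())
        ('0c01', '010000')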
| """ |
| builder = VariantBuilder() |
| return builder.build(json_str) |
| |
| @classmethod |
| def _read_long(cls, data: bytes, pos: int, num_bytes: int, signed: bool) -> int: |
| cls._check_index(pos, len(data)) |
| cls._check_index(pos + num_bytes - 1, len(data)) |
| return int.from_bytes(data[pos : pos + num_bytes], byteorder="little", signed=signed) |
| |
| @classmethod |
| def _check_index(cls, pos: int, length: int) -> None: |
| if pos < 0 or pos >= length: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _get_type_info(cls, value: bytes, pos: int) -> Tuple[int, int]: |
| """ |
| Returns the (basic_type, type_info) pair from the given position in the value. |
| """ |
| basic_type = value[pos] & VariantUtils.BASIC_TYPE_MASK |
| type_info = (value[pos] >> VariantUtils.BASIC_TYPE_BITS) & VariantUtils.TYPE_INFO_MASK |
| return (basic_type, type_info) |
| |
| @classmethod |
| def _get_metadata_key(cls, metadata: bytes, id: int) -> str: |
| """ |
| Returns the key string from the dictionary in the metadata, corresponding to `id`. |
| """ |
| cls._check_index(0, len(metadata)) |
| offset_size = ((metadata[0] >> 6) & 0x3) + 1 |
| dict_size = cls._read_long(metadata, 1, offset_size, signed=False) |
| if id >= dict_size: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| string_start = 1 + (dict_size + 2) * offset_size |
| offset = cls._read_long(metadata, 1 + (id + 1) * offset_size, offset_size, signed=False) |
| next_offset = cls._read_long( |
| metadata, 1 + (id + 2) * offset_size, offset_size, signed=False |
| ) |
| if offset > next_offset: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| cls._check_index(string_start + next_offset - 1, len(metadata)) |
| return metadata[string_start + offset : (string_start + next_offset)].decode("utf-8") |
| |
| @classmethod |
| def _get_boolean(cls, value: bytes, pos: int) -> bool: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE or ( |
| type_info != VariantUtils.TRUE and type_info != VariantUtils.FALSE |
| ): |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| return type_info == VariantUtils.TRUE |
| |
| @classmethod |
| def _get_long(cls, value: bytes, pos: int) -> int: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| if type_info == VariantUtils.INT1: |
| return cls._read_long(value, pos + 1, 1, signed=True) |
| elif type_info == VariantUtils.INT2: |
| return cls._read_long(value, pos + 1, 2, signed=True) |
| elif type_info == VariantUtils.INT4 or type_info == VariantUtils.DATE: |
| return cls._read_long(value, pos + 1, 4, signed=True) |
| elif type_info == VariantUtils.INT8: |
| return cls._read_long(value, pos + 1, 8, signed=True) |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _get_date(cls, value: bytes, pos: int) -> datetime.date: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| if type_info == VariantUtils.DATE: |
| days_since_epoch = cls._read_long(value, pos + 1, 4, signed=True) |
| return datetime.date.fromordinal(VariantUtils.EPOCH.toordinal() + days_since_epoch) |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _get_timestamp(cls, value: bytes, pos: int, zone_id: str) -> datetime.datetime: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| if type_info == VariantUtils.TIMESTAMP_NTZ: |
| microseconds_since_epoch = cls._read_long(value, pos + 1, 8, signed=True) |
| return VariantUtils.EPOCH_NTZ + datetime.timedelta( |
| microseconds=microseconds_since_epoch |
| ) |
| if type_info == VariantUtils.TIMESTAMP: |
| microseconds_since_epoch = cls._read_long(value, pos + 1, 8, signed=True) |
| return ( |
| VariantUtils.EPOCH + datetime.timedelta(microseconds=microseconds_since_epoch) |
| ).astimezone(ZoneInfo(zone_id)) |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _get_string(cls, value: bytes, pos: int) -> str: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type == VariantUtils.SHORT_STR or ( |
| basic_type == VariantUtils.PRIMITIVE and type_info == VariantUtils.LONG_STR |
| ): |
| start = 0 |
| length = 0 |
| if basic_type == VariantUtils.SHORT_STR: |
| start = pos + 1 |
| length = type_info |
| else: |
| start = pos + 1 + VariantUtils.U32_SIZE |
| length = cls._read_long(value, pos + 1, VariantUtils.U32_SIZE, signed=False) |
| cls._check_index(start + length - 1, len(value)) |
| return value[start : start + length].decode("utf-8") |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _get_double(cls, value: bytes, pos: int) -> float: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| if type_info == VariantUtils.FLOAT: |
| cls._check_index(pos + 4, len(value)) |
| return struct.unpack("<f", value[pos + 1 : pos + 5])[0] |
| elif type_info == VariantUtils.DOUBLE: |
| cls._check_index(pos + 8, len(value)) |
| return struct.unpack("<d", value[pos + 1 : pos + 9])[0] |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _check_decimal(cls, unscaled: int, scale: int, max_unscaled: int, max_scale: int) -> None: |
        # max_unscaled == 10**max_scale; callers pass the precomputed class constant to avoid
        # recomputing the power on every call.
| if unscaled >= max_unscaled or unscaled <= -max_unscaled or scale > max_scale: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _get_decimal(cls, value: bytes, pos: int) -> decimal.Decimal: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
        cls._check_index(pos + 1, len(value))
        scale = value[pos + 1]
| unscaled = 0 |
| if type_info == VariantUtils.DECIMAL4: |
| unscaled = cls._read_long(value, pos + 2, 4, signed=True) |
| cls._check_decimal(unscaled, scale, cls.MAX_DECIMAL4_VALUE, cls.MAX_DECIMAL4_PRECISION) |
| elif type_info == VariantUtils.DECIMAL8: |
| unscaled = cls._read_long(value, pos + 2, 8, signed=True) |
| cls._check_decimal(unscaled, scale, cls.MAX_DECIMAL8_VALUE, cls.MAX_DECIMAL8_PRECISION) |
| elif type_info == VariantUtils.DECIMAL16: |
| cls._check_index(pos + 17, len(value)) |
| unscaled = int.from_bytes(value[pos + 2 : pos + 18], byteorder="little", signed=True) |
| cls._check_decimal( |
| unscaled, scale, cls.MAX_DECIMAL16_VALUE, cls.MAX_DECIMAL16_PRECISION |
| ) |
| else: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| return decimal.Decimal(unscaled) * (decimal.Decimal(10) ** (-scale)) |
| |
| @classmethod |
| def _get_binary(cls, value: bytes, pos: int) -> bytes: |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.PRIMITIVE or type_info != VariantUtils.BINARY: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| start = pos + 1 + VariantUtils.U32_SIZE |
| length = cls._read_long(value, pos + 1, VariantUtils.U32_SIZE, signed=False) |
| cls._check_index(start + length - 1, len(value)) |
| return bytes(value[start : start + length]) |
| |
| @classmethod |
| def _get_type(cls, value: bytes, pos: int) -> Any: |
| """ |
| Returns the Python type of the Variant at the given position. |
| """ |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type == VariantUtils.SHORT_STR: |
| return str |
| elif basic_type == VariantUtils.OBJECT: |
| return dict |
| elif basic_type == VariantUtils.ARRAY: |
| return array |
| elif type_info == VariantUtils.NULL: |
| return type(None) |
| elif type_info == VariantUtils.TRUE or type_info == VariantUtils.FALSE: |
| return bool |
| elif ( |
| type_info == VariantUtils.INT1 |
| or type_info == VariantUtils.INT2 |
| or type_info == VariantUtils.INT4 |
| or type_info == VariantUtils.INT8 |
| ): |
| return int |
| elif type_info == VariantUtils.DOUBLE or type_info == VariantUtils.FLOAT: |
| return float |
| elif ( |
| type_info == VariantUtils.DECIMAL4 |
| or type_info == VariantUtils.DECIMAL8 |
| or type_info == VariantUtils.DECIMAL16 |
| ): |
| return decimal.Decimal |
| elif type_info == VariantUtils.BINARY: |
| return bytes |
| elif type_info == VariantUtils.DATE: |
| return datetime.date |
| elif type_info == VariantUtils.TIMESTAMP or type_info == VariantUtils.TIMESTAMP_NTZ: |
| return datetime.datetime |
| elif type_info == VariantUtils.LONG_STR: |
| return str |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _to_json(cls, value: bytes, metadata: bytes, pos: int, zone_id: str) -> str: |
| variant_type = cls._get_type(value, pos) |
| if variant_type == dict: |
| |
| def handle_object(key_value_pos_list: List[Tuple[str, int]]) -> str: |
| key_value_list = [ |
| json.dumps(key) + ":" + cls._to_json(value, metadata, value_pos, zone_id) |
| for (key, value_pos) in key_value_pos_list |
| ] |
| return "{" + ",".join(key_value_list) + "}" |
| |
| return cls._handle_object(value, metadata, pos, handle_object) |
| elif variant_type == array: |
| |
| def handle_array(value_pos_list: List[int]) -> str: |
| value_list = [ |
| cls._to_json(value, metadata, value_pos, zone_id) |
| for value_pos in value_pos_list |
| ] |
| return "[" + ",".join(value_list) + "]" |
| |
| return cls._handle_array(value, pos, handle_array) |
| else: |
| value = cls._get_scalar(variant_type, value, metadata, pos, zone_id) |
| if value is None: |
| return "null" |
| if type(value) == bool: |
| return "true" if value else "false" |
| if type(value) == str: |
| return json.dumps(value) |
| if type(value) == bytes: |
                # base64-encode the bytes; .decode("utf-8") just converts the ASCII result to str
| return '"' + base64.b64encode(value).decode("utf-8") + '"' |
| if type(value) == datetime.date or type(value) == datetime.datetime: |
| return '"' + str(value) + '"' |
| return str(value) |
| |
| @classmethod |
| def _to_python(cls, value: bytes, metadata: bytes, pos: int) -> Any: |
| variant_type = cls._get_type(value, pos) |
| if variant_type == dict: |
| |
| def handle_object(key_value_pos_list: List[Tuple[str, int]]) -> Dict[str, Any]: |
| key_value_list = [ |
| (key, cls._to_python(value, metadata, value_pos)) |
| for (key, value_pos) in key_value_pos_list |
| ] |
| return dict(key_value_list) |
| |
| return cls._handle_object(value, metadata, pos, handle_object) |
| elif variant_type == array: |
| |
| def handle_array(value_pos_list: List[int]) -> List[Any]: |
| value_list = [ |
| cls._to_python(value, metadata, value_pos) for value_pos in value_pos_list |
| ] |
| return value_list |
| |
| return cls._handle_array(value, pos, handle_array) |
| else: |
| return cls._get_scalar(variant_type, value, metadata, pos, zone_id="UTC") |
| |
| @classmethod |
| def _get_scalar( |
| cls, variant_type: Any, value: bytes, metadata: bytes, pos: int, zone_id: str |
| ) -> Any: |
        if isinstance(None, variant_type):  # true only when variant_type is NoneType
| return None |
| elif variant_type == bool: |
| return cls._get_boolean(value, pos) |
| elif variant_type == int: |
| return cls._get_long(value, pos) |
| elif variant_type == str: |
| return cls._get_string(value, pos) |
| elif variant_type == float: |
| return cls._get_double(value, pos) |
| elif variant_type == decimal.Decimal: |
| return cls._get_decimal(value, pos) |
| elif variant_type == bytes: |
| return cls._get_binary(value, pos) |
| elif variant_type == datetime.date: |
| return cls._get_date(value, pos) |
| elif variant_type == datetime.datetime: |
| return cls._get_timestamp(value, pos, zone_id) |
| else: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| @classmethod |
| def _handle_object( |
| cls, value: bytes, metadata: bytes, pos: int, func: Callable[[List[Tuple[str, int]]], Any] |
| ) -> Any: |
| """ |
| Parses the variant object at position `pos`. |
| Calls `func` with a list of (key, value position) pairs of the object. |
| """ |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.OBJECT: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| large_size = ((type_info >> 4) & 0x1) != 0 |
| size_bytes = VariantUtils.U32_SIZE if large_size else 1 |
| num_fields = cls._read_long(value, pos + 1, size_bytes, signed=False) |
| id_size = ((type_info >> 2) & 0x3) + 1 |
| offset_size = ((type_info) & 0x3) + 1 |
| id_start = pos + 1 + size_bytes |
| offset_start = id_start + num_fields * id_size |
| data_start = offset_start + (num_fields + 1) * offset_size |
| |
| key_value_pos_list = [] |
| for i in range(num_fields): |
| id = cls._read_long(value, id_start + id_size * i, id_size, signed=False) |
| offset = cls._read_long( |
| value, offset_start + offset_size * i, offset_size, signed=False |
| ) |
| value_pos = data_start + offset |
| if metadata is not None: |
| key_value_pos_list.append((cls._get_metadata_key(metadata, id), value_pos)) |
| else: |
| key_value_pos_list.append(("", value_pos)) |
| return func(key_value_pos_list) |
| |
| @classmethod |
| def _handle_array(cls, value: bytes, pos: int, func: Callable[[List[int]], Any]) -> Any: |
| """ |
| Parses the variant array at position `pos`. |
| Calls `func` with a list of element positions of the array. |
| """ |
| cls._check_index(pos, len(value)) |
| basic_type, type_info = cls._get_type_info(value, pos) |
| if basic_type != VariantUtils.ARRAY: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| large_size = ((type_info >> 2) & 0x1) != 0 |
| size_bytes = VariantUtils.U32_SIZE if large_size else 1 |
| num_fields = cls._read_long(value, pos + 1, size_bytes, signed=False) |
| offset_size = (type_info & 0x3) + 1 |
| offset_start = pos + 1 + size_bytes |
| data_start = offset_start + (num_fields + 1) * offset_size |
| |
| value_pos_list = [] |
| for i in range(num_fields): |
| offset = cls._read_long( |
| value, offset_start + offset_size * i, offset_size, signed=False |
| ) |
| element_pos = data_start + offset |
| value_pos_list.append(element_pos) |
| return func(value_pos_list) |
| |
| |
| class FieldEntry(NamedTuple): |
| """ |
| Info about an object field |
| """ |
| |
| key: str |
| id: int |
| offset: int |
| |
| |
| class VariantBuilder: |
| """ |
| A utility class for building VariantVal. |
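
    Example (illustrative round-trip, rendered back with VariantUtils.to_json):

    >>> value, metadata = VariantBuilder().build("[true, false]")
    >>> VariantUtils.to_json(value, metadata)
    '[true,false]'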
| """ |
| |
| DEFAULT_SIZE_LIMIT = 16 * 1024 * 1024 |
| |
| def __init__(self, size_limit: int = DEFAULT_SIZE_LIMIT): |
| self.value = bytearray() |
| self.dictionary = dict[str, int]() |
| self.dictionary_keys = list[bytes]() |
| self.size_limit = size_limit |
| |
| def build(self, json_str: str) -> Tuple[bytes, bytes]: |
| parsed = json.loads(json_str, parse_float=self._handle_float) |
| self._process_parsed_json(parsed) |
| |
| num_keys = len(self.dictionary_keys) |
| dictionary_string_size = sum(len(key) for key in self.dictionary_keys) |
| |
| # Determine the number of bytes required per offset entry. |
        # The largest offset is the one-past-the-end value, which is the total string size. It's
        # very
| # unlikely that the number of keys could be larger, but incorporate that into the |
| # calculation in case of pathological data. |
| max_size = max(dictionary_string_size, num_keys) |
| if max_size > self.size_limit: |
| raise PySparkValueError(errorClass="VARIANT_SIZE_LIMIT_EXCEEDED", messageParameters={}) |
| offset_size = self._get_integer_size(max_size) |
| |
| offset_start = 1 + offset_size |
| string_start = offset_start + (num_keys + 1) * offset_size |
| metadata_size = string_start + dictionary_string_size |
| if metadata_size > self.size_limit: |
| raise PySparkValueError(errorClass="VARIANT_SIZE_LIMIT_EXCEEDED", messageParameters={}) |
| |
| metadata = bytearray() |
| header_byte = VariantUtils.VERSION | ((offset_size - 1) << 6) |
| metadata.extend(header_byte.to_bytes(1, byteorder="little")) |
| metadata.extend(num_keys.to_bytes(offset_size, byteorder="little")) |
| # write offsets |
| current_offset = 0 |
| for key in self.dictionary_keys: |
| metadata.extend(current_offset.to_bytes(offset_size, byteorder="little")) |
| current_offset += len(key) |
| metadata.extend(current_offset.to_bytes(offset_size, byteorder="little")) |
| # write key data |
| for key in self.dictionary_keys: |
| metadata.extend(key) |
| return (bytes(self.value), bytes(metadata)) |
| |
| def _process_parsed_json(self, parsed: Any) -> None: |
| if type(parsed) is dict: |
| fields = list[FieldEntry]() |
| start = len(self.value) |
| for key, value in parsed.items(): |
| id = self._add_key(key) |
| fields.append(FieldEntry(key, id, len(self.value) - start)) |
| self._process_parsed_json(value) |
| self._finish_writing_object(start, fields) |
| elif type(parsed) is list: |
| offsets = [] |
| start = len(self.value) |
| for elem in parsed: |
| offsets.append(len(self.value) - start) |
| self._process_parsed_json(elem) |
| self._finish_writing_array(start, offsets) |
| elif type(parsed) is str: |
| self._append_string(parsed) |
| elif type(parsed) is int: |
| if not self._append_int(parsed): |
| self._process_parsed_json(self._handle_float(str(parsed))) |
| elif type(parsed) is float: |
| self._append_float(parsed) |
| elif type(parsed) is decimal.Decimal: |
| self._append_decimal(parsed) |
| elif type(parsed) is bool: |
| self._append_boolean(parsed) |
| elif parsed is None: |
| self._append_null() |
| else: |
| raise PySparkValueError(errorClass="MALFORMED_VARIANT", messageParameters={}) |
| |
| # Choose the smallest unsigned integer type that can store `value`. It must be within |
| # [0, U24_MAX]. |
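    # Illustrative: a value of 300 needs 2 bytes, since U8_MAX < 300 <= U16_MAX.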
| def _get_integer_size(self, value: int) -> int: |
| if value <= VariantUtils.U8_MAX: |
| return 1 |
| if value <= VariantUtils.U16_MAX: |
| return 2 |
| return VariantUtils.U24_SIZE |
| |
| def _check_capacity(self, additional: int) -> None: |
| required = len(self.value) + additional |
| if required > self.size_limit: |
| raise PySparkValueError(errorClass="VARIANT_SIZE_LIMIT_EXCEEDED", messageParameters={}) |
| |
| def _primitive_header(self, type: int) -> bytes: |
| return bytes([(type << 2) | VariantUtils.PRIMITIVE]) |
| |
| def _short_string_header(self, size: int) -> bytes: |
| return bytes([size << 2 | VariantUtils.SHORT_STR]) |
| |
| def _array_header(self, large_size: bool, offset_size: int) -> bytes: |
| return bytes( |
| [ |
| ( |
| (large_size << (VariantUtils.BASIC_TYPE_BITS + 2)) |
| | ((offset_size - 1) << VariantUtils.BASIC_TYPE_BITS) |
| | VariantUtils.ARRAY |
| ) |
| ] |
| ) |
| |
| def _object_header(self, large_size: bool, id_size: int, offset_size: int) -> bytes: |
| return bytes( |
| [ |
| ( |
| (large_size << (VariantUtils.BASIC_TYPE_BITS + 4)) |
| | ((id_size - 1) << (VariantUtils.BASIC_TYPE_BITS + 2)) |
| | ((offset_size - 1) << VariantUtils.BASIC_TYPE_BITS) |
| | VariantUtils.OBJECT |
| ) |
| ] |
| ) |
| |
| # Add a key to the variant dictionary. If the key already exists, the dictionary is |
| # not modified. In either case, return the id of the key. |
| def _add_key(self, key: str) -> int: |
| if key in self.dictionary: |
| return self.dictionary[key] |
| id = len(self.dictionary_keys) |
| self.dictionary[key] = id |
| self.dictionary_keys.append(key.encode("utf-8")) |
| return id |
| |
| def _handle_float(self, num_str: str) -> Any: |
        # A float string can be parsed as a decimal if it only contains digits, "-", or ".".
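        # e.g. "1.5" becomes Decimal("1.5") (precision 2, scale 1), while "1e2"
        # contains "e" and therefore falls through to float.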
| if all([ch.isdecimal() or ch == "-" or ch == "." for ch in num_str]): |
| dec = decimal.Decimal(num_str) |
| precision = len(dec.as_tuple().digits) |
| scale = -int(dec.as_tuple().exponent) |
| |
| if ( |
| scale <= VariantUtils.MAX_DECIMAL16_PRECISION |
| and precision <= VariantUtils.MAX_DECIMAL16_PRECISION |
| ): |
| return dec |
| return float(num_str) |
| |
| def _append_boolean(self, b: bool) -> None: |
| self._check_capacity(1) |
| self.value.extend(self._primitive_header(VariantUtils.TRUE if b else VariantUtils.FALSE)) |
| |
| def _append_null(self) -> None: |
| self._check_capacity(1) |
| self.value.extend(self._primitive_header(VariantUtils.NULL)) |
| |
| def _append_string(self, s: str) -> None: |
| text = s.encode("utf-8") |
| long_str = len(text) > VariantUtils.MAX_SHORT_STR_SIZE |
| additional = (1 + VariantUtils.U32_SIZE) if long_str else 1 |
| self._check_capacity(additional + len(text)) |
| if long_str: |
| self.value.extend(self._primitive_header(VariantUtils.LONG_STR)) |
| self.value.extend(len(text).to_bytes(VariantUtils.U32_SIZE, byteorder="little")) |
| else: |
| self.value.extend(self._short_string_header(len(text))) |
| self.value.extend(text) |
| |
| def _append_int(self, i: int) -> bool: |
| self._check_capacity(1 + 8) |
| if i >= VariantUtils.I8_MIN and i <= VariantUtils.I8_MAX: |
| self.value.extend(self._primitive_header(VariantUtils.INT1)) |
| self.value.extend(i.to_bytes(1, byteorder="little", signed=True)) |
| elif i >= VariantUtils.I16_MIN and i <= VariantUtils.I16_MAX: |
| self.value.extend(self._primitive_header(VariantUtils.INT2)) |
| self.value.extend(i.to_bytes(2, byteorder="little", signed=True)) |
| elif i >= VariantUtils.I32_MIN and i <= VariantUtils.I32_MAX: |
| self.value.extend(self._primitive_header(VariantUtils.INT4)) |
| self.value.extend(i.to_bytes(4, byteorder="little", signed=True)) |
| elif i >= VariantUtils.I64_MIN and i <= VariantUtils.I64_MAX: |
| self.value.extend(self._primitive_header(VariantUtils.INT8)) |
| self.value.extend(i.to_bytes(8, byteorder="little", signed=True)) |
| else: |
| return False |
| return True |
| |
| # Append a decimal value to the variant builder. The caller should guarantee that its precision |
| # and scale fit into `MAX_DECIMAL16_PRECISION`. |
| def _append_decimal(self, d: decimal.Decimal) -> None: |
| self._check_capacity(2 + 16) |
| precision = len(d.as_tuple().digits) |
| scale = -int(d.as_tuple().exponent) |
| unscaled = int("".join(map(str, d.as_tuple().digits))) |
| unscaled = -unscaled if d < 0 else unscaled |
| if ( |
| scale <= VariantUtils.MAX_DECIMAL4_PRECISION |
| and precision <= VariantUtils.MAX_DECIMAL4_PRECISION |
| ): |
| self.value.extend(self._primitive_header(VariantUtils.DECIMAL4)) |
| self.value.extend(scale.to_bytes(1, byteorder="little")) |
| self.value.extend(unscaled.to_bytes(4, byteorder="little", signed=True)) |
| elif ( |
| scale <= VariantUtils.MAX_DECIMAL8_PRECISION |
| and precision <= VariantUtils.MAX_DECIMAL8_PRECISION |
| ): |
| self.value.extend(self._primitive_header(VariantUtils.DECIMAL8)) |
| self.value.extend(scale.to_bytes(1, byteorder="little")) |
| self.value.extend(unscaled.to_bytes(8, byteorder="little", signed=True)) |
| else: |
| assert ( |
| scale <= VariantUtils.MAX_DECIMAL16_PRECISION |
| and precision <= VariantUtils.MAX_DECIMAL16_PRECISION |
| ) |
| self.value.extend(self._primitive_header(VariantUtils.DECIMAL16)) |
| self.value.extend(scale.to_bytes(1, byteorder="little")) |
| self.value.extend(unscaled.to_bytes(16, byteorder="little", signed=True)) |
| |
| def _append_float(self, f: float) -> None: |
| self._check_capacity(1 + 8) |
| self.value.extend(self._primitive_header(VariantUtils.DOUBLE)) |
| self.value.extend(struct.pack("<d", f)) |
| |
| # Finish writing a variant array after all of its elements have already been written. |
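    # Illustrative (traced for JSON "[1, 2]"): the element data is 0C 01 0C 02; this method
    # prepends the header byte 0x03, size 0x02, and offsets 00 02 04, yielding
    # 03 02 00 02 04 0C 01 0C 02.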
| def _finish_writing_array(self, start: int, offsets: List[int]) -> None: |
| data_size = len(self.value) - start |
| num_offsets = len(offsets) |
| large_size = num_offsets > VariantUtils.U8_MAX |
| size_bytes = VariantUtils.U32_SIZE if large_size else 1 |
| offset_size = self._get_integer_size(data_size) |
        # The space for the header byte, array size, and offset list.
| header_size = 1 + size_bytes + (num_offsets + 1) * offset_size |
| self._check_capacity(header_size) |
| self.value.extend(bytearray(header_size)) |
| # Shift the just-written element data to make room for the header section. |
| self.value[start + header_size :] = bytes(self.value[start : start + data_size]) |
| # Write the header byte, num offsets |
| offset_start = start + 1 + size_bytes |
| self.value[start : start + 1] = self._array_header(large_size, offset_size) |
| self.value[start + 1 : offset_start] = num_offsets.to_bytes(size_bytes, byteorder="little") |
| # write offset list |
| offset_list = bytearray() |
| for offset in offsets: |
| offset_list.extend(offset.to_bytes(offset_size, byteorder="little")) |
| offset_list.extend(data_size.to_bytes(offset_size, byteorder="little")) |
| self.value[offset_start : offset_start + len(offset_list)] = offset_list |
| |
| # Finish writing a variant object after all of its fields have already been written. |
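    # Illustrative (traced for JSON '{"a": 1}'): the field data is 0C 01; this method prepends
    # the header byte 0x02, size 0x01, the id list 00, and offsets 00 02, yielding
    # 02 01 00 00 02 0C 01.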
| def _finish_writing_object(self, start: int, fields: List[FieldEntry]) -> None: |
| num_fields = len(fields) |
        # Object fields come from a Python dictionary, so keys are already distinct.
| fields.sort(key=lambda f: f.key) |
| max_id = 0 |
| for field in fields: |
| max_id = max(max_id, field.id) |
| |
| data_size = len(self.value) - start |
| large_size = num_fields > VariantUtils.U8_MAX |
| size_bytes = VariantUtils.U32_SIZE if large_size else 1 |
| id_size = self._get_integer_size(max_id) |
| offset_size = self._get_integer_size(data_size) |
| # The space for header byte, object size, id list, and offset list. |
| header_size = 1 + size_bytes + num_fields * id_size + (num_fields + 1) * offset_size |
| self._check_capacity(header_size) |
| self.value.extend(bytearray(header_size)) |
| # Shift the just-written field data to make room for the object header section. |
| self.value[start + header_size :] = self.value[start : start + data_size] |
| # Write the header byte, num fields, id list, offset list |
| self.value[start : start + 1] = self._object_header(large_size, id_size, offset_size) |
| self.value[start + 1 : start + 1 + size_bytes] = num_fields.to_bytes( |
| size_bytes, byteorder="little" |
| ) |
| id_start = start + 1 + size_bytes |
| offset_start = id_start + num_fields * id_size |
| id_list = bytearray() |
| offset_list = bytearray() |
| for field in fields: |
| id_list.extend(field.id.to_bytes(id_size, byteorder="little")) |
| offset_list.extend(field.offset.to_bytes(offset_size, byteorder="little")) |
| offset_list.extend(data_size.to_bytes(offset_size, byteorder="little")) |
| self.value[id_start : id_start + len(id_list)] = id_list |
| self.value[offset_start : offset_start + len(offset_list)] = offset_list |