python/pyspark/sql/interchange.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 from typing import Iterator, Optional

 import pyarrow as pa

 import pyspark.sql
 from pyspark.sql.types import StructType, StructField, BinaryType
 from pyspark.sql.pandas.types import to_arrow_schema


 def _get_arrow_array_partition_stream(df: pyspark.sql.DataFrame) -> Iterator[pa.RecordBatch]:
     """Return all the partitions as Arrow arrays in an Iterator."""
     # We will be using mapInArrow to convert each partition to Arrow RecordBatches.
     # The return type of the function will be a single binary column containing
     # the serialized RecordBatch in Arrow IPC format.
     binary_schema = StructType([StructField("arrow_ipc_bytes", BinaryType(), nullable=False)])

     def batch_to_bytes_iter(batch_iter: Iterator[pa.RecordBatch]) -> Iterator[pa.RecordBatch]:
         """
         A generator function that converts RecordBatches to serialized Arrow IPC format.

         Spark sends each partition as an iterator of RecordBatches. In order to return
         the entire partition as a stream of Arrow RecordBatches, we need to serialize
         each RecordBatch to Arrow IPC format and yield it as a single binary blob.
         """
         # The size of the batch can be controlled by the Spark config
         # `spark.sql.execution.arrow.maxRecordsPerBatch`.
         for arrow_batch in batch_iter:
             # We create an in-memory byte stream to hold the serialized batch
             sink = pa.BufferOutputStream()
             # Write the batch to the stream using Arrow IPC format
             with pa.ipc.new_stream(sink, arrow_batch.schema) as writer:
                 writer.write_batch(arrow_batch)
             buf = sink.getvalue()
             # The second buffer contains the offsets we are manually creating.
             offset_buf = pa.array([0, len(buf)], type=pa.int32()).buffers()[1]
             null_bitmap = None
             # Wrap the bytes in a new 1-row, 1-column RecordBatch to satisfy mapInArrow return
             # signature. This serializes the whole batch into a single pyarrow serialized cell.
             storage_arr = pa.Array.from_buffers(
                 type=pa.binary(), length=1, buffers=[null_bitmap, offset_buf, buf]
             )
             yield pa.RecordBatch.from_arrays([storage_arr], names=["arrow_ipc_bytes"])

     # Convert all partitions to Arrow RecordBatches and map to binary blobs.
     byte_df = df.mapInArrow(batch_to_bytes_iter, binary_schema)
     # A row is actually a batch of data in Arrow IPC format. Fetch the batches one by one.
     for row in byte_df.toLocalIterator():
         with pa.ipc.open_stream(row.arrow_ipc_bytes) as reader:
             for batch in reader:
                 # Each batch corresponds to a chunk of data in the partition.
                 yield batch


 class SparkArrowCStreamer:
     """
     A class that implements that __arrow_c_stream__ protocol for Spark partitions.

     This class is implemented in a way that allows consumers to consume each partition
     one at a time without materializing all partitions at once on the driver side.
     """

     def __init__(self, df: pyspark.sql.DataFrame):
         self._df = df
         self._schema = to_arrow_schema(df.schema)

     def __arrow_c_stream__(self, requested_schema: Optional[object] = None) -> object:
         """
         Return the Arrow C stream for the dataframe partitions.
         """
         reader: pa.RecordBatchReader = pa.RecordBatchReader.from_batches(
             self._schema, _get_arrow_array_partition_stream(self._df)
         )
         return reader.__arrow_c_stream__(requested_schema=requested_schema)
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	from typing import Iterator, Optional

	import pyarrow as pa

	import pyspark.sql
	from pyspark.sql.types import StructType, StructField, BinaryType
	from pyspark.sql.pandas.types import to_arrow_schema


	def _get_arrow_array_partition_stream(df: pyspark.sql.DataFrame) -> Iterator[pa.RecordBatch]:
	"""Return all the partitions as Arrow arrays in an Iterator."""
	# We will be using mapInArrow to convert each partition to Arrow RecordBatches.
	# The return type of the function will be a single binary column containing
	# the serialized RecordBatch in Arrow IPC format.
	binary_schema = StructType([StructField("arrow_ipc_bytes", BinaryType(), nullable=False)])

	def batch_to_bytes_iter(batch_iter: Iterator[pa.RecordBatch]) -> Iterator[pa.RecordBatch]:
	"""
	A generator function that converts RecordBatches to serialized Arrow IPC format.

	Spark sends each partition as an iterator of RecordBatches. In order to return
	the entire partition as a stream of Arrow RecordBatches, we need to serialize
	each RecordBatch to Arrow IPC format and yield it as a single binary blob.
	"""
	# The size of the batch can be controlled by the Spark config
	# `spark.sql.execution.arrow.maxRecordsPerBatch`.
	for arrow_batch in batch_iter:
	# We create an in-memory byte stream to hold the serialized batch
	sink = pa.BufferOutputStream()
	# Write the batch to the stream using Arrow IPC format
	with pa.ipc.new_stream(sink, arrow_batch.schema) as writer:
	writer.write_batch(arrow_batch)
	buf = sink.getvalue()
	# The second buffer contains the offsets we are manually creating.
	offset_buf = pa.array([0, len(buf)], type=pa.int32()).buffers()[1]
	null_bitmap = None
	# Wrap the bytes in a new 1-row, 1-column RecordBatch to satisfy mapInArrow return
	# signature. This serializes the whole batch into a single pyarrow serialized cell.
	storage_arr = pa.Array.from_buffers(
	type=pa.binary(), length=1, buffers=[null_bitmap, offset_buf, buf]
	)
	yield pa.RecordBatch.from_arrays([storage_arr], names=["arrow_ipc_bytes"])

	# Convert all partitions to Arrow RecordBatches and map to binary blobs.
	byte_df = df.mapInArrow(batch_to_bytes_iter, binary_schema)
	# A row is actually a batch of data in Arrow IPC format. Fetch the batches one by one.
	for row in byte_df.toLocalIterator():
	with pa.ipc.open_stream(row.arrow_ipc_bytes) as reader:
	for batch in reader:
	# Each batch corresponds to a chunk of data in the partition.
	yield batch


	class SparkArrowCStreamer:
	"""
	A class that implements that __arrow_c_stream__ protocol for Spark partitions.

	This class is implemented in a way that allows consumers to consume each partition
	one at a time without materializing all partitions at once on the driver side.
	"""

	def __init__(self, df: pyspark.sql.DataFrame):
	self._df = df
	self._schema = to_arrow_schema(df.schema)

	def __arrow_c_stream__(self, requested_schema: Optional[object] = None) -> object:
	"""
	Return the Arrow C stream for the dataframe partitions.
	"""
	reader: pa.RecordBatchReader = pa.RecordBatchReader.from_batches(
	self._schema, _get_arrow_array_partition_stream(self._df)
	)
	return reader.__arrow_c_stream__(requested_schema=requested_schema)