# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from random import choice, randint
from http.server import BaseHTTPRequestHandler, HTTPServer
import io
import json
import secrets
import string
import time

import pyarrow as pa

# configuration: use chunked transfer encoding for HTTP/1.1 responses?
CHUNKED_ENCODING = True
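# When CHUNKED_ENCODING is False, or when the client speaks HTTP/1.0, the body
# is written without chunk framing and the end of the response is signaled by
# closing the connection (no Content-Length is sent for this streamed response).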


def random_string(alphabet, length):
    return "".join(choice(alphabet) for _ in range(length))


def random_name(initial):
    length = randint(3, 7)
    return initial + random_string(string.ascii_lowercase, length)


def example_tickers(num_tickers):
    tickers = []
    while len(tickers) < num_tickers:
        length = randint(3, 4)
        random_ticker = random_string(string.ascii_uppercase, length)
        if random_ticker not in tickers:
            tickers.append(random_ticker)
    return tickers


def example_json_data(tickers):
    json_data = []
    for ticker in tickers:
| description = "" |
| for c in ticker: |
| description = " ".join(random_name(c) for c in ticker) |
        json_data.append(
            {
                "ticker": ticker,
                "description": description,
            }
        )
    return json_data


the_schema = pa.schema(
    [
        ("ticker", pa.utf8()),
        ("price", pa.int64()),
        ("volume", pa.int64()),
    ]
)


def example_batch(tickers, length):
    data = {"ticker": [], "price": [], "volume": []}
    for _ in range(length):
        data["ticker"].append(choice(tickers))
        data["price"].append(randint(1, 1000) * 100)
        data["volume"].append(randint(1, 10000))

    return pa.RecordBatch.from_pydict(data, the_schema)


def example_batches(tickers):
    # these parameters are chosen to generate a response
    # of ~1 GB and chunks of ~140 KB.
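    # (each record is roughly 24 bytes: two int64 columns plus a 3-4 character
    # ticker and its 4-byte string offset, so 42,000,000 records come to about
    # 1 GB and a 6,144-row batch to roughly 140 KB)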
    total_records = 42_000_000
    batch_len = 6 * 1024
    # all the batches sent are random slices of the larger base batch
    base_batch = example_batch(tickers, length=8 * batch_len)
    batches = []
    records = 0
    while records < total_records:
        length = min(batch_len, total_records - records)
        offset = randint(0, base_batch.num_rows - length - 1)
        batch = base_batch.slice(offset, length)
        batches.append(batch)
        records += length
    return batches


# end of example data generation


def random_multipart_boundary():
    """
    Generate a random boundary string for a multipart response.

    Uses a cryptographically secure random number generator so that the
    boundary string has enough entropy to make an accidental collision with
    the response body practically impossible.

    A new boundary string is generated for each multipart response so that,
    once a boundary has been revealed to a client, it cannot be exploited to
    forge part delimiters in a later response.
    """
    # 28 bytes (224 bits) of entropy is enough to make a collision with the
    # response body practically impossible.
    # See [1] for a mathematical discussion.
    #
    # The 28 bytes are encoded into URL-safe characters (alphanumeric, -, and _)
    # so the string ends up longer than 28 characters. RFC 1341 [2] limits the
    # boundary to at most 70 characters, so we're well within that limit.
    #
    # [1] https://preshing.com/20110504/hash-collision-probabilities/
    # [2] https://www.w3.org/Protocols/rfc1341/7_2_Multipart.html
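    #
    # Note that token_urlsafe(28) returns a 38-character string: 224 bits
    # encoded at 6 bits per base64url character, rounded up.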
    return secrets.token_urlsafe(28)


def gen_arrow_multipart_buffers(boundary, schema, source, is_last_part=False):
    """
    Generate buffers for the Arrow Stream part of a multipart response.

    That is, an HTTP response that starts with the header:

        Content-Type: multipart/mixed; boundary=the_boundary_string

    The buffers, when taken together, will form the following structure:

        --the_boundary_string<CR><LF>
        Content-Type: application/vnd.apache.arrow.stream<CR><LF>
        <CR><LF>
        <Arrow Stream data>
        <CR><LF>

    If is_last_part is True, the closing delimiter (the boundary line with two
    trailing hyphens) is appended at the end of the last buffer to mark the
    end of the multipart response:

        --the_boundary_string--<CR><LF>
    """
    with io.BytesIO() as sink, pa.ipc.new_stream(sink, schema) as writer:
        part_header = (
            f"--{boundary}\r\n"
            "Content-Type: application/vnd.apache.arrow.stream\r\n"
            "\r\n"
        )
        sink.write(part_header.encode("utf-8"))
        for batch in source:
            writer.write_batch(batch)
            sink.truncate()
            with sink.getbuffer() as buffer:
                yield buffer
            sink.seek(0)

        writer.close()
        sink.write("\r\n".encode("utf-8"))
        if is_last_part:
            sink.write(f"--{boundary}--\r\n".encode("utf-8"))
        sink.truncate()
        with sink.getbuffer() as buffer:
            yield buffer


def gen_json_multipart_buffers(boundary, json_data, is_last_part=False):
    """
    Generate buffers for the JSON part of a multipart response.

    That is, an HTTP response that starts with the header:

        Content-Type: multipart/mixed; boundary=the_boundary_string

    The buffer will have the following structure:

        --the_boundary_string<CR><LF>
        Content-Type: application/json<CR><LF>
        <CR><LF>
        <serialized JSON data>
        <CR><LF>

    If is_last_part is True, the closing delimiter (the boundary line with two
    trailing hyphens) is appended at the end of the buffer to mark the end of
    the multipart response:

        --the_boundary_string--<CR><LF>

    Allocation of a big string for the JSON data is avoided by appending the
    JSON data directly to the same output buffer.
    """
    with io.BytesIO() as sink:
        with io.TextIOWrapper(sink, encoding="utf-8", write_through=True) as wrapper:
            wrapper.write(f"--{boundary}\r\n" "Content-Type: application/json\r\n\r\n")
            json.dump(json_data, wrapper)
            wrapper.write("\r\n")
            if is_last_part:
                wrapper.write(f"--{boundary}--\r\n")
            with sink.getbuffer() as buffer:
                yield buffer


def multipart_buffer_from_string(boundary, content_type, text, is_last_part=False):
    close_delimiter = f"--{boundary}--\r\n" if is_last_part else ""
    part = (
        f"--{boundary}\r\n"
        f"Content-Type: {content_type}\r\n\r\n"
        f"{text}\r\n{close_delimiter}"
    )
    return part.encode("utf-8")


class MyRequestHandler(BaseHTTPRequestHandler):
    """
    Multipart response handler for a simple HTTP server.

    This HTTP request handler serves a multipart/mixed response containing
    a JSON data part, followed by an Arrow Stream part and an optional text
    footer as the last part.

    The Arrow data is randomly generated "trading data" with a schema consisting
    of a ticker, price (in cents), and volume. The JSON header contains all the
    tickers and their descriptions. This could be returned as an Arrow table as
    well, but to illustrate the use of multiple parts in a response, it is sent
    as JSON.

    To make things more... mixed, a third part is added to the response: a
    plaintext footer containing footnotes about the request. This part is
    optional and only included if the client requests it by sending a query
    parameter `include_footnotes`.
    """

    _include_footnotes = False
    _start_arrow_stream_time = None
    _end_arrow_stream_time = None
    _number_of_arrow_data_chunks = 0
    _bytes_sent_on_arrow_stream = 0

    def _resolve_json_data_header(self):
        return the_json_data

    def _resolve_batches(self):
        return pa.RecordBatchReader.from_batches(the_schema, all_batches)

    def _build_footnotes(self):
        num_batches = len(all_batches)
        elapsed_time = self._end_arrow_stream_time - self._start_arrow_stream_time
        num_chunks = self._number_of_arrow_data_chunks
        avg_chunk_size = self._bytes_sent_on_arrow_stream / num_chunks
        text = (
            f"Hello Client,\n\n{num_batches} Arrow batch(es) were sent in "
            f"{elapsed_time:.3f} seconds through {num_chunks} HTTP\nresponse chunks. "
            f"Average size of each chunk was {avg_chunk_size:.2f} bytes.\n"
            "\n--\nSincerely,\nThe Server\n"
        )
        return text

    def _gen_buffers(self, boundary, json_header, schema, source):
        # JSON header
        yield from gen_json_multipart_buffers(boundary, json_header)
        # Arrow data
        is_last_part = not self._include_footnotes
        self._start_arrow_stream_time = time.time()
        for buffer in gen_arrow_multipart_buffers(
            boundary, schema, source, is_last_part=is_last_part
        ):
            self._number_of_arrow_data_chunks += 1
            self._bytes_sent_on_arrow_stream += len(buffer)
            yield buffer
        self._end_arrow_stream_time = time.time()
        # Footnotes (optional)
        if self._include_footnotes:
            footnotes = self._build_footnotes()
            yield multipart_buffer_from_string(
                boundary, "text/plain", footnotes, is_last_part=True
            )

    def do_GET(self):
        ### note: this string check is enough for the example; always use
        ### urllib.parse.urlparse to parse the path and query in real applications.
        self._include_footnotes = self.path.endswith("?include_footnotes")
        # reset the per-request counters: the same handler instance can serve
        # several requests on a keep-alive connection.
        self._number_of_arrow_data_chunks = 0
        self._bytes_sent_on_arrow_stream = 0
        ### in a real application the data would be resolved from a database or
        ### another source (such as a file), and error handling would happen here,
        ### before the 200 OK response starts being sent to the client.
        json_data_header = self._resolve_json_data_header()
        source = self._resolve_batches()

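        # HTTP/1.0 clients do not support chunked transfer encoding, so for
        # them the body is streamed without chunk framing and the end of the
        # response is signaled by closing the connection.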
        if self.request_version == "HTTP/1.0":
            self.protocol_version = "HTTP/1.0"
            chunked = False
        else:
            self.protocol_version = "HTTP/1.1"
            chunked = CHUNKED_ENCODING

        self.send_response(200)
        boundary = random_multipart_boundary()
        self.send_header("Content-Type", f"multipart/mixed; boundary={boundary}")
        ### set these headers if testing with a local browser-based client:
        # self.send_header('Access-Control-Allow-Origin', 'http://localhost:8008')
        # self.send_header('Access-Control-Allow-Methods', 'GET')
        # self.send_header('Access-Control-Allow-Headers', 'Content-Type')
        if chunked:
            self.send_header("Transfer-Encoding", "chunked")

        self.end_headers()

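        # With chunked transfer encoding each buffer becomes one HTTP chunk:
        # the chunk size in hexadecimal, CRLF, the payload, CRLF; a final
        # zero-length chunk ("0\r\n\r\n") terminates the response body.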
        for buffer in self._gen_buffers(boundary, json_data_header, the_schema, source):
            if chunked:
                self.wfile.write(f"{len(buffer):X}\r\n".encode("utf-8"))
            self.wfile.write(buffer)
            if chunked:
                self.wfile.write("\r\n".encode("utf-8"))
            self.wfile.flush()

        if chunked:
            self.wfile.write("0\r\n\r\n".encode("utf-8"))
            self.wfile.flush()


print("Generating example data...")
all_tickers = example_tickers(60)
all_batches = example_batches(all_tickers)
the_json_data = example_json_data(all_tickers)

server_address = ("localhost", 8008)
try:
    httpd = HTTPServer(server_address, MyRequestHandler)
    print(f"Serving on {server_address[0]}:{server_address[1]}...")
    httpd.serve_forever()
except KeyboardInterrupt:
    print("Shutting down server")
    httpd.server_close()
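# A minimal client sketch (illustrative only, not executed by this script): it
# buffers the whole response in memory and uses the standard-library email
# parser to split the multipart body. The URL and media types match the server
# above; everything else is just one possible way a client could consume the
# response, not a prescribed one.
#
#   import email
#   import email.policy
#   import json
#   import urllib.request
#
#   import pyarrow as pa
#
#   with urllib.request.urlopen("http://localhost:8008/") as resp:
#       content_type = resp.headers["Content-Type"]
#       body = resp.read()
#   # prepend the Content-Type header so the email parser can see the boundary
#   msg = email.message_from_bytes(
#       f"Content-Type: {content_type}\r\n\r\n".encode("utf-8") + body,
#       policy=email.policy.HTTP,
#   )
#   json_part, arrow_part = list(msg.iter_parts())[:2]
#   tickers = json.loads(json_part.get_payload(decode=True))
#   table = pa.ipc.open_stream(arrow_part.get_payload(decode=True)).read_all()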