scripts/generate-comparison.py - datafusion-benchmarks - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import argparse
 import json
 import matplotlib.pyplot as plt
 import numpy as np

 def geomean(data):
     """Return the geometric mean of ``data``.

     For strictly positive inputs the mean is computed in log space
     (``exp(mean(log(x)))``) so that long or large-valued inputs do not
     overflow or underflow an intermediate product.

     Args:
         data: Non-empty sequence or array of numbers. It is converted to a
             flat float array before the mean is taken.

     Returns:
         The geometric mean as a NumPy float.

     Raises:
         ValueError: If ``data`` is empty.
     """
     values = np.asarray(data, dtype=float).ravel()
     if values.size == 0:
         raise ValueError("geomean requires at least one value")
     if np.any(values <= 0):
         # Logarithms are undefined for zero/negative values, so fall back
         # to the direct product formula for such inputs.
         return np.prod(values) ** (1 / values.size)
     return np.exp(np.mean(np.log(values)))

 def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
     """Plot the per-query speedup of ``comparison`` relative to ``baseline``.

     The median timing of every query is taken from both runs. When the
     comparison run is at least as fast, the speedup is
     ``baseline / comparison - 1``; when it is slower, the value is
     ``-(comparison / baseline - 1)``. Values are rounded to whole
     percentages, sorted from largest to smallest, and drawn as a bar chart
     saved to ``{benchmark}_queries_speedup.png``.

     Args:
         baseline: Results of the reference run, indexed by the query number
             as a string (``"1"``, ``"2"``, ...) and holding that query's
             timings.
         comparison: Results of the run being compared, in the same layout.
         label1: Display name of the baseline run.
         label2: Display name of the comparison run.
         benchmark: Benchmark name accepted by ``query_count``.
         title: Text shown in parentheses at the end of the chart title.
     """
     results = []
     for query in range(1, query_count(benchmark) + 1):
         a = np.median(np.array(baseline[str(query)]))
         b = np.median(np.array(comparison[str(query)]))
         # ">=" (not ">") so identical medians yield 0.0 and are labelled
         # "0%" instead of the "-0%" produced by negating zero.
         if a >= b:
             speedup = a / b - 1
         else:
             speedup = -(b / a - 1)
         results.append(("q" + str(query), round(speedup * 100, 0)))

     # Largest speedup first; the sort is stable, so ties keep query order.
     results.sort(key=lambda item: item[1], reverse=True)

     queries, speedups = zip(*results)

     # Benchmarks other than tpch get a much larger canvas.
     figsize = (10, 6) if benchmark == "tpch" else (35, 10)
     fig, ax = plt.subplots(figsize=figsize)

     # Create bar chart
     bars = ax.bar(queries, speedups, color='skyblue')

     # Label every bar with its percentage: above positive bars, below
     # negative ones.
     for bar in bars:
         yval = bar.get_height()
         x_center = bar.get_x() + bar.get_width() / 2.0
         if yval >= 0:
             # The label position is capped at y=800.
             ax.text(x_center, min(800, yval + 5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
                     color='blue', rotation=90)
         else:
             ax.text(x_center, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
                     color='blue', rotation=90)

     # Add title and labels
     ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
     ax.set_ylabel('Speedup (100% speedup = 2x faster)')
     ax.set_xlabel('Query')

     # Draw the zero line so positive and negative bars are easy to tell apart
     ax.axhline(0, color='black', linewidth=0.8)
     if benchmark == "tpch":
         # Snap the limits to multiples of 100, leaving headroom for labels.
         min_value = (min(speedups) // 100) * 100
         max_value = ((max(speedups) // 100) + 1) * 100 + 50
         ax.set_ylim(min_value, max_value)
     else:
         # TODO improve this
         ax.set_ylim(-250, 300)

     # Show grid for better readability
     ax.yaxis.grid(True)

     # Save the plot as an image file, then release the figure so repeated
     # calls do not accumulate open figures.
     fig.savefig(f'{benchmark}_queries_speedup.png', format='png')
     plt.close(fig)


 def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
     """Plot the median time of every query for each run side by side.

     One group of bars is drawn per query, with one bar per entry in
     ``results``. The chart is saved to ``{benchmark}_queries_compare.png``.

     Args:
         results: One entry per run; each is indexed by the query number as a
             string (``"1"``, ``"2"``, ...) and holds that query's timings.
         labels: Legend label for each run, in the same order as ``results``.
         benchmark: Benchmark name accepted by ``query_count``.
         title: Chart title.
     """
     query_numbers = range(1, query_count(benchmark) + 1)
     queries = ["q" + str(query) for query in query_numbers]
     # benches[i][j] is the median timing of query j+1 in run i.
     benches = [
         [np.median(np.array(result[str(query)])) for query in query_numbers]
         for result in results
     ]

     # Define the width of the bars
     bar_width = 0.3

     # Left edge of each query's group of bars on the x-axis
     index = np.arange(len(queries)) * 1.5

     # Benchmarks other than tpch get a much wider canvas.
     figsize = (15, 6) if benchmark == "tpch" else (35, 6)
     fig, ax = plt.subplots(figsize=figsize)

     for i, timings in enumerate(benches):
         ax.bar(index + i * bar_width, timings, bar_width, label=labels[i])

     # Add labels, title, and legend
     ax.set_title(title)
     ax.set_xlabel('Queries')
     ax.set_ylabel('Query Time (seconds)')
     # Centre each tick under its group of bars regardless of how many runs
     # are plotted (a fixed bar_width / 2 offset is only right for two runs).
     ax.set_xticks(index + bar_width * (len(results) - 1) / 2)
     ax.set_xticklabels(queries)
     ax.legend()

     # Save the plot as an image file, then release the figure so repeated
     # calls do not accumulate open figures.
     fig.savefig(f'{benchmark}_queries_compare.png', format='png')
     plt.close(fig)

 def generate_summary(results, labels, benchmark: str, title: str):
     """Plot the total time each run needs for the whole benchmark.

     For every run, the median timing of each query is summed. The totals,
     rounded to whole numbers, are drawn as one bar per run and the chart is
     saved to ``{benchmark}_allqueries.png``.

     Args:
         results: One entry per run; each is indexed by the query number as a
             string (``"1"``, ``"2"``, ...) and holds that query's timings.
         labels: Bar label for each run, in the same order as ``results``.
         benchmark: Benchmark name accepted by ``query_count``.
         title: Chart title.
     """
     num_queries = query_count(benchmark)
     timings = [
         sum(np.median(np.array(result[str(query)])) for query in range(1, num_queries + 1))
         for result in results
     ]

     # Create figure and axis
     fig, ax = plt.subplots()

     # Add title and labels
     ax.set_title(title)
     ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')

     times = [round(x, 0) for x in timings]

     # Create bar chart
     bars = ax.bar(labels, times, color='skyblue')

     # Print each total just above its bar, horizontally centred on the bar.
     for bar in bars:
         yval = bar.get_height()
         ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom', ha='center')

     # Save the plot, then release the figure so repeated calls do not
     # accumulate open figures.
     fig.savefig(f'{benchmark}_allqueries.png', format='png')
     plt.close(fig)

 def query_count(benchmark: str) -> int:
     """Return the number of queries in the named benchmark.

     Args:
         benchmark: Either ``"tpch"`` or ``"tpcds"``.

     Returns:
         22 for ``"tpch"`` and 99 for ``"tpcds"``.

     Raises:
         ValueError: If ``benchmark`` is any other name.
     """
     if benchmark == "tpch":
         return 22
     if benchmark == "tpcds":
         return 99
     # Raising a bare string is itself a TypeError in Python 3; raise a real
     # exception that names the offending value instead.
     raise ValueError(f"invalid benchmark name: {benchmark!r}")

 def main(files, labels, benchmark: str, title: str):
     """Load the result files and generate every applicable chart.

     The summary and per-query comparison charts are always produced; the
     speedup chart is produced only when exactly two result files are given.

     Args:
         files: Paths of the JSON result files, one per run.
         labels: Display label for each run, in the same order as ``files``.
             ``None`` means "use the file names as labels".
         benchmark: Benchmark name accepted by ``query_count``.
         title: Chart title.

     Raises:
         ValueError: If the number of labels differs from the number of files.
     """
     if labels is None:
         # No labels supplied: fall back to the file names.
         labels = list(files)
     # Check up front so a mismatch is reported clearly instead of surfacing
     # as an indexing or plotting error part-way through chart generation.
     if len(labels) != len(files):
         raise ValueError(
             f"expected one label per result file, got {len(labels)} labels for {len(files)} files"
         )

     results = []
     for filename in files:
         with open(filename, encoding="utf-8") as f:
             results.append(json.load(f))

     generate_summary(results, labels, benchmark, title)
     generate_query_comparison_chart(results, labels, benchmark, title)
     if len(files) == 2:
         generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)

 if __name__ == '__main__':
     # Command-line entry point. The parser gets its own name so that the
     # ``argparse`` module is not shadowed by the parser instance.
     parser = argparse.ArgumentParser(description='Generate comparison')
     parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
     parser.add_argument('--labels', nargs='+', type=str, help='Labels')
     # Only these two names are understood by query_count, so reject anything
     # else (or a missing value) at parse time with a usage message.
     parser.add_argument('--benchmark', type=str, required=True, choices=['tpch', 'tpcds'],
                         help='Benchmark name (tpch or tpcds)')
     parser.add_argument('--title', type=str, help='Chart title')
     args = parser.parse_args()
     main(args.filenames, args.labels, args.benchmark, args.title)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import argparse
	import json
	import matplotlib.pyplot as plt
	import numpy as np

	def geomean(data):
	    """Return the geometric mean of ``data``.

	    For strictly positive inputs the mean is computed in log space
	    (``exp(mean(log(x)))``) so that long or large-valued inputs do not
	    overflow or underflow an intermediate product.

	    Args:
	        data: Non-empty sequence or array of numbers. It is converted to a
	            flat float array before the mean is taken.

	    Returns:
	        The geometric mean as a NumPy float.

	    Raises:
	        ValueError: If ``data`` is empty.
	    """
	    values = np.asarray(data, dtype=float).ravel()
	    if values.size == 0:
	        raise ValueError("geomean requires at least one value")
	    if np.any(values <= 0):
	        # Logarithms are undefined for zero/negative values, so fall back
	        # to the direct product formula for such inputs.
	        return np.prod(values) ** (1 / values.size)
	    return np.exp(np.mean(np.log(values)))

	def generate_query_speedup_chart(baseline, comparison, label1: str, label2: str, benchmark: str, title: str):
	    """Plot the per-query speedup of ``comparison`` relative to ``baseline``.

	    The median timing of every query is taken from both runs. When the
	    comparison run is at least as fast, the speedup is
	    ``baseline / comparison - 1``; when it is slower, the value is
	    ``-(comparison / baseline - 1)``. Values are rounded to whole
	    percentages, sorted from largest to smallest, and drawn as a bar chart
	    saved to ``{benchmark}_queries_speedup.png``.

	    Args:
	        baseline: Results of the reference run, indexed by the query number
	            as a string (``"1"``, ``"2"``, ...) and holding that query's
	            timings.
	        comparison: Results of the run being compared, in the same layout.
	        label1: Display name of the baseline run.
	        label2: Display name of the comparison run.
	        benchmark: Benchmark name accepted by ``query_count``.
	        title: Text shown in parentheses at the end of the chart title.
	    """
	    results = []
	    for query in range(1, query_count(benchmark) + 1):
	        a = np.median(np.array(baseline[str(query)]))
	        b = np.median(np.array(comparison[str(query)]))
	        # ">=" (not ">") so identical medians yield 0.0 and are labelled
	        # "0%" instead of the "-0%" produced by negating zero.
	        if a >= b:
	            speedup = a / b - 1
	        else:
	            speedup = -(b / a - 1)
	        results.append(("q" + str(query), round(speedup * 100, 0)))

	    # Largest speedup first; the sort is stable, so ties keep query order.
	    results.sort(key=lambda item: item[1], reverse=True)

	    queries, speedups = zip(*results)

	    # Benchmarks other than tpch get a much larger canvas.
	    figsize = (10, 6) if benchmark == "tpch" else (35, 10)
	    fig, ax = plt.subplots(figsize=figsize)

	    # Create bar chart
	    bars = ax.bar(queries, speedups, color='skyblue')

	    # Label every bar with its percentage: above positive bars, below
	    # negative ones.
	    for bar in bars:
	        yval = bar.get_height()
	        x_center = bar.get_x() + bar.get_width() / 2.0
	        if yval >= 0:
	            # The label position is capped at y=800.
	            ax.text(x_center, min(800, yval + 5), f'{yval:.0f}%', va='bottom', ha='center', fontsize=8,
	                    color='blue', rotation=90)
	        else:
	            ax.text(x_center, yval, f'{yval:.0f}%', va='top', ha='center', fontsize=8,
	                    color='blue', rotation=90)

	    # Add title and labels
	    ax.set_title(label2 + " speedup over " + label1 + " (" + title + ")")
	    ax.set_ylabel('Speedup (100% speedup = 2x faster)')
	    ax.set_xlabel('Query')

	    # Draw the zero line so positive and negative bars are easy to tell apart
	    ax.axhline(0, color='black', linewidth=0.8)
	    if benchmark == "tpch":
	        # Snap the limits to multiples of 100, leaving headroom for labels.
	        min_value = (min(speedups) // 100) * 100
	        max_value = ((max(speedups) // 100) + 1) * 100 + 50
	        ax.set_ylim(min_value, max_value)
	    else:
	        # TODO improve this
	        ax.set_ylim(-250, 300)

	    # Show grid for better readability
	    ax.yaxis.grid(True)

	    # Save the plot as an image file, then release the figure so repeated
	    # calls do not accumulate open figures.
	    fig.savefig(f'{benchmark}_queries_speedup.png', format='png')
	    plt.close(fig)


	def generate_query_comparison_chart(results, labels, benchmark: str, title: str):
	    """Plot the median time of every query for each run side by side.

	    One group of bars is drawn per query, with one bar per entry in
	    ``results``. The chart is saved to ``{benchmark}_queries_compare.png``.

	    Args:
	        results: One entry per run; each is indexed by the query number as a
	            string (``"1"``, ``"2"``, ...) and holds that query's timings.
	        labels: Legend label for each run, in the same order as ``results``.
	        benchmark: Benchmark name accepted by ``query_count``.
	        title: Chart title.
	    """
	    query_numbers = range(1, query_count(benchmark) + 1)
	    queries = ["q" + str(query) for query in query_numbers]
	    # benches[i][j] is the median timing of query j+1 in run i.
	    benches = [
	        [np.median(np.array(result[str(query)])) for query in query_numbers]
	        for result in results
	    ]

	    # Define the width of the bars
	    bar_width = 0.3

	    # Left edge of each query's group of bars on the x-axis
	    index = np.arange(len(queries)) * 1.5

	    # Benchmarks other than tpch get a much wider canvas.
	    figsize = (15, 6) if benchmark == "tpch" else (35, 6)
	    fig, ax = plt.subplots(figsize=figsize)

	    for i, timings in enumerate(benches):
	        ax.bar(index + i * bar_width, timings, bar_width, label=labels[i])

	    # Add labels, title, and legend
	    ax.set_title(title)
	    ax.set_xlabel('Queries')
	    ax.set_ylabel('Query Time (seconds)')
	    # Centre each tick under its group of bars regardless of how many runs
	    # are plotted (a fixed bar_width / 2 offset is only right for two runs).
	    ax.set_xticks(index + bar_width * (len(results) - 1) / 2)
	    ax.set_xticklabels(queries)
	    ax.legend()

	    # Save the plot as an image file, then release the figure so repeated
	    # calls do not accumulate open figures.
	    fig.savefig(f'{benchmark}_queries_compare.png', format='png')
	    plt.close(fig)

	def generate_summary(results, labels, benchmark: str, title: str):
	    """Plot the total time each run needs for the whole benchmark.

	    For every run, the median timing of each query is summed. The totals,
	    rounded to whole numbers, are drawn as one bar per run and the chart is
	    saved to ``{benchmark}_allqueries.png``.

	    Args:
	        results: One entry per run; each is indexed by the query number as a
	            string (``"1"``, ``"2"``, ...) and holds that query's timings.
	        labels: Bar label for each run, in the same order as ``results``.
	        benchmark: Benchmark name accepted by ``query_count``.
	        title: Chart title.
	    """
	    num_queries = query_count(benchmark)
	    timings = [
	        sum(np.median(np.array(result[str(query)])) for query in range(1, num_queries + 1))
	        for result in results
	    ]

	    # Create figure and axis
	    fig, ax = plt.subplots()

	    # Add title and labels
	    ax.set_title(title)
	    ax.set_ylabel(f'Time in seconds to run all {num_queries} {benchmark} queries (lower is better)')

	    times = [round(x, 0) for x in timings]

	    # Create bar chart
	    bars = ax.bar(labels, times, color='skyblue')

	    # Print each total just above its bar, horizontally centred on the bar.
	    for bar in bars:
	        yval = bar.get_height()
	        ax.text(bar.get_x() + bar.get_width() / 2.0, yval, f'{yval}', va='bottom', ha='center')

	    # Save the plot, then release the figure so repeated calls do not
	    # accumulate open figures.
	    fig.savefig(f'{benchmark}_allqueries.png', format='png')
	    plt.close(fig)

	def query_count(benchmark: str) -> int:
	    """Return the number of queries in the named benchmark.

	    Args:
	        benchmark: Either ``"tpch"`` or ``"tpcds"``.

	    Returns:
	        22 for ``"tpch"`` and 99 for ``"tpcds"``.

	    Raises:
	        ValueError: If ``benchmark`` is any other name.
	    """
	    if benchmark == "tpch":
	        return 22
	    if benchmark == "tpcds":
	        return 99
	    # Raising a bare string is itself a TypeError in Python 3; raise a real
	    # exception that names the offending value instead.
	    raise ValueError(f"invalid benchmark name: {benchmark!r}")

	def main(files, labels, benchmark: str, title: str):
	    """Load the result files and generate every applicable chart.

	    The summary and per-query comparison charts are always produced; the
	    speedup chart is produced only when exactly two result files are given.

	    Args:
	        files: Paths of the JSON result files, one per run.
	        labels: Display label for each run, in the same order as ``files``.
	            ``None`` means "use the file names as labels".
	        benchmark: Benchmark name accepted by ``query_count``.
	        title: Chart title.

	    Raises:
	        ValueError: If the number of labels differs from the number of files.
	    """
	    if labels is None:
	        # No labels supplied: fall back to the file names.
	        labels = list(files)
	    # Check up front so a mismatch is reported clearly instead of surfacing
	    # as an indexing or plotting error part-way through chart generation.
	    if len(labels) != len(files):
	        raise ValueError(
	            f"expected one label per result file, got {len(labels)} labels for {len(files)} files"
	        )

	    results = []
	    for filename in files:
	        with open(filename, encoding="utf-8") as f:
	            results.append(json.load(f))

	    generate_summary(results, labels, benchmark, title)
	    generate_query_comparison_chart(results, labels, benchmark, title)
	    if len(files) == 2:
	        generate_query_speedup_chart(results[0], results[1], labels[0], labels[1], benchmark, title)

	if __name__ == '__main__':
	    # Command-line entry point. The parser gets its own name so that the
	    # ``argparse`` module is not shadowed by the parser instance.
	    parser = argparse.ArgumentParser(description='Generate comparison')
	    parser.add_argument('filenames', nargs='+', type=str, help='JSON result files')
	    parser.add_argument('--labels', nargs='+', type=str, help='Labels')
	    # Only these two names are understood by query_count, so reject anything
	    # else (or a missing value) at parse time with a usage message.
	    parser.add_argument('--benchmark', type=str, required=True, choices=['tpch', 'tpcds'],
	                        help='Benchmark name (tpch or tpcds)')
	    parser.add_argument('--title', type=str, help='Chart title')
	    args = parser.parse_args()
	    main(args.filenames, args.labels, args.benchmark, args.title)