blob: 9f7112e47330d596961ba85f86fcb9c99959ab55 [file] [log] [blame]
#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import argparse
import os
import platform
import re
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
try:
import psutil
HAS_PSUTIL = True
except ImportError:
HAS_PSUTIL = False
# Bar colour (hex RGB) used for each serializer in the generated plots.
COLORS = {
    "fory": "#FF6f01",
    "protobuf": "#55BCC2",
}
# Serializers included in the report, in the order they are plotted and
# listed in the result tables.
SERIALIZER_ORDER = ["fory", "protobuf"]
# Display label for each serializer (x tick labels and legend entries).
SERIALIZER_LABELS = {
    "fory": "fory",
    "protobuf": "protobuf",
}
# Benchmark group names recognised in the log, in report order. Names ending
# in "list" are plotted separately from the others in the combined figure.
DATATYPE_ORDER = [
    "struct",
    "sample",
    "mediacontent",
    "structlist",
    "samplelist",
    "mediacontentlist",
]
# Operations reported for every datatype.
OPERATIONS = ["serialize", "deserialize"]
# Multiplier that converts a value expressed in the given unit to nanoseconds.
UNIT_TO_NS = {
    "ps": 1e-3,
    "ns": 1.0,
    "us": 1e3,
    "µs": 1e3,
    "ms": 1e6,
    "s": 1e9,
}
def parse_args():
    """Parse the command-line options that control report generation.

    Returns:
        argparse.Namespace: with ``log_file``, ``size_file``, ``output_dir``
        and ``plot_prefix`` attributes; each falls back to its default when
        the corresponding flag is not given.
    """
    # (flag, default value, help text) for every supported option.
    option_table = (
        ("--log-file", "results/cargo_bench.log", "Criterion cargo bench log file"),
        (
            "--size-file",
            "results/serialized_sizes.txt",
            "Serialized size table generated by fory_profiler",
        ),
        ("--output-dir", "results", "Output directory for report artifacts"),
        ("--plot-prefix", "", "Image path prefix inside the markdown report"),
    )
    cli = argparse.ArgumentParser(
        description="Generate Rust benchmark report from Criterion output"
    )
    for flag, fallback, description in option_table:
        cli.add_argument(flag, default=fallback, help=description)
    return cli.parse_args()
def datatype_title(datatype):
    """Return the display title for a lower-case datatype name.

    ``mediacontent`` and ``mediacontentlist`` have fixed spellings. Any other
    name ending in ``list`` becomes ``<Capitalized stem>List``, and everything
    else is simply capitalized.
    """
    fixed_titles = {
        "mediacontent": "MediaContent",
        "mediacontentlist": "MediaContentList",
    }
    if datatype in fixed_titles:
        return fixed_titles[datatype]
    if not datatype.endswith("list"):
        return datatype.capitalize()
    # Drop the trailing "list" (4 characters) before capitalizing the stem.
    stem = datatype[:-4]
    return stem.capitalize() + "List"
def datatype_plot_label(datatype):
    """Return the axis tick label for a lower-case datatype name.

    Same naming rules as the report titles, except that list variants put
    ``List`` on a second line (separated by a newline).
    """
    fixed_labels = {
        "mediacontent": "MediaContent",
        "mediacontentlist": "MediaContent\nList",
    }
    if datatype in fixed_labels:
        return fixed_labels[datatype]
    if not datatype.endswith("list"):
        return datatype.capitalize()
    # Drop the trailing "list" (4 characters) before capitalizing the stem.
    stem = datatype[:-4]
    return stem.capitalize() + "\nList"
def get_system_info(log_file):
    """Collect host details for the "Hardware & OS Info" table of the report.

    Args:
        log_file: Path of the benchmark log. When the file exists, its
            modification time is reported as the benchmark date.

    Returns:
        dict: display name -> value, in insertion order. CPU core counts and
        RAM size are only present when ``psutil`` could be imported.
    """
    details = {}
    details["OS"] = f"{platform.system()} {platform.release()}"
    details["Machine"] = platform.machine()
    details["Processor"] = platform.processor() or "Unknown"

    if HAS_PSUTIL:
        details.update(
            {
                "CPU Cores (Physical)": psutil.cpu_count(logical=False),
                "CPU Cores (Logical)": psutil.cpu_count(logical=True),
                # Bytes -> GiB, rounded to two decimals.
                "Total RAM (GB)": round(
                    psutil.virtual_memory().total / (1024**3), 2
                ),
            }
        )

    if not os.path.exists(log_file):
        return details

    modified_at = datetime.fromtimestamp(os.path.getmtime(log_file))
    details["Benchmark Date"] = modified_at.isoformat(timespec="seconds")
    return details
def parse_time_ns(measurement):
    """Convert the contents of a Criterion ``time: [...]`` entry to nanoseconds.

    Args:
        measurement: Text found between the square brackets, expected to be
            whitespace-separated ``value unit`` pairs such as
            ``"1.23 us 1.25 us 1.27 us"``.

    Returns:
        float: The second value of the entry (tokens 2 and 3), scaled to
        nanoseconds via ``UNIT_TO_NS``.

    Raises:
        ValueError: If the entry has fewer than four tokens, the value is not
            a number, or the unit is not a key of ``UNIT_TO_NS``.
    """
    # Two different Unicode characters render as "mu"; fold the first onto the
    # second so the unit lookup only has to deal with one spelling.
    tokens = measurement.replace("μ", "µ").split()
    if len(tokens) < 4:
        raise ValueError(f"unexpected criterion timing format: {measurement}")
    median_value = float(tokens[2])
    median_unit = tokens[3]
    try:
        scale = UNIT_TO_NS[median_unit]
    except KeyError:
        # Report an unknown unit the same way as any other malformed entry
        # instead of leaking a bare KeyError with no context.
        raise ValueError(
            f"unexpected criterion timing unit {median_unit!r}: {measurement}"
        ) from None
    return median_value * scale
def load_benchmark_results(log_file):
    """Extract benchmark timings from a Criterion log.

    Every ``Benchmarking <datatype>/<serializer>_<operation>`` header is
    paired with the first ``time: [...]`` entry that follows it. Entries whose
    datatype, serializer or operation is not listed in the module constants
    are ignored. If the same benchmark appears more than once, the last
    occurrence wins.

    Args:
        log_file: Path of the UTF-8 encoded log file.

    Returns:
        defaultdict: ``results[datatype][operation][serializer]`` -> time in
        nanoseconds.
    """
    entry_re = re.compile(
        r"Benchmarking\s+([A-Za-z0-9_]+)/([A-Za-z0-9_]+).*?time:\s+\[([^\]]+)\]",
        re.DOTALL,
    )
    with open(log_file, "r", encoding="utf-8") as handle:
        log_text = handle.read()

    timings = defaultdict(lambda: defaultdict(dict))
    for found in entry_re.finditer(log_text):
        group_name, bench_id, raw_time = found.groups()
        # "fory_serialize" -> ("fory", "_", "serialize"); the separator is
        # empty when the benchmark id has no underscore at all.
        serializer, separator, operation = bench_id.partition("_")
        recognised = (
            group_name in DATATYPE_ORDER
            and separator
            and serializer in SERIALIZER_ORDER
            and operation in OPERATIONS
        )
        if not recognised:
            continue
        timings[group_name][operation][serializer] = parse_time_ns(raw_time)
    return timings
def load_serialized_sizes(size_file):
    """Read the serialized-size table from a markdown-style text file.

    Only lines of the form ``| name | <int> | <int> |`` are used; every other
    line, and any row named ``Datatype``, is skipped.

    Args:
        size_file: Path of the UTF-8 encoded size table.

    Returns:
        dict: ``{name: {"fory": int, "protobuf": int}}``; empty when the file
        does not exist.
    """
    row_re = re.compile(r"^\|\s*([^|]+?)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|$")
    table = {}
    if os.path.exists(size_file):
        with open(size_file, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                hit = row_re.match(raw_line.strip())
                if hit is None:
                    continue
                name = hit.group(1)
                if name == "Datatype":
                    continue
                table[name] = {
                    "fory": int(hit.group(2)),
                    "protobuf": int(hit.group(3)),
                }
    return table
def format_tps_label(tps):
    """Format a throughput value as a short label for bar annotations.

    Values of at least 1e9, 1e6 or 1e3 are scaled and suffixed with ``G``,
    ``M`` or ``K`` (two decimals); smaller values are printed without
    decimals.
    """
    # Checked from the largest magnitude down; the first match wins.
    for threshold, suffix in ((1e9, "G"), (1e6, "M"), (1e3, "K")):
        if tps >= threshold:
            return f"{tps / threshold:.2f}{suffix}"
    return f"{tps:.0f}"
def plot_datatype(ax, results, datatype, operation):
    """Draw one bar per serializer showing throughput for a single benchmark.

    Args:
        ax: Matplotlib axes to draw on.
        results: ``results[datatype][operation][serializer]`` -> time in ns.
        datatype: Datatype name to plot.
        operation: Operation name to plot.

    When ``results`` has no entry for the datatype/operation pair, the axes
    are switched off and titled "No Data". Serializers without a positive
    time are left out of the chart.
    """
    has_entry = datatype in results and operation in results[datatype]
    if not has_entry:
        ax.set_title(f"{datatype} {operation} - No Data")
        ax.axis("off")
        return

    timings = results[datatype][operation]
    present = [name for name in SERIALIZER_ORDER if timings.get(name, 0) > 0]
    # Time per operation in ns -> operations per second.
    ops_per_sec = [1e9 / timings[name] for name in present]
    positions = np.arange(len(present))
    bar_colors = [COLORS.get(name, "#888888") for name in present]
    bars = ax.bar(positions, ops_per_sec, color=bar_colors, width=0.6)

    ax.set_title(f"{operation.capitalize()} Throughput (higher is better)")
    ax.set_xticks(positions)
    ax.set_xticklabels([SERIALIZER_LABELS[name] for name in present])
    ax.set_ylabel("Throughput (ops/sec)")
    ax.grid(True, axis="y", linestyle="--", alpha=0.5)
    ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0))

    # Put the abbreviated throughput just above the top centre of each bar.
    for rect, rate in zip(bars, ops_per_sec):
        centre_x = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            format_tps_label(rate),
            xy=(centre_x, rate),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )
def plot_combined_subplot(ax, results, datatypes, operation, title):
    """Draw a grouped bar chart of throughput for several datatypes.

    Args:
        ax: Matplotlib axes to draw on.
        results: ``results[datatype][operation][serializer]`` -> time in ns.
        datatypes: Datatype names forming the groups along the x axis.
        operation: Operation name to plot.
        title: Title of the subplot.

    The axes are switched off and titled "<title>\\nNo Data" when
    ``datatypes`` is empty or no serializer has a positive time for any of
    them. A serializer with no positive time for one datatype gets a
    zero-height bar there.
    """

    def show_no_data():
        ax.set_title(f"{title}\nNo Data")
        ax.axis("off")

    def elapsed_ns(datatype, serializer):
        return results[datatype][operation].get(serializer, 0)

    if not datatypes:
        show_no_data()
        return

    plotted = [
        serializer
        for serializer in SERIALIZER_ORDER
        if any(elapsed_ns(datatype, serializer) > 0 for datatype in datatypes)
    ]
    if not plotted:
        show_no_data()
        return

    centres = np.arange(len(datatypes))
    # The serializers of one group share 80% of the slot width.
    bar_width = 0.8 / len(plotted)
    for slot, serializer in enumerate(plotted):
        rates = []
        for datatype in datatypes:
            ns = elapsed_ns(datatype, serializer)
            rates.append(1e9 / ns if ns > 0 else 0)
        # Shift each serializer so the group stays centred on its tick.
        shift = (slot - (len(plotted) - 1) / 2) * bar_width
        ax.bar(
            centres + shift,
            rates,
            bar_width,
            label=SERIALIZER_LABELS[serializer],
            color=COLORS.get(serializer, "#888888"),
        )

    ax.set_title(title)
    ax.set_xticks(centres)
    ax.set_xticklabels([datatype_plot_label(name) for name in datatypes])
    ax.grid(True, axis="y", linestyle="--", alpha=0.5)
    ax.legend()
    ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0))
def generate_plots(results, output_dir):
    """Render all benchmark figures as PNG files.

    One figure is written per datatype present in ``results``
    (``<datatype>.png``, serialize and deserialize side by side), followed by
    a four-panel overview (``throughput.png``) that separates list datatypes
    from the others.

    Args:
        results: ``results[datatype][operation][serializer]`` -> time in ns.
        output_dir: Directory for the images; created if missing.

    Returns:
        list: ``(name, path)`` tuples in the order the files were written,
        with ``("throughput", ...)`` last.
    """
    os.makedirs(output_dir, exist_ok=True)
    measured = [name for name in DATATYPE_ORDER if name in results]
    saved = []

    for datatype in measured:
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        for axis, operation in zip(axes, OPERATIONS):
            plot_datatype(axis, results, datatype, operation)
        fig.suptitle(f"{datatype_title(datatype)} Throughput", fontsize=14)
        # Leave the top 5% of the figure free for the suptitle.
        fig.tight_layout(rect=[0, 0, 1, 0.95])
        image_path = os.path.join(output_dir, f"{datatype}.png")
        plt.savefig(image_path, dpi=150)
        plt.close(fig)
        saved.append((datatype, image_path))

    single_types = [name for name in measured if not name.endswith("list")]
    list_types = [name for name in measured if name.endswith("list")]
    # One (datatypes, operation, title) entry per panel, left to right.
    panels = (
        (single_types, "serialize", "Serialize Throughput (higher is better)"),
        (single_types, "deserialize", "Deserialize Throughput (higher is better)"),
        (list_types, "serialize", "Serialize Throughput (*List)"),
        (list_types, "deserialize", "Deserialize Throughput (*List)"),
    )
    fig, axes = plt.subplots(1, 4, figsize=(28, 6))
    fig.supylabel("Throughput (ops/sec)")
    for axis, (datatypes, operation, title) in zip(axes, panels):
        plot_combined_subplot(axis, results, datatypes, operation, title)
    fig.tight_layout()
    overview_path = os.path.join(output_dir, "throughput.png")
    plt.savefig(overview_path, dpi=150)
    plt.close(fig)
    saved.append(("throughput", overview_path))
    return saved
def write_report(system_info, results, sizes, plot_images, output_dir, plot_prefix):
    """Write the markdown benchmark report to ``<output_dir>/README.md``.

    The report contains, in order: reproduction instructions, the system
    information table, the plots (overview first, then alphabetical by name),
    a timing table, a throughput table, and the serialized-size table when
    ``sizes`` is non-empty.

    Args:
        system_info: Mapping of display name -> value for the hardware table.
        results: ``results[datatype][operation][serializer]`` -> time in ns.
        sizes: ``{title: {"fory": int, "protobuf": int}}`` keyed by datatype
            title; may be empty.
        plot_images: Iterable of ``(name, path)`` tuples; only the base name
            of each path is used in the image links.
        output_dir: Directory that receives ``README.md``.
        plot_prefix: Text prepended to every image file name in the links.

    Returns:
        str: Path of the written report.
    """

    def table_row(cells):
        return "| " + " | ".join(cells) + " |\n"

    def plot_rank(item):
        # The overview figure sorts before all per-datatype figures.
        return (0 if item[0] == "throughput" else 1, item[0])

    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        "# Rust Benchmark Performance Report\n\n",
        f"_Generated on {generated_at}_\n\n",
        "## How to Generate This Report\n\n",
        "```bash\n",
        "cd benchmarks/rust\n",
        "cargo bench --bench serialization_bench 2>&1 | tee results/cargo_bench.log\n",
        "cargo run --release --bin fory_profiler -- --print-all-serialized-sizes | tee results/serialized_sizes.txt\n",
        "python benchmark_report.py --log-file results/cargo_bench.log --size-file results/serialized_sizes.txt --output-dir results\n",
        "```\n\n",
        "## Hardware & OS Info\n\n",
        "| Key | Value |\n",
        "|-----|-------|\n",
    ]
    lines.extend(f"| {key} | {value} |\n" for key, value in system_info.items())

    lines.append("\n## Benchmark Plots\n")
    lines.append("\nAll class-level plots below show throughput (ops/sec).\n")
    for name, image_path in sorted(plot_images, key=plot_rank):
        heading = datatype_title(name)
        lines.append(f"\n### {heading}\n\n")
        lines.append(f"![{heading}]({plot_prefix}{os.path.basename(image_path)})\n")

    measured = [name for name in DATATYPE_ORDER if name in results]

    lines.append("\n## Benchmark Results\n\n")
    lines.append("### Timing Results (nanoseconds)\n\n")
    lines.append("| Datatype | Operation | fory (ns) | protobuf (ns) | Fastest |\n")
    lines.append("|----------|-----------|-----------|---------------|---------|\n")
    for datatype in measured:
        for operation in OPERATIONS:
            recorded = results[datatype][operation]
            elapsed = {name: recorded.get(name, 0) for name in SERIALIZER_ORDER}
            valid = {name: ns for name, ns in elapsed.items() if ns > 0}
            # Lowest time wins.
            winner = min(valid, key=valid.get) if valid else "N/A"
            cells = [datatype_title(datatype), operation.capitalize()]
            for name in SERIALIZER_ORDER:
                cells.append(f"{elapsed[name]:.1f}" if elapsed[name] > 0 else "N/A")
            cells.append(winner)
            lines.append(table_row(cells))

    lines.append("\n### Throughput Results (ops/sec)\n\n")
    lines.append("| Datatype | Operation | fory TPS | protobuf TPS | Fastest |\n")
    lines.append("|----------|-----------|----------|--------------|---------|\n")
    for datatype in measured:
        for operation in OPERATIONS:
            recorded = results[datatype][operation]
            rates = {}
            for name in SERIALIZER_ORDER:
                ns = recorded.get(name, 0)
                rates[name] = 1e9 / ns if ns > 0 else 0
            valid = {name: rate for name, rate in rates.items() if rate > 0}
            # Highest throughput wins.
            winner = max(valid, key=valid.get) if valid else "N/A"
            cells = [datatype_title(datatype), operation.capitalize()]
            for name in SERIALIZER_ORDER:
                cells.append(f"{rates[name]:,.0f}" if rates[name] > 0 else "N/A")
            cells.append(winner)
            lines.append(table_row(cells))

    if sizes:
        lines.append("\n### Serialized Data Sizes (bytes)\n\n")
        lines.append("| Datatype | fory | protobuf |\n")
        lines.append("|----------|------|----------|\n")
        # The size table is keyed by display title, not by benchmark name.
        for title in map(datatype_title, DATATYPE_ORDER):
            if title in sizes:
                entry = sizes[title]
                lines.append(f"| {title} | {entry['fory']} | {entry['protobuf']} |\n")

    report_path = os.path.join(output_dir, "README.md")
    with open(report_path, "w", encoding="utf-8") as handle:
        handle.write("".join(lines))
    return report_path
def main():
    """Run the full pipeline: parse inputs, render plots, write the report."""
    options = parse_args()
    log_file = options.log_file
    output_dir = options.output_dir

    results = load_benchmark_results(log_file)
    sizes = load_serialized_sizes(options.size_file)
    system_info = get_system_info(log_file)
    plot_images = generate_plots(results, output_dir)
    report_path = write_report(
        system_info,
        results,
        sizes,
        plot_images,
        output_dir,
        options.plot_prefix,
    )

    print(f"✅ Plots saved in: {output_dir}")
    print(f"📄 Markdown report generated at: {report_path}")


if __name__ == "__main__":
    main()