# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
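"""Generate throughput plots and a Markdown report from Google Benchmark JSON output.

Typical usage (paths are illustrative; the generated report repeats the exact
commands):

    ./fory_benchmark --benchmark_format=json --benchmark_out=benchmark_results.json
    python benchmark_report.py --json-file benchmark_results.json --output-dir report
"""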
import json
import os
import platform
import argparse
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from datetime import datetime
try:
    import psutil

    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False
# === Colors and serializer order ===
COLORS = {
    "fory": "#FF6F01",  # Orange
    "protobuf": "#55BCC2",  # Teal
    "msgpack": (0.55, 0.40, 0.45),
}
SERIALIZER_ORDER = ["fory", "protobuf", "msgpack"]
SERIALIZER_LABELS = {
    "fory": "fory",
    "protobuf": "protobuf",
    "msgpack": "msgpack",
}
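# The three structures above drive every chart and table: COLORS picks bar
# colors, SERIALIZER_ORDER fixes bar/column ordering, and SERIALIZER_LABELS
# maps internal names to display names. To chart another library (hypothetical
# "cereal" entry, shown purely as an illustration), register it in all three:
#
#     COLORS["cereal"] = "#888888"
#     SERIALIZER_ORDER.append("cereal")
#     SERIALIZER_LABELS["cereal"] = "cereal"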
# === Parse arguments ===
parser = argparse.ArgumentParser(
    description="Plot Google Benchmark stats and generate Markdown report for C++ benchmarks"
)
parser.add_argument(
    "--json-file", default="benchmark_results.json", help="Benchmark JSON output file"
)
parser.add_argument(
    "--output-dir", default="", help="Output directory for plots and report"
)
parser.add_argument(
    "--plot-prefix", default="", help="Image path prefix in Markdown report"
)
args = parser.parse_args()
# === Determine output directory ===
if args.output_dir.strip():
    output_dir = args.output_dir
else:
    output_dir = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
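# e.g. with no --output-dir, output lands in a timestamped folder like "2025_01_31_12_00_00"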
os.makedirs(output_dir, exist_ok=True)
# === Get system info ===
def get_system_info():
    try:
        info = {
            "OS": f"{platform.system()} {platform.release()}",
            "Machine": platform.machine(),
            "Processor": platform.processor() or "Unknown",
        }
        if HAS_PSUTIL:
            info["CPU Cores (Physical)"] = psutil.cpu_count(logical=False)
            info["CPU Cores (Logical)"] = psutil.cpu_count(logical=True)
            info["Total RAM (GB)"] = round(psutil.virtual_memory().total / (1024**3), 2)
    except Exception as e:
        info = {"Error gathering system info": str(e)}
    return info
# === Parse benchmark name ===
def parse_benchmark_name(name):
    """
    Parse benchmark names like:
    - BM_Fory_Struct_Serialize
    - BM_Protobuf_Sample_Deserialize
    - BM_Msgpack_MediaContent_Deserialize
    Returns: (library, datatype, operation)
    """
    # Remove the BM_ prefix
    if name.startswith("BM_"):
        name = name[3:]
    parts = name.split("_")
    if len(parts) >= 3:
        library = parts[0].lower()
        datatype = parts[1].lower()
        operation = parts[2].lower()
        return library, datatype, operation
    return None, None, None
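# e.g. parse_benchmark_name("BM_Fory_Struct_Serialize") -> ("fory", "struct", "serialize")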
def format_datatype_label(datatype):
    if not datatype:
        return ""
    if datatype.endswith("list"):
        base = datatype[: -len("list")]
        if base == "mediacontent":
            return "MediaContent\nList"
        return f"{base.capitalize()}\nList"
    if datatype == "mediacontent":
        return "MediaContent"
    return datatype.capitalize()
def format_datatype_table_label(datatype):
    if not datatype:
        return ""
    if datatype.endswith("list"):
        base = datatype[: -len("list")]
        if base == "mediacontent":
            return "MediaContentList"
        return f"{base.capitalize()}List"
    if datatype == "mediacontent":
        return "MediaContent"
    return datatype.capitalize()
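# e.g. format_datatype_label("structlist") -> "Struct\nList" (a two-line axis tick),
# while format_datatype_table_label("structlist") -> "StructList" (a Markdown cell).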
# === Read and parse benchmark JSON ===
def load_benchmark_data(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data
# === Data storage ===
# Structure: data[datatype][operation][library] = time_ns
data = defaultdict(lambda: defaultdict(dict))
sizes = {} # Store serialized sizes
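# Illustrative shape once populated (values are made up):
#   data["struct"]["serialize"]["fory"] = 123.4   # nanoseconds per op
#   sizes["fory_struct_size"] = 56                # bytes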
# === Load and process data ===
benchmark_data = load_benchmark_data(args.json_file)
# Extract context info
context = benchmark_data.get("context", {})
# Process benchmarks
for bench in benchmark_data.get("benchmarks", []):
    name = bench.get("name", "")
    # Skip aggregate results and size benchmarks
    if "/iterations:" in name or "PrintSerializedSizes" in name:
        # Extract sizes from the PrintSerializedSizes counters
        if "PrintSerializedSizes" in name:
            for key, value in bench.items():
                if key.endswith("_size"):
                    sizes[key] = int(value)
        continue
    library, datatype, operation = parse_benchmark_name(name)
    if library and datatype and operation:
        # Prefer wall-clock time; fall back to CPU time
        time_ns = bench.get("real_time", bench.get("cpu_time", 0))
        time_unit = bench.get("time_unit", "ns")
        # Convert to nanoseconds if needed
        if time_unit == "us":
            time_ns *= 1000
        elif time_unit == "ms":
            time_ns *= 1000000
        elif time_unit == "s":
            time_ns *= 1000000000
        data[datatype][operation][library] = time_ns
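# With every time normalized to nanoseconds, throughput below is simply
# 1e9 / time_ns ops/sec.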
# === System info ===
system_info = get_system_info()
# Add context info from benchmark
if context:
    if "date" in context:
        system_info["Benchmark Date"] = context["date"]
    if "num_cpus" in context:
        system_info["CPU Cores (from benchmark)"] = context["num_cpus"]
# === Plotting ===
def format_tps_label(tps):
    if tps >= 1e9:
        return f"{tps / 1e9:.2f}G"
    if tps >= 1e6:
        return f"{tps / 1e6:.2f}M"
    if tps >= 1e3:
        return f"{tps / 1e3:.2f}K"
    return f"{tps:.0f}"
def plot_datatype(ax, datatype, operation):
    """Plot a single datatype/operation throughput comparison."""
    if datatype not in data or operation not in data[datatype]:
        ax.set_title(f"{datatype} {operation} - No Data")
        ax.axis("off")
        return
    libs = set(data[datatype][operation].keys())
    lib_order = [lib for lib in SERIALIZER_ORDER if lib in libs]
    times = [data[datatype][operation].get(lib, 0) for lib in lib_order]
    throughput = [1e9 / t if t > 0 else 0 for t in times]
    colors = [COLORS.get(lib, "#888888") for lib in lib_order]
    x = np.arange(len(lib_order))
    bars = ax.bar(x, throughput, color=colors, width=0.6)
    ax.set_title(f"{operation.capitalize()} Throughput (higher is better)")
    ax.set_xticks(x)
    ax.set_xticklabels([SERIALIZER_LABELS.get(lib, lib) for lib in lib_order])
    ax.set_ylabel("Throughput (ops/sec)")
    ax.grid(True, axis="y", linestyle="--", alpha=0.5)
    ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0))
    # Add value labels on bars
    for bar, tps_val in zip(bars, throughput):
        height = bar.get_height()
        ax.annotate(
            format_tps_label(tps_val),
            xy=(bar.get_x() + bar.get_width() / 2, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )
# === Create plots ===
plot_images = []
datatypes = sorted(data.keys())
operations = ["serialize", "deserialize"]
for datatype in datatypes:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for i, op in enumerate(operations):
        plot_datatype(axes[i], datatype, op)
    fig.suptitle(f"{datatype.capitalize()} Throughput", fontsize=14)
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plot_path = os.path.join(output_dir, f"{datatype}.png")
    plt.savefig(plot_path, dpi=150)
    plot_images.append((datatype, plot_path))
    plt.close()
# === Create combined TPS comparison plot ===
non_list_datatypes = [dt for dt in datatypes if not dt.endswith("list")]
list_datatypes = [dt for dt in datatypes if dt.endswith("list")]
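# Scalar and *List datatypes are plotted in separate panels; each panel keeps
# its own y-scale so the list benchmarks are not compressed.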
def plot_combined_tps_subplot(ax, grouped_datatypes, operation, title):
    if not grouped_datatypes:
        ax.set_title(f"{title}\nNo Data")
        ax.axis("off")
        return
    x = np.arange(len(grouped_datatypes))
    available_libs = [
        lib
        for lib in SERIALIZER_ORDER
        if any(data[dt][operation].get(lib, 0) > 0 for dt in grouped_datatypes)
    ]
    if not available_libs:
        ax.set_title(f"{title}\nNo Data")
        ax.axis("off")
        return
    width = 0.8 / len(available_libs)
    for idx, lib in enumerate(available_libs):
        times = [data[dt][operation].get(lib, 0) for dt in grouped_datatypes]
        tps = [1e9 / t if t > 0 else 0 for t in times]
        offset = (idx - (len(available_libs) - 1) / 2) * width
        ax.bar(
            x + offset,
            tps,
            width,
            label=SERIALIZER_LABELS.get(lib, lib),
            color=COLORS.get(lib, "#888888"),
        )
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels([format_datatype_label(dt) for dt in grouped_datatypes])
    ax.legend()
    ax.grid(True, axis="y", linestyle="--", alpha=0.5)
    # Use a dedicated y-scale per subplot so list benchmarks are not compressed.
    ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0))
fig, axes = plt.subplots(1, 4, figsize=(28, 6))
fig.supylabel("Throughput (ops/sec)")
combined_subplots = [
    (axes[0], non_list_datatypes, "serialize", "Serialize Throughput"),
    (axes[1], non_list_datatypes, "deserialize", "Deserialize Throughput"),
    (axes[2], list_datatypes, "serialize", "Serialize Throughput (*List)"),
    (axes[3], list_datatypes, "deserialize", "Deserialize Throughput (*List)"),
]
for ax, grouped_datatypes, op, title in combined_subplots:
    plot_combined_tps_subplot(ax, grouped_datatypes, op, f"{title} (higher is better)")
fig.tight_layout()
combined_plot_path = os.path.join(output_dir, "throughput.png")
plt.savefig(combined_plot_path, dpi=150)
plot_images.append(("throughput", combined_plot_path))
plt.close()
# === Markdown report ===
md_report = [
    "# C++ Benchmark Performance Report\n\n",
    f"_Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}_\n\n",
    "## How to Generate This Report\n\n",
    "```bash\n",
    "cd benchmarks/cpp/build\n",
    "./fory_benchmark --benchmark_format=json --benchmark_out=benchmark_results.json\n",
    "cd ..\n",
    "python benchmark_report.py --json-file build/benchmark_results.json --output-dir report\n",
    "```\n\n",
    "## Hardware & OS Info\n\n",
    "| Key | Value |\n",
    "|-----|-------|\n",
]
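# md_report is a flat list of Markdown fragments; it is written out with
# writelines(), so each entry carries its own trailing newline(s).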
for k, v in system_info.items():
    md_report.append(f"| {k} | {v} |\n")
# Plots section
md_report.append("\n## Benchmark Plots\n")
md_report.append("\nAll class-level plots below show throughput (ops/sec).\n")
plot_images_sorted = sorted(
    plot_images, key=lambda item: (0 if item[0] == "throughput" else 1, item[0])
)
for datatype, img in plot_images_sorted:
    img_filename = os.path.basename(img)
    img_path_report = args.plot_prefix + img_filename
    md_report.append(f"\n### {datatype.replace('_', ' ').title()}\n\n")
    md_report.append(
        f'<p align="center">\n<img src="{img_path_report}" width="90%" />\n</p>\n'
    )
# Results table
md_report.append("\n## Benchmark Results\n\n")
md_report.append("### Timing Results (nanoseconds)\n\n")
md_report.append(
    "| Datatype | Operation | fory (ns) | protobuf (ns) | msgpack (ns) | Fastest |\n"
)
md_report.append(
    "|----------|-----------|-----------|---------------|--------------|---------|\n"
)
for datatype in datatypes:
    for op in operations:
        times = {lib: data[datatype][op].get(lib, 0) for lib in SERIALIZER_ORDER}
        positive_times = {lib: t for lib, t in times.items() if t > 0}
        fastest_str = "N/A"
        if positive_times:
            fastest_lib = min(positive_times, key=positive_times.get)
            fastest_str = SERIALIZER_LABELS.get(fastest_lib, fastest_lib)
        md_report.append(
            "| "
            + f"{format_datatype_table_label(datatype)} | {op.capitalize()} | "
            + " | ".join(
                f"{times[lib]:.1f}" if times[lib] > 0 else "N/A"
                for lib in SERIALIZER_ORDER
            )
            + f" | {fastest_str} |\n"
        )
# Throughput table
md_report.append("\n### Throughput Results (ops/sec)\n\n")
md_report.append(
    "| Datatype | Operation | fory TPS | protobuf TPS | msgpack TPS | Fastest |\n"
)
md_report.append(
    "|----------|-----------|----------|--------------|-------------|---------|\n"
)
for datatype in datatypes:
    for op in operations:
        times = {lib: data[datatype][op].get(lib, 0) for lib in SERIALIZER_ORDER}
        tps = {lib: (1e9 / t if t > 0 else 0) for lib, t in times.items()}
        positive_tps = {lib: v for lib, v in tps.items() if v > 0}
        fastest_str = "N/A"
        if positive_tps:
            fastest_lib = max(positive_tps, key=positive_tps.get)
            fastest_str = SERIALIZER_LABELS.get(fastest_lib, fastest_lib)
        md_report.append(
            "| "
            + f"{format_datatype_table_label(datatype)} | {op.capitalize()} | "
            + " | ".join(
                f"{tps[lib]:,.0f}" if tps[lib] > 0 else "N/A"
                for lib in SERIALIZER_ORDER
            )
            + f" | {fastest_str} |\n"
        )
# Serialized sizes
if sizes:
    md_report.append("\n### Serialized Data Sizes (bytes)\n\n")
    md_report.append("| Datatype | fory | protobuf | msgpack |\n")
    md_report.append("|----------|------|----------|---------|\n")
    size_prefix = {
        "fory": "fory",
        "protobuf": "protobuf",
        "msgpack": "msgpack",
    }
    size_datatypes = [
        ("struct", "Struct"),
        ("sample", "Sample"),
        ("media", "MediaContent"),
        ("struct_list", "StructList"),
        ("sample_list", "SampleList"),
        ("media_list", "MediaContentList"),
    ]
    for datatype_key, datatype_label in size_datatypes:
        row_values = []
        has_value = False
        for lib in SERIALIZER_ORDER:
            key = f"{size_prefix[lib]}_{datatype_key}_size"
            value = sizes.get(key)
            # protobuf sizes may also appear under a "proto_" key prefix; fall back to it
            if value is None and lib == "protobuf":
                value = sizes.get(f"proto_{datatype_key}_size")
            if value is None:
                row_values.append("N/A")
            else:
                row_values.append(str(value))
                has_value = True
        if has_value:
            md_report.append(f"| {datatype_label} | " + " | ".join(row_values) + " |\n")
# Save Markdown
report_path = os.path.join(output_dir, "README.md")
with open(report_path, "w", encoding="utf-8") as f:
    f.writelines(md_report)
print(f"✅ Plots saved in: {output_dir}")
print(f"📄 Markdown report generated at: {report_path}")