| #!/usr/bin/env python3 |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import argparse |
| import os |
| import platform |
| import re |
| from collections import defaultdict |
| from datetime import datetime |
| |
| import matplotlib.pyplot as plt |
| import numpy as np |
| |
| try: |
| import psutil |
| |
| HAS_PSUTIL = True |
| except ImportError: |
| HAS_PSUTIL = False |
| |
# Bar colors per serializer used in every generated plot (hex RGB).
COLORS = {
    "fory": "#FF6f01",
    "protobuf": "#55BCC2",
}
# Canonical serializer ordering for plot bars and report table columns.
SERIALIZER_ORDER = ["fory", "protobuf"]
# Display labels per serializer (currently identical to the keys).
SERIALIZER_LABELS = {
    "fory": "fory",
    "protobuf": "protobuf",
}
# Benchmark datatypes in the order they appear in plots and tables.
DATATYPE_ORDER = [
    "struct",
    "sample",
    "mediacontent",
    "structlist",
    "samplelist",
    "mediacontentlist",
]
# Operations measured for every datatype.
OPERATIONS = ["serialize", "deserialize"]
# Conversion factors from Criterion time units to nanoseconds.
# Note: "µs" key is U+00B5 (micro sign); Greek mu is normalized to it
# before lookup (see parse_time_ns).
UNIT_TO_NS = {
    "ps": 1e-3,
    "ns": 1.0,
    "us": 1e3,
    "µs": 1e3,
    "ms": 1e6,
    "s": 1e9,
}
| |
| |
def parse_args():
    """Parse command-line options for the report generator.

    Returns:
        argparse.Namespace with ``log_file``, ``size_file``, ``output_dir``
        and ``plot_prefix`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Generate Rust benchmark report from Criterion output"
    )
    # (flag, default, help) specs keep the option set easy to scan.
    option_specs = (
        ("--log-file", "results/cargo_bench.log", "Criterion cargo bench log file"),
        (
            "--size-file",
            "results/serialized_sizes.txt",
            "Serialized size table generated by fory_profiler",
        ),
        ("--output-dir", "results", "Output directory for report artifacts"),
        ("--plot-prefix", "", "Image path prefix inside the markdown report"),
    )
    for flag, default, help_text in option_specs:
        parser.add_argument(flag, default=default, help=help_text)
    return parser.parse_args()
| |
| |
def datatype_title(datatype):
    """Return the human-readable title for a benchmark datatype key."""
    # Irregular capitalizations that .capitalize() cannot produce.
    special = {
        "mediacontent": "MediaContent",
        "mediacontentlist": "MediaContentList",
    }
    if datatype in special:
        return special[datatype]
    if datatype.endswith("list"):
        base = datatype[: -len("list")]
        return f"{base.capitalize()}List"
    return datatype.capitalize()
| |
| |
def datatype_plot_label(datatype):
    """Return a (possibly two-line) x-axis label for a datatype key."""
    # Irregular names get explicit overrides; list types break onto two lines.
    overrides = {
        "mediacontent": "MediaContent",
        "mediacontentlist": "MediaContent\nList",
    }
    label = overrides.get(datatype)
    if label is not None:
        return label
    if datatype.endswith("list"):
        return f"{datatype[:-4].capitalize()}\nList"
    return datatype.capitalize()
| |
| |
def get_system_info(log_file):
    """Collect host hardware/OS details for the report header table.

    Args:
        log_file: path to the cargo bench log; when it exists its mtime is
            reported as the benchmark date.

    Returns:
        dict mapping human-readable keys to values.
    """
    info = {}
    info["OS"] = f"{platform.system()} {platform.release()}"
    info["Machine"] = platform.machine()
    info["Processor"] = platform.processor() or "Unknown"
    if HAS_PSUTIL:
        # psutil is optional; extended hardware info is skipped without it.
        info["CPU Cores (Physical)"] = psutil.cpu_count(logical=False)
        info["CPU Cores (Logical)"] = psutil.cpu_count(logical=True)
        total_bytes = psutil.virtual_memory().total
        info["Total RAM (GB)"] = round(total_bytes / (1024**3), 2)
    if os.path.exists(log_file):
        mtime = os.path.getmtime(log_file)
        stamp = datetime.fromtimestamp(mtime).isoformat(timespec="seconds")
        info["Benchmark Date"] = stamp
    return info
| |
| |
def parse_time_ns(measurement):
    """Convert a Criterion timing triple to the median time in nanoseconds.

    The input is the bracketed text "low unit median unit high unit"; the
    median value/unit pair (tokens 2 and 3) is the one reported.

    Raises:
        ValueError: if the text does not contain at least four tokens.
    """
    # Normalize Greek mu to the micro sign used by UNIT_TO_NS keys.
    tokens = measurement.replace("μ", "µ").split()
    if len(tokens) < 4:
        raise ValueError(f"unexpected criterion timing format: {measurement}")
    return float(tokens[2]) * UNIT_TO_NS[tokens[3]]
| |
| |
def load_benchmark_results(log_file):
    """Parse a Criterion cargo-bench log into nested timing results.

    Args:
        log_file: path to the captured ``cargo bench`` output.

    Returns:
        Nested mapping ``results[datatype][operation][serializer]`` holding
        the median time in nanoseconds; only entries matching the known
        datatype/serializer/operation names are kept.
    """
    bench_re = re.compile(
        r"Benchmarking\s+([A-Za-z0-9_]+)/([A-Za-z0-9_]+).*?time:\s+\[([^\]]+)\]",
        re.DOTALL,
    )
    results = defaultdict(lambda: defaultdict(dict))

    with open(log_file, "r", encoding="utf-8") as file:
        log_text = file.read()

    for datatype, benchmark_name, measurement in bench_re.findall(log_text):
        # Benchmark names look like "<serializer>_<operation>".
        if datatype not in DATATYPE_ORDER or "_" not in benchmark_name:
            continue
        serializer, _, operation = benchmark_name.partition("_")
        if serializer in SERIALIZER_ORDER and operation in OPERATIONS:
            results[datatype][operation][serializer] = parse_time_ns(measurement)
    return results
| |
| |
def load_serialized_sizes(size_file):
    """Parse the markdown size table emitted by fory_profiler.

    Args:
        size_file: path to the size table; a missing file yields ``{}``.

    Returns:
        dict mapping datatype title -> {"fory": bytes, "protobuf": bytes}.
    """
    sizes = {}
    if not os.path.exists(size_file):
        return sizes

    row_re = re.compile(r"^\|\s*([^|]+?)\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|$")
    with open(size_file, "r", encoding="utf-8") as file:
        for raw_line in file:
            match = row_re.match(raw_line.strip())
            if match is None:
                continue
            name, fory_bytes, protobuf_bytes = match.groups()
            if name == "Datatype":
                # Defensive skip of a header row (normally filtered by the
                # numeric groups already).
                continue
            sizes[name] = {
                "fory": int(fory_bytes),
                "protobuf": int(protobuf_bytes),
            }
    return sizes
| |
| |
def format_tps_label(tps):
    """Format an ops/sec value compactly with a G/M/K suffix."""
    for threshold, suffix in ((1e9, "G"), (1e6, "M"), (1e3, "K")):
        if tps >= threshold:
            return f"{tps / threshold:.2f}{suffix}"
    # Below 1000 ops/sec: plain integer, no suffix.
    return f"{tps:.0f}"
| |
| |
def plot_datatype(ax, results, datatype, operation):
    """Draw a throughput bar chart for one datatype/operation pair.

    Serializers with missing or non-positive timings are omitted; when the
    datatype/operation has no entry at all the axis is blanked out.
    """
    timings = results.get(datatype, {}).get(operation)
    if timings is None:
        ax.set_title(f"{datatype} {operation} - No Data")
        ax.axis("off")
        return

    present = [s for s in SERIALIZER_ORDER if timings.get(s, 0) > 0]
    # Throughput = ops/sec derived from the median nanosecond timing.
    ops_per_sec = [1e9 / timings[s] for s in present]
    positions = np.arange(len(present))
    bars = ax.bar(
        positions,
        ops_per_sec,
        color=[COLORS.get(s, "#888888") for s in present],
        width=0.6,
    )

    ax.set_title(f"{operation.capitalize()} Throughput (higher is better)")
    ax.set_xticks(positions)
    ax.set_xticklabels([SERIALIZER_LABELS[s] for s in present])
    ax.set_ylabel("Throughput (ops/sec)")
    ax.grid(True, axis="y", linestyle="--", alpha=0.5)
    ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0))

    # Annotate each bar with a compact K/M/G throughput label just above it.
    for rect, tps in zip(bars, ops_per_sec):
        ax.annotate(
            format_tps_label(tps),
            xy=(rect.get_x() + rect.get_width() / 2, tps),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )
| |
| |
def plot_combined_subplot(ax, results, datatypes, operation, title):
    """Draw grouped throughput bars for several datatypes on one axis.

    One bar group per datatype, one bar per serializer that has at least
    one positive timing among the given datatypes; the axis is blanked
    when there is nothing to draw.
    """
    if not datatypes:
        ax.set_title(f"{title}\nNo Data")
        ax.axis("off")
        return

    def _has_data(serializer):
        # True when the serializer has any positive timing in this group.
        return any(
            results[datatype][operation].get(serializer, 0) > 0
            for datatype in datatypes
        )

    present = [s for s in SERIALIZER_ORDER if _has_data(s)]
    if not present:
        ax.set_title(f"{title}\nNo Data")
        ax.axis("off")
        return

    positions = np.arange(len(datatypes))
    bar_width = 0.8 / len(present)
    for slot, serializer in enumerate(present):
        ops_per_sec = []
        for datatype in datatypes:
            time_ns = results[datatype][operation].get(serializer, 0)
            ops_per_sec.append(1e9 / time_ns if time_ns > 0 else 0)
        # Center the serializer bars around each datatype tick position.
        shift = (slot - (len(present) - 1) / 2) * bar_width
        ax.bar(
            positions + shift,
            ops_per_sec,
            bar_width,
            label=SERIALIZER_LABELS[serializer],
            color=COLORS.get(serializer, "#888888"),
        )

    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels([datatype_plot_label(datatype) for datatype in datatypes])
    ax.grid(True, axis="y", linestyle="--", alpha=0.5)
    ax.legend()
    ax.ticklabel_format(style="scientific", axis="y", scilimits=(0, 0))
| |
| |
def generate_plots(results, output_dir):
    """Render per-datatype and combined throughput plots as PNG files.

    Args:
        results: nested timing dict from load_benchmark_results.
        output_dir: directory (created if missing) receiving the images.

    Returns:
        list of (name, path) tuples, one per generated image; the combined
        overview is recorded under the name "throughput".
    """
    os.makedirs(output_dir, exist_ok=True)
    images = []

    # One serialize/deserialize figure per datatype that has data.
    for datatype in DATATYPE_ORDER:
        if datatype not in results:
            continue
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))
        for axis, operation in zip(axes, OPERATIONS):
            plot_datatype(axis, results, datatype, operation)
        fig.suptitle(f"{datatype_title(datatype)} Throughput", fontsize=14)
        fig.tight_layout(rect=[0, 0, 1, 0.95])
        image_path = os.path.join(output_dir, f"{datatype}.png")
        plt.savefig(image_path, dpi=150)
        plt.close(fig)
        images.append((datatype, image_path))

    present = [d for d in DATATYPE_ORDER if d in results]
    scalar_types = [d for d in present if not d.endswith("list")]
    list_types = [d for d in present if d.endswith("list")]

    # Four-panel overview: serialize/deserialize for scalar and list types.
    fig, axes = plt.subplots(1, 4, figsize=(28, 6))
    fig.supylabel("Throughput (ops/sec)")
    panels = (
        (scalar_types, "serialize", "Serialize Throughput (higher is better)"),
        (scalar_types, "deserialize", "Deserialize Throughput (higher is better)"),
        (list_types, "serialize", "Serialize Throughput (*List)"),
        (list_types, "deserialize", "Deserialize Throughput (*List)"),
    )
    for axis, (group, operation, title) in zip(axes, panels):
        plot_combined_subplot(axis, results, group, operation, title)
    fig.tight_layout()
    overview_path = os.path.join(output_dir, "throughput.png")
    plt.savefig(overview_path, dpi=150)
    plt.close(fig)
    images.append(("throughput", overview_path))

    return images
| |
| |
def write_report(system_info, results, sizes, plot_images, output_dir, plot_prefix):
    """Assemble the markdown benchmark report and write it to README.md.

    Args:
        system_info: key/value pairs from get_system_info.
        results: nested timing dict from load_benchmark_results.
        sizes: size table from load_serialized_sizes (may be empty).
        plot_images: (name, path) tuples from generate_plots.
        output_dir: directory that receives README.md.
        plot_prefix: path prefix prepended to image names in the markdown.

    Returns:
        Path of the written README.md.
    """
    report = []
    _append_header(report, system_info)
    _append_plot_section(report, plot_images, plot_prefix)
    _append_timing_table(report, results)
    _append_throughput_table(report, results)
    _append_size_table(report, sizes)

    report_path = os.path.join(output_dir, "README.md")
    with open(report_path, "w", encoding="utf-8") as file:
        file.writelines(report)
    return report_path


def _append_header(report, system_info):
    # Title, reproduction instructions, and the hardware/OS table.
    report.extend(
        [
            "# Rust Benchmark Performance Report\n\n",
            f"_Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}_\n\n",
            "## How to Generate This Report\n\n",
            "```bash\n",
            "cd benchmarks/rust\n",
            "cargo bench --bench serialization_bench 2>&1 | tee results/cargo_bench.log\n",
            "cargo run --release --bin fory_profiler -- --print-all-serialized-sizes | tee results/serialized_sizes.txt\n",
            "python benchmark_report.py --log-file results/cargo_bench.log --size-file results/serialized_sizes.txt --output-dir results\n",
            "```\n\n",
            "## Hardware & OS Info\n\n",
            "| Key | Value |\n",
            "|-----|-------|\n",
        ]
    )
    for key, value in system_info.items():
        report.append(f"| {key} | {value} |\n")


def _append_plot_section(report, plot_images, plot_prefix):
    # Embed images: combined "throughput" overview first, then per-datatype
    # plots sorted by name.
    report.append("\n## Benchmark Plots\n")
    report.append("\nAll class-level plots below show throughput (ops/sec).\n")
    sorted_plots = sorted(
        plot_images, key=lambda item: (0 if item[0] == "throughput" else 1, item[0])
    )
    for datatype, image_path in sorted_plots:
        plot_title = datatype_title(datatype)
        report.append(f"\n### {plot_title}\n\n")
        report.append(f"![{plot_title}]({plot_prefix}{os.path.basename(image_path)})\n")


def _append_timing_table(report, results):
    # Median times in nanoseconds; the lowest positive time wins "Fastest".
    report.append("\n## Benchmark Results\n\n")
    report.append("### Timing Results (nanoseconds)\n\n")
    report.append("| Datatype | Operation | fory (ns) | protobuf (ns) | Fastest |\n")
    report.append("|----------|-----------|-----------|---------------|---------|\n")
    for datatype in DATATYPE_ORDER:
        if datatype not in results:
            continue
        for operation in OPERATIONS:
            times = {
                serializer: results[datatype][operation].get(serializer, 0)
                for serializer in SERIALIZER_ORDER
            }
            positive = {name: value for name, value in times.items() if value > 0}
            fastest = min(positive, key=positive.get) if positive else "N/A"
            report.append(
                "| "
                + f"{datatype_title(datatype)} | {operation.capitalize()} | "
                + " | ".join(
                    f"{times[serializer]:.1f}" if times[serializer] > 0 else "N/A"
                    for serializer in SERIALIZER_ORDER
                )
                + f" | {fastest} |\n"
            )


def _append_throughput_table(report, results):
    # Throughput derived as 1e9/ns; the highest value wins "Fastest".
    report.append("\n### Throughput Results (ops/sec)\n\n")
    report.append("| Datatype | Operation | fory TPS | protobuf TPS | Fastest |\n")
    report.append("|----------|-----------|----------|--------------|---------|\n")
    for datatype in DATATYPE_ORDER:
        if datatype not in results:
            continue
        for operation in OPERATIONS:
            throughput = {}
            for serializer in SERIALIZER_ORDER:
                time_ns = results[datatype][operation].get(serializer, 0)
                throughput[serializer] = 1e9 / time_ns if time_ns > 0 else 0
            positive = {name: value for name, value in throughput.items() if value > 0}
            fastest = max(positive, key=positive.get) if positive else "N/A"
            report.append(
                "| "
                + f"{datatype_title(datatype)} | {operation.capitalize()} | "
                + " | ".join(
                    f"{throughput[serializer]:,.0f}"
                    if throughput[serializer] > 0
                    else "N/A"
                    for serializer in SERIALIZER_ORDER
                )
                + f" | {fastest} |\n"
            )


def _append_size_table(report, sizes):
    # Serialized byte sizes; section omitted when the size file was missing.
    if not sizes:
        return
    report.append("\n### Serialized Data Sizes (bytes)\n\n")
    report.append("| Datatype | fory | protobuf |\n")
    report.append("|----------|------|----------|\n")
    for datatype in DATATYPE_ORDER:
        title = datatype_title(datatype)
        # Size-file rows are keyed by title; assumes fory_profiler emits the
        # same names as datatype_title — TODO confirm against its output.
        if title not in sizes:
            continue
        entry = sizes[title]
        report.append(f"| {title} | {entry['fory']} | {entry['protobuf']} |\n")
| |
| |
def main():
    """Entry point: parse args, load data, render plots, write the report."""
    args = parse_args()

    benchmark_results = load_benchmark_results(args.log_file)
    serialized_sizes = load_serialized_sizes(args.size_file)
    host_info = get_system_info(args.log_file)

    images = generate_plots(benchmark_results, args.output_dir)
    report_path = write_report(
        host_info,
        benchmark_results,
        serialized_sizes,
        images,
        args.output_dir,
        args.plot_prefix,
    )

    print(f"✅ Plots saved in: {args.output_dir}")
    print(f"📄 Markdown report generated at: {report_path}")


if __name__ == "__main__":
    main()