| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """ |
| process fory/kryo/fst/hession performance data |
| """ |
| |
| import matplotlib.pyplot as plt |
| import os |
| import pandas as pd |
| from pathlib import Path |
| import re |
| |
| dir_path = os.path.dirname(os.path.realpath(__file__)) |
| repo_root = Path(dir_path).parent.parent |
| java_benchmark_dir = repo_root / "docs/benchmarks/java" |
| java_benchmark_data_dir = java_benchmark_dir / "data" |
| java_benchmark_readme = java_benchmark_dir / "README.md" |
| |
| lib_order = [ |
| "Fory", |
| "ForyMetaShared", |
| "Kryo", |
| "Fst", |
| "Hession", |
| "Jdk", |
| "Protostuff", |
| ] |
| |
| java_serialization_files = [ |
| "jmh-jdk-11-serialization.csv", |
| "jmh-jdk-11-deserialization.csv", |
| ] |
| java_zero_copy_file = "jmh-jdk-11-zerocopy.csv" |
| |
| java_plot_combined_groups = [ |
| { |
| "alt": "Java Heap Schema Consistent Serialization", |
| "combined": "java_heap_serialize_consistent.png", |
| "sources": [ |
| "serialization/bench_serialize_STRUCT_to_array_tps.png", |
| "serialization/bench_serialize_STRUCT2_to_array_tps.png", |
| "serialization/bench_serialize_MEDIA_CONTENT_to_array_tps.png", |
| "serialization/bench_serialize_SAMPLE_to_array_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Heap Schema Compatible Serialization", |
| "combined": "java_heap_serialize_compatible.png", |
| "sources": [ |
| "serialization/bench_serialize_compatible_STRUCT_to_array_tps.png", |
| "serialization/bench_serialize_compatible_STRUCT2_to_array_tps.png", |
| "compatible/bench_serialize_compatible_MEDIA_CONTENT_to_array_tps.png", |
| "serialization/bench_serialize_compatible_SAMPLE_to_array_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Heap Schema Consistent Deserialization", |
| "combined": "java_heap_deserialize_consistent.png", |
| "sources": [ |
| "deserialization/bench_deserialize_STRUCT_from_array_tps.png", |
| "deserialization/bench_deserialize_STRUCT2_from_array_tps.png", |
| "deserialization/bench_deserialize_MEDIA_CONTENT_from_array_tps.png", |
| "deserialization/bench_deserialize_SAMPLE_from_array_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Heap Schema Compatible Deserialization", |
| "combined": "java_heap_deserialize_compatible.png", |
| "sources": [ |
| "deserialization/bench_deserialize_compatible_STRUCT_from_array_tps.png", |
| "deserialization/bench_deserialize_compatible_STRUCT2_from_array_tps.png", |
| "compatible/bench_deserialize_compatible_MEDIA_CONTENT_from_array_tps.png", |
| "deserialization/bench_deserialize_compatible_SAMPLE_from_array_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Off Heap Schema Consistent Serialization", |
| "combined": "java_offheap_serialize_consistent.png", |
| "sources": [ |
| "serialization/bench_serialize_STRUCT_to_directBuffer_tps.png", |
| "serialization/bench_serialize_STRUCT2_to_directBuffer_tps.png", |
| "serialization/bench_serialize_MEDIA_CONTENT_to_directBuffer_tps.png", |
| "serialization/bench_serialize_compatible_SAMPLE_to_directBuffer_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Off Heap Schema Compatible Serialization", |
| "combined": "java_offheap_serialize_compatible.png", |
| "sources": [ |
| "compatible/bench_serialize_compatible_STRUCT_to_directBuffer_tps.png", |
| "serialization/bench_serialize_compatible_STRUCT2_to_directBuffer_tps.png", |
| "serialization/bench_serialize_compatible_MEDIA_CONTENT_to_directBuffer_tps.png", |
| "serialization/bench_serialize_SAMPLE_to_directBuffer_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Off Heap Schema Consistent Deserialization", |
| "combined": "java_offheap_deserialize_consistent.png", |
| "sources": [ |
| "deserialization/bench_deserialize_STRUCT_from_directBuffer_tps.png", |
| "deserialization/bench_deserialize_STRUCT2_from_directBuffer_tps.png", |
| "deserialization/bench_deserialize_MEDIA_CONTENT_from_directBuffer_tps.png", |
| "deserialization/bench_deserialize_SAMPLE_from_directBuffer_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Off Heap Schema Compatible Deserialization", |
| "combined": "java_offheap_deserialize_compatible.png", |
| "sources": [ |
| "compatible/bench_deserialize_compatible_STRUCT_from_directBuffer_tps.png", |
| "deserialization/bench_deserialize_compatible_STRUCT2_from_directBuffer_tps.png", |
| "deserialization/bench_deserialize_compatible_MEDIA_CONTENT_from_directBuffer_tps.png", |
| "deserialization/bench_deserialize_compatible_SAMPLE_from_directBuffer_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Zero Copy Serialization", |
| "combined": "java_zero_copy_serialize.png", |
| "sources": [ |
| "zerocopy/zero_copy_bench_serialize_BUFFER_to_array_tps.png", |
| "zerocopy/zero_copy_bench_serialize_BUFFER_to_directBuffer_tps.png", |
| "zerocopy/zero_copy_bench_serialize_PRIMITIVE_ARRAY_to_array_tps.png", |
| "zerocopy/zero_copy_bench_serialize_PRIMITIVE_ARRAY_to_directBuffer_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Zero Copy Deserialization", |
| "combined": "java_zero_copy_deserialize.png", |
| "sources": [ |
| "zerocopy/zero_copy_bench_deserialize_BUFFER_from_array_tps.png", |
| "zerocopy/zero_copy_bench_deserialize_BUFFER_from_directBuffer_tps.png", |
| "zerocopy/zero_copy_bench_deserialize_PRIMITIVE_ARRAY_from_array_tps.png", |
| "zerocopy/zero_copy_bench_deserialize_PRIMITIVE_ARRAY_from_directBuffer_tps.png", |
| ], |
| }, |
| ] |
| |
| repo_plot_combined_groups = [ |
| { |
| "alt": "Java Serialization Throughput", |
| "combined": "docs/benchmarks/java/java_repo_serialization_throughput.png", |
| "sources": [ |
| "docs/benchmarks/java/compatible/bench_serialize_compatible_STRUCT_to_directBuffer_tps.png", |
| "docs/benchmarks/java/compatible/bench_serialize_compatible_MEDIA_CONTENT_to_array_tps.png", |
| "docs/benchmarks/java/serialization/bench_serialize_MEDIA_CONTENT_to_array_tps.png", |
| "docs/benchmarks/java/serialization/bench_serialize_SAMPLE_to_array_tps.png", |
| ], |
| }, |
| { |
| "alt": "Java Deserialization Throughput", |
| "combined": "docs/benchmarks/java/java_repo_deserialization_throughput.png", |
| "sources": [ |
| "docs/benchmarks/java/compatible/bench_deserialize_compatible_STRUCT_from_directBuffer_tps.png", |
| "docs/benchmarks/java/compatible/bench_deserialize_compatible_MEDIA_CONTENT_from_array_tps.png", |
| "docs/benchmarks/java/deserialization/bench_deserialize_MEDIA_CONTENT_from_array_tps.png", |
| "docs/benchmarks/java/deserialization/bench_deserialize_SAMPLE_from_array_tps.png", |
| ], |
| }, |
| ] |
| |
| |
| def _to_markdown(df: pd.DataFrame): |
| lines = list(df.values.tolist()) |
| width = len(df.columns) |
| lines.insert(0, df.columns.values.tolist()) |
| lines.insert(1, ["-------"] * width) |
| md_table = "\n".join( |
| ["| " + " | ".join([str(item) for item in line]) + " |" for line in lines] |
| ) |
| return md_table |
| |
| |
| def _format_tps(value): |
| if pd.isna(value): |
| return "" |
| return f"{float(value):.6f}" |
| |
| |
| def _pivot_lib_columns(df: pd.DataFrame, index_columns): |
| table_df = ( |
| df.pivot_table( |
| index=index_columns, |
| columns="Lib", |
| values="Tps", |
| aggfunc="first", |
| sort=False, |
| ) |
| .reset_index() |
| .copy() |
| ) |
| available_libs = table_df.columns.tolist() |
| sorted_lib_columns = [name for name in lib_order if name in available_libs] |
| extra_lib_columns = sorted( |
| [ |
| name |
| for name in available_libs |
| if name not in index_columns + sorted_lib_columns |
| ] |
| ) |
| table_df = table_df[index_columns + sorted_lib_columns + extra_lib_columns] |
| if "references" in table_df.columns: |
| table_df["references"] = table_df["references"].astype(str).str.capitalize() |
| for column in sorted_lib_columns + extra_lib_columns: |
| table_df[column] = table_df[column].map(_format_tps) |
| return table_df |
| |
| |
| def _replace_table_section(content: str, heading: str, table_markdown: str): |
| lines = content.splitlines() |
| start_index = None |
| for index, line in enumerate(lines): |
| if line.strip() == heading: |
| start_index = index |
| break |
| if start_index is None: |
| raise ValueError(f"Failed to find section {heading}") |
| end_index = len(lines) |
| for index in range(start_index + 1, len(lines)): |
| if lines[index].startswith("### "): |
| end_index = index |
| break |
| updated_lines = lines[: start_index + 1] + ["", table_markdown, ""] |
| if end_index < len(lines): |
| updated_lines.extend(lines[end_index:]) |
| return "\n".join(updated_lines).rstrip() + "\n" |
| |
| |
| def _parse_chart_spec(source_path: str): |
| name = Path(source_path).name |
| benchmark_match = re.match( |
| r"bench_(serialize(?:_compatible)?|deserialize(?:_compatible)?)_([A-Z0-9_]+)_(to|from)_(array|directBuffer)_tps\.png", |
| name, |
| ) |
| if benchmark_match is not None: |
| return { |
| "kind": "benchmark", |
| "benchmark": benchmark_match.group(1), |
| "objectType": benchmark_match.group(2), |
| "bufferType": benchmark_match.group(4), |
| } |
| zero_copy_match = re.match( |
| r"zero_copy_bench_(serialize|deserialize)_([A-Z_]+)_(to|from)_(array|directBuffer)_tps\.png", |
| name, |
| ) |
| if zero_copy_match is not None: |
| return { |
| "kind": "zero_copy", |
| "benchmark": zero_copy_match.group(1), |
| "dataType": zero_copy_match.group(2), |
| "bufferType": zero_copy_match.group(4), |
| } |
| raise ValueError(f"Unsupported chart source path: {source_path}") |
| |
| |
| def _prepare_benchmark_plot_data(bench_df: pd.DataFrame): |
| data = bench_df.fillna("").copy() |
| compatible = data[data["Benchmark"].str.contains("compatible")] |
| if len(compatible) > 0: |
| jdk = data[data["Lib"].str.contains("Jdk")].copy() |
| jdk["Benchmark"] = jdk["Benchmark"] + "_compatible" |
| data = pd.concat([data, jdk], ignore_index=True) |
| data["Tps"] = (data["Tps"] / scaler).apply(format_scaler) |
| return data |
| |
| |
| def _prepare_zero_copy_plot_data(zero_copy_df: pd.DataFrame): |
| data = zero_copy_df.fillna("").copy() |
| data["Tps"] = (data["Tps"] / scaler).apply(format_scaler) |
| return data |
| |
| |
| def _build_single_plot_frame(spec, benchmark_data, zero_copy_data): |
| if spec["kind"] == "benchmark": |
| sub_df = benchmark_data[ |
| (benchmark_data["Benchmark"] == spec["benchmark"]) |
| & (benchmark_data["objectType"] == spec["objectType"]) |
| & (benchmark_data["bufferType"] == spec["bufferType"]) |
| ][["Lib", "references", "Tps"]] |
| final_df = ( |
| sub_df.reset_index(drop=True) |
| .set_index(["Lib", "references"]) |
| .unstack("Lib") |
| ) |
| if spec["benchmark"].startswith("serialize"): |
| title = f"{spec['benchmark']} {spec['objectType']} to {spec['bufferType']}" |
| else: |
| title = ( |
| f"{spec['benchmark']} {spec['objectType']} from {spec['bufferType']}" |
| ) |
| xlabel = "enable_references" |
| width = 0.7 * bar_width_scale |
| else: |
| sub_df = zero_copy_data[ |
| (zero_copy_data["Benchmark"] == spec["benchmark"]) |
| & (zero_copy_data["dataType"] == spec["dataType"]) |
| & (zero_copy_data["bufferType"] == spec["bufferType"]) |
| ][["Lib", "array_size", "Tps"]] |
| final_df = ( |
| sub_df.reset_index(drop=True) |
| .set_index(["Lib", "array_size"]) |
| .unstack("Lib") |
| ) |
| if spec["benchmark"].startswith("serialize"): |
| title = f"{spec['benchmark']} {spec['dataType']} to {spec['bufferType']}" |
| else: |
| title = f"{spec['benchmark']} {spec['dataType']} from {spec['bufferType']}" |
| xlabel = "array_size" |
| width = 0.8 * bar_width_scale |
| return final_df, title, xlabel, width |
| |
| |
| def _plot_combined_group(group, benchmark_data, zero_copy_data, output_path: Path): |
| fig, axes = plt.subplots(1, 4, figsize=(22, 6), gridspec_kw={"wspace": 0.15}) |
| for axis_index, source_path in enumerate(group["sources"]): |
| axis = axes[axis_index] |
| spec = _parse_chart_spec(source_path) |
| final_df, title, xlabel, width = _build_single_plot_frame( |
| spec, benchmark_data, zero_copy_data |
| ) |
| libs = final_df.columns.to_frame()["Lib"] |
| color = [color_map[lib] for lib in libs] |
| final_df.plot.bar(title=title, color=color, ax=axis, width=width) |
| for container in axis.containers: |
| axis.bar_label(container, fontsize=8) |
| axis.set_xlabel(xlabel) |
| if axis_index == 0: |
| axis.set_ylabel(f"Tps/{scaler}") |
| else: |
| axis.set_ylabel("") |
| add_upper_right_legend(axis, libs) |
| output_path.parent.mkdir(parents=True, exist_ok=True) |
| fig.savefig( |
| output_path, dpi=170, bbox_inches="tight", pad_inches=0.03, facecolor="white" |
| ) |
| plt.close(fig) |
| |
| |
| def _generate_direct_combined_plots(benchmark_df, zero_copy_df, base_dir: Path, groups): |
| benchmark_data = _prepare_benchmark_plot_data(benchmark_df) |
| zero_copy_data = _prepare_zero_copy_plot_data(zero_copy_df) |
| for group in groups: |
| output_path = base_dir / group["combined"] |
| _plot_combined_group(group, benchmark_data, zero_copy_data, output_path) |
| |
| |
| def _update_java_benchmark_readme(data_dir: Path, readme_path: Path): |
| benchmark_dfs = [] |
| for file_name in java_serialization_files: |
| _, bench_df = process_data(str(data_dir / file_name)) |
| benchmark_dfs.append(bench_df) |
| benchmark_df = pd.concat(benchmark_dfs, ignore_index=True) |
| benchmark_df = ( |
| benchmark_df.assign( |
| _benchmark_order=benchmark_df["Benchmark"].map( |
| { |
| "serialize": 0, |
| "serialize_compatible": 1, |
| "deserialize": 2, |
| "deserialize_compatible": 3, |
| } |
| ), |
| _buffer_order=benchmark_df["bufferType"].map( |
| {"array": 0, "directBuffer": 1} |
| ), |
| _object_order=benchmark_df["objectType"].map( |
| {"STRUCT": 0, "STRUCT2": 1, "MEDIA_CONTENT": 2, "SAMPLE": 3} |
| ), |
| ) |
| .sort_values( |
| ["_benchmark_order", "_object_order", "_buffer_order", "references"] |
| ) |
| .drop(columns=["_benchmark_order", "_buffer_order", "_object_order"]) |
| .reset_index(drop=True) |
| ) |
| benchmark_table = _pivot_lib_columns( |
| benchmark_df, ["Benchmark", "objectType", "bufferType", "references"] |
| ) |
| |
| zero_copy_df, _ = process_data(str(data_dir / java_zero_copy_file)) |
| zero_copy_df = ( |
| zero_copy_df.assign( |
| _benchmark_order=zero_copy_df["Benchmark"].map( |
| {"serialize": 0, "deserialize": 1} |
| ), |
| _buffer_order=zero_copy_df["bufferType"].map( |
| {"array": 0, "directBuffer": 1} |
| ), |
| _data_type_order=zero_copy_df["dataType"].map( |
| {"BUFFER": 0, "PRIMITIVE_ARRAY": 1} |
| ), |
| ) |
| .sort_values( |
| ["_benchmark_order", "array_size", "_buffer_order", "_data_type_order"] |
| ) |
| .drop(columns=["_benchmark_order", "_buffer_order", "_data_type_order"]) |
| .reset_index(drop=True) |
| ) |
| zero_copy_table = _pivot_lib_columns( |
| zero_copy_df, ["Benchmark", "array_size", "bufferType", "dataType"] |
| ) |
| |
| readme_content = readme_path.read_text() |
| readme_content = _replace_table_section( |
| readme_content, "### Java Serialization", _to_markdown(benchmark_table) |
| ) |
| readme_content = _replace_table_section( |
| readme_content, "### Java Zero-copy", _to_markdown(zero_copy_table) |
| ) |
| readme_path.write_text(readme_content) |
| |
| |
| def process_data(filepath: str): |
| df = pd.read_csv(filepath) |
| columns = list(df.columns.values) |
| for column in columns: |
| if "Score Error" in column: |
| df.drop([column], axis=1, inplace=True) |
| if column == "Score": |
| df.rename({"Score": "Tps"}, axis=1, inplace=True) |
| if "Param: " in column: |
| df.rename({column: column.replace("Param: ", "")}, axis=1, inplace=True) |
| |
| def process_df(bench_df): |
| if bench_df.shape[0] > 0: |
| benchmark_name = bench_df["Benchmark"].str.rsplit( |
| pat=".", n=1, expand=True |
| )[1] |
| bench_df[["Lib", "Benchmark"]] = benchmark_name.str.split( |
| pat="_", n=1, expand=True |
| ) |
| bench_df["Lib"] = bench_df["Lib"].str.capitalize() |
| bench_df["Lib"] = bench_df["Lib"].replace( |
| {"Forymetashared": "ForyMetaShared"} |
| ) |
| bench_df.drop(["Threads"], axis=1, inplace=True) |
| return bench_df |
| |
| zero_copy_bench = df[df["Benchmark"].str.contains("ZeroCopy")].copy() |
| zero_copy_bench = process_df(zero_copy_bench) |
| |
| bench = df[~df["Benchmark"].str.contains("ZeroCopy")].copy() |
| bench = process_df(bench) |
| |
| return zero_copy_bench, bench |
| |
| |
| color_map = { |
| "Fory": "#FF6f01", # Orange |
| "ForyMetaShared": "#FFB266", # Shallow orange |
| # "Kryo": (1, 0.5, 1), |
| # "Kryo": (1, 0.84, 0.25), |
| "Kryo": "#55BCC2", |
| "Kryo_deserialize": "#55BCC2", |
| "Fst": (0.90, 0.43, 0.5), |
| "Hession": (0.80, 0.5, 0.6), |
| "Hession_deserialize": (0.80, 0.5, 0.6), |
| "Protostuff": (1, 0.84, 0.66), |
| "Jdk": (0.55, 0.40, 0.45), |
| "Jsonb": (0.45, 0.40, 0.55), |
| } |
| |
| |
| scaler = 10000 |
| bar_width_scale = 1.2 |
| |
| |
| def format_scaler(x): |
| if x > 100: |
| return round(x) |
| else: |
| return round(x, 1) |
| |
| |
| def add_upper_right_legend(ax, labels): |
| legend_labels = [ |
| str(label).replace("ForyMetaShared", "ForyMeta\nShared") for label in labels |
| ] |
| ax.legend( |
| legend_labels, |
| loc="upper right", |
| bbox_to_anchor=(0.98, 0.98), |
| borderaxespad=0.2, |
| prop={"size": 10}, |
| frameon=True, |
| framealpha=0.9, |
| ) |
| |
| |
| if __name__ == "__main__": |
| benchmark_dfs = [] |
| for file_name in java_serialization_files: |
| _, bench_df = process_data(str(java_benchmark_data_dir / file_name)) |
| benchmark_dfs.append(bench_df) |
| benchmark_df = pd.concat(benchmark_dfs, ignore_index=True) |
| zero_copy_df, _ = process_data(str(java_benchmark_data_dir / java_zero_copy_file)) |
| |
| _update_java_benchmark_readme(java_benchmark_data_dir, java_benchmark_readme) |
| _generate_direct_combined_plots( |
| benchmark_df, zero_copy_df, java_benchmark_dir, java_plot_combined_groups |
| ) |
| _generate_direct_combined_plots( |
| benchmark_df, zero_copy_df, repo_root, repo_plot_combined_groups |
| ) |