blob: 5e3cedb90e5b8e308698cbab5280df00b75b36ab [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Process Fory/Kryo/Fst/Hession benchmark data: refresh the Java benchmark
README tables and render the combined throughput plots.
"""
import matplotlib.pyplot as plt
import os
import pandas as pd
from pathlib import Path
import re
# Directory that contains this script (symlinks resolved).
dir_path = os.path.dirname(os.path.realpath(__file__))
# Two directory levels above this script; treated as the repository root.
repo_root = Path(dir_path).parent.parent
# Location of the Java benchmark docs, the raw CSV data, and the README
# whose tables this script rewrites.
java_benchmark_dir = repo_root / "docs/benchmarks/java"
java_benchmark_data_dir = java_benchmark_dir / "data"
java_benchmark_readme = java_benchmark_dir / "README.md"
# Preferred left-to-right order of the library columns in the README tables.
# Libraries missing from the data are skipped; libraries not listed here are
# appended afterwards in alphabetical order (see _pivot_lib_columns).
lib_order = [
    "Fory",
    "ForyMetaShared",
    "Kryo",
    "Fst",
    "Hession",
    "Jdk",
    "Protostuff",
]
# CSV files (relative to the data directory) holding the regular
# serialization/deserialization benchmark results.
java_serialization_files = [
    "jmh-jdk-11-serialization.csv",
    "jmh-jdk-11-deserialization.csv",
]
# CSV file (relative to the data directory) holding the zero-copy results.
java_zero_copy_file = "jmh-jdk-11-zerocopy.csv"
# Combined plots written below the Java benchmark docs directory.
#
# Each group describes one output image:
#   "alt":      human-readable description of the image.
#   "combined": output path, joined onto the base directory passed to
#               _generate_direct_combined_plots.
#   "sources":  one entry per sub-chart, left to right. Only the file name of
#               each entry is parsed (see _parse_chart_spec) to select the
#               benchmark, object/data type and buffer type to draw.
#
# "Consistent" groups must only reference plain serialize/deserialize charts
# and "Compatible" groups only the *_compatible ones; the SAMPLE entries of
# the two off-heap serialization groups used to be swapped.
java_plot_combined_groups = [
    {
        "alt": "Java Heap Schema Consistent Serialization",
        "combined": "java_heap_serialize_consistent.png",
        "sources": [
            "serialization/bench_serialize_STRUCT_to_array_tps.png",
            "serialization/bench_serialize_STRUCT2_to_array_tps.png",
            "serialization/bench_serialize_MEDIA_CONTENT_to_array_tps.png",
            "serialization/bench_serialize_SAMPLE_to_array_tps.png",
        ],
    },
    {
        "alt": "Java Heap Schema Compatible Serialization",
        "combined": "java_heap_serialize_compatible.png",
        "sources": [
            "serialization/bench_serialize_compatible_STRUCT_to_array_tps.png",
            "serialization/bench_serialize_compatible_STRUCT2_to_array_tps.png",
            "compatible/bench_serialize_compatible_MEDIA_CONTENT_to_array_tps.png",
            "serialization/bench_serialize_compatible_SAMPLE_to_array_tps.png",
        ],
    },
    {
        "alt": "Java Heap Schema Consistent Deserialization",
        "combined": "java_heap_deserialize_consistent.png",
        "sources": [
            "deserialization/bench_deserialize_STRUCT_from_array_tps.png",
            "deserialization/bench_deserialize_STRUCT2_from_array_tps.png",
            "deserialization/bench_deserialize_MEDIA_CONTENT_from_array_tps.png",
            "deserialization/bench_deserialize_SAMPLE_from_array_tps.png",
        ],
    },
    {
        "alt": "Java Heap Schema Compatible Deserialization",
        "combined": "java_heap_deserialize_compatible.png",
        "sources": [
            "deserialization/bench_deserialize_compatible_STRUCT_from_array_tps.png",
            "deserialization/bench_deserialize_compatible_STRUCT2_from_array_tps.png",
            "compatible/bench_deserialize_compatible_MEDIA_CONTENT_from_array_tps.png",
            "deserialization/bench_deserialize_compatible_SAMPLE_from_array_tps.png",
        ],
    },
    {
        "alt": "Java Off Heap Schema Consistent Serialization",
        "combined": "java_offheap_serialize_consistent.png",
        "sources": [
            "serialization/bench_serialize_STRUCT_to_directBuffer_tps.png",
            "serialization/bench_serialize_STRUCT2_to_directBuffer_tps.png",
            "serialization/bench_serialize_MEDIA_CONTENT_to_directBuffer_tps.png",
            "serialization/bench_serialize_SAMPLE_to_directBuffer_tps.png",
        ],
    },
    {
        "alt": "Java Off Heap Schema Compatible Serialization",
        "combined": "java_offheap_serialize_compatible.png",
        "sources": [
            "compatible/bench_serialize_compatible_STRUCT_to_directBuffer_tps.png",
            "serialization/bench_serialize_compatible_STRUCT2_to_directBuffer_tps.png",
            "serialization/bench_serialize_compatible_MEDIA_CONTENT_to_directBuffer_tps.png",
            "serialization/bench_serialize_compatible_SAMPLE_to_directBuffer_tps.png",
        ],
    },
    {
        "alt": "Java Off Heap Schema Consistent Deserialization",
        "combined": "java_offheap_deserialize_consistent.png",
        "sources": [
            "deserialization/bench_deserialize_STRUCT_from_directBuffer_tps.png",
            "deserialization/bench_deserialize_STRUCT2_from_directBuffer_tps.png",
            "deserialization/bench_deserialize_MEDIA_CONTENT_from_directBuffer_tps.png",
            "deserialization/bench_deserialize_SAMPLE_from_directBuffer_tps.png",
        ],
    },
    {
        "alt": "Java Off Heap Schema Compatible Deserialization",
        "combined": "java_offheap_deserialize_compatible.png",
        "sources": [
            "compatible/bench_deserialize_compatible_STRUCT_from_directBuffer_tps.png",
            "deserialization/bench_deserialize_compatible_STRUCT2_from_directBuffer_tps.png",
            "deserialization/bench_deserialize_compatible_MEDIA_CONTENT_from_directBuffer_tps.png",
            "deserialization/bench_deserialize_compatible_SAMPLE_from_directBuffer_tps.png",
        ],
    },
    {
        "alt": "Java Zero Copy Serialization",
        "combined": "java_zero_copy_serialize.png",
        "sources": [
            "zerocopy/zero_copy_bench_serialize_BUFFER_to_array_tps.png",
            "zerocopy/zero_copy_bench_serialize_BUFFER_to_directBuffer_tps.png",
            "zerocopy/zero_copy_bench_serialize_PRIMITIVE_ARRAY_to_array_tps.png",
            "zerocopy/zero_copy_bench_serialize_PRIMITIVE_ARRAY_to_directBuffer_tps.png",
        ],
    },
    {
        "alt": "Java Zero Copy Deserialization",
        "combined": "java_zero_copy_deserialize.png",
        "sources": [
            "zerocopy/zero_copy_bench_deserialize_BUFFER_from_array_tps.png",
            "zerocopy/zero_copy_bench_deserialize_BUFFER_from_directBuffer_tps.png",
            "zerocopy/zero_copy_bench_deserialize_PRIMITIVE_ARRAY_from_array_tps.png",
            "zerocopy/zero_copy_bench_deserialize_PRIMITIVE_ARRAY_from_directBuffer_tps.png",
        ],
    },
]
# Combined plots whose "combined" output paths are relative to the repository
# root (the script entry point passes repo_root as the base directory).
# The structure matches java_plot_combined_groups: only the file name of each
# "sources" entry is parsed to decide which chart to draw, and "alt" is a
# human-readable description that the code visible in this file does not read.
repo_plot_combined_groups = [
    {
        "alt": "Java Serialization Throughput",
        "combined": "docs/benchmarks/java/java_repo_serialization_throughput.png",
        "sources": [
            "docs/benchmarks/java/compatible/bench_serialize_compatible_STRUCT_to_directBuffer_tps.png",
            "docs/benchmarks/java/compatible/bench_serialize_compatible_MEDIA_CONTENT_to_array_tps.png",
            "docs/benchmarks/java/serialization/bench_serialize_MEDIA_CONTENT_to_array_tps.png",
            "docs/benchmarks/java/serialization/bench_serialize_SAMPLE_to_array_tps.png",
        ],
    },
    {
        "alt": "Java Deserialization Throughput",
        "combined": "docs/benchmarks/java/java_repo_deserialization_throughput.png",
        "sources": [
            "docs/benchmarks/java/compatible/bench_deserialize_compatible_STRUCT_from_directBuffer_tps.png",
            "docs/benchmarks/java/compatible/bench_deserialize_compatible_MEDIA_CONTENT_from_array_tps.png",
            "docs/benchmarks/java/deserialization/bench_deserialize_MEDIA_CONTENT_from_array_tps.png",
            "docs/benchmarks/java/deserialization/bench_deserialize_SAMPLE_from_array_tps.png",
        ],
    },
]
def _to_markdown(df: pd.DataFrame):
    """Render *df* as a pipe-delimited Markdown table.

    The first row holds the column names, the second a ``-------`` separator
    per column, and every following row the ``str()`` of each cell. Rows are
    joined with newlines; no trailing newline is added.
    """
    header_row = df.columns.values.tolist()
    separator_row = ["-------"] * len(df.columns)
    table_rows = [header_row, separator_row, *df.values.tolist()]
    return "\n".join(
        "| " + " | ".join(str(cell) for cell in row) + " |" for row in table_rows
    )
def _format_tps(value):
    """Format a throughput value with six decimals; missing values become ""."""
    return "" if pd.isna(value) else f"{float(value):.6f}"
def _pivot_lib_columns(df: pd.DataFrame, index_columns):
    """Pivot long-format results into one ``Tps`` column per library.

    Args:
        df: Long-format frame with ``Lib`` and ``Tps`` columns plus every
            column named in *index_columns*.
        index_columns: List of columns identifying one table row.

    Returns:
        A frame with *index_columns* first, then the libraries from
        ``lib_order`` that are present, then any remaining libraries in
        alphabetical order. Library cells are formatted with ``_format_tps``;
        a ``references`` column, when present, is stringified and capitalized.
    """
    wide = df.pivot_table(
        index=index_columns,
        columns="Lib",
        values="Tps",
        aggfunc="first",  # keep the first value when a row/library repeats
        sort=False,  # preserve the caller's row order
    )
    wide = wide.reset_index().copy()

    present_columns = wide.columns.tolist()
    known_libs = [lib for lib in lib_order if lib in present_columns]
    already_placed = index_columns + known_libs
    other_libs = sorted(
        column for column in present_columns if column not in already_placed
    )
    lib_columns = known_libs + other_libs

    wide = wide[index_columns + lib_columns]
    if "references" in wide.columns:
        wide["references"] = wide["references"].astype(str).str.capitalize()
    for lib in lib_columns:
        wide[lib] = wide[lib].map(_format_tps)
    return wide
def _replace_table_section(content: str, heading: str, table_markdown: str):
    """Replace the body of one README section with *table_markdown*.

    The section starts at the first line that equals *heading* once stripped
    and runs until the next line starting with ``"### "`` (or the end of the
    document). The heading is kept, followed by a blank line, the table and
    another blank line.

    Returns:
        The updated document, stripped of trailing whitespace and terminated
        by exactly one newline.

    Raises:
        ValueError: If no line matches *heading*.
    """
    doc_lines = content.splitlines()
    heading_at = next(
        (pos for pos, text in enumerate(doc_lines) if text.strip() == heading),
        None,
    )
    if heading_at is None:
        raise ValueError(f"Failed to find section {heading}")

    body_start = heading_at + 1
    # Index of the next level-3 heading, or the document length if there is none.
    next_heading_at = next(
        (
            pos
            for pos in range(body_start, len(doc_lines))
            if doc_lines[pos].startswith("### ")
        ),
        len(doc_lines),
    )
    rebuilt = [
        *doc_lines[:body_start],
        "",
        table_markdown,
        "",
        *doc_lines[next_heading_at:],
    ]
    return "\n".join(rebuilt).rstrip() + "\n"
def _parse_chart_spec(source_path: str):
    """Decode a chart file name into the filters needed to draw it.

    Only the final path component is inspected. Regular benchmark charts
    yield ``kind="benchmark"`` with an ``objectType`` entry; zero-copy charts
    yield ``kind="zero_copy"`` with a ``dataType`` entry. Both also carry
    ``benchmark`` and ``bufferType``.

    Raises:
        ValueError: If the file name matches neither naming scheme.
    """
    file_name = Path(source_path).name
    # (kind, key that receives the type token, file-name pattern), tried in order.
    recognizers = (
        (
            "benchmark",
            "objectType",
            r"bench_(serialize(?:_compatible)?|deserialize(?:_compatible)?)_([A-Z0-9_]+)_(to|from)_(array|directBuffer)_tps\.png",
        ),
        (
            "zero_copy",
            "dataType",
            r"zero_copy_bench_(serialize|deserialize)_([A-Z_]+)_(to|from)_(array|directBuffer)_tps\.png",
        ),
    )
    for kind, type_key, pattern in recognizers:
        found = re.match(pattern, file_name)
        if found is None:
            continue
        # The to/from token is implied by the benchmark name and is not returned.
        benchmark, type_token, _direction, buffer_type = found.groups()
        return {
            "kind": kind,
            "benchmark": benchmark,
            type_key: type_token,
            "bufferType": buffer_type,
        }
    raise ValueError(f"Unsupported chart source path: {source_path}")
def _prepare_benchmark_plot_data(bench_df: pd.DataFrame):
    """Return a plot-ready copy of the regular benchmark results.

    Missing values are replaced by empty strings. When any benchmark name
    contains ``compatible``, the Jdk rows are duplicated with ``_compatible``
    appended to their benchmark name (presumably so Jdk also appears in the
    compatible charts; confirm against the benchmark suite). ``Tps`` is
    divided by ``scaler`` and rounded with ``format_scaler``.
    """
    plot_rows = bench_df.fillna("").copy()
    compatible_rows = plot_rows[plot_rows["Benchmark"].str.contains("compatible")]
    if not compatible_rows.empty:
        jdk_rows = plot_rows[plot_rows["Lib"].str.contains("Jdk")].copy()
        jdk_rows["Benchmark"] = jdk_rows["Benchmark"] + "_compatible"
        plot_rows = pd.concat([plot_rows, jdk_rows], ignore_index=True)
    scaled_tps = plot_rows["Tps"] / scaler
    plot_rows["Tps"] = scaled_tps.apply(format_scaler)
    return plot_rows
def _prepare_zero_copy_plot_data(zero_copy_df: pd.DataFrame):
    """Return a plot-ready copy of the zero-copy results.

    Missing values are replaced by empty strings and ``Tps`` is divided by
    ``scaler`` and rounded with ``format_scaler``.
    """
    plot_rows = zero_copy_df.fillna("").copy()
    scaled_tps = plot_rows["Tps"] / scaler
    plot_rows["Tps"] = scaled_tps.apply(format_scaler)
    return plot_rows
def _build_single_plot_frame(spec, benchmark_data, zero_copy_data):
    """Select and reshape the rows for one sub-chart.

    Args:
        spec: Chart description as returned by ``_parse_chart_spec``.
        benchmark_data: Prepared regular benchmark rows.
        zero_copy_data: Prepared zero-copy rows.

    Returns:
        ``(final_df, title, xlabel, width)``. ``final_df`` has one column per
        library and is indexed by ``references`` (benchmark charts) or
        ``array_size`` (any other kind); ``width`` is the bar width already
        multiplied by ``bar_width_scale``.
    """
    if spec["kind"] == "benchmark":
        frame = benchmark_data
        type_column, x_column = "objectType", "references"
        xlabel, base_width = "enable_references", 0.7
    else:
        frame = zero_copy_data
        type_column, x_column = "dataType", "array_size"
        xlabel, base_width = "array_size", 0.8

    matches_spec = (
        (frame["Benchmark"] == spec["benchmark"])
        & (frame[type_column] == spec[type_column])
        & (frame["bufferType"] == spec["bufferType"])
    )
    selected = frame[matches_spec][["Lib", x_column, "Tps"]]
    # Move "Lib" from the row index into the columns: one bar series per library.
    final_df = (
        selected.reset_index(drop=True).set_index(["Lib", x_column]).unstack("Lib")
    )

    direction = "to" if spec["benchmark"].startswith("serialize") else "from"
    title = f"{spec['benchmark']} {spec[type_column]} {direction} {spec['bufferType']}"
    return final_df, title, xlabel, base_width * bar_width_scale
def _plot_combined_group(group, benchmark_data, zero_copy_data, output_path: Path):
    """Draw the sub-charts of *group* side by side and save them as one image.

    The figure always has four panels in a single row; each entry of
    ``group["sources"]`` is drawn into the panel at the same position. Only
    the first panel gets a y-axis label. Parent directories of *output_path*
    are created when missing, and the figure is closed after saving.
    """
    figure, panels = plt.subplots(
        1, 4, figsize=(22, 6), gridspec_kw={"wspace": 0.15}
    )
    for panel_index, chart_source in enumerate(group["sources"]):
        panel = panels[panel_index]
        chart_df, chart_title, x_label, bar_width = _build_single_plot_frame(
            _parse_chart_spec(chart_source), benchmark_data, zero_copy_data
        )
        # Column labels are (value, Lib) pairs; the Lib level picks the bar colors.
        lib_names = chart_df.columns.to_frame()["Lib"]
        bar_colors = [color_map[lib] for lib in lib_names]
        chart_df.plot.bar(
            title=chart_title, color=bar_colors, ax=panel, width=bar_width
        )
        for bars in panel.containers:
            panel.bar_label(bars, fontsize=8)
        panel.set_xlabel(x_label)
        panel.set_ylabel(f"Tps/{scaler}" if panel_index == 0 else "")
        add_upper_right_legend(panel, lib_names)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    figure.savefig(
        output_path, dpi=170, bbox_inches="tight", pad_inches=0.03, facecolor="white"
    )
    plt.close(figure)
def _generate_direct_combined_plots(benchmark_df, zero_copy_df, base_dir: Path, groups):
    """Render one combined image per entry of *groups*.

    Both data frames are prepared once and reused for every group. Each
    image is written to ``base_dir / group["combined"]``.
    """
    prepared_benchmarks = _prepare_benchmark_plot_data(benchmark_df)
    prepared_zero_copy = _prepare_zero_copy_plot_data(zero_copy_df)
    for plot_group in groups:
        _plot_combined_group(
            plot_group,
            prepared_benchmarks,
            prepared_zero_copy,
            base_dir / plot_group["combined"],
        )
def _update_java_benchmark_readme(data_dir: Path, readme_path: Path):
    """Regenerate the benchmark tables inside the Java benchmark README.

    Loads the files named by ``java_serialization_files`` and
    ``java_zero_copy_file`` from *data_dir*, sorts and pivots them into one
    column per library, and replaces the bodies of the
    ``### Java Serialization`` and ``### Java Zero-copy`` sections of
    *readme_path* with the resulting Markdown tables.

    The README is read and written as UTF-8 explicitly, so the result does
    not depend on the locale's default encoding.

    Raises:
        ValueError: If either section heading is missing from the README
            (raised by ``_replace_table_section``).
    """
    benchmark_dfs = []
    for file_name in java_serialization_files:
        # process_data returns (zero-copy rows, regular rows); only the
        # regular rows are needed from these files.
        _, bench_df = process_data(str(data_dir / file_name))
        benchmark_dfs.append(bench_df)
    benchmark_df = pd.concat(benchmark_dfs, ignore_index=True)
    # Temporary rank columns give the table a fixed, human-friendly row order
    # instead of an alphabetical one; they are dropped again after sorting.
    benchmark_df = (
        benchmark_df.assign(
            _benchmark_order=benchmark_df["Benchmark"].map(
                {
                    "serialize": 0,
                    "serialize_compatible": 1,
                    "deserialize": 2,
                    "deserialize_compatible": 3,
                }
            ),
            _buffer_order=benchmark_df["bufferType"].map(
                {"array": 0, "directBuffer": 1}
            ),
            _object_order=benchmark_df["objectType"].map(
                {"STRUCT": 0, "STRUCT2": 1, "MEDIA_CONTENT": 2, "SAMPLE": 3}
            ),
        )
        .sort_values(
            ["_benchmark_order", "_object_order", "_buffer_order", "references"]
        )
        .drop(columns=["_benchmark_order", "_buffer_order", "_object_order"])
        .reset_index(drop=True)
    )
    benchmark_table = _pivot_lib_columns(
        benchmark_df, ["Benchmark", "objectType", "bufferType", "references"]
    )

    zero_copy_df, _ = process_data(str(data_dir / java_zero_copy_file))
    zero_copy_df = (
        zero_copy_df.assign(
            _benchmark_order=zero_copy_df["Benchmark"].map(
                {"serialize": 0, "deserialize": 1}
            ),
            _buffer_order=zero_copy_df["bufferType"].map(
                {"array": 0, "directBuffer": 1}
            ),
            _data_type_order=zero_copy_df["dataType"].map(
                {"BUFFER": 0, "PRIMITIVE_ARRAY": 1}
            ),
        )
        .sort_values(
            ["_benchmark_order", "array_size", "_buffer_order", "_data_type_order"]
        )
        .drop(columns=["_benchmark_order", "_buffer_order", "_data_type_order"])
        .reset_index(drop=True)
    )
    zero_copy_table = _pivot_lib_columns(
        zero_copy_df, ["Benchmark", "array_size", "bufferType", "dataType"]
    )

    readme_content = readme_path.read_text(encoding="utf-8")
    readme_content = _replace_table_section(
        readme_content, "### Java Serialization", _to_markdown(benchmark_table)
    )
    readme_content = _replace_table_section(
        readme_content, "### Java Zero-copy", _to_markdown(zero_copy_table)
    )
    readme_path.write_text(readme_content, encoding="utf-8")
def process_data(filepath: str):
    """Load a JMH result CSV and split it into zero-copy and regular rows.

    Columns whose name contains ``Score Error`` are dropped, ``Score`` is
    renamed to ``Tps`` and the ``Param: `` prefix is removed from parameter
    columns. For each non-empty subset the last dot-separated component of
    ``Benchmark`` is split at its first underscore into ``Lib`` (capitalized,
    with ``Forymetashared`` spelled ``ForyMetaShared``) and the remaining
    benchmark name, and the ``Threads`` column is dropped.

    Returns:
        ``(zero_copy_bench, bench)``: rows whose benchmark name contains
        ``ZeroCopy`` and all remaining rows.
    """
    raw = pd.read_csv(filepath)
    original_columns = list(raw.columns.values)
    error_columns = [name for name in original_columns if "Score Error" in name]
    renames = {
        name: name.replace("Param: ", "")
        for name in original_columns
        if "Param: " in name
    }
    renames["Score"] = "Tps"
    table = raw.drop(error_columns, axis=1).rename(renames, axis=1)

    def split_benchmark_name(rows):
        # An empty subset is returned untouched.
        if rows.shape[0] == 0:
            return rows
        method_name = rows["Benchmark"].str.rsplit(pat=".", n=1, expand=True)[1]
        rows[["Lib", "Benchmark"]] = method_name.str.split(
            pat="_", n=1, expand=True
        )
        # capitalize() lowercases everything after the first letter, so the
        # camel-case library name has to be restored afterwards.
        rows["Lib"] = (
            rows["Lib"].str.capitalize().replace({"Forymetashared": "ForyMetaShared"})
        )
        rows.drop(["Threads"], axis=1, inplace=True)
        return rows

    is_zero_copy = table["Benchmark"].str.contains("ZeroCopy")
    zero_copy_bench = split_benchmark_name(table[is_zero_copy].copy())
    bench = split_benchmark_name(table[~is_zero_copy].copy())
    return zero_copy_bench, bench
# Bar color per library, looked up by the "Lib" label when plotting.
# Values are either hex strings or RGB tuples with components in [0, 1].
color_map = {
    "Fory": "#FF6f01",  # Orange
    "ForyMetaShared": "#FFB266",  # Light orange
    # Alternative Kryo colors, left disabled:
    # "Kryo": (1, 0.5, 1),
    # "Kryo": (1, 0.84, 0.25),
    "Kryo": "#55BCC2",
    "Kryo_deserialize": "#55BCC2",
    "Fst": (0.90, 0.43, 0.5),
    "Hession": (0.80, 0.5, 0.6),
    "Hession_deserialize": (0.80, 0.5, 0.6),
    "Protostuff": (1, 0.84, 0.66),
    "Jdk": (0.55, 0.40, 0.45),
    "Jsonb": (0.45, 0.40, 0.55),
}
# Tps values are divided by this factor before plotting; the same number is
# shown in the y-axis label ("Tps/<scaler>").
scaler = 10000
# Multiplier applied to the base bar width of every chart.
bar_width_scale = 1.2
def format_scaler(x):
    """Round a scaled value for display as a bar label.

    Values above 100 are rounded to whole numbers; everything else keeps one
    decimal place.
    """
    return round(x) if x > 100 else round(x, 1)
def add_upper_right_legend(ax, labels):
    """Attach a framed legend to the upper-right corner of *ax*.

    Every label is converted to a string, and ``ForyMetaShared`` is broken
    across two lines to keep the legend narrow.
    """

    def wrap_long_name(label):
        return str(label).replace("ForyMetaShared", "ForyMeta\nShared")

    legend_options = {
        "loc": "upper right",
        "bbox_to_anchor": (0.98, 0.98),
        "borderaxespad": 0.2,
        "prop": {"size": 10},
        "frameon": True,
        "framealpha": 0.9,
    }
    ax.legend([wrap_long_name(label) for label in labels], **legend_options)
if __name__ == "__main__":
    # Regular benchmark rows from every serialization/deserialization CSV;
    # process_data returns (zero-copy rows, regular rows).
    benchmark_df = pd.concat(
        [
            process_data(str(java_benchmark_data_dir / file_name))[1]
            for file_name in java_serialization_files
        ],
        ignore_index=True,
    )
    zero_copy_df = process_data(str(java_benchmark_data_dir / java_zero_copy_file))[0]

    # Refresh the README tables first, then render both sets of combined plots.
    _update_java_benchmark_readme(java_benchmark_data_dir, java_benchmark_readme)
    for base_dir, plot_groups in (
        (java_benchmark_dir, java_plot_combined_groups),
        (repo_root, repo_plot_combined_groups),
    ):
        _generate_direct_combined_plots(
            benchmark_df, zero_copy_df, base_dir, plot_groups
        )