blob: 4301fc5650f9a508cdf92c833948775ba2b77db6 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Apache Fory™ vs Pickle vs Msgpack CPython Benchmark Suite
Microbenchmark comparing Apache Fory™, Pickle, and Msgpack serialization
performance in CPython.
Usage:
python fory_benchmark.py [OPTIONS]
Benchmark Options:
--operation MODE
Benchmark operation mode. Default: roundtrip
Available: roundtrip, serialize, deserialize
--benchmarks BENCHMARK_LIST
Comma-separated list of benchmarks to run. Default: all
Available: dict, large_dict, dict_group, tuple, large_tuple,
large_float_tuple, large_boolean_tuple, list, large_list, struct, slots_struct
--serializers SERIALIZER_LIST
Comma-separated list of serializers to benchmark. Default: all
Available: fory, pickle, msgpack
Example: --serializers fory,pickle
--no-ref
Disable reference tracking for Fory (enabled by default)
--warmup N
Number of warmup iterations (default: 3)
--iterations N
Number of benchmark iterations (default: 20)
--repeat N
Number of times to repeat each iteration (default: 5)
--number N
Number of times to call function per measurement (inner loop, default: 1000)
--disable-cython
Use pure Python mode for Fory (debugging)
--help
Show help message and exit
Examples:
# Run all benchmarks with all serializers
python fory_benchmark.py
# Benchmark serialize only
python fory_benchmark.py --operation serialize
# Benchmark deserialize only
python fory_benchmark.py --operation deserialize
# Run specific benchmarks with both serializers
python fory_benchmark.py --benchmarks dict,large_dict,struct,slots_struct
# Compare only Fory performance
python fory_benchmark.py --serializers fory
# Compare only Pickle performance
python fory_benchmark.py --serializers pickle
# Compare only Msgpack performance
python fory_benchmark.py --serializers msgpack
# Run without reference tracking for Fory
python fory_benchmark.py --no-ref
# Run with more iterations for better accuracy
python fory_benchmark.py --iterations 50 --repeat 10
# Debug with pure Python mode
python fory_benchmark.py --disable-cython --benchmarks dict
"""
import argparse
import array
from dataclasses import dataclass, fields, is_dataclass
import datetime
import pickle
import random
import statistics
import sys
import timeit
from typing import Any, Dict, List
import pyfory
try:
import msgpack
except ImportError:
msgpack = None
# The benchmark case is rewritten from pyperformance bm_pickle
# https://github.com/python/pyperformance/blob/main/pyperformance/data-files/benchmarks/bm_pickle/run_benchmark.py
# Seed shared by every random generator below so payloads are reproducible.
BENCHMARK_RANDOM_SEED = 5
# User-profile style dict with mixed value types (str/int/date/list).
DICT = {
    "ads_flags": 0,
    "age": 18,
    "birthday": datetime.date(1980, 5, 7),
    "bulletin_count": 0,
    "comment_count": 0,
    "country": "BR",
    "encrypted_id": "G9urXXAJwjE",
    "favorite_count": 9,
    "first_name": "",
    "flags": 412317970704,
    "friend_count": 0,
    "gender": "m",
    "gender_for_display": "Male",
    "id": 302935349,
    "is_custom_profile_icon": 0,
    "last_name": "",
    "locale_preference": "pt_BR",
    "member": 0,
    "tags": ["a", "b", "c", "d", "e", "f", "g"],
    "profile_foo_id": 827119638,
    "secure_encrypted_id": "Z_xxx2dYx3t4YAdnmfgyKw",
    "session_number": 2,
    "signup_id": "201-19225-223",
    "status": "A",
    "theme": 1,
    "time_created": 1225237014,
    "time_updated": 1233134493,
    "unread_message_count": 0,
    "user_group": "0",
    "username": "collinwinter",
    "play_count": 9,
    "view_count": 7,
    "zip": "",
}
# 1025-entry str->int dict to stress map serialization.
LARGE_DICT = {str(i): i for i in range(2**10 + 1)}
# Small heterogeneous payload: a list of ids plus a count.
TUPLE = (
    [
        265867233,
        265868503,
        265252341,
        265243910,
        265879514,
        266219766,
        266021701,
        265843726,
        265592821,
        265246784,
        265853180,
        45526486,
        265463699,
        265848143,
        265863062,
        265392591,
        265877490,
        265823665,
        265828884,
        265753032,
    ],
    60,
)
# 257-element homogeneous tuples exercising int, float, and bool encoding.
LARGE_TUPLE = tuple(range(2**8 + 1))
benchmark_random = random.Random(BENCHMARK_RANDOM_SEED)
LARGE_FLOAT_TUPLE = tuple(benchmark_random.random() * 10000 for _ in range(2**8 + 1))
LARGE_BOOLEAN_TUPLE = tuple(benchmark_random.random() > 0.5 for _ in range(2**8 + 1))
# Nested list-of-lists payload, plus a flat 257-element int list.
LIST = [[list(range(10)), list(range(10))] for _ in range(10)]
LARGE_LIST = [i for i in range(2**8 + 1)]
def mutate_dict(orig_dict, random_source):
    """Return a copy of *orig_dict* with values replaced by random data.

    Each entry whose key is an int, bytes, or str gets a fresh random value
    coerced to the key's type; entries with other key types are kept as-is.
    A random number is drawn for every key (even unchanged ones) so the
    random stream matches the original bm_pickle implementation.

    Args:
        orig_dict: Mapping to copy; the original is not modified.
        random_source: A random.Random instance supplying the new values.

    Returns:
        A new dict with the same keys and mutated values.
    """
    new_dict = dict(orig_dict)
    # The replacement depends only on the key, so iterate keys directly
    # (the original bound an unused `value` via .items()).
    for key in new_dict:
        rand_val = random_source.random() * sys.maxsize
        if isinstance(key, (int, bytes, str)):
            new_dict[key] = type(key)(rand_val)
    return new_dict
# Fresh seeded Random so DICT_GROUP contents are reproducible across runs.
random_source = random.Random(BENCHMARK_RANDOM_SEED)
# Three randomized variants of DICT, benchmarked as a group.
DICT_GROUP = [mutate_dict(DICT, random_source) for _ in range(3)]
@dataclass
class Struct1:
    # Benchmark struct mixing a dynamic field, strings, containers, and
    # fields annotated with pyfory fixed-width types (int8..float64,
    # int16_array); the annotations are read by Fory at registration time.
    f1: Any = None
    f2: str = None
    f3: List[str] = None
    f4: Dict[pyfory.int8, pyfory.int32] = None
    f5: pyfory.int8 = None
    f6: pyfory.int16 = None
    f7: pyfory.int32 = None
    f8: pyfory.int64 = None
    f9: pyfory.float32 = None
    f10: pyfory.float64 = None
    f11: pyfory.int16_array = None
    f12: List[pyfory.int16] = None
@dataclass
class Struct2:
    # Small nested struct embedded as field f1 of the STRUCT_OBJECT payloads.
    f1: Any
    f2: Dict[pyfory.int8, pyfory.int32]
@pyfory.dataslots
@dataclass
class SlotsStruct:
    # Same field layout as Struct1 but slotted via pyfory.dataslots, so
    # the benchmark can compare dict-based vs slots-based instances.
    f1: Any = None
    f2: str = None
    f3: List[str] = None
    f4: Dict[pyfory.int8, pyfory.int32] = None
    f5: pyfory.int8 = None
    f6: pyfory.int16 = None
    f7: pyfory.int32 = None
    f8: pyfory.int64 = None
    f9: pyfory.float32 = None
    f10: pyfory.float64 = None
    f11: pyfory.int16_array = None
    f12: List[pyfory.int16] = None
# Fully-populated Struct1 exercising every field kind, including a nested
# Struct2 and boundary values for each fixed-width integer type.
STRUCT_OBJECT = Struct1(
    f1=Struct2(f1=True, f2={-1: 2}),
    f2="abc",
    f3=["abc", "abc"],
    f4={1: 2},
    f5=2**7 - 1,
    f6=2**15 - 1,
    f7=2**31 - 1,
    f8=2**63 - 1,
    f9=1.0 / 2,
    f10=1 / 3.0,
    f11=array.array("h", [-1, 4]),
    f12=[-1, 4],
)
# Identical payload to STRUCT_OBJECT, but using the slotted dataclass.
SLOTS_STRUCT_OBJECT = SlotsStruct(
    f1=Struct2(f1=True, f2={-1: 2}),
    f2="abc",
    f3=["abc", "abc"],
    f4={1: 2},
    f5=2**7 - 1,
    f6=2**15 - 1,
    f7=2**31 - 1,
    f8=2**63 - 1,
    f9=1.0 / 2,
    f10=1 / 3.0,
    f11=array.array("h", [-1, 4]),
    f12=[-1, 4],
)
# Global fory instances
# Two long-lived instances (with and without reference tracking) so per-call
# construction cost is excluded from the timed functions below.
fory_with_ref = pyfory.Fory(ref=True)
fory_without_ref = pyfory.Fory(ref=False)
# Register all custom types on both instances
for fory_instance in (fory_with_ref, fory_without_ref):
    fory_instance.register_type(Struct1)
    fory_instance.register_type(Struct2)
    fory_instance.register_type(SlotsStruct)
def fory_roundtrip(ref, obj):
    """Serialize and immediately deserialize *obj* with a shared Fory instance.

    Args:
        ref: When True, use the reference-tracking instance.
        obj: Payload to round-trip; the result is discarded.
    """
    instance = fory_with_ref if ref else fory_without_ref
    instance.deserialize(instance.serialize(obj))
def fory_serialize(ref, obj):
    """Serialize *obj* with the ref-tracking or plain Fory instance; discard result."""
    (fory_with_ref if ref else fory_without_ref).serialize(obj)
def fory_deserialize(ref, binary):
    """Deserialize *binary* with the ref-tracking or plain Fory instance; discard result."""
    (fory_with_ref if ref else fory_without_ref).deserialize(binary)
def pickle_roundtrip(obj):
    """Round-trip *obj* through pickle (dumps then loads); result is discarded."""
    pickle.loads(pickle.dumps(obj))
def pickle_serialize(obj):
    """Serialize *obj* with pickle.dumps; the bytes are discarded."""
    pickle.dumps(obj)
def pickle_deserialize(binary):
    """Deserialize pickled *binary*; the object is discarded."""
    pickle.loads(binary)
def msgpack_roundtrip(obj):
    """Round-trip *obj* through msgpack (dumps then loads); result is discarded."""
    encoded = msgpack.dumps(obj, use_bin_type=True)
    msgpack.loads(encoded, raw=False, strict_map_key=False)
def msgpack_roundtrip_dataclass(obj):
    """Round-trip a dataclass: convert to msgpack-safe form, dump/load, rebuild."""
    compatible = make_msgpack_compatible(obj)
    encoded = msgpack.dumps(compatible, use_bin_type=True)
    decoded = msgpack.loads(encoded, raw=False, strict_map_key=False)
    _restore_dataclass_from_template(decoded, obj)
def msgpack_serialize(obj):
    """Serialize *obj* with msgpack; the bytes are discarded."""
    msgpack.dumps(obj, use_bin_type=True)
def msgpack_serialize_dataclass(obj):
    """Convert a dataclass to msgpack-safe form, then serialize it."""
    msgpack.dumps(make_msgpack_compatible(obj), use_bin_type=True)
def msgpack_deserialize(binary):
    """Deserialize msgpack *binary*; the object is discarded."""
    msgpack.loads(binary, raw=False, strict_map_key=False)
def msgpack_deserialize_dataclass(binary, dataclass_template):
    """Deserialize msgpack *binary*, then rebuild a dataclass from the template."""
    decoded = msgpack.loads(binary, raw=False, strict_map_key=False)
    _restore_dataclass_from_template(decoded, dataclass_template)
def make_msgpack_compatible(obj):
    """Recursively convert *obj* into types msgpack can serialize natively.

    Dates become ISO strings, arrays become plain lists, dataclasses become
    field-name dicts; dicts, lists, and tuples are converted element-wise.
    Anything else is returned unchanged.
    """
    if isinstance(obj, datetime.date):
        return obj.isoformat()
    if isinstance(obj, array.array):
        return list(obj)
    if is_dataclass(obj):
        converted = {}
        for f in fields(obj):
            converted[f.name] = make_msgpack_compatible(getattr(obj, f.name))
        return converted
    if isinstance(obj, dict):
        pairs = ((make_msgpack_compatible(k), make_msgpack_compatible(v)) for k, v in obj.items())
        return dict(pairs)
    if isinstance(obj, list):
        return [make_msgpack_compatible(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(make_msgpack_compatible(item) for item in obj)
    return obj
def _restore_dataclass_from_template(value, template):
    """Rebuild an instance of type(template) from a decoded field dict.

    If *template* is not a dataclass, or *value* is not a dict, *value* is
    returned unchanged. Fields that hold nested dataclasses in the template
    are restored recursively.
    """
    if not (is_dataclass(template) and isinstance(value, dict)):
        return value
    kwargs = {}
    for f in template.__dataclass_fields__.values():
        decoded = value.get(f.name)
        current = getattr(template, f.name, None)
        if is_dataclass(current):
            decoded = _restore_dataclass_from_template(decoded, current)
        kwargs[f.name] = decoded
    return type(template)(**kwargs)
def build_fory_benchmark_case(operation: str, ref: bool, obj):
    """Return a (callable, args) pair for the requested Fory operation.

    For "deserialize" the payload is serialized once up front, so only the
    decode path is timed. Unknown operations fall back to round-trip.
    """
    if operation == "deserialize":
        instance = fory_with_ref if ref else fory_without_ref
        return fory_deserialize, (ref, instance.serialize(obj))
    if operation == "serialize":
        return fory_serialize, (ref, obj)
    return fory_roundtrip, (ref, obj)
def build_pickle_benchmark_case(operation: str, obj):
    """Return a (callable, args) pair for the requested pickle operation.

    "deserialize" pre-pickles the payload so only loads() is timed;
    unknown operations fall back to round-trip.
    """
    if operation == "deserialize":
        return pickle_deserialize, (pickle.dumps(obj),)
    if operation == "serialize":
        return pickle_serialize, (obj,)
    return pickle_roundtrip, (obj,)
def build_msgpack_benchmark_case(operation: str, obj):
    """Return a (callable, args) pair for the requested msgpack operation.

    Dataclass payloads are routed to the dataclass-aware helpers; for
    "deserialize" the payload is encoded once up front so only decoding
    is timed. Unknown operations fall back to round-trip.
    """
    dataclass_payload = is_dataclass(obj)
    if operation == "serialize":
        serialize_func = msgpack_serialize_dataclass if dataclass_payload else msgpack_serialize
        return serialize_func, (obj,)
    if operation == "deserialize":
        if dataclass_payload:
            encoded = msgpack.dumps(make_msgpack_compatible(obj), use_bin_type=True)
            return msgpack_deserialize_dataclass, (encoded, obj)
        return msgpack_deserialize, (msgpack.dumps(obj, use_bin_type=True),)
    roundtrip_func = msgpack_roundtrip_dataclass if dataclass_payload else msgpack_roundtrip
    return roundtrip_func, (obj,)
def benchmark_args():
    """Build the CLI parser and parse command line arguments.

    NOTE(review): --disable-cython is parsed here but not referenced
    elsewhere in this file — presumably consumed before import elsewhere;
    confirm it is not vestigial.
    """
    parser = argparse.ArgumentParser(description="Fory vs Pickle vs Msgpack Benchmark")
    parser.add_argument(
        "--operation",
        default="roundtrip",
        type=str,
        choices=["roundtrip", "serialize", "deserialize"],
        help="Benchmark operation mode: roundtrip, serialize, deserialize (default: roundtrip)",
    )
    # Boolean flags, all defaulting to off.
    flag_options = (
        ("--no-ref", "Disable reference tracking for Fory"),
        ("--disable-cython", "Use pure Python mode for Fory"),
    )
    for flag, help_text in flag_options:
        parser.add_argument(flag, action="store_true", default=False, help=help_text)
    parser.add_argument(
        "--benchmarks",
        default="all",
        type=str,
        help="Comma-separated list of benchmarks to run. Available: dict, large_dict, "
        "dict_group, tuple, large_tuple, large_float_tuple, large_boolean_tuple, "
        "list, large_list, struct, slots_struct. Default: all",
    )
    parser.add_argument(
        "--serializers",
        default="all",
        type=str,
        help="Comma-separated list of serializers to benchmark. Available: fory, pickle, msgpack. Default: all",
    )
    # Integer tuning knobs: (flag, default, help).
    int_options = (
        ("--warmup", 3, "Number of warmup iterations (default: 3)"),
        ("--iterations", 20, "Number of benchmark iterations (default: 20)"),
        ("--repeat", 5, "Number of times to repeat each iteration (default: 5)"),
        (
            "--number",
            1000,
            "Number of times to call function per measurement (inner loop, default: 1000)",
        ),
    )
    for flag, default, help_text in int_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)
    return parser.parse_args()
def run_benchmark(func, *args, warmup=3, iterations=20, repeat=5, number=10000):
    """Run a benchmark and return timing statistics.

    Args:
        func: Function to benchmark.
        *args: Arguments to pass to func.
        warmup: Number of warmup iterations (each calls func `number` times).
        iterations: Number of measurement iterations.
        repeat: Number of times to repeat each measurement.
        number: Number of times to call func per measurement (inner loop).

    Returns:
        Tuple (mean_time_per_call, stdev_time_per_call) in seconds.
    """
    # Warmup so one-time costs (caches, lazy init) are excluded from timing.
    for _ in range(warmup):
        for _ in range(number):
            func(*args)
    # The Timer is loop-invariant; build it once instead of per iteration.
    timer = timeit.Timer(lambda: func(*args))
    # Benchmark - each repeat() measurement runs func 'number' times.
    times = []
    for _ in range(iterations):
        iteration_times = timer.repeat(repeat=repeat, number=number)
        # Convert total time per measurement to time per call.
        times.extend(t / number for t in iteration_times)
    mean = statistics.mean(times)
    stdev = statistics.stdev(times) if len(times) > 1 else 0
    return mean, stdev
def format_time(seconds):
    """Format a duration in seconds using the most readable unit (ns/us/ms/s)."""
    # (upper bound in seconds, multiplier, unit suffix), smallest first.
    scales = ((1e-6, 1e9, "ns"), (1e-3, 1e6, "us"), (1, 1e3, "ms"))
    for upper_bound, factor, unit in scales:
        if seconds < upper_bound:
            return f"{seconds * factor:.2f} {unit}"
    return f"{seconds:.2f} s"
def micro_benchmark():
    """Parse CLI args, run the selected benchmarks, and print a report.

    For every selected (serializer, benchmark) pair: runs the timed case,
    prints per-case timing, then a summary table, then Fory-vs-baseline
    speedup tables for each non-Fory serializer that was run.
    """
    args = benchmark_args()
    # Reference tracking is on by default; --no-ref disables it.
    ref = not args.no_ref
    # Define benchmark data
    benchmark_data = {
        "dict": DICT,
        "large_dict": LARGE_DICT,
        "dict_group": DICT_GROUP,
        "tuple": TUPLE,
        "large_tuple": LARGE_TUPLE,
        "large_float_tuple": LARGE_FLOAT_TUPLE,
        "large_boolean_tuple": LARGE_BOOLEAN_TUPLE,
        "list": LIST,
        "large_list": LARGE_LIST,
        "struct": STRUCT_OBJECT,
        "slots_struct": SLOTS_STRUCT_OBJECT,
    }
    # Determine which benchmarks to run
    if args.benchmarks == "all":
        selected_benchmarks = list(benchmark_data.keys())
    else:
        selected_benchmarks = [b.strip() for b in args.benchmarks.split(",")]
    # Validate benchmark names
    invalid = [b for b in selected_benchmarks if b not in benchmark_data]
    if invalid:
        print(f"Error: Invalid benchmark names: {', '.join(invalid)}")
        print(f"Available benchmarks: {', '.join(benchmark_data.keys())}")
        sys.exit(1)
    # Determine which serializers to run
    available_serializers = {"fory", "pickle", "msgpack"}
    if args.serializers == "all":
        # msgpack is optional; include it only when the import succeeded.
        selected_serializers = ["fory", "pickle"]
        if msgpack is not None:
            selected_serializers.append("msgpack")
    else:
        selected_serializers = [s.strip() for s in args.serializers.split(",")]
    # Validate serializer names
    invalid = [s for s in selected_serializers if s not in available_serializers]
    if invalid:
        print(f"Error: Invalid serializer names: {', '.join(invalid)}")
        print(f"Available serializers: {', '.join(available_serializers)}")
        sys.exit(1)
    if "msgpack" in selected_serializers and msgpack is None:
        print("Error: msgpack is not installed.")
        print("Install it with: pip install msgpack")
        sys.exit(1)
    # Pre-convert non-dataclass payloads once so msgpack timings exclude the
    # compatibility conversion (dataclasses are converted by their helpers).
    msgpack_data = {}
    if "msgpack" in selected_serializers:
        msgpack_data = {
            benchmark_name: (
                data if is_dataclass(data) else make_msgpack_compatible(data)
            )
            for benchmark_name, data in benchmark_data.items()
        }
    print(
        f"\nBenchmarking {len(selected_benchmarks)} benchmark(s) with {len(selected_serializers)} serializer(s)"
    )
    print(f"Operation: {args.operation}")
    print(
        f"Warmup: {args.warmup}, Iterations: {args.iterations}, Repeat: {args.repeat}, Inner loop: {args.number}"
    )
    print(f"Fory reference tracking: {'enabled' if ref else 'disabled'}")
    print("=" * 80)
    # Run selected benchmarks with selected serializers
    results = []
    for benchmark_name in selected_benchmarks:
        data = benchmark_data[benchmark_name]
        # large_dict is expensive per call; shrink the inner loop to keep
        # total runtime reasonable.
        benchmark_number = (
            max(1, args.number // 10) if benchmark_name == "large_dict" else args.number
        )
        if "fory" in selected_serializers:
            print(
                f"\nRunning fory_{benchmark_name}_{args.operation}...",
                end=" ",
                flush=True,
            )
            fory_func, fory_args = build_fory_benchmark_case(args.operation, ref, data)
            mean, stdev = run_benchmark(
                fory_func,
                *fory_args,
                warmup=args.warmup,
                iterations=args.iterations,
                repeat=args.repeat,
                number=benchmark_number,
            )
            results.append(("fory", benchmark_name, mean, stdev))
            print(f"{format_time(mean)} ± {format_time(stdev)}")
        if "pickle" in selected_serializers:
            print(
                f"Running pickle_{benchmark_name}_{args.operation}...",
                end=" ",
                flush=True,
            )
            pickle_func, pickle_args = build_pickle_benchmark_case(args.operation, data)
            mean, stdev = run_benchmark(
                pickle_func,
                *pickle_args,
                warmup=args.warmup,
                iterations=args.iterations,
                repeat=args.repeat,
                number=benchmark_number,
            )
            results.append(("pickle", benchmark_name, mean, stdev))
            print(f"{format_time(mean)} ± {format_time(stdev)}")
        if "msgpack" in selected_serializers:
            print(
                f"Running msgpack_{benchmark_name}_{args.operation}...",
                end=" ",
                flush=True,
            )
            # msgpack uses the pre-converted payloads built above.
            msgpack_func, msgpack_args = build_msgpack_benchmark_case(
                args.operation, msgpack_data[benchmark_name]
            )
            mean, stdev = run_benchmark(
                msgpack_func,
                *msgpack_args,
                warmup=args.warmup,
                iterations=args.iterations,
                repeat=args.repeat,
                number=benchmark_number,
            )
            results.append(("msgpack", benchmark_name, mean, stdev))
            print(f"{format_time(mean)} ± {format_time(stdev)}")
    # Print summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"{'Serializer':<15} {'Benchmark':<25} {'Mean':<20} {'Std Dev':<20}")
    print("-" * 80)
    for serializer, benchmark, mean, stdev in results:
        print(
            f"{serializer:<15} {benchmark:<25} {format_time(mean):<20} {format_time(stdev):<20}"
        )
    # Calculate speedup if fory and at least one baseline serializer were tested.
    if "fory" in selected_serializers:
        for baseline in selected_serializers:
            if baseline == "fory":
                continue
            print("\n" + "=" * 80)
            print(f"SPEEDUP (Fory vs {baseline.capitalize()})")
            print("=" * 80)
            print(
                f"{'Benchmark':<25} {'Fory':<20} {baseline.capitalize():<20} {'Speedup':<20}"
            )
            print("-" * 80)
            for benchmark_name in selected_benchmarks:
                # Pair up the fory and baseline results for this benchmark;
                # skip the row if either run is missing.
                fory_result = next(
                    (r for r in results if r[0] == "fory" and r[1] == benchmark_name),
                    None,
                )
                baseline_result = next(
                    (r for r in results if r[0] == baseline and r[1] == benchmark_name),
                    None,
                )
                if fory_result and baseline_result:
                    fory_mean = fory_result[2]
                    baseline_mean = baseline_result[2]
                    # speedup > 1 means Fory is faster than the baseline.
                    speedup = baseline_mean / fory_mean
                    speedup_str = (
                        f"{speedup:.2f}x"
                        if speedup >= 1
                        else f"{1 / speedup:.2f}x slower"
                    )
                    print(
                        f"{benchmark_name:<25} {format_time(fory_mean):<20} {format_time(baseline_mean):<20} {speedup_str:<20}"
                    )
# Script entry point.
if __name__ == "__main__":
    micro_benchmark()