benchmarks/lineprotocol.py - datafusion - Git at Google

 #!/usr/bin/env python
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.


 """
 Converts a given json to LineProtocol format that can be
 visualised by grafana/other systems that support LineProtocol.

 Usage example:
 $ python3 lineprotocol.py sort.json
 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=0,row_count=10838832,elapsed_ms=85626006 1691105678000000000
 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000
 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000
 benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000
 """

 # sort.json
 """
 {
   "queries": [
     {
       "iterations": [
         {
           "elapsed": 85626.006132,
           "row_count": 10838832
         },
         {
           "elapsed": 68694.467851,
           "row_count": 10838832
         },
         {
           "elapsed": 63392.883406,
           "row_count": 10838832
         },
         {
           "elapsed": 66388.367387,
           "row_count": 10838832
         }
       ],
       "query": "sort utf8",
       "start_time": 1691105678
     }
   ],
   "context": {
     "arguments": [
       "sort",
       "--path",
       "benchmarks/data",
       "--scale-factor",
       "1.0",
       "--iterations",
       "4",
       "-o",
       "sort.json"
     ],
     "benchmark_version": "28.0.0",
     "datafusion_version": "28.0.0",
     "num_cpus": 8,
     "start_time": 1691105678
   }
 }
 """

 from __future__ import annotations

 import json
 from dataclasses import dataclass
 from typing import Dict, List, Any
 from pathlib import Path
 from argparse import ArgumentParser
 import sys
 # Rebind the module-level name ``print`` to ``sys.stdout.write``: the text is
 # written exactly as given and no newline is appended, so callers in this
 # file must terminate each line themselves.
 print = sys.stdout.write


 @dataclass
 class QueryResult:
     """Measurements recorded for a single iteration of a query."""

     # Value stored under the "elapsed" key of the iteration record.
     elapsed: float
     # Value stored under the "row_count" key of the iteration record.
     row_count: int

     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> QueryResult:
         """Build a result from one decoded iteration record.

         Only the "elapsed" and "row_count" keys are read; any other key in
         ``data`` is ignored. A missing key raises ``KeyError``.
         """
         elapsed_value = data["elapsed"]
         rows = data["row_count"]
         return cls(elapsed_value, rows)


 @dataclass
 class QueryRun:
     """Every recorded iteration of one benchmark query."""

     # Value of the "query" key.
     # NOTE(review): annotated as int, but the sample input documented in this
     # file uses a string ("sort utf8") -- confirm the intended type.
     query: int
     iterations: List[QueryResult]
     # Value of the "start_time" key of the query record.
     start_time: int

     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> QueryRun:
         """Build a run from one decoded entry of the "queries" list.

         Reads the "query", "iterations" and "start_time" keys; a missing key
         raises ``KeyError``.
         """
         return cls(
             query=data["query"],
             # Use the dedicated constructor so that only the keys it knows
             # about are read; unpacking the record with ** would raise
             # TypeError as soon as an iteration carries an additional key.
             iterations=[
                 QueryResult.load_from(iteration)
                 for iteration in data["iterations"]
             ],
             start_time=data["start_time"],
         )

     @property
     def execution_time(self) -> float:
         """Return the smallest ``elapsed`` value among the iterations."""
         assert len(self.iterations) >= 1, "query run has no iterations"

         # Use minimum execution time to account for variations / other
         # things the system was doing
         return min(iteration.elapsed for iteration in self.iterations)


 @dataclass
 class Context:
     """Run-level metadata read from the "context" object of the input."""

     benchmark_version: str
     datafusion_version: str
     num_cpus: int
     start_time: int
     arguments: List[str]
     # First element of ``arguments``.
     name: str

     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> Context:
         """Build a context from the decoded "context" object.

         The five stored keys are copied verbatim and ``name`` is derived from
         the first entry of "arguments". A missing key raises ``KeyError``; an
         empty "arguments" list raises ``IndexError``.
         """
         copied_keys = (
             "benchmark_version",
             "datafusion_version",
             "num_cpus",
             "start_time",
             "arguments",
         )
         values = {key: data[key] for key in copied_keys}
         values["name"] = values["arguments"][0]
         return cls(**values)


 @dataclass
 class BenchmarkRun:
     """A whole benchmark output: its context plus every query run."""

     context: Context
     queries: List[QueryRun]

     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
         """Build a run from the decoded top-level JSON object.

         Reads the "context" object and the "queries" list; a missing key
         raises ``KeyError``.
         """
         return cls(
             context=Context.load_from(data["context"]),
             queries=[QueryRun.load_from(result) for result in data["queries"]],
         )

     @classmethod
     def load_from_file(cls, path: Path) -> BenchmarkRun:
         """Parse the JSON file at ``path`` and build a run from it."""
         # Decode explicitly as UTF-8 instead of relying on the platform's
         # locale encoding, which is not UTF-8 everywhere.
         with open(path, "r", encoding="utf-8") as f:
             return cls.load_from(json.load(f))


 def lineformat(
     baseline: Path,
 ) -> None:
     """Write the benchmark file at ``baseline`` to stdout as LineProtocol.

     One line is written per iteration of every query, shaped as
     ``benchmark,<tags> <fields> <timestamp>`` and terminated by a newline.

     Args:
         baseline: Path of the benchmark JSON file to convert.
     """
     run = BenchmarkRun.load_from_file(baseline)
     context = run.context
     benchmark_str = (
         f"benchmark,name={context.name},version={context.benchmark_version},"
         f"datafusion_version={context.datafusion_version},"
         f"num_cpus={context.num_cpus}"
     )
     for query in run.queries:
         # A LineProtocol string field is delimited by double quotes, so a
         # backslash or double quote inside the value must be backslash-escaped
         # (backslashes first, so the escapes added for quotes stay intact).
         escaped_query = str(query.query).replace("\\", "\\\\").replace('"', '\\"')
         query_str = f'query="{escaped_query}"'
         # start_time is scaled by 10**9 for the line timestamp (presumably
         # epoch seconds to nanoseconds -- confirm against the producer).
         timestamp = f"{query.start_time * 10**9}"
         for iter_num, result in enumerate(query.iterations):
             # Write through sys.stdout directly so the output does not depend
             # on what the name ``print`` is bound to.
             sys.stdout.write(
                 f"{benchmark_str} {query_str},iteration={iter_num},"
                 f"row_count={result.row_count},"
                 f"elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n"
             )

 def main() -> None:
     """Parse the command line and print the given benchmark file as LineProtocol."""
     parser = ArgumentParser()
     parser.add_argument(
         "path",
         type=Path,
         help="Path to the benchmark file.",
     )
     options = parser.parse_args()

     # The positional argument is registered as "path", so that is the
     # attribute argparse sets on the namespace; "baseline_path" never exists
     # and reading it raised AttributeError on every run.
     lineformat(options.path)


 # Run the converter only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
     main()
	#!/usr/bin/env python
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.


	"""
	Converts a given json to LineProtocol format that can be
	visualised by grafana/other systems that support LineProtocol.

	Usage example:
	$ python3 lineprotocol.py sort.json
	benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=0,row_count=10838832,elapsed_ms=85626006 1691105678000000000
	benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=1,row_count=10838832,elapsed_ms=68694468 1691105678000000000
	benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=2,row_count=10838832,elapsed_ms=63392883 1691105678000000000
	benchmark,name=sort,version=28.0.0,datafusion_version=28.0.0,num_cpus=8 query="sort utf8",iteration=3,row_count=10838832,elapsed_ms=66388367 1691105678000000000
	"""

	# sort.json
	"""
	{
	"queries": [
	{
	"iterations": [
	{
	"elapsed": 85626.006132,
	"row_count": 10838832
	},
	{
	"elapsed": 68694.467851,
	"row_count": 10838832
	},
	{
	"elapsed": 63392.883406,
	"row_count": 10838832
	},
	{
	"elapsed": 66388.367387,
	"row_count": 10838832
	}
	],
	"query": "sort utf8",
	"start_time": 1691105678
	}
	],
	"context": {
	"arguments": [
	"sort",
	"--path",
	"benchmarks/data",
	"--scale-factor",
	"1.0",
	"--iterations",
	"4",
	"-o",
	"sort.json"
	],
	"benchmark_version": "28.0.0",
	"datafusion_version": "28.0.0",
	"num_cpus": 8,
	"start_time": 1691105678
	}
	}
	"""

	from __future__ import annotations

	import json
	from dataclasses import dataclass
	from typing import Dict, List, Any
	from pathlib import Path
	from argparse import ArgumentParser
	import sys
	# Rebind the module-level name ``print`` to ``sys.stdout.write``: the text is
	# written exactly as given and no newline is appended, so callers in this
	# file must terminate each line themselves.
	print = sys.stdout.write


	@dataclass
	class QueryResult:
	    """Measurements recorded for a single iteration of a query."""

	    # Value stored under the "elapsed" key of the iteration record.
	    elapsed: float
	    # Value stored under the "row_count" key of the iteration record.
	    row_count: int

	    @classmethod
	    def load_from(cls, data: Dict[str, Any]) -> QueryResult:
	        """Build a result from one decoded iteration record.

	        Only the "elapsed" and "row_count" keys are read; any other key in
	        ``data`` is ignored. A missing key raises ``KeyError``.
	        """
	        return cls(elapsed=data["elapsed"], row_count=data["row_count"])


	@dataclass
	class QueryRun:
	    """Every recorded iteration of one benchmark query."""

	    # Value of the "query" key.
	    # NOTE(review): annotated as int, but the sample input documented in this
	    # file uses a string ("sort utf8") -- confirm the intended type.
	    query: int
	    iterations: List[QueryResult]
	    # Value of the "start_time" key of the query record.
	    start_time: int

	    @classmethod
	    def load_from(cls, data: Dict[str, Any]) -> QueryRun:
	        """Build a run from one decoded entry of the "queries" list.

	        Reads the "query", "iterations" and "start_time" keys; a missing key
	        raises ``KeyError``.
	        """
	        return cls(
	            query=data["query"],
	            # Use the dedicated constructor so that only the keys it knows
	            # about are read; unpacking the record with ** would raise
	            # TypeError as soon as an iteration carries an additional key.
	            iterations=[
	                QueryResult.load_from(iteration)
	                for iteration in data["iterations"]
	            ],
	            start_time=data["start_time"],
	        )

	    @property
	    def execution_time(self) -> float:
	        """Return the smallest ``elapsed`` value among the iterations."""
	        assert len(self.iterations) >= 1, "query run has no iterations"

	        # Use minimum execution time to account for variations / other
	        # things the system was doing
	        return min(iteration.elapsed for iteration in self.iterations)


	@dataclass
	class Context:
	    """Run-level metadata read from the "context" object of the input."""

	    benchmark_version: str
	    datafusion_version: str
	    num_cpus: int
	    start_time: int
	    arguments: List[str]
	    # First element of ``arguments``.
	    name: str

	    @classmethod
	    def load_from(cls, data: Dict[str, Any]) -> Context:
	        """Build a context from the decoded "context" object.

	        The five stored keys are copied verbatim and ``name`` is taken from
	        the first entry of "arguments". A missing key raises ``KeyError``; an
	        empty "arguments" list raises ``IndexError``.
	        """
	        return cls(
	            benchmark_version=data["benchmark_version"],
	            datafusion_version=data["datafusion_version"],
	            num_cpus=data["num_cpus"],
	            start_time=data["start_time"],
	            arguments=data["arguments"],
	            name=data["arguments"][0],
	        )


	@dataclass
	class BenchmarkRun:
	    """A whole benchmark output: its context plus every query run."""

	    context: Context
	    queries: List[QueryRun]

	    @classmethod
	    def load_from(cls, data: Dict[str, Any]) -> BenchmarkRun:
	        """Build a run from the decoded top-level JSON object.

	        Reads the "context" object and the "queries" list; a missing key
	        raises ``KeyError``.
	        """
	        return cls(
	            context=Context.load_from(data["context"]),
	            queries=[QueryRun.load_from(result) for result in data["queries"]],
	        )

	    @classmethod
	    def load_from_file(cls, path: Path) -> BenchmarkRun:
	        """Parse the JSON file at ``path`` and build a run from it."""
	        # Decode explicitly as UTF-8 instead of relying on the platform's
	        # locale encoding, which is not UTF-8 everywhere.
	        with open(path, "r", encoding="utf-8") as f:
	            return cls.load_from(json.load(f))


	def lineformat(
	    baseline: Path,
	) -> None:
	    """Write the benchmark file at ``baseline`` to stdout as LineProtocol.

	    One line is written per iteration of every query, shaped as
	    ``benchmark,<tags> <fields> <timestamp>`` and terminated by a newline.

	    Args:
	        baseline: Path of the benchmark JSON file to convert.
	    """
	    run = BenchmarkRun.load_from_file(baseline)
	    context = run.context
	    benchmark_str = (
	        f"benchmark,name={context.name},version={context.benchmark_version},"
	        f"datafusion_version={context.datafusion_version},"
	        f"num_cpus={context.num_cpus}"
	    )
	    for query in run.queries:
	        # A LineProtocol string field is delimited by double quotes, so a
	        # backslash or double quote inside the value must be backslash-escaped
	        # (backslashes first, so the escapes added for quotes stay intact).
	        escaped_query = str(query.query).replace("\\", "\\\\").replace('"', '\\"')
	        query_str = f'query="{escaped_query}"'
	        # The scale factor is 10**9; the operators had been dropped from this
	        # expression, leaving a lookup of a non-existent attribute. The
	        # documented sample output (1691105678 -> 1691105678000000000)
	        # matches a multiplication by 10**9.
	        timestamp = f"{query.start_time * 10**9}"
	        for iter_num, result in enumerate(query.iterations):
	            # Write through sys.stdout directly so the output does not depend
	            # on what the name ``print`` is bound to.
	            sys.stdout.write(
	                f"{benchmark_str} {query_str},iteration={iter_num},"
	                f"row_count={result.row_count},"
	                f"elapsed_ms={result.elapsed*1000:.0f} {timestamp}\n"
	            )

	def main() -> None:
	    """Parse the command line and print the given benchmark file as LineProtocol."""
	    parser = ArgumentParser()
	    parser.add_argument(
	        "path",
	        type=Path,
	        help="Path to the benchmark file.",
	    )
	    options = parser.parse_args()

	    # The positional argument is registered as "path", so that is the
	    # attribute argparse sets on the namespace; "baseline_path" never exists
	    # and reading it raised AttributeError on every run.
	    lineformat(options.path)



	# Run the converter only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()