dev/perf/sql_queries.py - airflow - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 from __future__ import annotations

 import os
 import statistics
 import textwrap
 from time import monotonic, sleep
 from typing import NamedTuple

 import pandas as pd

 from airflow.jobs.job import Job, run_job

 # Setup environment before any Airflow import
 # NOTE(review): `from airflow.jobs.job import Job, run_job` is placed above this
 # block in the file, so an Airflow import already happens before these variables
 # are set - confirm whether that import has to be deferred.
 DAG_FOLDER = os.path.join(os.path.dirname(__file__), "dags")
 os.environ["AIRFLOW__CORE__DAGS_FOLDER"] = DAG_FOLDER
 os.environ["AIRFLOW__DEBUG__SQLALCHEMY_STATS"] = "True"
 os.environ["AIRFLOW__CORE__LOAD_EXAMPLES"] = "False"

 # Here we setup simpler logger to avoid any code changes in
 # Airflow core code base
 LOG_LEVEL = "INFO"
 LOG_FILE = "/files/sql_stats.log"  # Default to run in Breeze

 # Point Airflow at the logging dictionary defined right below.
 # NOTE(review): the dotted path starts with "scripts.perf" - confirm it matches
 # the location this module is actually importable from.
 os.environ["AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS"] = "scripts.perf.sql_queries.DEBUG_LOGGING_CONFIG"

 # Logging configuration dictionary: records are formatted as the bare message
 # text, and both file handlers ("task" and "processor") write to LOG_FILE, the
 # file that make_report() reads back.
 DEBUG_LOGGING_CONFIG = {
     "version": 1,
     "disable_existing_loggers": False,
     "formatters": {"airflow": {"format": "%(message)s"}},
     "handlers": {
         "console": {"class": "logging.StreamHandler"},
         "task": {
             "class": "logging.FileHandler",
             "formatter": "airflow",
             "filename": LOG_FILE,
         },
         "processor": {
             "class": "logging.FileHandler",
             "formatter": "airflow",
             "filename": LOG_FILE,
         },
     },
     "loggers": {
         "airflow.processor": {
             "handlers": ["processor"],
             "level": LOG_LEVEL,
             "propagate": False,
         },
         "airflow.task": {"handlers": ["task"], "level": LOG_LEVEL, "propagate": False},
         # NOTE(review): this entry uses the key "handler" (singular) while the
         # other loggers use "handlers" - looks like a typo; confirm before
         # changing, since this logger also propagates to the root logger.
         "flask_appbuilder": {
             "handler": ["console"],
             "level": LOG_LEVEL,
             "propagate": True,
         },
     },
     "root": {"handlers": ["console", "task"], "level": LOG_LEVEL},
 }


 class Query(NamedTuple):
     """
     A single SQL statement picked up from the log by the performance tests.

     Equality and hashing only consider ``function``, ``file``, ``location`` and
     ``sql``; ``stack`` and ``time`` are ignored, so the same statement issued from
     the same place compares equal regardless of how long it took.
     """

     function: str
     file: str
     location: int
     sql: str
     stack: str
     time: float

     def __str__(self):
         return f"{self.function} in {self.file}:{self.location}: {textwrap.shorten(self.sql, 110)}"

     def __eq__(self, other):
         """
         Compare two queries by function, file, location and sql only.

         Returns ``NotImplemented`` for objects that are not ``Query`` instances so
         Python can fall back to the other operand's comparison.
         """
         if not isinstance(other, Query):
             return NotImplemented
         return (
             self.function == other.function
             and self.sql == other.sql
             and self.location == other.location
             and self.file == other.file
         )

     def __ne__(self, other):
         """
         Inverse of ``__eq__``.

         Defined explicitly because the inherited ``tuple.__ne__`` compares every
         field, which would let ``a == b`` and ``a != b`` both be true.
         """
         result = self.__eq__(other)
         if result is NotImplemented:
             return result
         return not result

     def __hash__(self):
         """
         Hash the same attributes that ``__eq__`` compares, so equal queries
         collapse to one entry in sets and dict keys.
         """
         return hash((self.function, self.file, self.location, self.sql))

     def to_dict(self):
         """
         Return all attributes of the instance as a dictionary keyed by field name.
         """
         return self._asdict()


 def reset_db():
     """
     Reset the Airflow metadata database by delegating to ``airflow.utils.db.resetdb``.
     """
     # Imported lazily so the module-level environment setup runs first.
     from airflow.utils import db as airflow_db

     airflow_db.resetdb()


 def run_scheduler_job(with_db_reset=False) -> None:
     """
     Execute one scheduler job over DAG_FOLDER, limited to three scheduler runs.

     :param with_db_reset: when truthy, the database is reset before the job is created
     """
     from airflow.jobs.scheduler_job_runner import SchedulerJobRunner

     if with_db_reset:
         reset_db()

     scheduler_job = Job()
     runner = SchedulerJobRunner(job=scheduler_job, subdir=DAG_FOLDER, do_pickle=False, num_runs=3)
     run_job(job=runner.job, execute_callable=runner._execute)


 def is_query(line: str) -> bool:
     """
     Tell whether a log line carries a recorded query.

     A line qualifies when it contains both the ``@SQLALCHEMY`` marker and the
     ``|$`` field separator.
     """
     required_markers = ("@SQLALCHEMY", "|$")
     return all(marker in line for marker in required_markers)


 def make_report() -> list[Query]:
     """
     Parse LOG_FILE and return one Query per logged SQL statement.

     Only lines accepted by ``is_query`` are parsed. Once the ``@SQLALCHEMY ``
     marker is removed, a line is expected to hold four ``|$``-separated fields:
     execution time, ``function:file:line`` call-site info, stack and SQL text.

     :return: the parsed queries, in the order they appear in the log
     :raises ValueError: if a matching line does not follow the expected layout
     """
     queries = []
     # The log is only read here; "r+" would additionally require write permission.
     with open(LOG_FILE) as f:
         for line in f:
             if not is_query(line):
                 continue
             # maxsplit=3 keeps any "|$" that occurs inside the SQL text in the last field.
             duration, info, stack, sql = line.replace("@SQLALCHEMY ", "").split("|$", 3)
             # Take the function from the left and the line number from the right so a
             # colon inside the file path cannot break the unpacking.
             func, _, remainder = info.partition(":")
             file, _, loc = remainder.rpartition(":")
             # Keep only the base name of the file path.
             file_name = file.rpartition("/")[-1]
             queries.append(
                 Query(
                     function=func.strip(),
                     file=file_name.strip(),
                     location=int(loc.strip()),
                     sql=sql.strip(),
                     stack=stack.strip(),
                     time=float(duration.strip()),
                 )
             )

     return queries


 def run_test() -> tuple[list[Query], float]:
     """
     Run one scheduler job against a fresh log file and measure it.

     Any existing LOG_FILE is deleted first, so the report only contains queries
     of this run.

     :return: ``(queries, elapsed_seconds)`` where the elapsed time covers only the
         scheduler job, not the log parsing
     """
     stale_log_present = os.path.exists(LOG_FILE)
     if stale_log_present:
         os.remove(LOG_FILE)

     started = monotonic()
     run_scheduler_job(with_db_reset=False)
     elapsed = monotonic() - started
     return make_report(), elapsed


 def rows_to_csv(rows: list[dict], name: str | None = None) -> pd.DataFrame:
     """
     Write the collected rows to a CSV file and return them as a DataFrame.

     :param rows: one dictionary per CSV row
     :param name: destination path; when omitted or empty, a timestamped file
         name under ``/files`` is generated
     :return: the DataFrame built from ``rows``
     """
     # Function-scope import: the module only imports monotonic and sleep from time.
     import time

     df = pd.DataFrame(rows)
     # Stamp the default name with wall-clock time. monotonic() has an undefined
     # reference point, so its values can repeat across boots and would let a
     # later run overwrite an earlier result file.
     name = name or f"/files/sql_stats_{int(time.time())}.csv"
     df.to_csv(name, index=False)
     print(f"Saved result to {name}")
     return df


 def main() -> None:
     """
     Reset the database, run the scheduler test four times and report the results.

     The first run (test_no 0) is executed but not recorded. Queries of the
     remaining runs are written to /files/sql_after_remote.csv and the run times
     are printed together with their mean and standard deviation.
     """
     reset_db()
     collected_rows = []
     durations = []

     for run_index in range(4):
         sleep(5)
         run_queries, elapsed = run_test()
         if not run_index:
             # Results of the very first run are discarded.
             continue
         durations.append(elapsed)
         collected_rows.extend({**query.to_dict(), "test_no": run_index} for query in run_queries)

     rows_to_csv(collected_rows, name="/files/sql_after_remote.csv")
     print(durations)
     template = "Time for %d dag runs: %.4fs"

     if len(durations) <= 1:
         print(template % (len(durations), durations[0]))
     else:
         spread = statistics.stdev(durations)
         print((template + " (±%.3fs)") % (len(durations), statistics.mean(durations), spread))


 # Run the performance test only when this file is executed as a script.
 if __name__ == "__main__":
     main()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	from __future__ import annotations

	import os
	import statistics
	import textwrap
	from time import monotonic, sleep
	from typing import NamedTuple

	import pandas as pd

	from airflow.jobs.job import Job, run_job

	# Setup environment before any Airflow import
	# NOTE(review): `from airflow.jobs.job import Job, run_job` is placed above this
	# block in the file, so an Airflow import already happens before these variables
	# are set - confirm whether that import has to be deferred.
	DAG_FOLDER = os.path.join(os.path.dirname(__file__), "dags")
	os.environ["AIRFLOW__CORE__DAGS_FOLDER"] = DAG_FOLDER
	os.environ["AIRFLOW__DEBUG__SQLALCHEMY_STATS"] = "True"
	os.environ["AIRFLOW__CORE__LOAD_EXAMPLES"] = "False"

	# Here we setup simpler logger to avoid any code changes in
	# Airflow core code base
	LOG_LEVEL = "INFO"
	LOG_FILE = "/files/sql_stats.log" # Default to run in Breeze

	# Point Airflow at the logging dictionary defined right below.
	# NOTE(review): the dotted path starts with "scripts.perf" - confirm it matches
	# the location this module is actually importable from.
	os.environ["AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS"] = "scripts.perf.sql_queries.DEBUG_LOGGING_CONFIG"

	# Logging configuration dictionary: records are formatted as the bare message
	# text, and both file handlers ("task" and "processor") write to LOG_FILE, the
	# file that make_report() reads back.
	DEBUG_LOGGING_CONFIG = {
	"version": 1,
	"disable_existing_loggers": False,
	"formatters": {"airflow": {"format": "%(message)s"}},
	"handlers": {
	"console": {"class": "logging.StreamHandler"},
	"task": {
	"class": "logging.FileHandler",
	"formatter": "airflow",
	"filename": LOG_FILE,
	},
	"processor": {
	"class": "logging.FileHandler",
	"formatter": "airflow",
	"filename": LOG_FILE,
	},
	},
	"loggers": {
	"airflow.processor": {
	"handlers": ["processor"],
	"level": LOG_LEVEL,
	"propagate": False,
	},
	"airflow.task": {"handlers": ["task"], "level": LOG_LEVEL, "propagate": False},
	# NOTE(review): this entry uses the key "handler" (singular) while the
	# other loggers use "handlers" - looks like a typo; confirm before
	# changing, since this logger also propagates to the root logger.
	"flask_appbuilder": {
	"handler": ["console"],
	"level": LOG_LEVEL,
	"propagate": True,
	},
	},
	"root": {"handlers": ["console", "task"], "level": LOG_LEVEL},
	}


	class Query(NamedTuple):
	    """
	    A single SQL statement picked up from the log by the performance tests.

	    Equality and hashing only consider ``function``, ``file``, ``location`` and
	    ``sql``; ``stack`` and ``time`` are ignored, so the same statement issued from
	    the same place compares equal regardless of how long it took.
	    """

	    function: str
	    file: str
	    location: int
	    sql: str
	    stack: str
	    time: float

	    def __str__(self):
	        return f"{self.function} in {self.file}:{self.location}: {textwrap.shorten(self.sql, 110)}"

	    def __eq__(self, other):
	        """
	        Compare two queries by function, file, location and sql only.

	        Returns ``NotImplemented`` for objects that are not ``Query`` instances so
	        Python can fall back to the other operand's comparison.
	        """
	        if not isinstance(other, Query):
	            return NotImplemented
	        return (
	            self.function == other.function
	            and self.sql == other.sql
	            and self.location == other.location
	            and self.file == other.file
	        )

	    def __ne__(self, other):
	        """
	        Inverse of ``__eq__``.

	        Defined explicitly because the inherited ``tuple.__ne__`` compares every
	        field, which would let ``a == b`` and ``a != b`` both be true.
	        """
	        result = self.__eq__(other)
	        if result is NotImplemented:
	            return result
	        return not result

	    def __hash__(self):
	        """
	        Hash the same attributes that ``__eq__`` compares, so equal queries
	        collapse to one entry in sets and dict keys.
	        """
	        return hash((self.function, self.file, self.location, self.sql))

	    def to_dict(self):
	        """
	        Return all attributes of the instance as a dictionary keyed by field name.
	        """
	        return self._asdict()


	def reset_db():
	    """
	    Wrapper function that calls the airflow resetdb function.
	    """
	    # Imported lazily so the module-level environment setup runs first.
	    from airflow.utils.db import resetdb

	    resetdb()


	def run_scheduler_job(with_db_reset=False) -> None:
	    """
	    Run the scheduler job over DAG_FOLDER, limited to three scheduler runs.

	    :param with_db_reset: when truthy, the database is reset before the job is created
	    """
	    from airflow.jobs.scheduler_job_runner import SchedulerJobRunner

	    if with_db_reset:
	        reset_db()
	    job_runner = SchedulerJobRunner(job=Job(), subdir=DAG_FOLDER, do_pickle=False, num_runs=3)
	    run_job(job=job_runner.job, execute_callable=job_runner._execute)


	def is_query(line: str) -> bool:
	    """
	    Return True, if provided line embeds a query, else False.

	    A line qualifies when it contains both the ``@SQLALCHEMY`` marker and the
	    ``|$`` field separator.
	    """
	    # The separator is the two characters "|$"; a backslash-escaped variant
	    # would never match the logged lines.
	    return "@SQLALCHEMY" in line and "|$" in line


	def make_report() -> list[Query]:
	    """
	    Parse LOG_FILE and return one Query per logged SQL statement.

	    Only lines accepted by ``is_query`` are parsed. Once the ``@SQLALCHEMY ``
	    marker is removed, a line is expected to hold four ``|$``-separated fields:
	    execution time, ``function:file:line`` call-site info, stack and SQL text.

	    :return: the parsed queries, in the order they appear in the log
	    :raises ValueError: if a matching line does not follow the expected layout
	    """
	    queries = []
	    # The log is only read here; "r+" would additionally require write permission.
	    with open(LOG_FILE) as f:
	        for line in f:
	            if not is_query(line):
	                continue
	            # maxsplit=3 keeps any "|$" that occurs inside the SQL text in the last field.
	            duration, info, stack, sql = line.replace("@SQLALCHEMY ", "").split("|$", 3)
	            # Take the function from the left and the line number from the right so a
	            # colon inside the file path cannot break the unpacking.
	            func, _, remainder = info.partition(":")
	            file, _, loc = remainder.rpartition(":")
	            # Keep only the base name of the file path.
	            file_name = file.rpartition("/")[-1]
	            queries.append(
	                Query(
	                    function=func.strip(),
	                    file=file_name.strip(),
	                    location=int(loc.strip()),
	                    sql=sql.strip(),
	                    stack=stack.strip(),
	                    time=float(duration.strip()),
	                )
	            )

	    return queries


	def run_test() -> tuple[list[Query], float]:
	    """
	    Run one scheduler job against a fresh log file and measure it.

	    Any existing LOG_FILE is deleted first, so the report only contains queries
	    of this run.

	    :return: ``(queries, elapsed_seconds)`` where the elapsed time covers only the
	        scheduler job, not the log parsing
	    """
	    if os.path.exists(LOG_FILE):
	        os.remove(LOG_FILE)

	    tic = monotonic()
	    run_scheduler_job(with_db_reset=False)
	    toc = monotonic()
	    queries = make_report()
	    return queries, toc - tic


	def rows_to_csv(rows: list[dict], name: str | None = None) -> pd.DataFrame:
	    """
	    Write the collected rows to a CSV file and return them as a DataFrame.

	    :param rows: one dictionary per CSV row
	    :param name: destination path; when omitted or empty, a timestamped file
	        name under ``/files`` is generated
	    :return: the DataFrame built from ``rows``
	    """
	    # Function-scope import: the module only imports monotonic and sleep from time.
	    import time

	    df = pd.DataFrame(rows)
	    # Stamp the default name with wall-clock time. monotonic() has an undefined
	    # reference point, so its values can repeat across boots and would let a
	    # later run overwrite an earlier result file.
	    name = name or f"/files/sql_stats_{int(time.time())}.csv"
	    df.to_csv(name, index=False)
	    print(f"Saved result to {name}")
	    return df


	def main() -> None:
	    """
	    Reset the database, run the scheduler test four times and report the results.

	    The first run (test_no 0) is executed but not recorded. Queries of the
	    remaining runs are written to /files/sql_after_remote.csv and the run times
	    are printed together with their mean and standard deviation.
	    """
	    reset_db()
	    rows = []
	    times = []

	    for test_no in range(4):
	        sleep(5)
	        queries, exec_time = run_test()
	        # Results of the very first run (test_no 0) are discarded.
	        if test_no:
	            times.append(exec_time)
	            for qry in queries:
	                info = qry.to_dict()
	                info["test_no"] = test_no
	                rows.append(info)

	    rows_to_csv(rows, name="/files/sql_after_remote.csv")
	    print(times)
	    msg = "Time for %d dag runs: %.4fs"

	    if len(times) > 1:
	        print((msg + " (±%.3fs)") % (len(times), statistics.mean(times), statistics.stdev(times)))
	    else:
	        print(msg % (len(times), times[0]))


	# Run the performance test only when this file is executed as a script.
	if __name__ == "__main__":
	    main()