#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

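"""
Structured Logging style checker: flags Scala logging statements whose messages
are built with string interpolation, concatenation, or .format() instead of the
Structured Logging Framework (log"..." with MDC keys).
"""
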
import glob
import os
import re
import sys


def main():
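    """Scan Scala sources and report log calls not using structured logging."""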
    # Matches a complete logInfo/logWarning/logError(...) call; applied with
    # re.DOTALL, the non-greedy body ends at the first ")" before a newline.
    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
    # Matches a message argument built with .format(), Scala string
    # interpolation, or "+" concatenation.
    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
    compiled_inner_log_pattern = re.compile(inner_log_pattern)
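    # Examples of (whitespace-stripped) arguments the pattern is meant to flag:
    #   "count:{}".format(count)   -- .format() interpolation
    #   s"count:$count"            -- Scala s-interpolation
    #   "count:"+count             -- string concatenation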

    # Regex patterns for file paths to exclude from the Structured Logging style check
    excluded_file_patterns = [
        "[Tt]est",
        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen"
        "/CodeGenerator.scala",
        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver"
        "/SparkSQLCLIService.scala",
        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
    ]

    # Maps each scanned file path to a list of (line, column) positions of
    # non-migrated log statements.
    nonmigrated_files = {}

    # Recursively collect Scala sources under this script's parent directory.
    scala_files = glob.glob(
        os.path.join(os.path.dirname(__file__), "..", "**", "*.scala"), recursive=True
    )

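    # Scan each non-excluded Scala file and record every log call whose message
    # is built by interpolation, concatenation, or .format().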
    for file in scala_files:
        skip_file = False
        for exclude_pattern in excluded_file_patterns:
            if re.search(exclude_pattern, file):
                skip_file = True
                break

        if not skip_file and not os.path.isdir(file):
            with open(file, "r") as f:
                content = f.read()

            # Materialize the matches; a raw finditer iterator is always truthy,
            # which would defeat the emptiness check below.
            log_statements = list(re.finditer(log_pattern, content, re.DOTALL))

            if log_statements:
                nonmigrated_files[file] = []
                for log_statement in log_statements:
                    log_statement_str = log_statement.group(0).strip()
                    # Keep only the argument text between the first "(" and the
                    # trailing ")", with all whitespace removed.
                    first_paren_index = log_statement_str.find("(")
                    inner_log_statement = re.sub(
                        r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
                    )

                    if compiled_inner_log_pattern.fullmatch(inner_log_statement):
                        # Convert the match offset into a 1-based line number and
                        # a 0-based column for the report.
                        start_pos = log_statement.start()
                        preceding_content = content[:start_pos]
                        line_number = preceding_content.count("\n") + 1
                        start_char = start_pos - preceding_content.rfind("\n") - 1
                        nonmigrated_files[file].append((line_number, start_char))

    if all(len(issues) == 0 for issues in nonmigrated_files.values()):
        print("Structured logging style check passed.", file=sys.stderr)
        sys.exit(0)
    else:
        for file_path, issues in nonmigrated_files.items():
            for line_number, start_char in issues:
                print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
                print(
                    "[error]\tPlease use the Structured Logging Framework for logging messages "
                    'with variables. For example: log"...${MDC(TASK_ID, taskId)}..."'
                    "\n\tRefer to the guidelines in the file `internal/Logging.scala`.",
                    file=sys.stderr,
                )

        sys.exit(-1)


if __name__ == "__main__":
    main()