.github/check_chinese_character.py - bigtop-manager - Git at Google

 #!/usr/bin/env python
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #    https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #

 import os
 import re
 from pathlib import Path
 from typing import List, Set

 class chinese_character_check_test:
     """Scan source files for Chinese characters and fail when any are found.

     Files are searched recursively below ``Path("..")`` (the parent of the
     current working directory). Only files whose name ends with one of
     ``SUPPORTED_EXTENSIONS`` are read, and any path that has a component
     listed in ``EXCLUDED_DIRS_AND_FILES`` is skipped.
     """

     # Matches a single character in the range U+4E00..U+9FA5.
     CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fa5]')
     # Exclude directories or files. Entries are bare names (not paths): use
     # the file name for a file and the directory name for a directory.
     EXCLUDED_DIRS_AND_FILES = {
         "target",
         "node_modules",
         "dist",
     }
     # Supported file extensions
     SUPPORTED_EXTENSIONS = {
         ".java",
         ".kt",
         ".scala",
         ".js",
         ".jsx",
         ".ts",
         ".tsx",
         ".vue"
     }

     def should_not_contain_chinese_in_comments(self):
         """Scan comments only and raise ``AssertionError`` on any violation."""
         violations = self.scan_for_chinese_characters(scan_target.COMMENTS)
         self.assert_no_chinese_characters(violations)

     def scan_for_chinese_characters(self, target: 'scan_target') -> List[str]:
         """Collect violation messages for every eligible file below ``..``.

         Args:
             target: object whose ``include_comments()`` / ``include_code()``
                 decide which checks are run on each file.

         Returns:
             One formatted message per offending comment or string literal.
         """
         violations = []
         # Walk the tree once (instead of once per extension) and sort the
         # candidates so the report order is the same on every run.
         candidates = sorted(
             path
             for path in Path("..").rglob("*")
             if self.is_valid_file(path)
             and not self.is_excluded(path)
             and path.is_file()
         )
         for path in candidates:
             self.process_file(path, target, violations)
         return violations

     def is_excluded(self, path: Path) -> bool:
         """Return True when any component of *path* is an excluded name.

         Whole components are compared, so ``target/`` is excluded while
         ``targetHelper.ts`` or ``distribution/`` are not.
         """
         return any(
             part in self.EXCLUDED_DIRS_AND_FILES for part in Path(path).parts
         )

     def is_valid_file(self, path: Path) -> bool:
         """Return True when *path* ends with one of the supported extensions."""
         return str(path).endswith(tuple(self.SUPPORTED_EXTENSIONS))

     def process_file(self, path: Path, target: 'scan_target', violations: List[str]):
         """Read *path* as UTF-8 and run the checks selected by *target*.

         Files that cannot be opened or decoded are reported on stdout and
         skipped; violations found are appended to *violations*.
         """
         # Only the read is guarded, so that a bug in the checks themselves is
         # not silently reported as a file error.
         try:
             with open(path, 'r', encoding='utf-8') as file:
                 content = file.read()
         except (OSError, UnicodeError) as e:
             print(f"Error processing file: {path}")
             print(e)
             return
         if target.include_comments():
             self.check_comments(content, path, violations)
         if target.include_code():
             self.check_code(content, path, violations)

     def check_comments(self, content: str, path: Path, violations: List[str]):
         """Append a violation for every comment in *content* with Chinese text."""
         # Matching multiple types of comments
         comment_patterns = [
             r'//.*?$',  # Single line comments
             r'/\*.*?\*/',  # Multi line comments
             r'<!--.*?-->'  # Vue/HTML,/javascript/typescript comments
         ]
         # DOTALL lets block comments span lines; MULTILINE makes ``$`` stop a
         # ``//`` comment at the end of its own line.
         flags = re.DOTALL | re.MULTILINE
         for pattern in comment_patterns:
             for comment in re.findall(pattern, content, flags):
                 if self.CHINESE_CHAR_PATTERN.search(comment):
                     violations.append(self.format_violation(path, "comment", comment.strip()))

     def check_code(self, content: str, path: Path, violations: List[str]):
         """Append a violation for every quoted string in *content* with Chinese text."""
         # Matching string literals in multiple languages
         string_patterns = [
             r'"[^"]*"',  # Double quoted strings
             r"'[^']*'"   # Single quoted strings
         ]
         for pattern in string_patterns:
             for string_literal in re.findall(pattern, content):
                 if self.CHINESE_CHAR_PATTERN.search(string_literal):
                     violations.append(self.format_violation(path, "code", string_literal))

     def format_violation(self, path: Path, location: str, content: str) -> str:
         """Build the message for one finding, using the absolute form of *path*."""
         return f"Chinese characters found in {location} at {path.absolute()}: {content}"

     def assert_no_chinese_characters(self, violations: List[str]):
         """Raise ``AssertionError`` listing *violations* unless it is empty."""
         # Raised explicitly rather than via ``assert`` so the check still
         # fails when Python runs with -O, which strips assert statements.
         if len(violations) != 0:
             raise AssertionError(
                 f"Found Chinese characters in files:\n{os.linesep.join(violations)}"
             )

 class scan_target:
     """Pair of switches telling the scanner which checks to run on a file."""

     def __init__(self, check_comments: bool, check_code: bool):
         # Both switches are stored unchanged under the parameter names.
         self.check_comments, self.check_code = check_comments, check_code

     def include_code(self) -> bool:
         """Return the value passed as ``check_code``."""
         return self.check_code

     def include_comments(self) -> bool:
         """Return the value passed as ``check_comments``."""
         return self.check_comments

 # Predefined scan modes, stored as attributes on the class itself so that
 # callers can refer to them as scan_target.COMMENTS / .CODE / .ALL.
 scan_target.COMMENTS = scan_target(check_comments=True, check_code=False)
 scan_target.CODE = scan_target(check_comments=False, check_code=True)
 scan_target.ALL = scan_target(check_comments=True, check_code=True)

 # When run as a script, only the comment check is executed.
 if __name__ == "__main__":
     test = chinese_character_check_test()
     test.should_not_contain_chinese_in_comments()
	#!/usr/bin/env python
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#

	import os
	import re
	from pathlib import Path
	from typing import List, Set

	class chinese_character_check_test:
	    """Scan source files for Chinese characters and fail when any are found.

	    Files are searched recursively below ``Path("..")`` (the parent of the
	    current working directory). Only files whose name ends with one of
	    ``SUPPORTED_EXTENSIONS`` are read, and any path that has a component
	    listed in ``EXCLUDED_DIRS_AND_FILES`` is skipped.
	    """

	    # Matches a single character in the range U+4E00..U+9FA5.
	    CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fa5]')
	    # Exclude directories or files. Entries are bare names (not paths): use
	    # the file name for a file and the directory name for a directory.
	    EXCLUDED_DIRS_AND_FILES = {
	        "target",
	        "node_modules",
	        "dist",
	    }
	    # Supported file extensions
	    SUPPORTED_EXTENSIONS = {
	        ".java",
	        ".kt",
	        ".scala",
	        ".js",
	        ".jsx",
	        ".ts",
	        ".tsx",
	        ".vue"
	    }

	    def should_not_contain_chinese_in_comments(self):
	        """Scan comments only and raise ``AssertionError`` on any violation."""
	        violations = self.scan_for_chinese_characters(scan_target.COMMENTS)
	        self.assert_no_chinese_characters(violations)

	    def scan_for_chinese_characters(self, target: 'scan_target') -> List[str]:
	        """Collect violation messages for every eligible file below ``..``.

	        Args:
	            target: object whose ``include_comments()`` / ``include_code()``
	                decide which checks are run on each file.

	        Returns:
	            One formatted message per offending comment or string literal.
	        """
	        violations = []
	        # Walk the tree once (instead of once per extension) and sort the
	        # candidates so the report order is the same on every run.
	        candidates = sorted(
	            path
	            for path in Path("..").rglob("*")
	            if self.is_valid_file(path)
	            and not self.is_excluded(path)
	            and path.is_file()
	        )
	        for path in candidates:
	            self.process_file(path, target, violations)
	        return violations

	    def is_excluded(self, path: Path) -> bool:
	        """Return True when any component of *path* is an excluded name.

	        Whole components are compared, so ``target/`` is excluded while
	        ``targetHelper.ts`` or ``distribution/`` are not.
	        """
	        return any(
	            part in self.EXCLUDED_DIRS_AND_FILES for part in Path(path).parts
	        )

	    def is_valid_file(self, path: Path) -> bool:
	        """Return True when *path* ends with one of the supported extensions."""
	        return str(path).endswith(tuple(self.SUPPORTED_EXTENSIONS))

	    def process_file(self, path: Path, target: 'scan_target', violations: List[str]):
	        """Read *path* as UTF-8 and run the checks selected by *target*.

	        Files that cannot be opened or decoded are reported on stdout and
	        skipped; violations found are appended to *violations*.
	        """
	        # Only the read is guarded, so that a bug in the checks themselves is
	        # not silently reported as a file error.
	        try:
	            with open(path, 'r', encoding='utf-8') as file:
	                content = file.read()
	        except (OSError, UnicodeError) as e:
	            print(f"Error processing file: {path}")
	            print(e)
	            return
	        if target.include_comments():
	            self.check_comments(content, path, violations)
	        if target.include_code():
	            self.check_code(content, path, violations)

	    def check_comments(self, content: str, path: Path, violations: List[str]):
	        """Append a violation for every comment in *content* with Chinese text."""
	        # Matching multiple types of comments
	        comment_patterns = [
	            r'//.*?$',  # Single line comments
	            r'/\*.*?\*/',  # Multi line comments
	            r'<!--.*?-->'  # Vue/HTML,/javascript/typescript comments
	        ]
	        # DOTALL lets block comments span lines; MULTILINE makes ``$`` stop a
	        # ``//`` comment at the end of its own line.
	        flags = re.DOTALL | re.MULTILINE
	        for pattern in comment_patterns:
	            for comment in re.findall(pattern, content, flags):
	                if self.CHINESE_CHAR_PATTERN.search(comment):
	                    violations.append(self.format_violation(path, "comment", comment.strip()))

	    def check_code(self, content: str, path: Path, violations: List[str]):
	        """Append a violation for every quoted string in *content* with Chinese text."""
	        # Matching string literals in multiple languages
	        string_patterns = [
	            r'"[^"]*"',  # Double quoted strings
	            r"'[^']*'"   # Single quoted strings
	        ]
	        for pattern in string_patterns:
	            for string_literal in re.findall(pattern, content):
	                if self.CHINESE_CHAR_PATTERN.search(string_literal):
	                    violations.append(self.format_violation(path, "code", string_literal))

	    def format_violation(self, path: Path, location: str, content: str) -> str:
	        """Build the message for one finding, using the absolute form of *path*."""
	        return f"Chinese characters found in {location} at {path.absolute()}: {content}"

	    def assert_no_chinese_characters(self, violations: List[str]):
	        """Raise ``AssertionError`` listing *violations* unless it is empty."""
	        # Raised explicitly rather than via ``assert`` so the check still
	        # fails when Python runs with -O, which strips assert statements.
	        if len(violations) != 0:
	            raise AssertionError(
	                f"Found Chinese characters in files:\n{os.linesep.join(violations)}"
	            )

	class scan_target:
	    """Pair of switches telling the scanner which checks to run on a file."""

	    def __init__(self, check_comments: bool, check_code: bool):
	        # Both switches are stored unchanged under the parameter names.
	        self.check_comments = check_comments
	        self.check_code = check_code

	    def include_comments(self) -> bool:
	        """Return the value passed as ``check_comments``."""
	        return self.check_comments

	    def include_code(self) -> bool:
	        """Return the value passed as ``check_code``."""
	        return self.check_code

	# Predefined scan modes, stored as attributes on the class itself so that
	# callers can refer to them as scan_target.COMMENTS / .CODE / .ALL.
	scan_target.COMMENTS = scan_target(True, False)
	scan_target.CODE = scan_target(False, True)
	scan_target.ALL = scan_target(True, True)

	# When run as a script, only the comment check is executed.
	if __name__ == "__main__":
	    test = chinese_character_check_test()
	    test.should_not_contain_chinese_in_comments()