blob: 3479066f7e30e656d377c1941d129d76ae6faca1 [file] [log] [blame]
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
import os
import re
from pathlib import Path
from typing import List, Set
class chinese_character_check_test:
    """Check that source files do not contain Chinese characters.

    Files below a root directory are selected by extension, read as UTF-8 and
    searched with regular expressions. Every comment or string literal that
    contains a character matched by ``CHINESE_CHAR_PATTERN`` is recorded as a
    human-readable violation message.
    """

    # Matches a single character in the range U+4E00..U+9FA5.
    CHINESE_CHAR_PATTERN = re.compile(r'[\u4e00-\u9fa5]')

    # Directory or file names to skip. List bare names only (no paths): an
    # entry excludes every path that has a component with exactly that name.
    EXCLUDED_DIRS_AND_FILES = {
        "target",
        "node_modules",
        "dist",
    }

    # Supported file extensions
    SUPPORTED_EXTENSIONS = {
        ".java",
        ".kt",
        ".scala",
        ".js",
        ".jsx",
        ".ts",
        ".tsx",
        ".vue",
    }

    # Comment syntaxes searched by check_comments, compiled once.
    # DOTALL lets block comments span lines; MULTILINE makes `$` stop a
    # `//` comment at the end of its own line.
    _COMMENT_PATTERNS = tuple(
        re.compile(pattern, re.DOTALL | re.MULTILINE)
        for pattern in (
            r'//.*?$',      # Single line comments
            r'/\*.*?\*/',   # Multi line comments
            r'<!--.*?-->',  # HTML-style comments (e.g. in Vue templates)
        )
    )

    # String literal syntaxes searched by check_code, compiled once.
    # A backslash always consumes the following character, so an escaped
    # quote (\" or \') does not terminate the literal early.
    _STRING_PATTERNS = (
        re.compile(r'"(?:\\.|[^"\\])*"', re.DOTALL),  # Double quoted strings
        re.compile(r"'(?:\\.|[^'\\])*'", re.DOTALL),  # Single quoted strings
    )

    def should_not_contain_chinese_in_comments(self):
        """Scan comments under the default root and fail on any violation.

        Raises:
            AssertionError: if at least one comment contains Chinese
                characters.
        """
        violations = self.scan_for_chinese_characters(scan_target.COMMENTS)
        self.assert_no_chinese_characters(violations)

    def scan_for_chinese_characters(
        self, target: 'scan_target', root: Path = Path("..")
    ) -> List[str]:
        """Collect violations from every supported file below ``root``.

        The tree is walked once. Excluded directories are pruned before the
        walk descends into them, and entries are visited in sorted order so
        the report is the same on every run.

        Args:
            target: object providing ``include_comments()`` and
                ``include_code()`` to select what is checked.
            root: directory to scan; defaults to the parent of the current
                working directory.

        Returns:
            The list of violation messages (empty when nothing was found).
        """
        violations: List[str] = []
        for dirpath, dirnames, filenames in os.walk(root):
            # Assigning to dirnames[:] tells os.walk which sub-directories to
            # visit. Only names below root are tested, so the location of
            # root itself can never trigger an exclusion.
            dirnames[:] = sorted(
                name for name in dirnames if not self.is_excluded(Path(name))
            )
            for name in sorted(filenames):
                path = Path(dirpath) / name
                if self.is_valid_file(path) and not self.is_excluded(Path(name)):
                    self.process_file(path, target, violations)
        return violations

    def is_excluded(self, path: Path) -> bool:
        """Return True if any component of ``path`` is an excluded name.

        Components are compared for equality, so ``distribution/Foo.java`` is
        not excluded merely because it contains the text ``dist``.
        """
        return any(
            part in self.EXCLUDED_DIRS_AND_FILES for part in Path(path).parts
        )

    def is_valid_file(self, path: Path) -> bool:
        """Return True if ``path`` ends with one of ``SUPPORTED_EXTENSIONS``."""
        return str(path).endswith(tuple(self.SUPPORTED_EXTENSIONS))

    def process_file(self, path: Path, target: 'scan_target', violations: List[str]):
        """Read one file as UTF-8 and run the checks selected by ``target``.

        Files that cannot be opened or decoded are reported on stdout and
        skipped. Violations are appended to ``violations`` in place.
        """
        try:
            with open(path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable or non-UTF-8 file must not abort the
            # scan. Only read errors are handled here, so a bug in the
            # checks below is not mistaken for a clean file.
            print(f"Error processing file: {path}")
            print(e)
            return
        if target.include_comments():
            self.check_comments(content, path, violations)
        if target.include_code():
            self.check_code(content, path, violations)

    def check_comments(self, content: str, path: Path, violations: List[str]):
        """Append a violation for each comment that contains Chinese text.

        Each comment syntax is searched independently over the whole file
        content, so the match is purely textual (a ``//`` inside a string
        literal is treated as a comment start as well).
        """
        for pattern in self._COMMENT_PATTERNS:
            for comment in pattern.findall(content):
                if self.CHINESE_CHAR_PATTERN.search(comment):
                    violations.append(
                        self.format_violation(path, "comment", comment.strip())
                    )

    def check_code(self, content: str, path: Path, violations: List[str]):
        """Append a violation for each string literal with Chinese text.

        Literals are located textually over the whole file content; quote
        characters that appear inside comments are paired up as well.
        """
        for pattern in self._STRING_PATTERNS:
            for string_literal in pattern.findall(content):
                if self.CHINESE_CHAR_PATTERN.search(string_literal):
                    violations.append(
                        self.format_violation(path, "code", string_literal)
                    )

    def format_violation(self, path: Path, location: str, content: str) -> str:
        """Build the message for one finding, using the absolute file path."""
        return f"Chinese characters found in {location} at {path.absolute()}: {content}"

    def assert_no_chinese_characters(self, violations: List[str]):
        """Raise AssertionError listing all violations, if there are any.

        The exception is raised explicitly instead of through an ``assert``
        statement so the check still fires when Python runs with ``-O``.
        """
        if violations:
            raise AssertionError(
                f"Found Chinese characters in files:\n{os.linesep.join(violations)}"
            )
class scan_target:
    """Selects which parts of a source file a scan should inspect.

    Holds two independent boolean switches, one for comments and one for
    code, readable through ``include_comments`` and ``include_code``.
    """

    def __init__(self, check_comments: bool, check_code: bool):
        """Store both switches exactly as given.

        Args:
            check_comments: whether comments are to be inspected.
            check_code: whether code is to be inspected.
        """
        self.check_comments, self.check_code = check_comments, check_code

    def include_comments(self) -> bool:
        """Return the stored ``check_comments`` switch."""
        return self.check_comments

    def include_code(self) -> bool:
        """Return the stored ``check_code`` switch."""
        return self.check_code
# Predefined scan modes. They are attached after the class statement because
# each one is an instance of the class itself.
scan_target.COMMENTS = scan_target(check_comments=True, check_code=False)
scan_target.CODE = scan_target(check_comments=False, check_code=True)
scan_target.ALL = scan_target(check_comments=True, check_code=True)
if __name__ == "__main__":
    # Executed as a script: run the comment check once.
    chinese_character_check_test().should_not_contain_chinese_in_comments()