| from functools import lru_cache |
| from logging import getLogger |
| from typing import List, Optional |
| |
| from .constant import ( |
| COMMON_SAFE_ASCII_CHARACTERS, |
| TRACE, |
| UNICODE_SECONDARY_RANGE_KEYWORD, |
| ) |
| from .utils import ( |
| is_accentuated, |
| is_ascii, |
| is_case_variable, |
| is_cjk, |
| is_emoticon, |
| is_hangul, |
| is_hiragana, |
| is_katakana, |
| is_latin, |
| is_punctuation, |
| is_separator, |
| is_symbol, |
| is_thai, |
| is_unprintable, |
| remove_accent, |
| unicode_range, |
| ) |
| |
| |
class MessDetectorPlugin:
    """
    Abstract base class shared by every mess detection plugin.
    Concrete detectors MUST subclass it and override each member below;
    the implementations provided here only raise NotImplementedError.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed over to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Main routine, executed on a single character.
        Put here the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Bring the plugin back to its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio computed from what feed() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
| |
| |
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Measure how much of the printable content is punctuation or symbols.

    Every printable character is counted. A character that differs from the
    previous one and is not part of COMMON_SAFE_ASCII_CHARACTERS adds 1 when it
    is punctuation, or 2 when it is a symbol that is neither a digit nor an
    emoticon.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character fed, used to ignore immediate repetitions.
        self._last_printable_char: Optional[str] = None
        # Not read anywhere in this class; kept so the attribute still exists.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are fed to this plugin.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character and, when relevant, weigh it as punctuation or symbol.
        """
        self._character_count += 1

        is_candidate: bool = (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        )

        if is_candidate:
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh twice as much as punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every attribute to the value set by __init__.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # Without clearing the last character, the first character fed after a
        # reset would be ignored whenever it equals the last one seen before.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        (punctuation + symbol weight) / character count, or 0.0 when that
        value is below 0.3 or when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
| |
| |
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Measure the share of accentuated characters among alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic characters are fed to this plugin.
        """
        return character.isalpha()

    def feed(self, character: str) -> None:
        """
        Count the character, and count it again as accentuated when it is.
        """
        if is_accentuated(character):
            self._accentuated_count += 1

        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Put both counters back to zero.
        """
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Accentuated / total alphabetic characters. Yields 0.0 while fewer
        than 8 characters were fed or when the share is below 0.35.
        """
        seen: int = self._character_count

        # Also covers the empty case, so no division by zero can occur.
        if seen < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / seen

        if ratio_of_accentuation >= 0.35:
            return ratio_of_accentuation
        return 0.0
| |
| |
class UnprintablePlugin(MessDetectorPlugin):
    """
    Measure the share of unprintable characters, as reported by
    is_unprintable(), among all characters.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character, and count it as unprintable when it is.
        """
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Put both counters back to zero.
        """
        self._unprintable_count = 0
        # The total must be cleared as well, otherwise characters fed before
        # the reset would keep diluting the ratio computed afterwards.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Unprintable count, weighted by 8, over the total character count.
        Yields 0.0 when nothing has been fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
| |
| |
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Spot pairs of consecutive accentuated Latin letters.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """
        Only alphabetic Latin characters are fed to this plugin.
        """
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Compare the character with the previously fed one and score the pair
        when both are accentuated.
        """
        self._character_count += 1

        previous: Optional[str] = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            # Two accentuated upper case letters in a row.
            if character.isupper() and previous.isupper():
                self._successive_count += 1

            # Even worse when it is the same letter carrying another accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """
        Forget the previous character and put the counters back to zero.
        """
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """
        Twice the pair score over the character count; 0.0 when nothing was fed.
        """
        if not self._character_count:
            return 0.0

        return (self._successive_count * 2) / self._character_count
| |
| |
class SuspiciousRange(MessDetectorPlugin):
    """
    Count adjacent printable characters whose Unicode ranges are judged
    suspicious by is_suspiciously_successive_range().
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """
        Only printable characters are fed to this plugin.
        """
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character and compare its range with the previous one's.
        """
        self._character_count += 1

        breaks_sequence: bool = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )

        if breaks_sequence:
            # Whitespace, punctuation and safe ASCII never form a pair.
            self._last_printable_seen = None
            return

        previous: Optional[str] = self._last_printable_seen

        # With no previous character there is nothing to compare against yet.
        if previous is not None and is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """
        Forget the previous character and put the counters back to zero.
        """
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Twice the suspicious pair count over the character count.
        Yields 0.0 when nothing was fed or when the value is below 0.1.
        """
        if not self._character_count:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        if ratio_of_suspicious_range_usage >= 0.1:
            return ratio_of_suspicious_range_usage
        return 0.0
| |
| |
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Rebuild words from the character stream and flag the abnormal ones.

    A word is flagged as bad when, at its end, any of these holds:
    - it has 4+ characters and more than 34% of them are accentuated;
    - it has 4+ characters and ends with an accentuated upper case letter;
    - it has 24+ characters, contains a letter that raised the
      "foreign long" watch, and does not look camel-cased;
    - a symbol was met while the word was being buffered.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Word currently being assembled, and how many of its letters are accentuated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Grow the current word with the character, or close and evaluate the
        word when the character is whitespace, punctuation or a separator.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Raise the watch on a letter that is either non-Latin or
            # accentuated, and belongs to none of the scripts listed below.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # No word in progress: a non-alphabetic character has nothing to act on.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                last_character: str = self._buffer[-1]
                if is_accentuated(last_character) and last_character.isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                upper_case_count: int = sum(
                    1 for letter in self._buffer if letter.isupper()
                )
                # Some upper case letters, but no more than 30% of the word.
                probable_camel_cased: bool = (
                    upper_case_count > 0
                    and upper_case_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            # Start over for the next word.
            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word taints it; the symbol joins the buffer.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every attribute to the value set by __init__.
        """
        self._buffer = ""
        # Must be cleared together with the buffer, otherwise accents of the
        # discarded word would be charged to the next one.
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Characters belonging to bad words over characters belonging to any word.
        Yields 0.0 while at most 10 words were seen, unless at least one
        "foreign long" hit was recorded.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
| |
| |
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encodings often render the stop incorrectly when the content
    does not fit, which is easy to detect: look for an overuse of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Count the character either as a wrong stop or as a CJK character.
        """
        if character in {"丅", "丄"}:
            # Wrong stops are tallied apart and never counted as CJK.
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Put both counters back to zero.
        """
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops over CJK characters; 0.0 until 16 CJK characters were seen.
        """
        cjk_seen: int = self._cjk_character_count

        if cjk_seen < 16:
            return 0.0

        return self._wrong_stop_count / cjk_seen
| |
| |
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Count repeated upper/lower case alternations inside chunks of letters.

    A chunk ends on a character that is not an alphabetic character accepted
    by is_case_variable(). Its score is kept only when the chunk holds at most
    64 characters, is not ended by a digit, and contains a non-ASCII character.
    """

    def __init__(self) -> None:
        # True when the previous pair of letters already switched case.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score of the chunk in progress, and score accumulated from kept chunks.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """
        Every character is fed to this plugin.
        """
        return True

    def feed(self, character: str) -> None:
        """
        Either close the chunk in progress or extend it with the character.
        """
        cased_letter = character.isalpha() and is_case_variable(character)
        ends_chunk = cased_letter is False

        if ends_chunk and self._character_count_since_last_sep > 0:
            keep_chunk_score: bool = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and self._current_ascii_only is False
            )

            if keep_chunk_score:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Open a fresh chunk; the separator itself is still counted.
            self._character_count += 1
            self._character_count_since_last_sep = 0
            self._successive_upper_lower_count = 0
            self._current_ascii_only = True
            self._last_alpha_seen = None
            self._buf = False

            return

        # Reached for letters, and also for a separator met while no chunk is open.
        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        previous: Optional[str] = self._last_alpha_seen

        if previous is not None:
            case_switched: bool = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )

            if case_switched and self._buf is True:
                # Second switch in a row: score it and start watching again.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                self._buf = case_switched

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every attribute to the value set by __init__.
        """
        self._buf = False
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count = 0
        self._last_alpha_seen = None
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """
        Accumulated score over the character count; 0.0 when nothing was fed.
        """
        if not self._character_count:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
| |
| |
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Tell whether two Unicode ranges met side by side should be seen as suspicious.
    An unknown range (None) on either side is always suspicious; two identical
    ranges never are. Every other pair is suspicious unless one of the
    exceptions below applies.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    latin_in_a: bool = "Latin" in unicode_range_a
    latin_in_b: bool = "Latin" in unicode_range_b

    if latin_in_a and latin_in_b:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if (latin_in_a or latin_in_b) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    # Ranges sharing a keyword that is not a secondary one are accepted.
    keywords_range_b = unicode_range_b.split(" ")

    if any(
        keyword not in UNICODE_SECONDARY_RANGE_KEYWORD and keyword in keywords_range_b
        for keyword in unicode_range_a.split(" ")
    ):
        return False

    cjk_involved: bool = "CJK" in unicode_range_a or "CJK" in unicode_range_b

    # Japanese Exception
    kana_ranges = ("Hiragana", "Katakana")
    range_a_jp_chars: bool = unicode_range_a in kana_ranges
    range_b_jp_chars: bool = unicode_range_b in kana_ranges

    if range_a_jp_chars or range_b_jp_chars:
        if cjk_involved or (range_a_jp_chars and range_b_jp_chars):
            return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if cjk_involved or "Basic Latin" in (unicode_range_a, unicode_range_b):
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    # Two kana ranges side by side were already accepted above, so only the
    # CJK case can still reach this point.
    if cjk_involved:
        for keyword in ("Punctuation", "Forms"):
            if keyword in unicode_range_a or keyword in unicode_range_b:
                return False

    return True
| |
| |
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio for a decoded sequence, rounded to 3 decimals.
    One instance of each direct MessDetectorPlugin subclass is fed with every
    character it declares eligible. The sum of the plugin ratios is refreshed
    at regular intervals and the computation stops early as soon as it reaches
    the maximum threshold. When debug is set, the outcome is logged at TRACE level.
    """

    detectors: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # A trailing line feed is appended to the sequence, hence the extra one.
    length: int = len(decoded_sequence) + 1
    last_index: int = length - 1

    mean_mess_ratio: float = 0.0

    # Number of characters between two refreshes of the ratio.
    checkpoint_interval: int
    if length < 512:
        checkpoint_interval = 32
    elif length <= 1024:
        checkpoint_interval = 64
    else:
        checkpoint_interval = 128

    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint: bool = index > 0 and index % checkpoint_interval == 0

        if at_checkpoint or index == last_index:
            mean_mess_ratio = sum(detector.ratio for detector in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={checkpoint_interval} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for detector in detectors:  # pragma: nocover
            logger.log(TRACE, f"{detector.__class__}: {detector.ratio}")

    return round(mean_mess_ratio, 3)