python-docs-venv/lib/python3.11/site-packages/charset_normalizer/md.py - datasketches-python - Git at Google

 from functools import lru_cache
 from logging import getLogger
 from typing import List, Optional

 from .constant import (
     COMMON_SAFE_ASCII_CHARACTERS,
     TRACE,
     UNICODE_SECONDARY_RANGE_KEYWORD,
 )
 from .utils import (
     is_accentuated,
     is_ascii,
     is_case_variable,
     is_cjk,
     is_emoticon,
     is_hangul,
     is_hiragana,
     is_katakana,
     is_latin,
     is_punctuation,
     is_separator,
     is_symbol,
     is_thai,
     is_unprintable,
     remove_accent,
     unicode_range,
 )


 class MessDetectorPlugin:
     """
     Base abstract class used for mess detection plugins.
     All detectors MUST extend and implement given methods.

     Every member defined here raises NotImplementedError; a concrete
     detector is expected to override all four of them.
     """

     def eligible(self, character: str) -> bool:
         """
         Determine if given character should be fed in.
         """
         raise NotImplementedError  # pragma: nocover

     def feed(self, character: str) -> None:
         """
         The main routine to be executed upon character.
         Insert the logic in which the text would be considered chaotic.
         """
         raise NotImplementedError  # pragma: nocover

     def reset(self) -> None:  # pragma: no cover
         """
         Permit to reset the plugin to the initial state.
         """
         raise NotImplementedError

     @property
     def ratio(self) -> float:
         """
         Compute the chaos ratio based on what your feed() has seen.
         Must NOT be lower than 0.; No restriction gt 0.
         """
         raise NotImplementedError  # pragma: nocover


 class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
     """
     Flag text in which punctuation and symbols take up too much room.

     Punctuation weighs 1 and symbols weigh 2. A character identical to the
     one fed just before it, or listed in COMMON_SAFE_ASCII_CHARACTERS, is
     counted as a character but never as punctuation/symbol.
     """

     def __init__(self) -> None:
         self._punctuation_count: int = 0
         self._symbol_count: int = 0
         self._character_count: int = 0

         self._last_printable_char: Optional[str] = None
         # Not read anywhere in this class; kept so the attribute set is unchanged.
         self._frenzy_symbol_in_word: bool = False

     def eligible(self, character: str) -> bool:
         """Only printable characters are analysed."""
         return character.isprintable()

     def feed(self, character: str) -> None:
         """Count the character and, when relevant, its punctuation/symbol weight."""
         self._character_count += 1

         if (
             character != self._last_printable_char
             and character not in COMMON_SAFE_ASCII_CHARACTERS
         ):
             if is_punctuation(character):
                 self._punctuation_count += 1
             elif (
                 not character.isdigit()
                 and is_symbol(character)
                 and is_emoticon(character) is False
             ):
                 # Symbols are considered twice as suspicious as punctuation.
                 self._symbol_count += 2

         self._last_printable_char = character

     def reset(self) -> None:  # pragma: no cover
         """Restore every attribute to the value set by __init__."""
         self._punctuation_count = 0
         self._character_count = 0
         self._symbol_count = 0
         # A stale "last character" would make the first character fed after
         # a reset be skipped as if it were a repetition.
         self._last_printable_char = None
         self._frenzy_symbol_in_word = False

     @property
     def ratio(self) -> float:
         """Return the weighted share of punctuation/symbols, or 0.0 below 0.3."""
         if self._character_count == 0:
             return 0.0

         ratio_of_punctuation: float = (
             self._punctuation_count + self._symbol_count
         ) / self._character_count

         return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


 class TooManyAccentuatedPlugin(MessDetectorPlugin):
     """Flag text whose alphabetic characters are too often accentuated."""

     def __init__(self) -> None:
         self._accentuated_count: int = 0
         self._character_count: int = 0

     def eligible(self, character: str) -> bool:
         """Only alphabetic characters are analysed."""
         return character.isalpha()

     def feed(self, character: str) -> None:
         """Count the letter, and count it again as accentuated when it is."""
         self._character_count += 1
         self._accentuated_count += 1 if is_accentuated(character) else 0

     def reset(self) -> None:  # pragma: no cover
         """Restore both counters to zero."""
         self._accentuated_count = 0
         self._character_count = 0

     @property
     def ratio(self) -> float:
         """Return the accentuated share, or 0.0 when it is not conclusive."""
         # Fewer than 8 letters (which covers zero) is too small a sample.
         if self._character_count < 8:
             return 0.0

         accentuated_share: float = self._accentuated_count / self._character_count
         if accentuated_share < 0.35:
             return 0.0
         return accentuated_share


 class UnprintablePlugin(MessDetectorPlugin):
     """
     Flag text containing unprintable characters.

     Every character is inspected; each unprintable one weighs 8 in the ratio.
     """

     def __init__(self) -> None:
         self._unprintable_count: int = 0
         self._character_count: int = 0

     def eligible(self, character: str) -> bool:
         """Accept every character."""
         return True

     def feed(self, character: str) -> None:
         """Count the character, and separately count it when unprintable."""
         if is_unprintable(character):
             self._unprintable_count += 1
         self._character_count += 1

     def reset(self) -> None:  # pragma: no cover
         """Restore both counters to their initial value."""
         self._unprintable_count = 0
         # Without this the next run would divide by a stale denominator.
         self._character_count = 0

     @property
     def ratio(self) -> float:
         """Return 8 * unprintable / total, or 0.0 when nothing was fed."""
         if self._character_count == 0:
             return 0.0

         return (self._unprintable_count * 8) / self._character_count


 class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
     """Flag Latin text in which accentuated letters follow one another."""

     def __init__(self) -> None:
         self._character_count: int = 0
         self._successive_count: int = 0

         self._last_latin_character: Optional[str] = None

     def eligible(self, character: str) -> bool:
         """Only alphabetic Latin characters are analysed."""
         return character.isalpha() and is_latin(character)

     def feed(self, character: str) -> None:
         """Compare the letter with the previous one and score accent pairs."""
         self._character_count += 1

         previous = self._last_latin_character
         both_accentuated = (
             previous is not None
             and is_accentuated(character)
             and is_accentuated(previous)
         )

         if both_accentuated:
             penalty = 0
             if character.isupper() and previous.isupper():
                 penalty += 1
             # The same base letter carrying two different accents is even worse.
             if remove_accent(character) == remove_accent(previous):
                 penalty += 1
             self._successive_count += penalty

         self._last_latin_character = character

     def reset(self) -> None:  # pragma: no cover
         """Restore the counters and forget the previous letter."""
         self._last_latin_character = None
         self._character_count = 0
         self._successive_count = 0

     @property
     def ratio(self) -> float:
         """Return twice the pair score divided by the letters seen."""
         if not self._character_count:
             return 0.0

         return (2 * self._successive_count) / self._character_count


 class SuspiciousRange(MessDetectorPlugin):
     """Flag text in which adjacent characters come from unrelated Unicode ranges."""

     def __init__(self) -> None:
         self._character_count: int = 0
         self._suspicious_successive_range_count: int = 0
         self._last_printable_seen: Optional[str] = None

     def eligible(self, character: str) -> bool:
         """Only printable characters are analysed."""
         return character.isprintable()

     def feed(self, character: str) -> None:
         """Score the pair formed by this character and the previous one."""
         self._character_count += 1

         breaks_sequence = (
             character.isspace()
             or is_punctuation(character)
             or character in COMMON_SAFE_ASCII_CHARACTERS
         )
         if breaks_sequence:
             # Neutral characters never take part in a pair.
             self._last_printable_seen = None
             return

         previous = self._last_printable_seen
         if previous is not None and is_suspiciously_successive_range(
             unicode_range(previous), unicode_range(character)
         ):
             self._suspicious_successive_range_count += 1

         self._last_printable_seen = character

     def reset(self) -> None:  # pragma: no cover
         """Restore the counters and forget the previous character."""
         self._last_printable_seen = None
         self._suspicious_successive_range_count = 0
         self._character_count = 0

     @property
     def ratio(self) -> float:
         """Return twice the suspicious pairs over characters, or 0.0 below 0.1."""
         if self._character_count == 0:
             return 0.0

         usage: float = (
             self._suspicious_successive_range_count * 2
         ) / self._character_count

         return usage if usage >= 0.1 else 0.0


 class SuperWeirdWordPlugin(MessDetectorPlugin):
     """
     Flag text made of implausible words.

     Alphabetic characters are accumulated into a buffer; when a space,
     punctuation or separator arrives the buffered word is judged. The ratio
     is the share of characters that belong to words judged bad.
     """

     def __init__(self) -> None:
         self._word_count: int = 0
         self._bad_word_count: int = 0
         self._foreign_long_count: int = 0

         self._is_current_word_bad: bool = False
         self._foreign_long_watch: bool = False

         self._character_count: int = 0
         self._bad_character_count: int = 0

         self._buffer: str = ""
         self._buffer_accent_count: int = 0

     def eligible(self, character: str) -> bool:
         """Accept every character."""
         return True

     def feed(self, character: str) -> None:
         """Grow the current word, close it, or mark it bad on a stray symbol."""
         if character.isalpha():
             self._buffer += character
             if is_accentuated(character):
                 self._buffer_accent_count += 1
             if (
                 self._foreign_long_watch is False
                 and (is_latin(character) is False or is_accentuated(character))
                 and is_cjk(character) is False
                 and is_hangul(character) is False
                 and is_katakana(character) is False
                 and is_hiragana(character) is False
                 and is_thai(character) is False
             ):
                 self._foreign_long_watch = True
             return

         # Nothing buffered: a non-alphabetic character has no word to affect.
         if not self._buffer:
             return

         if character.isspace() or is_punctuation(character) or is_separator(character):
             self._close_word()
         elif (
             character not in {"<", ">", "-", "=", "~", "|", "_"}
             and character.isdigit() is False
             and is_symbol(character)
         ):
             self._is_current_word_bad = True
             self._buffer += character

     def _close_word(self) -> None:
         """Judge the buffered word, update the counters and empty the buffer."""
         self._word_count += 1
         buffer_length: int = len(self._buffer)

         self._character_count += buffer_length

         if buffer_length >= 4:
             if self._buffer_accent_count / buffer_length > 0.34:
                 self._is_current_word_bad = True
             # Word/Buffer ending with an upper case accentuated letter are so rare,
             # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
             last_character = self._buffer[-1]
             if is_accentuated(last_character) and last_character.isupper():
                 self._foreign_long_count += 1
                 self._is_current_word_bad = True

         if buffer_length >= 24 and self._foreign_long_watch:
             upper_count = sum(1 for c in self._buffer if c.isupper())
             # A long word with a few capitals is probably camelCase, not garbage.
             probable_camel_cased: bool = (
                 upper_count > 0 and upper_count / buffer_length <= 0.3
             )

             if not probable_camel_cased:
                 self._foreign_long_count += 1
                 self._is_current_word_bad = True

         if self._is_current_word_bad:
             self._bad_word_count += 1
             self._bad_character_count += buffer_length
             self._is_current_word_bad = False

         self._foreign_long_watch = False
         self._buffer = ""
         self._buffer_accent_count = 0

     def reset(self) -> None:  # pragma: no cover
         """Restore every attribute to the value set by __init__."""
         self._buffer = ""
         # Must be cleared with the buffer, otherwise accents of a discarded
         # partial word would be attributed to the next word.
         self._buffer_accent_count = 0
         self._is_current_word_bad = False
         self._foreign_long_watch = False
         self._bad_word_count = 0
         self._word_count = 0
         self._character_count = 0
         self._bad_character_count = 0
         self._foreign_long_count = 0

     @property
     def ratio(self) -> float:
         """Return bad characters over counted characters.

         Returns 0.0 while at most 10 words were closed and no foreign-long
         word was recorded.
         """
         if self._word_count <= 10 and self._foreign_long_count == 0:
             return 0.0

         return self._bad_character_count / self._character_count


 class CjkInvalidStopPlugin(MessDetectorPlugin):
     """
     GB(Chinese) based encodings often render the full stop incorrectly when
     the content does not fit, which is easy to spot: look for an overuse of
     '丅' and '丄' relative to the other CJK characters.
     """

     def __init__(self) -> None:
         self._cjk_character_count: int = 0
         self._wrong_stop_count: int = 0

     def eligible(self, character: str) -> bool:
         """Accept every character."""
         return True

     def feed(self, character: str) -> None:
         """Count wrong stops, and count any other CJK character separately."""
         if character in ("丅", "丄"):
             self._wrong_stop_count += 1
         elif is_cjk(character):
             self._cjk_character_count += 1

     def reset(self) -> None:  # pragma: no cover
         """Restore both counters to zero."""
         self._cjk_character_count = 0
         self._wrong_stop_count = 0

     @property
     def ratio(self) -> float:
         """Return wrong stops over CJK characters once 16 of the latter were seen."""
         if self._cjk_character_count >= 16:
             return self._wrong_stop_count / self._cjk_character_count
         return 0.0


 class ArchaicUpperLowerPlugin(MessDetectorPlugin):
     """
     Flag chunks of letters whose case keeps flipping (e.g. "aBcD").

     A chunk ends on a character that is not a case-variable letter. Its
     score is only kept when the chunk is at most 64 characters long,
     contained a non-ASCII character and was not ended by a digit.
     """

     def __init__(self) -> None:
         # True right after a single case flip, waiting for a second one.
         self._buf: bool = False

         self._character_count_since_last_sep: int = 0

         self._successive_upper_lower_count: int = 0
         self._successive_upper_lower_count_final: int = 0

         self._character_count: int = 0

         self._last_alpha_seen: Optional[str] = None
         self._current_ascii_only: bool = True

     def eligible(self, character: str) -> bool:
         """Accept every character."""
         return True

     def feed(self, character: str) -> None:
         """Either close the current chunk or extend it with this character."""
         ends_chunk = (not character.isalpha()) or is_case_variable(character) is False

         if ends_chunk and self._character_count_since_last_sep > 0:
             chunk_counts = (
                 self._character_count_since_last_sep <= 64
                 and not character.isdigit()
                 and self._current_ascii_only is False
             )
             if chunk_counts:
                 self._successive_upper_lower_count_final += (
                     self._successive_upper_lower_count
                 )

             # Start a fresh chunk.
             self._successive_upper_lower_count = 0
             self._character_count_since_last_sep = 0
             self._last_alpha_seen = None
             self._buf = False
             self._character_count += 1
             self._current_ascii_only = True

             return

         if self._current_ascii_only is True and is_ascii(character) is False:
             self._current_ascii_only = False

         previous = self._last_alpha_seen
         if previous is not None:
             case_flipped = (character.isupper() and previous.islower()) or (
                 character.islower() and previous.isupper()
             )
             if case_flipped and self._buf is True:
                 # Second flip in a row: score it and start waiting again.
                 self._successive_upper_lower_count += 2
                 self._buf = False
             else:
                 self._buf = case_flipped

         self._character_count += 1
         self._character_count_since_last_sep += 1
         self._last_alpha_seen = character

     def reset(self) -> None:  # pragma: no cover
         """Restore every attribute to the value set by __init__."""
         self._buf = False
         self._character_count_since_last_sep = 0
         self._successive_upper_lower_count = 0
         self._successive_upper_lower_count_final = 0
         self._character_count = 0
         self._last_alpha_seen = None
         self._current_ascii_only = True

     @property
     def ratio(self) -> float:
         """Return the kept flip score divided by the characters seen."""
         if not self._character_count:
             return 0.0

         return self._successive_upper_lower_count_final / self._character_count


 @lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
     unicode_range_a: Optional[str], unicode_range_b: Optional[str]
 ) -> bool:
     """
     Tell whether two Unicode range names, seen on adjacent characters,
     form a suspicious pair. An unknown range (None) is always suspicious.
     """
     if unicode_range_a is None or unicode_range_b is None:
         return True

     if unicode_range_a == unicode_range_b:
         return False

     def in_either(keyword: str) -> bool:
         return keyword in unicode_range_a or keyword in unicode_range_b

     latin_in_a = "Latin" in unicode_range_a
     latin_in_b = "Latin" in unicode_range_b

     if latin_in_a and latin_in_b:
         return False

     if in_either("Emoticons"):
         return False

     # Latin characters can be accompanied with a combining diacritical mark
     # eg. Vietnamese.
     if (latin_in_a or latin_in_b) and in_either("Combining"):
         return False

     # Two ranges sharing a significant word in their names are related.
     words_b = unicode_range_b.split(" ")
     if any(
         word in words_b
         for word in unicode_range_a.split(" ")
         if word not in UNICODE_SECONDARY_RANGE_KEYWORD
     ):
         return False

     # Japanese Exception
     kana_ranges = ("Hiragana", "Katakana")
     a_is_kana = unicode_range_a in kana_ranges
     b_is_kana = unicode_range_b in kana_ranges
     has_cjk = in_either("CJK")

     if (a_is_kana or b_is_kana) and has_cjk:
         return False
     if a_is_kana and b_is_kana:
         return False

     if in_either("Hangul"):
         if has_cjk:
             return False
         if "Basic Latin" in (unicode_range_a, unicode_range_b):
             return False

     # Chinese/Japanese use dedicated range for punctuation and/or separators.
     if has_cjk or (a_is_kana and b_is_kana):
         if in_either("Punctuation") or in_either("Forms"):
             return False

     return True


 @lru_cache(maxsize=2048)
 def mess_ratio(
     decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
 ) -> float:
     """
     Compute the mess ratio of a decoded sequence by feeding it, character by
     character, to one instance of every MessDetectorPlugin subclass.
     The detector ratios are summed at regular checkpoints; reaching
     maximum_threshold stops the computation early.
     """
     detectors: List[MessDetectorPlugin] = [
         plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
     ]

     # A trailing "\n" is appended to the sequence, hence the + 1.
     length: int = len(decoded_sequence) + 1
     last_index: int = length - 1

     mean_mess_ratio: float = 0.0

     # Distance, in characters, between two intermediary evaluations.
     checkpoint_step: int
     if length < 512:
         checkpoint_step = 32
     elif length <= 1024:
         checkpoint_step = 64
     else:
         checkpoint_step = 128

     for index, character in enumerate(decoded_sequence + "\n"):
         for detector in detectors:
             if detector.eligible(character):
                 detector.feed(character)

         at_checkpoint = index > 0 and index % checkpoint_step == 0
         if at_checkpoint or index == last_index:
             mean_mess_ratio = sum(dt.ratio for dt in detectors)

             if mean_mess_ratio >= maximum_threshold:
                 break

     if debug:
         logger = getLogger("charset_normalizer")

         logger.log(
             TRACE,
             "Mess-detector extended-analysis start. "
             f"intermediary_mean_mess_ratio_calc={checkpoint_step} mean_mess_ratio={mean_mess_ratio} "
             f"maximum_threshold={maximum_threshold}",
         )

         if len(decoded_sequence) > 16:
             logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
             logger.log(TRACE, f"Ending with: {decoded_sequence[-16:]}")

         for dt in detectors:  # pragma: nocover
             logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

     return round(mean_mess_ratio, 3)
	from functools import lru_cache
	from logging import getLogger
	from typing import List, Optional

	from .constant import (
	COMMON_SAFE_ASCII_CHARACTERS,
	TRACE,
	UNICODE_SECONDARY_RANGE_KEYWORD,
	)
	from .utils import (
	is_accentuated,
	is_ascii,
	is_case_variable,
	is_cjk,
	is_emoticon,
	is_hangul,
	is_hiragana,
	is_katakana,
	is_latin,
	is_punctuation,
	is_separator,
	is_symbol,
	is_thai,
	is_unprintable,
	remove_accent,
	unicode_range,
	)


	class MessDetectorPlugin:
	    """
	    Base abstract class used for mess detection plugins.
	    All detectors MUST extend and implement given methods.

	    Every member defined here raises NotImplementedError; a concrete
	    detector is expected to override all four of them.
	    """

	    def eligible(self, character: str) -> bool:
	        """
	        Determine if given character should be fed in.
	        """
	        raise NotImplementedError  # pragma: nocover

	    def feed(self, character: str) -> None:
	        """
	        The main routine to be executed upon character.
	        Insert the logic in which the text would be considered chaotic.
	        """
	        raise NotImplementedError  # pragma: nocover

	    def reset(self) -> None:  # pragma: no cover
	        """
	        Permit to reset the plugin to the initial state.
	        """
	        raise NotImplementedError

	    @property
	    def ratio(self) -> float:
	        """
	        Compute the chaos ratio based on what your feed() has seen.
	        Must NOT be lower than 0.; No restriction gt 0.
	        """
	        raise NotImplementedError  # pragma: nocover


	class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
	    """
	    Flag text in which punctuation and symbols take up too much room.

	    Punctuation weighs 1 and symbols weigh 2. A character identical to the
	    one fed just before it, or listed in COMMON_SAFE_ASCII_CHARACTERS, is
	    counted as a character but never as punctuation/symbol.
	    """

	    def __init__(self) -> None:
	        self._punctuation_count: int = 0
	        self._symbol_count: int = 0
	        self._character_count: int = 0

	        self._last_printable_char: Optional[str] = None
	        # Not read anywhere in this class; kept so the attribute set is unchanged.
	        self._frenzy_symbol_in_word: bool = False

	    def eligible(self, character: str) -> bool:
	        """Only printable characters are analysed."""
	        return character.isprintable()

	    def feed(self, character: str) -> None:
	        """Count the character and, when relevant, its punctuation/symbol weight."""
	        self._character_count += 1

	        if (
	            character != self._last_printable_char
	            and character not in COMMON_SAFE_ASCII_CHARACTERS
	        ):
	            if is_punctuation(character):
	                self._punctuation_count += 1
	            elif (
	                not character.isdigit()
	                and is_symbol(character)
	                and is_emoticon(character) is False
	            ):
	                # Symbols are considered twice as suspicious as punctuation.
	                self._symbol_count += 2

	        self._last_printable_char = character

	    def reset(self) -> None:  # pragma: no cover
	        """Restore every attribute to the value set by __init__."""
	        self._punctuation_count = 0
	        self._character_count = 0
	        self._symbol_count = 0
	        # A stale "last character" would make the first character fed after
	        # a reset be skipped as if it were a repetition.
	        self._last_printable_char = None
	        self._frenzy_symbol_in_word = False

	    @property
	    def ratio(self) -> float:
	        """Return the weighted share of punctuation/symbols, or 0.0 below 0.3."""
	        if self._character_count == 0:
	            return 0.0

	        ratio_of_punctuation: float = (
	            self._punctuation_count + self._symbol_count
	        ) / self._character_count

	        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


	class TooManyAccentuatedPlugin(MessDetectorPlugin):
	    """Flag text whose alphabetic characters are too often accentuated."""

	    def __init__(self) -> None:
	        self._character_count: int = 0
	        self._accentuated_count: int = 0

	    def eligible(self, character: str) -> bool:
	        """Only alphabetic characters are analysed."""
	        return character.isalpha()

	    def feed(self, character: str) -> None:
	        """Count the letter, and count it again as accentuated when it is."""
	        self._character_count += 1

	        if is_accentuated(character):
	            self._accentuated_count += 1

	    def reset(self) -> None:  # pragma: no cover
	        """Restore both counters to zero."""
	        self._character_count = 0
	        self._accentuated_count = 0

	    @property
	    def ratio(self) -> float:
	        """Return the accentuated share, or 0.0 when it is not conclusive."""
	        # Fewer than 8 letters (which covers zero) is too small a sample.
	        if self._character_count < 8:
	            return 0.0
	        ratio_of_accentuation: float = self._accentuated_count / self._character_count
	        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


	class UnprintablePlugin(MessDetectorPlugin):
	    """
	    Flag text containing unprintable characters.

	    Every character is inspected; each unprintable one weighs 8 in the ratio.
	    """

	    def __init__(self) -> None:
	        self._unprintable_count: int = 0
	        self._character_count: int = 0

	    def eligible(self, character: str) -> bool:
	        """Accept every character."""
	        return True

	    def feed(self, character: str) -> None:
	        """Count the character, and separately count it when unprintable."""
	        if is_unprintable(character):
	            self._unprintable_count += 1
	        self._character_count += 1

	    def reset(self) -> None:  # pragma: no cover
	        """Restore both counters to their initial value."""
	        self._unprintable_count = 0
	        # Without this the next run would divide by a stale denominator.
	        self._character_count = 0

	    @property
	    def ratio(self) -> float:
	        """Return 8 * unprintable / total, or 0.0 when nothing was fed."""
	        if self._character_count == 0:
	            return 0.0

	        return (self._unprintable_count * 8) / self._character_count


	class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
	    """Flag Latin text in which accentuated letters follow one another."""

	    def __init__(self) -> None:
	        self._successive_count: int = 0
	        self._character_count: int = 0

	        self._last_latin_character: Optional[str] = None

	    def eligible(self, character: str) -> bool:
	        """Only alphabetic Latin characters are analysed."""
	        return character.isalpha() and is_latin(character)

	    def feed(self, character: str) -> None:
	        """Compare the letter with the previous one and score accent pairs."""
	        self._character_count += 1
	        if (
	            self._last_latin_character is not None
	            and is_accentuated(character)
	            and is_accentuated(self._last_latin_character)
	        ):
	            if character.isupper() and self._last_latin_character.isupper():
	                self._successive_count += 1
	            # Worse if its the same char duplicated with different accent.
	            if remove_accent(character) == remove_accent(self._last_latin_character):
	                self._successive_count += 1
	        self._last_latin_character = character

	    def reset(self) -> None:  # pragma: no cover
	        """Restore the counters and forget the previous letter."""
	        self._successive_count = 0
	        self._character_count = 0
	        self._last_latin_character = None

	    @property
	    def ratio(self) -> float:
	        """Return twice the pair score divided by the letters seen."""
	        if self._character_count == 0:
	            return 0.0

	        return (self._successive_count * 2) / self._character_count


	class SuspiciousRange(MessDetectorPlugin):
	    """Flag text in which adjacent characters come from unrelated Unicode ranges."""

	    def __init__(self) -> None:
	        self._suspicious_successive_range_count: int = 0
	        self._character_count: int = 0
	        self._last_printable_seen: Optional[str] = None

	    def eligible(self, character: str) -> bool:
	        """Only printable characters are analysed."""
	        return character.isprintable()

	    def feed(self, character: str) -> None:
	        """Score the pair formed by this character and the previous one."""
	        self._character_count += 1

	        if (
	            character.isspace()
	            or is_punctuation(character)
	            or character in COMMON_SAFE_ASCII_CHARACTERS
	        ):
	            # Neutral characters never take part in a pair.
	            self._last_printable_seen = None
	            return

	        if self._last_printable_seen is None:
	            self._last_printable_seen = character
	            return

	        unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
	        unicode_range_b: Optional[str] = unicode_range(character)

	        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
	            self._suspicious_successive_range_count += 1

	        self._last_printable_seen = character

	    def reset(self) -> None:  # pragma: no cover
	        """Restore the counters and forget the previous character."""
	        self._character_count = 0
	        self._suspicious_successive_range_count = 0
	        self._last_printable_seen = None

	    @property
	    def ratio(self) -> float:
	        """Return twice the suspicious pairs over characters, or 0.0 below 0.1."""
	        if self._character_count == 0:
	            return 0.0

	        ratio_of_suspicious_range_usage: float = (
	            self._suspicious_successive_range_count * 2
	        ) / self._character_count

	        if ratio_of_suspicious_range_usage < 0.1:
	            return 0.0

	        return ratio_of_suspicious_range_usage


	class SuperWeirdWordPlugin(MessDetectorPlugin):
	    """
	    Flag text made of implausible words.

	    Alphabetic characters are accumulated into a buffer; when a space,
	    punctuation or separator arrives the buffered word is judged. The ratio
	    is the share of characters that belong to words judged bad.
	    """

	    def __init__(self) -> None:
	        self._word_count: int = 0
	        self._bad_word_count: int = 0
	        self._foreign_long_count: int = 0

	        self._is_current_word_bad: bool = False
	        self._foreign_long_watch: bool = False

	        self._character_count: int = 0
	        self._bad_character_count: int = 0

	        self._buffer: str = ""
	        self._buffer_accent_count: int = 0

	    def eligible(self, character: str) -> bool:
	        """Accept every character."""
	        return True

	    def feed(self, character: str) -> None:
	        """Grow the current word, close it, or mark it bad on a stray symbol."""
	        if character.isalpha():
	            self._buffer += character
	            if is_accentuated(character):
	                self._buffer_accent_count += 1
	            if (
	                self._foreign_long_watch is False
	                and (is_latin(character) is False or is_accentuated(character))
	                and is_cjk(character) is False
	                and is_hangul(character) is False
	                and is_katakana(character) is False
	                and is_hiragana(character) is False
	                and is_thai(character) is False
	            ):
	                self._foreign_long_watch = True
	            return

	        # Nothing buffered: a non-alphabetic character has no word to affect.
	        if not self._buffer:
	            return

	        if character.isspace() or is_punctuation(character) or is_separator(character):
	            self._close_word()
	        elif (
	            character not in {"<", ">", "-", "=", "~", "|", "_"}
	            and character.isdigit() is False
	            and is_symbol(character)
	        ):
	            self._is_current_word_bad = True
	            self._buffer += character

	    def _close_word(self) -> None:
	        """Judge the buffered word, update the counters and empty the buffer."""
	        self._word_count += 1
	        buffer_length: int = len(self._buffer)

	        self._character_count += buffer_length

	        if buffer_length >= 4:
	            if self._buffer_accent_count / buffer_length > 0.34:
	                self._is_current_word_bad = True
	            # Word/Buffer ending with an upper case accentuated letter are so rare,
	            # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
	            last_character = self._buffer[-1]
	            if is_accentuated(last_character) and last_character.isupper():
	                self._foreign_long_count += 1
	                self._is_current_word_bad = True

	        if buffer_length >= 24 and self._foreign_long_watch:
	            upper_count = sum(1 for c in self._buffer if c.isupper())
	            # A long word with a few capitals is probably camelCase, not garbage.
	            probable_camel_cased: bool = (
	                upper_count > 0 and upper_count / buffer_length <= 0.3
	            )

	            if not probable_camel_cased:
	                self._foreign_long_count += 1
	                self._is_current_word_bad = True

	        if self._is_current_word_bad:
	            self._bad_word_count += 1
	            self._bad_character_count += buffer_length
	            self._is_current_word_bad = False

	        self._foreign_long_watch = False
	        self._buffer = ""
	        self._buffer_accent_count = 0

	    def reset(self) -> None:  # pragma: no cover
	        """Restore every attribute to the value set by __init__."""
	        self._buffer = ""
	        # Must be cleared with the buffer, otherwise accents of a discarded
	        # partial word would be attributed to the next word.
	        self._buffer_accent_count = 0
	        self._is_current_word_bad = False
	        self._foreign_long_watch = False
	        self._bad_word_count = 0
	        self._word_count = 0
	        self._character_count = 0
	        self._bad_character_count = 0
	        self._foreign_long_count = 0

	    @property
	    def ratio(self) -> float:
	        """Return bad characters over counted characters.

	        Returns 0.0 while at most 10 words were closed and no foreign-long
	        word was recorded.
	        """
	        if self._word_count <= 10 and self._foreign_long_count == 0:
	            return 0.0

	        return self._bad_character_count / self._character_count


	class CjkInvalidStopPlugin(MessDetectorPlugin):
	    """
	    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
	    can be easily detected. Searching for the overuse of '丅' and '丄'.
	    """

	    def __init__(self) -> None:
	        self._wrong_stop_count: int = 0
	        self._cjk_character_count: int = 0

	    def eligible(self, character: str) -> bool:
	        """Accept every character."""
	        return True

	    def feed(self, character: str) -> None:
	        """Count wrong stops, and count any other CJK character separately."""
	        if character in {"丅", "丄"}:
	            self._wrong_stop_count += 1
	            return
	        if is_cjk(character):
	            self._cjk_character_count += 1

	    def reset(self) -> None:  # pragma: no cover
	        """Restore both counters to zero."""
	        self._wrong_stop_count = 0
	        self._cjk_character_count = 0

	    @property
	    def ratio(self) -> float:
	        """Return wrong stops over CJK characters once 16 of the latter were seen."""
	        if self._cjk_character_count < 16:
	            return 0.0
	        return self._wrong_stop_count / self._cjk_character_count


	class ArchaicUpperLowerPlugin(MessDetectorPlugin):
	    """
	    Flag chunks of letters whose case keeps flipping (e.g. "aBcD").

	    A chunk ends on a character that is not a case-variable letter. Its
	    score is only kept when the chunk is at most 64 characters long,
	    contained a non-ASCII character and was not ended by a digit.
	    """

	    def __init__(self) -> None:
	        # True right after a single case flip, waiting for a second one.
	        self._buf: bool = False

	        self._character_count_since_last_sep: int = 0

	        self._successive_upper_lower_count: int = 0
	        self._successive_upper_lower_count_final: int = 0

	        self._character_count: int = 0

	        self._last_alpha_seen: Optional[str] = None
	        self._current_ascii_only: bool = True

	    def eligible(self, character: str) -> bool:
	        """Accept every character."""
	        return True

	    def feed(self, character: str) -> None:
	        """Either close the current chunk or extend it with this character."""
	        is_concerned = character.isalpha() and is_case_variable(character)
	        chunk_sep = is_concerned is False

	        if chunk_sep and self._character_count_since_last_sep > 0:
	            if (
	                self._character_count_since_last_sep <= 64
	                and character.isdigit() is False
	                and self._current_ascii_only is False
	            ):
	                self._successive_upper_lower_count_final += (
	                    self._successive_upper_lower_count
	                )

	            # Start a fresh chunk.
	            self._successive_upper_lower_count = 0
	            self._character_count_since_last_sep = 0
	            self._last_alpha_seen = None
	            self._buf = False
	            self._character_count += 1
	            self._current_ascii_only = True

	            return

	        if self._current_ascii_only is True and is_ascii(character) is False:
	            self._current_ascii_only = False

	        if self._last_alpha_seen is not None:
	            if (character.isupper() and self._last_alpha_seen.islower()) or (
	                character.islower() and self._last_alpha_seen.isupper()
	            ):
	                if self._buf is True:
	                    # Second flip in a row: score it and start waiting again.
	                    self._successive_upper_lower_count += 2
	                    self._buf = False
	                else:
	                    self._buf = True
	            else:
	                self._buf = False

	        self._character_count += 1
	        self._character_count_since_last_sep += 1
	        self._last_alpha_seen = character

	    def reset(self) -> None:  # pragma: no cover
	        """Restore every attribute to the value set by __init__."""
	        self._character_count = 0
	        self._character_count_since_last_sep = 0
	        self._successive_upper_lower_count = 0
	        self._successive_upper_lower_count_final = 0
	        self._last_alpha_seen = None
	        self._buf = False
	        self._current_ascii_only = True

	    @property
	    def ratio(self) -> float:
	        """Return the kept flip score divided by the characters seen."""
	        if self._character_count == 0:
	            return 0.0

	        return self._successive_upper_lower_count_final / self._character_count


	@lru_cache(maxsize=1024)
	def is_suspiciously_successive_range(
	    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
	) -> bool:
	    """
	    Determine if two Unicode range seen next to each other can be considered as suspicious.
	    An unknown range (None) is always suspicious.
	    """
	    if unicode_range_a is None or unicode_range_b is None:
	        return True

	    if unicode_range_a == unicode_range_b:
	        return False

	    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
	        return False

	    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
	        return False

	    # Latin characters can be accompanied with a combining diacritical mark
	    # eg. Vietnamese.
	    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
	        "Combining" in unicode_range_a or "Combining" in unicode_range_b
	    ):
	        return False

	    keywords_range_a, keywords_range_b = unicode_range_a.split(
	        " "
	    ), unicode_range_b.split(" ")

	    # Two ranges sharing a significant word in their names are related.
	    for el in keywords_range_a:
	        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
	            continue
	        if el in keywords_range_b:
	            return False

	    # Japanese Exception
	    range_a_jp_chars, range_b_jp_chars = (
	        unicode_range_a
	        in (
	            "Hiragana",
	            "Katakana",
	        ),
	        unicode_range_b in ("Hiragana", "Katakana"),
	    )
	    if (range_a_jp_chars or range_b_jp_chars) and (
	        "CJK" in unicode_range_a or "CJK" in unicode_range_b
	    ):
	        return False
	    if range_a_jp_chars and range_b_jp_chars:
	        return False

	    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
	        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
	            return False
	        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
	            return False

	    # Chinese/Japanese use dedicated range for punctuation and/or separators.
	    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
	        unicode_range_a in ["Katakana", "Hiragana"]
	        and unicode_range_b in ["Katakana", "Hiragana"]
	    ):
	        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
	            return False
	        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
	            return False

	    return True


	@lru_cache(maxsize=2048)
	def mess_ratio(
	    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
	) -> float:
	    """
	    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

	    One instance of every MessDetectorPlugin subclass is fed character by
	    character; the detector ratios are summed at regular checkpoints.
	    """

	    detectors: List[MessDetectorPlugin] = [
	        md_class() for md_class in MessDetectorPlugin.__subclasses__()
	    ]

	    # A trailing "\n" is appended to the sequence, hence the + 1.
	    length: int = len(decoded_sequence) + 1

	    mean_mess_ratio: float = 0.0

	    # Distance, in characters, between two intermediary evaluations.
	    if length < 512:
	        intermediary_mean_mess_ratio_calc: int = 32
	    elif length <= 1024:
	        intermediary_mean_mess_ratio_calc = 64
	    else:
	        intermediary_mean_mess_ratio_calc = 128

	    for index, character in enumerate(decoded_sequence + "\n"):
	        for detector in detectors:
	            if detector.eligible(character):
	                detector.feed(character)

	        if (
	            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
	        ) or index == length - 1:
	            mean_mess_ratio = sum(dt.ratio for dt in detectors)

	            if mean_mess_ratio >= maximum_threshold:
	                break

	    if debug:
	        logger = getLogger("charset_normalizer")

	        logger.log(
	            TRACE,
	            "Mess-detector extended-analysis start. "
	            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
	            f"maximum_threshold={maximum_threshold}",
	        )

	        if len(decoded_sequence) > 16:
	            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
	            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

	        for dt in detectors:  # pragma: nocover
	            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

	    return round(mean_mess_ratio, 3)