Skip to content

Commit

Permalink
❇️ Improve the detection around some cases
Browse files Browse the repository at this point in the history
Close #365 #357 #356
  • Loading branch information
Ousret committed Oct 19, 2023
1 parent 165211a commit 490a1bb
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 9 deletions.
4 changes: 3 additions & 1 deletion bin/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
import argparse

from charset_normalizer import from_path
from charset_normalizer import from_path, __version__
from charset_normalizer.utils import iana_name

from os import sep
Expand Down Expand Up @@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
exit(1)

print(f"> using charset-normalizer {__version__}")

success_count = 0
total_count = 0

Expand Down
13 changes: 8 additions & 5 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
if self._character_count == 0:
if self._character_count <= 24:
return 0.0

ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count

if ratio_of_suspicious_range_usage < 0.1:
return 0.0

return ratio_of_suspicious_range_usage


Expand Down Expand Up @@ -295,7 +292,11 @@ def feed(self, character: str) -> None:
self._is_current_word_bad = True
# Words/buffers ending with an upper case accentuated letter are so rare
# that we consider them all suspicious. Same weight as foreign_long suspicious.
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
if (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
Expand Down Expand Up @@ -521,6 +522,8 @@ def is_suspiciously_successive_range(
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False

return True

Expand Down
4 changes: 2 additions & 2 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
if character_range is None:
return False

return "Forms" in character_range
return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Expand All @@ -106,7 +106,7 @@ def is_emoticon(character: str) -> bool:
if character_range is None:
return False

return "Emoticons" in character_range
return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "3.3.0"
__version__ = "3.3.1"
VERSION = __version__.split(".")

0 comments on commit 490a1bb

Please sign in to comment.