From 66966f1d7911a6ed29bc5d6dd927aa1fd5fac8a1 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Thu, 19 Oct 2023 08:42:34 +0200
Subject: [PATCH] :sparkle: Improve the detection around some cases (#366)

Close #365 #357 #356
---
 CHANGELOG.md                  |  8 +++++++-
 bin/coverage.py               |  4 +++-
 charset_normalizer/md.py      | 13 ++++++++-----
 charset_normalizer/utils.py   |  4 ++--
 charset_normalizer/version.py |  2 +-
 tests/test_edge_case.py       |  4 +++-
 6 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d2898af0..eec7d1cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-??)
+
+### Changed
+- Optional mypyc compilation upgraded to version 1.6.0 for Python >= 3.8
+- Improved the general detection reliability based on reports from the community
+
 ## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
 
 ### Added
@@ -14,7 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 - (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
-- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.7
+- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
 
 ### Fixed
 - Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
diff --git a/bin/coverage.py b/bin/coverage.py
index 94e058cf..e5f07bd5 100644
--- a/bin/coverage.py
+++ b/bin/coverage.py
@@ -5,7 +5,7 @@
 from typing import List
 import argparse
 
-from charset_normalizer import from_path
+from charset_normalizer import from_path, __version__
 from charset_normalizer.utils import iana_name
 
 from os import sep
@@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
         print("This script require https://github.com/Ousret/char-dataset to be cloned on package root directory")
         exit(1)
 
+    print(f"> using charset-normalizer {__version__}")
+
     success_count = 0
     total_count = 0
 
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
index a6d9350c..103dfdd6 100644
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -233,16 +233,13 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count == 0:
+        if self._character_count <= 24:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
             self._suspicious_successive_range_count * 2
         ) / self._character_count
 
-        if ratio_of_suspicious_range_usage < 0.1:
-            return 0.0
-
         return ratio_of_suspicious_range_usage
 
 
@@ -295,7 +292,11 @@ def feed(self, character: str) -> None:
                 self._is_current_word_bad = True
             # Word/Buffer ending with an upper case accentuated letter are so rare,
             # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-            if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
+            if (
+                is_accentuated(self._buffer[-1])
+                and self._buffer[-1].isupper()
+                and all(_.isupper() for _ in self._buffer) is False
+            ):
                 self._foreign_long_count += 1
                 self._is_current_word_bad = True
             if buffer_length >= 24 and self._foreign_long_watch:
@@ -521,6 +522,8 @@ def is_suspiciously_successive_range(
        return False
 
    if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
        return False
+   if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+       return False
 
    return True
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 45a402e4..b5ee8459 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Forms" in character_range
+    return "Forms" in character_range and character_category != "Lo"
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
@@ -106,7 +106,7 @@ def is_emoticon(character: str) -> bool:
     if character_range is None:
         return False
 
-    return "Emoticons" in character_range
+    return "Emoticons" in character_range or "Pictographs" in character_range
 
 
 @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index db1ff57a..83683f4c 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.3.0"
+__version__ = "3.3.1"
 VERSION = __version__.split(".")
diff --git a/tests/test_edge_case.py b/tests/test_edge_case.py
index 4f8e0015..f324664d 100644
--- a/tests/test_edge_case.py
+++ b/tests/test_edge_case.py
@@ -1,6 +1,8 @@
 from charset_normalizer import from_bytes
+import pytest
+import platform
 
-
+@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)")
 def test_unicode_edge_case():
     payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'
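
-- 
Reviewer note, not part of the patch: a minimal end-to-end sketch of the edge
case the new test targets, using only the public API (`from_bytes` and
`CharsetMatches.best()`). The expected `utf_8` outcome is an assumption drawn
from the issues this change closes (#365, #357, #356); the diff above does not
show the test's assertions.

    from charset_normalizer import from_bytes

    # UTF-8 BOM followed by a single pictograph (U+1FA73), the same payload
    # as in tests/test_edge_case.py above.
    payload = b"\xef\xbb\xbf\xf0\x9f\xa9\xb3"

    best_guess = from_bytes(payload).best()

    # With is_emoticon() now also matching "Pictographs" ranges and
    # "Basic Latin" transitions no longer flagged as suspicious, the
    # payload should be recognized as UTF-8.
    assert best_guess is not None
    print(best_guess.encoding)  # assumption: prints "utf_8"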
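A second sketch, exercising the relaxed suspicious-range ratio directly. It
assumes the plugin holding the patched `ratio` property is the `SuspiciousRange`
class in charset_normalizer/md.py with the usual `eligible`/`feed`/`ratio`
plugin interface; the diff above shows only the property body.

    from charset_normalizer.md import SuspiciousRange

    plugin = SuspiciousRange()
    for character in "abc\U0001FA73def":  # ASCII mixed with a pictograph
        if plugin.eligible(character):
            plugin.feed(character)

    # Fewer than 25 characters were fed, so the patched ratio property
    # short-circuits to 0.0 instead of penalizing the range switches.
    print(plugin.ratio)  # 0.0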