Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

❇️ Improve the detection around some cases #366

Merged
merged 1 commit
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-??)

### Changed
- Optional mypyc compilation upgraded to version 1.6.0 for Python >= 3.8
- Improved the general detection reliability based on reports from the community

## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)

### Added
Expand All @@ -14,7 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.7
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8

### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
Expand Down
4 changes: 3 additions & 1 deletion bin/coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
import argparse

from charset_normalizer import from_path
from charset_normalizer import from_path, __version__
from charset_normalizer.utils import iana_name

from os import sep
Expand Down Expand Up @@ -40,6 +40,8 @@ def cli_coverage(arguments: List[str]):
print("This script requires https://github.com/Ousret/char-dataset to be cloned on package root directory")
exit(1)

print(f"> using charset-normalizer {__version__}")

success_count = 0
total_count = 0

Expand Down
13 changes: 8 additions & 5 deletions charset_normalizer/md.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,16 +233,13 @@ def reset(self) -> None: # pragma: no cover

@property
def ratio(self) -> float:
if self._character_count == 0:
if self._character_count <= 24:
return 0.0

ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count

if ratio_of_suspicious_range_usage < 0.1:
return 0.0

return ratio_of_suspicious_range_usage


Expand Down Expand Up @@ -295,7 +292,11 @@ def feed(self, character: str) -> None:
self._is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
if (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
Expand Down Expand Up @@ -521,6 +522,8 @@ def is_suspiciously_successive_range(
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False

return True

Expand Down
4 changes: 2 additions & 2 deletions charset_normalizer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def is_symbol(character: str) -> bool:
if character_range is None:
return False

return "Forms" in character_range
return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Expand All @@ -106,7 +106,7 @@ def is_emoticon(character: str) -> bool:
if character_range is None:
return False

return "Emoticons" in character_range
return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
Expand Down
2 changes: 1 addition & 1 deletion charset_normalizer/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
Expose version
"""

__version__ = "3.3.0"
__version__ = "3.3.1"
VERSION = __version__.split(".")
4 changes: 3 additions & 1 deletion tests/test_edge_case.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from charset_normalizer import from_bytes
import pytest
import platform


@pytest.mark.xfail(platform.python_version_tuple()[0] == "3" and platform.python_version_tuple()[1] == "7", reason="Unicode database is too old for this case (Python 3.7)")
def test_unicode_edge_case():
payload = b'\xef\xbb\xbf\xf0\x9f\xa9\xb3'

Expand Down
Loading