Skip to content

Commit

Permalink
Improved charset tag recognition accuracy.
Browse files Browse the repository at this point in the history
  • Loading branch information
osapon committed Nov 29, 2024
1 parent a1da3d4 commit 751ea8b
Showing 1 changed file with 20 additions and 3 deletions.
23 changes: 20 additions & 3 deletions src/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,31 @@ public function __construct(Extractor $extractor)

$encoding = null;
$contentType = $extractor->getResponse()->getHeaderLine('content-type');
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match);
preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $contentType, $match);
if (!empty($match[1])) {
$encoding = trim($match[1], ',');
} elseif (!empty($html)) {
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match);
try {
$ret = mb_encoding_aliases($encoding);
if ($ret === false) {
$encoding = null;
}
} catch (\ValueError $exception) {
$encoding = null;
}
}
if (is_null($encoding) && !empty($html)) {
preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $html, $match);
if (!empty($match[1])) {
$encoding = trim($match[1], ',');
}
try {
$ret = mb_encoding_aliases($encoding);
if ($ret === false) {
$encoding = null;
}
} catch (\ValueError $exception) {
$encoding = null;
}
}
$this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
$this->initXPath();
Expand Down

0 comments on commit 751ea8b

Please sign in to comment.