From 751ea8be747a9ccc15b24ed3724ff1374961938f Mon Sep 17 00:00:00 2001 From: osapon Date: Mon, 25 Nov 2024 11:58:17 +0900 Subject: [PATCH] Improved charset tag recognition accuracy. --- src/Document.php | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/Document.php b/src/Document.php index 6db07663..c61e5c62 100644 --- a/src/Document.php +++ b/src/Document.php @@ -28,14 +28,31 @@ public function __construct(Extractor $extractor) $encoding = null; $contentType = $extractor->getResponse()->getHeaderLine('content-type'); - preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match); + preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $contentType, $match); if (!empty($match[1])) { $encoding = trim($match[1], ','); - } elseif (!empty($html)) { - preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match); + try { + $ret = mb_encoding_aliases($encoding); + if ($ret === false) { + $encoding = null; + } + } catch (\ValueError $exception) { + $encoding = null; + } + } + if (is_null($encoding) && !empty($html)) { + preg_match('/charset=(?:"|\')?(.*?)(?=$|\s|;|"|\'|>)/i', $html, $match); if (!empty($match[1])) { $encoding = trim($match[1], ','); } + try { + $ret = mb_encoding_aliases($encoding); + if ($ret === false) { + $encoding = null; + } + } catch (\ValueError $exception) { + $encoding = null; + } } $this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument(); $this->initXPath();