From c8451b7c9b068731fd891579d28089c5984607ea Mon Sep 17 00:00:00 2001 From: ignace nyamagana butera Date: Fri, 3 Feb 2023 17:08:13 +0100 Subject: [PATCH] Allow CharsetConverter to handle BOM skipping #483 --- CHANGELOG.md | 3 +- composer.json | 10 ++-- docs/9.0/connections/bom.md | 10 +++- docs/9.0/connections/filters.md | 5 +- docs/9.0/interoperability/encoding.md | 30 ++++++++++ src/AbstractCsv.php | 5 +- src/AbstractCsvTest.php | 2 + src/CannotInsertRecordTest.php | 2 + src/CharsetConverter.php | 44 ++++++++++++--- src/CharsetConverterTest.php | 81 +++++++++++++++++++++++++++ src/ColumnConsistencyTest.php | 2 + src/EncloseFieldTest.php | 2 + src/EscapeFormulaTest.php | 2 + src/HTMLConverterTest.php | 2 + src/InfoTest.php | 2 + src/RFC4180FieldTest.php | 2 + src/Reader.php | 7 ++- src/ReaderTest.php | 2 + src/ResultSetTest.php | 2 + src/StatementTest.php | 2 + src/Stream.php | 5 +- src/StreamTest.php | 2 + src/WriterTest.php | 2 + src/XMLConverterTest.php | 2 + 24 files changed, 204 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8635c2a8..8354616b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,10 @@ All Notable changes to `Csv` will be documented in this file ### Added -- `League\Csv\TabularDataWriter` interface to represent how to write to a tabular data document. +- `TabularDataWriter` interface to represent how to write to a tabular data document. - `TabularDataReader::first` to replace `TabularDataReader::fetchOne` - `TabularDataReader::nth` to replace `TabularDataReader::fetchOne` +- `CharsetConverter::skipBOM` to improve BOM skipping see [bug #483](https://github.com/thephpleague/csv/issues/483) ### Deprecated diff --git a/composer.json b/composer.json index c6a2793d..0c7f5020 100644 --- a/composer.json +++ b/composer.json @@ -33,13 +33,13 @@ "require-dev": { "ext-dom": "*", "doctrine/collections": "^2.1.2", - "friendsofphp/php-cs-fixer": "^v3.13.2", - "phpbench/phpbench": "^1.2.7", - "phpstan/phpstan": "^1.9.6", + "friendsofphp/php-cs-fixer": "^v3.14.3", + "phpbench/phpbench": "^1.2.8", + "phpstan/phpstan": "^1.9.16", "phpstan/phpstan-deprecation-rules": "^1.1.1", "phpstan/phpstan-phpunit": "^1.3.3", - "phpstan/phpstan-strict-rules": "^1.4.4", - "phpunit/phpunit": "^9.5.27" + "phpstan/phpstan-strict-rules": "^1.4.5", + "phpunit/phpunit": "^9.6.3" }, "autoload": { "psr-4": { diff --git a/docs/9.0/connections/bom.md b/docs/9.0/connections/bom.md index b3e2b96f..e40ab5ae 100644 --- a/docs/9.0/connections/bom.md +++ b/docs/9.0/connections/bom.md @@ -62,12 +62,12 @@ bom_match('hello world!'.ByteSequence::BOM_UTF16_BE); //returns '' public AbstractCsv::getInputBOM(void): string ``` -The CSV document current BOM character is detected using the `getInputBOM` method. This method returns the currently used BOM character or an empty string if none is found or recognized. The detection is done using the `bom_match` function. +The CSV document current BOM character is detected using the `getInputBOM` method. This method returns the currently used BOM character or an empty string if none is found or recognized. The detection is done using the `Info::fetchBOMSequence` static method. ```php -use League\Csv\Writer; +use League\Csv\Reader; -$csv = Writer::createFromPath('/path/to/file.csv'); +$csv = Reader::createFromPath('/path/to/file.csv'); $bom = $csv->getInputBOM(); ``` @@ -128,3 +128,7 @@ The returned `$document` will contain **2** BOM markers instead of one.

If you are using a stream that can not be seekable you should disable BOM skipping, otherwise an Exception will be triggered.

The BOM sequence is never removed from the CSV document, it is only skipped from the result set.

+ +### Skipping the BOM Sequence + +

Since version 9.9.0 you can skip the BOM sequence using the CharsetConverter filter

diff --git a/docs/9.0/connections/filters.md b/docs/9.0/connections/filters.md index 92d80513..12556087 100644 --- a/docs/9.0/connections/filters.md +++ b/docs/9.0/connections/filters.md @@ -5,7 +5,7 @@ title: Controlling PHP Stream Filters # Stream Filters -To ease performing operations on the CSV document as it is being read from or written to, you can add PHP stream filters to the `Reader` and `Writer` objects. +To ease performing operations on the CSV document as it is being read from or written to, the `Reader` and `Writer` objects allow attaching PHP stream filters to them. ## Detecting stream filter support @@ -143,7 +143,8 @@ $reader = null; ## Bundled stream filters -The library comes bundled with two (2) stream filters: +The library comes bundled with the following stream filters: - [RFC4180Field](/9.0/interoperability/rfc4180-field/) stream filter to read or write RFC4180 compliant CSV field; - [CharsetConverter](/9.0/converter/charset/) stream filter to convert your CSV document content using the `mbstring` extension; +- [SkipBOMSequence](/9.0/connections/bom/) stream filter to skip your CSV document BOM sequence if present; diff --git a/docs/9.0/interoperability/encoding.md b/docs/9.0/interoperability/encoding.md index 71f37528..8a252f6c 100644 --- a/docs/9.0/interoperability/encoding.md +++ b/docs/9.0/interoperability/encoding.md @@ -55,3 +55,33 @@ $writer->output('mycsvfile.csv'); ```

The conversion is done with the mbstring extension using the League\Csv\CharsetConverter.

+ +## Skipping The BOM sequence with the Reader class + +

Since version 9.9.0.

+ +In order to ensure the correct removal of the sequence and avoid bugs while parsing the CSV, the filter can skip the +BOM sequence completely when using the `Reader` class and convert the CSV content from the BOM sequence encoding charset +to UTF-8. To work as intended call the `Reader::includeInputBOM` method to ensure the default BOM removal behaviour is disabled +and add the stream filter to you reader instance using the static method `CharsetConverter::skipBOM` method; + +```php +includeInputBOM(); // de-activate the default skipping mechanism +CharsetConverter::allowBOMSkipping($document); +var_dump(iterator_to_array($document)); +// returns the document content without the skipped BOM sequence +// [ +// ['john', 'doe', 'john.doe@example.com'], +// ['jane', 'doe', 'jane.doe@example.com'], +// ] +``` + +

Once the filter is applied, the Reader instance looses all information regarding its +own BOM sequence. The sequence is still be present but the instance is no longer able to detect it.

diff --git a/src/AbstractCsv.php b/src/AbstractCsv.php index e4e3ff4b..1f54ae9a 100644 --- a/src/AbstractCsv.php +++ b/src/AbstractCsv.php @@ -15,6 +15,7 @@ use Generator; use SplFileObject; +use Stringable; use function filter_var; use function get_class; use function mb_strlen; @@ -94,9 +95,9 @@ public static function createFromStream($stream): static /** * Returns a new instance from a string. */ - public static function createFromString(string $content = ''): static + public static function createFromString(Stringable|string $content = ''): static { - return new static(Stream::createFromString($content)); + return new static(Stream::createFromString((string) $content)); } /** diff --git a/src/AbstractCsvTest.php b/src/AbstractCsvTest.php index b4a9fd88..94bd646f 100644 --- a/src/AbstractCsvTest.php +++ b/src/AbstractCsvTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use PHPUnit\Framework\TestCase; diff --git a/src/CannotInsertRecordTest.php b/src/CannotInsertRecordTest.php index 6a54557e..1a8b718d 100644 --- a/src/CannotInsertRecordTest.php +++ b/src/CannotInsertRecordTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use PHPUnit\Framework\TestCase; diff --git a/src/CharsetConverter.php b/src/CharsetConverter.php index e68804c5..dfb7994e 100644 --- a/src/CharsetConverter.php +++ b/src/CharsetConverter.php @@ -36,18 +36,42 @@ class CharsetConverter extends php_user_filter { public const FILTERNAME = 'convert.league.csv'; + public const BOM_SEQUENCE = 'bom_sequence'; + public const SKIP_BOM_SEQUENCE = 'skip_bom_sequence'; protected string $input_encoding = 'UTF-8'; protected string $output_encoding = 'UTF-8'; + protected bool $skipBomSequence = false; + + /** + * Static method to add the stream filter to a {@link Reader} object to handle BOM skipping. + */ + public static function allowBOMSkipping(Reader $document): Reader + { + self::register(); + + $document->addStreamFilter( + self::getFiltername(match ($document->getInputBOM()) { + ByteSequence::BOM_UTF16_LE => 'UTF-16LE', + ByteSequence::BOM_UTF16_BE => 'UTF-16BE', + ByteSequence::BOM_UTF32_LE => 'UTF-32LE', + ByteSequence::BOM_UTF32_BE => 'UTF-32BE', + default => 'UTF-8', + }, 'UTF-8'), + [self::BOM_SEQUENCE => self::SKIP_BOM_SEQUENCE] + ); + + return $document; + } /** * Static method to add the stream filter to a {@link AbstractCsv} object. */ - public static function addTo(AbstractCsv $csv, string $input_encoding, string $output_encoding): AbstractCsv + public static function addTo(AbstractCsv $csv, string $input_encoding, string $output_encoding, array $params = null): AbstractCsv { self::register(); - return $csv->addStreamFilter(self::getFiltername($input_encoding, $output_encoding)); + return $csv->addStreamFilter(self::getFiltername($input_encoding, $output_encoding), $params); } /** @@ -110,6 +134,9 @@ public function onCreate(): bool try { $this->input_encoding = self::filterEncoding($matches['input']); $this->output_encoding = self::filterEncoding($matches['output']); + $this->skipBomSequence = is_array($this->params) + && isset($this->params[self::BOM_SEQUENCE]) + && self::SKIP_BOM_SEQUENCE === $this->params[self::BOM_SEQUENCE]; } catch (OutOfRangeException) { return false; } @@ -117,16 +144,17 @@ public function onCreate(): bool return true; } - /** - * @param resource $in - * @param resource $out - * @param int $consumed - */ public function filter($in, $out, &$consumed, bool $closing): int { set_error_handler(fn (int $errno, string $errstr, string $errfile, int $errline) => true); + $alreadyRun = false; while (null !== ($bucket = stream_bucket_make_writeable($in))) { - $bucket->data = mb_convert_encoding($bucket->data, $this->output_encoding, $this->input_encoding); + $content = $bucket->data; + if (!$alreadyRun && $this->skipBomSequence && null !== ($bom = Info::fetchBOMSequence($content))) { + $content = substr($content, strlen($bom)); + } + $alreadyRun = true; + $bucket->data = mb_convert_encoding($content, $this->output_encoding, $this->input_encoding); $consumed += $bucket->datalen; stream_bucket_append($out, $bucket); } diff --git a/src/CharsetConverterTest.php b/src/CharsetConverterTest.php index bd3de828..67bbbb2e 100644 --- a/src/CharsetConverterTest.php +++ b/src/CharsetConverterTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use ArrayIterator; @@ -196,4 +198,83 @@ public function converterProvider(): array ], ]; } + + public function testItDoesNotChangeTheCSVContentIfNoBOMSequenceIsFound(): void + { + $data = <<includeInputBOM(); + + self::assertSame( + [['start +end']], + iterator_to_array($reader) + ); + } + + /** + * @dataProvider providesBOMSequences + */ + public function testItSkipBOMSequenceBeforeConsumingTheCSVStream(string $sequence): void + { + $data = <<includeInputBOM(); + CharsetConverter::allowBOMSkipping($reader); + + self::assertSame( + [['start +end']], + iterator_to_array($reader) + ); + } + + /** + * @dataProvider providesBOMSequences + */ + public function testItOnlySkipOnceTheBOMSequenceBeforeConsumingTheCSVStreamOnMultipleLine(string $sequence): void + { + $data = <<includeInputBOM(); + CharsetConverter::allowBOMSkipping($reader); + + self::assertSame( + [[$sequence.'start +end']], + iterator_to_array($reader) + ); + } + + /** + * @dataProvider providesBOMSequences + */ + public function testItOnlySkipOnceTheBOMSequenceBeforeConsumingTheCSVStreamOnSingleLine(string $sequence): void + { + $reader = Reader::createFromString($sequence.$sequence.'start,'.$sequence.'end'); + CharsetConverter::allowBOMSkipping($reader); + $reader->includeInputBOM(); + + self::assertSame( + [[$sequence.'start', $sequence.'end']], + iterator_to_array($reader) + ); + } + + public function providesBOMSequences(): iterable + { + yield 'BOM UTF-8' => [ + 'sequence' => ByteSequence::BOM_UTF8, + ]; + } } diff --git a/src/ColumnConsistencyTest.php b/src/ColumnConsistencyTest.php index 3af224d2..3c022a38 100644 --- a/src/ColumnConsistencyTest.php +++ b/src/ColumnConsistencyTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use PHPUnit\Framework\TestCase; diff --git a/src/EncloseFieldTest.php b/src/EncloseFieldTest.php index a1e9b332..f374162e 100644 --- a/src/EncloseFieldTest.php +++ b/src/EncloseFieldTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use InvalidArgumentException; diff --git a/src/EscapeFormulaTest.php b/src/EscapeFormulaTest.php index dc3a3b8b..01a1b9c8 100644 --- a/src/EscapeFormulaTest.php +++ b/src/EscapeFormulaTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use InvalidArgumentException; diff --git a/src/HTMLConverterTest.php b/src/HTMLConverterTest.php index 69ccfd95..b3ec646c 100644 --- a/src/HTMLConverterTest.php +++ b/src/HTMLConverterTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use DOMException; diff --git a/src/InfoTest.php b/src/InfoTest.php index 41380cd6..721cfe93 100644 --- a/src/InfoTest.php +++ b/src/InfoTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use PHPUnit\Framework\TestCase; diff --git a/src/RFC4180FieldTest.php b/src/RFC4180FieldTest.php index 988d019c..f54d61c4 100644 --- a/src/RFC4180FieldTest.php +++ b/src/RFC4180FieldTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use InvalidArgumentException; diff --git a/src/Reader.php b/src/Reader.php index 673658c2..4f8d3bda 100644 --- a/src/Reader.php +++ b/src/Reader.php @@ -98,7 +98,12 @@ protected function setHeader(int $offset): array return $header; } - $header = $this->removeBOM($header, mb_strlen($this->getInputBOM()), $this->enclosure); + $header = $this->removeBOM( + $header, + !$this->is_input_bom_included ? mb_strlen($this->getInputBOM()) : 0, + $this->enclosure + ); + if ([''] === $header) { throw SyntaxError::dueToHeaderNotFound($offset); } diff --git a/src/ReaderTest.php b/src/ReaderTest.php index c992ed8a..4159cab7 100644 --- a/src/ReaderTest.php +++ b/src/ReaderTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use PHPUnit\Framework\TestCase; diff --git a/src/ResultSetTest.php b/src/ResultSetTest.php index 86f73f08..04087201 100644 --- a/src/ResultSetTest.php +++ b/src/ResultSetTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use Generator; diff --git a/src/StatementTest.php b/src/StatementTest.php index 0c6a8821..c8598cc3 100644 --- a/src/StatementTest.php +++ b/src/StatementTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use OutOfBoundsException; diff --git a/src/Stream.php b/src/Stream.php index 308ed856..aa0a3b28 100644 --- a/src/Stream.php +++ b/src/Stream.php @@ -15,6 +15,7 @@ use SeekableIterator; use SplFileObject; +use Stringable; use TypeError; use function array_keys; use function array_walk_recursive; @@ -131,11 +132,11 @@ public static function createFromPath(string $path, string $open_mode = 'r', $co /** * Returns a new instance from a string. */ - public static function createFromString(string $content = ''): self + public static function createFromString(Stringable|string $content = ''): self { /** @var resource $resource */ $resource = fopen('php://temp', 'r+'); - fwrite($resource, $content); + fwrite($resource, (string) $content); $instance = new self($resource); $instance->should_close_stream = true; diff --git a/src/StreamTest.php b/src/StreamTest.php index 9a3f56ac..9c4cc097 100644 --- a/src/StreamTest.php +++ b/src/StreamTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use PHPUnit\Framework\TestCase; diff --git a/src/WriterTest.php b/src/WriterTest.php index 72693ee3..3567b22c 100644 --- a/src/WriterTest.php +++ b/src/WriterTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use ArrayIterator; diff --git a/src/XMLConverterTest.php b/src/XMLConverterTest.php index 418f7da2..8872091f 100644 --- a/src/XMLConverterTest.php +++ b/src/XMLConverterTest.php @@ -9,6 +9,8 @@ * file that was distributed with this source code. */ +declare(strict_types=1); + namespace League\Csv; use DOMDocument;