From 913e1c96a02b0d95d4193a9eb088e6935131d167 Mon Sep 17 00:00:00 2001 From: demeritcowboy Date: Fri, 18 Dec 2020 16:06:18 -0500 Subject: [PATCH] trim non-breaking spaces in both ascii and utf8 without breaking other utf8 characters that share the same byte --- CRM/Import/DataSource/CSV.php | 29 +++++- .../phpunit/CRM/Import/DataSource/CsvTest.php | 89 ++++++++++++++++--- .../CRM/Import/DataSource/specialchar.csv | 2 + .../DataSource/specialchar_with_BOM.csv | 2 + 4 files changed, 106 insertions(+), 16 deletions(-) create mode 100644 tests/phpunit/CRM/Import/DataSource/specialchar.csv create mode 100644 tests/phpunit/CRM/Import/DataSource/specialchar_with_BOM.csv diff --git a/CRM/Import/DataSource/CSV.php b/CRM/Import/DataSource/CSV.php index 48687237cb9e..45b2ee63b320 100644 --- a/CRM/Import/DataSource/CSV.php +++ b/CRM/Import/DataSource/CSV.php @@ -222,10 +222,7 @@ private static function _CsvToTable( $first = FALSE; // CRM-17859 Trim non-breaking spaces from columns. - $row = array_map( - function($string) { - return trim($string, chr(0xC2) . chr(0xA0)); - }, $row); + $row = array_map(['CRM_Import_DataSource_CSV', 'trimNonBreakingSpaces'], $row); $row = array_map(['CRM_Core_DAO', 'escapeString'], $row); $sql .= "('" . implode("', '", $row) . "')"; $count++; @@ -251,4 +248,28 @@ function($string) { return $result; } + /** + * Trim non-breaking spaces in a multibyte-safe way. + * See also dev/core#2127 - avoid breaking strings ending in à or any other + * unicode character sharing the same 0xA0 byte as a non-breaking space. + * + * @param string $string + * @return string The trimmed string + */ + public static function trimNonBreakingSpaces(string $string): string { + $encoding = mb_detect_encoding($string, NULL, TRUE); + if ($encoding === FALSE) { + // This could mean a couple things. One is that the string is + // ASCII-encoded but contains a non-breaking space, which causes + // php to fail to detect the encoding. So let's just do what we + // did before which works in that situation and is at least no + // worse in other situations. + return trim($string, chr(0xC2) . chr(0xA0)); + } + elseif ($encoding !== 'UTF-8') { + $string = mb_convert_encoding($string, 'UTF-8', [$encoding]); + } + return preg_replace("/^(\u{a0})+|(\u{a0})+$/", '', $string); + } + } diff --git a/tests/phpunit/CRM/Import/DataSource/CsvTest.php b/tests/phpunit/CRM/Import/DataSource/CsvTest.php index a93f31aeb42f..a51eb1b0b943 100644 --- a/tests/phpunit/CRM/Import/DataSource/CsvTest.php +++ b/tests/phpunit/CRM/Import/DataSource/CsvTest.php @@ -17,16 +17,16 @@ class CRM_Import_DataSource_CsvTest extends CiviUnitTestCase { /** * Test the to csv function. * - * @param string $fileName + * @param array $fileData * * @dataProvider getCsvFiles * @throws \CRM_Core_Exception */ - public function testToCsv($fileName) { + public function testToCsv(array $fileData) { $dataSource = new CRM_Import_DataSource_CSV(); $params = [ 'uploadFile' => [ - 'name' => __DIR__ . '/' . $fileName, + 'name' => __DIR__ . '/' . $fileData['filename'], ], 'skipColumnHeader' => TRUE, ]; @@ -40,14 +40,10 @@ public function testToCsv($fileName) { $dataSource->postProcess($params, $db, $form); $tableName = $form->get('importTableName'); - $this->assertEquals(4, - CRM_Core_DAO::singleValueQuery("SELECT LENGTH(last_name) FROM $tableName"), - $fileName . ' failed on last_name' - ); - $this->assertEquals(21, - CRM_Core_DAO::singleValueQuery("SELECT LENGTH(email) FROM $tableName"), - $fileName . ' failed on email' - ); + foreach (['first_name', 'last_name', 'email'] as $field) { + $json = json_encode(CRM_Core_DAO::singleValueQuery("SELECT $field FROM $tableName")); + $this->assertEquals($fileData["{$field}_json"], $json, "{$fileData['filename']} failed on $field"); + } CRM_Core_DAO::executeQuery("DROP TABLE $tableName"); } @@ -57,7 +53,76 @@ public function testToCsv($fileName) { * @return array */ public function getCsvFiles() { - return [['import.csv'], ['yogi.csv']]; + return [ + // import.csv is utf8-encoded, with no BOM + [ + [ + 'filename' => 'import.csv', + 'first_name_json' => '"Yogi"', + 'last_name_json' => '"Bear"', + 'email_json' => '"yogi@yellowstone.park"', + ], + ], + // yogi.csv is latin1-encoded + [ + [ + 'filename' => 'yogi.csv', + 'first_name_json' => '"Yogi"', + 'last_name_json' => '"Bear"', + 'email_json' => '"yogi@yellowstone.park"', + ], + ], + // specialchar.csv is utf8-encoded, with no BOM + [ + [ + 'filename' => 'specialchar.csv', + // note that json uses unicode representation not utf8 byte sequences + 'first_name_json' => '"Yog\u00e0"', + 'last_name_json' => '"Ber\u00e0"', + 'email_json' => '"yogi@yellowstone.park"', + ], + ], + // specialchar_with_BOM.csv is utf8-encoded with BOM + [ + [ + 'filename' => 'specialchar_with_BOM.csv', + 'first_name_json' => '"Yog\u00e0"', + 'last_name_json' => '"Ber\u00e0"', + 'email_json' => '"yogi@yellowstone.park"', + ], + ], + ]; + } + + /** + * Test the trim function + * @dataProvider trimDataProvider + * @param string $input + * @param string $expected + */ + public function testTrim(string $input, string $expected) { + $this->assertSame($expected, CRM_Import_DataSource_CSV::trimNonBreakingSpaces($input)); + } + + /** + * Dataprovider for testTrim + * @return array + */ + public function trimDataProvider(): array { + return [ + 'plain' => ['plain', 'plain'], + 'non-breaking-space-at-end-latin1' => ['foo' . chr(0xA0), 'foo'], + 'non-breaking-space-at-end-utf8' => ["foo\u{a0}", 'foo'], + 'non-breaking-space-at-start-latin1' => [chr(0xA0) . 'foo', 'foo'], + 'non-breaking-space-at-start-utf8' => ["\u{a0}foo", 'foo'], + 'non-breaking-space-at-both-latin1' => [chr(0xA0) . 'foo' . chr(0xA0), 'foo'], + 'non-breaking-space-at-both-utf8' => ["\u{a0}foo\u{a0}", 'foo'], + 'sharing-same-byte' => ['fooà', 'fooà'], + 'sharing-same-byte-plus-space-end' => ["fooà\u{a0}", 'fooà'], + 'sharing-same-byte-plus-space-start' => ["\u{a0}àfoo", 'àfoo'], + 'sharing-same-byte-plus-space-both' => ["\u{a0}àfooà\u{a0}", 'àfooà'], + 'multiple-spaces' => ["\u{a0}\u{a0}foo\u{a0}\u{a0}", 'foo'], + ]; } } diff --git a/tests/phpunit/CRM/Import/DataSource/specialchar.csv b/tests/phpunit/CRM/Import/DataSource/specialchar.csv new file mode 100644 index 000000000000..408bea2bc6cb --- /dev/null +++ b/tests/phpunit/CRM/Import/DataSource/specialchar.csv @@ -0,0 +1,2 @@ +First Name,Last Name,email +Yogà,Berà ,yogi@yellowstone.park  diff --git a/tests/phpunit/CRM/Import/DataSource/specialchar_with_BOM.csv b/tests/phpunit/CRM/Import/DataSource/specialchar_with_BOM.csv new file mode 100644 index 000000000000..4604b2e3a2f4 --- /dev/null +++ b/tests/phpunit/CRM/Import/DataSource/specialchar_with_BOM.csv @@ -0,0 +1,2 @@ +First Name,Last Name,email +Yogà,Berà ,yogi@yellowstone.park