From f91754b01cb9f32b83aeaa80b74ed10b5dfccb6a Mon Sep 17 00:00:00 2001 From: Daniel Roschka Date: Wed, 4 Sep 2024 17:58:12 +0200 Subject: [PATCH] Enclose white spaces in references (#1105) Since version 0.22 gettext encloses file names in references which contain white spaces or tabs within First Strong Isolate (U+2068) and Pop Directional Isolate (U+2069). This commit adds the same behavior for Babel. --- babel/messages/pofile.py | 69 ++++++++++++++++++++- tests/messages/test_pofile.py | 112 ++++++++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 2 deletions(-) diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 89a924255..5cd65d867 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -80,6 +80,50 @@ def denormalize(string: str) -> str: return unescape(string) +def _extract_locations(line: str) -> list[str]: + """Extract locations from location comments. + + Locations are extracted while properly handling First Strong + Isolate (U+2068) and Pop Directional Isolate (U+2069), used by + gettext to enclose filenames with spaces and tabs in their names. + """ + if "\u2068" not in line and "\u2069" not in line: + return line.lstrip().split() + + locations = [] + location = "" + in_filename = False + for c in line: + if c == "\u2068": + if in_filename: + raise ValueError("location comment contains more First Strong Isolate " + "characters, than Pop Directional Isolate characters") + in_filename = True + continue + elif c == "\u2069": + if not in_filename: + raise ValueError("location comment contains more Pop Directional Isolate " + "characters, than First Strong Isolate characters") + in_filename = False + continue + elif c == " ": + if in_filename: + location += c + elif location: + locations.append(location) + location = "" + else: + location += c + else: + if location: + if in_filename: + raise ValueError("location comment contains more First Strong Isolate " + "characters, than Pop Directional Isolate characters") + locations.append(location) + + return locations + + class PoFileError(Exception): """Exception thrown by PoParser when an invalid po file is encountered.""" @@ -269,7 +313,7 @@ def _process_comment(self, line) -> None: self._finish_current_message() if line[1:].startswith(':'): - for location in line[2:].lstrip().split(): + for location in _extract_locations(line[2:]): pos = location.rfind(':') if pos >= 0: try: @@ -307,7 +351,10 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None: if line[1:].startswith('~'): self._process_message_line(lineno, line[2:].lstrip(), obsolete=True) else: - self._process_comment(line) + try: + self._process_comment(line) + except ValueError as exc: + self._invalid_pofile(line, lineno, str(exc)) else: self._process_message_line(lineno, line) @@ -474,6 +521,23 @@ def normalize(string: str, prefix: str = '', width: int = 76) -> str: return '""\n' + '\n'.join([(prefix + escape(line)) for line in lines]) +def _enclose_filename_if_necessary(filename: str) -> str: + """Enclose filenames which include white spaces or tabs. + + Do the same as gettext and enclose filenames which contain white + spaces or tabs with First Strong Isolate (U+2068) and Pop + Directional Isolate (U+2069). + """ + if " " not in filename and "\t" not in filename: + return filename + + if not filename.startswith("\u2068"): + filename = "\u2068" + filename + if not filename.endswith("\u2069"): + filename += "\u2069" + return filename + + def write_po( fileobj: SupportsWrite[bytes], catalog: Catalog, @@ -626,6 +690,7 @@ def _format_message(message, prefix=''): for filename, lineno in locations: location = filename.replace(os.sep, '/') + location = _enclose_filename_if_necessary(location) if lineno and include_lineno: location = f"{location}:{lineno:d}" if location not in locs: diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index d1a3e2d11..c0ded1296 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -19,6 +19,7 @@ from babel.core import Locale from babel.messages import pofile from babel.messages.catalog import Catalog, Message +from babel.messages.pofile import _enclose_filename_if_necessary, _extract_locations from babel.util import FixedOffsetTimezone @@ -438,6 +439,19 @@ def test_missing_plural_in_the_middle(self): assert message.string[1] == '' assert message.string[2] == 'Vohs [text]' + def test_with_location(self): + buf = StringIO('''\ +#: main.py:1 \u2068filename with whitespace.py\u2069:123 +msgid "foo" +msgstr "bar" +''') + catalog = pofile.read_po(buf, locale='de_DE') + assert len(catalog) == 1 + message = catalog['foo'] + assert message.string == 'bar' + assert message.locations == [("main.py", 1), ("filename with whitespace.py", 123)] + + def test_abort_invalid_po_file(self): invalid_po = ''' msgctxt "" @@ -841,6 +855,59 @@ def test_no_include_lineno(self): msgid "foo" msgstr ""''' + def test_white_space_in_location(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('utils b.py', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + def test_white_space_in_location_already_enclosed(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('\u2068utils b.py\u2069', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + def test_tab_in_location(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('utils\tb.py', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + def test_tab_in_location_already_enclosed(self): + catalog = Catalog() + catalog.add('foo', locations=[('main.py', 1)]) + catalog.add('foo', locations=[('\u2068utils\tb.py\u2069', 3)]) + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + assert buf.getvalue().strip() == b'''#: main.py:1 \xe2\x81\xa8utils b.py\xe2\x81\xa9:3 +msgid "foo" +msgstr ""''' + + +class RoundtripPoTestCase(unittest.TestCase): + + def test_enclosed_filenames_in_location_comment(self): + catalog = Catalog() + catalog.add("foo", lineno=2, locations=[("main 1.py", 1)], string="") + catalog.add("bar", lineno=6, locations=[("other.py", 2)], string="") + catalog.add("baz", lineno=10, locations=[("main 1.py", 3), ("other.py", 4)], string="") + buf = BytesIO() + pofile.write_po(buf, catalog, omit_header=True, include_lineno=True) + buf.seek(0) + catalog2 = pofile.read_po(buf) + assert True is catalog.is_identical(catalog2) class PofileFunctionsTestCase(unittest.TestCase): @@ -864,6 +931,51 @@ def test_denormalize_on_msgstr_without_empty_first_line(self): assert expected_denormalized == pofile.denormalize(f'""\n{msgstr}') +@pytest.mark.parametrize(("line", "locations"), [ + ("\u2068file1.po\u2069", ["file1.po"]), + ("file1.po \u2068file 2.po\u2069 file3.po", ["file1.po", "file 2.po", "file3.po"]), + ("file1.po:1 \u2068file 2.po\u2069:2 file3.po:3", ["file1.po:1", "file 2.po:2", "file3.po:3"]), + ("\u2068file1.po\u2069:1 \u2068file\t2.po\u2069:2 file3.po:3", + ["file1.po:1", "file\t2.po:2", "file3.po:3"]), + ("file1.po file2.po", ["file1.po", "file2.po"]), + ("file1.po \u2068\u2069 file2.po", ["file1.po", "file2.po"]), +]) +def test_extract_locations_valid_location_comment(line, locations): + assert locations == _extract_locations(line) + + +@pytest.mark.parametrize(("line",), [ + ("\u2068file 1.po",), + ("file 1.po\u2069",), + ("\u2069file 1.po\u2068",), + ("\u2068file 1.po:1 \u2068file 2.po\u2069:2",), + ("\u2068file 1.po\u2069:1 file 2.po\u2069:2",), +]) +def test_extract_locations_invalid_location_comment(line): + with pytest.raises(ValueError): + _extract_locations(line) + + +@pytest.mark.parametrize(("filename",), [ + ("file.po",), + ("file_a.po",), + ("file-a.po",), + ("file\n.po",), + ("\u2068file.po\u2069",), + ("\u2068file a.po\u2069",), +]) +def test_enclose_filename_if_necessary_no_change(filename): + assert filename == _enclose_filename_if_necessary(filename) + + +@pytest.mark.parametrize(("filename",), [ + ("file a.po",), + ("file\ta.po",), +]) +def test_enclose_filename_if_necessary_enclosed(filename): + assert "\u2068" + filename + "\u2069" == _enclose_filename_if_necessary(filename) + + def test_unknown_language_roundtrip(): buf = StringIO(r''' msgid ""