diff --git a/distutils/tests/test_dist.py b/distutils/tests/test_dist.py index 30a6f9ff2e..694bf02a60 100644 --- a/distutils/tests/test_dist.py +++ b/distutils/tests/test_dist.py @@ -1,6 +1,9 @@ """Tests for distutils.dist.""" import os import io +import email +import email.policy +import email.generator import sys import warnings import textwrap @@ -510,3 +513,41 @@ def test_read_metadata(self): assert metadata.platforms is None assert metadata.obsoletes is None assert metadata.requires == ['foo'] + + def test_round_trip_through_email_generator(self): + """ + In pypa/setuptools#4033, it was shown that once PKG-INFO is + re-generated using ``email.generator.Generator``, some control + characters might cause problems. + """ + # Given a PKG-INFO file ... + attrs = { + "name": "package", + "version": "1.0", + "long_description": "hello\x0b\nworld\n", + } + dist = Distribution(attrs) + metadata = dist.metadata + + with io.StringIO() as buffer: + metadata.write_pkg_file(buffer) + msg = buffer.getvalue() + + # ... when it is read and re-written using stdlib's email library, + orig = email.message_from_string(msg) + policy = email.policy.EmailPolicy( + utf8=True, + mangle_from_=False, + max_line_length=0, + ) + with io.StringIO() as buffer: + email.generator.Generator(buffer, policy=policy).flatten(orig) + + buffer.seek(0) + regen = email.message_from_file(buffer) + + # ... then it should be the same as the original + # (except for the specific line break characters) + orig_desc = set(orig["Description"].splitlines()) + regen_desc = set(regen["Description"].splitlines()) + assert regen_desc == orig_desc diff --git a/distutils/tests/test_util.py b/distutils/tests/test_util.py index 070a277069..22a003d8ca 100644 --- a/distutils/tests/test_util.py +++ b/distutils/tests/test_util.py @@ -1,4 +1,8 @@ """Tests for distutils.util.""" +import email +import email.policy +import email.generator +import io import os import sys import sysconfig as stdlib_sysconfig @@ -184,12 +188,55 @@ def test_strtobool(self): for n in no: assert not strtobool(n) - def test_rfc822_escape(self): - header = 'I am a\npoor\nlonesome\nheader\n' - res = rfc822_escape(header) - wanted = ('I am a%(8s)spoor%(8s)slonesome%(8s)s' 'header%(8s)s') % { - '8s': '\n' + 8 * ' ' - } + indent = 8 * ' ' + + @pytest.mark.parametrize( + "given,wanted", + [ + # 0x0b, 0x0c, ..., etc are also considered a line break by Python + ("hello\x0b\nworld\n", f"hello\x0b{indent}\n{indent}world\n{indent}"), + ("hello\x1eworld", f"hello\x1e{indent}world"), + ("", ""), + ( + "I am a\npoor\nlonesome\nheader\n", + f"I am a\n{indent}poor\n{indent}lonesome\n{indent}header\n{indent}", + ), + ], + ) + def test_rfc822_escape(self, given, wanted): + """ + We want to ensure a multi-line header parses correctly. + + For interoperability, the escaped value should also "round-trip" over + `email.generator.Generator.flatten` and `email.message_from_*` + (see pypa/setuptools#4033). + + The main issue is that internally `email.policy.EmailPolicy` uses + `splitlines` which will split on some control chars. If all the new lines + are not prefixed with spaces, the parser will interrupt reading + the current header and produce an incomplete value, while + incorrectly interpreting the rest of the headers as part of the payload. + """ + res = rfc822_escape(given) + + policy = email.policy.EmailPolicy( + utf8=True, + mangle_from_=False, + max_line_length=0, + ) + with io.StringIO() as buffer: + raw = f"header: {res}\nother-header: 42\n\npayload\n" + orig = email.message_from_string(raw) + email.generator.Generator(buffer, policy=policy).flatten(orig) + buffer.seek(0) + regen = email.message_from_file(buffer) + + for msg in (orig, regen): + assert msg.get_payload() == "payload\n" + assert msg["other-header"] == "42" + # Generator may replace control chars with `\n` + assert set(msg["header"].splitlines()) == set(res.splitlines()) + assert res == wanted def test_dont_write_bytecode(self): diff --git a/distutils/util.py b/distutils/util.py index 7ae914f7ee..5408b16032 100644 --- a/distutils/util.py +++ b/distutils/util.py @@ -508,6 +508,12 @@ def rfc822_escape(header): """Return a version of the string escaped for inclusion in an RFC-822 header, by ensuring there are 8 spaces space after each newline. """ - lines = header.split('\n') - sep = '\n' + 8 * ' ' - return sep.join(lines) + indent = 8 * " " + lines = header.splitlines(keepends=True) + + # Emulate the behaviour of `str.split` + # (the terminal line break in `splitlines` does not result in an extra line): + ends_in_newline = lines and lines[-1].splitlines()[0] != lines[-1] + suffix = indent if ends_in_newline else "" + + return indent.join(lines) + suffix