From 80f5cfa673a496e5a236fd0617691646555cc9ba Mon Sep 17 00:00:00 2001 From: Toshio Kuratomi Date: Tue, 3 May 2022 17:38:55 -0700 Subject: [PATCH] Fix for email.generator.Generator with whitespace between encoded words. email.generator.Generator currently does not handle whitespace between encoded words correctly when the encoded words span multiple lines. The current generator will create an encoded word for each line. If the end of the line happens to correspond with the end real word in the plaintext, the generator will place an unencoded space at the start of the subsequent lines to represent the whitespace between the plaintext words. A compliant decoder will strip all the whitespace from between two encoded words which leads to missing spaces in the round-tripped output. The fix for this is to make sure that whitespace between two encoded words ends up inside of one or the other of the encoded words. This fix places the space inside of the second encoded word. Test case from #92081 --- Lib/email/_header_value_parser.py | 34 ++++++++++++++++----- Lib/test/test_email/test_generator.py | 35 ++++++++++++++++++++++ Lib/test/test_email/test_headerregistry.py | 3 +- 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index 8a8fb8bc42a9540..24c43336f60a18f 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2766,10 +2766,13 @@ def _refold_parse_tree(parse_tree, *, policy): # max_line_length 0/None means no limit, ie: infinitely long. maxlen = policy.max_line_length or sys.maxsize encoding = 'utf-8' if policy.utf8 else 'us-ascii' - lines = [''] - last_ew = None + lines = [''] # Folded lines to be output + prepend_whitespace = '' # When we have whitespace between two encoded + # words, we may need to encode the whitespace + last_ew = None # Points to the last encoded character if there's an ew on + # the line wrap_as_ew_blocked = 0 - want_encoding = False + want_encoding = False # True if we need to encode this part end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') parts = list(parse_tree) while parts: @@ -2793,10 +2796,12 @@ def _refold_parse_tree(parse_tree, *, policy): # 'charset' property on the policy. charset = 'utf-8' want_encoding = True + if part.token_type == 'mime-parameters': # Mime parameter folding (using RFC2231) is extra special. _fold_mime_parameters(part, lines, maxlen, encoding) continue + if want_encoding and not wrap_as_ew_blocked: if not part.as_ew_allowed: want_encoding = False @@ -2823,20 +2828,24 @@ def _refold_parse_tree(parse_tree, *, policy): # It's a terminal, wrap it as an encoded word, possibly # combining it with previously encoded words if allowed. last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, - part.ew_combine_allowed, charset) + part.ew_combine_allowed, charset, prepend_whitespace) + prepend_whitespace = '' want_encoding = False continue + if len(tstr) <= maxlen - len(lines[-1]): lines[-1] += tstr continue # This part is too long to fit. The RFC wants us to break at # "major syntactic breaks", so unless we don't consider this # to be one, check if it will fit on the next line by itself. + prepend_whitespace = '' if (part.syntactic_break and len(tstr) + 1 <= maxlen): newline = _steal_trailing_WSP_if_exists(lines) if newline or part.startswith_fws(): lines.append(newline + tstr) + prepend_whitespace = ' ' # part.value last_ew = None continue if not hasattr(part, 'encode'): @@ -2860,9 +2869,10 @@ def _refold_parse_tree(parse_tree, *, policy): else: # We can't fold it onto the next line either... lines[-1] += tstr + return policy.linesep.join(lines) + policy.linesep -def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): +def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, prepend_whitespace): """Fold string to_encode into lines as encoded word, combining if allowed. Return the new value for last_ew, or None if ew_combine_allowed is False. @@ -2877,7 +2887,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): to_encode = str( get_unstructured(lines[-1][last_ew:] + to_encode)) lines[-1] = lines[-1][:last_ew] - if to_encode[0] in WSP: + elif to_encode[0] in WSP: # We're joining this to non-encoded text, so don't encode # the leading blank. leading_wsp = to_encode[0] @@ -2885,6 +2895,7 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): if (len(lines[-1]) == maxlen): lines.append(_steal_trailing_WSP_if_exists(lines)) lines[-1] += leading_wsp + trailing_wsp = '' if to_encode[-1] in WSP: # Likewise for the trailing space. @@ -2904,11 +2915,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset): while to_encode: remaining_space = maxlen - len(lines[-1]) - text_space = remaining_space - chrome_len + text_space = remaining_space - chrome_len - len(prepend_whitespace) if text_space <= 0: lines.append(' ') continue + # If we are at the start of a continuation line, prepend whitespace + # (we only want to do this when the line starts with an encoded word + # but if we're folding in this helper function, then we know that we + # are going to be writing out an encoded word.) + if len(lines) > 1 and len(lines[-1]) == 1 and prepend_whitespace: + encoded_word = _ew.encode(prepend_whitespace, charset=encode_as) + lines[-1] += encoded_word + prepend_whitespace = '' + to_encode_word = to_encode[:text_space] encoded_word = _ew.encode(to_encode_word, charset=encode_as) excess = len(encoded_word) - remaining_space diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py index 89e7edeb63a8926..8e75ca5b7a17ac5 100644 --- a/Lib/test/test_email/test_generator.py +++ b/Lib/test/test_email/test_generator.py @@ -232,6 +232,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase): ioclass = io.BytesIO typ = lambda self, x: x.encode('ascii') + def test_defaults_handle_spaces_between_encoded_words_when_folded(self): + source = ("Уведомление о принятии в работу обращения для" + " подключения услуги") + expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n' + ' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n' + ' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii') + msg = EmailMessage() + msg['Subject'] = source + s = io.BytesIO() + g = BytesGenerator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_defaults_handle_spaces_at_start_of_subject(self): + source = " Уведомление" + expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n" + msg = EmailMessage() + msg['Subject'] = source + s = io.BytesIO() + g = BytesGenerator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + + def test_defaults_handle_spaces_at_start_of_continuation_line(self): + source = " ф ффффффффффффффффффф ф ф" + expected = (b"Subject: " + b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n" + b" =?utf-8?b?INGEINGE?=\n\n") + msg = EmailMessage() + msg['Subject'] = source + s = io.BytesIO() + g = BytesGenerator(s) + g.flatten(msg) + self.assertEqual(s.getvalue(), expected) + def test_cte_type_7bit_handles_unknown_8bit(self): source = ("Subject: Maintenant je vous présente mon " "collègue\n\n").encode('utf-8') diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py index 25347ef13c21475..18df18baaf84619 100644 --- a/Lib/test/test_email/test_headerregistry.py +++ b/Lib/test/test_email/test_headerregistry.py @@ -7,6 +7,7 @@ from test.test_email import TestEmailBase, parameterize from email import headerregistry from email.headerregistry import Address, Group +from email.header import decode_header from test.support import ALWAYS_EQ @@ -1628,7 +1629,7 @@ def test_address_display_names(self): 'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. ' 'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.', '=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c' - '=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse' + '=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse' '_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8' '?q?p=C3=B4tenti=2E?=', ),