Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-92081: Fix for email.generator.Generator with whitespace between encoded words. #92281

Merged
merged 2 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 41 additions & 7 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2784,11 +2784,15 @@ def _refold_parse_tree(parse_tree, *, policy):
# max_line_length 0/None means no limit, ie: infinitely long.
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = ['']
last_ew = None
lines = [''] # Folded lines to be output
leading_whitespace = '' # When we have whitespace between two encoded
# words, we may need to encode the whitespace
# at the beginning of the second word.
last_ew = None # Points to the last encoded character if there's an ew on
# the line
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False
want_encoding = False # This is set to True if we need to encode this part
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
parts = list(parse_tree)
while parts:
Expand All @@ -2812,10 +2816,12 @@ def _refold_parse_tree(parse_tree, *, policy):
# 'charset' property on the policy.
charset = 'utf-8'
want_encoding = True

if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
continue

if want_encoding and not wrap_as_ew_blocked:
if not part.as_ew_allowed:
want_encoding = False
Expand Down Expand Up @@ -2847,21 +2853,38 @@ def _refold_parse_tree(parse_tree, *, policy):
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset)
part.ew_combine_allowed, charset, leading_whitespace)
# This whitespace has been added to the lines in _fold_as_ew()
# so clear it now.
leading_whitespace = ''
last_charset = charset
want_encoding = False
continue

if len(tstr) <= maxlen - len(lines[-1]):
lines[-1] += tstr
continue

# This part is too long to fit. The RFC wants us to break at
# "major syntactic breaks", so unless we don't consider this
# to be one, check if it will fit on the next line by itself.
leading_whitespace = ''
if (part.syntactic_break and
len(tstr) + 1 <= maxlen):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
# We're going to fold the data onto a new line here. Due to
# the way encoded strings handle continuation lines, we need to
# be prepared to encode any whitespace if the next line turns
# out to start with an encoded word.
lines.append(newline + tstr)

whitespace_accumulator = []
for char in lines[-1]:
if char not in WSP:
break
whitespace_accumulator.append(char)
leading_whitespace = ''.join(whitespace_accumulator)
last_ew = None
continue
if not hasattr(part, 'encode'):
Expand All @@ -2885,9 +2908,10 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# We can't fold it onto the next line either...
lines[-1] += tstr

return policy.linesep.join(lines) + policy.linesep

def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Return the new value for last_ew, or None if ew_combine_allowed is False.

Expand All @@ -2902,14 +2926,15 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
to_encode = str(
get_unstructured(lines[-1][last_ew:] + to_encode))
lines[-1] = lines[-1][:last_ew]
if to_encode[0] in WSP:
elif to_encode[0] in WSP:
# We're joining this to non-encoded text, so don't encode
# the leading blank.
leading_wsp = to_encode[0]
to_encode = to_encode[1:]
if (len(lines[-1]) == maxlen):
lines.append(_steal_trailing_WSP_if_exists(lines))
lines[-1] += leading_wsp

trailing_wsp = ''
if to_encode[-1] in WSP:
# Likewise for the trailing space.
Expand All @@ -2929,11 +2954,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):

while to_encode:
remaining_space = maxlen - len(lines[-1])
text_space = remaining_space - chrome_len
text_space = remaining_space - chrome_len - len(leading_whitespace)
if text_space <= 0:
lines.append(' ')
continue

# If we are at the start of a continuation line, prepend whitespace
# (we only want to do this when the line starts with an encoded word
# but if we're folding in this helper function, then we know that we
# are going to be writing out an encoded word.)
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
lines[-1] += encoded_word
leading_whitespace = ''

to_encode_word = to_encode[:text_space]
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
excess = len(encoded_word) - remaining_space
Expand Down
35 changes: 35 additions & 0 deletions Lib/test/test_email/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
ioclass = io.BytesIO
typ = lambda self, x: x.encode('ascii')

def test_defaults_handle_spaces_between_encoded_words_when_folded(self):
source = ("Уведомление о принятии в работу обращения для"
" подключения услуги")
expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n'
' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n'
' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii')
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_defaults_handle_spaces_at_start_of_subject(self):
source = " Уведомление"
expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n"
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_defaults_handle_spaces_at_start_of_continuation_line(self):
source = " ф ффффффффффффффффффф ф ф"
expected = (b"Subject: "
b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n"
b" =?utf-8?b?INGEINGE?=\n\n")
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_cte_type_7bit_handles_unknown_8bit(self):
source = ("Subject: Maintenant je vous présente mon "
"collègue\n\n").encode('utf-8')
Expand Down
3 changes: 2 additions & 1 deletion Lib/test/test_email/test_headerregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from test.test_email import TestEmailBase, parameterize
from email import headerregistry
from email.headerregistry import Address, Group
from email.header import decode_header
from test.support import ALWAYS_EQ


Expand Down Expand Up @@ -1648,7 +1649,7 @@ def test_address_display_names(self):
'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. '
'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.',
'=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c'
'=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse'
'=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse'
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note, the data in the unittest was buggy. If you run the original output through decode_header() you'll find that it is missing the space between cônsectetuer and adipiscing

'_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8'
'?q?p=C3=B4tenti=2E?=',
),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix missing spaces in email headers when the spaces are mixed with encoded 8-bit characters.
Loading