Skip to content

Commit

Permalink
Fix for email.generator.Generator with whitespace between encoded words.
Browse files Browse the repository at this point in the history
email.generator.Generator currently does not handle whitespace between
encoded words correctly when the encoded words span multiple lines.  The
current generator will create an encoded word for each line.  If the end
of the line happens to correspond with the end of a real word in the
plaintext, the generator will place an unencoded space at the start of
the subsequent lines to represent the whitespace between the plaintext
words.

A compliant decoder will strip all the whitespace from between two
encoded words, which leads to missing spaces in the round-tripped
output.

The fix for this is to make sure that whitespace between two encoded
words ends up inside of one or the other of the encoded words.  This
fix places the space inside of the second encoded word.

Test case from #92081
  • Loading branch information
abadger committed Apr 26, 2023
1 parent feca9bb commit 80f5cfa
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 8 deletions.
34 changes: 27 additions & 7 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2766,10 +2766,13 @@ def _refold_parse_tree(parse_tree, *, policy):
# max_line_length 0/None means no limit, ie: infinitely long.
maxlen = policy.max_line_length or sys.maxsize
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = ['']
last_ew = None
lines = [''] # Folded lines to be output
prepend_whitespace = '' # When we have whitespace between two encoded
# words, we may need to encode the whitespace
last_ew = None # Points to the last encoded character if there's an ew on
# the line
wrap_as_ew_blocked = 0
want_encoding = False
want_encoding = False # True if we need to encode this part
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
parts = list(parse_tree)
while parts:
Expand All @@ -2793,10 +2796,12 @@ def _refold_parse_tree(parse_tree, *, policy):
# 'charset' property on the policy.
charset = 'utf-8'
want_encoding = True

if part.token_type == 'mime-parameters':
# Mime parameter folding (using RFC2231) is extra special.
_fold_mime_parameters(part, lines, maxlen, encoding)
continue

if want_encoding and not wrap_as_ew_blocked:
if not part.as_ew_allowed:
want_encoding = False
Expand All @@ -2823,20 +2828,24 @@ def _refold_parse_tree(parse_tree, *, policy):
# It's a terminal, wrap it as an encoded word, possibly
# combining it with previously encoded words if allowed.
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset)
part.ew_combine_allowed, charset, prepend_whitespace)
prepend_whitespace = ''
want_encoding = False
continue

if len(tstr) <= maxlen - len(lines[-1]):
lines[-1] += tstr
continue
# This part is too long to fit. The RFC wants us to break at
# "major syntactic breaks", so unless we don't consider this
# to be one, check if it will fit on the next line by itself.
prepend_whitespace = ''
if (part.syntactic_break and
len(tstr) + 1 <= maxlen):
newline = _steal_trailing_WSP_if_exists(lines)
if newline or part.startswith_fws():
lines.append(newline + tstr)
prepend_whitespace = ' ' # part.value
last_ew = None
continue
if not hasattr(part, 'encode'):
Expand All @@ -2860,9 +2869,10 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# We can't fold it onto the next line either...
lines[-1] += tstr

return policy.linesep.join(lines) + policy.linesep

def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, prepend_whitespace):
"""Fold string to_encode into lines as encoded word, combining if allowed.
Return the new value for last_ew, or None if ew_combine_allowed is False.
Expand All @@ -2877,14 +2887,15 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
to_encode = str(
get_unstructured(lines[-1][last_ew:] + to_encode))
lines[-1] = lines[-1][:last_ew]
if to_encode[0] in WSP:
elif to_encode[0] in WSP:
# We're joining this to non-encoded text, so don't encode
# the leading blank.
leading_wsp = to_encode[0]
to_encode = to_encode[1:]
if (len(lines[-1]) == maxlen):
lines.append(_steal_trailing_WSP_if_exists(lines))
lines[-1] += leading_wsp

trailing_wsp = ''
if to_encode[-1] in WSP:
# Likewise for the trailing space.
Expand All @@ -2904,11 +2915,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):

while to_encode:
remaining_space = maxlen - len(lines[-1])
text_space = remaining_space - chrome_len
text_space = remaining_space - chrome_len - len(prepend_whitespace)
if text_space <= 0:
lines.append(' ')
continue

# If we are at the start of a continuation line, prepend whitespace
# (we only want to do this when the line starts with an encoded word
# but if we're folding in this helper function, then we know that we
# are going to be writing out an encoded word.)
if len(lines) > 1 and len(lines[-1]) == 1 and prepend_whitespace:
encoded_word = _ew.encode(prepend_whitespace, charset=encode_as)
lines[-1] += encoded_word
prepend_whitespace = ''

to_encode_word = to_encode[:text_space]
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
excess = len(encoded_word) - remaining_space
Expand Down
35 changes: 35 additions & 0 deletions Lib/test/test_email/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
ioclass = io.BytesIO
typ = lambda self, x: x.encode('ascii')

def test_defaults_handle_spaces_between_encoded_words_when_folded(self):
source = ("Уведомление о принятии в работу обращения для"
" подключения услуги")
expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n'
' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n'
' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii')
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_defaults_handle_spaces_at_start_of_subject(self):
source = " Уведомление"
expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n"
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_defaults_handle_spaces_at_start_of_continuation_line(self):
source = " ф ффффффффффффффффффф ф ф"
expected = (b"Subject: "
b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n"
b" =?utf-8?b?INGEINGE?=\n\n")
msg = EmailMessage()
msg['Subject'] = source
s = io.BytesIO()
g = BytesGenerator(s)
g.flatten(msg)
self.assertEqual(s.getvalue(), expected)

def test_cte_type_7bit_handles_unknown_8bit(self):
source = ("Subject: Maintenant je vous présente mon "
"collègue\n\n").encode('utf-8')
Expand Down
3 changes: 2 additions & 1 deletion Lib/test/test_email/test_headerregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from test.test_email import TestEmailBase, parameterize
from email import headerregistry
from email.headerregistry import Address, Group
from email.header import decode_header
from test.support import ALWAYS_EQ


Expand Down Expand Up @@ -1628,7 +1629,7 @@ def test_address_display_names(self):
'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. '
'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.',
'=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c'
'=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse'
'=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse'
'_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8'
'?q?p=C3=B4tenti=2E?=',
),
Expand Down

0 comments on commit 80f5cfa

Please sign in to comment.