Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix infinite loop in Terminal.wrap #275

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion blessed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
'support due to http://bugs.python.org/issue10570.')

__all__ = ('Terminal',)
__version__ = "1.20.0"
__version__ = "1.21.0"
31 changes: 23 additions & 8 deletions blessed/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,11 +186,11 @@ def _wrap_chunks(self, chunks):
while chunks:
chunk_len = Sequence(chunks[-1], term).length()
if cur_len + chunk_len > width:
if chunk_len > width:
self._handle_long_word(chunks, cur_line, cur_len, width)
break
cur_line.append(chunks.pop())
cur_len += chunk_len
if chunks and Sequence(chunks[-1], term).length() > width:
self._handle_long_word(chunks, cur_line, cur_len, width)
if drop_whitespace and (
cur_line and Sequence(cur_line[-1], term).strip() == ''):
del cur_line[-1]
Expand All @@ -202,10 +202,18 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
"""
Sequence-aware :meth:`textwrap.TextWrapper._handle_long_word`.

This simply ensures that word boundaries are not broken mid-sequence, as standard python
textwrap would incorrectly determine the length of a string containing sequences, and may
also break consider sequences part of a "word" that may be broken by hyphen (``-``), where
this implementation corrects both.
This method ensures that word boundaries are not broken mid-sequence, as
standard python textwrap would incorrectly determine the length of a
string containing sequences and wide characters it would also break
these "words" that would be broken by hyphen (``-``), this
implementation corrects both.

This is done by mutating the passed arguments, removing items from
'reversed_chunks' and appending them to 'cur_line'.

However, some characters (east-asian, emoji, etc.) cannot be split any
less than 2 cells, so in the case of a width of 1, we have no choice
but to allow those characters to flow outside of the given cell.
"""
# Figure out when indent is larger than the specified width, and make
# sure at least one character is stripped off on every pass
Expand All @@ -219,8 +227,15 @@ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
idx = nxt = 0
for text, _ in iter_parse(term, chunk):
nxt += len(text)
if Sequence(chunk[:nxt], term).length() > space_left:
break
seq_length = Sequence(chunk[:nxt], term).length()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

when seq_length is equal to space_left, the for loop does the next chunk even though there is no space remaining

is there another small optimization possible to call length fewer times? something like:

if seq_length == space_left:
    idx = nxt
    break
elif seq_length > space_left:
    ...

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are many unicode characters that consume 0 cells, https://wcwidth.readthedocs.io/en/latest/specs.html#width-of-0, there are examples of this in the ZWJ test added, though it gets it wrong in that specific case, we would prefer to cojoin with 1,800+ possible Combining Marks that might follow, like in cafe\u0301 we would not wish to break immediately after the letter 'e', because the next character \u0301 is desired

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added test in 6beb3c6 hope that makes it more clear

if seq_length > space_left:
if cur_len == 0 and width == 1 and nxt == 1 and seq_length == 2:
# Emoji etc. cannot be split under 2 cells, so in the
# case of a width of 1, we have no choice but to allow
# those characters to flow outside of the given cell.
pass
else:
break
idx = nxt
cur_line.append(chunk[:idx])
reversed_chunks[-1] = chunk[idx:]
Expand Down
6 changes: 6 additions & 0 deletions docs/history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

Version History
===============
1.21
* bugfix infinite loop in :meth:`~Terminal.wrap` when "Wide" characters of
width 2 (East-Asian or Emoji) are used with a wrap width of 1, and a small
performance enhancement, :ghissue:`273` and :ghpull:`274` by :ghuser:`grayjk`
merged as :ghpull:`275`.

1.20
* introduced :meth:`~Terminal.get_fgcolor` and :meth:`~Terminal.get_bgcolor` to query
the terminal for the currently set colors. :ghissue:`237` by :ghuser:`stefanholek`
Expand Down
17 changes: 16 additions & 1 deletion tests/test_length_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,21 @@ def child():
child()


def test_length_with_zwj_is_wrong():
"""Because of the way Zero-Width Joiner (ZWJ) is measured, blessed gets this wrong"""
# But for the time being, so do many terminals (~85%), so its not a huge deal..
# https://ucs-detect.readthedocs.io/results.html
@as_subprocess
def child():
term = TestTerminal()
# RGI_Emoji_ZWJ_Sequence ; family: woman, woman, girl, boy
given = term.bold_red(u'\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466')
expected = sum((2, 0, 2, 0, 2, 0, 2))

# exercise,
assert term.length(given) == expected


def test_length_ansiart():
"""Test length of ANSI art"""
@as_subprocess
Expand All @@ -60,7 +75,7 @@ def child(kind):


def test_sequence_length(all_terms):
"""Ensure T.length(string containing sequence) is correcterm."""
"""Ensure T.length(string containing sequence) is correct."""
# pylint: disable=too-complex,too-many-statements
@as_subprocess
def child(kind):
Expand Down
53 changes: 53 additions & 0 deletions tests/test_wrap.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Tests for Terminal.wrap()"""
# coding: utf-8

# std imports
import textwrap
Expand Down Expand Up @@ -113,3 +114,55 @@ def child():
assert expected == result

child()


def test_east_asian_emojis_width_1():
"""Tests edge-case of east-asian and emoji characters split into single columns."""
@as_subprocess
def child():
term = TestTerminal()
# by @grayjk from https://github.com/jquast/blessed/issues/273
result = term.wrap(u'\u5973', 1)
assert result == [u'\u5973']

# much like test_length_with_zwj_is_wrong(), blessed gets ZWJ wrong when wrapping, also.
# In this case, each character gets its own line--even though '\u200D' is considered
# a width of 0, the next emoji is "too large to fit".
# RGI_Emoji_ZWJ_Sequence ; family: woman, woman, girl, boy
given = u'\U0001F469\u200D\U0001F469\u200D\U0001F467\u200D\U0001F466'
result = term.wrap(given, 1)
assert result == list(given)

# in another example, two *narrow* characters, \u1100, "ᄀ" HANGUL
# CHOSEONG KIYEOK (consonant) is joined with \u1161, "ᅡ" HANGUL
# JUNGSEONG A (vowel), to form a single *wide* character "가" HANGUL
# SYLLABLE GA. Ideally, a native speaker would rather have the cojoined
# wide character, and word-wrapping to a column width of '1' for any
# language that includes wide characters or emoji is a bit foolish!
given = u'\u1100\u1161'
result = term.wrap(given, 1)
assert result == list(given)

child()


def test_emojis_width_2_and_greater():
"""Tests emoji characters split into multiple columns."""
@as_subprocess
def child():
term = TestTerminal()
given = u'\U0001F469\U0001F467\U0001F466' # woman, girl, boy
result = term.wrap(given, 2)
assert result == list(given)
result = term.wrap(given, 3)
assert result == list(given)
result = term.wrap(given, 4)
assert result == [u'\U0001F469\U0001F467', u'\U0001F466']
result = term.wrap(given, 5)
assert result == [u'\U0001F469\U0001F467', u'\U0001F466']
result = term.wrap(given, 6)
assert result == [u'\U0001F469\U0001F467\U0001F466']
result = term.wrap(given, 7)
assert result == [u'\U0001F469\U0001F467\U0001F466']

child()
2 changes: 2 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,8 @@ addopts =
--ignore=setup.py
--ignore=.tox
--junit-xml=.tox/results.{envname}.xml
# if any test takes over 30 seconds, dump traceback
faulthandler_timeout = 30
filterwarnings = error
junit_family = xunit1
log_format=%(levelname)s %(relativeCreated)2.2f %(filename)s:%(lineno)d %(message)s
Expand Down
2 changes: 1 addition & 1 deletion version.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"version": "1.20.0"}
{"version": "1.21.0"}
Loading