From 1f76d025e41841f7ef05ac29fa41d13f10405784 Mon Sep 17 00:00:00 2001
From: Guillaume Ayoub <guillaume@courtbouillon.org>
Date: Fri, 24 Jan 2025 23:23:44 +0100
Subject: [PATCH] Improve text layout heuristics and use layout log attributes

This change gives amazing speed improvements with long texts.
---
 weasyprint/text/ffi.py        |  2 +
 weasyprint/text/line_break.py | 90 +++++++++++++++++++++--------------
 2 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/weasyprint/text/ffi.py b/weasyprint/text/ffi.py
index b6577541c..c6cc16fb3 100644
--- a/weasyprint/text/ffi.py
+++ b/weasyprint/text/ffi.py
@@ -272,6 +272,8 @@
     void pango_layout_line_get_extents (
         PangoLayoutLine *line, PangoRectangle *ink_rect, PangoRectangle *logical_rect);
     PangoLayoutLine * pango_layout_get_line_readonly (PangoLayout *layout, int line);
+    const PangoLogAttr* pango_layout_get_log_attrs_readonly (
+        PangoLayout* layout, gint* n_attrs);
 
     hb_font_t * pango_font_get_hb_font (PangoFont *font);
 
diff --git a/weasyprint/text/line_break.py b/weasyprint/text/line_break.py
index 96f34e3ce..27bf471d9 100644
--- a/weasyprint/text/line_break.py
+++ b/weasyprint/text/line_break.py
@@ -270,81 +270,101 @@ def split_first_line(text, style, context, max_width, justification_spacing,
     if not text_wrap:
         max_width = None
 
-    # Step #1: Get a draft layout with the first line
+    # Step #1: Get a draft layout with the first line.
+    ratio = 4  # number that almost always respects char_height / char_width > ratio
+    short_text = text
     if max_width is not None and max_width != inf and style['font_size']:
-        short_text = text
-        if max_width == 0:
-            # Trying to find minimum size, let's naively split on spaces and
-            # keep one word + one letter
+        # Try to use a small amount of text to avoid the whole layout. We need
+        # at least one line, and one possible line break point on the second line.
+        if style['font_size'] * ratio > max_width:
+            # Trying to find minimum or very small size, let's naively split on
+            # spaces and keep one word + one letter.
             space_index = text.find(' ')
             if space_index != -1:
                 short_text = text[:space_index+2]  # index + space + one letter
         else:
-            short_text = text[:int(max_width / style['font_size'] * 2.5)]
-        # Try to use a small amount of text instead of the whole text
+            # Use the magic ratio and hope that we’ll get the right amount of text.
+            short_text = text[:int(max_width / style['font_size'] * ratio)]
         layout = create_layout(
             short_text, style, context, max_width, justification_spacing)
         first_line, resume_index = layout.get_first_line()
         if resume_index is None and short_text != text:
-            # The small amount of text fits in one line, give up and use
-            # the whole text
+            # The small amount of text fits in one line, give up and use the
+            # whole text.
+            short_text = text
             layout.set_text(text)
             first_line, resume_index = layout.get_first_line()
+        else:
+            # If the second line of the short text can break, we have the next
+            # line break point required for step #3 in it, drop the end of the text.
+            first_line_text = short_text.encode()[:resume_index].decode()
+            if first_line_text != short_text:
+                start, end = len(first_line_text) + 1, len(short_text)
+                text_end_log_attrs = pango.pango_layout_get_log_attrs_readonly(
+                    layout.layout, ffi.NULL)[start:end]
+                if get_next_break_point(text_end_log_attrs) is not None:
+                    text = short_text
     else:
         layout = create_layout(
             text, style, context, original_max_width, justification_spacing)
         first_line, resume_index = layout.get_first_line()
 
-    # Step #2: Don't split lines when it's not needed
+    # Step #2: Don't split lines when it's not needed.
     if max_width is None:
-        # The first line can take all the place needed
+        # The first line can take all the space it needs.
         return first_line_metrics(
             first_line, text, layout, resume_index, space_collapse, style)
     first_line_width, _ = line_size(first_line, style)
     if resume_index is None and first_line_width <= max_width:
-        # The first line fits in the available width
+        # The first line fits in the available width.
         return first_line_metrics(
             first_line, text, layout, resume_index, space_collapse, style)
 
     # Step #3: Try to put the first word of the second line on the first line
     # https://mail.gnome.org/archives/gtk-i18n-list/2013-September/msg00006
     # is a good thread related to this problem.
-    first_line_text = text.encode()[:resume_index].decode()
-    first_line_fits = (
-        first_line_width <= max_width or
-        can_break_text(first_line_text.strip(), style['lang']))
-    if first_line_fits:
-        # The first line fits but may have been cut too early by Pango
-        second_line_text = text.encode()[resume_index:].decode()
+    if first_line_width <= max_width:
+        # The first line fits but may have been cut too early by Pango.
+        encoded_text = text.encode()
+        first_line_text = encoded_text[:resume_index].decode()
+        second_line_text = encoded_text[resume_index:].decode()
     else:
         # The line can't be split earlier, try to hyphenate the first word.
         first_line_text = ''
         second_line_text = text
-
-    break_point = next_break_point(second_line_text, style['lang'])
+    if first_line_text == short_text:
+        # There’s no second line, don’t try to find a next word.
+        break_point = None
+    else:
+        # Find the second line’s first break point.
+        log_attrs = pango.pango_layout_get_log_attrs_readonly(layout.layout, ffi.NULL)
+        start, end = len(first_line_text) + 1, len(short_text)
+        second_line_log_attrs = log_attrs[start:end]
+        break_point = get_next_break_point(second_line_log_attrs)
+        if break_point is not None:
+            break_point -= len(first_line_text) + 1
     next_word = second_line_text[:break_point].rstrip(' ')
     if next_word:
         if space_collapse and second_line_text[break_point or -1] == ' ':
-            # next_word might fit without a space afterwards
-            # only try when space collapsing is allowed
+            # Next word might fit without a space afterwards; only try when
+            # space collapsing is allowed.
             new_first_line_text = first_line_text + next_word
             layout.set_text(new_first_line_text)
             first_line, resume_index = layout.get_first_line()
             if resume_index is None:
                 if first_line_text:
-                    # The next word fits in the first line, keep the layout
+                    # The next word fits in the first line, keep the layout.
                     resume_index = len(new_first_line_text.encode()) + 1
                     return first_line_metrics(
-                        first_line, text, layout, resume_index, space_collapse,
-                        style)
+                        first_line, text, layout, resume_index, space_collapse, style)
                 else:
-                    # Second line is None
+                    # Second line is None.
                     resume_index = first_line.length + 1
                     if resume_index >= len(text.encode()):
                         resume_index = None
     elif first_line_text:
         # We found something on the first line but we did not find a word on
-        # the next line, no need to hyphenate, we can keep the current layout
+        # the next line, no need to hyphenate, we can keep the current layout.
         return first_line_metrics(
             first_line, text, layout, resume_index, space_collapse, style)
 
@@ -499,18 +519,18 @@ def get_log_attrs(text, lang):
     return log_attrs
 
 
-def next_break_point(text, lang):
-    if not text or len(text) < 2:
-        return None
-    log_attrs = get_log_attrs(text, lang)
-    length = len(text) + 1
-    for i, attr in enumerate(log_attrs[1:length - 1]):
+def get_next_break_point(log_attrs):
+    for i, attr in enumerate(log_attrs):
         if attr.is_line_break:
             return i
 
 
 def can_break_text(text, lang):
-    return next_break_point(text, lang) is not None
+    if not text or len(text) < 2:
+        return None
+    log_attrs = get_log_attrs(text, lang)
+    length = len(text) + 1
+    return get_next_break_point(log_attrs[1:length-1]) is not None
 
 
 def get_next_word_boundaries(text, lang):