pola-rs · alexander-beedie · Nov 15, 2024 · Nov 14, 2024 · Nov 15, 2024
@@ -119,7 +119,9 @@ def write_body(self) -> None:
                             else:
                                 series = self.df[:, c]
                                 self.elements.append(
-                                    html.escape(series._s.get_fmt(r, str_len_limit))
+                                    html.escape(
+                                        series._s.get_fmt(r, str_len_limit)
+                                    ).replace(" ", "&nbsp;")
                                 )
 
     def write(self, inner: str) -> None:

@@ -1,3 +1,5 @@
+import re
+
 import polars as pl
 
 
@@ -77,3 +79,30 @@ def test_series_repr_html_max_rows_default() -> None:
 
     expected_rows = 10
     assert html.count("<td>") - 2 == expected_rows
+
+
+def test_html_representation_multiple_spaces() -> None:
+    """Check that every body cell renders spaces as ``&nbsp;`` entities."""
+    df = pl.DataFrame(
+        {"string_col": ["multiple   spaces", "  trailing and leading   "]}
+    )
+    html_repr = df._repr_html_()
+
+    # Isolate the <tbody> section first so that <td> tags outside the table
+    # body are not inspected. The re.S flag lets "." match across newlines.
+    tbody = re.search(r"<tbody>(.*?)</tbody>", html_repr, re.S)
+    assert tbody is not None, "Expected a <tbody> section in the HTML repr"
+
+    # Collect *every* <td>...</td> inside the body. A single pattern spanning
+    # <tbody>...</tbody> consumes the whole section in one match, so findall
+    # would only ever yield the first cell. One column => one cell per row.
+    cells = re.findall(r"<td>(.*?)</td>", tbody.group(1), re.S)
+    assert len(cells) == df.height, f"Expected {df.height} cells, got {cells}"
+
+    for cell_content in cells:
+        # Check that there are no regular spaces in the content
+        assert " " not in cell_content, f"Unexpected space in cell: {cell_content}"
+        # Check that the content contains &nbsp; as required
+        assert (
+            "&nbsp;" in cell_content
+        ), f"Expected &nbsp; in cell but found: {cell_content}"