Merge branch 'master' into no-p-in-table

Alir3z4 · May 5, 2021 · 7fcb2cf · 7fcb2cf
2 parents 855c1a0 + 6cdb234
commit 7fcb2cf
Show file tree

Hide file tree

Showing 15 changed files with 173 additions and 13 deletions.
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -31,7 +31,9 @@ The AUTHORS/Contributors are (and/or have been):
 * Jacek Kołodziej <kolodziejj@gmail.com>
 * Jonathan Vanasco <jonathan@findmeon.com>
 * Jon Dufresne <jon.dufresne@gmail.com>
+* Edward Ross <edward@skeptric.com>
 * Mike Borsetti
+* Gregory Anders <greg@gpanders.com>
 
 Maintainer:
 

diff --git a/ChangeLog.rst b/ChangeLog.rst
@@ -2,11 +2,14 @@ UNRELEASED
 ==========
 ----
 
+* Fix #332: Insert at most one space for multiple emphasis
 * Feature #318: Make padded tables more similar to pandoc's pipe_tables.
 * Add support for Python 3.9.
 * Fix extra line breaks inside html link text (between '[' and ']')
 * Fix #344: indent ``<ul>`` inside ``<ol>`` three spaces instead of two to comply with CommonMark, GFM, etc.
-* Feature #198: Ignore ``<p>`` tags inside table rows
+* Fix #324: unnecessary spaces around ``<b>``, ``<em>``, and ``<strike>`` tags.
+* Don't wrap tables by default and add a ``--wrap-tables`` config option.
+* Feature #198: Ignore ``<p>`` tags inside table rows.
 
 2020.1.16
 =========

diff --git a/docs/usage.md b/docs/usage.md
@@ -95,6 +95,7 @@ simple indications of their function.
     - MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags
     - WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False)
     - WRAP_LIST_ITEMS to decide if list items have to be wrapped during text wrapping
+    - WRAP_TABLES to decide if tables have to be wrapped during text wrapping
     - DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values.
     - DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage
     - OPEN_QUOTE is the character used to open a quote when replacing the `<q>` tag. It defaults to `"`.
@@ -143,6 +144,7 @@ Command line options
 | `--mark-code`                                          | Mark code with [code]...[/code] blocks
 | `--no-wrap-links`                                      | Do not wrap links during text wrapping. Implies `--reference-links`
 | `--wrap-list-items`                                    | Wrap list items during text wrapping.
+| `--wrap-tables`                                        | Wrap tables during text wrapping.
 | `--decode-errors`=`HANDLER`                            | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc.
 | `--pad-tables`                                         | Use padding to make tables look good.
 | `--default-image-alt`=`Image_Here`                     | Inserts the given `alt` text whenever images are missing `alt` values.

diff --git a/html2text/__init__.py b/html2text/__init__.py
@@ -3,6 +3,7 @@
 import html.entities
 import html.parser
 import re
+import string
 import urllib.parse as urlparse
 from textwrap import wrap
 from typing import Dict, List, Optional, Tuple, Union
@@ -78,6 +79,7 @@ def __init__(
         self.mark_code = config.MARK_CODE
         self.wrap_list_items = config.WRAP_LIST_ITEMS  # covered in cli
         self.wrap_links = config.WRAP_LINKS  # covered in cli
+        self.wrap_tables = config.WRAP_TABLES
         self.pad_tables = config.PAD_TABLES  # covered in cli
         self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
         self.tag_callback = None
@@ -406,14 +408,20 @@ def handle_tag(
                 self.blockquote -= 1
                 self.p()
 
-        def no_preceding_space(self: HTML2Text) -> bool:
-            return bool(
-                self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1])
-            )
-
         if tag in ["em", "i", "u"] and not self.ignore_emphasis:
-            if start and no_preceding_space(self):
+            # Separate with a space if we immediately follow an alphanumeric
+            # character, since otherwise Markdown won't render the emphasis
+            # marks, and we'll be left with e.g. 'foo_bar_' visible.
+            # (Don't add a space otherwise, though, since there isn't one in the
+            # original HTML.)
+            if (
+                start
+                and self.preceding_data
+                and self.preceding_data[-1] not in string.whitespace
+                and self.preceding_data[-1] not in string.punctuation
+            ):
                 emphasis = " " + self.emphasis_mark
+                self.preceding_data += " "
             else:
                 emphasis = self.emphasis_mark
 
@@ -422,8 +430,17 @@ def no_preceding_space(self: HTML2Text) -> bool:
                 self.stressed = True
 
         if tag in ["strong", "b"] and not self.ignore_emphasis:
-            if start and no_preceding_space(self):
+            # Separate with space if we immediately follow an * character, since
+            # without it, Markdown won't render the resulting *** correctly.
+            # (Don't add a space otherwise, though, since there isn't one in the
+            # original HTML.)
+            if (
+                start
+                and self.preceding_data
+                and self.preceding_data[-1] == self.strong_mark[0]
+            ):
                 strong = " " + self.strong_mark
+                self.preceding_data += " "
             else:
                 strong = self.strong_mark
 
@@ -432,8 +449,9 @@ def no_preceding_space(self: HTML2Text) -> bool:
                 self.stressed = True
 
         if tag in ["del", "strike", "s"]:
-            if start and no_preceding_space(self):
+            if start and self.preceding_data and self.preceding_data[-1] == "~":
                 strike = " ~~"
+                self.preceding_data += " "
             else:
                 strike = "~~"
 
@@ -836,7 +854,7 @@ def handle_data(self, data: str, entity_char: bool = False) -> None:
             self.preceding_stressed = True
         elif self.preceding_stressed:
             if (
-                re.match(r"[^\s.!?]", data[0])
+                re.match(r"[^][(){}\s.!?]", data[0])
                 and not hn(self.current_tag)
                 and self.current_tag not in ["a", "code", "pre"]
             ):
@@ -924,7 +942,9 @@ def optwrap(self, text: str) -> str:
             self.inline_links = False
         for para in text.split("\n"):
             if len(para) > 0:
-                if not skipwrap(para, self.wrap_links, self.wrap_list_items):
+                if not skipwrap(
+                    para, self.wrap_links, self.wrap_list_items, self.wrap_tables
+                ):
                     indent = ""
                     if para.startswith("  " + self.ul_item_mark):
                         # list item continuation: add a double indent to the

diff --git a/html2text/cli.py b/html2text/cli.py
@@ -45,6 +45,13 @@ class bcolors:
         default=config.WRAP_LIST_ITEMS,
         help="wrap list items during conversion",
     )
+    p.add_argument(
+        "--wrap-tables",
+        dest="wrap_tables",
+        action="store_true",
+        default=config.WRAP_TABLES,
+        help="wrap tables",
+    )
     p.add_argument(
         "--ignore-emphasis",
         dest="ignore_emphasis",
@@ -298,6 +305,7 @@ class bcolors:
     h.mark_code = args.mark_code
     h.wrap_links = args.wrap_links
     h.wrap_list_items = args.wrap_list_items
+    h.wrap_tables = args.wrap_tables
     h.pad_tables = args.pad_tables
     h.default_image_alt = args.default_image_alt
     h.open_quote = args.open_quote

diff --git a/html2text/config.py b/html2text/config.py
@@ -31,6 +31,9 @@
 # Wrap list items.
 WRAP_LIST_ITEMS = False
 
+# Wrap tables.
+WRAP_TABLES = False
+
 # Number of pixels Google indents nested lists
 GOOGLE_LIST_INDENT = 36
 
@@ -63,6 +66,9 @@
 # to find links in the text
 RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
 
+# to find table separators
+RE_TABLE = re.compile(r" \| ")
+
 RE_MD_DOT_MATCHER = re.compile(
     r"""
     ^             # start of line

diff --git a/html2text/utils.py b/html2text/utils.py
@@ -159,7 +159,9 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
     return 0
 
 
-def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
+def skipwrap(
+    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
+) -> bool:
     # If it appears to contain a link
     # don't wrap
     if not wrap_links and config.RE_LINK.search(para):
@@ -181,6 +183,10 @@ def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
     if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
         return not wrap_list_items
 
+    # If text contains a pipe character surrounded by spaces it is likely a table
+    if not wrap_tables and config.RE_TABLE.search(para):
+        return True
+
     # If the text begins with a single -, *, or +, followed by a space,
     # or an integer, followed by a ., followed by a space (in either
     # case optionally preceded by whitespace), it's a list; don't wrap.

diff --git a/test/emphasis_preserved_whitespace.html b/test/emphasis_preserved_whitespace.html
@@ -16,5 +16,12 @@
 <p><em>emphasis</em>.</p>
 <p><em>emphasis</em>?</p>
 <p><em>emphasis</em>!</p>
+<p>(<em>emphasis</em>)</p>
+<p>[<b>bold</b>}</p>
+<p>(<strike>strike</strike>]</p>
+
+<!-- Strong and strike characters have space  -->
+<p>*<b>bold</b></p>
+<p>~<strike>strike</strike></p>
 
 <p><em>em1</em><em>em2</em></p>
diff --git a/test/emphasis_preserved_whitespace.md b/test/emphasis_preserved_whitespace.md
@@ -24,5 +24,15 @@ _emphasis_?
 
 _emphasis_!
 
+(_emphasis_)
+
+[**bold**}
+
+(~~strike~~]
+
+* **bold**
+
+~ ~~strike~~
+
 _em1_ _em2_
 
diff --git a/test/emphasis_whitespace.html b/test/emphasis_whitespace.html
@@ -0,0 +1,31 @@
+<p> <em><strong>ib</strong></em></p>
+
+<p>.<em><strong>ib</strong></em></p>
+
+<p> <strong><em>bi</em></strong></p>
+
+<p>.<strong><em>bi</em></strong></p>
+
+<p> <em><strike>is</strike></em></p>
+
+<p>.<em><strike>is</strike></em></p>
+
+<p> <em><strike>si</strike></em></p>
+
+<p>.<em><strike>si</strike></em></p>
+
+<p> <strong><strike>bs</strike></strong></p>
+
+<p>.<strong><strike>bs</strike></strong></p>
+
+<p> <strike><strong>sb</strong></strike></p>
+
+<p>.<strike><strong>sb</strong></strike></p>
+
+<p> <strike><strong><em>sbi</em></strong></strike></p>
+
+<p>.<strike><strong><em>sbi</em></strong></strike></p>
+
+<p> <strong><em><strike>bis</strike></em></strong></p>
+
+<p>.<strong><em><strike>bis</strike></em></strong></p>
diff --git a/test/emphasis_whitespace.md b/test/emphasis_whitespace.md
@@ -0,0 +1,32 @@
+_**ib**_
+
+. _**ib**_
+
+**_bi_**
+
+. **_bi_**
+
+_~~is~~_
+
+. _~~is~~_
+
+_~~si~~_
+
+. _~~si~~_
+
+**~~bs~~**
+
+. **~~bs~~**
+
+~~**sb**~~
+
+. ~~**sb**~~
+
+~~**_sbi_**~~
+
+. ~~**_sbi_**~~
+
+**_~~bis~~_**
+
+. **_~~bis~~_**
+
diff --git a/test/test_html2text.py b/test/test_html2text.py
@@ -122,6 +122,11 @@ def generate_testdata():
             cmdline_args.append("--wrap-list-items")
             func_args = skip
 
+        if base_fn.startswith("wrap_tables"):
+            module_args["wrap_tables"] = True
+            cmdline_args.append("--wrap-tables")
+            func_args = skip
+
         if base_fn == "inplace_baseurl_substitution.html":
             module_args["baseurl"] = "http://brettterpstra.com"
             module_args["body_width"] = 0

diff --git a/test/wrap_tables.html b/test/wrap_tables.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html> <html>
+    <head lang="en"> <meta charset="UTF-8"> <title></title> </head>
+    <body> <h1>This is a test document</h1> With some text, <code>code</code>, <b>bolds</b> and <i>italics</i>.  <h2>This is second header</h2> <p style="display: none">Displaynone text</p> 
+    <table>
+        <tr> <th>Header 1</th> <th>Header 2</th> <th>Header 3</th> </tr>
+        <tr> <td>Content 1</td> <td>2</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr>
+        <tr> <td>Content 1 longer</td> <td>Content 2</td> <td>Here is some really long text that will wrap to the next line. Because it's so long.</td> </tr>
+        <tr> <td>Content </td> <td>Content 2</td> <td>blah</td> </tr>
+        <tr> <td>t </td> <td>Content 2</td> <td>blah blah blah</td> </tr>
+    </table>
+
+</body> </html>
diff --git a/test/wrap_tables.md b/test/wrap_tables.md
@@ -0,0 +1,16 @@
+# This is a test document
+
+With some text, `code`, **bolds** and _italics_.
+
+## This is second header
+
+Displaynone text
+
+Header 1 | Header 2 | Header 3  
+---|---|---  
+Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image!  
+Content 1 longer | Content 2 | Here is some really long text that will wrap to
+the next line. Because it's so long.  
+Content  | Content 2 | blah  
+t  | Content 2 | blah blah blah
+
diff --git a/tox.ini b/tox.ini
@@ -33,7 +33,7 @@ skip_install = true
 [testenv:isort]
 basepython = python3
 commands =
-    isort --check --diff .
+    isort --check-only --diff .
 deps =
     isort >= 5.0.1
 skip_install = true