Skip to content

Commit

Permalink
Merge branch 'master' into no-p-in-table
Browse files Browse the repository at this point in the history
  • Loading branch information
Alir3z4 authored May 5, 2021
2 parents 855c1a0 + 6cdb234 commit 7fcb2cf
Show file tree
Hide file tree
Showing 15 changed files with 173 additions and 13 deletions.
2 changes: 2 additions & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ The AUTHORS/Contributors are (and/or have been):
* Jacek Kołodziej <kolodziejj@gmail.com>
* Jonathan Vanasco <jonathan@findmeon.com>
* Jon Dufresne <jon.dufresne@gmail.com>
* Edward Ross <edward@skeptric.com>
* Mike Borsetti
* Gregory Anders <greg@gpanders.com>

Maintainer:

Expand Down
5 changes: 4 additions & 1 deletion ChangeLog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@ UNRELEASED
==========
----

* Fix #332: Insert at most one space for multiple emphasis
* Feature #318: Make padded tables more similar to pandoc's pipe_tables.
* Add support for Python 3.9.
* Fix extra line breaks inside html link text (between '[' and ']')
* Fix #344: indent ``<ul>`` inside ``<ol>`` three spaces instead of two to comply with CommonMark, GFM, etc.
* Feature #198: Ignore ``<p>`` tags inside table rows
* Fix #324: unnecessary spaces around ``<b>``, ``<em>``, and ``<strike>`` tags.
* Don't wrap tables by default and add a ``--wrap-tables`` config option.
* Feature #198: Ignore ``<p>`` tags inside table rows.

2020.1.16
=========
Expand Down
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ simple indications of their function.
- MARK_CODE to wrap 'pre' blocks with [code]...[/code] tags
- WRAP_LINKS to decide if links have to be wrapped during text wrapping (implies INLINE_LINKS = False)
- WRAP_LIST_ITEMS to decide if list items have to be wrapped during text wrapping
- WRAP_TABLES to decide if tables have to be wrapped during text wrapping
- DECODE_ERRORS to handle decoding errors. 'strict', 'ignore', 'replace' are the acceptable values.
- DEFAULT_IMAGE_ALT takes a string as value and is used whenever an image tag is missing an `alt` value. The default for this is an empty string '' to avoid backward breakage
- OPEN_QUOTE is the character used to open a quote when replacing the `<q>` tag. It defaults to `"`.
Expand Down Expand Up @@ -143,6 +144,7 @@ Command line options
| `--mark-code` | Mark code with [code]...[/code] blocks
| `--no-wrap-links` | Do not wrap links during text wrapping. Implies `--reference-links`
| `--wrap-list-items` | Wrap list items during text wrapping.
| `--wrap-tables` | Wrap tables during text wrapping.
| `--decode-errors`=`HANDLER` | What to do in case an error is encountered. `ignore`, `strict`, `replace` etc.
| `--pad-tables` | Use padding to make tables look good.
| `--default-image-alt`=`Image_Here` | Inserts the given `alt` text whenever images are missing `alt` values.
Expand Down
40 changes: 30 additions & 10 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import html.entities
import html.parser
import re
import string
import urllib.parse as urlparse
from textwrap import wrap
from typing import Dict, List, Optional, Tuple, Union
Expand Down Expand Up @@ -78,6 +79,7 @@ def __init__(
self.mark_code = config.MARK_CODE
self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
self.wrap_links = config.WRAP_LINKS # covered in cli
self.wrap_tables = config.WRAP_TABLES
self.pad_tables = config.PAD_TABLES # covered in cli
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
self.tag_callback = None
Expand Down Expand Up @@ -406,14 +408,20 @@ def handle_tag(
self.blockquote -= 1
self.p()

def no_preceding_space(self: HTML2Text) -> bool:
return bool(
self.preceding_data and re.match(r"[^\s]", self.preceding_data[-1])
)

if tag in ["em", "i", "u"] and not self.ignore_emphasis:
if start and no_preceding_space(self):
# Separate with a space if we immediately follow an alphanumeric
# character, since otherwise Markdown won't render the emphasis
# marks, and we'll be left with eg 'foo_bar_' visible.
# (Don't add a space otherwise, though, since there isn't one in the
# original HTML.)
if (
start
and self.preceding_data
and self.preceding_data[-1] not in string.whitespace
and self.preceding_data[-1] not in string.punctuation
):
emphasis = " " + self.emphasis_mark
self.preceding_data += " "
else:
emphasis = self.emphasis_mark

Expand All @@ -422,8 +430,17 @@ def no_preceding_space(self: HTML2Text) -> bool:
self.stressed = True

if tag in ["strong", "b"] and not self.ignore_emphasis:
if start and no_preceding_space(self):
# Separate with space if we immediately follow an * character, since
# without it, Markdown won't render the resulting *** correctly.
# (Don't add a space otherwise, though, since there isn't one in the
# original HTML.)
if (
start
and self.preceding_data
and self.preceding_data[-1] == self.strong_mark[0]
):
strong = " " + self.strong_mark
self.preceding_data += " "
else:
strong = self.strong_mark

Expand All @@ -432,8 +449,9 @@ def no_preceding_space(self: HTML2Text) -> bool:
self.stressed = True

if tag in ["del", "strike", "s"]:
if start and no_preceding_space(self):
if start and self.preceding_data and self.preceding_data[-1] == "~":
strike = " ~~"
self.preceding_data += " "
else:
strike = "~~"

Expand Down Expand Up @@ -836,7 +854,7 @@ def handle_data(self, data: str, entity_char: bool = False) -> None:
self.preceding_stressed = True
elif self.preceding_stressed:
if (
re.match(r"[^\s.!?]", data[0])
re.match(r"[^][(){}\s.!?]", data[0])
and not hn(self.current_tag)
and self.current_tag not in ["a", "code", "pre"]
):
Expand Down Expand Up @@ -924,7 +942,9 @@ def optwrap(self, text: str) -> str:
self.inline_links = False
for para in text.split("\n"):
if len(para) > 0:
if not skipwrap(para, self.wrap_links, self.wrap_list_items):
if not skipwrap(
para, self.wrap_links, self.wrap_list_items, self.wrap_tables
):
indent = ""
if para.startswith(" " + self.ul_item_mark):
# list item continuation: add a double indent to the
Expand Down
8 changes: 8 additions & 0 deletions html2text/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ class bcolors:
default=config.WRAP_LIST_ITEMS,
help="wrap list items during conversion",
)
p.add_argument(
"--wrap-tables",
dest="wrap_tables",
action="store_true",
default=config.WRAP_TABLES,
help="wrap tables",
)
p.add_argument(
"--ignore-emphasis",
dest="ignore_emphasis",
Expand Down Expand Up @@ -298,6 +305,7 @@ class bcolors:
h.mark_code = args.mark_code
h.wrap_links = args.wrap_links
h.wrap_list_items = args.wrap_list_items
h.wrap_tables = args.wrap_tables
h.pad_tables = args.pad_tables
h.default_image_alt = args.default_image_alt
h.open_quote = args.open_quote
Expand Down
6 changes: 6 additions & 0 deletions html2text/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
# Wrap list items.
WRAP_LIST_ITEMS = False

# Wrap tables
WRAP_TABLES = False

# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36

Expand Down Expand Up @@ -63,6 +66,9 @@
# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")

# to find table separators
RE_TABLE = re.compile(r" \| ")

RE_MD_DOT_MATCHER = re.compile(
r"""
^ # start of line
Expand Down
8 changes: 7 additions & 1 deletion html2text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
return 0


def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
def skipwrap(
para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
# If it appears to contain a link
# don't wrap
if not wrap_links and config.RE_LINK.search(para):
Expand All @@ -181,6 +183,10 @@ def skipwrap(para: str, wrap_links: bool, wrap_list_items: bool) -> bool:
if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
return not wrap_list_items

# If the text contains a pipe surrounded by spaces (" | ") it is likely a table row
if not wrap_tables and config.RE_TABLE.search(para):
return True

# If the text begins with a single -, *, or +, followed by a space,
# or an integer, followed by a ., followed by a space (in either
# case optionally preceded by whitespace), it's a list; don't wrap.
Expand Down
7 changes: 7 additions & 0 deletions test/emphasis_preserved_whitespace.html
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,12 @@
<p><em>emphasis</em>.</p>
<p><em>emphasis</em>?</p>
<p><em>emphasis</em>!</p>
<p>(<em>emphasis</em>)</p>
<p>[<b>bold</b>}</p>
<p>(<strike>strike</strike>]</p>

<!-- Strong and strike characters have space -->
<p>*<b>bold</b></p>
<p>~<strike>strike</strike></p>

<p><em>em1</em><em>em2</em></p>
10 changes: 10 additions & 0 deletions test/emphasis_preserved_whitespace.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,15 @@ _emphasis_?

_emphasis_!

(_emphasis_)

[**bold**}

(~~strike~~]

* **bold**

~ ~~strike~~

_em1_ _em2_

31 changes: 31 additions & 0 deletions test/emphasis_whitespace.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<p> <em><strong>ib</strong></em></p>

<p>.<em><strong>ib</strong></em></p>

<p> <strong><em>bi</em></strong></p>

<p>.<strong><em>bi</em></strong></p>

<p> <em><strike>is</strike></em></p>

<p>.<em><strike>is</strike></em></p>

<p> <em><strike>si</strike></em></p>

<p>.<em><strike>si</strike></em></p>

<p> <strong><strike>bs</strike></strong></p>

<p>.<strong><strike>bs</strike></strong></p>

<p> <strike><strong>sb</strong></strike></p>

<p>.<strike><strong>sb</strong></strike></p>

<p> <strike><strong><em>sbi</em></strong></strike></p>

<p>.<strike><strong><em>sbi</em></strong></strike></p>

<p> <strong><em><strike>bis</strike></em></strong></p>

<p>.<strong><em><strike>bis</strike></em></strong></p>
32 changes: 32 additions & 0 deletions test/emphasis_whitespace.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
_**ib**_

. _**ib**_

**_bi_**

. **_bi_**

_~~is~~_

. _~~is~~_

_~~si~~_

. _~~si~~_

**~~bs~~**

. **~~bs~~**

~~**sb**~~

. ~~**sb**~~

~~**_sbi_**~~

. ~~**_sbi_**~~

**_~~bis~~_**

. **_~~bis~~_**

5 changes: 5 additions & 0 deletions test/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ def generate_testdata():
cmdline_args.append("--wrap-list-items")
func_args = skip

if base_fn.startswith("wrap_tables"):
module_args["wrap_tables"] = True
cmdline_args.append("--wrap-tables")
func_args = skip

if base_fn == "inplace_baseurl_substitution.html":
module_args["baseurl"] = "http://brettterpstra.com"
module_args["body_width"] = 0
Expand Down
12 changes: 12 additions & 0 deletions test/wrap_tables.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!DOCTYPE html> <html>
<head lang="en"> <meta charset="UTF-8"> <title></title> </head>
<body> <h1>This is a test document</h1> With some text, <code>code</code>, <b>bolds</b> and <i>italics</i>. <h2>This is second header</h2> <p style="display: none">Displaynone text</p>
<table>
<tr> <th>Header 1</th> <th>Header 2</th> <th>Header 3</th> </tr>
<tr> <td>Content 1</td> <td>2</td> <td><img src="http://lorempixel.com/200/200" alt="200"/> Image!</td> </tr>
<tr> <td>Content 1 longer</td> <td>Content 2</td> <td>Here is some really long text that will wrap to the next line. Because it's so long.</td> </tr>
<tr> <td>Content </td> <td>Content 2</td> <td>blah</td> </tr>
<tr> <td>t </td> <td>Content 2</td> <td>blah blah blah</td> </tr>
</table>

</body> </html>
16 changes: 16 additions & 0 deletions test/wrap_tables.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# This is a test document

With some text, `code`, **bolds** and _italics_.

## This is second header

Displaynone text

Header 1 | Header 2 | Header 3
---|---|---
Content 1 | 2 | ![200](http://lorempixel.com/200/200) Image!
Content 1 longer | Content 2 | Here is some really long text that will wrap to
the next line. Because it's so long.
Content | Content 2 | blah
t | Content 2 | blah blah blah

2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ skip_install = true
[testenv:isort]
basepython = python3
commands =
isort --check --diff .
isort --check-only --diff .
deps =
isort >= 5.0.1
skip_install = true
Expand Down

0 comments on commit 7fcb2cf

Please sign in to comment.