From 1748b27fcdba28b7a8b791e5e08220045d27fc38 Mon Sep 17 00:00:00 2001
From: Stephen Young
Date: Wed, 29 Mar 2023 16:46:55 -0400
Subject: [PATCH] Linting and fix paragraph prefixing (#11)
Fixes an issue where a run of spans consumed the sibling element that
follows it, and ensures a line break is emitted before paragraphs.
---
regexp.go | 6 +++---
textplain_test.go | 18 ++++++++++++++++++
tree.go | 8 +++++++-
3 files changed, 28 insertions(+), 4 deletions(-)
diff --git a/regexp.go b/regexp.go
index 9dc4703..bb0da3d 100644
--- a/regexp.go
+++ b/regexp.go
@@ -69,7 +69,7 @@ func NewRegexpConverter() Converter {
handler: func(t string, submatch []int) string {
href, value := strings.TrimSpace(t[submatch[4]:submatch[5]]), strings.TrimSpace(t[submatch[6]:submatch[7]])
var replace string
- if strings.ToLower(href) == strings.ToLower(value) {
+ if strings.EqualFold(href, value) {
replace = value
} else if value != "" {
replace = fmt.Sprintf("%s ( %s )", value, href)
@@ -184,7 +184,7 @@ func (s *submatchReplacer) Replace(text string) string {
finalText += text[start:submatch[0]] + s.handler(text, submatch)
start = submatch[1]
}
- return finalText + text[start:len(text)]
+ return finalText + text[start:]
}
// Convert returns a text-only version of supplied document in UTF-8 format with all HTML tags removed
@@ -251,7 +251,7 @@ func (t *RegexpConverter) Convert(document string, lineLength int) (string, erro
// strip text ignored html. Useful for removing
// headers and footers that aren't needed in the
// text version
- txt := t.ignoredHTML.ReplaceAllString(string(clean.Bytes()), "")
+ txt := t.ignoredHTML.ReplaceAllString(clean.String(), "")
// strip out html comments
txt = t.comments.ReplaceAllString(txt, "")
diff --git a/textplain_test.go b/textplain_test.go
index 863f70a..618ac19 100644
--- a/textplain_test.go
+++ b/textplain_test.go
@@ -102,6 +102,24 @@ func TestWrappingSpans(t *testing.T) {
`,
expect: "Test spans\n\ninbetween\n\nline 2\nagain",
},
+ {
+ name: "tables and spans",
+ body: `
+
+
+
+ ID
+ ABC-1234
+ |
+
+ Date
+ Mar 29, 2023
+ |
+
+
+
`,
+ expect: "ID\nABC-1234\n\nDate\nMar 29, 2023",
+ },
})
}
diff --git a/tree.go b/tree.go
index 146f9b2..9c18372 100644
--- a/tree.go
+++ b/tree.go
@@ -81,6 +81,12 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) {
if err != nil {
return nil, err
}
+ if len(parts) > 0 {
+ if p := strings.Trim(parts[len(parts)-1], " \t"); len(p) == 0 || p[len(p)-1] != '\n' {
+ parts = append(parts, "\n")
+ }
+ }
+
parts = append(parts, more...)
parts = append(parts, "\n\n")
continue
@@ -258,7 +264,7 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) {
for c = n; c != nil; c = c.NextSibling {
if c.Type == html.ElementNode && c.DataAtom != atom.Span {
- return c, parts, nil
+ return c.PrevSibling, parts, nil
}
var span string