From 1748b27fcdba28b7a8b791e5e08220045d27fc38 Mon Sep 17 00:00:00 2001 From: Stephen Young Date: Wed, 29 Mar 2023 16:46:55 -0400 Subject: [PATCH] Linting and fix paragraph prefixing (#11) Fixes an issue with spans eating their siblings, and leading lines before paragraphs --- regexp.go | 6 +++--- textplain_test.go | 18 ++++++++++++++++++ tree.go | 8 +++++++- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/regexp.go b/regexp.go index 9dc4703..bb0da3d 100644 --- a/regexp.go +++ b/regexp.go @@ -69,7 +69,7 @@ func NewRegexpConverter() Converter { handler: func(t string, submatch []int) string { href, value := strings.TrimSpace(t[submatch[4]:submatch[5]]), strings.TrimSpace(t[submatch[6]:submatch[7]]) var replace string - if strings.ToLower(href) == strings.ToLower(value) { + if strings.EqualFold(href, value) { replace = value } else if value != "" { replace = fmt.Sprintf("%s ( %s )", value, href) @@ -184,7 +184,7 @@ func (s *submatchReplacer) Replace(text string) string { finalText += text[start:submatch[0]] + s.handler(text, submatch) start = submatch[1] } - return finalText + text[start:len(text)] + return finalText + text[start:] } // Convert returns a text-only version of supplied document in UTF-8 format with all HTML tags removed @@ -251,7 +251,7 @@ func (t *RegexpConverter) Convert(document string, lineLength int) (string, erro // strip text ignored html. Useful for removing // headers and footers that aren't needed in the // text version - txt := t.ignoredHTML.ReplaceAllString(string(clean.Bytes()), "") + txt := t.ignoredHTML.ReplaceAllString(clean.String(), "") // strip out html comments txt = t.comments.ReplaceAllString(txt, "") diff --git a/textplain_test.go b/textplain_test.go index 863f70a..618ac19 100644 --- a/textplain_test.go +++ b/textplain_test.go @@ -102,6 +102,24 @@ func TestWrappingSpans(t *testing.T) {

`, expect: "Test spans\n\ninbetween\n\nline 2\nagain", }, + { + name: "tables and spans", + body: ` + + + + + + +
+ ID +

ABC-1234

+
+ Date +

Mar 29, 2023

+
`, + expect: "ID\nABC-1234\n\nDate\nMar 29, 2023", + }, }) } diff --git a/tree.go b/tree.go index 146f9b2..9c18372 100644 --- a/tree.go +++ b/tree.go @@ -81,6 +81,12 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) { if err != nil { return nil, err } + if len(parts) > 0 { + if p := strings.Trim(parts[len(parts)-1], " \t"); len(p) == 0 || p[len(p)-1] != '\n' { + parts = append(parts, "\n") + } + } + parts = append(parts, more...) parts = append(parts, "\n\n") continue @@ -258,7 +264,7 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) { for c = n; c != nil; c = c.NextSibling { if c.Type == html.ElementNode && c.DataAtom != atom.Span { - return c, parts, nil + return c.PrevSibling, parts, nil } var span string