Skip to content

Commit

Permalink
Linting and fix paragraph prefixing (#11)
Browse files Browse the repository at this point in the history
Fixes an issue where spans consumed their following siblings, and adds the
missing leading line break before paragraphs
  • Loading branch information
hownowstephen authored Mar 29, 2023
1 parent 8b49286 commit 1748b27
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 4 deletions.
6 changes: 3 additions & 3 deletions regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ func NewRegexpConverter() Converter {
handler: func(t string, submatch []int) string {
href, value := strings.TrimSpace(t[submatch[4]:submatch[5]]), strings.TrimSpace(t[submatch[6]:submatch[7]])
var replace string
if strings.ToLower(href) == strings.ToLower(value) {
if strings.EqualFold(href, value) {
replace = value
} else if value != "" {
replace = fmt.Sprintf("%s ( %s )", value, href)
Expand Down Expand Up @@ -184,7 +184,7 @@ func (s *submatchReplacer) Replace(text string) string {
finalText += text[start:submatch[0]] + s.handler(text, submatch)
start = submatch[1]
}
return finalText + text[start:len(text)]
return finalText + text[start:]
}

// Convert returns a text-only version of supplied document in UTF-8 format with all HTML tags removed
Expand Down Expand Up @@ -251,7 +251,7 @@ func (t *RegexpConverter) Convert(document string, lineLength int) (string, erro
// strip text ignored html. Useful for removing
// headers and footers that aren't needed in the
// text version
txt := t.ignoredHTML.ReplaceAllString(string(clean.Bytes()), "")
txt := t.ignoredHTML.ReplaceAllString(clean.String(), "")

// strip out html comments
txt = t.comments.ReplaceAllString(txt, "")
Expand Down
18 changes: 18 additions & 0 deletions textplain_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,24 @@ func TestWrappingSpans(t *testing.T) {
</p>`,
expect: "Test spans\n\ninbetween\n\nline 2\nagain",
},
{
name: "tables and spans",
body: `<table>
<tbody>
<tr>
<td>
<span>ID</span>
<p>ABC-1234</p>
</td>
<td>
<span>Date</span>
<p>Mar 29, 2023</p>
</td>
</tr>
</tbody>
</table>`,
expect: "ID\nABC-1234\n\nDate\nMar 29, 2023",
},
})
}

Expand Down
8 changes: 7 additions & 1 deletion tree.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ func (t *TreeConverter) doConvert(n *html.Node) ([]string, error) {
if err != nil {
return nil, err
}
if len(parts) > 0 {
if p := strings.Trim(parts[len(parts)-1], " \t"); len(p) == 0 || p[len(p)-1] != '\n' {
parts = append(parts, "\n")
}
}

parts = append(parts, more...)
parts = append(parts, "\n\n")
continue
Expand Down Expand Up @@ -258,7 +264,7 @@ func (t *TreeConverter) wrapSpans(n *html.Node) (*html.Node, []string, error) {
for c = n; c != nil; c = c.NextSibling {

if c.Type == html.ElementNode && c.DataAtom != atom.Span {
return c, parts, nil
return c.PrevSibling, parts, nil
}

var span string
Expand Down

0 comments on commit 1748b27

Please sign in to comment.