Use nokogiri to process html, instead of regex

This allows us to handle the more complex case where table headings contain other markup (e.g. the heading might have an `<em>` tag, as per the test). It feels less hacky than using a regex, but it's quite a bit more code so I'm not 100% sure whether it's an improvement.
alphagov · Mar 12, 2021 · c7965e7 · c7965e7
1 parent 3de782e
commit c7965e7
Show file tree

Hide file tree

Showing 2 changed files with 24 additions and 9 deletions.
diff --git a/lib/govuk_tech_docs/tech_docs_html_renderer.rb b/lib/govuk_tech_docs/tech_docs_html_renderer.rb
@@ -52,12 +52,27 @@ def table_row(body)
       # table cells and table headings until https://github.com/vmg/redcarpet/commit/27dfb2a738a23aadd286ac9e7ecd61c4545d29de
       # (which is not yet released). This means we can't use the table_cell callback
       # without breaking column headers, so we're having to hack it in table_row.
-      #
-      # Despite the hackiness, this substitution should work for all cases
-      # where the table cell just includes simple text (with no other markup).
 
-      body_with_row_headers = body.gsub(/<td># ([^<]*)<\/td>/, "<th scope=\"row\">\\1</th>")
-      "<tr>#{body_with_row_headers}</tr>"
+      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
+      fragment.children.each do |cell|
+        next unless cell.name == "td"
+        next if cell.children.empty?
+
+        first_child = cell.children.first
+        next unless first_child.text?
+
+        leading_text = first_child.content
+        next unless leading_text.start_with?("#")
+
+        cell.name = "th"
+        cell["scope"] = "row"
+        first_child.content = leading_text.sub /# */, ""
+      end
+
+      tr = Nokogiri::XML::Node.new "tr", fragment
+      tr.children = fragment.children
+
+      tr.to_html
     end
   end
 end
diff --git a/spec/govuk_tech_docs/tech_docs_html_renderer_spec.rb b/spec/govuk_tech_docs/tech_docs_html_renderer_spec.rb
@@ -19,17 +19,17 @@
       expect(output).to include("<th>A</th>")
       expect(output).to include("<th>B</th>")
 
-      # Cells starting with `# ` should be treated as row headings
+      # Cells starting with `#` should be treated as row headings
       expect(output).to include('<th scope="row">C</th>')
 
+      # Cells starting with `#` with more complex markup should be treated as row headings
+      expect(output).to match(/<th scope="row"><em>G<\/em>\s*<\/th>/)
+
       # Other cells should be treated as ordinary cells
       expect(output).to include("<td>D</td>")
       expect(output).to include("<td>E</td>")
       expect(output).to include("<td>F</td>")
       expect(output).to include("<td>H</td>")
-
-      # Heading cells with more complex markup aren't handled yet
-      expect(output).to include("<td># <em>G</em></td>")
     end
   end
 end