Add website_root argument to Govspeak::Document#extracted_links

alphagov · Jan 12, 2018 · 33e5f26 · 33e5f26
1 parent d3dae86
commit 33e5f26
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## Unreleased changes
+
+* Add an optional `website_root` argument to `Govspeak::Document#extracted_links` in order to get all links as fully qualified URLs [#122](https://github.com/alphagov/govspeak/pull/122)
+
 ## 5.3.0
 * Add a link extraction class for finding links in documents [#120](https://github.com/alphagov/govspeak/pull/120)
 

diff --git a/lib/govspeak.rb b/lib/govspeak.rb
@@ -95,8 +95,8 @@ def structured_headers
       Govspeak::StructuredHeaderExtractor.new(self).call
     end
 
-    def extracted_links
-      Govspeak::LinkExtractor.new(self).call
+    def extracted_links(website_root: nil)
+      Govspeak::LinkExtractor.new(self, website_root: website_root).call
     end
 
     def preprocess(source)

diff --git a/lib/govspeak/link_extractor.rb b/lib/govspeak/link_extractor.rb
@@ -1,7 +1,8 @@
 module Govspeak
   class LinkExtractor
-    def initialize(document)
+    def initialize(document, website_root: nil)
       @document = document
+      @website_root = website_root
     end
 
     def call
@@ -10,8 +11,16 @@ def call
 
   private
 
+    attr_reader :document, :website_root
+
     def extract_links
-      document_anchors.map { |link| link['href'] }
+      document_anchors.map do |link|
+        if website_root && link['href'].start_with?('/')
+          "#{website_root}#{link['href']}"
+        else
+          link['href']
+        end
+      end
     end
 
     def document_anchors
@@ -22,7 +31,7 @@ def processed_govspeak
       doc = Nokogiri::HTML::Document.new
       doc.encoding = "UTF-8"
 
-      doc.fragment(@document.to_html)
+      doc.fragment(document.to_html)
     end
   end
 end
diff --git a/test/govspeak_link_extractor_test.rb b/test/govspeak_link_extractor_test.rb
@@ -12,6 +12,8 @@ def document_body
 [not_a_link](#somepage)
 
 [mailto:](mailto:someone@www.example.com)
+
+[absolute_path](/cais-trwydded-yrru-dros-dro)
     }
   end
 
@@ -24,7 +26,7 @@ def links
   end
 
   test "Links are extracted from the body" do
-    expected_links = ["http://www.example.com", "http://www.gov.com"]
+    expected_links = %w{http://www.example.com http://www.gov.com /cais-trwydded-yrru-dros-dro}
     assert_equal expected_links, links
   end
 
@@ -39,4 +41,9 @@ def links
   test "Links are not extracted if they begin with mailto:" do
     refute_includes ["mailto:someone@www.example.com"], links
   end
+
+  test "Absolute links are transformed to a url when website_root passed in" do
+    urls = doc.extracted_links(website_root: "http://www.example.com")
+    assert urls.include?("http://www.example.com/cais-trwydded-yrru-dros-dro")
+  end
 end