Add website_root argument to Govspeak::Document#extracted_links

alphagov · Jan 11, 2018 · ac0062c · ac0062c
1 parent d3dae86
commit ac0062c
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## Unreleased changes
+
+* Add an optional `website_root` argument to `Govspeak::Document#extracted_links` in order to get all links as fully qualified urls [#122](https://github.com/alphagov/govspeak/pull/122)
+
 ## 5.3.0
 * Add a link extraction class for finding links in documents [#120](https://github.com/alphagov/govspeak/pull/120)
 

diff --git a/lib/govspeak.rb b/lib/govspeak.rb
@@ -95,8 +95,8 @@ def structured_headers
       Govspeak::StructuredHeaderExtractor.new(self).call
     end
 
-    def extracted_links
-      Govspeak::LinkExtractor.new(self).call
+    def extracted_links(website_root: nil)
+      Govspeak::LinkExtractor.new(self, website_root: website_root).call
     end
 
     def preprocess(source)

diff --git a/lib/govspeak/link_extractor.rb b/lib/govspeak/link_extractor.rb
@@ -1,7 +1,8 @@
 module Govspeak
   class LinkExtractor
-    def initialize(document)
+    def initialize(document, website_root: nil)
       @document = document
+      @website_root = website_root
     end
 
     def call
@@ -10,8 +11,16 @@ def call
 
   private
 
+    attr_reader :document, :website_root
+
     def extract_links
-      document_anchors.map { |link| link['href'] }
+      document_anchors.map do |link|
+        if website_root && link['href'].start_with?('/')
+          "#{website_root}#{link['href']}"
+        else
+          link['href']
+        end
+      end
     end
 
     def document_anchors
@@ -22,7 +31,7 @@ def processed_govspeak
       doc = Nokogiri::HTML::Document.new
       doc.encoding = "UTF-8"
 
-      doc.fragment(@document.to_html)
+      doc.fragment(document.to_html)
     end
   end
 end
diff --git a/test/govspeak_link_extractor_with_website_root_test.rb b/test/govspeak_link_extractor_with_website_root_test.rb
@@ -0,0 +1,28 @@
+require "test_helper"
+
+class GovspeakLinkExtractorWithWebsiteRootTest < Minitest::Test
+  def document_body
+    %{
+## Heading
+
+[link](https://www.gov.uk)
+
+[link_two](/cais-trwydded-yrru-dros-dro)
+
+[not_a_link](#somepage)
+    }
+  end
+
+  def doc
+    @doc ||= Govspeak::Document.new(document_body)
+  end
+
+  def links
+    doc.extracted_links(website_root: "http://www.example.com")
+  end
+
+  test "Absolute path are converted to full urls" do
+    expected_links = %w{https://www.gov.uk http://www.example.com/cais-trwydded-yrru-dros-dro}
+    assert_equal expected_links, links
+  end
+end