diff --git a/CHANGELOG.md b/CHANGELOG.md index d107b3e1..8ba25458 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## Unreleased changes + +* Add an optional `website_root` argument to `Govspeak::Document#extracted_links` in order to get all links as fully qualified URLs [#122](https://github.com/alphagov/govspeak/pull/122) + ## 5.3.0 * Add a link extraction class for finding links in documents [#120](https://github.com/alphagov/govspeak/pull/120) diff --git a/lib/govspeak.rb b/lib/govspeak.rb index e1005b3e..7f34f4e0 100644 --- a/lib/govspeak.rb +++ b/lib/govspeak.rb @@ -95,8 +95,8 @@ def structured_headers Govspeak::StructuredHeaderExtractor.new(self).call end - def extracted_links - Govspeak::LinkExtractor.new(self).call + def extracted_links(website_root: nil) + Govspeak::LinkExtractor.new(self, website_root: website_root).call end def preprocess(source) diff --git a/lib/govspeak/link_extractor.rb b/lib/govspeak/link_extractor.rb index 90e803f2..102be6ae 100644 --- a/lib/govspeak/link_extractor.rb +++ b/lib/govspeak/link_extractor.rb @@ -1,7 +1,8 @@ module Govspeak class LinkExtractor - def initialize(document) + def initialize(document, website_root: nil) @document = document + @website_root = website_root end def call @@ -10,8 +11,16 @@ def call private + attr_reader :document, :website_root + def extract_links - document_anchors.map { |link| link['href'] } + document_anchors.map do |link| + if website_root && link['href'].start_with?('/') + "#{website_root}#{link['href']}" + else + link['href'] + end + end end def document_anchors @@ -22,7 +31,7 @@ def processed_govspeak doc = Nokogiri::HTML::Document.new doc.encoding = "UTF-8" - doc.fragment(@document.to_html) + doc.fragment(document.to_html) end end end diff --git a/test/govspeak_link_extractor_test.rb b/test/govspeak_link_extractor_test.rb index e755c392..bad78225 100644 --- a/test/govspeak_link_extractor_test.rb +++ b/test/govspeak_link_extractor_test.rb @@ -12,6 +12,8 @@ def document_body [not_a_link](#somepage) [mailto:](mailto:someone@www.example.com) + +[absolute_path](/cais-trwydded-yrru-dros-dro) } end @@ -24,7 +26,7 @@ def links end test "Links are extracted from the body" do - expected_links = ["http://www.example.com", "http://www.gov.com"] + expected_links = %w{http://www.example.com http://www.gov.com /cais-trwydded-yrru-dros-dro} assert_equal expected_links, links end @@ -39,4 +41,9 @@ def links test "Links are not extracted if they begin with mailto:" do refute_includes ["mailto:someone@www.example.com"], links end + + test "Absolute links are transformed to a url when website_root passed in" do + urls = doc.extracted_links(website_root: "http://www.example.com") + assert urls.include?("http://www.example.com/cais-trwydded-yrru-dros-dro") + end end