Skip to content

Commit

Permalink
Merge pull request #122 from alphagov/add-website-root-to-link-extractor
Browse files Browse the repository at this point in the history
Add website_root argument to Govspeak::Document#extracted_links
  • Loading branch information
SebAshton authored Jan 12, 2018
2 parents d3dae86 + 33e5f26 commit 8a5227c
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## Unreleased changes

* Add an optional `website_root` argument to `Govspeak::Document#extracted_links` in order to get all links as fully qualified URLs [#122](https://github.com/alphagov/govspeak/pull/122)

## 5.3.0
* Add a link extraction class for finding links in documents [#120](https://github.com/alphagov/govspeak/pull/120)

Expand Down
4 changes: 2 additions & 2 deletions lib/govspeak.rb
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ def structured_headers
Govspeak::StructuredHeaderExtractor.new(self).call
end

def extracted_links
Govspeak::LinkExtractor.new(self).call
def extracted_links(website_root: nil)
Govspeak::LinkExtractor.new(self, website_root: website_root).call
end

def preprocess(source)
Expand Down
15 changes: 12 additions & 3 deletions lib/govspeak/link_extractor.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
module Govspeak
class LinkExtractor
def initialize(document)
def initialize(document, website_root: nil)
@document = document
@website_root = website_root
end

def call
Expand All @@ -10,8 +11,16 @@ def call

private

attr_reader :document, :website_root

def extract_links
document_anchors.map { |link| link['href'] }
document_anchors.map do |link|
if website_root && link['href'].start_with?('/')
"#{website_root}#{link['href']}"
else
link['href']
end
end
end

def document_anchors
Expand All @@ -22,7 +31,7 @@ def processed_govspeak
doc = Nokogiri::HTML::Document.new
doc.encoding = "UTF-8"

doc.fragment(@document.to_html)
doc.fragment(document.to_html)
end
end
end
9 changes: 8 additions & 1 deletion test/govspeak_link_extractor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ def document_body
[not_a_link](#somepage)
[mailto:](mailto:someone@www.example.com)
[absolute_path](/cais-trwydded-yrru-dros-dro)
}
end

Expand All @@ -24,7 +26,7 @@ def links
end

test "Links are extracted from the body" do
expected_links = ["http://www.example.com", "http://www.gov.com"]
expected_links = %w{http://www.example.com http://www.gov.com /cais-trwydded-yrru-dros-dro}
assert_equal expected_links, links
end

Expand All @@ -39,4 +41,9 @@ def links
test "Links are not extracted if they begin with mailto:" do
refute_includes ["mailto:someone@www.example.com"], links
end

test "Absolute links are transformed to a url when website_root passed in" do
urls = doc.extracted_links(website_root: "http://www.example.com")
assert urls.include?("http://www.example.com/cais-trwydded-yrru-dros-dro")
end
end

0 comments on commit 8a5227c

Please sign in to comment.