Skip to content

Commit

Permalink
Add improved handling of PDF hash links
Browse files Browse the repository at this point in the history
  • Loading branch information
gjtorikian committed Jan 20, 2023
1 parent 3aa7073 commit 1dbb0b8
Show file tree
Hide file tree
Showing 8 changed files with 201 additions and 11 deletions.
1 change: 1 addition & 0 deletions html-proofer.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
spec.add_dependency("addressable", "~> 2.3")
spec.add_dependency("async", "~> 2.1")
spec.add_dependency("nokogiri", "~> 1.13")
spec.add_dependency("pdf-reader", "~> 2.11")
spec.add_dependency("rainbow", "~> 3.0")
spec.add_dependency("typhoeus", "~> 1.3")
spec.add_dependency("yell", "~> 2.0")
Expand Down
12 changes: 8 additions & 4 deletions lib/html_proofer/attribute/url.rb
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,16 @@ def file_path
# either overwrite with root_dir; or, if source is directory, use that; or, just get the current file's dirname
@runner.options[:root_dir] || (File.directory?(@runner.current_source) ? @runner.current_source : File.dirname(@runner.current_source))
# relative links, path is a file
elsif File.exist?(File.expand_path(path,
@runner.current_source)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
elsif File.exist?(File.expand_path(
path,
@runner.current_source,
)) || File.exist?(File.expand_path(path_dot_ext, @runner.current_source))
File.dirname(@runner.current_filename)
# relative links in nested dir, path is a file
elsif File.exist?(File.join(File.dirname(@runner.current_filename),
path)) || File.exist?(File.join(File.dirname(@runner.current_filename), path_dot_ext))
elsif File.exist?(File.join(
File.dirname(@runner.current_filename),
path,
)) || File.exist?(File.join(File.dirname(@runner.current_filename), path_dot_ext))
File.dirname(@runner.current_filename)
# relative link, path is a directory
else
Expand Down
32 changes: 29 additions & 3 deletions lib/html_proofer/url_validator/external.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# frozen_string_literal: true

require "typhoeus"
require "uri"
require "open-uri"
# require "uri"
require "pdf-reader"

module HTMLProofer
class UrlValidator
Expand Down Expand Up @@ -88,8 +90,12 @@ def response_handler(response, url, filenames)
return if @runner.options[:ignore_status_codes].include?(response_code)

if response_code.between?(200, 299)
@cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(href, url,
response, filenames)
@cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(
href,
url,
response,
filenames,
)
elsif response.timed_out?
handle_timeout(href, filenames, response_code)
elsif response_code.zero?
Expand All @@ -116,6 +122,26 @@ def check_hash_in_2xx_response(href, url, response, filenames)

hash = url.hash

# attempt to verify PDF hash ref; see #787 for more details
# FIXME: this is re-reading the PDF response
if /pdf/.match?(response.options[:headers]["content-type"])
io = URI.parse(url.to_s).open
reader = PDF::Reader.new(io)

pages = reader.pages
if hash =~ /\Apage=(\d+)\z/
page = Regexp.last_match[1].to_i

unless pages[page - 1]
msg = "External link #{href} failed: #{url.sans_hash} exists, but the hash '#{hash}' does not"
add_failure(filenames, msg, response.code)
@cache.add_external(href, filenames, response.code, msg, false)
end

return true
end
end

body_doc = create_nokogiri(response.body)

unencoded_hash = Addressable::URI.unescape(hash)
Expand Down
2 changes: 2 additions & 0 deletions spec/html-proofer/fixtures/links/pdf_w_hash.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<!DOCTYPE html>
<a href='https://checkoway.net/teaching/cs210/2021-fall/slides/Lec19%20ClocksandTiming.pdf#page=25'>X</a>
2 changes: 2 additions & 0 deletions spec/html-proofer/fixtures/links/pdf_w_hash_broken.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<!DOCTYPE html>
<a href='https://checkoway.net/teaching/cs210/2021-fall/slides/Lec19%20ClocksandTiming.pdf#page=2111115'>X</a>

Large diffs are not rendered by default.

Large diffs are not rendered by default.

25 changes: 21 additions & 4 deletions spec/html-proofer/links_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -144,16 +144,22 @@

it "fails on redirects if not following" do
link_with_redirect_filepath = File.join(FIXTURES_DIR, "links", "link_with_redirect.html")
proofer = run_proofer(link_with_redirect_filepath, :file,
{ enforce_https: false, typhoeus: { followlocation: false } })
proofer = run_proofer(
link_with_redirect_filepath,
:file,
{ enforce_https: false, typhoeus: { followlocation: false } },
)
expect(proofer.failed_checks.first.description).to(match(/failed/))
end

it "does not fail on redirects we're not following" do
# this test should emit a 301--see above--but we're intentionally suppressing it
link_with_redirect_filepath = File.join(FIXTURES_DIR, "links", "link_with_redirect.html")
proofer = run_proofer(link_with_redirect_filepath, :file,
{ only_4xx: true, enforce_https: false, typhoeus: { followlocation: false } })
proofer = run_proofer(
link_with_redirect_filepath,
:file,
{ only_4xx: true, enforce_https: false, typhoeus: { followlocation: false } },
)
expect(proofer.failed_checks).to(eq([]))
end

Expand Down Expand Up @@ -750,4 +756,15 @@
expect(proofer.failed_checks.first.description).to(match(/internally linking to exists.pdf#page=2; the file exists, but the hash 'page=2' does not/))
expect(proofer.failed_checks.last.description).to(match(/internally linking to missing.pdf#page=2, which does not exist/))
end

it "tries reading PDFs with hashes" do
file = File.join(FIXTURES_DIR, "links", "pdf_w_hash.html")
proofer = run_proofer(file, :file)
expect(proofer.failed_checks.count).to(eq(0))

file = File.join(FIXTURES_DIR, "links", "pdf_w_hash_broken.html")
proofer = run_proofer(file, :file)
expect(proofer.failed_checks.count).to(eq(1))
expect(proofer.failed_checks.first.description).to(match(/ClocksandTiming.pdf exists, but the hash 'page=2111115' does not/))
end
end

0 comments on commit 1dbb0b8

Please sign in to comment.