Skip to content

Commit

Permalink
Merge branch 'master' into mp/1.7.0
Browse files Browse the repository at this point in the history
  • Loading branch information
mortenpi committed Sep 4, 2024
2 parents 6440b28 + 7d96559 commit 45e89ac
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 32 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

* The `User-Agent` header set in the linkcheck HTTP(S) requests can now be customized with the `linkcheck_useragent` option to `makedocs`. ([#2557], [#2562])
* The `User-Agent` header set in the linkcheck HTTP(S) requests can now be customized with the `linkcheck_useragent` option to `makedocs`. ([#2557], [#2562], [#2571])
* Admonitions with category `todo` are now colored purple. Previously they were default-colored like all other unknown admonition categories. ([#2526])
* A `checkdocs_ignored_modules` keyword argument to `makedocs(...)`, which prevents `checkdocs` from warning about missing documentation in certain modules. ([#2233])

Expand Down Expand Up @@ -1898,6 +1898,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
[#2561]: https://github.com/JuliaDocs/Documenter.jl/issues/2561
[#2562]: https://github.com/JuliaDocs/Documenter.jl/issues/2562
[#2569]: https://github.com/JuliaDocs/Documenter.jl/issues/2569
[#2571]: https://github.com/JuliaDocs/Documenter.jl/issues/2571
[JuliaLang/julia#36953]: https://github.com/JuliaLang/julia/issues/36953
[JuliaLang/julia#38054]: https://github.com/JuliaLang/julia/issues/38054
[JuliaLang/julia#39841]: https://github.com/JuliaLang/julia/issues/39841
Expand Down
54 changes: 28 additions & 26 deletions src/docchecks.jl
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,6 @@ function linkcheck(node::MarkdownAST.Node, element::MarkdownAST.AbstractElement,
return nothing
end

const _LINKCHECK_DEFAULT_USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"

function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document; method::Symbol=:HEAD)

# first, make sure we're not supposed to ignore this link
Expand All @@ -204,29 +202,7 @@ function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document
end

if !haskey(doc.internal.locallinks, link)
timeout = doc.user.linkcheck_timeout
useragent = doc.user.linkcheck_useragent
null_file = @static Sys.iswindows() ? "nul" : "/dev/null"
# In some cases, web servers (e.g. docs.github.com as of 2022) will reject requests
# that declare a non-browser user agent (curl specifically passes 'curl/X.Y'). In
# case of docs.github.com, the server returns a 403 with a page saying "The request
# is blocked". However, spoofing a realistic browser User-Agent string is enough to
# get around this, and so here we simply pass the example Chrome UA string from the
# Mozilla developer docs, but only if it's an HTTP(S) request.
#
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent#chrome_ua_string
fakebrowser = if startswith(uppercase(link.destination), "HTTP")
headers = [
"-H",
"accept-encoding: gzip, deflate, br",
]
if !isempty(useragent)
push!(headers, "--user-agent", useragent)
end
else
""
end
cmd = `curl $(method === :HEAD ? "-sI" : "-s") --proto =http,https,ftp,ftps $(fakebrowser) $(link.destination) --max-time $timeout -o $null_file --write-out "%{http_code} %{url_effective} %{redirect_url}"`
cmd = _linkcheck_curl(method, link.destination; timeout=doc.user.linkcheck_timeout, useragent=doc.user.linkcheck_useragent)

local result
try
Expand Down Expand Up @@ -279,10 +255,36 @@ function linkcheck(node::MarkdownAST.Node, docs_node::Documenter.DocsNode, doc::
end
end


# Test whether `url` matches the pattern `r`:
#   * a `String` pattern matches only when it is equal (`==`) to `url`;
#   * a `Regex` pattern matches when it occurs anywhere in `url` (`occursin`).
# NOTE(review): presumably `r` comes from the `linkcheck_ignore` list — confirm at the call site.
linkcheck_ismatch(r::String, url) = (url == r)
linkcheck_ismatch(r::Regex, url) = occursin(r, url)

# Browser-like (Chrome on Linux) User-Agent string. It is the default value of the
# `linkcheck_useragent` keyword of `Document`.
const _LINKCHECK_DEFAULT_USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"

# Build (but do not run) the `curl` command used to check a single link.
#
# Arguments:
#   * `method`: `:HEAD` issues a silent HEAD request (`-sI`); any other symbol issues a
#     silent request without `-I` (`-s`).
#   * `url`: the link destination to probe.
#
# Keywords:
#   * `timeout`: passed to curl as `--max-time`.
#   * `useragent`: value for curl's `--user-agent`, or `nothing` to not pass the option at
#     all (curl then uses its own default). Only applied to HTTP(S) URLs.
#
# Returns a `Cmd` that discards the response body and writes
# "<http_code> <url_effective> <redirect_url>" to standard output.
function _linkcheck_curl(
        method::Symbol, url::AbstractString;
        timeout::Real, useragent::Union{AbstractString, Nothing}
    )
    # Platform-specific sink for the (unneeded) response body.
    sink = @static Sys.iswindows() ? "nul" : "/dev/null"
    mode_flag = method === :HEAD ? "-sI" : "-s"
    # Some web servers (e.g. docs.github.com as of 2022) reject requests that declare a
    # non-browser user agent (curl sends 'curl/X.Y' by default); docs.github.com answers
    # with a 403 page saying "The request is blocked". Presenting a realistic browser
    # User-Agent is enough to get around this, so browser-like options are added here,
    # but only if it's an HTTP(S) request.
    #
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent#chrome_ua_string
    browser_args = String[]
    if startswith(uppercase(url), "HTTP")
        push!(browser_args, "-H", "accept-encoding: gzip, deflate, br")
        useragent === nothing || push!(browser_args, "--user-agent", useragent)
    end
    # An empty `browser_args` interpolates to zero arguments.
    return `curl $mode_flag --proto =http,https,ftp,ftps $browser_args $url --max-time $timeout -o $sink --write-out "%{http_code} %{url_effective} %{redirect_url}"`
end

# Automatic Pkg.add() GitHub remote check
# ---------------------------------------

Expand Down
4 changes: 2 additions & 2 deletions src/documents.jl
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ struct User
linkcheck::Bool # Check external links..
linkcheck_ignore::Vector{Union{String,Regex}} # ..and then ignore (some of) them.
linkcheck_timeout::Real # ..but only wait this many seconds for each one.
linkcheck_useragent::String # User agent to use for linkchecks.
linkcheck_useragent::Union{String, Nothing} # User agent to use for linkchecks.
checkdocs::Symbol # Check objects missing from `@docs` blocks. `:none`, `:exports`, or `:all`.
checkdocs_ignored_modules::Vector{Module} # ..and then ignore (some of) them.
doctestfilters::Vector{Regex} # Filtering for doctests
Expand Down Expand Up @@ -387,7 +387,7 @@ function Document(;
linkcheck:: Bool = false,
linkcheck_ignore :: Vector = [],
linkcheck_timeout :: Real = 10,
linkcheck_useragent :: String= _LINKCHECK_DEFAULT_USERAGENT,
linkcheck_useragent :: Union{AbstractString, Nothing} = _LINKCHECK_DEFAULT_USERAGENT,
checkdocs::Symbol = :all,
checkdocs_ignored_modules::Vector{Module} = Module[],
doctestfilters::Vector{Regex}= Regex[],
Expand Down
8 changes: 6 additions & 2 deletions src/makedocs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,13 +201,17 @@ ignored.
return a response before giving up. The default is 10 seconds.
**`linkcheck_useragent`** can be used to override the user agent string used by the HTTP and
HTTPS requests made when checking for broken links. Currently, the default user agent is
HTTPS requests made when checking for broken links. If set to `nothing`, it uses the default
user agent string of the library/tool used to actually perform the requests (currently, the
system's `curl` binary).
If unset, Documenter uses the following user agent string:
```
$(_LINKCHECK_DEFAULT_USERAGENT)
```
which is set to mimic a realistic web browser. However, the exact user agent string is subject
This is set to mimic a realistic web browser. However, the exact user agent string is subject
to change. As such, it is possible that breakages can occur when Documenter's version changes,
but the goal is to set the user agent such that it would be accepted by as many web servers as
possible.
Expand Down
22 changes: 21 additions & 1 deletion test/online_linkcheck.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ using Test
[FTP (no proto) success](ftp.iana.org/tz/data/etcetera)
[Redirect success](google.com)
[HEAD fail GET success](https://codecov.io/gh/invenia/LibPQ.jl)
[Linkcheck old Chrome UA fail](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html)
"""
)
doc = Documenter.Document(; linkcheck=true, linkcheck_timeout=20)
Expand All @@ -25,6 +24,27 @@ using Test
@test doc.internal.errors == Set{Symbol}()
end

# Online test: checks a single link twice, first with the default
# `linkcheck_useragent` and then with `linkcheck_useragent=nothing`.
# NOTE(review): the testset is named "Empty User-Agent" but exercises `nothing`,
# not an empty string — confirm the name is intentional.
@testset "Empty User-Agent" begin
src = convert(
MarkdownAST.Node,
md"""
[Linkcheck Empty UA](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html)
"""
)

# With the default user agent the check is expected to fail (the Intel servers block
# it): an error-level log is emitted and `:linkcheck` is recorded in the document's errors.
doc = Documenter.Document(; linkcheck=true, linkcheck_timeout=20)
doc.blueprint.pages["testpage"] = Documenter.Page("", "", "", [], Documenter.Globals(), src)
@test_logs (:error,) @test linkcheck(doc) === nothing
@test doc.internal.errors == Set{Symbol}([:linkcheck])

# Workaround: passing `linkcheck_useragent=nothing` falls back to curl's own default
# user agent, and the same link is then expected to pass with no errors recorded.
doc = Documenter.Document(; linkcheck=true, linkcheck_timeout=20, linkcheck_useragent=nothing)
doc.blueprint.pages["testpage"] = Documenter.Page("", "", "", [], Documenter.Globals(), src)
@test linkcheck(doc) === nothing
@test doc.internal.errors == Set{Symbol}()
end

@testset "Failures" begin
src = convert(MarkdownAST.Node, Markdown.parse("[FILE failure](file://$(@__FILE__))"))
doc = Documenter.Document(; linkcheck=true)
Expand Down

0 comments on commit 45e89ac

Please sign in to comment.