Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure URL fragments to named anchors also validate #363

Merged
merged 2 commits into from
Sep 15, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ lazy val core = project
name := "paradox",
libraryDependencies ++= Library.pegdown,
libraryDependencies ++= Seq(
Library.st4
Library.st4,
Library.jsoup
),
parallelExecution in Test := false
)
Expand Down
90 changes: 55 additions & 35 deletions core/src/main/scala/com/lightbend/paradox/ParadoxProcessor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ import com.lightbend.paradox.template.PageTemplate
import com.lightbend.paradox.markdown._
import com.lightbend.paradox.tree.Tree.{ Forest, Location }
import java.io.{ File, FileOutputStream, OutputStreamWriter }
import java.net.{ HttpURLConnection, URI }
import java.nio.charset.StandardCharsets

import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.pegdown.ast._

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.io.Source
import scala.util.control.NonFatal
import scala.util.matching.Regex

Expand Down Expand Up @@ -137,49 +137,65 @@ class ParadoxProcessor(reader: Reader = new Reader, writer: Writer = new Writer)
validate(Some(root.location))
}

def ignore(path: String) = ignorePaths.exists(_.pattern.matcher(path).matches())

linkCapturer.allLinks.foreach {
case CapturedLinkWithSources(CapturedAbsoluteLink(uri), sources) if validateAbsolute && !ignore(uri.toString) =>
validateExternalLink(uri, reportErrorOnSources(errorCollector, sources), logger)
case CapturedLinkWithSources(CapturedRelativeLink(path, fragment), sources) if !ignore(path) =>
fullSite.get(path) match {
case Some(file) =>
fragment.foreach { f =>
validateFragment(path, Source.fromFile(file, "UTF-8").mkString, f, reportErrorOnSources(errorCollector, sources))
}
case None =>
reportErrorOnSources(errorCollector, sources)(s"Could not find path [$path] in site")
}
case _ =>
// Ignore
}
linkCapturer.allLinks
.filterNot(l => ignorePaths.exists(_.pattern.matcher(l.link.toString).matches()))
.foreach {
case c @ CapturedLink(uri, fragments) if c.isInternal =>
fullSite.get(uri.getPath) match {
case Some(file) =>
if (c.hasFragments) {
validateFragments(uri.getPath, Jsoup.parse(file, "UTF-8"), fragments, errorCollector)
}
case None =>
reportErrorOnSources(errorCollector, c.allSources)(s"Could not find path [${uri.getPath}] in site")
}
case absolute if validateAbsolute =>
validateExternalLink(absolute, errorCollector, logger)
case _ =>
// Ignore
}

errorCollector.logErrors(logger)
errorCollector.errorCount
}

private def validateExternalLink(uri: URI, reportError: String => Unit, logger: ParadoxLogger) = {
logger.info(s"Validating external link: $uri")
val conn = uri.toURL.openConnection().asInstanceOf[HttpURLConnection]
conn.addRequestProperty("User-Agent", "Paradox Link Validator <https://github.com/lightbend/paradox>")
private def validateExternalLink(capturedLink: CapturedLink, errorContext: ErrorContext, logger: ParadoxLogger) = {
logger.info(s"Validating external link: ${capturedLink.link}")

def reportError = reportErrorOnSources(errorContext, capturedLink.allSources)(_)
val url = capturedLink.link.toString

try {
if (conn.getResponseCode / 100 == 3) {
reportError(s"Received a ${conn.getResponseCode} redirect on external link, location redirected to is [${conn.getHeaderField("Location")}]")
} else if (conn.getResponseCode != 200) {
reportError(s"Error validating external link, status code was ${conn.getResponseCode}")
val response = Jsoup.connect(url)
.userAgent("Paradox Link Validator <https://github.com/lightbend/paradox>")
.followRedirects(false)
.ignoreHttpErrors(true)
.ignoreContentType(true)
.execute()

// jsoup doesn't offer any simple way to clean up, the only way to close is to get the body stream and close it,
// but if you've already read the response body, that will throw an exception, and there's no way to check if
// you've already tried to read the response body, so we can't do that in a finally block, we have to do it
// explicitly every time we don't want to consume the stream.
def close() = response.bodyStream().close()

if (response.statusCode() / 100 == 3) {
close()
reportError(s"Received a ${response.statusCode()} ${response.statusMessage()} on external link, location redirected to is [${response.header("Location")}]")
} else if (response.statusCode() != 200) {
close()
reportError(s"Error validating external link, status was ${response.statusCode()} ${response.statusMessage()}")
} else {
if (uri.getFragment != null) {
val content = Source.fromInputStream(conn.getInputStream).mkString
validateFragment(uri.toString, content, uri.getFragment, reportError)
if (capturedLink.hasFragments) {
validateFragments(url, response.parse(), capturedLink.fragments, errorContext)
} else {
close()
}
}
} catch {
case NonFatal(e) =>
reportError(s"Exception occurred when validating external link: $e")
logger.debug(e)
} finally {
conn.disconnect()
}
}

Expand All @@ -189,9 +205,13 @@ class ParadoxProcessor(reader: Reader = new Reader, writer: Writer = new Writer)
}
}

private def validateFragment(path: String, content: String, fragment: String, reportError: String => Unit) = {
if (!content.contains("id=\"" + fragment + "\"")) {
reportError(s"Could not find anchor id [$fragment] in page [$path]")
private def validateFragments(path: String, content: Document, fragments: List[CapturedLinkFragment], errorContext: ErrorContext): Unit = {
fragments.foreach {
case CapturedLinkFragment(Some(fragment), sources) =>
if (content.getElementById(fragment) == null && content.select(s"a[name=$fragment]").isEmpty) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't sure what the best way to lookup named a tags was. Could probably lookup all a tags, and then iterate through to see if any have a name of fragment. This is simpler, though with the possibility of escaping issues, for example, it definitely won't work for fragments with ] in them, there may also be other characters that will cause a problem. People don't usually put special characters that won't work in names since they're meant to appear in URLs, and URLs with percent encoding looks ugly, so it may just not be an issue. I think if we find any actual problems with this, we can switch to something that searches for it better.

reportErrorOnSources(errorContext, sources)(s"Could not find anchor [$fragment] in page [$path]")
}
case _ =>
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ class LinkCapturer {
}
}

def allLinks: List[CapturedLinkWithSources] = {
def allLinks: List[CapturedLink] = {
// First, resolve the links, discarding links that we can't resolve.
links.collect {

case (page, node, uri) if isPageInSite(page, uri) =>
case Link(page, node, uri, fragment) if isPageInSite(page, uri) =>
val path = node match {
// Javadoc links may use the frames style, and may reference index.html, if so, need to drop it.
case d: DirectiveNode if d.name == "javadoc" && uri.getQuery != null =>
Expand All @@ -107,19 +107,22 @@ class LinkCapturer {
val pathWithIndex = if (path.endsWith("/")) uri.getPath + "index.html"
else path

val relativePath = URI.create(page.path).resolve(pathWithIndex).getPath

(CapturedRelativeLink(relativePath, Option(uri.getFragment)), page, node)

case (page, node, uri) if uri.getAuthority != null =>
(CapturedAbsoluteLink(uri), page, node)

}.groupBy(_._1).map {
case (link, links) =>
CapturedLinkWithSources(link, links.map {
case (_, page, node) => (page.file, node)
})
}.toList
val resolvedUri = URI.create(page.path).resolve(pathWithIndex)
Link(page, node, resolvedUri, fragment)

case link @ Link(_, _, uri, _) if uri.getAuthority != null => link
}.groupBy(_.uri)
.toList
.map {
case (uri, links) =>
val fragments = links.groupBy(_.fragment)
.toList
.map {
case (fragment, links) =>
CapturedLinkFragment(fragment, links.map(l => (l.page.file, l.node)))
}
CapturedLink(uri, fragments)
}
}

private def isPageInSite(page: Page, uri: URI): Boolean = {
Expand All @@ -136,18 +139,29 @@ class LinkCapturer {
}

private var nodeOverride: Option[Node] = None
private var links: List[(Page, Node, URI)] = Nil

private case class Link(page: Page, node: Node, uri: URI, fragment: Option[String])

private var links: List[Link] = Nil

def capture(page: Page, node: Node, rendering: LinkRenderer.Rendering): LinkRenderer.Rendering = {
links = (page, nodeOverride.getOrElse(node), URI.create(rendering.href)) :: links
val fullUri = URI.create(rendering.href)
val (uri, fragment) = if (fullUri.getFragment == null) (fullUri, None)
else (new URI(fullUri.getScheme, fullUri.getAuthority, fullUri.getPath, fullUri.getQuery, null), Some(fullUri.getFragment))
links = Link(page, nodeOverride.getOrElse(node), uri, fragment) :: links
rendering
}
}

case class CapturedLinkWithSources(link: CapturedLink, sources: List[(File, Node)])
sealed trait CapturedLink
case class CapturedAbsoluteLink(link: URI) extends CapturedLink
case class CapturedRelativeLink(path: String, fragment: Option[String]) extends CapturedLink
case class CapturedLink(link: URI, fragments: List[CapturedLinkFragment]) {
def allSources = fragments.flatMap(_.sources)

def isInternal = link.getAuthority == null && link.getPath != null

def hasFragments = fragments.size > 1 || fragments.headOption.flatMap(_.fragment).nonEmpty
}

case class CapturedLinkFragment(fragment: Option[String], sources: List[(File, Node)])

private class LinkCapturerRenderer(capturer: LinkCapturer, renderer: LinkRenderer, page: Page) extends LinkRenderer {
private def capture(node: Node, rendering: LinkRenderer.Rendering) = capturer.capture(page, node, rendering)
Expand Down
8 changes: 8 additions & 0 deletions plugin/src/sbt-test/paradox/validation/build.sbt
Original file line number Diff line number Diff line change
@@ -1,2 +1,10 @@
lazy val root = (project in file("."))
.enablePlugins(ParadoxPlugin)

lazy val additionalInternalMappings = settingKey[List[(File, String)]]("Additional mappings to add to paradox mappings")
lazy val additionalMappings = settingKey[List[(File, String)]]("Additional mappings to add to paradox mappings")

additionalInternalMappings := Nil
additionalMappings := Nil
(Compile / paradox / mappings) ++= additionalInternalMappings.value
(Compile / paradoxValidateInternalLinks / mappings) ++= additionalMappings.value
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
[docs site path link](test.html#frag)
[docs site path link name frag](test.html#namefrag)

[outside docs path link](../outside.html#frag)

https://www.lightbend.com/thispagedoesnotexist
This URL should 404 (or otherwise fail)
https://github.com/lightbend/thisrepodoesnotexist
This URL should 301 (or otherwise fail)
https://lightbend.com

Including this in build means we depend on github.com being up. Probably ok.
https://github.com/
36 changes: 20 additions & 16 deletions plugin/src/sbt-test/paradox/validation/test
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
-> paradoxValidateInternalLinks

# Add the path, but without the fragment, so validation still fails
> set (Compile / paradox / mappings) += (file("withoutfragment.html") -> ("test.html"))
> set additionalInternalMappings := List(file("withoutfragment.html") -> ("test.html"))
-> paradoxValidateInternalLinks

# Now add the path with the fragment, validation should succeed
> reload
> set (Compile / paradox / mappings) += (file("withfragment.html") -> ("test.html"))
> set additionalInternalMappings := List(file("withfragment.html") -> ("test.html"))
> paradoxValidateInternalLinks

# Now set a base path of /, validation should fail due to /../outside.html not existing
Expand All @@ -22,27 +21,32 @@
-> paradoxValidateInternalLinks

# Now add an /outside.html, but without a fragment, validation should fail
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withoutfragment.html") -> ("/outside.html"))
> set additionalMappings := List(file("withoutfragment.html") -> ("/outside.html"))
-> paradoxValidateInternalLinks

# Now add an /outside.html, with a fragment, validation should succeed
> reload
> set paradoxValidationSiteBasePath := Some("/docs")
> set (Compile / paradox / mappings) += (file("withfragment.html") -> ("test.html"))
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/outside.html"))
> set additionalMappings := List(file("withfragment.html") -> ("/outside.html"))
> paradoxValidateInternalLinks

# Also try setting the internal link with an absolute path
> reload
> set paradoxValidationSiteBasePath := Some("/docs")
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/docs/test.html"))
> set (Compile / paradoxValidateInternalLinks / mappings) += (file("withfragment.html") -> ("/outside.html"))
> set additionalInternalMappings := Nil
> set additionalMappings += (file("withfragment.html") -> ("/docs/test.html"))
> paradoxValidateInternalLinks

# Finally, try validating external links, first check that it fails
-> paradoxValidateLinks
# Now ignore the link that fails
> set paradoxValidationIgnorePaths += ("https://www.lightbend.com.*").r
# The following test will fail if GitHub is down. That's fairly unlikely if we're running a CI build, since CI
# depends on GitHub being up to run in the first place, and also being up to report the status of the build, etc.

# Now make sure it fails when just the 404ing link is ignored
> set paradoxValidationIgnorePaths := List(".*thisrepodoesnotexist.*".r)
-> paradoxValidateLinks

# Now make sure it fails when just the 401ing link is ignored
> set paradoxValidationIgnorePaths := List("https://lightbend.com".r)
-> paradoxValidateLinks

# Now ignore both failing links, and it should pass. Note though that passing depends on GitHub being up,
# since there is a valid link that points to GitHub. If we're running in CI, GitHub being up is an alright
# assumption to make, given that GitHub triggered the build, and will have to be up to receive the status
# report
> set paradoxValidationIgnorePaths := List("https://lightbend.com".r, (".*thisrepodoesnotexist.*").r)
> paradoxValidateLinks
1 change: 1 addition & 0 deletions plugin/src/sbt-test/paradox/validation/withfragment.html
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<html>
<body>
<a id="frag"></a>
<a name="namefrag"></a>
</body>
</html>
2 changes: 2 additions & 0 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ object Version {
val sbtWeb = "1.4.4"
val scalatest = "3.0.8"
val st4 = "4.1"
val jsoup = "1.12.1"
}

object Library {
Expand All @@ -38,4 +39,5 @@ object Library {
val sbtWeb = "com.typesafe.sbt" % "sbt-web" % Version.sbtWeb
val scalatest = "org.scalatest" %% "scalatest" % Version.scalatest
val st4 = "org.antlr" % "ST4" % Version.st4
val jsoup = "org.jsoup" % "jsoup" % Version.jsoup
}
Loading