From a3c736dda10d9a70ed93f71d787a41301d09694b Mon Sep 17 00:00:00 2001 From: Romain Deltour Date: Thu, 17 Nov 2022 01:58:56 +0100 Subject: [PATCH] feat: extend proper URL checking to more places (like CSS) This commit: - extracts the URL checking logic from `BaseURLHandler` (SAX handler) to its own independent class `URLChecker` - applies the same URL-checking logic to CSS as was used in XML, using the new `URLChecker` - applies the URL-checking logic to the few places were it wasn't the case already (in XML handlers) --- .../com/adobe/epubcheck/css/CSSHandler.java | 39 ++---- .../ocf/OCFContainerFileHandler.java | 26 +--- .../com/adobe/epubcheck/opf/OPFHandler.java | 16 +-- .../com/adobe/epubcheck/opf/OPFHandler30.java | 27 ++-- .../xml/handlers/BaseURLHandler.java | 130 ++---------------- .../org/w3c/epubcheck/url/URLChecker.java | 130 ++++++++++++++++++ .../java/org/w3c/epubcheck/url/URLUtils.java | 1 - 7 files changed, 171 insertions(+), 198 deletions(-) create mode 100644 src/main/java/org/w3c/epubcheck/url/URLChecker.java diff --git a/src/main/java/com/adobe/epubcheck/css/CSSHandler.java b/src/main/java/com/adobe/epubcheck/css/CSSHandler.java index 4b4ad4594..80810a178 100644 --- a/src/main/java/com/adobe/epubcheck/css/CSSHandler.java +++ b/src/main/java/com/adobe/epubcheck/css/CSSHandler.java @@ -1,7 +1,9 @@ package com.adobe.epubcheck.css; import java.util.EnumSet; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -17,6 +19,7 @@ import org.idpf.epubcheck.util.css.CssGrammar.CssSelector; import org.idpf.epubcheck.util.css.CssGrammar.CssURI; import org.idpf.epubcheck.util.css.CssLocation; +import org.w3c.epubcheck.url.URLChecker; import com.adobe.epubcheck.api.EPUBLocation; import com.adobe.epubcheck.api.Report; @@ -34,7 +37,6 @@ import com.google.common.base.CharMatcher; import com.google.common.collect.Sets; -import io.mola.galimatias.GalimatiasParseException; import io.mola.galimatias.URL; public class CSSHandler implements CssContentHandler, CssErrorHandler @@ -47,6 +49,10 @@ public class CSSHandler implements CssContentHandler, CssErrorHandler int startingColumnNumber = 0; static final CharMatcher SPACE_AND_QUOTES = CharMatcher.anyOf(" \t\n\r\f\"'").precomputed(); + // map to store parsed URLs + Map parsedURLs = new HashMap<>(); + final URLChecker urlChecker; + // vars for font-face info String fontFamily; String fontStyle; @@ -66,6 +72,7 @@ public CSSHandler(ValidationContext context) this.xrefChecker = context.xrefChecker.orNull(); this.report = context.report; this.version = context.version; + this.urlChecker = new URLChecker(context); } private EPUBLocation getCorrectedEPUBLocation(int lineNumber, int columnNumber, String details) @@ -314,20 +321,7 @@ else if (propertyName.equals("src")) { if (construct.getType() == CssConstruct.Type.URI) { - fontURI = ((CssURI) construct).toUriString(); - - // TODO implement more URL checks (like in BaseURLHandler) - URL fontURL = null; - try - { - fontURL = context.url.resolve(fontURI); - } catch (GalimatiasParseException e) - { - report.message(MessageId.RSC_020, - getCorrectedEPUBLocation(declaration.getLocation().getLine(), - declaration.getLocation().getColumn(), declaration.toCssString()), - fontURI, e.getLocalizedMessage()); - } + URL fontURL = parsedURLs.get(((CssURI) construct).toUriString()); if (fontURL != null) { // check font mimetypes @@ -348,7 +342,7 @@ else if (version == EPUBVersion.VERSION_3) report.message(MessageId.CSS_007, getCorrectedEPUBLocation(declaration.getLocation().getLine(), declaration.getLocation().getColumn(), declaration.toCssString()), - fontURI, fontMimeType); + fontURL, fontMimeType); } } @@ -388,17 +382,10 @@ private void resolveAndRegister(String uriString, int line, int col, String cssC // we ignore this case if (!uriString.startsWith("#")) { + // Check the URL once and store the parsed URL for later reference + URL url = urlChecker.checkURL(uriString, getCorrectedEPUBLocation(line, col, cssContext)); + parsedURLs.put(uriString, url); - // TODO implement more URL checks (like in BaseURLHandler) - URL url = null; - try - { - url = context.url.resolve(uriString); - } catch (GalimatiasParseException e) - { - report.message(MessageId.RSC_020, getCorrectedEPUBLocation(line, col, cssContext), - uriString, e.getLocalizedMessage()); - } if (url != null) { xrefChecker.registerReference(url, type, getCorrectedEPUBLocation(line, col, cssContext)); diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFContainerFileHandler.java b/src/main/java/com/adobe/epubcheck/ocf/OCFContainerFileHandler.java index 9105b2ef5..170b29aa7 100755 --- a/src/main/java/com/adobe/epubcheck/ocf/OCFContainerFileHandler.java +++ b/src/main/java/com/adobe/epubcheck/ocf/OCFContainerFileHandler.java @@ -82,19 +82,12 @@ else if (fullPath.trim().isEmpty()) return; } - try + // Parse the rootfile URL + URL rootfileURL = checkURL(fullPath); + if (rootfileURL != null) { - // Parse the rootfile URL - URL rootfileURL = URL.parse(baseURL(), fullPath); - // Register the parsed rootfile entry to the data model state.addRootfile(mediaType, rootfileURL); - - } catch (GalimatiasParseException e) - { - // FIXME 2022 - test this is reported - report.message(MessageId.RSC_020, location(), fullPath); - return; } } @@ -107,19 +100,14 @@ private void processMappingDoc() && !Strings.nullToEmpty(href).trim().isEmpty()) { - try + // Parse the href attribute against the container root URL + URL mappingDocURL = checkURL(href); + if (mappingDocURL != null) { - // Parse the href attribute against the container root URL - URL mappingDocURL = URL.parse(baseURL(), href); - // Register the parsed mapping document entry to the data model state.addMappingDocument(mappingDocURL); - } catch (GalimatiasParseException e) - { - // FIXME 2022 - test this is reported - report.message(MessageId.RSC_020, location(), href); - return; } + } } diff --git a/src/main/java/com/adobe/epubcheck/opf/OPFHandler.java b/src/main/java/com/adobe/epubcheck/opf/OPFHandler.java index a44977df2..12e24b1ce 100755 --- a/src/main/java/com/adobe/epubcheck/opf/OPFHandler.java +++ b/src/main/java/com/adobe/epubcheck/opf/OPFHandler.java @@ -46,7 +46,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.Maps; -import io.mola.galimatias.GalimatiasParseException; import io.mola.galimatias.URL; public class OPFHandler extends XMLHandler @@ -304,22 +303,11 @@ else if (name.equals("reference")) String href = e.getAttribute("href"); if (href != null && context.xrefChecker.isPresent()) { - - // FIXME next test URL string is conforming, better test remote URLs - if (href.matches("^[^:/?#]+://.*")) - { + URL url = checkURL(href); + if (context.isRemote(url)) { report.info(path, FeatureEnum.REFERENCE, href); } - URL url; - try - { - url = baseURL().resolve(href); - } catch (GalimatiasParseException e1) - { - report.message(MessageId.RSC_020, location(), href); - return; - } try { context.xrefChecker.get().registerReference(url, XRefChecker.Type.GENERIC, location()); diff --git a/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java b/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java index c717a0541..86f004c59 100644 --- a/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java +++ b/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java @@ -47,8 +47,6 @@ import static com.adobe.epubcheck.vocab.PackageVocabs.META_VOCAB; import static com.adobe.epubcheck.vocab.PackageVocabs.META_VOCAB_URI; -import java.net.URI; -import java.net.URISyntaxException; import java.util.Deque; import java.util.IllformedLocaleException; import java.util.List; @@ -56,6 +54,8 @@ import java.util.Map; import java.util.Set; +import org.w3c.epubcheck.url.URLUtils; + import com.adobe.epubcheck.api.EPUBLocation; import com.adobe.epubcheck.api.QuietReport; import com.adobe.epubcheck.messages.MessageId; @@ -403,14 +403,14 @@ private List processCollectionRole(String roleAtt) ImmutableList.Builder rolesBuilder = ImmutableList.builder(); for (String role : TOKENIZER.split(Strings.nullToEmpty(roleAtt))) { - if (role.matches("^[^:/?#]+://.*")) + if (URLUtils.isAbsoluteURLString(role)) { // Role is an absolute IRI // check that the host component doesn't contain 'idpf.org' try { - URI uri = new URI(role); - if (uri.getHost() != null && uri.getHost().contains("idpf.org")) + URL url = URL.parse(role); + if (url.authority() != null && url.authority().contains("idpf.org")) { report.message(MessageId.OPF_069, location(), role); } @@ -418,7 +418,7 @@ private List processCollectionRole(String roleAtt) { rolesBuilder.add(role); } - } catch (URISyntaxException e) + } catch (GalimatiasParseException e) { report.message(MessageId.OPF_070, location(), role); } @@ -448,22 +448,11 @@ private void processLink() if (href != null) { // check by schema - // FIXME next test URL string is conforming, better test remote URLs - if (href.matches("^[^:/?#]+://.*")) - { + URL url = checkURL(href); + if (context.isRemote(url)) { report.info(path, FeatureEnum.REFERENCE, href); } - URL url; - try - { - url = baseURL().resolve(href); - } catch (GalimatiasParseException e1) - { - report.message(MessageId.RSC_020, location(), href); - return; - } - if (context.xrefChecker.isPresent()) { context.xrefChecker.get().registerReference(url, Type.LINK, location()); diff --git a/src/main/java/com/adobe/epubcheck/xml/handlers/BaseURLHandler.java b/src/main/java/com/adobe/epubcheck/xml/handlers/BaseURLHandler.java index 383b7d312..243702a56 100644 --- a/src/main/java/com/adobe/epubcheck/xml/handlers/BaseURLHandler.java +++ b/src/main/java/com/adobe/epubcheck/xml/handlers/BaseURLHandler.java @@ -1,36 +1,21 @@ package com.adobe.epubcheck.xml.handlers; -import java.net.URI; - import javax.xml.XMLConstants; +import org.w3c.epubcheck.url.URLChecker; import org.xml.sax.Attributes; -import com.adobe.epubcheck.api.Report; -import com.adobe.epubcheck.messages.MessageId; import com.adobe.epubcheck.opf.ValidationContext; import com.adobe.epubcheck.util.EpubConstants; import com.google.common.base.Preconditions; -import io.mola.galimatias.GalimatiasParseException; -import io.mola.galimatias.StrictErrorHandler; import io.mola.galimatias.URL; -import io.mola.galimatias.URLParsingSettings; public abstract class BaseURLHandler extends LocationHandler { - private static final String TEST_BASE_A_FULL = "https://a.example.org/A/"; - private static final String TEST_BASE_A_START = "https://a.example.org/"; - private static final URL TEST_BASE_A_URL = URL.fromJavaURI(URI.create(TEST_BASE_A_FULL)); - private static final String TEST_BASE_B_FULL = "https://b.example.org/B/"; - private static final String TEST_BASE_B_START = "https://b.example.org/"; - private static final URL TEST_BASE_B_URL = URL.fromJavaURI(URI.create(TEST_BASE_B_FULL)); private URL baseURL; - private URL baseURLTestA; - private URL baseURLTestB; - private boolean isRemoteBase; - private final Report report; + private final URLChecker urlChecker; public BaseURLHandler(ValidationContext context) { @@ -40,17 +25,8 @@ public BaseURLHandler(ValidationContext context) public BaseURLHandler(ValidationContext context, URL baseURL) { super(context); - this.report = context.report; - this.isRemoteBase = false; + this.urlChecker = new URLChecker(context, baseURL); this.baseURL = Preconditions.checkNotNull(baseURL); - try - { - this.baseURLTestA = TEST_BASE_A_URL.resolve(context.relativize(baseURL)); - this.baseURLTestB = TEST_BASE_B_URL.resolve(context.relativize(baseURL)); - } catch (GalimatiasParseException e) - { - throw new AssertionError(e); - } } protected final URL baseURL() @@ -61,111 +37,27 @@ protected final URL baseURL() @Override public void startElement(String uri, String localName, String qName, Attributes attributes) { + String newBase = null; // In HTML, `base` element sets a new base URL if (EpubConstants.HtmlNamespaceUri.equals(uri) && "base".equals(localName)) { - setBase(attributes.getValue("", "href")); + newBase = attributes.getValue("", "href"); } // In XML, `xml:base` attribute sets a new base URL else { - setBase(attributes.getValue(XMLConstants.XML_NS_URI, "base")); + newBase = attributes.getValue(XMLConstants.XML_NS_URI, "base"); } - - } - - private void setBase(String newBase) - { - URL newBaseURL = resolveURL(newBase, true); - if (newBaseURL != null) + // Update the base if a new one was found + if (newBase != null) { - baseURL = newBaseURL; + baseURL = urlChecker.setBase(newBase, location()); } - } - - protected final URL checkURL(String string) - { - URL url = resolveURL(string, false); - // FIXME 2022 can we check resource existence here instead of XRefChecker? - // if in a container, check that the resource exists, or return null - // try - // { - // if (url != null && context.container.isPresent() - // && !context.container.get().contains(url.withFragment(null))) - // { - // report.message(MessageId.RSC_007, location(), string); - // return null; - // } - // } catch (GalimatiasParseException e) - // { - // new AssertionError(); // setting null fragment shouldn't throw - // } - return url; } - private static final URLParsingSettings STRICT_PARSING_SETTINGS = URLParsingSettings.create() - .withErrorHandler(StrictErrorHandler.getInstance()); - - private URL resolveURL(String string, boolean isBase) + protected final URL checkURL(String string) { - // FIXME next report disallowed URL schemes (e.g. file URLs). - Preconditions.checkNotNull(baseURL); - if (string == null) return null; - try - { - // Collapse formatting whitespace in data URLs - if (string.startsWith("data:")) - { - string = string.replaceAll("\\s", ""); - } - URL url = URL.parse(baseURL, string); // non-strict parsing - - // Also try to parse the URL in strict mode, to report any invalid URL, - // but continue the processing if an exception is thrown - try - { - URL.parse(STRICT_PARSING_SETTINGS, baseURL, string); - } catch (GalimatiasParseException e) - { - report.message(MessageId.RSC_020, location(), string, e.getLocalizedMessage()); - } - - // if we are resolving a new base URL, also resolve the test URLs - // it will not throw any error if the earlier parsing has not already - URL testA = baseURLTestA.resolve(string); - URL testB = baseURLTestB.resolve(string); - if (isBase) - { - baseURLTestA = testA; - baseURLTestB = testB; - } - - // if the base URL is remote, no further checks are required - if (isRemoteBase) - { - return url; - } - // if URL is absolute, return it as-is - if (!testA.toString().startsWith(TEST_BASE_A_START) - || !testB.toString().startsWith(TEST_BASE_B_START)) - { - isRemoteBase = true; - return url; - } - // if relative URL "leaks" outside the container, report and continue - else if (!isBase && !testA.toString().startsWith(TEST_BASE_A_FULL) - || !testB.toString().startsWith(TEST_BASE_B_FULL)) - { - // FIXME !!! this is broken, base s/b taken into account - report.message(MessageId.RSC_026, location(), string); - } - return url; - } catch (GalimatiasParseException e) - { - // URL parsing error thrown during non-strict parsing - report.message(MessageId.RSC_020, location(), string, e.getLocalizedMessage()); - return null; - } + return urlChecker.checkURL(string, location()); } } diff --git a/src/main/java/org/w3c/epubcheck/url/URLChecker.java b/src/main/java/org/w3c/epubcheck/url/URLChecker.java new file mode 100644 index 000000000..fb98e984a --- /dev/null +++ b/src/main/java/org/w3c/epubcheck/url/URLChecker.java @@ -0,0 +1,130 @@ +package org.w3c.epubcheck.url; + +import java.net.URI; + +import com.adobe.epubcheck.api.EPUBLocation; +import com.adobe.epubcheck.api.Report; +import com.adobe.epubcheck.messages.MessageId; +import com.adobe.epubcheck.opf.ValidationContext; +import com.google.common.base.Preconditions; + +import io.mola.galimatias.GalimatiasParseException; +import io.mola.galimatias.StrictErrorHandler; +import io.mola.galimatias.URL; +import io.mola.galimatias.URLParsingSettings; + +public class URLChecker +{ + private static final String TEST_BASE_A_FULL = "https://a.example.org/A/"; + private static final String TEST_BASE_A_START = "https://a.example.org/"; + private static final URL TEST_BASE_A_URL = URL.fromJavaURI(URI.create(TEST_BASE_A_FULL)); + private static final String TEST_BASE_B_FULL = "https://b.example.org/B/"; + private static final String TEST_BASE_B_START = "https://b.example.org/"; + private static final URL TEST_BASE_B_URL = URL.fromJavaURI(URI.create(TEST_BASE_B_FULL)); + private static final URLParsingSettings STRICT_PARSING_SETTINGS = URLParsingSettings.create() + .withErrorHandler(StrictErrorHandler.getInstance()); + + private URL baseURL; + private URL baseURLTestA; + private URL baseURLTestB; + private boolean isRemoteBase; + private final Report report; + + public URLChecker(ValidationContext context) + { + this(Preconditions.checkNotNull(context), context.url); + } + + public URLChecker(ValidationContext context, URL baseURL) + { + this.report = Preconditions.checkNotNull(context).report; + this.baseURL = Preconditions.checkNotNull(baseURL); + this.isRemoteBase = false; + try + { + this.baseURLTestA = TEST_BASE_A_URL.resolve(context.relativize(baseURL)); + this.baseURLTestB = TEST_BASE_B_URL.resolve(context.relativize(baseURL)); + } catch (GalimatiasParseException e) + { + throw new AssertionError(e); + } + } + + public URL setBase(String newBase, EPUBLocation location) + { + URL newBaseURL = resolveURL(newBase, true, location); + if (newBaseURL != null) + { + baseURL = newBaseURL; + } + return baseURL; + } + + public URL checkURL(String string, EPUBLocation location) + { + URL url = resolveURL(string, false, location); + return url; + } + + private URL resolveURL(String string, boolean isBase, EPUBLocation location) + { + assert baseURL != null; + if (string == null) return null; + try + { + // Collapse formatting whitespace in data URLs + if (string.startsWith("data:")) + { + string = string.replaceAll("\\s", ""); + } + URL url = URL.parse(baseURL, string); // non-strict parsing + + // Also try to parse the URL in strict mode, to report any invalid URL, + // but continue the processing if an exception is thrown + try + { + URL.parse(STRICT_PARSING_SETTINGS, baseURL, string); + } catch (GalimatiasParseException e) + { + report.message(MessageId.RSC_020, location, string, e.getLocalizedMessage()); + } + + // if we are resolving a new base URL, also resolve the test URLs + // it will not throw any error if the earlier parsing has not already + URL testA = baseURLTestA.resolve(string); + URL testB = baseURLTestB.resolve(string); + if (isBase) + { + baseURLTestA = testA; + baseURLTestB = testB; + } + + // if the base URL is remote, no further checks are required + if (isRemoteBase) + { + return url; + } + // if URL is absolute, return it as-is + if (!testA.toString().startsWith(TEST_BASE_A_START) + || !testB.toString().startsWith(TEST_BASE_B_START)) + { + isRemoteBase = true; + return url; + } + // if relative URL "leaks" outside the container, report and continue + else if (!isBase && !testA.toString().startsWith(TEST_BASE_A_FULL) + || !testB.toString().startsWith(TEST_BASE_B_FULL)) + { + // FIXME !!! this is broken, base s/b taken into account + report.message(MessageId.RSC_026, location, string); + } + return url; + } catch (GalimatiasParseException e) + { + // URL parsing error thrown during non-strict parsing + report.message(MessageId.RSC_020, location, string, e.getLocalizedMessage()); + return null; + } + } + +} diff --git a/src/main/java/org/w3c/epubcheck/url/URLUtils.java b/src/main/java/org/w3c/epubcheck/url/URLUtils.java index 89a6cf033..9c11d8bb5 100644 --- a/src/main/java/org/w3c/epubcheck/url/URLUtils.java +++ b/src/main/java/org/w3c/epubcheck/url/URLUtils.java @@ -100,7 +100,6 @@ public static boolean isAbsoluteURLString(String string) { return false; } - throw new IllegalArgumentException("Could not parse URL", e); } return true; }