Skip to content

Commit

Permalink
fix: report undefined entity references in EPUB 2.0.1
Browse files Browse the repository at this point in the history
References to undeclared entities were skipped in EPUB 2.0.1 parsing
(due to using a non-validating parser).
The declaration handler is now registered as a content handler to check
for skipped entities and report unknown ones.

Fix #1546
  • Loading branch information
rdeltour committed Dec 23, 2024
1 parent 8af39fc commit 4ad738c
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 8 deletions.
13 changes: 7 additions & 6 deletions src/main/java/com/adobe/epubcheck/xml/XMLParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@
public class XMLParser
{

private static final String SAXPROP_LEXICAL_HANDLER = "http://xml.org/sax/properties/lexical-handler";
private static final String SAXPROP_DECL_HANDLER = "http://xml.org/sax/properties/declaration-handler";
private final ValidationContext context;
private final Report report;
private final URL url;
Expand All @@ -76,9 +74,9 @@ public XMLParser(ValidationContext context)
{
factory.setNamespaceAware(true);
factory.setValidating(false);
factory.setFeature("http://xml.org/sax/features/validation", false);
factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
factory.setFeature("http://xml.org/sax/features/validation", false);
if (context.version == EPUBVersion.VERSION_3)
{
factory.setXIncludeAware(false);
Expand All @@ -93,9 +91,12 @@ public XMLParser(ValidationContext context)
handler.setEntityResolver(new DefaultResolver(context.version));

XMLReader reader = parser.getXMLReader();
DeclarationHandler docTypeHandler = new DeclarationHandler(context);
reader.setProperty(SAXPROP_LEXICAL_HANDLER, docTypeHandler);
reader.setProperty(SAXPROP_DECL_HANDLER, docTypeHandler);
DeclarationHandler docTypeHandler = new DeclarationHandler(context, parser);
reader.setProperty("http://xml.org/sax/properties/lexical-handler", docTypeHandler);
reader.setProperty("http://xml.org/sax/properties/declaration-handler", docTypeHandler);
// add the declaration handler as a content handler
// to check for skipped entities
handler.addContentHandler(docTypeHandler);

} catch (Exception e)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
import java.util.HashSet;
import java.util.Set;

import javax.xml.parsers.SAXParser;

import org.apache.xerces.impl.XMLErrorReporter;
import org.apache.xerces.impl.msg.XMLMessageFormatter;
import org.xml.sax.SAXException;

import com.adobe.epubcheck.api.Report;
Expand All @@ -18,13 +22,15 @@ public class DeclarationHandler extends LocationHandler
private final EPUBVersion version;
private boolean firstStartDTDInvocation = true;
private final Set<String> entities = new HashSet<String>();
private final SAXParser parser;

public DeclarationHandler(ValidationContext context)
public DeclarationHandler(ValidationContext context, SAXParser parser)
{
super(context);
this.report = context.report;
this.mimeType = context.mimeType;
this.version = context.version;
this.parser = parser;

// XML predefined
entities.add("gt");
Expand Down Expand Up @@ -191,4 +197,32 @@ public void internalEntityDecl(String name, String value)
{
entities.add(name);
}

/**
 * Reports a fatal error when the parser skips a reference to a general
 * entity that was never declared.
 *
 * <p>
 * A non-validating parser that does not read the external subset will
 * skip any entity reference it cannot resolve instead of reporting it.
 * This is the case for EPUB 2.0.1 content documents. Names recorded in
 * {@code entities} (the XML predefined entities and those seen through
 * {@code internalEntityDecl}) are considered declared; any other skipped
 * general entity is reported.
 * </p>
 *
 * @param name
 *          the name of the skipped entity; names starting with {@code %}
 *          and the special name {@code [dtd]} are ignored
 * @throws SAXException
 *           if the Xerces error reporter property cannot be read from the
 *           parser
 */
@Override
public void skippedEntity(String name)
  throws SAXException
{
  // Parameter entities ("%...") and the external DTD subset ("[dtd]")
  // are expected to be skipped; only general entities are checked.
  if (name.startsWith("%") || name.equals("[dtd]"))
  {
    return;
  }

  // Look up the actual entity name (not the literal string "name"),
  // otherwise every skipped entity would be reported as undeclared.
  if (entities.contains(name))
  {
    return;
  }

  // We use the Xerces XNI error reporter to make the reporting message
  // consistent with what Xerces would report itself.
  XMLErrorReporter xniReporter = (XMLErrorReporter) parser
      .getProperty("http://apache.org/xml/properties/internal/error-reporter");
  xniReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
      "EntityNotDeclared",
      new Object[] { name },
      XMLErrorReporter.SEVERITY_FATAL_ERROR);
}


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>Test</title>
</head>
<body>
<h1>Loomings</h1>
<!-- this entity reference uses an unknown entity name -->
&foo;
</body>
</html>
6 changes: 6 additions & 0 deletions src/test/resources/epub2/ops-content-document-xhtml.feature
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ Feature: EPUB 2 ▸ Open Publication Structure ▸ XHTML Document Checks
When checking document 'entities-character-references-valid.xhtml'
Then no errors or warnings are reported

Scenario: Report unknown entity references
When checking document 'entities-unknown-error.xhtml'
Then fatal error RSC-016 is reported
And the message contains 'was referenced, but not declared'
And no other errors or warnings are reported

Scenario: Report HTML5 elements used in OPS XHTML Content Documents
When checking document 'html5-elements-error.xhtml'
Then error RSC-005 is reported
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,5 @@
<title>Entites declared in the internal subset are allowed</title>
</head>
<body>
&foo;
</body>
</html>

0 comments on commit 4ad738c

Please sign in to comment.