-
Notifications
You must be signed in to change notification settings - Fork 407
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: improve checking of OCF file name characters
- significantly refactored `OCFFilenameChecker`: - now based on ICU4J for more up-to-date Unicode support - iterate over the input string codepoints (`int`) instead of code units (`char`) for better support of charactres outside the BMP. - rely on ICU UnicodeSet to define the set of forbidden characters - improve the message reported for forbidden characters (notably for non-printable characters) - the class now implements the `Checker` interface - add unit tests for the `OCFFilenameChecker` only in a new feature file `filename-checker.feature`, located along with the EPUB 3 OCF functional tests, for easier lookup. - add a few functional tests, to make sure `OCFFilenameChecker` is correctly called when checking full publications or single package documents. - update ICU4J to v72.1 Fixes #1240, fixes #1292, fixes #1302
- Loading branch information
Showing
34 changed files
with
515 additions
and
248 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
211 changes: 123 additions & 88 deletions
211
src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,127 +1,162 @@ | ||
package com.adobe.epubcheck.ocf; | ||
|
||
import java.util.LinkedHashSet; | ||
import java.util.Set; | ||
import java.util.stream.Collectors; | ||
|
||
import org.w3c.epubcheck.core.Checker; | ||
|
||
import com.adobe.epubcheck.api.EPUBLocation; | ||
import com.adobe.epubcheck.api.Report; | ||
import com.adobe.epubcheck.messages.MessageId; | ||
import com.adobe.epubcheck.opf.ValidationContext; | ||
import com.adobe.epubcheck.util.EPUBVersion; | ||
import com.google.common.collect.ImmutableSet; | ||
import com.google.common.base.Preconditions; | ||
import com.google.common.collect.ImmutableMap; | ||
import com.ibm.icu.lang.UCharacter; | ||
import com.ibm.icu.text.UCharacterIterator; | ||
import com.ibm.icu.text.UForwardCharacterIterator; | ||
import com.ibm.icu.text.UnicodeSet; | ||
|
||
//FIXME 2022 update related PKG-* messages to contain the file name string | ||
public final class OCFFilenameChecker | ||
public final class OCFFilenameChecker implements Checker | ||
{ | ||
private static final Set<String> RESTRICTED_30_CHARACTER_SET = ImmutableSet.of("PRIVATE_USE_AREA", | ||
"ARABIC_PRESENTATION_FORMS_A", "SPECIALS", "SUPPLEMENTARY_PRIVATE_USE_AREA_A", | ||
"SUPPLEMENTARY_PRIVATE_USE_AREA_B", "VARIATION_SELECTORS_SUPPLEMENT", "TAGS"); | ||
|
||
private static final UnicodeSet ASCII = new UnicodeSet("[:ascii:]").freeze(); | ||
|
||
private static final UnicodeSet DISALLOWED_EPUB2 = new UnicodeSet() | ||
// .add(0x002F)// SOLIDUS '/' -- allowed as path separator | ||
.add(0x0022)// QUOTATION MARK '"' | ||
.add(0x002A)// ASTERISK '*' | ||
// .add(0x002E)// FULL STOP '.' -- only disallowed as the last character | ||
.add(0x003A)// COLON ':' | ||
.add(0x003C)// LESS-THAN SIGN '<' | ||
.add(0x003E)// GREATER-THAN SIGN '>' | ||
.add(0x003F)// QUESTION MARK '?' | ||
.add(0x005C)// REVERSE SOLIDUS '\' | ||
.freeze(); | ||
|
||
private static final ImmutableMap<String, UnicodeSet> DISALLOWED_EPUB3 = new ImmutableMap.Builder<String, UnicodeSet>() | ||
.put("ASCII", new UnicodeSet() // | ||
.addAll(DISALLOWED_EPUB2)// all disallowed in EPUB 2.0.1 | ||
.add(0x007C) // VERTICAL LINE '|' | ||
.freeze()) | ||
.put("NON CHARACTER", new UnicodeSet("[:Noncharacter_Code_Point=Yes:]")// | ||
.freeze()) | ||
.put("CONTROL", new UnicodeSet().add(0x007F) // DEL | ||
.addAll(0x0000, 0x001F) // C0 range | ||
.addAll(0x0080, 0x009F) // C1 range | ||
.freeze()) | ||
.put("PRIVATE USE", new UnicodeSet() // | ||
.addAll(0xE000, 0xF8FF) // Private Use Area | ||
.addAll(0xF0000, 0xFFFFF) // Supplementary Private Use Area-A | ||
.addAll(0x100000, 0x10FFFF) // Supplementary Private Use Area-B | ||
.freeze()) | ||
.put("SPECIALS", new UnicodeSet() // | ||
.addAll(0xFFF0, 0xFFFF) // Specials Blocks | ||
.freeze()) | ||
.put("DEPRECATED", new UnicodeSet() // | ||
.add(0xE0001)// LANGUAGE TAG | ||
// .add(0xE007F)// CANCEL TAG -- reinstated in Emoji tag sequences | ||
.freeze()) | ||
.build(); | ||
|
||
private static String toString(int codepoint, String setName) | ||
{ | ||
assert setName != null; | ||
StringBuilder result = new StringBuilder().append(String.format("U+%04X ", codepoint)); | ||
if ("ASCII".equals(setName)) | ||
{ | ||
result.append('(').append(UCharacter.toString(codepoint)).append(')'); | ||
} | ||
else | ||
{ | ||
String characterName = UCharacter.getName(codepoint); | ||
if (characterName != null) | ||
{ | ||
result.append(characterName).append(' '); | ||
} | ||
result.append('(').append(setName).append(')'); | ||
} | ||
return result.toString(); | ||
} | ||
|
||
private final Report report; | ||
private final EPUBVersion version; | ||
private final EPUBLocation location; | ||
private final String filename; | ||
|
||
public OCFFilenameChecker(String filename, ValidationContext context) | ||
{ | ||
this(filename, context, null); | ||
} | ||
|
||
public OCFFilenameChecker(ValidationContext context) | ||
public OCFFilenameChecker(String filename, ValidationContext context, EPUBLocation location) | ||
{ | ||
Preconditions.checkArgument(filename != null); | ||
Preconditions.checkArgument(context != null); | ||
this.filename = filename; | ||
this.report = context.report; | ||
this.version = context.version; | ||
this.location = EPUBLocation.of(context); | ||
this.location = (location != null) ? location : EPUBLocation.of(context); | ||
} | ||
|
||
public String checkCompatiblyEscaped(final String str) | ||
@Override | ||
public void check() | ||
{ | ||
// don't check remote resources | ||
if (str.matches("^[^:/?#]+://.*")) | ||
{ | ||
return ""; | ||
} | ||
|
||
// the test string will be used to compare test result | ||
String test = checkNonAsciiFilename(str); | ||
|
||
if (str.endsWith(".")) | ||
{ | ||
report.message(MessageId.PKG_011, location, str); | ||
test += "."; | ||
} | ||
|
||
boolean spaces = false; | ||
final char[] ascciGraphic = new char[] { '<', '>', '"', '{', '}', '|', '^', '`', '*', | ||
'?' /* , ':','/', '\\' */ }; | ||
String result = ""; | ||
char[] chars = str.toCharArray(); | ||
for (char c : chars) | ||
// Iterate through the code points to search disallowed characters | ||
UCharacterIterator chars = UCharacterIterator.getInstance(filename); | ||
final Set<String> disallowed = new LinkedHashSet<>(); | ||
boolean hasSpaces = false; | ||
boolean isASCIIOnly = true; | ||
int codepoint; | ||
while ((codepoint = chars.nextCodePoint()) != UForwardCharacterIterator.DONE) | ||
{ | ||
for (char a : ascciGraphic) | ||
// Check if the string has non-ASCII characters | ||
isASCIIOnly = isASCIIOnly && ASCII.contains(codepoint); | ||
// Check if the string has space characters | ||
hasSpaces = hasSpaces || UCharacter.isUWhiteSpace(codepoint); | ||
// Check for disallowed characters | ||
switch (version) | ||
{ | ||
if (c == a) | ||
case VERSION_2: | ||
if (DISALLOWED_EPUB2.contains(codepoint)) | ||
{ | ||
result += "\"" + Character.toString(c) + "\","; | ||
test += Character.toString(c); | ||
disallowed.add(toString(codepoint, "ASCII")); | ||
} | ||
} | ||
if (Character.isSpaceChar(c)) | ||
{ | ||
spaces = true; | ||
test += Character.toString(c); | ||
break; | ||
default: | ||
for (String name : DISALLOWED_EPUB3.keySet()) | ||
{ | ||
if (DISALLOWED_EPUB3.get(name).contains(codepoint)) | ||
{ | ||
disallowed.add(toString(codepoint, name)); | ||
break; | ||
} | ||
} | ||
break; | ||
} | ||
} | ||
if (result.length() > 1) | ||
// Check that FULL STOP is not used as the last character | ||
if (chars.previousCodePoint() == 0x002E) | ||
{ | ||
result = result.substring(0, result.length() - 1); | ||
report.message(MessageId.PKG_009, location, str, result); | ||
report.message(MessageId.PKG_011, location, filename); | ||
} | ||
if (spaces) | ||
// Report if disallowed characters were found | ||
if (!disallowed.isEmpty()) | ||
{ | ||
report.message(MessageId.PKG_010, location, str); | ||
report.message(MessageId.PKG_009, location, filename, | ||
disallowed.stream().collect(Collectors.joining(", "))); | ||
} | ||
|
||
if (version == EPUBVersion.VERSION_3) | ||
// Report whitespace characters | ||
if (hasSpaces) | ||
{ | ||
checkCompatiblyEscaped30(str, test); | ||
report.message(MessageId.PKG_010, location, filename); | ||
} | ||
return test; | ||
} | ||
|
||
private String checkNonAsciiFilename(final String str) | ||
{ | ||
String nonAscii = str.replaceAll("[\\p{ASCII}]", ""); | ||
if (nonAscii.length() > 0) | ||
// Report non-ASCII characters as usage | ||
if (!isASCIIOnly) | ||
{ | ||
report.message(MessageId.PKG_012, location, str, nonAscii); | ||
report.message(MessageId.PKG_012, location, filename); | ||
} | ||
return nonAscii; | ||
} | ||
|
||
private String checkCompatiblyEscaped30(String str, String test) | ||
{ | ||
String result = ""; | ||
|
||
char[] chars = str.toCharArray(); | ||
for (char c : chars) | ||
{ | ||
if (Character.isISOControl(c)) | ||
{ | ||
result += "\"" + Character.toString(c) + "\","; | ||
test += Character.toString(c); | ||
} | ||
|
||
// DEL (U+007F) | ||
if (c == '\u007F') | ||
{ | ||
result += "\"" + Character.toString(c) + "\","; | ||
test += Character.toString(c); | ||
} | ||
String unicodeType = Character.UnicodeBlock.of(c).toString(); | ||
if (RESTRICTED_30_CHARACTER_SET.contains(unicodeType)) | ||
{ | ||
result += "\"" + Character.toString(c) + "\","; | ||
} | ||
} | ||
if (result.length() > 1) | ||
{ | ||
result = result.substring(0, result.length() - 1); | ||
report.message(MessageId.PKG_009, location, str, result); | ||
} | ||
return test; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.