-
-
Notifications
You must be signed in to change notification settings - Fork 3.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add option to strip Unicode from entry filenames #1135
Changes from 12 commits
92a3c3a
8403b9a
7ab0664
e6e40d1
81c8c55
b8bade2
31fcb3c
deba8ad
03c7ad9
2f65f11
202e985
7cdf7d7
0d12e6b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
import url from 'url'; | ||
import diacritics from 'diacritics'; | ||
import sanitizeFilename from 'sanitize-filename'; | ||
import { isString, escapeRegExp, flow, partialRight } from 'lodash'; | ||
import { Map } from 'immutable'; | ||
|
||
function getUrl(url, direct) { | ||
return `${ direct ? '/#' : '' }${ url }`; | ||
function getUrl(urlString, direct) { | ||
return `${ direct ? '/#' : '' }${ urlString }`; | ||
} | ||
|
||
export function getCollectionUrl(collectionName, direct) { | ||
|
@@ -20,9 +22,9 @@ export function addParams(urlString, params) { | |
return url.format(parsedUrl); | ||
} | ||
|
||
export function stripProtocol(url) { | ||
const protocolEndIndex = url.indexOf('//'); | ||
return protocolEndIndex > -1 ? url.slice(protocolEndIndex + 2) : url; | ||
export function stripProtocol(urlString) { | ||
const protocolEndIndex = urlString.indexOf('//'); | ||
return protocolEndIndex > -1 ? urlString.slice(protocolEndIndex + 2) : urlString; | ||
} | ||
|
||
/* See https://www.w3.org/International/articles/idn-and-iri/#path. | ||
|
@@ -34,34 +36,52 @@ export function stripProtocol(url) { | |
*/ | ||
const uriChars = /[\w\-.~]/i; | ||
const ucsChars = /[\xA0-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/u; | ||
const validIRIChar = (char) => (uriChars.test(char) || ucsChars.test(char)); | ||
// `sanitizeIRI` does not actually URI-encode the chars (that is the browser's and server's job), just removes the ones that are not allowed. | ||
export function sanitizeIRI(str, { replacement = "" } = {}) { | ||
if (!isString(str)) throw "The input slug must be a string."; | ||
if (!isString(replacement)) throw "`options.replacement` must be a string."; | ||
const validURIChar = char => uriChars.test(char); | ||
const validIRIChar = char => uriChars.test(char) || ucsChars.test(char); | ||
// `sanitizeURI` does not actually URI-encode the chars (that is the browser's and server's job), just removes the ones that are not allowed. | ||
export function sanitizeURI(str, { replacement = "", encoding = "unicode" } = {}) { | ||
if (!isString(str)) { | ||
throw new Error("The input slug must be a string."); | ||
} | ||
if (!isString(replacement)) { | ||
throw new Error("`options.replacement` must be a string."); | ||
} | ||
|
||
let validChar; | ||
if (encoding === "unicode") { | ||
validChar = validIRIChar; | ||
} else if (encoding === "ascii") { | ||
validChar = validURIChar; | ||
} else { | ||
throw new Error('`options.encoding` must be "unicode" or "ascii".'); | ||
} | ||
|
||
// Check and make sure the replacement character is actually a safe char itself. | ||
if (!Array.from(replacement).every(validIRIChar)) throw "The replacement character(s) (options.replacement) is itself unsafe."; | ||
if (!Array.from(replacement).every(validChar)) { | ||
throw new Error("The replacement character(s) (options.replacement) is itself unsafe."); | ||
} | ||
|
||
// `Array.from` must be used instead of `String.split` because | ||
// `split` converts things like emojis into UTF-16 surrogate pairs. | ||
return Array.from(str).map(char => (validIRIChar(char) ? char : replacement)).join(''); | ||
return Array.from(str).map(char => (validChar(char) ? char : replacement)).join(''); | ||
} | ||
|
||
export function sanitizeSlug(str, { replacement = '-' } = {}) { | ||
if (!isString(str)) throw "The input slug must be a string."; | ||
if (!isString(replacement)) throw "`options.replacement` must be a string."; | ||
export function sanitizeSlug(str, options = Map()) { | ||
const encoding = options.get('encoding', 'unicode'); | ||
const stripDiacritics = options.get('clean_accents', false); | ||
const replacement = options.get('sanitize_replacement', '-'); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Intentionally undocumented? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I wasn't sure if we wanted to wait until someone actually had a valid use case for it -- validating it in […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nah, I'm fine with it as is, was just curious. |
||
|
||
if (!isString(str)) { throw new Error("The input slug must be a string."); } | ||
|
||
// Sanitize as IRI (i18n URI) and as filename. | ||
const sanitize = flow([ | ||
partialRight(sanitizeIRI, { replacement }), | ||
const sanitizedSlug = flow([ | ||
...(stripDiacritics ? [diacritics.remove] : []), | ||
partialRight(sanitizeURI, { replacement, encoding }), | ||
partialRight(sanitizeFilename, { replacement }), | ||
]); | ||
const sanitizedSlug = sanitize(str); | ||
|
||
])(str); | ||
|
||
// Remove any doubled or trailing replacement characters (that were added in the sanitizers). | ||
const doubleReplacement = new RegExp('(?:' + escapeRegExp(replacement) + ')+', 'g'); | ||
const trailingReplacment = new RegExp(escapeRegExp(replacement) + '$'); | ||
const doubleReplacement = new RegExp(`(?:${ escapeRegExp(replacement) })+`, 'g'); | ||
const trailingReplacment = new RegExp(`${ escapeRegExp(replacement) }$`); | ||
const normalizedSlug = sanitizedSlug | ||
.replace(doubleReplacement, replacement) | ||
.replace(trailingReplacment, ''); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'should remove accents with `clean_accents` set'