Skip to content

Commit

Permalink
feat(text/unstable): handle non-Latin-script text in slugify (denol…
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe authored Oct 21, 2024
1 parent 0f4649d commit a541fb4
Show file tree
Hide file tree
Showing 2 changed files with 236 additions and 23 deletions.
124 changes: 111 additions & 13 deletions text/unstable_slugify.ts
Original file line number Diff line number Diff line change
@@ -1,28 +1,126 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
// This module is browser compatible.

// Shared word-granularity segmenter (en-US locale), created once at module
// load and used by `slugify` to split its input into segments.
const wordSegmenter = new Intl.Segmenter("en-US", { granularity: "word" });

/** Configuration accepted by {@linkcode slugify}. */
export type SlugifyOptions = {
  /**
   * Pattern whose matches are stripped from the slug.
   * @default {typeof NON_WORD}
   */
  strip: RegExp;
  /**
   * Function used to convert non-Latin text. It is applied to each word of
   * the input before the words are joined with dashes.
   * @default {undefined}
   */
  transliterate: undefined | ((word: string) => string);
};

/**
 * A regular expression for stripping non-word characters from slugs.
 *
 * Matches runs of characters that are not Unicode letters (`\p{L}`), marks
 * (`\p{M}`), numbers (`\p{N}`), or `-`.
 *
 * @experimental **UNSTABLE**: New API, yet to be vetted.
 *
 * @example Usage
 * ```ts
 * import { NON_WORD, slugify } from "@std/text/unstable-slugify";
 * import { assertEquals } from "@std/assert";
 * assertEquals(slugify("déjà-vu", { strip: NON_WORD }), "déjà-vu");
 * assertEquals(slugify("Συστημάτων Γραφής", { strip: NON_WORD }), "συστημάτων-γραφής");
 * ```
 */
export const NON_WORD = /[^\p{L}\p{M}\p{N}\-]+/gu;
/**
 * A regular expression for stripping diacritics from slugs.
 *
 * Matches runs of characters that are not Unicode letters (`\p{L}`), numbers
 * (`\p{N}`), or `-`. Combining marks (`\p{M}`) are not exempted, so they are
 * matched along with other non-word characters.
 *
 * @example Usage
 * ```ts
 * import { DIACRITICS, slugify } from "@std/text/unstable-slugify";
 * import { assertEquals } from "@std/assert";
 * assertEquals(slugify("déjà-vu", { strip: DIACRITICS }), "deja-vu");
 * assertEquals(slugify("Συστημάτων Γραφής", { strip: DIACRITICS }), "συστηματων-γραφης");
 * ```
 */
export const DIACRITICS = /[^\p{L}\p{N}\-]+/gu;
/**
 * A regular expression for stripping ASCII diacritics (but not other diacritics) from slugs.
 *
 * Matches combining marks (`\p{M}`) that directly follow an ASCII letter
 * (`a-z`, `A-Z`), as well as runs of characters that are not Unicode letters,
 * marks, numbers, or `-`.
 *
 * @example Usage
 * ```ts
 * import { ASCII_DIACRITICS, slugify } from "@std/text/unstable-slugify";
 * import { assertEquals } from "@std/assert";
 * assertEquals(slugify("déjà-vu", { strip: ASCII_DIACRITICS }), "deja-vu");
 * assertEquals(slugify("Συστημάτων Γραφής", { strip: ASCII_DIACRITICS }), "συστημάτων-γραφής");
 * ```
 */
export const ASCII_DIACRITICS = /(?<=[a-zA-Z])\p{M}+|[^\p{L}\p{M}\p{N}\-]+/gu;
/**
 * A regular expression for stripping non-ASCII characters from slugs.
 *
 * Matches every single character that is not an ASCII digit (`0-9`), an ASCII
 * letter (`a-z`, `A-Z`), or `-`, so other ASCII characters such as punctuation
 * and whitespace are matched too.
 *
 * @example Usage
 * ```ts
 * import { NON_ASCII, slugify } from "@std/text/unstable-slugify";
 * import { assertEquals } from "@std/assert";
 * assertEquals(slugify("déjà-vu", { strip: NON_ASCII }), "deja-vu");
 * assertEquals(slugify("Συστημάτων Γραφής", { strip: NON_ASCII }), "-");
 * ```
 */
export const NON_ASCII = /[^0-9a-zA-Z\-]/g;

/**
 * Converts a string into a {@link https://en.wikipedia.org/wiki/Clean_URL#Slug | slug}.
 *
 * The input is trimmed, NFD-normalized and lower-cased, then split into
 * segments with the module-level word segmenter. Word-like segments are kept
 * and every other segment becomes a `-`. After the optional transliteration
 * step, everything matched by `options.strip` is removed, the result is
 * NFC-normalized, runs of dashes are collapsed, and a leading or trailing dash
 * is dropped.
 *
 * @experimental **UNSTABLE**: New API, yet to be vetted.
 *
 * @param input The string that is going to be converted into a slug
 * @param options The options for the slugify function
 * @returns The string as a slug, or `"-"` if the slug would otherwise be empty
 *
 * @example Basic usage
 * ```ts
 * import { slugify } from "@std/text/unstable-slugify";
 * import { assertEquals } from "@std/assert";
 *
 * assertEquals(slugify("Hello, world!"), "hello-world");
 * assertEquals(slugify("Συστημάτων Γραφής"), "συστημάτων-γραφής");
 * ```
 *
 * @example With transliteration using a third-party library
 * ```ts no-eval
 * import { NON_ASCII, slugify } from "@std/text/unstable-slugify";
 * // example third-party transliteration library
 * import transliterate from "npm:any-ascii";
 *
 * slugify("Συστημάτων Γραφής", { transliterate, strip: NON_ASCII });
 * // => "sistimaton-grafis"
 * ```
 */
export function slugify(
  input: string,
  options?: Partial<SlugifyOptions>,
): string {
  const strip = options?.strip ?? NON_WORD;
  // `replaceAll` throws a TypeError for a regex without the `g` flag, so force
  // it on. Cloning with `new RegExp` also leaves the caller's regex (and its
  // `lastIndex`) untouched.
  const stripRe = new RegExp(
    strip,
    strip.flags.includes("g") ? strip.flags : `${strip.flags}g`,
  );
  const transliterate = options?.transliterate;
  const words: string[] = [];

  for (
    const s of wordSegmenter.segment(
      input.trim().normalize("NFD").toLowerCase(),
    )
  ) {
    if (s.isWordLike) {
      words.push(s.segment);
    } else if (s.segment.length) {
      // whitespace, punctuation, etc. act as word separators
      words.push("-");
    }
  }

  const joined = transliterate
    // Wrap the callback so `map`'s extra `(index, array)` arguments are never
    // forwarded to a function that may treat them as its own optional
    // parameters. Transliterated entries are joined with dashes.
    ? words.map((word) => transliterate(word)).join("-")
    : words.join("");

  return joined
    .replaceAll(stripRe, "")
    .normalize("NFC")
    .replaceAll(/-{2,}/g, "-")
    .replaceAll(/^-|-$/g, "") ||
    // never return a zero-length slug
    "-";
}
135 changes: 125 additions & 10 deletions text/unstable_slugify_test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
import { assertEquals } from "@std/assert/equals";
import { slugify } from "./unstable_slugify.ts";
import { assertEquals, assertMatch } from "@std/assert";
import {
ASCII_DIACRITICS,
DIACRITICS,
NON_ASCII,
NON_WORD,
slugify,
} from "./unstable_slugify.ts";

Deno.test("slugify() returns kebabcase", () => {
assertEquals(slugify("hello world"), "hello-world");
Expand All @@ -16,23 +22,73 @@ Deno.test("slugify() handles whitespaces", () => {
assertEquals(slugify("Hello\r\nWorld"), "hello-world");
});

// Inputs are passed in decomposed (NFD) form; the slug is expected back in
// composed (NFC) form with its diacritics preserved.
Deno.test("slugify() normalizes diacritic characters to NFC form by default", () => {
  assertEquals(slugify("déjà vu".normalize("NFD")), "déjà-vu".normalize("NFC"));
  assertEquals(slugify("Cliché".normalize("NFD")), "cliché".normalize("NFC"));
  assertEquals(slugify("façade".normalize("NFD")), "façade".normalize("NFC"));
  assertEquals(slugify("résumé".normalize("NFD")), "résumé".normalize("NFC"));
  assertEquals(
    slugify("Συστημάτων Γραφής".normalize("NFD")),
    "συστημάτων-γραφής".normalize("NFC"),
  );
});

// With `strip: NON_ASCII`, accented Latin input yields the same unaccented
// slug in both normalization forms, and Greek input yields "-".
Deno.test("slugify() strips all non-ASCII chars, including diacritics, if strip: NON_ASCII", () => {
  const options = { strip: NON_ASCII };

  for (const form of ["NFC", "NFD"] as const) {
    assertEquals(slugify("déjà vu".normalize(form), options), "deja-vu");
  }

  assertEquals(slugify("Συστημάτων Γραφής", options), "-");
});

// With `strip: DIACRITICS`, accents are removed from both Latin and Greek
// text, whichever normalization form the input arrives in.
Deno.test("slugify() strips all diacritics if strip: DIACRITICS", () => {
  const options = { strip: DIACRITICS };

  for (const form of ["NFC", "NFD"] as const) {
    assertEquals(slugify("déjà vu".normalize(form), options), "deja-vu");
  }

  assertEquals(slugify("Συστημάτων Γραφής", options), "συστηματων-γραφης");
});

// With `strip: ASCII_DIACRITICS`, accents on ASCII letters are removed while
// accents on Greek letters are kept.
Deno.test("slugify() strips ASCII diacritics (but not other diacritics) if strip: ASCII_DIACRITICS", () => {
  const options = { strip: ASCII_DIACRITICS };

  for (const form of ["NFC", "NFD"] as const) {
    assertEquals(slugify("déjà-vu".normalize(form), options), "deja-vu");
  }

  assertEquals(slugify("Συστημάτων Γραφής", options), "συστημάτων-γραφής");
});

// Leading and trailing dashes are dropped and repeated dashes collapse to one.
Deno.test("slugify() handles dashes", () => {
  for (const input of ["-Hello-World-", "--Hello--World--"]) {
    assertEquals(slugify(input), "hello-world");
  }
});

Deno.test("slugify() converts empty string to a single dash", () => {
  // Prevent any issues with zero-length slugs in URLs, e.g.
  // `/a//b` -> `/a/b`; `/a/` -> `/a`
  assertEquals(slugify(""), "-");
  // same fallback when the strip regex removes every character
  assertEquals(slugify("abc", { strip: /./g }), "-");
});

Deno.test("slugify() removes unknown special characters", () => {
Deno.test("slugify() replaces non-word characters with dashes", () => {
assertEquals(slugify("Hello, world!"), "hello-world");
assertEquals(slugify("hello ~ world"), "hello-world");

assertEquals(
Expand All @@ -56,3 +112,62 @@ Deno.test("slugify() removes unknown special characters", () => {
"bitcoin-soars-past-33000-its-highest-ever",
);
});

// Greek and Chinese letters are kept as-is; whitespace and punctuation between
// words become dashes.
Deno.test("slugify() works with non-Latin alphabetic text", () => {
  const cases: [input: string, expected: string][] = [
    ["Συστημάτων Γραφής", "συστημάτων-γραφής"],
    ["三人行,必有我师", "三人行-必有我师"],
  ];

  for (const [input, expected] of cases) {
    assertEquals(slugify(input), expected);
  }
});

Deno.test("slugify() deletes matches when a custom strip regex is supplied", () => {
  // `a`, `c` and `e` match the strip regex and are removed outright
  assertEquals(slugify("abcdef", { strip: /[ace]/g }), "bdf");
});

// An apostrophe inside a word is removed rather than turned into a dash.
Deno.test("slugify() strips apostrophes within words", () => {
  const inputs = [
    "What’s up?", // curly apostrophe
    "What's up?", // straight apostrophe
  ];

  for (const input of inputs) {
    assertEquals(slugify(input), "whats-up");
  }
});

Deno.test("slugify() strips or replaces all non-alphanumeric ASCII chars except for `-`", () => {
  /**
   * Slugs get interpolated into every part of a URL (path segment, search
   * params, hash, subdomain, etc.), so they must not allow path traversal or
   * other exploits that chars like `./?&=#` could enable.
   */
  const safeSlug = /^[a-zA-Z0-9\-]+$/;

  // every code point from U+0000 to U+007F, in order
  let allAscii = "";
  for (let codePoint = 0; codePoint < 0x80; codePoint++) {
    allAscii += String.fromCodePoint(codePoint);
  }

  // with default
  assertMatch(slugify(allAscii), safeSlug);

  // even if we explicitly set the strip regex to match nothing
  const matchNothing = /[^\s\S]/gu;
  assertMatch(slugify(allAscii, { strip: matchNothing }), safeSlug);

  // defense-in-depth - applied directly, each exported regex _also_ removes
  // every ASCII character that is not alphanumeric or `-`
  [ASCII_DIACRITICS, DIACRITICS, NON_ASCII, NON_WORD].forEach((re) => {
    assertMatch(allAscii.replaceAll(re, ""), safeSlug);
  });
});

Deno.test("slugify() `transliterate` option works alongside third-party transliteration libs", () => {
  /**
   * We just use a simple mock transliteration function to test basic
   * compatibility here. For actual transliteration libraries,
   * [npm:any-ascii](https://github.com/anyascii/anyascii) seems to be a good
   * general-purpose option.
   */
  // one entry per character of `input`
  const map: Record<string, string> = { 矿: "kuang", 泉: "quan", 水: "shui" };
  const transliterate = (s: string) => [...s].map((c) => map[c]).join("");

  const input = "矿泉水";
  const expected = "kuangquan-shui";

  assertEquals(slugify(input, { transliterate }), expected);
});

0 comments on commit a541fb4

Please sign in to comment.