feat(text/unstable): handle non-Latin-script text in slugify (denoland#6012)
dreamcatcher-tech · Oct 21, 2024 · a541fb4 · a541fb4
1 parent 0f4649d
commit a541fb4
Show file tree

Hide file tree

Showing 2 changed files with 236 additions and 23 deletions.
diff --git a/text/unstable_slugify.ts b/text/unstable_slugify.ts
@@ -1,28 +1,126 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+// This module is browser compatible.
+
+// Word segmenter created once at module load and reused by every `slugify`
+// call. The locale is fixed to "en-US".
+const wordSegmenter = new Intl.Segmenter("en-US", { granularity: "word" });
+
+/** Options for {@linkcode slugify}. */
+export type SlugifyOptions = {
+  /**
+   * The regular expression to use for stripping characters. Every match is
+   * deleted from the slug after the words have been joined (and
+   * transliterated, if `transliterate` is set).
+   * @default {typeof NON_WORD}
+   */
+  strip: RegExp;
+  /**
+   * The transliteration function to use for converting non-Latin text.
+   * Called on each word in the input before joining them with dashes.
+   * Its output is still passed through `strip` afterwards.
+   * @default {undefined}
+   */
+  transliterate: ((word: string) => string) | undefined;
+};
 
 /**
- * Converts a string into a {@link https://en.wikipedia.org/wiki/Clean_URL#Slug | slug}.
+ * A regular expression for stripping non-word characters from slugs.
+ *
+ * Matches runs of characters that are not letters (`\p{L}`), combining marks
+ * (`\p{M}`), numbers (`\p{N}`), or `-`. `slugify` uses it when no `strip`
+ * option is given.
  *
- * @experimental **UNSTABLE**: New API, yet to be vetted.
+ * @example Usage
+ * ```ts
+ * import { NON_WORD, slugify } from "@std/text/unstable-slugify";
+ * import { assertEquals } from "@std/assert";
+ * assertEquals(slugify("déjà-vu", { strip: NON_WORD }), "déjà-vu");
+ * assertEquals(slugify("Συστημάτων Γραφής", { strip: NON_WORD }), "συστημάτων-γραφής");
+ * ```
+ */
+export const NON_WORD = /[^\p{L}\p{M}\p{N}\-]+/gu;
+/**
+ * A regular expression for stripping diacritics from slugs.
+ *
+ * Matches runs of characters that are not letters (`\p{L}`), numbers
+ * (`\p{N}`), or `-`, so combining marks (`\p{M}`) are removed along with
+ * other non-word characters. This relies on `slugify` decomposing its input
+ * to NFD before stripping.
  *
  * @example Usage
  * ```ts
- * import { slugify } from "@std/text/unstable-slugify";
+ * import { DIACRITICS, slugify } from "@std/text/unstable-slugify";
  * import { assertEquals } from "@std/assert";
+ * assertEquals(slugify("déjà-vu", { strip: DIACRITICS }), "deja-vu");
+ * assertEquals(slugify("Συστημάτων Γραφής", { strip: DIACRITICS }), "συστηματων-γραφης");
+ * ```
+ */
+export const DIACRITICS = /[^\p{L}\p{N}\-]+/gu;
+/**
+ * A regular expression for stripping diacritics from ASCII letters (but not
+ * from other letters) in slugs.
+ *
+ * Matches combining marks (`\p{M}`) that directly follow an ASCII letter, as
+ * well as runs of characters that are not letters, combining marks, numbers,
+ * or `-`. This relies on `slugify` decomposing its input to NFD before
+ * stripping.
  *
- * assertEquals(slugify("hello world"), "hello-world");
- * assertEquals(slugify("déjà vu"), "deja-vu");
+ * @example Usage
+ * ```ts
+ * import { ASCII_DIACRITICS, slugify } from "@std/text/unstable-slugify";
+ * import { assertEquals } from "@std/assert";
+ * assertEquals(slugify("déjà-vu", { strip: ASCII_DIACRITICS }), "deja-vu");
+ * assertEquals(slugify("Συστημάτων Γραφής", { strip: ASCII_DIACRITICS }), "συστημάτων-γραφής");
  * ```
+ */
+export const ASCII_DIACRITICS = /(?<=[a-zA-Z])\p{M}+|[^\p{L}\p{M}\p{N}\-]+/gu;
+/**
+ * A regular expression for stripping non-ASCII characters from slugs.
+ *
+ * Matches every character other than ASCII letters, ASCII digits, and `-`.
+ * Because `slugify` decomposes its input to NFD before stripping, an accented
+ * Latin letter loses only its combining mark ("é" becomes "e").
+ *
+ * @example Usage
+ * ```ts
+ * import { NON_ASCII, slugify } from "@std/text/unstable-slugify";
+ * import { assertEquals } from "@std/assert";
+ * assertEquals(slugify("déjà-vu", { strip: NON_ASCII }), "deja-vu");
+ * assertEquals(slugify("Συστημάτων Γραφής", { strip: NON_ASCII }), "-");
+ * ```
+ */
+export const NON_ASCII = /[^0-9a-zA-Z\-]/g;
+
+/**
+ * Converts a string into a {@link https://en.wikipedia.org/wiki/Clean_URL#Slug | slug}.
+ *
+ * @experimental **UNSTABLE**: New API, yet to be vetted.
  *
  * @param input The string that is going to be converted into a slug
+ * @param options The options for the slugify function
  * @returns The string as a slug
+ *
+ * @example Basic usage
+ * ```ts
+ * import { slugify } from "@std/text/unstable-slugify";
+ * import { assertEquals } from "@std/assert";
+ *
+ * assertEquals(slugify("Hello, world!"), "hello-world");
+ * assertEquals(slugify("Συστημάτων Γραφής"), "συστημάτων-γραφής");
+ * ```
+ *
+ * @example With transliteration using a third-party library
+ * ```ts no-eval
+ * import { NON_ASCII, slugify } from "@std/text/unstable-slugify";
+ * // example third-party transliteration library
+ * import transliterate from "npm:any-ascii";
+ *
+ * slugify("Συστημάτων Γραφής", { transliterate, strip: NON_ASCII });
+ * // => "sistimaton-grafis"
+ * ```
  */
-export function slugify(input: string): string {
-  return input
-    .trim()
-    .normalize("NFD")
-    .replaceAll(/[^a-zA-Z0-9\s-]/g, "")
-    .replaceAll(/\s+|-+/g, "-")
-    .replaceAll(/^-+|-+$/g, "")
-    .toLowerCase();
+export function slugify(
+  input: string,
+  options?: Partial<SlugifyOptions>,
+): string {
+  const strip = options?.strip ?? NON_WORD;
+  const transliterate = options?.transliterate;
+  // Clone with `new RegExp` in case `lastIndex` isn't zeroed on the caller's
+  // regex, and force the `g` flag: `String.prototype.replaceAll` throws a
+  // `TypeError` when handed a non-global regex.
+  const stripRe = new RegExp(
+    strip,
+    strip.global ? strip.flags : `${strip.flags}g`,
+  );
+  const parts: string[] = [];
+
+  // Trim, decompose (NFD) and lowercase up front so the strip regex can match
+  // combining marks separately from their base letters.
+  for (
+    const s of wordSegmenter.segment(
+      input.trim().normalize("NFD").toLowerCase(),
+    )
+  ) {
+    if (s.isWordLike) {
+      // Call the user's function with exactly one argument and only for real
+      // words. Passing it straight to `Array.prototype.map` would also hand
+      // it `(index, array)` and the "-" separator placeholders.
+      parts.push(transliterate ? transliterate(s.segment) : s.segment);
+    } else if (s.segment.length) {
+      // Any non-word segment (whitespace, punctuation, ...) is a separator.
+      parts.push("-");
+    }
+  }
+
+  return parts
+    // When transliterating, put a dash between every pair of segments so
+    // adjacent words with no separator between them (e.g. CJK text) stay
+    // separated; surplus dashes are collapsed below.
+    .join(transliterate ? "-" : "")
+    .replaceAll(stripRe, "")
+    // Recompose whatever combining marks survived stripping.
+    .normalize("NFC")
+    .replaceAll(/-{2,}/g, "-")
+    .replaceAll(/^-|-$/g, "") ||
+    // Never return an empty slug.
+    "-";
 }
diff --git a/text/unstable_slugify_test.ts b/text/unstable_slugify_test.ts
@@ -1,6 +1,12 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
-import { assertEquals } from "@std/assert/equals";
-import { slugify } from "./unstable_slugify.ts";
+import { assertEquals, assertMatch } from "@std/assert";
+import {
+  ASCII_DIACRITICS,
+  DIACRITICS,
+  NON_ASCII,
+  NON_WORD,
+  slugify,
+} from "./unstable_slugify.ts";
 
 Deno.test("slugify() returns kebabcase", () => {
   assertEquals(slugify("hello world"), "hello-world");
@@ -16,23 +22,73 @@ Deno.test("slugify() handles whitespaces", () => {
   assertEquals(slugify("Hello\r\nWorld"), "hello-world");
 });
 
-Deno.test("slugify() replaces diacritic characters", () => {
-  assertEquals(slugify("déjà vu"), "deja-vu");
-  assertEquals(slugify("Cliché"), "cliche");
-  assertEquals(slugify("façade"), "facade");
-  assertEquals(slugify("résumé"), "resume");
+Deno.test("slugify() normalizes diacritic characters to NFC form by default", () => {
+  // Inputs are deliberately decomposed (NFD). The default strip regex keeps
+  // combining marks, and the result is expected back in composed (NFC) form.
+  assertEquals(slugify("déjà vu".normalize("NFD")), "déjà-vu".normalize("NFC"));
+  assertEquals(slugify("Cliché".normalize("NFD")), "cliché".normalize("NFC"));
+  assertEquals(slugify("façade".normalize("NFD")), "façade".normalize("NFC"));
+  assertEquals(slugify("résumé".normalize("NFD")), "résumé".normalize("NFC"));
+  assertEquals(
+    slugify("Συστημάτων Γραφής".normalize("NFD")),
+    "συστημάτων-γραφής".normalize("NFC"),
+  );
+});
+
+Deno.test("slugify() strips all non-ASCII chars, including diacritics, if strip: NON_ASCII", () => {
+  // Composed and decomposed inputs must give the same result.
+  assertEquals(
+    slugify("déjà vu".normalize("NFC"), { strip: NON_ASCII }),
+    "deja-vu",
+  );
+  assertEquals(
+    slugify("déjà vu".normalize("NFD"), { strip: NON_ASCII }),
+    "deja-vu",
+  );
+  // Nothing in the Greek input survives stripping, so the "-" fallback is
+  // returned.
+  assertEquals(slugify("Συστημάτων Γραφής", { strip: NON_ASCII }), "-");
+});
+
+Deno.test("slugify() strips all diacritics if strip: DIACRITICS", () => {
+  // Composed and decomposed inputs must give the same result.
+  assertEquals(
+    slugify("déjà vu".normalize("NFC"), { strip: DIACRITICS }),
+    "deja-vu",
+  );
+  assertEquals(
+    slugify("déjà vu".normalize("NFD"), { strip: DIACRITICS }),
+    "deja-vu",
+  );
+  // Unlike ASCII_DIACRITICS, marks on non-Latin letters are removed too.
+  assertEquals(
+    slugify("Συστημάτων Γραφής", { strip: DIACRITICS }),
+    "συστηματων-γραφης",
+  );
+});
+
+Deno.test("slugify() strips ASCII diacritics (but not other diacritics) if strip: ASCII_DIACRITICS", () => {
+  // Marks that follow ASCII letters are removed, for both composed and
+  // decomposed inputs.
+  assertEquals(
+    slugify("déjà-vu".normalize("NFC"), { strip: ASCII_DIACRITICS }),
+    "deja-vu",
+  );
+  assertEquals(
+    slugify("déjà-vu".normalize("NFD"), { strip: ASCII_DIACRITICS }),
+    "deja-vu",
+  );
+  // Marks on Greek letters are kept.
+  assertEquals(
+    slugify("Συστημάτων Γραφής", { strip: ASCII_DIACRITICS }),
+    "συστημάτων-γραφής",
+  );
 });
 
 Deno.test("slugify() handles dashes", () => {
   assertEquals(slugify("-Hello-World-"), "hello-world");
   assertEquals(slugify("--Hello--World--"), "hello-world");
 });
 
-Deno.test("slugify() handles empty string", () => {
-  assertEquals(slugify(""), "");
+Deno.test("slugify() converts empty string to a single dash", () => {
+  // Prevent any issues with zero-length slugs in URLs, e.g.
+  // `/a//b` -> `/a/b`; `/a/` -> `/a`
+  assertEquals(slugify(""), "-");
+  // A strip regex that removes every character hits the same fallback.
+  assertEquals(slugify("abc", { strip: /./g }), "-");
+});
 
-Deno.test("slugify() removes unknown special characters", () => {
+Deno.test("slugify() replaces non-word characters with dashes", () => {
+  assertEquals(slugify("Hello, world!"), "hello-world");
   assertEquals(slugify("hello ~ world"), "hello-world");
 
   assertEquals(
@@ -56,3 +112,62 @@ Deno.test("slugify() removes unknown special characters", () => {
     "bitcoin-soars-past-33000-its-highest-ever",
   );
 });
+
+Deno.test("slugify() works with non-Latin alphabetic text", () => {
+  // Without a `transliterate` option, letters of any script are kept as-is;
+  // only separators (here a space and a fullwidth comma) become dashes.
+  assertEquals(slugify("Συστημάτων Γραφής"), "συστημάτων-γραφής");
+  assertEquals(slugify("三人行，必有我师"), "三人行-必有我师");
+});
+
+Deno.test("slugify() deletes matches when a custom strip regex is supplied", () => {
+  // `strip` removes what the regex matches: "a", "c" and "e" are dropped.
+  assertEquals(slugify("abcdef", { strip: /[ace]/g }), "bdf");
+});
+
+Deno.test("slugify() strips apostrophes within words", () => {
+  // NOTE(review): presumably the word segmenter keeps "what's" as a single
+  // word-like segment and the default strip regex then deletes the
+  // apostrophe, which is why no dash appears - confirm.
+  // curly apostrophe
+  assertEquals(slugify("What’s up?"), "whats-up");
+  // straight apostrophe
+  assertEquals(slugify("What's up?"), "whats-up");
+});
+
+Deno.test("slugify() strips or replaces all non-alphanumeric ASCII chars except for `-`", () => {
+  /**
+   * Ensure that interpolation into all parts of a URL (path segment, search
+   * params, hash, subdomain, etc.) is safe, i.e. doesn't allow path traversal
+   * or other exploits, which could be allowed by presence of chars like
+   * `./?&=#` etc.
+   */
+  const ASCII_ALPHANUM_OR_DASH_ONLY = /^[a-zA-Z0-9\-]+$/;
+  // every code point from U+0000 to U+007F, in code-point order
+  const ALL_ASCII = Array.from(
+    { length: 0x80 },
+    (_, i) => String.fromCodePoint(i),
+  ).join("");
+
+  // with default
+  assertMatch(slugify(ALL_ASCII), ASCII_ALPHANUM_OR_DASH_ONLY);
+  // even if we explicitly set the strip regex to match nothing
+  // NOTE(review): this only exercises the characters in code-point order.
+  // Punctuation sitting between two letters (e.g. "a.b") may be kept inside
+  // a single word-like segment by the segmenter and would then survive a
+  // match-nothing `strip` - TODO confirm and add a case.
+  assertMatch(
+    slugify(ALL_ASCII, { strip: /[^\s\S]/gu }),
+    ASCII_ALPHANUM_OR_DASH_ONLY,
+  );
+
+  // defense-in-depth - the exported regexes _also_ all strip every ASCII
+  // character that is not alphanumeric or `-`
+  for (const re of [ASCII_DIACRITICS, DIACRITICS, NON_ASCII, NON_WORD]) {
+    assertMatch(ALL_ASCII.replaceAll(re, ""), ASCII_ALPHANUM_OR_DASH_ONLY);
+  }
+});
+
+Deno.test("slugify() `transliterate` option works alongside third-party transliteration libs", () => {
+  /**
+   * We just use a simple mock transliteration function to test basic
+   * compatibility here. For actual transliteration libraries,
+   * [npm:any-ascii](https://github.com/anyascii/anyascii) seems to be a good
+   * general-purpose option.
+   */
+  // `map` is declared below; that is fine because the arrow function only
+  // reads it when it is called.
+  const transliterate = (s: string) => [...s].map((c) => map[c]).join("");
+
+  const map: Record<string, string> = { 矿: "kuang", 泉: "quan", 水: "shui" };
+  const input = "矿泉水";
+  // slugify joins transliterated word segments with "-", hence the single
+  // dash; where the input is split is decided by the word segmenter.
+  const expected = "kuangquan-shui";
+
+  assertEquals(slugify(input, { transliterate }), expected);
+});