From 04e8fc07e15268158bbe310b7190bb162960ffe1 Mon Sep 17 00:00:00 2001
From: Benjamin Fischer <benjamin.fischer@evolutics.info>
Date: Mon, 24 May 2021 17:19:51 +0200
Subject: [PATCH] Respect non-ASCII identifiers in sanitization for clearer
 names

See also Rust RFC 2457: https://github.com/rust-lang/rfcs/pull/2457
---
 CHANGELOG.md                       |  8 +++++
 Cargo.toml                         |  1 +
 README.md                          | 27 +++++++++-----
 src/generate_view/sanitize_name.rs | 58 +++++++++++++++++++++++-------
 src/lib.rs                         | 27 +++++++++-----
 5 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 71c2f32..9bb6a41 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,14 @@ Notable library changes are documented here in a format based on
 
 ## Unreleased
 
+### Changed
+
+- Respect Unicode identifiers in
+  [name sanitization](https://github.com/evolutics/iftree#name-sanitization).
+  If you only use ASCII file paths, then this change has no effect. Essentially,
+  non-ASCII characters that are valid in identifiers (from Rust 1.53.0) are
+  preserved instead of being replaced by an underscore `"_"`.
+
 ## 0.1.1 – 2021-05-14
 
 ### Fixed
diff --git a/Cargo.toml b/Cargo.toml
index 42b148b..06ad29c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ quote = "1.0"
 serde = { version = "1.0", features = ["derive"] }
 syn = { version = "1.0", features = ["default", "extra-traits"] }
 toml = "0.5"
+unicode-xid = "0.2"
 
 [dev-dependencies]
 actix-web = "3.3"
diff --git a/README.md b/README.md
index d1fa926..3373534 100644
--- a/README.md
+++ b/README.md
@@ -163,20 +163,31 @@ See
 
 ### Name sanitization
 
-When generating identifiers based on paths, names are sanitized as follows to
-ensure they are
-[valid identifiers](https://doc.rust-lang.org/reference/identifiers.html):
-
-- Characters other than ASCII alphanumericals are replaced by `"_"`.
-- If the first character is numeric, then `"_"` is prepended.
-- If the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`, then `"_"`
-  is appended.
+When generating identifiers based on paths, names are sanitized. For example, a
+folder name `.my-assets` is sanitized to an identifier `_my_assets`.
+
+The sanitization process is designed to generate valid
+[Unicode identifiers](https://doc.rust-lang.org/nightly/reference/identifiers.html).
+Essentially, it replaces invalid identifier characters by underscores `"_"`.
+More precisely:
+
+1. Characters without the property `XID_Continue` are replaced by `"_"`. The set
+   of `XID_Continue` characters in ASCII is `[0-9A-Z_a-z]`.
+1. Next, if the first character does not have the property `XID_Start`, then
+   `"_"` is prepended unless the first character is already `"_"`. The set of
+   `XID_Start` characters in ASCII is `[A-Za-z]`.
+1. Finally, if the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`,
+   then `"_"` is appended.
 
 Names are further adjusted to respect naming conventions in the default case:
 
 - Lowercase for folders (because they map to module names).
 - Uppercase for filenames (because they map to static variables).
 
+Note that non-ASCII identifiers are only supported from Rust 1.53.0. For earlier
+versions, the sanitization here may generate invalid identifiers if you use
+non-ASCII paths, in which case you need to manually rename the affected files.
+
 ### Portable file paths
 
 To prevent issues when developing on different platforms, any paths in your
diff --git a/src/generate_view/sanitize_name.rs b/src/generate_view/sanitize_name.rs
index 6f7de4c..6bad2a0 100644
--- a/src/generate_view/sanitize_name.rs
+++ b/src/generate_view/sanitize_name.rs
@@ -21,7 +21,7 @@ fn sanitize_by_convention(name: &str, convention: Convention) -> String {
 fn sanitize_special_characters(name: &str) -> String {
     name.chars()
         .map(|character| {
-            if character.is_ascii_alphanumeric() {
+            if unicode_xid::UnicodeXID::is_xid_continue(character) {
                 character
             } else {
                 '_'
@@ -32,14 +32,14 @@ fn sanitize_special_characters(name: &str) -> String {
 
 fn sanitize_first_character(name: String) -> String {
     match name.chars().next() {
-        Some(first_character) if first_character.is_numeric() => format!("_{}", name),
-        _ => name,
+        Some(first_character) if unicode_xid::UnicodeXID::is_xid_start(first_character) => name,
+        Some('_') => name,
+        _ => format!("_{}", name),
     }
 }
 
 fn sanitize_special_cases(name: String) -> String {
     match name.as_ref() {
-        "" => String::from("__"),
         "_" | "crate" | "self" | "Self" | "super" => format!("{}_", name),
         _ => name,
     }
@@ -60,33 +60,65 @@ mod tests {
 
     #[test]
     fn handles_convention_of_screaming_snake_case() {
-        let actual = main("README.md", Convention::ScreamingSnakeCase);
+        let actual = main("README_ß_ŉ.md", Convention::ScreamingSnakeCase);
 
-        let expected = quote::format_ident!("r#README_MD");
+        let expected = quote::format_ident!("r#README_SS_ʼN_MD");
         assert_eq!(actual, expected);
     }
 
     #[test]
     fn handles_convention_of_snake_case() {
-        let actual = main("README.md", Convention::SnakeCase);
+        let actual = main("README_ß_ŉ.md", Convention::SnakeCase);
 
-        let expected = quote::format_ident!("r#readme_md");
+        let expected = quote::format_ident!("r#readme_ß_ŉ_md");
         assert_eq!(actual, expected);
     }
 
     #[test]
     fn handles_special_characters() {
-        let actual = main("A B##C_D±EÅF𝟙G.H", Convention::ScreamingSnakeCase);
+        let actual = main("_0 1##2$3±4√5👽6.7", stubs::convention());
+
+        let expected = quote::format_ident!("r#_0_1__2_3_4_5_6_7");
+        assert_eq!(actual, expected);
+    }
+
+    #[test]
+    fn handles_non_ascii_identifiers() {
+        let actual = main("åb_π_𝟙", Convention::SnakeCase);
+
+        let expected = quote::format_ident!("r#åb_π_𝟙");
+        assert_eq!(actual, expected);
+    }
+
+    #[test]
+    fn handles_first_character_if_xid_start() {
+        let actual = main("a", Convention::SnakeCase);
+
+        let expected = quote::format_ident!("r#a");
+        assert_eq!(actual, expected);
+    }
+
+    #[test]
+    fn handles_first_character_if_underscore() {
+        let actual = main("_2", stubs::convention());
+
+        let expected = quote::format_ident!("r#_2");
+        assert_eq!(actual, expected);
+    }
+
+    #[test]
+    fn handles_first_character_if_xid_continue_but_not_xid_start() {
+        let actual = main("3", stubs::convention());
 
-        let expected = quote::format_ident!("r#A_B__C_D_E_F_G_H");
+        let expected = quote::format_ident!("r#_3");
         assert_eq!(actual, expected);
     }
 
     #[test]
-    fn handles_first_character() {
-        let actual = main("2a", Convention::SnakeCase);
+    fn handles_first_character_if_not_xid_continue() {
+        let actual = main(".4", stubs::convention());
 
-        let expected = quote::format_ident!("r#_2a");
+        let expected = quote::format_ident!("r#_4");
         assert_eq!(actual, expected);
     }
 
diff --git a/src/lib.rs b/src/lib.rs
index 38a0f37..7c327ed 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -161,20 +161,31 @@
 //!
 //! ## Name sanitization
 //!
-//! When generating identifiers based on paths, names are sanitized as follows to
-//! ensure they are
-//! [valid identifiers](https://doc.rust-lang.org/reference/identifiers.html):
-//!
-//! - Characters other than ASCII alphanumericals are replaced by `"_"`.
-//! - If the first character is numeric, then `"_"` is prepended.
-//! - If the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`, then `"_"`
-//!   is appended.
+//! When generating identifiers based on paths, names are sanitized. For example, a
+//! folder name `.my-assets` is sanitized to an identifier `_my_assets`.
+//!
+//! The sanitization process is designed to generate valid
+//! [Unicode identifiers](https://doc.rust-lang.org/nightly/reference/identifiers.html).
+//! Essentially, it replaces invalid identifier characters by underscores `"_"`.
+//! More precisely:
+//!
+//! 1. Characters without the property `XID_Continue` are replaced by `"_"`. The set
+//!    of `XID_Continue` characters in ASCII is `[0-9A-Z_a-z]`.
+//! 1. Next, if the first character does not have the property `XID_Start`, then
+//!    `"_"` is prepended unless the first character is already `"_"`. The set of
+//!    `XID_Start` characters in ASCII is `[A-Za-z]`.
+//! 1. Finally, if the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`,
+//!    then `"_"` is appended.
 //!
 //! Names are further adjusted to respect naming conventions in the default case:
 //!
 //! - Lowercase for folders (because they map to module names).
 //! - Uppercase for filenames (because they map to static variables).
 //!
+//! Note that non-ASCII identifiers are only supported from Rust 1.53.0. For earlier
+//! versions, the sanitization here may generate invalid identifiers if you use
+//! non-ASCII paths, in which case you need to manually rename the affected files.
+//!
 //! ## Portable file paths
 //!
 //! To prevent issues when developing on different platforms, any paths in your