From 04e8fc07e15268158bbe310b7190bb162960ffe1 Mon Sep 17 00:00:00 2001 From: Benjamin Fischer Date: Mon, 24 May 2021 17:19:51 +0200 Subject: [PATCH] Respect non-ASCII identifiers in sanitization for clearer names See also Rust RFC 2457: https://github.com/rust-lang/rfcs/pull/2457 --- CHANGELOG.md | 8 +++++ Cargo.toml | 1 + README.md | 27 +++++++++----- src/generate_view/sanitize_name.rs | 58 +++++++++++++++++++++++------- src/lib.rs | 27 +++++++++----- 5 files changed, 92 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71c2f32..9bb6a41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ Notable library changes are documented here in a format based on ## Unreleased +### Changed + +- Respect Unicode identifiers in + [name sanitization](https://github.com/evolutics/iftree#name-sanitization). + If you only use ASCII file paths, then this change has no effect. Essentially, + non-ASCII characters that are valid in identifiers (from Rust 1.53.0) are + preserved instead of replaced by an underscore `"_"`. + ## 0.1.1 – 2021-05-14 ### Fixed diff --git a/Cargo.toml b/Cargo.toml index 42b148b..06ad29c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,6 +22,7 @@ quote = "1.0" serde = { version = "1.0", features = ["derive"] } syn = { version = "1.0", features = ["default", "extra-traits"] } toml = "0.5" +unicode-xid = "0.2" [dev-dependencies] actix-web = "3.3" diff --git a/README.md b/README.md index d1fa926..3373534 100644 --- a/README.md +++ b/README.md @@ -163,20 +163,31 @@ See ### Name sanitization -When generating identifiers based on paths, names are sanitized as follows to -ensure they are -[valid identifiers](https://doc.rust-lang.org/reference/identifiers.html): - -- Characters other than ASCII alphanumericals are replaced by `"_"`. -- If the first character is numeric, then `"_"` is prepended. -- If the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`, then `"_"` - is appended. +When generating identifiers based on paths, names are sanitized. For example, a +folder name `.my-assets` is sanitized to an identifier `_my_assets`. + +The sanitization process is designed to generate valid +[Unicode identifiers](https://doc.rust-lang.org/nightly/reference/identifiers.html). +Essentially, it replaces invalid identifier characters by underscores `"_"`. +More precisely: + +1. Characters without the property `XID_Continue` are replaced by `"_"`. The set + of `XID_Continue` characters in ASCII is `[0-9A-Z_a-z]`. +1. Next, if the first character does not have the property `XID_Start`, then + `"_"` is prepended unless the first character is already `"_"`. The set of + `XID_Start` characters in ASCII is `[A-Za-z]`. +1. Finally, if the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`, + then `"_"` is appended. Names are further adjusted to respect naming conventions in the default case: - Lowercase for folders (because they map to module names). - Uppercase for filenames (because they map to static variables). +Note that non-ASCII identifiers are only supported from Rust 1.53.0. For earlier +versions, the sanitization here may generate invalid identifiers if you use +non-ASCII paths, in which case you need to manually rename the affected files. + ### Portable file paths To prevent issues when developing on different platforms, any paths in your diff --git a/src/generate_view/sanitize_name.rs b/src/generate_view/sanitize_name.rs index 6f7de4c..6bad2a0 100644 --- a/src/generate_view/sanitize_name.rs +++ b/src/generate_view/sanitize_name.rs @@ -21,7 +21,7 @@ fn sanitize_by_convention(name: &str, convention: Convention) -> String { fn sanitize_special_characters(name: &str) -> String { name.chars() .map(|character| { - if character.is_ascii_alphanumeric() { + if unicode_xid::UnicodeXID::is_xid_continue(character) { character } else { '_' @@ -32,14 +32,14 @@ fn sanitize_special_characters(name: &str) -> String { fn sanitize_first_character(name: String) -> String { match name.chars().next() { - Some(first_character) if first_character.is_numeric() => format!("_{}", name), - _ => name, + Some(first_character) if unicode_xid::UnicodeXID::is_xid_start(first_character) => name, + Some('_') => name, + _ => format!("_{}", name), } } fn sanitize_special_cases(name: String) -> String { match name.as_ref() { - "" => String::from("__"), "_" | "crate" | "self" | "Self" | "super" => format!("{}_", name), _ => name, } @@ -60,33 +60,65 @@ mod tests { #[test] fn handles_convention_of_screaming_snake_case() { - let actual = main("README.md", Convention::ScreamingSnakeCase); + let actual = main("README_ß_ʼn.md", Convention::ScreamingSnakeCase); - let expected = quote::format_ident!("r#README_MD"); + let expected = quote::format_ident!("r#README_SS_ʼN_MD"); assert_eq!(actual, expected); } #[test] fn handles_convention_of_snake_case() { - let actual = main("README.md", Convention::SnakeCase); + let actual = main("README_ß_ʼn.md", Convention::SnakeCase); - let expected = quote::format_ident!("r#readme_md"); + let expected = quote::format_ident!("r#readme_ß_ʼn_md"); assert_eq!(actual, expected); } #[test] fn handles_special_characters() { - let actual = main("A B##C_D±EÅF𝟙G.H", Convention::ScreamingSnakeCase); + let actual = main("_0 1##2$3±4√5👽6.7", stubs::convention()); + + let expected = quote::format_ident!("r#_0_1__2_3_4_5_6_7"); + assert_eq!(actual, expected); + } + + #[test] + fn handles_non_ascii_identifiers() { + let actual = main("åb_π_𝟙", Convention::SnakeCase); + + let expected = quote::format_ident!("r#åb_π_𝟙"); + assert_eq!(actual, expected); + } + + #[test] + fn handles_first_character_if_xid_start() { + let actual = main("a", Convention::SnakeCase); + + let expected = quote::format_ident!("r#a"); + assert_eq!(actual, expected); + } + + #[test] + fn handles_first_character_if_underscore() { + let actual = main("_2", stubs::convention()); + + let expected = quote::format_ident!("r#_2"); + assert_eq!(actual, expected); + } + + #[test] + fn handles_first_character_if_xid_continue_but_not_xid_start() { + let actual = main("3", stubs::convention()); - let expected = quote::format_ident!("r#A_B__C_D_E_F_G_H"); + let expected = quote::format_ident!("r#_3"); assert_eq!(actual, expected); } #[test] - fn handles_first_character() { - let actual = main("2a", Convention::SnakeCase); + fn handles_first_character_if_not_xid_continue() { + let actual = main(".4", stubs::convention()); - let expected = quote::format_ident!("r#_2a"); + let expected = quote::format_ident!("r#_4"); assert_eq!(actual, expected); } diff --git a/src/lib.rs b/src/lib.rs index 38a0f37..7c327ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -161,20 +161,31 @@ //! //! ## Name sanitization //! -//! When generating identifiers based on paths, names are sanitized as follows to -//! ensure they are -//! [valid identifiers](https://doc.rust-lang.org/reference/identifiers.html): -//! -//! - Characters other than ASCII alphanumericals are replaced by `"_"`. -//! - If the first character is numeric, then `"_"` is prepended. -//! - If the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`, then `"_"` -//! is appended. +//! When generating identifiers based on paths, names are sanitized. For example, a +//! folder name `.my-assets` is sanitized to an identifier `_my_assets`. +//! +//! The sanitization process is designed to generate valid +//! [Unicode identifiers](https://doc.rust-lang.org/nightly/reference/identifiers.html). +//! Essentially, it replaces invalid identifier characters by underscores `"_"`. +//! More precisely: +//! +//! 1. Characters without the property `XID_Continue` are replaced by `"_"`. The set +//! of `XID_Continue` characters in ASCII is `[0-9A-Z_a-z]`. +//! 1. Next, if the first character does not have the property `XID_Start`, then +//! `"_"` is prepended unless the first character is already `"_"`. The set of +//! `XID_Start` characters in ASCII is `[A-Za-z]`. +//! 1. Finally, if the name is `"_"`, `"crate"`, `"self"`, `"Self"`, or `"super"`, +//! then `"_"` is appended. //! //! Names are further adjusted to respect naming conventions in the default case: //! //! - Lowercase for folders (because they map to module names). //! - Uppercase for filenames (because they map to static variables). //! +//! Note that non-ASCII identifiers are only supported from Rust 1.53.0. For earlier +//! versions, the sanitization here may generate invalid identifiers if you use +//! non-ASCII paths, in which case you need to manually rename the affected files. +//! //! ## Portable file paths //! //! To prevent issues when developing on different platforms, any paths in your