Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added escaping to ActiveEnum to allow for non-UAX#31 chars, as well as camel_case related conflicts. #1374

Merged
3 changes: 2 additions & 1 deletion sea-orm-macros/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ syn = { version = "1", default-features = false, features = ["parsing", "proc-ma
quote = { version = "1", default-features = false }
heck = { version = "0.4", default-features = false }
proc-macro2 = { version = "1", default-features = false }
unicode-ident = { version = "1" }

[dev-dependencies]
sea-orm = { path = "../", features = ["macros"] }
sea-orm = { path = "../", features = ["macros", "tests-cfg"] }
serde = { version = "1.0", features = ["derive"] }

[features]
Expand Down
9 changes: 4 additions & 5 deletions sea-orm-macros/src/derives/active_enum.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use super::util::camel_case_with_escaped_non_uax31;
use heck::ToUpperCamelCase;
use proc_macro2::TokenStream;
use quote::{format_ident, quote, quote_spanned};
Expand Down Expand Up @@ -247,11 +248,9 @@ impl ActiveEnum {
let enum_variants: Vec<syn::Ident> = str_variants
.iter()
.map(|v| {
if v.chars().next().map(char::is_numeric).unwrap_or(false) {
format_ident!("_{}", v)
} else {
format_ident!("{}", v.to_upper_camel_case())
}
let v_cleaned = camel_case_with_escaped_non_uax31(v);

format_ident!("{}", v_cleaned)
})
.collect();

Expand Down
131 changes: 131 additions & 0 deletions sea-orm-macros/src/derives/util.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use heck::ToUpperCamelCase;
use quote::format_ident;
use syn::{punctuated::Punctuated, token::Comma, Field, Ident, Meta};

Expand Down Expand Up @@ -54,6 +55,97 @@ where
}
}

/// Convert a string to PascalCase, hex-escaping every character that would
/// otherwise be lost or rejected when the result is used as an identifier.
///
/// (`camel_case` appears in the name to match the naming used by heck.)
///
/// In ActiveEnum, each `string_value` is PascalCased and turned into an
/// identifier in `{Enum}Variant`. Rust only accepts an XID_Start character
/// followed by XID_Continue characters as an identifier, so a plain
/// PascalCase conversion runs into a few problems:
///
/// - `string_value = ""` will cause a panic;
/// - a `string_value` containing only non-alphanumerics becomes `""` and
///   causes the same panic;
/// - the `string_value`s `"A B"`, `"A  B"`, `"A_B"` and `"A_ B"` all end up
///   sharing the identifier `"AB"`.
///
/// This function performs the PascalCase conversion with these escapes:
///
/// - characters that are not valid at their position under Unicode Standard
///   Annex #31 are converted to their hex notation;
/// - `"_"` becomes `"0x5F"`;
/// - `" "` becomes `"0x20"`;
/// - the empty string becomes the special name `"__Empty"`;
/// - a result that starts with a numeric character is prefixed with `"_"`.
///
/// Note that this does NOT address case-sensitivity: the string values
/// `"ABC"` and `"abc"` still conflict after the PascalCase conversion.
///
/// Example conversions:
///
/// ```ignore
/// assert_eq!(camel_case_with_escaped_non_uax31(""), "__Empty");
/// assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x20");
/// assert_eq!(camel_case_with_escaped_non_uax31("  "), "_0x200x20");
/// assert_eq!(camel_case_with_escaped_non_uax31("_"), "_0x5F");
/// assert_eq!(camel_case_with_escaped_non_uax31("foobar"), "Foobar");
/// assert_eq!(camel_case_with_escaped_non_uax31("foo bar"), "Foo0x20bar");
/// ```
pub(crate) fn camel_case_with_escaped_non_uax31<T>(string: T) -> String
where
    T: ToString,
{
    // Escaped even where they would be legal identifier characters, because
    // the PascalCase conversion would otherwise swallow them.
    const ALWAYS_ESCAPED: [char; 2] = ['_', ' '];

    let source = string.to_string();
    if source.is_empty() {
        // Returned as-is, without PascalCasing: that step would strip the
        // leading underscores, and they are what keeps this special name
        // from colliding with any escaped value.
        return String::from("__Empty");
    }

    let mut escaped = String::new();
    for (index, ch) in source.chars().enumerate() {
        // The first character has to satisfy the stricter "start" rule.
        let valid_at_position = if index == 0 {
            unicode_ident::is_xid_start(ch)
        } else {
            unicode_ident::is_xid_continue(ch)
        };

        if valid_at_position && !ALWAYS_ESCAPED.contains(&ch) {
            escaped.push(ch);
        } else {
            // `{:#X}` renders the code point as `0x` + uppercase hex digits.
            escaped.push_str(&format!("{:#X}", ch as u32));
        }
    }

    let pascal = escaped.to_upper_camel_case();

    // An escape in first position makes the result start with `0`; prefix an
    // underscore so the result does not begin with a numeric character.
    match pascal.chars().next() {
        Some(first) if first.is_numeric() => format!("_{}", pascal),
        _ => pascal,
    }
}

pub(crate) const RAW_IDENTIFIER: &str = "r#";

pub(crate) const RUST_KEYWORDS: [&str; 49] = [
Expand All @@ -65,3 +157,42 @@ pub(crate) const RUST_KEYWORDS: [&str; 49] = [
];

pub(crate) const RUST_SPECIAL_KEYWORDS: [&str; 3] = ["crate", "Self", "self"];

#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the exact identifier text produced by
    /// `camel_case_with_escaped_non_uax31` for the empty string, escaped
    /// separators, plain words and inputs that start with a digit.
    #[test]
    fn test_non_uax31_escape() {
        // Test empty string
        assert_eq!(camel_case_with_escaped_non_uax31(""), "__Empty");

        // Test additional_chars_to_replace (to_camel_case related characters)
        assert_eq!(camel_case_with_escaped_non_uax31(" "), "_0x20");

        // Test additional_chars_to_replace (multiples; the two-space input
        // must stay distinct from the single-space one above)
        assert_eq!(camel_case_with_escaped_non_uax31("  "), "_0x200x20");

        // Test additional_chars_to_replace (underscores)
        assert_eq!(camel_case_with_escaped_non_uax31("_"), "_0x5F");

        // Test typical use case
        assert_eq!(camel_case_with_escaped_non_uax31("foobar"), "Foobar");

        // Test spaced words distinct from non-spaced
        assert_eq!(camel_case_with_escaped_non_uax31("foo bar"), "Foo0x20bar");

        // Test underscored words distinct from non-spaced and spaced
        assert_eq!(camel_case_with_escaped_non_uax31("foo_bar"), "Foo0x5Fbar");

        // Test leading numeric characters
        assert_eq!(camel_case_with_escaped_non_uax31("1"), "_0x31");

        // Test escaping also works on full string following lead numeric character
        // This was previously a fail condition.
        assert_eq!(
            camel_case_with_escaped_non_uax31("1 2 3"),
            "_0x310x2020x203"
        );
    }
}
3 changes: 3 additions & 0 deletions sea-orm-macros/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,9 @@ pub fn derive_active_model_behavior(input: TokenStream) -> TokenStream {
/// - For enum variant
/// - `string_value` or `num_value`:
/// - For `string_value`, value should be passed as string, i.e. `string_value = "A"`
/// - Due to the way internal Enums are automatically derived, the following restrictions apply:
/// - no two members may have `string_value`s that are identical when compared case-insensitively.
/// - in principle, a `string_value` must not match any future title-cased Rust keyword.
/// - For `num_value`, value should be passed as integer, i.e. `num_value = 1` or `num_value = 1i32`
/// - Note that only one of it can be specified, and all variants of an enum have to annotate with the same `*_value` macro attribute
///
Expand Down
85 changes: 85 additions & 0 deletions src/entity/active_enum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -408,4 +408,89 @@ mod tests {
test_fallback_uint!(U32Fallback, u32, "u32", "Integer", Integer);
test_fallback_uint!(U64Fallback, u64, "u64", "BigInteger", BigInteger);
}

#[test]
fn escaped_non_uax31() {
    // Part 1: string values containing punctuation, emoji, CJK text, leading
    // digits, or nothing at all must convert to and from their variants.
    #[derive(Debug, Clone, PartialEq, Eq, EnumIter, DeriveActiveEnum, Copy)]
    #[sea_orm(rs_type = "String", db_type = "Enum", enum_name = "pop_os_names_typos")]
    pub enum PopOSTypos {
        #[sea_orm(string_value = "Pop!_OS")]
        PopOSCorrect,
        #[sea_orm(string_value = "Pop\u{2757}_OS")]
        PopOSEmoji,
        #[sea_orm(string_value = "Pop!_操作系统")]
        PopOSChinese,
        #[sea_orm(string_value = "PopOS")]
        PopOSASCIIOnly,
        #[sea_orm(string_value = "Pop OS")]
        PopOSASCIIOnlyWithSpace,
        #[sea_orm(string_value = "Pop!OS")]
        PopOSNoUnderscore,
        #[sea_orm(string_value = "Pop_OS")]
        PopOSNoExclaimation,
        #[sea_orm(string_value = "!PopOS_")]
        PopOSAllOverThePlace,
        #[sea_orm(string_value = "Pop!_OS22.04LTS")]
        PopOSWithVersion,
        #[sea_orm(string_value = "22.04LTSPop!_OS")]
        PopOSWithVersionPrefix,
        #[sea_orm(string_value = "!_")]
        PopOSJustTheSymbols,
        #[sea_orm(string_value = "")]
        Nothing,
        // Known limitation, deliberately left disabled because it WILL fail:
        // both "PopOs" and "PopOS" would create the identifier "Popos".
        // #[sea_orm(string_value = "PopOs")]
        // PopOSLowerCase,
    }

    // Expected string values, listed in the same order as the variants above
    // so they can be paired with `PopOSTypos::iter()`.
    let expected_values = [
        "Pop!_OS",
        "Pop\u{2757}_OS",
        "Pop!_操作系统",
        "PopOS",
        "Pop OS",
        "Pop!OS",
        "Pop_OS",
        "!PopOS_",
        "Pop!_OS22.04LTS",
        "22.04LTSPop!_OS",
        "!_",
        "",
    ];
    PopOSTypos::iter()
        .zip(expected_values)
        .for_each(|(member, raw)| {
            // Variant -> string value, then string value -> variant.
            assert_eq!(member.to_value(), raw);
            assert_eq!(PopOSTypos::try_from_value(&raw.to_owned()), Ok(member));
        });

    // Part 2: string values that could be expected to collide must each get
    // their own generated variant identifier.
    #[derive(Clone, Debug, PartialEq, EnumIter, DeriveActiveEnum)]
    #[sea_orm(
        rs_type = "String",
        db_type = "String(None)",
        enum_name = "conflicting_string_values"
    )]
    pub enum ConflictingStringValues {
        #[sea_orm(string_value = "")]
        Member1,
        #[sea_orm(string_value = "$")]
        Member2,
        #[sea_orm(string_value = "$$")]
        Member3,
        #[sea_orm(string_value = "AB")]
        Member4,
        #[sea_orm(string_value = "A_B")]
        Member5,
        #[sea_orm(string_value = "A$B")]
        Member6,
        #[sea_orm(string_value = "0 123")]
        Member7,
    }
    type EnumVariant = ConflictingStringValuesVariant;

    // Each generated identifier paired with the string value it stands for.
    let identifier_table = [
        (EnumVariant::__Empty, ""),
        (EnumVariant::_0x24, "$"),
        (EnumVariant::_0x240x24, "$$"),
        (EnumVariant::Ab, "AB"),
        (EnumVariant::A0x5Fb, "A_B"),
        (EnumVariant::A0x24B, "A$B"),
        (EnumVariant::_0x300x20123, "0 123"),
    ];
    for (identifier, raw) in identifier_table {
        assert_eq!(identifier.to_string(), raw);
    }
}
}