diff --git a/NEWS.md b/NEWS.md index aaff6334190ad..12425749d1553 100644 --- a/NEWS.md +++ b/NEWS.md @@ -124,6 +124,11 @@ Standard library changes #### Unicode * Added function `isequal_normalized` to check for Unicode equivalence without explicitly constructing normalized strings ([#42493]). +* The `Unicode.normalize` function now accepts a `chartransform` keyword that can + be used to supply custom character mappings, and a `Unicode.julia_chartransform` + function is provided to reproduce the mapping used in identifier normalization + by the Julia parser ([#42561]). + Deprecated or removed --------------------- diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index 6e147194c5910..e687d94365c4a 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -145,20 +145,43 @@ const UTF8PROC_STRIPMARK = (1<<13) utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result))) -function utf8proc_map(str::Union{String,SubString{String}}, options::Integer) - nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint), - str, sizeof(str), C_NULL, 0, options) - nwords < 0 && utf8proc_error(nwords) +# static wrapper around user callback function +utf8proc_custom_func(codepoint::UInt32, callback::Any) = + UInt32(callback(codepoint))::UInt32 + +function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity)) + ret = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint), + str, sizeof(str), buffer, nwords, options) + ret < 0 && utf8proc_error(ret) + return ret +end +function utf8proc_decompose(str, options, buffer, nwords, chartransform::T) where T + ret = ccall(:utf8proc_decompose_custom, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{T}), + str, sizeof(str), buffer, nwords, options, + @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{T})), chartransform) + ret < 0 && utf8proc_error(ret) + return ret +end + +function 
utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform=identity) + nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform) buffer = Base.StringVector(nwords*4) - nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint), - str, sizeof(str), buffer, nwords, options) - nwords < 0 && utf8proc_error(nwords) + nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform) nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options) nbytes < 0 && utf8proc_error(nbytes) return String(resize!(buffer, nbytes)) end -utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags) +# from julia_charmap.h, used by julia_chartransform in the Unicode stdlib +const _julia_charmap = Dict{UInt32,UInt32}( + 0x025B => 0x03B5, + 0x00B5 => 0x03BC, + 0x00B7 => 0x22C5, + 0x0387 => 0x22C5, + 0x2212 => 0x002D, +) + +utf8proc_map(s::AbstractString, flags::Integer, chartransform=identity) = utf8proc_map(String(s), flags, chartransform) # Documented in Unicode module function normalize( @@ -176,6 +199,7 @@ function normalize( casefold::Bool=false, lump::Bool=false, stripmark::Bool=false, + chartransform=identity, ) flags = 0 stable && (flags = flags | UTF8PROC_STABLE) @@ -198,7 +222,7 @@ function normalize( casefold && (flags = flags | UTF8PROC_CASEFOLD) lump && (flags = flags | UTF8PROC_LUMP) stripmark && (flags = flags | UTF8PROC_STRIPMARK) - utf8proc_map(s, flags) + utf8proc_map(s, flags, chartransform) end function normalize(s::AbstractString, nf::Symbol) diff --git a/src/flisp/julia_charmap.h b/src/flisp/julia_charmap.h index 59f408ce012c9..3c54eaf98f484 100644 --- a/src/flisp/julia_charmap.h +++ b/src/flisp/julia_charmap.h @@ -1,6 +1,9 @@ /* Array of {original codepoint, replacement codepoint} normalizations to perform on Julia identifiers, to canonicalize characters that - are both easily confused and easily inputted by accident. 
*/ + are both easily confused and easily inputted by accident. + + Important: when this table is updated, also update the corresponding table + in base/strings/unicode.jl */ static const uint32_t charmap[][2] = { { 0x025B, 0x03B5 }, // latin small letter open e -> greek small letter epsilon { 0x00B5, 0x03BC }, // micro sign -> greek small letter mu diff --git a/stdlib/Unicode/docs/src/index.md b/stdlib/Unicode/docs/src/index.md index 9d0cf781925d5..2771c8a9f01cc 100644 --- a/stdlib/Unicode/docs/src/index.md +++ b/stdlib/Unicode/docs/src/index.md @@ -1,6 +1,7 @@ # Unicode ```@docs +Unicode.julia_chartransform Unicode.isassigned Unicode.isequal_normalized Unicode.normalize diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl index 29b58d4dda60d..0068f17f77090 100644 --- a/stdlib/Unicode/src/Unicode.jl +++ b/stdlib/Unicode/src/Unicode.jl @@ -4,6 +4,50 @@ module Unicode export graphemes, isequal_normalized +""" + Unicode.julia_chartransform(c::Union{Char,Integer}) + +Map the Unicode character (`Char`) or codepoint (`Integer`) `c` to the corresponding +"equivalent" character or codepoint, respectively, according to the custom equivalence +used within the Julia parser (in addition to NFC normalization). 
+ +For example, `'µ'` (U+00B5 micro) is treated as equivalent to `'μ'` (U+03BC mu) by +Julia's parser, so `julia_chartransform` performs this transformation while leaving +other characters unchanged: +```jldoctest +julia> Unicode.julia_chartransform('\u00B5') +'μ': Unicode U+03BC (category Ll: Letter, lowercase) + +julia> Unicode.julia_chartransform('x') +'x': ASCII/Unicode U+0078 (category Ll: Letter, lowercase) +``` + +`julia_chartransform` is mainly useful for passing to the [`Unicode.normalize`](@ref) +function in order to mimic the normalization used by the Julia parser: +```jl +julia> s = "\u00B5o\u0308" +"µö" + +julia> s2 = Unicode.normalize(s, compose=true, stable=true, chartransform=Unicode.julia_chartransform) +"μö" + +julia> collect(s2) +2-element Vector{Char}: + 'μ': Unicode U+03BC (category Ll: Letter, lowercase) + 'ö': Unicode U+00F6 (category Ll: Letter, lowercase) + +julia> s2 == string(Meta.parse(s)) +true +``` + +!!! compat "Julia 1.8" + This function was introduced in Julia 1.8. +""" +function julia_chartransform end +julia_chartransform(codepoint::UInt32) = get(Base.Unicode._julia_charmap, codepoint, codepoint) +julia_chartransform(codepoint::Integer) = julia_chartransform(UInt32(codepoint)) +julia_chartransform(char::Char) = Char(julia_chartransform(UInt32(char))) + """ Unicode.normalize(s::AbstractString; keywords...) Unicode.normalize(s::AbstractString, normalform::Symbol) @@ -42,6 +86,13 @@ options (which all default to `false` except for `compose`) are specified: * `rejectna=true`: throw an error if unassigned code points are found * `stable=true`: enforce Unicode versioning stability (never introduce characters missing from earlier Unicode versions) +You can also use the `chartransform` keyword (which defaults to `identity`) to pass an arbitrary +*function* mapping `Integer` codepoints to codepoints, which is called on each +character in `s` as it is processed, in order to perform arbitrary additional normalizations. 
+For example, by passing `chartransform=Unicode.julia_chartransform`, you can apply a few Julia-specific +character normalizations that are performed by Julia when parsing identifiers (in addition to +NFC normalization: `compose=true, stable=true`). + For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`. # Examples @@ -58,6 +109,9 @@ julia> Unicode.normalize("JuLiA", casefold=true) julia> Unicode.normalize("JúLiA", stripmark=true) "JuLiA" ``` + +!!! compat "Julia 1.8" + The `chartransform` keyword argument requires Julia 1.8. """ function normalize end normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf) @@ -98,12 +152,16 @@ function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, end """ - isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false) + isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity) Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`, ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks and other combining characters. +As with [`Unicode.normalize`](@ref), you can also pass an arbitrary +function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints) +to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref). 
+ # Examples For example, the string `"noël"` can be constructed in two canonically equivalent ways @@ -130,7 +188,7 @@ julia> isequal_normalized(s1, "NOËL", casefold=true) true ``` """ -function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false) +function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity) function decompose_next_char!(c, state, d, options, s) n = _decompose_char!(c, d, options) if n > length(d) # may be possible in future Unicode versions? @@ -148,11 +206,11 @@ function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bo while true if j1 > n1 i1 === nothing && return i2 === nothing && j2 > n2 - j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1) + j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1) end if j2 > n2 i2 === nothing && return false - j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2) + j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2) end d1[j1] == d2[j2] || return false j1 += 1; j2 += 1 diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl index 12348a18d5b15..a4faac2bd3ba9 100644 --- a/stdlib/Unicode/test/runtests.jl +++ b/stdlib/Unicode/test/runtests.jl @@ -2,7 +2,7 @@ using Test using Unicode -using Unicode: normalize, isassigned +using Unicode: normalize, isassigned, julia_chartransform @testset "string normalization" begin # normalize (Unicode normalization etc.): @@ -25,6 +25,11 @@ using Unicode: normalize, isassigned @test normalize("\t\r", stripcc=true) == " " @test normalize("\t\r", stripcc=true, newline2ls=true) == " \u2028" @test normalize("\u0072\u0307\u0323", :NFC) == "\u1E5B\u0307" #26917 + + # julia_chartransform identifier normalization + @test normalize("julia\u025B\u00B5\u00B7\u0387\u2212", chartransform=julia_chartransform) == + 
"julia\u03B5\u03BC\u22C5\u22C5\u002D" + @test julia_chartransform('\u00B5') === '\u03BC' end @testset "unicode sa#15" begin @@ -428,4 +433,5 @@ end @test !isequal_normalized("no\u00EBl", "noel") @test isequal_normalized("no\u00EBl", "noel", stripmark=true) @test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true) + @test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform) end