Skip to content

Commit

Permalink
add Unicode.julia_chartransform Julia-parser normalization (#42561)
Browse files Browse the repository at this point in the history
  • Loading branch information
stevengj authored Oct 18, 2021
1 parent 1b64755 commit 50fcb03
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 15 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ Standard library changes
#### Unicode
* Added function `isequal_normalized` to check for Unicode equivalence without
explicitly constructing normalized strings ([#42493]).
* The `Unicode.normalize` function now accepts a `chartransform` keyword that can
be used to supply custom character mappings, and a `Unicode.julia_chartransform`
function is provided to reproduce the mapping used in identifier normalization
by the Julia parser ([#42561]).


Deprecated or removed
---------------------
Expand Down
42 changes: 33 additions & 9 deletions base/strings/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,20 +145,43 @@ const UTF8PROC_STRIPMARK = (1<<13)

utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)))

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), C_NULL, 0, options)
nwords < 0 && utf8proc_error(nwords)
# Trampoline with a fixed signature that `utf8proc_decompose` hands to C via
# `@cfunction`: it invokes the user-supplied `callback` on `codepoint` and
# converts whatever comes back to a `UInt32` codepoint.
function utf8proc_custom_func(codepoint::UInt32, callback::Any)
    mapped = callback(codepoint)
    return UInt32(mapped)::UInt32
end

# Method selected when no custom character mapping is requested
# (`chartransform` is `identity`): forwards directly to the plain
# `utf8proc_decompose` C routine. A negative result from the C call is turned
# into a Julia error via `utf8proc_error`; otherwise the result is returned.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity))
    nresult = ccall(:utf8proc_decompose, Int,
                    (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                    str, sizeof(str), buffer, nwords, options)
    if nresult < 0
        utf8proc_error(nresult)
    end
    return nresult
end
# Method used for any other `chartransform`: calls `utf8proc_decompose_custom`,
# passing a C-callable pointer to `utf8proc_custom_func` together with a
# reference to `chartransform` as the extra callback argument. A negative
# result from the C call is turned into a Julia error via `utf8proc_error`;
# otherwise the result is returned.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::T) where T
    # C function pointer specialized on the callback type `T`
    callback_ptr = @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{T}))
    nresult = ccall(:utf8proc_decompose_custom, Int,
                    (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{T}),
                    str, sizeof(str), buffer, nwords, options,
                    callback_ptr, chartransform)
    if nresult < 0
        utf8proc_error(nresult)
    end
    return nresult
end

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform)
buffer = Base.StringVector(nwords*4)
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), buffer, nwords, options)
nwords < 0 && utf8proc_error(nwords)
nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform)
nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
nbytes < 0 && utf8proc_error(nbytes)
return String(resize!(buffer, nbytes))
end

utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
# from julia_charmap.h, used by julia_chartransform in the Unicode stdlib
# Each entry maps an original codepoint to its replacement codepoint.
# Keep this table in sync with the `charmap` array in src/flisp/julia_charmap.h.
const _julia_charmap = Dict{UInt32,UInt32}(
0x025B => 0x03B5, # latin small letter open e -> greek small letter epsilon
0x00B5 => 0x03BC, # micro sign -> greek small letter mu
0x00B7 => 0x22C5, # middle dot -> dot operator
0x0387 => 0x22C5, # greek ano teleia -> dot operator
0x2212 => 0x002D, # minus sign -> hyphen-minus
)

# Fallback for arbitrary `AbstractString`s: convert to `String` and defer to
# the `utf8proc_map` method for `String`/`SubString{String}`.
function utf8proc_map(s::AbstractString, flags::Integer, chartransform=identity)
    return utf8proc_map(String(s), flags, chartransform)
end

# Documented in Unicode module
function normalize(
Expand All @@ -176,6 +199,7 @@ function normalize(
casefold::Bool=false,
lump::Bool=false,
stripmark::Bool=false,
chartransform=identity,
)
flags = 0
stable && (flags = flags | UTF8PROC_STABLE)
Expand All @@ -198,7 +222,7 @@ function normalize(
casefold && (flags = flags | UTF8PROC_CASEFOLD)
lump && (flags = flags | UTF8PROC_LUMP)
stripmark && (flags = flags | UTF8PROC_STRIPMARK)
utf8proc_map(s, flags)
utf8proc_map(s, flags, chartransform)
end

function normalize(s::AbstractString, nf::Symbol)
Expand Down
5 changes: 4 additions & 1 deletion src/flisp/julia_charmap.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/* Array of {original codepoint, replacement codepoint} normalizations
to perform on Julia identifiers, to canonicalize characters that
are both easily confused and easily inputted by accident. */
are both easily confused and easily inputted by accident.
Important: when this table is updated, also update the corresponding table
in base/strings/unicode.jl */
static const uint32_t charmap[][2] = {
{ 0x025B, 0x03B5 }, // latin small letter open e -> greek small letter epsilon
{ 0x00B5, 0x03BC }, // micro sign -> greek small letter mu
Expand Down
1 change: 1 addition & 0 deletions stdlib/Unicode/docs/src/index.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Unicode

```@docs
Unicode.julia_chartransform
Unicode.isassigned
Unicode.isequal_normalized
Unicode.normalize
Expand Down
66 changes: 62 additions & 4 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,50 @@ module Unicode

export graphemes, isequal_normalized

"""
Unicode.julia_chartransform(c::Union{Char,Integer})
Map the Unicode character (`Char`) or codepoint (`Integer`) `c` to the corresponding
"equivalent" character or codepoint, respectively, according to the custom equivalence
used within the Julia parser (in addition to NFC normalization).
For example, `'µ'` (U+00B5 micro) is treated as equivalent to `'μ'` (U+03BC mu) by
Julia's parser, so `julia_chartransform` performs this transformation while leaving
other characters unchanged:
```jldoctest
julia> Unicode.julia_chartransform('\u00B5')
'μ': Unicode U+03BC (category Ll: Letter, lowercase)
julia> Unicode.julia_chartransform('x')
'x': ASCII/Unicode U+0078 (category Ll: Letter, lowercase)
```
`julia_chartransform` is mainly useful for passing to the [`Unicode.normalize`](@ref)
function in order to mimic the normalization used by the Julia parser:
```jl
julia> s = "\u00B5o\u0308"
"µö"
julia> s2 = Unicode.normalize(s, compose=true, stable=true, chartransform=Unicode.julia_chartransform)
"μö"
julia> collect(s2)
2-element Vector{Char}:
'μ': Unicode U+03BC (category Ll: Letter, lowercase)
'ö': Unicode U+00F6 (category Ll: Letter, lowercase)
julia> s2 == string(Meta.parse(s))
true
```
!!! compat "Julia 1.8"
This function was introduced in Julia 1.8.
"""
function julia_chartransform end

# Core lookup: return the replacement from `Base.Unicode._julia_charmap`, or the
# codepoint itself when the table has no entry for it.
function julia_chartransform(codepoint::UInt32)
    return get(Base.Unicode._julia_charmap, codepoint, codepoint)
end

# Any other integer is converted to `UInt32` first, so the result is a `UInt32`.
function julia_chartransform(codepoint::Integer)
    return julia_chartransform(UInt32(codepoint))
end

# Characters go through their codepoint and come back as a `Char`.
function julia_chartransform(char::Char)
    return Char(julia_chartransform(UInt32(char)))
end

"""
Unicode.normalize(s::AbstractString; keywords...)
Unicode.normalize(s::AbstractString, normalform::Symbol)
Expand Down Expand Up @@ -42,6 +86,13 @@ options (which all default to `false` except for `compose`) are specified:
* `rejectna=true`: throw an error if unassigned code points are found
* `stable=true`: enforce Unicode versioning stability (never introduce characters missing from earlier Unicode versions)
You can also use the `chartransform` keyword (which defaults to `identity`) to pass an arbitrary
*function* mapping `Integer` codepoints to codepoints, which is called on each
character in `s` as it is processed, in order to perform arbitrary additional normalizations.
For example, by passing `chartransform=Unicode.julia_chartransform`, you can apply a few Julia-specific
character normalizations that are performed by Julia when parsing identifiers (in addition to
NFC normalization: `compose=true, stable=true`).
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
# Examples
Expand All @@ -58,6 +109,9 @@ julia> Unicode.normalize("JuLiA", casefold=true)
julia> Unicode.normalize("JúLiA", stripmark=true)
"JuLiA"
```
!!! compat "Julia 1.8"
The `chartransform` keyword argument requires Julia 1.8.
"""
function normalize end
normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
Expand Down Expand Up @@ -98,12 +152,16 @@ function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32},
end

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
and other combining characters.
As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
# Examples
For example, the string `"noël"` can be constructed in two canonically equivalent ways
Expand All @@ -130,7 +188,7 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
function decompose_next_char!(c, state, d, options, s)
n = _decompose_char!(c, d, options)
if n > length(d) # may be possible in future Unicode versions?
Expand All @@ -148,11 +206,11 @@ function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bo
while true
if j1 > n1
i1 === nothing && return i2 === nothing && j2 > n2
j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1)
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
end
if j2 > n2
i2 === nothing && return false
j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2)
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
end
d1[j1] == d2[j2] || return false
j1 += 1; j2 += 1
Expand Down
8 changes: 7 additions & 1 deletion stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

using Test
using Unicode
using Unicode: normalize, isassigned
using Unicode: normalize, isassigned, julia_chartransform

@testset "string normalization" begin
# normalize (Unicode normalization etc.):
Expand All @@ -25,6 +25,11 @@ using Unicode: normalize, isassigned
@test normalize("\t\r", stripcc=true) == " "
@test normalize("\t\r", stripcc=true, newline2ls=true) == " \u2028"
@test normalize("\u0072\u0307\u0323", :NFC) == "\u1E5B\u0307" #26917

# julia_chartransform identifier normalization
@test normalize("julia\u025B\u00B5\u00B7\u0387\u2212", chartransform=julia_chartransform) ==
"julia\u03B5\u03BC\u22C5\u22C5\u002D"
@test julia_chartransform('\u00B5') === '\u03BC'
end

@testset "unicode sa#15" begin
Expand Down Expand Up @@ -428,4 +433,5 @@ end
@test !isequal_normalized("no\u00EBl", "noel")
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
end

7 comments on commit 50fcb03

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Executing the daily package evaluation, I will reply here when finished:

@nanosoldier runtests(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your package evaluation job has completed - possible new issues were detected. A full report can be found here.

@vtjnash
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanosoldier runbenchmarks(ALL, isdaily = true)

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

@vtjnash
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nanosoldier runbenchmarks(("array" && ("index" || "bool")) || "allocation elision view", vs="@1389c2fc4af952f5c8b9759cf6fe633995b523f9")

@nanosoldier
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your benchmark job has completed - possible performance regressions were detected. A full report can be found here.

@vtjnash
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well that seems bad. Anyone have time to investigate?

Please sign in to comment.