Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add Unicode.isequal_normalized function #42493

Merged
merged 7 commits into from
Oct 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ Standard library changes
* The standard log levels `BelowMinLevel`, `Debug`, `Info`, `Warn`, `Error`,
and `AboveMaxLevel` are now exported from the Logging stdlib ([#40980]).

#### Unicode
* Added function `isequal_normalized` to check for Unicode equivalence without
explicitly constructing normalized strings ([#42493]).

Deprecated or removed
---------------------
Expand Down
1 change: 1 addition & 0 deletions stdlib/Unicode/docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

```@docs
Unicode.isassigned
Unicode.isequal_normalized
Unicode.normalize
Unicode.graphemes
```
72 changes: 71 additions & 1 deletion stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

module Unicode

export graphemes
export graphemes, isequal_normalized

"""
Unicode.normalize(s::AbstractString; keywords...)
Expand Down Expand Up @@ -89,4 +89,74 @@ letter combined with an accent mark is a single grapheme.)
"""
graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)

using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK

# Decompose a single `codepoint` into `dest` using utf8proc with the given
# `options` bit flags, and return the value reported by
# `utf8proc_decompose_char`. A negative return value is turned into an
# exception through `utf8proc_error`. Callers in this file treat a return value
# larger than `length(dest)` as "buffer too small" and retry after resizing.
function _decompose_char!(
    codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer
)
    capacity = length(dest)
    # The final argument (last boundary class) is not needed here, so pass NULL.
    count = @ccall utf8proc_decompose_char(
        codepoint::UInt32,
        dest::Ptr{UInt32},
        capacity::Int,
        options::Cint,
        C_NULL::Ptr{Cint},
    )::Int
    if count < 0
        utf8proc_error(count)
    end
    return count
end

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)

Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
and other combining characters.

# Examples

For example, the string `"noël"` can be constructed in two canonically equivalent ways
in Unicode, depending on whether `"ë"` is formed from a single codepoint U+00EB or
from the ASCII character `'e'` followed by the U+0308 combining-diaeresis character.

```jldoctest
julia> s1 = "no\u00EBl"
"noël"

julia> s2 = "noe\u0308l"
"noël"

julia> s1 == s2
false

julia> isequal_normalized(s1, s2)
true

julia> isequal_normalized(s1, "noel", stripmark=true)
true

julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(
    s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false
)
    # Decompose characters of `s`, starting from the iteration result `i`, into the
    # buffer `d` until one of them yields at least one codepoint or `s` is exhausted.
    # A character can decompose to *zero* codepoints (e.g. a standalone combining
    # mark when `stripmark=true`); such characters must be skipped rather than
    # compared, otherwise stale buffer contents would be read.
    #
    # Returns `(j, n, i)`: the next index to read in `d` (always 1), the number of
    # valid codepoints in `d`, and the iteration result for the next unread character.
    # `n == 0` on return implies `i === nothing`, i.e. `s` has no codepoints left.
    function decompose_next_chars!(i, d, options, s)
        n = 0
        while n == 0 && i !== nothing
            c = UInt32(i[1])
            n = _decompose_char!(c, d, options)
            if n > length(d) # may be possible in future Unicode versions?
                n = _decompose_char!(c, resize!(d, n), options)
            end
            i = iterate(s, i[2])
        end
        return 1, n, i
    end

    options = UTF8PROC_DECOMPOSE
    casefold && (options |= UTF8PROC_CASEFOLD)
    stripmark && (options |= UTF8PROC_STRIPMARK)

    i1, i2 = iterate(s1), iterate(s2)
    d1, d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
    n1 = n2 = 0 # lengths of codepoint buffers
    j1 = j2 = 1 # indices in d1, d2

    # NOTE(review): codepoints are compared in the order produced by per-character
    # decomposition; combining marks are not reordered here, so strings differing
    # only in the order of their combining marks may compare unequal — confirm
    # whether that is acceptable for callers.
    while true
        # Refill whichever buffer has been fully consumed.
        if j1 > n1
            j1, n1, i1 = decompose_next_chars!(i1, d1, options, s1)
        end
        if j2 > n2
            j2, n2, i2 = decompose_next_chars!(i2, d2, options, s2)
        end
        # A buffer that is still empty after refilling means that string is exhausted.
        done1, done2 = j1 > n1, j2 > n2
        (done1 || done2) && return done1 && done2
        d1[j1] == d2[j2] || return false
        j1 += 1
        j2 += 1
    end
end

end
12 changes: 12 additions & 0 deletions stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -417,3 +417,15 @@ end
@test prod(["*" for i in 1:3]) == "***"
@test prod(["*" for i in 1:0]) == ""
end

@testset "Unicode equivalence" begin
    # "noël" written with the precomposed codepoint U+00EB.
    composed = "no\u00EBl"

    # Canonical equivalence between the composed and decomposed spellings,
    # and sensitivity to an extra trailing character.
    @test isequal_normalized(composed, "noe\u0308l")
    @test !isequal_normalized(composed, "noe\u0308l ")

    # Empty-string edge cases.
    @test isequal_normalized("", "")
    @test !isequal_normalized("", " ")

    # Case only matters unless `casefold=true`.
    @test !isequal_normalized(composed, "NOËL")
    @test isequal_normalized(composed, "NOËL", casefold=true)

    # Diacritics only matter unless `stripmark=true`.
    @test !isequal_normalized(composed, "noel")
    @test isequal_normalized(composed, "noel", stripmark=true)

    # Both options combined.
    @test isequal_normalized(composed, "NOEL", stripmark=true, casefold=true)
end