diff --git a/NEWS.md b/NEWS.md
index 6e0b00c92f041..60eb4a5ed06b4 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -90,6 +90,10 @@ Standard library changes
 
 #### UUIDs
 
+#### Unicode
+
+* `graphemes(s, m:n)` returns a substring of the `m`-th to `n`-th graphemes in `s` ([#44266]).
+
 #### Mmap
 
 #### DelimitedFiles
diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
index e31f7ee1e27f2..0467a8d50aa6b 100644
--- a/stdlib/Unicode/src/Unicode.jl
+++ b/stdlib/Unicode/src/Unicode.jl
@@ -143,6 +143,82 @@ letter combined with an accent mark is a single grapheme.)
 """
 graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
 
+"""
+    graphemes(s::AbstractString, m:n) -> SubString
+
+Return a [`SubString`](@ref) of `s` consisting of the `m`-th through `n`-th
+graphemes of the string `s`, where the second argument `m:n` is an
+integer-valued [`AbstractUnitRange`](@ref).
+
+Informally, the result holds the `m:n`-th user-perceived "characters"
+of the string.  For example:
+
+```jldoctest
+julia> s = graphemes("exposé", 3:6)
+"posé"
+
+julia> collect(s)
+5-element Vector{Char}:
+ 'p': ASCII/Unicode U+0070 (category Ll: Letter, lowercase)
+ 'o': ASCII/Unicode U+006F (category Ll: Letter, lowercase)
+ 's': ASCII/Unicode U+0073 (category Ll: Letter, lowercase)
+ 'e': ASCII/Unicode U+0065 (category Ll: Letter, lowercase)
+ '́': Unicode U+0301 (category Mn: Mark, nonspacing)
+```
+The substring covers the 3rd to *7th* codepoints ([`Char`](@ref)s) of
+`"exposé"`, since the grapheme `"é"` consists of *two* Unicode codepoints
+(an `'e'` followed by an acute-accent combining character U+0301).
+
+Grapheme boundaries can only be found by iterating over the string
+contents, so `graphemes(s, m:n)` takes time proportional to the number
+of codepoints in the string before the end of the substring.
+
+An [`ArgumentError`](@ref) is thrown if `m < 1`.  Otherwise, an empty range
+yields an empty substring, and a range extending past the last grapheme of
+`s` throws a [`BoundsError`](@ref).
+
+!!! compat "Julia 1.9"
+    The `m:n` argument of `graphemes` requires Julia 1.9.
+"""
+function graphemes(s::AbstractString, r::AbstractUnitRange{<:Integer})
+    m = Int(first(r))
+    n = Int(last(r))
+    if m ≤ 0
+        throw(ArgumentError("starting index $m is not ≥ 1"))
+    end
+    if n < m
+        # an empty range selects nothing: return an empty substring
+        return @view s[1:0]
+    end
+    isbreak! = Base.Unicode.isgraphemebreak!
+    prevchar = eltype(s)(0x00000000)
+    breakstate = Ref{Int32}(0)
+    nseen = 0 # graphemes whose first codepoint has been consumed
+    pos, stop = 1, lastindex(s)
+    lastpos = pos
+    # scan until the first codepoint of the m-th grapheme has been consumed
+    while pos ≤ stop && nseen < m
+        @inbounds ch = s[pos]
+        nseen += isbreak!(breakstate, prevchar, ch)
+        prevchar = ch
+        lastpos = pos
+        pos = nextind(s, pos)
+    end
+    nseen < m && throw(BoundsError(s, pos))
+    firstpos = lastpos
+    # keep scanning until the (n+1)-th grapheme starts or the string ends
+    while pos ≤ stop
+        @inbounds ch = s[pos]
+        nseen += isbreak!(breakstate, prevchar, ch)
+        nseen > n && break
+        prevchar = ch
+        lastpos = pos
+        pos = nextind(s, pos)
+    end
+    nseen < n && throw(BoundsError(s, pos))
+    return @view s[firstpos:lastpos]
+end
+
 using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
 
 function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
diff --git a/stdlib/Unicode/test/runtests.jl b/stdlib/Unicode/test/runtests.jl
index a4faac2bd3ba9..1d1b78e02bf27 100644
--- a/stdlib/Unicode/test/runtests.jl
+++ b/stdlib/Unicode/test/runtests.jl
@@ -271,6 +271,16 @@ end
 
     @test Base.Unicode.isgraphemebreak('α', 'β')
     @test !Base.Unicode.isgraphemebreak('α', '\u0302')
+
+    for pre in ("","ä"), post in ("","x̂")
+        prelen = length(graphemes(pre))
+        @test graphemes(pre * "öü" * post, (1:2) .+ prelen) == "öü"
+        @test graphemes(pre * "ö" * post, (1:1) .+ prelen) == "ö"
+    end
+    @test graphemes("äöüx", 6:5)::SubString{String} == ""
+    @test_throws BoundsError graphemes("äöüx", 2:5)
+    @test_throws BoundsError graphemes("äöüx", 5:5)
+    @test_throws ArgumentError graphemes("äöüx", 0:1)
 end
 
 @testset "#3721, #6939 up-to-date character widths" begin