Skip to content

Commit

Permalink
ncodeunits(c::Char): fast equivalent of ncodeunits(string(c))
Browse files Browse the repository at this point in the history
  • Loading branch information
StefanKarpinski committed Sep 13, 2018
1 parent 59dba6f commit b931684
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
9 changes: 9 additions & 0 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ Char
# Generic conversion from any character to a numeric or character type `T`:
# look up the character's code point and hand it to `T`'s own constructor.
function (::Type{T})(x::AbstractChar) where {T<:Union{Number,AbstractChar}}
    return T(codepoint(x))
end
# Converting a character to its own type is the identity: return it unchanged.
function (::Type{T})(x::T) where {T<:AbstractChar}
    return x
end

"""
    ncodeunits(c::Char) -> Int

Return how many UTF-8 code units (bytes) are needed to encode the character `c`.

The result is the byte count that writing `c` to an output stream reports, and it
agrees with `ncodeunits(string(c))` without constructing the intermediate string.
"""
function ncodeunits(c::Char)
    # `write` returns the number of bytes it emitted, and `devnull` throws them
    # away, so this only counts. Despite the detour it is surprisingly efficient.
    return write(devnull, c)
end

"""
codepoint(c::AbstractChar) -> Integer
Expand Down
23 changes: 23 additions & 0 deletions test/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,26 @@ Base.codepoint(c::ASCIIChar) = reinterpret(UInt8, c)
@test_throws MethodError write(IOBuffer(), ASCIIChar('x'))
@test_throws MethodError read(IOBuffer('x'), ASCIIChar)
end

@testset "ncodeunits(::Char)" begin
    # Well-formed characters, probing the first and last code point of each
    # encoded length from one to four code units.
    for (ch, len) in (
        '\0' => 1,
        '\x1' => 1,
        '\x7f' => 1,
        '\u80' => 2,
        '\uff' => 2,
        '\u7ff' => 2,
        '\u800' => 3,
        '\uffff' => 3,
        '\U10000' => 4,
        '\U10ffff' => 4,
    )
        @test ncodeunits(ch) == len
    end

    # Malformed characters built directly from raw 32-bit patterns. In every
    # case below the expected count tracks the position of the last nonzero
    # byte of the pattern.
    for (bits, len) in (
        0x80_00_00_00 => 1,
        0x81_00_00_00 => 1,
        0x80_80_00_00 => 2,
        0x80_01_00_00 => 2,
        0x80_00_80_00 => 3,
        0x80_00_01_00 => 3,
        0x80_00_00_80 => 4,
        0x80_00_00_01 => 4,
    )
        @test ncodeunits(reinterpret(Char, bits)) == len
    end
end

0 comments on commit b931684

Please sign in to comment.