From 38715b8c25c449bce1f1f8c32a530cf9d0b2451f Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Tue, 31 Jan 2017 11:18:56 -0500 Subject: [PATCH] deprecate string methods of character predicates, e.g. `isnumber("")` fixes #14156 --- base/deprecated.jl | 6 ++++ base/regex.jl | 2 +- base/strings/basic.jl | 10 +++--- base/strings/utf8proc.jl | 74 +++++++++++++++------------------------- test/unicode/utf8proc.jl | 56 +++++++++++++++--------------- 5 files changed, 68 insertions(+), 80 deletions(-) diff --git a/base/deprecated.jl b/base/deprecated.jl index 5e966bd834efc..cbc30488e5b27 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1849,4 +1849,10 @@ end) @deprecate FloatRange{T}(start::T, step, len, den) Base.floatrange(T, start, step, len, den) +for name in ("alnum", "alpha", "cntrl", "digit", "number", "graph", + "lower", "print", "punct", "space", "upper", "xdigit") + f = Symbol("is",name) + @eval @deprecate ($f)(s::AbstractString) all($f, s) +end + # End deprecations scheduled for 0.6 diff --git a/base/regex.jl b/base/regex.jl index add2154d352b6..16926728d0402 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -291,7 +291,7 @@ function _replace(io, repl_s::SubstitutionString, str, r, re) end # TODO: avoid this allocation groupname = SubString(repl, groupstart, prevind(repl, i)) - if isnumber(groupname) + if all(isnumber,groupname) _write_capture(io, re, parse(Int, groupname)) else group = PCRE.substring_number_from_name(re.regex, groupname) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index d5da3559f7f65..26cab3c511679 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -280,20 +280,20 @@ isascii(s::AbstractString) = all(isascii, s) promote_rule{S<:AbstractString,T<:AbstractString}(::Type{S}, ::Type{T}) = String """ - isxdigit(c::Union{Char,AbstractString}) -> Bool + isxdigit(c::Char) -> Bool -Tests whether a character is a valid hexadecimal digit, or whether this is true for all elements of a string. 
+Tests whether a character is a valid hexadecimal digit. Note that this does not +include `x` (as in the standard `0x` prefix). ```jldoctest -julia> isxdigit("abc") +julia> isxdigit('a') true -julia> isxdigit("0x9") +julia> isxdigit('x') false ``` """ isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' -isxdigit(s::AbstractString) = all(isxdigit, s) ## uppercase, lowercase, and titlecase transformations ## diff --git a/base/strings/utf8proc.jl b/base/strings/utf8proc.jl index 1ad2e37bf4cf2..27683d1d4defb 100644 --- a/base/strings/utf8proc.jl +++ b/base/strings/utf8proc.jl @@ -219,10 +219,10 @@ is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN ## libc character class predicates ## """ - islower(c::Union{Char,AbstractString}) -> Bool + islower(c::Char) -> Bool -Tests whether a character is a lowercase letter, or whether this is true for all elements of -a string. A character is classified as lowercase if it belongs to Unicode category Ll, +Tests whether a character is a lowercase letter. +A character is classified as lowercase if it belongs to Unicode category Ll, Letter: Lowercase. """ islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL) @@ -230,10 +230,10 @@ islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL) # true for Unicode upper and mixed case """ - isupper(c::Union{Char,AbstractString}) -> Bool + isupper(c::Char) -> Bool -Tests whether a character is an uppercase letter, or whether this is true for all elements -of a string. A character is classified as uppercase if it belongs to Unicode category Lu, +Tests whether a character is an uppercase letter. +A character is classified as uppercase if it belongs to Unicode category Lu, Letter: Uppercase, or Lt, Letter: Titlecase. 
""" function isupper(c::Char) @@ -242,36 +242,35 @@ function isupper(c::Char) end """ - isdigit(c::Union{Char,AbstractString}) -> Bool + isdigit(c::Char) -> Bool -Tests whether a character is a numeric digit (0-9), or whether this is true for all elements -of a string. +Tests whether a character is a numeric digit (0-9). """ isdigit(c::Char) = ('0' <= c <= '9') """ - isalpha(c::Union{Char,AbstractString}) -> Bool + isalpha(c::Char) -> Bool -Tests whether a character is alphabetic, or whether this is true for all elements of a -string. A character is classified as alphabetic if it belongs to the Unicode general +Tests whether a character is alphabetic. +A character is classified as alphabetic if it belongs to the Unicode general category Letter, i.e. a character whose category code begins with 'L'. """ isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO) """ - isnumber(c::Union{Char,AbstractString}) -> Bool + isnumber(c::Char) -> Bool -Tests whether a character is numeric, or whether this is true for all elements of a string. +Tests whether a character is numeric. A character is classified as numeric if it belongs to the Unicode general category Number, i.e. a character whose category code begins with 'N'. """ isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO) """ - isalnum(c::Union{Char,AbstractString}) -> Bool + isalnum(c::Char) -> Bool -Tests whether a character is alphanumeric, or whether this is true for all elements of a -string. A character is classified as alphabetic if it belongs to the Unicode general +Tests whether a character is alphanumeric. +A character is classified as alphanumeric if it belongs to the Unicode general category Letter or Number, i.e. a character whose category code begins with 'L' or 'N'. 
""" function isalnum(c::Char) @@ -283,67 +282,50 @@ end # following C++ only control characters from the Latin-1 subset return true """ - iscntrl(c::Union{Char,AbstractString}) -> Bool + iscntrl(c::Char) -> Bool -Tests whether a character is a control character, or whether this is true for all elements -of a string. Control characters are the non-printing characters of the Latin-1 subset of Unicode. +Tests whether a character is a control character. +Control characters are the non-printing characters of the Latin-1 subset of Unicode. """ iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f)) """ - ispunct(c::Union{Char,AbstractString}) -> Bool + ispunct(c::Char) -> Bool Tests whether a character belongs to the Unicode general category Punctuation, i.e. a -character whose category code begins with 'P'. For strings, tests whether this is true for -all elements of the string. +character whose category code begins with 'P'. """ ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO) # \u85 is the Unicode Next Line (NEL) character """ - isspace(c::Union{Char,AbstractString}) -> Bool + isspace(c::Char) -> Bool Tests whether a character is any whitespace character. Includes ASCII characters '\\t', '\\n', '\\v', '\\f', '\\r', and ' ', Latin-1 character U+0085, and characters in Unicode -category Zs. For strings, tests whether this is true for all elements of the string. +category Zs. """ @inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS """ - isprint(c::Union{Char,AbstractString}) -> Bool + isprint(c::Char) -> Bool -Tests whether a character is printable, including spaces, but not a control character. For -strings, tests whether this is true for all elements of the string. +Tests whether a character is printable, including spaces, but not a control character. 
""" isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS) # true in principal if a printer would use ink """ - isgraph(c::Union{Char,AbstractString}) -> Bool + isgraph(c::Char) -> Bool -Tests whether a character is printable, and not a space, or whether this is true for all -elements of a string. Any character that would cause a printer to use ink should be +Tests whether a character is printable, and not a space. +Any character that would cause a printer to use ink should be classified with `isgraph(c)==true`. """ isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO) -for name = ("alnum", "alpha", "cntrl", "digit", "number", "graph", - "lower", "print", "punct", "space", "upper") - f = Symbol("is",name) - @eval begin - function $f(s::AbstractString) - for c in s - if !$f(c) - return false - end - end - return true - end - end -end - ############################################################################ # iterators for grapheme segmentation diff --git a/test/unicode/utf8proc.jl b/test/unicode/utf8proc.jl index c88cc61632fd3..45f2929efca12 100644 --- a/test/unicode/utf8proc.jl +++ b/test/unicode/utf8proc.jl @@ -187,34 +187,34 @@ let end -@test isspace(" \t \n \r ")==true -@test isgraph(" \t \n \r ")==false -@test isprint(" \t \n \r ")==false -@test isalpha(" \t \n \r ")==false -@test isnumber(" \t \n \r ")==false -@test ispunct(" \t \n \r ")==false - -@test isspace("ΣβΣβ")==false -@test isalpha("ΣβΣβ")==true -@test isgraph("ΣβΣβ")==true -@test isprint("ΣβΣβ")==true -@test isupper("ΣβΣβ")==false -@test islower("ΣβΣβ")==false -@test isnumber("ΣβΣβ")==false -@test iscntrl("ΣβΣβ")==false -@test ispunct("ΣβΣβ")==false - -@test isnumber("23435")==true -@test isdigit("23435")==true -@test isalnum("23435")==true -@test isalpha("23435")==false -@test iscntrl( string(Char(0x0080))) == true -@test ispunct( "‡؟჻") ==true - -@test isxdigit('0') == true -@test isxdigit("0") == true -@test isxdigit("a") == true 
-@test isxdigit("g") == false +@test all(isspace," \t \n \r ") +@test !all(isgraph," \t \n \r ") +@test !all(isprint," \t \n \r ") +@test !all(isalpha," \t \n \r ") +@test !all(isnumber," \t \n \r ") +@test !all(ispunct," \t \n \r ") + +@test !all(isspace,"ΣβΣβ") +@test all(isalpha,"ΣβΣβ") +@test all(isgraph,"ΣβΣβ") +@test all(isprint,"ΣβΣβ") +@test !all(isupper,"ΣβΣβ") +@test !all(islower,"ΣβΣβ") +@test !all(isnumber,"ΣβΣβ") +@test !all(iscntrl,"ΣβΣβ") +@test !all(ispunct,"ΣβΣβ") + +@test all(isnumber,"23435") +@test all(isdigit,"23435") +@test all(isalnum,"23435") +@test !all(isalpha,"23435") +@test all(iscntrl,string(Char(0x0080))) +@test all(ispunct, "‡؟჻") + +@test isxdigit('0') +@test isxdigit('a') +@test !isxdigit('x') +@test !isxdigit('g') # check utf8proc handling of CN category constants let c_ll = 'β', c_cn = '\u038B'