diff --git a/base/pcre.jl b/base/pcre.jl index 0809e68199fad3..5955c3880f6d59 100644 --- a/base/pcre.jl +++ b/base/pcre.jl @@ -49,7 +49,8 @@ const COMPILE_MASK = NO_START_OPTIMIZE | NO_UTF_CHECK | UNGREEDY | - UTF + UTF | + UCP const EXECUTE_MASK = NEWLINE_ANY | diff --git a/base/regex.jl b/base/regex.jl index 32d93275997c84..bea891d40407ba 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -4,7 +4,7 @@ include("pcre.jl") -const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX +const DEFAULT_COMPILER_OPTS = PCRE.UTF | PCRE.ALT_BSUX | PCRE.UCP const DEFAULT_MATCH_OPTS = zero(UInt32) mutable struct Regex @@ -40,11 +40,15 @@ end function Regex(pattern::AbstractString, flags::AbstractString) options = DEFAULT_COMPILER_OPTS for f in flags - options |= f=='i' ? PCRE.CASELESS : - f=='m' ? PCRE.MULTILINE : - f=='s' ? PCRE.DOTALL : - f=='x' ? PCRE.EXTENDED : - throw(ArgumentError("unknown regex flag: $f")) + if f == 'a' + options &= ~PCRE.UCP + else + options |= f=='i' ? PCRE.CASELESS : + f=='m' ? PCRE.MULTILINE : + f=='s' ? PCRE.DOTALL : + f=='x' ? PCRE.EXTENDED : + throw(ArgumentError("unknown regex flag: $f")) + end end Regex(pattern, options, DEFAULT_MATCH_OPTS) end @@ -72,8 +76,12 @@ after the ending quote, to change its behaviour: - `s` allows the `.` modifier to match newlines. - `x` enables "comment mode": whitespace is enabled except when escaped with `\\`, and `#` is treated as starting a comment. +- `a` disables `UCP` mode (enables ASCII mode). By default `\\B`, `\\b`, `\\D`, `\\d`, `\\S`, + `\\s`, `\\W`, `\\w`, etc. match based on Unicode character properties. With this option, + these sequences only match ASCII characters. + -For example, this regex has all three flags enabled: +For example, this regex has the first three flags enabled: ```jldoctest julia> match(r"a+.*b+.*?d\$"ism, "Goodbye,\\nOh, angry,\\nBad world\\n") @@ -83,15 +91,16 @@ RegexMatch("angry,\\nBad world") macro r_str(pattern, flags...) Regex(pattern, flags...) 
end function show(io::IO, re::Regex) - imsx = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED + imsxa = PCRE.CASELESS|PCRE.MULTILINE|PCRE.DOTALL|PCRE.EXTENDED|PCRE.UCP opts = re.compile_options - if (opts & ~imsx) == DEFAULT_COMPILER_OPTS + if (opts & ~imsxa) == (DEFAULT_COMPILER_OPTS & ~imsxa) print(io, 'r') print_quoted_literal(io, re.pattern) if (opts & PCRE.CASELESS ) != 0; print(io, 'i'); end if (opts & PCRE.MULTILINE) != 0; print(io, 'm'); end if (opts & PCRE.DOTALL ) != 0; print(io, 's'); end if (opts & PCRE.EXTENDED ) != 0; print(io, 'x'); end + if (opts & PCRE.UCP ) == 0; print(io, 'a'); end else print(io, "Regex(") show(io, re.pattern) diff --git a/test/regex.jl b/test/regex.jl index fe5ce3c7f58bda..8882fa6985bf35 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -73,3 +73,7 @@ end @test_throws ErrorException Regex("\Udfff") # code points 0xd800-0xdfff are not defined @test_throws ErrorException Regex("\xc0\x80") # overlong 2-byte sequence @test_throws ErrorException Regex("\xff") # illegal byte (0xfe or 0xff) + +# 'a' flag to disable UCP +@test match(r"\w+", "Düsseldorf").match == "Düsseldorf" +@test match(r"\w+"a, "Düsseldorf").match == "D"