export utf8proc functionality in Julia (followup to JuliaLang#5462 and JuliaLang#5434)
stevengj · Jan 27, 2014 · 5799e46 · 5799e46
1 parent c19a8e9
commit 5799e46
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 2 deletions.
diff --git a/base/char.jl b/base/char.jl
@@ -1,8 +1,6 @@
 char(x) = convert(Char, x)
 char(x::FloatingPoint) = char(iround(x))
 
-is_valid_char(c) = !('\ud800' <= c <= '\udfff' || '\U10ffff' < c)
-
 integer(x::Char) = int(x)
 unsigned(x::Char) = uint(x)
 

diff --git a/base/exports.jl b/base/exports.jl
@@ -750,6 +750,7 @@ export
     bytes2hex,
     bytestring,
     char,
+    charcategory,
     charwidth,
     chomp,
     chop,
@@ -766,6 +767,7 @@ export
     hex2bytes,
     ind2chr,
     info,
+    is_assigned_char,
     is_valid_ascii,
     is_valid_char,
     is_valid_utf8,
@@ -793,6 +795,7 @@ export
     matchall,
     ndigits,
     nextind,
+    normalize_string,
     oct,
     parsefloat,
     parseint,

diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -75,6 +75,8 @@ include("utf8.jl")
 include("utf16.jl")
 include("iobuffer.jl")
 include("string.jl")
+include("utf8proc.jl")
+importall .UTF8proc
 include("regex.jl")
 include("base64.jl")
 importall .Base64

diff --git a/base/utf8proc.jl b/base/utf8proc.jl
@@ -0,0 +1,89 @@
+# Various Unicode functionality from the utf8proc library
+module UTF8proc
+
+# also exported by Base:
+export normalize_string, charcategory, is_valid_char, is_assigned_char
+
+# Report whether `c` is a valid Unicode codepoint, as judged by the
+# utf8proc library routine utf8proc_codepoint_valid.
+function is_valid_char(c)
+    valid = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
+    return bool(valid)
+end
+
+# Option bits that are ORed together to form the `flags` argument handed to
+# the utf8proc_map C function.  Each constant is a distinct power of two,
+# except UTF8PROC_NLF2LF, which is the union of the NLF2LS and NLF2PS bits.
+# NOTE(review): values presumably mirror the utf8proc C header -- confirm.
+const UTF8PROC_NULLTERM  = 1 << 0
+const UTF8PROC_STABLE    = 1 << 1
+const UTF8PROC_COMPAT    = 1 << 2
+const UTF8PROC_COMPOSE   = 1 << 3
+const UTF8PROC_DECOMPOSE = 1 << 4
+const UTF8PROC_IGNORE    = 1 << 5
+const UTF8PROC_REJECTNA  = 1 << 6
+const UTF8PROC_NLF2LS    = 1 << 7
+const UTF8PROC_NLF2PS    = 1 << 8
+const UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS | UTF8PROC_NLF2PS
+const UTF8PROC_STRIPCC   = 1 << 9
+const UTF8PROC_CASEFOLD  = 1 << 10
+const UTF8PROC_CHARBOUND = 1 << 11
+const UTF8PROC_LUMP      = 1 << 12
+const UTF8PROC_STRIPMARK = 1 << 13
+
+# utf8proc_map(s, flags): pass the string `s` through the C function
+# utf8proc_map with the option bits in `flags` and return the result as a
+# ByteString.  Raises an error carrying utf8proc's own message when the
+# C call reports failure.
+let
+    # one-element pointer array used as the output-pointer argument of the
+    # ccall; allocated once and shared by every call of utf8proc_map
+    const p = Array(Ptr{Uint8}, 1)
+    global utf8proc_map
+    function utf8proc_map(s::String, flags::Integer)
+        # the length argument is 0 and UTF8PROC_NULLTERM is always added to
+        # the flags (presumably so the library reads up to the NUL
+        # terminator -- confirm against the utf8proc docs)
+        result = ccall(:utf8proc_map, Cssize_t,
+                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
+                       bytestring(s), 0, p, flags | UTF8PROC_NULLTERM)
+        # a negative result is an error code; look up its message and raise
+        result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
+                                             (Cssize_t,), result)))
+        # wrap the buffer stored in p[1] as a byte vector of length `result`
+        # NOTE(review): the trailing `true` presumably hands ownership of the
+        # buffer to Julia -- confirm
+        a = ccall(:jl_ptr_to_array_1d, Vector{Uint8}, 
+                  (Any, Ptr{Uint8}, Csize_t, Cint),
+                  Vector{Uint8}, p[1], result, true)
+        # convert the byte vector into a string; asserted to be a ByteString
+        ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
+    end
+end
+
+# Normalize the string `s` with utf8proc_map, selecting the transformation
+# through boolean keywords.  Each keyword that is true ORs one option bit
+# into the flags:
+#   stable -> STABLE, compat -> COMPAT, compose -> COMPOSE,
+#   decompose -> DECOMPOSE, ignorable -> IGNORE, rejectna -> REJECTNA,
+#   newline2ls -> NLF2LS, newline2ps -> NLF2PS, stripcc -> STRIPCC,
+#   casefold -> CASEFOLD, lump -> LUMP, stripmark -> STRIPMARK
+# COMPOSE is additionally set whenever `decompose` is false, so strings are
+# composed by default.  Returns the result of utf8proc_map and propagates
+# any error it raises.
+function normalize_string(s::String; stable::Bool=false, compat::Bool=false,
+                          compose::Bool=false, decompose::Bool=false,
+                          ignorable::Bool=false, rejectna::Bool=false,
+                          newline2ls::Bool=false, newline2ps::Bool=false,
+                          stripcc::Bool=false, casefold::Bool=false,
+                          lump::Bool=false, stripmark::Bool=false)
+    flags = 0
+    stable && (flags |= UTF8PROC_STABLE)
+    compat && (flags |= UTF8PROC_COMPAT)
+    compose && (flags |= UTF8PROC_COMPOSE)
+    decompose && (flags |= UTF8PROC_DECOMPOSE)
+    ignorable && (flags |= UTF8PROC_IGNORE)
+    rejectna && (flags |= UTF8PROC_REJECTNA)
+    newline2ls && (flags |= UTF8PROC_NLF2LS)
+    newline2ps && (flags |= UTF8PROC_NLF2PS)
+    stripcc && (flags |= UTF8PROC_STRIPCC)
+    casefold && (flags |= UTF8PROC_CASEFOLD)
+    lump && (flags |= UTF8PROC_LUMP)
+    # strip marks only when the caller asks for it; setting this bit
+    # unconditionally would make the `stripmark` keyword meaningless
+    stripmark && (flags |= UTF8PROC_STRIPMARK)
+    # compose unless decomposition was requested
+    decompose || (flags |= UTF8PROC_COMPOSE)
+    utf8proc_map(s, flags)
+end
+
+# Normalize `s` using the flag combination selected by `nf`, which must be
+# one of :NFC, :NFD, :NFKC or :NFKD; any other symbol raises ArgumentError.
+# All four forms set STABLE; the "K" forms add COMPAT; the "C" forms set
+# COMPOSE and the "D" forms set DECOMPOSE.
+function normalize_string(s::String, nf::Symbol)
+    if nf == :NFC
+        flags = UTF8PROC_STABLE | UTF8PROC_COMPOSE
+    elseif nf == :NFD
+        flags = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
+    elseif nf == :NFKC
+        flags = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
+    elseif nf == :NFKD
+        flags = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
+    else
+        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
+    end
+    utf8proc_map(s, flags)
+end
+
+# Unicode category code (UTF8PROC_CATEGORY value in 0..30) of codepoint `c`
+function category_code(c)
+    # codepoints above 0x10FFFF are not passed to utf8proc; they get code 0
+    # directly (see utf8proc_get_property docs)
+    if c > 0x10FFFF
+        return 0x0000
+    end
+    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
+    props = ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c)
+    # the code is the first Uint16 at the returned pointer
+    return unsafe_load(props)
+end
+
+# true when the category code of `c` is nonzero
+function is_assigned_char(c)
+    return category_code(c) != 0
+end
+
+# symbolic Unicode category code for a codepoint: returns the two-letter
+# category symbol (e.g. :Lu, :Nd) matching category_code(c)
+let
+    # lookup table indexed by category_code(c)+1, covering codes 0..30;
+    # code 0 and code 30 both map to :Cn
+    const codes = [:Cn,:Lu,:Ll,:Lt,:Lm,:Lo,:Mn,:Mc,:Me,:Nd,:Nl,:No,
+                   :Pc,:Pd,:Ps,:Pe,:Pi,:Pf,:Po,:Sm,:Sc,:Sk,:So,:Zs,:Zl,:Zp,
+                   :Cc,:Cf,:Cs,:Co,:Cn]
+    # declare the function global: without this the definition below would
+    # be local to the `let` block and the exported name would stay unbound
+    global charcategory
+    charcategory(c) = codes[category_code(c)+1]
+end
+
+# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string
+
+end # module