Skip to content

Commit

Permalink
export utf8proc functionality in Julia (followup to JuliaLang#5462 and
Browse files: browse the repository at this point in the history
  • Loading branch information
stevengj committed Jan 27, 2014
1 parent c19a8e9 commit 5799e46
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 2 deletions.
2 changes: 0 additions & 2 deletions base/char.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# convert a value to a Char; floating-point input is rounded with iround first
function char(x)
    return convert(Char, x)
end
function char(x::FloatingPoint)
    return char(iround(x))
end

# false for the range '\ud800'..'\udfff' and for anything above '\U10ffff'
function is_valid_char(c)
    in_excluded_range = '\ud800' <= c <= '\udfff'
    above_maximum = '\U10ffff' < c
    return !(in_excluded_range || above_maximum)
end

# signed and unsigned integer values of a Char
function integer(x::Char)
    return int(x)
end
function unsigned(x::Char)
    return uint(x)
end

Expand Down
3 changes: 3 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,7 @@ export
bytes2hex,
bytestring,
char,
charcategory,
charwidth,
chomp,
chop,
Expand All @@ -766,6 +767,7 @@ export
hex2bytes,
ind2chr,
info,
is_assigned_char,
is_valid_ascii,
is_valid_char,
is_valid_utf8,
Expand Down Expand Up @@ -793,6 +795,7 @@ export
matchall,
ndigits,
nextind,
normalize_string,
oct,
parsefloat,
parseint,
Expand Down
2 changes: 2 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ include("utf8.jl")
include("utf16.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
importall .UTF8proc
include("regex.jl")
include("base64.jl")
importall .Base64
Expand Down
89 changes: 89 additions & 0 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Various Unicode functionality from the utf8proc library
module UTF8proc

# also exported by Base:
export normalize_string, charcategory, is_valid_char, is_assigned_char

# report whether `c` is a valid Unicode codepoint, as decided by
# utf8proc's utf8proc_codepoint_valid
function is_valid_char(c)
    verdict = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
    return bool(verdict)
end

# option bits passed to utf8proc_map, one bit per option (bits 0 through 13)
const UTF8PROC_NULLTERM  = 1 << 0
const UTF8PROC_STABLE    = 1 << 1
const UTF8PROC_COMPAT    = 1 << 2
const UTF8PROC_COMPOSE   = 1 << 3
const UTF8PROC_DECOMPOSE = 1 << 4
const UTF8PROC_IGNORE    = 1 << 5
const UTF8PROC_REJECTNA  = 1 << 6
const UTF8PROC_NLF2LS    = 1 << 7
const UTF8PROC_NLF2PS    = 1 << 8
# NLF2LF has no bit of its own: it is the NLF2LS and NLF2PS bits together
const UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS | UTF8PROC_NLF2PS
const UTF8PROC_STRIPCC   = 1 << 9
const UTF8PROC_CASEFOLD  = 1 << 10
const UTF8PROC_CHARBOUND = 1 << 11
const UTF8PROC_LUMP      = 1 << 12
const UTF8PROC_STRIPMARK = 1 << 13

# Run `s` through utf8proc's utf8proc_map with the option bits in `flags`
# (UTF8PROC_NULLTERM is always added) and return the result as a ByteString.
# A negative return code is raised as an error carrying utf8proc's message.
let
    # one-element array handed to utf8proc_map as its Ptr{Ptr{Uint8}} argument;
    # its single slot is read back below as the address of the output bytes
    const outslot = Array(Ptr{Uint8}, 1)
    global utf8proc_map
    function utf8proc_map(s::String, flags::Integer)
        nbytes = ccall(:utf8proc_map, Cssize_t,
                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
                       bytestring(s), 0, outslot, flags | UTF8PROC_NULLTERM)
        if nbytes < 0
            message = ccall(:utf8proc_errmsg, Ptr{Uint8}, (Cssize_t,), nbytes)
            error(bytestring(message))
        end
        # wrap the `nbytes` output bytes as a Vector{Uint8}, then turn it into a string
        bytes = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
                      (Any, Ptr{Uint8}, Csize_t, Cint),
                      Vector{Uint8}, outslot[1], nbytes, true)
        return ccall(:jl_array_to_string, Any, (Any,), bytes)::ByteString
    end
end

# Normalize `s` with utf8proc, selecting options by keyword. Each keyword that
# is `true` turns on the matching UTF8PROC_* option bit:
#   stable => STABLE, compat => COMPAT, compose => COMPOSE,
#   decompose => DECOMPOSE, ignorable => IGNORE, rejectna => REJECTNA,
#   newline2ls => NLF2LS, newline2ps => NLF2PS, stripcc => STRIPCC,
#   casefold => CASEFOLD, lump => LUMP, stripmark => STRIPMARK
# Errors reported by utf8proc (e.g. an invalid option combination) are raised
# by utf8proc_map.
function normalize_string(s::String; stable::Bool=false, compat::Bool=false,
                          compose::Bool=false, decompose::Bool=false,
                          ignorable::Bool=false, rejectna::Bool=false,
                          newline2ls::Bool=false, newline2ps::Bool=false,
                          stripcc::Bool=false, casefold::Bool=false,
                          lump::Bool=false, stripmark::Bool=false)
    flags = 0
    stable && (flags = flags | UTF8PROC_STABLE)
    compat && (flags = flags | UTF8PROC_COMPAT)
    compose && (flags = flags | UTF8PROC_COMPOSE)
    decompose && (flags = flags | UTF8PROC_DECOMPOSE)
    ignorable && (flags = flags | UTF8PROC_IGNORE)
    rejectna && (flags = flags | UTF8PROC_REJECTNA)
    newline2ls && (flags = flags | UTF8PROC_NLF2LS)
    newline2ps && (flags = flags | UTF8PROC_NLF2PS)
    stripcc && (flags = flags | UTF8PROC_STRIPCC)
    casefold && (flags = flags | UTF8PROC_CASEFOLD)
    lump && (flags = flags | UTF8PROC_LUMP)
    if stripmark
        # only strip combining marks when the caller asked for it
        flags = flags | UTF8PROC_STRIPMARK
        # utf8proc documents STRIPMARK as only possible together with COMPOSE
        # or DECOMPOSE, so fall back to COMPOSE when neither was requested
        (compose || decompose) || (flags = flags | UTF8PROC_COMPOSE)
    end
    utf8proc_map(s, flags)
end

# Normalize `s` to one of the four Unicode normalization forms, chosen by the
# symbol `nf` (:NFC, :NFD, :NFKC or :NFKD); any other symbol raises an
# ArgumentError before utf8proc is called.
function normalize_string(s::String, nf::Symbol)
    if nf == :NFC
        options = UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf == :NFD
        options = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf == :NFKC
        options = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf == :NFKD
        options = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, options)
end

# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category;
# any c outside 0..0x10FFFF yields 0 without calling into utf8proc
function category_code(c)
    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
    # utf8proc_get_property must only be given codepoints in 0..0x10FFFF
    # (see utf8proc_get_property docs), so reject both negative and too-large c
    (c < 0 || c > 0x10FFFF) && return 0x0000
    # the code is loaded as a Uint16 from the start of the returned property record
    return unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end

# a codepoint is treated as assigned when its category code is nonzero
function is_assigned_char(c)
    code = category_code(c)
    return code != 0
end

# symbolic Unicode category code for a codepoint
let
    # lookup table indexed by category_code(c)+1 (31 entries for codes 0..30);
    # both the first and the last entry are :Cn
    const codes = [:Cn,:Lu,:Ll,:Lt,:Lm,:Lo,:Mn,:Mc,:Me,:Nd,:Nl,:No,
                   :Pc,:Pd,:Ps,:Pe,:Pi,:Pf,:Po,:Sm,:Sc,:Sk,:So,:Zs,:Zl,:Zp,
                   :Cc,:Cf,:Cs,:Co,:Cn]
    # the function must be declared global under the name it is defined with;
    # otherwise it stays local to this `let` and the module's export of
    # `charcategory` refers to nothing
    global charcategory
    charcategory(c) = codes[category_code(c)+1]
end

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string

end # module

0 comments on commit 5799e46

Please sign in to comment.