From e62352b9616761b155c78dc61e5d21f0fd0c5218 Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Tue, 29 Nov 2016 10:57:47 -0500 Subject: [PATCH 1/5] faster, smaller `String` allocation --- NEWS.md | 8 + base/LineEdit.jl | 2 +- base/boot.jl | 37 ++-- base/c.jl | 12 +- base/datafmt.jl | 20 +-- base/deepcopy.jl | 9 + base/deprecated.jl | 2 +- base/error.jl | 2 +- base/exports.jl | 1 + base/expr.jl | 5 +- base/interactiveutil.jl | 2 + base/iobuffer.jl | 17 +- base/loading.jl | 4 +- base/pointer.jl | 4 +- base/printf.jl | 4 +- base/regex.jl | 4 +- base/replutil.jl | 10 +- base/serialize.jl | 13 ++ base/strings/basic.jl | 89 +--------- base/strings/io.jl | 8 +- base/strings/search.jl | 127 ++++---------- base/strings/string.jl | 351 +++++++++++++++++++++++++++++--------- base/strings/strings.jl | 1 - base/strings/types.jl | 18 +- base/strings/util.jl | 26 +-- base/sysimg.jl | 21 ++- base/util.jl | 6 +- doc/src/stdlib/strings.md | 1 + src/alloc.c | 1 + src/array.c | 23 +-- src/builtins.c | 15 +- src/codegen.cpp | 2 +- src/dump.c | 25 ++- src/gc.c | 4 + src/init.c | 1 - src/jltypes.c | 5 + src/julia.h | 8 +- src/toplevel.c | 8 +- test/reflection.jl | 1 - test/replcompletions.jl | 4 +- test/strings/basic.jl | 5 +- 41 files changed, 497 insertions(+), 409 deletions(-) diff --git a/NEWS.md b/NEWS.md index 58db62115117a..9c7dc0a12bf9c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,6 +33,13 @@ Breaking changes This section lists changes that do not have deprecation warnings. + * `String`s no longer have a `.data` field (as part of a significant performance + improvement). Use `Vector{UInt8}(str)` to access a string as a byte array. + However, allocating the `Vector` object has overhead. You can also use + `codeunit(str, i)` to access the `i`th byte of a `String`. + Use `sizeof(str)` instead of `length(str.data)`, and `pointer(str)` instead of + `pointer(str.data)`. ([#19449]) + * Operations between `Float16` and `Integers` now return `Float16` instead of `Float32`. 
([#17261]) * Keyword arguments are processed left-to-right: if the same keyword is specified more than @@ -815,6 +822,7 @@ Language tooling improvements [#19233]: https://github.com/JuliaLang/julia/issues/19233 [#19288]: https://github.com/JuliaLang/julia/issues/19288 [#19305]: https://github.com/JuliaLang/julia/issues/19305 +[#19449]: https://github.com/JuliaLang/julia/issues/19449 [#19469]: https://github.com/JuliaLang/julia/issues/19469 [#19543]: https://github.com/JuliaLang/julia/issues/19543 [#19598]: https://github.com/JuliaLang/julia/issues/19598 diff --git a/base/LineEdit.jl b/base/LineEdit.jl index 27492222b8212..4e6c4dfade98f 100644 --- a/base/LineEdit.jl +++ b/base/LineEdit.jl @@ -437,7 +437,7 @@ function splice_buffer!{T<:Integer}(buf::IOBuffer, r::UnitRange{T}, ins::Abstrac elseif pos > last(r) seek(buf, pos - length(r)) end - splice!(buf.data, r + 1, ins.data) # position(), etc, are 0-indexed + splice!(buf.data, r + 1, Vector{UInt8}(ins)) # position(), etc, are 0-indexed buf.size = buf.size + sizeof(ins) - length(r) seek(buf, position(buf) + sizeof(ins)) end diff --git a/base/boot.jl b/base/boot.jl index c9e1b1a800c00..92d61c8a69494 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -182,8 +182,6 @@ else typealias UInt UInt32 end -abstract AbstractString - function Typeof end (f::typeof(Typeof))(x::ANY) = isa(x,Type) ? Type{x} : typeof(x) @@ -192,12 +190,19 @@ type ErrorException <: Exception msg::AbstractString ErrorException(msg::AbstractString) = new(msg) end + +Expr(args::ANY...) = _expr(args...) 
+ +macro _noinline_meta() + Expr(:meta, :noinline) +end + immutable BoundsError <: Exception a::Any i::Any BoundsError() = new() - BoundsError(a::ANY) = new(a) - BoundsError(a::ANY, i::ANY) = new(a,i) + BoundsError(a::ANY) = (@_noinline_meta; new(a)) + BoundsError(a::ANY, i) = (@_noinline_meta; new(a,i)) end immutable DivideError <: Exception end immutable DomainError <: Exception end @@ -221,11 +226,7 @@ end abstract DirectIndexString <: AbstractString -immutable String <: AbstractString - data::Array{UInt8,1} - # required to make String("foo") work (#15120): - String(d::Array{UInt8,1}) = new(d) -end +String(s::String) = s # no constructor yet # This should always be inlined getptls() = ccall(:jl_get_ptls_states, Ptr{Void}, ()) @@ -278,8 +279,6 @@ immutable VecElement{T} end VecElement{T}(arg::T) = VecElement{T}(arg) -Expr(args::ANY...) = _expr(args...) - # used by lowering of splicing unquote splicedexpr(hd::Symbol, args::Array{Any,1}) = (e=Expr(hd); e.args=args; e) @@ -338,16 +337,18 @@ Array{T}(::Type{T}, m::Int) = Array{T,1}(m) Array{T}(::Type{T}, m::Int,n::Int) = Array{T,2}(m,n) Array{T}(::Type{T}, m::Int,n::Int,o::Int) = Array{T,3}(m,n,o) - # primitive Symbol constructors -Symbol(s::String) = Symbol(s.data) +function Symbol(s::String) + return ccall(:jl_symbol_n, Ref{Symbol}, (Ptr{UInt8}, Int), + ccall(:jl_string_ptr, Ptr{UInt8}, (Any,), s), + sizeof(s)) +end function Symbol(a::Array{UInt8,1}) return ccall(:jl_symbol_n, Ref{Symbol}, (Ptr{UInt8}, Int), - ccall(:jl_array_ptr, Ptr{UInt8}, (Any,), a), - Intrinsics.arraylen(a)) + ccall(:jl_array_ptr, Ptr{UInt8}, (Any,), a), + Intrinsics.arraylen(a)) end - # docsystem basics macro doc(x...) atdoc(x...) 
@@ -378,8 +379,8 @@ unsafe_write(io::IO, x::Ptr{UInt8}, nb::Int) = write(io::IO, x::UInt8) = (ccall(:jl_uv_putb, Void, (Ptr{Void}, UInt8), io_pointer(io), x); 1) function write(io::IO, x::String) - nb = sizeof(x.data) - unsafe_write(io, ccall(:jl_array_ptr, Ptr{UInt8}, (Any,), x.data), nb) + nb = sizeof(x) + unsafe_write(io, ccall(:jl_string_ptr, Ptr{UInt8}, (Any,), x), nb) return nb end diff --git a/base/c.jl b/base/c.jl index 031e004663d7d..eddb21bf8c6b3 100644 --- a/base/c.jl +++ b/base/c.jl @@ -80,14 +80,12 @@ unsafe_wrap(::Type{String}, p::Cstring, len::Integer, own::Bool=false) = unsafe_string(s::Cstring) = unsafe_string(convert(Ptr{UInt8}, s)) # convert strings to String etc. to pass as pointers -cconvert(::Type{Cstring}, s::String) = - ccall(:jl_array_cconvert_cstring, Ref{Vector{UInt8}}, - (Vector{UInt8},), s.data) +cconvert(::Type{Cstring}, s::String) = s cconvert(::Type{Cstring}, s::AbstractString) = cconvert(Cstring, String(s)::String) function cconvert(::Type{Cwstring}, s::AbstractString) - v = transcode(Cwchar_t, String(s).data) + v = transcode(Cwchar_t, Vector{UInt8}(String(s))) !isempty(v) && v[end] == 0 || push!(v, 0) return v end @@ -100,7 +98,7 @@ containsnul(p::Ptr, len) = containsnul(s::String) = containsnul(unsafe_convert(Ptr{Cchar}, s), sizeof(s)) containsnul(s::AbstractString) = '\0' in s -function unsafe_convert(::Type{Cstring}, s::Vector{UInt8}) +function unsafe_convert(::Type{Cstring}, s::Union{String,Vector{UInt8}}) p = unsafe_convert(Ptr{Cchar}, s) containsnul(p, sizeof(s)) && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))")) @@ -133,7 +131,7 @@ same argument. This is only available on Windows. 
""" function cwstring(s::AbstractString) - bytes = String(s).data + bytes = Vector{UInt8}(String(s)) 0 in bytes && throw(ArgumentError("embedded NULs are not allowed in C strings: $(repr(s))")) return push!(transcode(UInt16, bytes), 0) end @@ -170,7 +168,7 @@ function transcode{S<:Union{Int32,UInt32}}(::Type{UInt8}, src::Vector{S}) take!(buf) end transcode(::Type{String}, src::String) = src -transcode(T, src::String) = transcode(T, src.data) +transcode(T, src::String) = transcode(T, Vector{UInt8}(src)) transcode(::Type{String}, src) = String(transcode(UInt8, src)) function transcode(::Type{UInt16}, src::Vector{UInt8}) diff --git a/base/datafmt.jl b/base/datafmt.jl index 47503f95a1a3c..79d7a6f770877 100644 --- a/base/datafmt.jl +++ b/base/datafmt.jl @@ -129,7 +129,7 @@ function readdlm_auto(input::AbstractString, dlm::Char, T::Type, eol::Char, auto # TODO: It would be nicer to use String(a) without making a copy, # but because the mmap'ed array is not NUL-terminated this causes # jl_try_substrtod to segfault below. 
- return readdlm_string(String(copy(a)), dlm, T, eol, auto, optsd) + return readdlm_string(unsafe_string(pointer(a),length(a)), dlm, T, eol, auto, optsd) else return readdlm_string(readstring(input), dlm, T, eol, auto, optsd) end @@ -153,7 +153,7 @@ type DLMOffsets <: DLMHandler offsets = Array{Array{Int,1}}(1) offsets[1] = Array{Int}(offs_chunk_size) thresh = ceil(min(typemax(UInt), Base.Sys.total_memory()) / sizeof(Int) / 5) - new(offsets, 1, thresh, length(sbuff.data)) + new(offsets, 1, thresh, sizeof(sbuff)) end end @@ -220,7 +220,7 @@ end _chrinstr(sbuff::String, chr::UInt8, startpos::Int, endpos::Int) = (endpos >= startpos) && (C_NULL != ccall(:memchr, Ptr{UInt8}, - (Ptr{UInt8}, Int32, Csize_t), pointer(sbuff.data)+startpos-1, chr, endpos-startpos+1)) + (Ptr{UInt8}, Int32, Csize_t), pointer(sbuff)+startpos-1, chr, endpos-startpos+1)) function store_cell{T}(dlmstore::DLMStore{T}, row::Int, col::Int, quoted::Bool, startpos::Int, endpos::Int) @@ -463,17 +463,9 @@ function colval{T<:Char}(sbuff::String, startpos::Int, endpos::Int, cells::Array end colval(sbuff::String, startpos::Int, endpos::Int, cells::Array, row::Int, col::Int) = true -function dlm_parse{T,D}(dbuff::T, eol::D, dlm::D, qchar::D, cchar::D, - ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, - skipstart::Int, skipblanks::Bool, dh::DLMHandler) - all_ascii = (D <: UInt8) || (isascii(eol) && - isascii(dlm) && - (!allow_quote || isascii(qchar)) && - (!allow_comments || isascii(cchar))) - if T === String && all_ascii - return dlm_parse(dbuff.data, eol % UInt8, dlm % UInt8, qchar % UInt8, cchar % UInt8, - ign_adj_dlm, allow_quote, allow_comments, skipstart, skipblanks, dh) - end +function dlm_parse{D}(dbuff::String, eol::D, dlm::D, qchar::D, cchar::D, + ign_adj_dlm::Bool, allow_quote::Bool, allow_comments::Bool, + skipstart::Int, skipblanks::Bool, dh::DLMHandler) ncols = nrows = col = 0 is_default_dlm = (dlm == invalid_dlm(D)) error_str = "" diff --git a/base/deepcopy.jl b/base/deepcopy.jl 
index 36061a28bb2ec..b5099f8ef41fc 100644 --- a/base/deepcopy.jl +++ b/base/deepcopy.jl @@ -22,6 +22,15 @@ function deepcopy_internal(x::SimpleVector, stackdict::ObjectIdDict) return y end +function deepcopy_internal(x::String, stackdict::ObjectIdDict) + if haskey(stackdict, x) + return stackdict[x] + end + y = unsafe_string(pointer(x), sizeof(x)) + stackdict[x] = y + return y +end + function deepcopy_internal(x::ANY, stackdict::ObjectIdDict) T = typeof(x)::DataType nf = nfields(T) diff --git a/base/deprecated.jl b/base/deprecated.jl index a0634bbe80412..6fcedec8453e1 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -430,7 +430,7 @@ end endn += 1 end (endn > idx) && (endn -= 1) - splice!(a, idx:endn, invalids_as.data) + splice!(a, idx:endn, Vector{UInt8}(invalids_as)) l = length(a) end String(a) diff --git a/base/error.jl b/base/error.jl index 16839cd61ff8d..fdf945657ebe8 100644 --- a/base/error.jl +++ b/base/error.jl @@ -73,7 +73,7 @@ macro assert(ex, msgs...) elseif !isempty(msgs) && (isa(msg, Expr) || isa(msg, Symbol)) # message is an expression needing evaluating msg = :(Main.Base.string($(esc(msg)))) - elseif isdefined(Main, :Base) && isdefined(Main.Base, :string) + elseif isdefined(Main, :Base) && isdefined(Main.Base, :string) && applicable(Main.Base.string, msg) msg = Main.Base.string(msg) else # string() might not be defined during bootstrap diff --git a/base/exports.jl b/base/exports.jl index c9be9f24ad702..6243ad46f9f4b 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -778,6 +778,7 @@ export chomp, chop, chr2ind, + codeunit, dec, digits, digits!, diff --git a/base/expr.jl b/base/expr.jl index 251a8217b83d5..5d255314617c9 100644 --- a/base/expr.jl +++ b/base/expr.jl @@ -9,9 +9,8 @@ Generates a symbol which will not conflict with other variable names. 
""" gensym() = ccall(:jl_gensym, Ref{Symbol}, ()) -gensym(s::String) = gensym(s.data) -gensym(a::Array{UInt8,1}) = - ccall(:jl_tagged_gensym, Ref{Symbol}, (Ptr{UInt8}, Int32), a, length(a)) +gensym(s::String) = ccall(:jl_tagged_gensym, Ref{Symbol}, (Ptr{UInt8}, Int32), s, sizeof(s)) + gensym(ss::String...) = map(gensym, ss) gensym(s::Symbol) = ccall(:jl_tagged_gensym, Ref{Symbol}, (Ptr{UInt8}, Int32), s, ccall(:strlen, Csize_t, (Ptr{UInt8},), s)) diff --git a/base/interactiveutil.jl b/base/interactiveutil.jl index 89cb2430ea94c..ddb77a589adf9 100644 --- a/base/interactiveutil.jl +++ b/base/interactiveutil.jl @@ -773,6 +773,8 @@ function summarysize(obj::Array, seen, excl) return size end +summarysize(s::String, seen, excl) = sizeof(Int) + sizeof(s) + function summarysize(obj::SimpleVector, seen, excl) key = pointer_from_objref(obj) haskey(seen, key) ? (return 0) : (seen[key] = true) diff --git a/base/iobuffer.jl b/base/iobuffer.jl index a99963ab6bacd..9ba49c07706ab 100644 --- a/base/iobuffer.jl +++ b/base/iobuffer.jl @@ -250,7 +250,7 @@ isopen(io::AbstractIOBuffer) = io.readable || io.writable || io.seekable || nb_a function String(io::AbstractIOBuffer) io.readable || throw(ArgumentError("IOBuffer is not readable")) io.seekable || throw(ArgumentError("IOBuffer is not seekable")) - return String(copy!(Array{UInt8}(io.size), 1, io.data, 1, io.size)) + return unsafe_string(pointer(io.data), io.size) end """ @@ -326,22 +326,11 @@ function unsafe_write(to::AbstractIOBuffer, p::Ptr{UInt8}, nb::UInt) return written end -function write_sub{T}(to::AbstractIOBuffer, a::AbstractArray{T}, offs, nel) +function write_sub(to::AbstractIOBuffer, a::AbstractArray{UInt8}, offs, nel) if offs+nel-1 > length(a) || offs < 1 || nel < 0 throw(BoundsError()) end - local written::Int - if isbits(T) && isa(a,Array) - nb = UInt(nel * sizeof(T)) - written = unsafe_write(to, pointer(a, offs), nb) - else - written = 0 - ensureroom(to, UInt(sizeof(a))) - for i = offs:offs+nel-1 - written += 
write(to, a[i]) - end - end - return written + unsafe_write(to, pointer(a, offs), UInt(nel)) end @inline function write(to::AbstractIOBuffer, a::UInt8) diff --git a/base/loading.jl b/base/loading.jl index 6bf95515c511a..9fdc20dd5f7ea 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -64,12 +64,12 @@ elseif is_apple() break end # Hack to compensate for inability to create a string from a subarray with no allocations. - path_basename.data == casepreserved_basename && return true + Vector{UInt8}(path_basename) == casepreserved_basename && return true # If there is no match, it's possible that the file does exist but HFS+ # performed unicode normalization. See https://developer.apple.com/library/mac/qa/qa1235/_index.html. isascii(path_basename) && return false - normalize_string(path_basename, :NFD).data == casepreserved_basename + Vector{UInt8}(normalize_string(path_basename, :NFD)) == casepreserved_basename end else # Generic fallback that performs a slow directory listing. diff --git a/base/pointer.jl b/base/pointer.jl index 184783033fe6b..8dcbed8a03c46 100644 --- a/base/pointer.jl +++ b/base/pointer.jl @@ -32,8 +32,8 @@ convert{T}(::Type{Ptr{T}}, p::Ptr) = box(Ptr{T}, unbox(Ptr{Void},p)) # object to pointer (when used with ccall) unsafe_convert(::Type{Ptr{UInt8}}, x::Symbol) = ccall(:jl_symbol_name, Ptr{UInt8}, (Any,), x) unsafe_convert(::Type{Ptr{Int8}}, x::Symbol) = ccall(:jl_symbol_name, Ptr{Int8}, (Any,), x) -unsafe_convert(::Type{Ptr{UInt8}}, s::String) = unsafe_convert(Ptr{UInt8}, s.data) -unsafe_convert(::Type{Ptr{Int8}}, s::String) = convert(Ptr{Int8}, unsafe_convert(Ptr{UInt8}, s.data)) +unsafe_convert(::Type{Ptr{UInt8}}, s::String) = convert(Ptr{UInt8}, pointer_from_objref(s)+sizeof(Int)) +unsafe_convert(::Type{Ptr{Int8}}, s::String) = convert(Ptr{Int8}, pointer_from_objref(s)+sizeof(Int)) # convert strings to String etc. 
to pass as pointers cconvert(::Type{Ptr{UInt8}}, s::AbstractString) = String(s) cconvert(::Type{Ptr{Int8}}, s::AbstractString) = String(s) diff --git a/base/printf.jl b/base/printf.jl index 4a4556afeff2c..754e91276b176 100644 --- a/base/printf.jl +++ b/base/printf.jl @@ -870,8 +870,8 @@ function decode_hex(d::Integer, symbols::Array{UInt8,1}) return Int32(pt), Int32(pt), neg end -const hex_symbols = "0123456789abcdef".data -const HEX_symbols = "0123456789ABCDEF".data +const hex_symbols = b"0123456789abcdef" +const HEX_symbols = b"0123456789ABCDEF" decode_hex(x::Integer) = decode_hex(x,hex_symbols) decode_HEX(x::Integer) = decode_hex(x,HEX_symbols) diff --git a/base/regex.jl b/base/regex.jl index 42b606bbf359f..add2154d352b6 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -177,7 +177,7 @@ match(r::Regex, s::AbstractString, i::Integer) = throw(ArgumentError( function matchall(re::Regex, str::String, overlap::Bool=false) regex = compile(re).regex - n = length(str.data) + n = sizeof(str) matches = SubString{String}[] offset = UInt32(0) opts = re.match_options @@ -344,7 +344,7 @@ function next(itr::RegexMatchIterator, prev_match) prevempty ? 
opts_nonempty : UInt32(0)) if mat === nothing - if prevempty && offset <= length(itr.string.data) + if prevempty && offset <= sizeof(itr.string) offset = nextind(itr.string, offset) prevempty = false continue diff --git a/base/replutil.jl b/base/replutil.jl index 8a7ea086c6a98..4759425fd4d65 100644 --- a/base/replutil.jl +++ b/base/replutil.jl @@ -147,7 +147,7 @@ function show(io::IO, ::MIME"text/plain", s::String) show(io, s) else println(io, sizeof(s), "-byte String of invalid UTF-8 data:") - showarray(io, s.data, false; header=false) + showarray(io, Vector{UInt8}(s), false; header=false) end end @@ -256,7 +256,13 @@ showerror(io::IO, ::DivideError) = print(io, "DivideError: integer division erro showerror(io::IO, ::StackOverflowError) = print(io, "StackOverflowError:") showerror(io::IO, ::UndefRefError) = print(io, "UndefRefError: access to undefined reference") showerror(io::IO, ::EOFError) = print(io, "EOFError: read end of file") -showerror(io::IO, ex::ErrorException) = print(io, ex.msg) +function showerror(io::IO, ex::ErrorException) + print(io, ex.msg) + if ex.msg == "type String has no field data" + println(io) + print(io, "Use `Vector{UInt8}(str)` instead.") + end +end showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found") showerror(io::IO, ex::InterruptException) = print(io, "InterruptException:") showerror(io::IO, ex::ArgumentError) = print(io, "ArgumentError: $(ex.msg)") diff --git a/base/serialize.jl b/base/serialize.jl index fb662009658fd..a4d1e77c41f67 100644 --- a/base/serialize.jl +++ b/base/serialize.jl @@ -245,6 +245,12 @@ trimmedindex(P, d, i::Real) = oftype(i, 1) trimmedindex(P, d, i::Colon) = i trimmedindex(P, d, i::AbstractArray) = oftype(i, reshape(linearindices(i), indices(i))) +function serialize(s::AbstractSerializer, ss::String) + serialize_type(s, String) + write(s.io, sizeof(ss)) + write(s.io, ss) +end + function serialize{T<:AbstractString}(s::AbstractSerializer, ss::SubString{T}) # avoid saving a copy 
of the parent string, keeping the type of ss serialize_any(s, convert(SubString{T}, convert(T,ss))) @@ -859,6 +865,13 @@ function deserialize(s::AbstractSerializer, ::Type{Task}) t end +function deserialize(s::AbstractSerializer, ::Type{String}) + n = read(s.io, Int) + out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n) + unsafe_read(s.io, pointer(out), n) + return out +end + # default DataType deserializer function deserialize(s::AbstractSerializer, t::DataType) nf = nfields(t) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 876743eead9ec..bc1322e84ad15 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -17,48 +17,9 @@ Convert a string to a contiguous byte array representation encoded as UTF-8 byte This representation is often appropriate for passing strings to C. """ String(s::AbstractString) = print_to_string(s) -String(s::String) = s -# String constructor docstring from boot.jl, workaround for #16730 -# and the unavailability of @doc in boot.jl context. -""" - String(v::Vector{UInt8}) - -Create a new `String` from a vector `v` of bytes containing -UTF-8 encoded characters. This function takes "ownership" of -the array, which means that you should not subsequently modify -`v` (since strings are supposed to be immutable in Julia) for -as long as the string exists. - -If you need to subsequently modify `v`, use `String(copy(v))` instead. -""" -String(v::Array{UInt8,1}) - - -""" - unsafe_string(p::Ptr{UInt8}, [length::Integer]) - -Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8. -(The pointer can be safely freed afterwards.) If `length` is specified -(the length of the data in bytes), the string does not have to be NUL-terminated. - -This function is labelled "unsafe" because it will crash if `p` is not -a valid memory address to data of the requested length. 
- -See also [`unsafe_wrap(String, p, [length])`](@ref), which takes a pointer -and wraps a string object around it without making a copy. -""" -function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer) - p == C_NULL && throw(ArgumentError("cannot convert NULL to string")) - ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8},Int), p, len) -end -function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}) - p == C_NULL && throw(ArgumentError("cannot convert NULL to string")) - ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p) -end - -convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data -convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data +convert(::Type{Vector{UInt8}}, s::AbstractString) = convert(Vector{UInt8}, String(s)) +convert(::Type{Array{UInt8}}, s::AbstractString) = convert(Vector{UInt8}, s) convert(::Type{String}, s::AbstractString) = String(s) convert(::Type{Vector{Char}}, s::AbstractString) = collect(s) convert(::Type{Symbol}, s::AbstractString) = Symbol(s) @@ -155,12 +116,10 @@ end ==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0 isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0 -# faster comparisons for byte strings and symbols +# faster comparisons for symbols -cmp(a::String, b::String) = lexcmp(a.data, b.data) cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b))) -==(a::String, b::String) = a.data == b.data isless(a::Symbol, b::Symbol) = cmp(a,b) < 0 ## Generic validation functions ## @@ -190,32 +149,6 @@ prevind(s::AbstractArray , i::Integer) = Int(i)-1 nextind(s::DirectIndexString, i::Integer) = Int(i)+1 nextind(s::AbstractArray , i::Integer) = Int(i)+1 -function prevind(s::String, i::Integer) - j = Int(i) - e = endof(s.data) - if j > e - return endof(s) - end - j -= 1 - while j > 0 && is_valid_continuation(s.data[j]) - j -= 1 - end - j -end - -function nextind(s::String, i::Integer) - j = Int(i) - if j < 1 - return 1 - end - e = endof(s.data) - j += 1 - while 
j <= e && is_valid_continuation(s.data[j]) - j += 1 - end - j -end - """ prevind(str::AbstractString, i::Integer) @@ -319,10 +252,6 @@ next(e::EachStringIndex, state) = (state, nextind(e.s, state)) done(e::EachStringIndex, state) = done(e.s, state) eltype(::Type{EachStringIndex}) = Int -typealias Chars Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}} - -typealias ByteArray Union{Vector{UInt8},Vector{Int8}} - ## character column width function ## """ @@ -366,18 +295,6 @@ false isxdigit(c::Char) = '0'<=c<='9' || 'a'<=c<='f' || 'A'<=c<='F' isxdigit(s::AbstractString) = all(isxdigit, s) -## checking UTF-8 & ACSII validity ## - -byte_string_classify(data::Vector{UInt8}) = - ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data)) -byte_string_classify(s::String) = byte_string_classify(s.data) - # 0: neither valid ASCII nor UTF-8 - # 1: valid ASCII - # 2: valid UTF-8 - -isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0 -isvalid(s::String) = isvalid(String, s) - ## uppercase, lowercase, and titlecase transformations ## """ diff --git a/base/strings/io.jl b/base/strings/io.jl index 0d6e35a3a1c14..60a5b639dbe02 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -106,7 +106,7 @@ write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); en show(io::IO, s::AbstractString) = print_quoted(io, s) write(to::AbstractIOBuffer, s::SubString{String}) = - s.endof==0 ? 0 : write_sub(to, s.string.data, s.offset + 1, nextind(s, s.endof) - 1) + s.endof==0 ? 0 : unsafe_write(to, pointer(s.string, s.offset + 1), UInt(nextind(s, s.endof) - 1)) ## printing literal quoted string data ## @@ -136,8 +136,8 @@ end Create a read-only `IOBuffer` on the data underlying the given string. 
""" -IOBuffer(str::String) = IOBuffer(str.data) -IOBuffer(s::SubString{String}) = IOBuffer(view(s.string.data, s.offset + 1 : s.offset + sizeof(s))) +IOBuffer(str::String) = IOBuffer(Vector{UInt8}(str)) +IOBuffer(s::SubString{String}) = IOBuffer(view(Vector{UInt8}(s.string), s.offset + 1 : s.offset + sizeof(s))) # join is implemented using IO @@ -308,7 +308,7 @@ end unescape_string(s::AbstractString) = sprint(endof(s), unescape_string, s) -macro b_str(s); :($(unescape_string(s)).data); end +macro b_str(s); :(Vector{UInt8}($(unescape_string(s)))); end ## multiline strings ## diff --git a/base/strings/search.jl b/base/strings/search.jl index 497e2993b425b..a1c5d4ce2e5ce 100644 --- a/base/strings/search.jl +++ b/base/strings/search.jl @@ -1,5 +1,7 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license +typealias Chars Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}} + """ search(string::AbstractString, chars::Chars, [start::Integer]) @@ -78,16 +80,19 @@ function _search_bloom_mask(c) UInt64(1) << (c & 63) end -function _searchindex(s::Array, t::Array, i) - n = length(t) - m = length(s) +_nthbyte(s::String, i) = codeunit(s, i) +_nthbyte(a::ByteArray, i) = a[i] + +function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i) + n = sizeof(t) + m = sizeof(s) if n == 0 return 1 <= i <= m+1 ? 
max(1, i) : 0 elseif m == 0 return 0 elseif n == 1 - return search(s, t[1], i) + return search(s, _nthbyte(t,1), i) end w = m - n @@ -97,21 +102,21 @@ function _searchindex(s::Array, t::Array, i) bloom_mask = UInt64(0) skip = n - 1 - tlast = t[end] + tlast = _nthbyte(t,n) for j in 1:n - bloom_mask |= _search_bloom_mask(t[j]) - if t[j] == tlast && j < n + bloom_mask |= _search_bloom_mask(_nthbyte(t,j)) + if _nthbyte(t,j) == tlast && j < n skip = n - j - 1 end end i -= 1 while i <= w - if s[i+n] == tlast + if _nthbyte(s,i+n) == tlast # check candidate j = 0 while j < n - 1 - if s[i+j+1] != t[j+1] + if _nthbyte(s,i+j+1) != _nthbyte(t,j+1) break end j += 1 @@ -123,13 +128,13 @@ function _searchindex(s::Array, t::Array, i) end # no match, try to rule out the next character - if i < w && bloom_mask & _search_bloom_mask(s[i+n+1]) == 0 + if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 i += n else i += skip end elseif i < w - if bloom_mask & _search_bloom_mask(s[i+n+1]) == 0 + if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0 i += n end end @@ -158,11 +163,11 @@ function searchindex(s::String, t::String, i::Integer=1) if endof(t) == 1 search(s, t[1], i) else - searchindex(s.data, t.data, i) + _searchindex(s, t, i) end end -function search(s::ByteArray, t::ByteArray, i) +function _search(s, t, i::Integer) idx = searchindex(s,t,i) if isempty(t) idx:idx-1 @@ -171,14 +176,8 @@ function search(s::ByteArray, t::ByteArray, i) end end -function search(s::AbstractString, t::AbstractString, i::Integer=start(s)) - idx = searchindex(s,t,i) - if isempty(t) - idx:idx-1 - else - idx:(idx > 0 ? 
idx + endof(t) - 1 : -1) - end -end +search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i) +search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i) function rsearch(s::AbstractString, c::Chars) j = search(RevString(s), c) @@ -238,16 +237,16 @@ function _rsearchindex(s, t, i) end end -function _rsearchindex(s::Array, t::Array, k) - n = length(t) - m = length(s) +function _rsearchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, k) + n = sizeof(t) + m = sizeof(s) if n == 0 return 0 <= k <= m ? max(k, 1) : 0 elseif m == 0 return 0 elseif n == 1 - return rsearch(s, t[1], k) + return rsearch(s, _nthbyte(t,1), k) end w = m - n @@ -257,21 +256,21 @@ function _rsearchindex(s::Array, t::Array, k) bloom_mask = UInt64(0) skip = n - 1 - tfirst = t[1] + tfirst = _nthbyte(t,1) for j in n:-1:1 - bloom_mask |= _search_bloom_mask(t[j]) - if t[j] == tfirst && j > 1 + bloom_mask |= _search_bloom_mask(_nthbyte(t,j)) + if _nthbyte(t,j) == tfirst && j > 1 skip = j - 2 end end i = min(k - n + 1, w + 1) while i > 0 - if s[i] == tfirst + if _nthbyte(s,i) == tfirst # check candidate j = 1 while j < n - if s[i+j] != t[j+1] + if _nthbyte(s,i+j) != _nthbyte(t,j+1) break end j += 1 @@ -283,13 +282,13 @@ function _rsearchindex(s::Array, t::Array, k) end # no match, try to rule out the next character - if i > 1 && bloom_mask & _search_bloom_mask(s[i-1]) == 0 + if i > 1 && bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0 i -= n else i -= skip end elseif i > 1 - if bloom_mask & _search_bloom_mask(s[i-1]) == 0 + if bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0 i -= n end end @@ -299,7 +298,7 @@ function _rsearchindex(s::Array, t::Array, k) 0 end -rsearchindex(s::ByteArray,t::ByteArray,i) = _rsearchindex(s,t,i) +rsearchindex(s::ByteArray, t::ByteArray, i::Integer) = _rsearchindex(s,t,i) """ rsearchindex(s::AbstractString, substring, [start::Integer]) @@ -315,7 +314,7 @@ function rsearchindex(s::String, t::String) if endof(t) 
== 1 rsearch(s, t[1]) else - _rsearchindex(s.data, t.data, length(s.data)) + _rsearchindex(s, t, sizeof(s)) end end @@ -325,7 +324,7 @@ function rsearchindex(s::String, t::String, i::Integer) if endof(t) == 1 rsearch(s, t[1], i) elseif endof(t) != 0 - _rsearchindex(s.data, t.data, nextind(s, i)-1) + _rsearchindex(s, t, nextind(s, i)-1) elseif i > sizeof(s) return 0 elseif i == 0 @@ -335,7 +334,7 @@ function rsearchindex(s::String, t::String, i::Integer) end end -function rsearch(s::ByteArray, t::ByteArray, i::Integer) +function _rsearch(s, t, i::Integer) idx = rsearchindex(s,t,i) if isempty(t) idx:idx-1 @@ -344,14 +343,8 @@ function rsearch(s::ByteArray, t::ByteArray, i::Integer) end end -function rsearch(s::AbstractString, t::AbstractString, i::Integer=endof(s)) - idx = rsearchindex(s,t,i) - if isempty(t) - idx:idx-1 - else - idx:(idx > 0 ? idx + endof(t) - 1 : -1) - end -end +rsearch(s::AbstractString, t::AbstractString, i::Integer=endof(s)) = _rsearch(s, t, i) +rsearch(s::ByteArray, t::ByteArray, i::Integer=endof(s)) = _rsearch(s, t, i) """ contains(haystack::AbstractString, needle::AbstractString) @@ -366,49 +359,3 @@ true contains(haystack::AbstractString, needle::AbstractString) = searchindex(haystack,needle)!=0 in(::AbstractString, ::AbstractString) = error("use contains(x,y) for string containment") - -# ByteArray optimizations - -# find the index of the first occurrence of a value in a byte array - -function search(a::ByteArray, b::Union{Int8,UInt8}, i::Integer) - if i < 1 - throw(BoundsError(a, i)) - end - n = length(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) - q == C_NULL ? 
0 : Int(q-p+1) -end -function search(a::ByteArray, b::Char, i::Integer) - if isascii(b) - search(a,UInt8(b),i) - else - search(a,string(b).data,i).start - end -end -search(a::ByteArray, b::Union{Int8,UInt8,Char}) = search(a,b,1) - -function rsearch(a::ByteArray, b::Union{Int8,UInt8}, i::Integer) - if i < 1 - return i == 0 ? 0 : throw(BoundsError(a, i)) - end - n = length(a) - if i > n - return i == n+1 ? 0 : throw(BoundsError(a, i)) - end - p = pointer(a) - q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) - q == C_NULL ? 0 : Int(q-p+1) -end -function rsearch(a::ByteArray, b::Char, i::Integer) - if isascii(b) - rsearch(a,UInt8(b),i) - else - rsearch(a,string(b).data,i).start - end -end -rsearch(a::ByteArray, b::Union{Int8,UInt8,Char}) = rsearch(a,b,length(a)) diff --git a/base/strings/string.jl b/base/strings/string.jl index 702db77e53ddd..30c98f3c9a045 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -1,11 +1,130 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license -## from base/boot.jl: -# -# immutable String <: AbstractString -# data::Vector{UInt8} -# end -# +typealias ByteArray Union{Vector{UInt8},Vector{Int8}} + +## constructors and conversions ## + +# String constructor docstring from boot.jl, workaround for #16730 +# and the unavailability of @doc in boot.jl context. +""" + String(v::Vector{UInt8}) + +Create a new `String` from a vector `v` of bytes containing +UTF-8 encoded characters. This function takes "ownership" of +the array, which means that you should not subsequently modify +`v` (since strings are supposed to be immutable in Julia) for +as long as the string exists. + +If you need to subsequently modify `v`, use `String(copy(v))` instead. 
+""" +function String(v::Array{UInt8,1}) + # TODO share data + unsafe_string(pointer(v), length(v)) +end + +""" + unsafe_string(p::Ptr{UInt8}, [length::Integer]) + +Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8. +(The pointer can be safely freed afterwards.) If `length` is specified +(the length of the data in bytes), the string does not have to be NUL-terminated. + +This function is labelled "unsafe" because it will crash if `p` is not +a valid memory address to data of the requested length. + +See also [`unsafe_wrap(String, p, [length])`](@ref), which takes a pointer +and wraps a string object around it without making a copy. +""" +function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer) + p == C_NULL && throw(ArgumentError("cannot convert NULL to string")) + ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len) +end +function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}) + p == C_NULL && throw(ArgumentError("cannot convert NULL to string")) + ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p) +end + +# TODO share data +convert(::Type{Vector{UInt8}}, s::String) = UInt8[ unsafe_codeunit(s,i) for i=1:s.len ] +convert(::Type{String}, s::String) = s +convert(::Type{String}, v::Vector{UInt8}) = String(v) + +## low-level functions ## + +pointer(s::String) = unsafe_convert(Ptr{UInt8}, s) +pointer(s::String, i::Integer) = pointer(s)+(i-1) + +sizeof(s::String) = s.len + +""" + codeunit(s::AbstractString, i::Integer) + +Get the `i`th code unit of an encoded string. For example, +returns the `i`th byte of the representation of a UTF-8 string. 
+""" +codeunit(s::AbstractString, i::Integer) + +@inline function codeunit(s::String, i::Integer) + @boundscheck if (i < 1) | (i > s.len) + throw(BoundsError(s,i)) + end + unsafe_load(pointer(s),i) +end + +write(io::IO, s::String) = unsafe_write(io, pointer(s), UInt(s.len)) + +## comparison ## + +function cmp(a::String, b::String) + c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), + a, b, min(a.len,b.len)) + return c < 0 ? -1 : c > 0 ? +1 : cmp(a.len,b.len) +end + +function ==(a::String, b::String) + a.len == b.len && 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, a.len) +end + +## prevind and nextind ## + +function prevind(s::String, i::Integer) + j = Int(i) + e = s.len + if j > e + return endof(s) + end + j -= 1 + @inbounds while j > 0 && is_valid_continuation(codeunit(s,j)) + j -= 1 + end + j +end + +function nextind(s::String, i::Integer) + j = Int(i) + if j < 1 + return 1 + end + e = s.len + j += 1 + @inbounds while j <= e && is_valid_continuation(codeunit(s,j)) + j += 1 + end + j +end + +## checking UTF-8 & ACSII validity ## + +byte_string_classify(data::Vector{UInt8}) = + ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data)) +byte_string_classify(s::String) = + ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, s.len) + # 0: neither valid ASCII nor UTF-8 + # 1: valid ASCII + # 2: valid UTF-8 + +isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0 +isvalid(s::String) = isvalid(String, s) ## basic UTF-8 decoding & iteration ## @@ -34,46 +153,35 @@ const utf8_trailing = [ ## required core functionality ## function endof(s::String) - d = s.data - i = length(d) - @inbounds while i > 0 && is_valid_continuation(d[i]) + p = pointer(s) + i = s.len + while i > 0 && is_valid_continuation(unsafe_load(p,i)) i -= 1 end i end function length(s::String) - d = s.data + p = pointer(s) cnum = 0 - for i = 1:length(d) - @inbounds cnum += !is_valid_continuation(d[i]) + for i = 1:s.len + cnum += 
!is_valid_continuation(unsafe_load(p,i)) end cnum end -@noinline function slow_utf8_next(d::Vector{UInt8}, b::UInt8, i::Int) - # potentially faster version - # d = s.data - # a::UInt32 = d[i] - # if a < 0x80; return Char(a); end - # #if a&0xc0==0x80; return '\ufffd'; end - # b::UInt32 = a<<6 + d[i+1] - # if a < 0xe0; return Char(b - 0x00003080); end - # c::UInt32 = b<<6 + d[i+2] - # if a < 0xf0; return Char(c - 0x000e2080); end - # return Char(c<<6 + d[i+3] - 0x03c82080) - +@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int) if is_valid_continuation(b) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i))) end trailing = utf8_trailing[b + 1] - if length(d) < i + trailing + if l < i + trailing return '\ufffd', i+1 end c::UInt32 = 0 for j = 1:(trailing + 1) c <<= 6 - c += d[i] + c += unsafe_load(p,i) i += 1 end c -= utf8_offset[trailing + 1] @@ -82,18 +190,21 @@ end # This implementation relies on `next` returning a value past the end of the # String's underlying data, which is true for valid Strings -done(s::String, state) = state > endof(s.data) +done(s::String, state) = state > s.len @inline function next(s::String, i::Int) # function is split into this critical fast-path # for pure ascii data, such as parsing numbers, # and a longer function that can handle any utf8 data - d = s.data - b = d[i] + @boundscheck if (i < 1) | (i > s.len) + throw(BoundsError(s,i)) + end + p = pointer(s) + b = unsafe_load(p, i) if b < 0x80 return Char(b), i + 1 end - return slow_utf8_next(d, b, i) + return slow_utf8_next(p, b, i, s.len) end function first_utf8_byte(ch::Char) @@ -106,9 +217,9 @@ function first_utf8_byte(ch::Char) end function reverseind(s::String, i::Integer) - j = length(s.data) + 1 - i - d = s.data - while is_valid_continuation(d[j]) + j = s.len + 1 - i + p = pointer(s) + while is_valid_continuation(unsafe_load(p,j)) j -= 1 end return j @@ -116,101 +227,175 @@ end ## overload 
methods for efficiency ## -sizeof(s::String) = sizeof(s.data) - isvalid(s::String, i::Integer) = - (1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i]) - -const empty_utf8 = String(UInt8[]) + (1 <= i <= s.len) && !is_valid_continuation(unsafe_load(pointer(s),i)) function getindex(s::String, r::UnitRange{Int}) - isempty(r) && return empty_utf8 + isempty(r) && return "" i, j = first(r), last(r) - d = s.data - if i < 1 || i > length(s.data) + l = s.len + if i < 1 || i > l throw(BoundsError(s, i)) end - if is_valid_continuation(d[i]) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + @inbounds si = codeunit(s, i) + if is_valid_continuation(si) + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si)) end - if j > length(d) + if j > l throw(BoundsError()) end j = nextind(s,j)-1 - String(d[i:j]) + unsafe_string(pointer(s,i), j-i+1) end -function search(s::String, c::Char, i::Integer) +function search(s::String, c::Char, i::Integer = 1) if i < 1 || i > sizeof(s) i == sizeof(s) + 1 && return 0 throw(BoundsError(s, i)) end - d = s.data - if is_valid_continuation(d[i]) - throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i])) + if is_valid_continuation(codeunit(s,i)) + throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i))) end - c < Char(0x80) && return search(d, c%UInt8, i) + c < Char(0x80) && return search(s, c%UInt8, i) while true - i = search(d, first_utf8_byte(c), i) + i = search(s, first_utf8_byte(c), i) (i==0 || s[i] == c) && return i i = next(s,i)[2] end end -function rsearch(s::String, c::Char, i::Integer) - c < Char(0x80) && return rsearch(s.data, c%UInt8, i) +function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1) + if i < 1 + throw(BoundsError(a, i)) + end + n = sizeof(a) + if i > n + return i == n+1 ? 0 : throw(BoundsError(a, i)) + end + p = pointer(a) + q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1) + q == C_NULL ? 
0 : Int(q-p+1) +end + +function search(a::ByteArray, b::Char, i::Integer = 1) + if isascii(b) + search(a,UInt8(b),i) + else + search(a,Vector{UInt8}(string(b)),i).start + end +end + +function rsearch(s::String, c::Char, i::Integer = s.len) + c < Char(0x80) && return rsearch(s, c%UInt8, i) b = first_utf8_byte(c) while true - i = rsearch(s.data, b, i) + i = rsearch(s, b, i) (i==0 || s[i] == c) && return i i = prevind(s,i) end end +function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a)) + if i < 1 + return i == 0 ? 0 : throw(BoundsError(a, i)) + end + n = sizeof(a) + if i > n + return i == n+1 ? 0 : throw(BoundsError(a, i)) + end + p = pointer(a) + q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i) + q == C_NULL ? 0 : Int(q-p+1) +end + +function rsearch(a::ByteArray, b::Char, i::Integer = length(a)) + if isascii(b) + rsearch(a,UInt8(b),i) + else + rsearch(a,Vector{UInt8}(string(b)),i).start + end +end + +## optimized concatenation, reverse, repeat ## + function string(a::String...) if length(a) == 1 return a[1]::String end - # ^^ at least one must be UTF-8 or the ASCII-only method would get called - data = Array{UInt8}(0) - for d in a - append!(data,d.data) + n = 0 + for str in a + n += str.len + end + out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n) + offs = 1 + for str in a + unsafe_copy!(pointer(out,offs), pointer(str), str.len) + offs += str.len end - String(data) + return out +end + +# UTF-8 encoding length of a character +function codelen(d::Char) + c = UInt32(d) + if c < 0x80 + return 1 + elseif c < 0x800 + return 2 + elseif c < 0x10000 + return 3 + elseif c < 0x110000 + return 4 + end + return 3 # '\ufffd' end function string(a::Union{String,Char}...) 
- s = Array{UInt8}(0) + n = 0 + for d in a + if isa(d,Char) + n += codelen(d::Char) + else + n += (d::String).len + end + end + out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n) + offs = 1 + p = pointer(out) for d in a if isa(d,Char) c = UInt32(d::Char) if c < 0x80 - push!(s, c%UInt8) + unsafe_store!(p, c%UInt8, offs); offs += 1 elseif c < 0x800 - push!(s, (( c >> 6 ) | 0xC0)%UInt8) - push!(s, (( c & 0x3F ) | 0x80)%UInt8) + unsafe_store!(p, (( c >> 6 ) | 0xC0)%UInt8, offs); offs += 1 + unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 elseif c < 0x10000 - push!(s, (( c >> 12 ) | 0xE0)%UInt8) - push!(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8) - push!(s, (( c & 0x3F ) | 0x80)%UInt8) + unsafe_store!(p, (( c >> 12 ) | 0xE0)%UInt8, offs); offs += 1 + unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 + unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 elseif c < 0x110000 - push!(s, (( c >> 18 ) | 0xF0)%UInt8) - push!(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8) - push!(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8) - push!(s, (( c & 0x3F ) | 0x80)%UInt8) + unsafe_store!(p, (( c >> 18 ) | 0xF0)%UInt8, offs); offs += 1 + unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 + unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1 + unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1 else # '\ufffd' - push!(s, 0xef); push!(s, 0xbf); push!(s, 0xbd) + unsafe_store!(p, 0xef, offs); offs += 1 + unsafe_store!(p, 0xbf, offs); offs += 1 + unsafe_store!(p, 0xbd, offs); offs += 1 end else - append!(s,(d::String).data) + l = (d::String).len + unsafe_copy!(pointer(out,offs), pointer(d::String), l) + offs += l end end - String(s) + return out end function reverse(s::String) - dat = s.data + dat = convert(Vector{UInt8},s) n = length(dat) n <= 1 && return s buf = Vector{UInt8}(n) @@ -241,10 +426,12 @@ function reverse(s::String) String(buf) end -write(io::IO, s::String) = write(io, s.data) - -pointer(x::String) = 
pointer(x.data) -pointer(x::String, i::Integer) = pointer(x.data)+(i-1) - -convert(::Type{String}, s::String) = s -convert(::Type{String}, v::Vector{UInt8}) = String(v) +function repeat(s::String, r::Integer) + r < 0 && throw(ArgumentError("can't repeat a string $r times")) + n = s.len + out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n*r) + for i=1:r + unsafe_copy!(pointer(out, 1+(i-1)*n), pointer(s), n) + end + return out +end diff --git a/base/strings/strings.jl b/base/strings/strings.jl index c6d7834640699..2f175900e37d5 100644 --- a/base/strings/strings.jl +++ b/base/strings/strings.jl @@ -1,7 +1,6 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license include("strings/errors.jl") -include("strings/string.jl") include("strings/types.jl") include("strings/basic.jl") include("strings/search.jl") diff --git a/base/strings/types.jl b/base/strings/types.jl index 658966f0a380b..5fb036bb80871 100644 --- a/base/strings/types.jl +++ b/base/strings/types.jl @@ -76,7 +76,7 @@ prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset convert{T<:AbstractString}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s)) String(p::SubString{String}) = - String(p.string.data[1+p.offset:p.offset+nextind(p, p.endof)-1]) + unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1) function getindex(s::AbstractString, r::UnitRange{Int}) checkbounds(s, r) || throw(BoundsError(s, r)) @@ -95,7 +95,7 @@ end cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s function unsafe_convert{R<:Union{Int8, UInt8}}(::Type{Ptr{R}}, s::SubString{String}) - unsafe_convert(Ptr{R}, s.string.data) + s.offset + convert(Ptr{R}, pointer(s.string)) + s.offset end ## reversed strings without data movement ## @@ -140,16 +140,6 @@ function repeat(s::AbstractString, r::Integer) repeat(convert(String, s), r) end -function repeat(s::String, r::Integer) - r < 0 && throw(ArgumentError("can't repeat a 
string $r times")) - d = s.data; n = length(d) - out = Array{UInt8}(n*r) - for i=1:r - copy!(out, 1+(i-1)*n, d, 1, n) - end - convert(typeof(s), out) -end - """ ^(s::AbstractString, n::Integer) @@ -163,5 +153,5 @@ julia> "Test "^3 """ (^)(s::AbstractString, r::Integer) = repeat(s,r) -pointer(x::SubString{String}) = pointer(x.string.data) + x.offset -pointer(x::SubString{String}, i::Integer) = pointer(x.string.data) + x.offset + (i-1) +pointer(x::SubString{String}) = pointer(x.string) + x.offset +pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1) diff --git a/base/strings/util.jl b/base/strings/util.jl index cf6b630191a24..6f3dc0efaf113 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -52,7 +52,8 @@ function endswith(a::AbstractString, b::AbstractString) end endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars -startswith(a::String, b::String) = startswith(a.data, b.data) +startswith(a::String, b::String) = + (a.len >= b.len && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, b.len) == 0) startswith(a::Vector{UInt8}, b::Vector{UInt8}) = (length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0) @@ -87,9 +88,9 @@ function chomp(s::AbstractString) end function chomp(s::String) i = endof(s) - if i < 1 || s.data[i] != 0x0a + if i < 1 || codeunit(s,i) != 0x0a SubString(s, 1, i) - elseif i < 2 || s.data[i-1] != 0x0d + elseif i < 2 || codeunit(s,i-1) != 0x0d SubString(s, 1, i-1) else SubString(s, 1, i-2) @@ -97,13 +98,14 @@ function chomp(s::String) end # NOTE: use with caution -- breaks the immutable string convention! -function chomp!(s::String) - if !isempty(s) && s.data[end] == 0x0a - n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 
1 : 2 - ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) - end - return s -end +# TODO: this is hard to provide with the new representation +#function chomp!(s::String) +# if !isempty(s) && codeunit(s,s.len) == 0x0a +# n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2 +# ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) +# end +# return s +#end chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types const _default_delims = [' ','\t','\n','\v','\f','\r'] @@ -340,7 +342,7 @@ function replace(str::String, pattern, repl, limit::Integer) ensureroom(out, floor(Int, 1.2sizeof(str))) while j != 0 if i == a || i <= k - write_sub(out, str.data, i, j-i) + unsafe_write(out, pointer(str, i), UInt(j-i)) _replace(out, repl, str, r, pattern) end if k=2?Core.ARGS[2].data:"".data, "build_h.jl".data))) # include($BUILDROOT/base/build_h.jl) -include(String(vcat(length(Core.ARGS)>=2?Core.ARGS[2].data:"".data, "version_git.jl".data))) # include($BUILDROOT/base/version_git.jl) +include(string((length(Core.ARGS)>=2 ? Core.ARGS[2] : ""), "build_h.jl")) # include($BUILDROOT/base/build_h.jl) +include(string((length(Core.ARGS)>=2 ? Core.ARGS[2] : ""), "version_git.jl")) # include($BUILDROOT/base/version_git.jl) + include("osutils.jl") include("c.jl") include("sysinfo.jl") @@ -158,14 +168,7 @@ include("io.jl") include("iostream.jl") include("iobuffer.jl") -# define MIME"foo/bar" early so that we can overload 3-arg show -immutable MIME{mime} end -macro MIME_str(s) - :(MIME{$(Expr(:quote, Symbol(s)))}) -end - # strings & printing -include("char.jl") include("intfuncs.jl") include("strings/strings.jl") include("parse.jl") diff --git a/base/util.jl b/base/util.jl index 6864bb8bca37c..6cf42650cb7e1 100644 --- a/base/util.jl +++ b/base/util.jl @@ -536,7 +536,7 @@ will always be called. """ function securezero! 
end @noinline securezero!{T<:Number}(a::AbstractArray{T}) = fill!(a, 0) -securezero!(s::String) = securezero!(s.data) +securezero!(s::String) = unsafe_securezero!(pointer(s), sizeof(s)) @noinline unsafe_securezero!{T}(p::Ptr{T}, len::Integer=1) = ccall(:memset, Ptr{T}, (Ptr{T}, Cint, Csize_t), p, 0, len*sizeof(T)) unsafe_securezero!(p::Ptr{Void}, len::Integer=1) = Ptr{Void}(unsafe_securezero!(Ptr{UInt8}(p), len)) @@ -669,8 +669,8 @@ a starting `crc` integer to be mixed in with the checksum. (Technically, a little-endian checksum is computed.) """ function crc32c end -crc32c(a::Array{UInt8}, crc::UInt32=0x00000000) = ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_t), crc, a, sizeof(a)) -crc32c(s::String, crc::UInt32=0x00000000) = crc32c(s.data, crc) +crc32c(a::Union{Array{UInt8},String}, crc::UInt32=0x00000000) = + ccall(:jl_crc32c, UInt32, (UInt32, Ptr{UInt8}, Csize_t), crc, a, sizeof(a)) """ @kwdef typedef diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index ea0525ab654bb..a9895595484c7 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -11,6 +11,7 @@ Core.String(::AbstractString) Base.transcode Base.unsafe_string Base.unsafe_wrap(::Type{String}, ::Union{Ptr{Int8}, Ptr{UInt8}}, ::Integer, ::Bool) +Base.codeunit(::AbstractString, ::Integer) Base.ascii Base.@r_str Base.Docs.@html_str diff --git a/src/alloc.c b/src/alloc.c index bff42a4b4d07b..5ecebb73abe04 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -29,6 +29,7 @@ jl_value_t *jl_array_uint8_type; jl_value_t *jl_array_any_type=NULL; jl_value_t *jl_array_symbol_type; jl_datatype_t *jl_weakref_type; +jl_datatype_t *jl_abstractstring_type; jl_datatype_t *jl_string_type; jl_datatype_t *jl_expr_type; jl_datatype_t *jl_globalref_type; diff --git a/src/array.c b/src/array.c index a9c18062b0a01..913e94b52bbe9 100644 --- a/src/array.c +++ b/src/array.c @@ -369,20 +369,23 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len) JL_DLLEXPORT jl_value_t 
*jl_array_to_string(jl_array_t *a) { - jl_ptls_t ptls = jl_get_ptls_states(); - if (!jl_typeis(a, jl_array_uint8_type)) - jl_type_error("jl_array_to_string", (jl_value_t*)jl_array_uint8_type, (jl_value_t*)a); - jl_value_t *s = jl_gc_alloc(ptls, sizeof(void*), jl_string_type); - jl_set_nth_field(s, 0, (jl_value_t*)a); - return s; + return jl_pchar_to_string(jl_array_data(a), jl_array_len(a)); } JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len) { - jl_array_t *a = jl_pchar_to_array(str, len); - JL_GC_PUSH1(&a); - jl_value_t *s = jl_array_to_string(a); - JL_GC_POP(); + jl_value_t *s = jl_gc_alloc(jl_get_ptls_states(), sizeof(size_t)+len+1, jl_string_type); + *(size_t*)s = len; + memcpy((char*)s + sizeof(size_t), str, len); + ((char*)s + sizeof(size_t))[len] = 0; + return s; +} + +JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len) +{ + jl_value_t *s = jl_gc_alloc(jl_get_ptls_states(), sizeof(size_t)+len+1, jl_string_type); + *(size_t*)s = len; + ((char*)s + sizeof(size_t))[len] = 0; return s; } diff --git a/src/builtins.c b/src/builtins.c index 721e6270d21ca..6b112f2c557a5 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -358,7 +358,8 @@ JL_CALLABLE(jl_f_sizeof) jl_value_t *x = args[0]; if (jl_is_datatype(x)) { jl_datatype_t *dx = (jl_datatype_t*)x; - if (dx->name == jl_array_typename || dx == jl_symbol_type || dx == jl_simplevector_type) + if (dx->name == jl_array_typename || dx == jl_symbol_type || dx == jl_simplevector_type || + dx == jl_string_type) jl_error("type does not have a canonical binary representation"); if (!(dx->name->names == jl_emptysvec && jl_datatype_size(dx) > 0)) { // names===() and size > 0 => bitstype, size always known @@ -367,9 +368,10 @@ JL_CALLABLE(jl_f_sizeof) } return jl_box_long(jl_datatype_size(x)); } - if (jl_is_array(x)) { + if (jl_is_array(x)) return jl_box_long(jl_array_len(x) * ((jl_array_t*)x)->elsize); - } + if (jl_is_string(x)) + return jl_box_long(jl_string_len(x)); jl_datatype_t *dt = 
(jl_datatype_t*)jl_typeof(x); assert(jl_is_datatype(dt)); assert(!dt->abstract); @@ -1228,6 +1230,9 @@ void jl_init_primitives(void) add_builtin("Int", (jl_value_t*)jl_int32_type); #endif + add_builtin("AbstractString", (jl_value_t*)jl_abstractstring_type); + add_builtin("String", (jl_value_t*)jl_string_type); + add_builtin("ANY", jl_ANY_flag); } @@ -1371,7 +1376,9 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v, jl_datatype_t *vt n += jl_printf(out, "nothing"); } else if (vt == jl_string_type) { - n += jl_printf(out, "\"%s\"", jl_iostr_data(v)); + n += jl_printf(out, "\""); + jl_uv_puts(out, jl_string_data(v), jl_string_len(v)); n += jl_string_len(v); + n += jl_printf(out, "\""); } else if (vt == jl_uniontype_type) { n += jl_show_svec(out, ((jl_uniontype_t*)v)->types, "Union", "{", "}"); diff --git a/src/codegen.cpp b/src/codegen.cpp index 7e953346d02a6..bb96e2dd9bdfb 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -2781,7 +2781,7 @@ static bool emit_builtin_call(jl_cgval_t *ret, jl_value_t *f, jl_value_t **args, sty = (jl_datatype_t*)jl_tparam0(sty); } if (jl_is_datatype(sty) && sty != jl_symbol_type && sty->name != jl_array_typename && - sty != jl_simplevector_type && + sty != jl_simplevector_type && sty != jl_string_type && // exclude DataType, since each DataType has its own size, not sizeof(DataType). 
// this is issue #8798 sty != jl_datatype_type) { diff --git a/src/dump.c b/src/dump.c index 1a1668c06b71f..ae3740fd18e42 100644 --- a/src/dump.c +++ b/src/dump.c @@ -997,6 +997,11 @@ static void jl_serialize_value_(jl_serializer_state *s, jl_value_t *v) else if (jl_typeis(v, jl_task_type)) { jl_error("Task cannot be serialized"); } + else if (jl_typeis(v, jl_string_type)) { + writetag(s->s, jl_string_type); + write_int32(s->s, jl_string_len(v)); + ios_write(s->s, jl_string_data(v), jl_string_len(v)); + } else { jl_datatype_t *t = (jl_datatype_t*)jl_typeof(v); void *data = jl_data_ptr(v); @@ -1963,6 +1968,14 @@ static jl_value_t *jl_deserialize_value_(jl_serializer_state *s, jl_value_t *vta else if (vtag == (jl_value_t*)Singleton_tag) { return jl_deserialize_value_singleton(s, loc); } + else if (vtag == (jl_value_t*)jl_string_type) { + size_t n = read_int32(s->s); + jl_value_t *str = jl_alloc_string(n); + if (usetable) + arraylist_push(&backref_list, str); + ios_read(s->s, jl_string_data(str), n); + return str; + } else { assert(vtag == (jl_value_t*)jl_datatype_type || vtag == (jl_value_t*)SmallDataType_tag); return jl_deserialize_value_any(s, vtag, loc); @@ -2938,7 +2951,7 @@ void jl_init_serializer(void) void *tags[] = { jl_symbol_type, jl_ssavalue_type, jl_datatype_type, jl_slotnumber_type, jl_simplevector_type, jl_array_type, jl_typedslot_type, jl_expr_type, (void*)LongSymbol_tag, (void*)LongSvec_tag, - (void*)LongExpr_tag, (void*)LiteralVal_tag, + (void*)LongExpr_tag, (void*)LiteralVal_tag, jl_string_type, (void*)SmallInt64_tag, (void*)SmallDataType_tag, (void*)Int32_tag, (void*)Array1d_tag, (void*)Singleton_tag, jl_module_type, jl_tvar_type, jl_method_instance_type, jl_method_type, @@ -2966,8 +2979,7 @@ void jl_init_serializer(void) #ifndef _P64 jl_box_int32(33), jl_box_int32(34), jl_box_int32(35), jl_box_int32(36), jl_box_int32(37), jl_box_int32(38), - jl_box_int32(39), jl_box_int32(40), jl_box_int32(41), - jl_box_int32(42), jl_box_int32(43), + 
jl_box_int32(39), #endif jl_box_int64(0), jl_box_int64(1), jl_box_int64(2), jl_box_int64(3), jl_box_int64(4), jl_box_int64(5), @@ -2983,11 +2995,10 @@ void jl_init_serializer(void) #ifdef _P64 jl_box_int64(33), jl_box_int64(34), jl_box_int64(35), jl_box_int64(36), jl_box_int64(37), jl_box_int64(38), - jl_box_int64(39), jl_box_int64(40), jl_box_int64(41), - jl_box_int64(42), jl_box_int64(43), + jl_box_int64(39), #endif jl_labelnode_type, jl_linenumbernode_type, - jl_gotonode_type, jl_quotenode_type, + jl_gotonode_type, jl_quotenode_type, jl_abstractstring_type, jl_type_type, jl_bottom_type, jl_ref_type, jl_pointer_type, jl_vararg_type, jl_abstractarray_type, jl_densearray_type, jl_void_type, jl_function_type, @@ -3010,7 +3021,7 @@ void jl_init_serializer(void) jl_typector_type->name, jl_intrinsic_type->name, jl_task_type->name, jl_labelnode_type->name, jl_linenumbernode_type->name, jl_builtin_type->name, jl_gotonode_type->name, jl_quotenode_type->name, - jl_globalref_type->name, + jl_globalref_type->name, jl_string_type->name, jl_abstractstring_type->name, ptls->root_task, diff --git a/src/gc.c b/src/gc.c index 83a22e9394466..753a1442c66bc 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1382,6 +1382,10 @@ static int push_root(jl_ptls_t ptls, jl_value_t *v, int d, int bits) bits = gc_setmark(ptls, v, sizeof(jl_weakref_t)); goto ret; } + else if (vt == (jl_value_t*)jl_string_type) { + bits = gc_setmark(ptls, v, jl_string_len(v) + sizeof(size_t) + 1); + goto ret; + } if ((jl_is_datatype(vt) && ((jl_datatype_t*)vt)->layout->pointerfree)) { int sz = jl_datatype_size(vt); bits = gc_setmark(ptls, v, sz); diff --git a/src/init.c b/src/init.c index b49703c0be759..af168dd0dfbe9 100644 --- a/src/init.c +++ b/src/init.c @@ -828,7 +828,6 @@ void jl_get_builtin_hooks(void) jl_segv_exception = jl_new_struct_uninit((jl_datatype_t*)core("SegmentationFault")); #endif - jl_string_type = (jl_datatype_t*)core("String"); jl_weakref_type = (jl_datatype_t*)core("WeakRef"); jl_vecelement_typename 
= ((jl_datatype_t*)core("VecElement"))->name; } diff --git a/src/jltypes.c b/src/jltypes.c index fc334d67e6f37..1f3ebf3d602f7 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -4009,6 +4009,11 @@ void jl_init_types(void) jl_ANY_flag = (jl_value_t*)tvar("ANY"); + jl_abstractstring_type = jl_new_abstracttype((jl_value_t*)jl_symbol("AbstractString"), jl_any_type, jl_emptysvec); + jl_string_type = jl_new_datatype(jl_symbol("String"), jl_abstractstring_type, jl_emptysvec, + jl_svec1(jl_symbol("len")), jl_svec1(jl_long_type), + 0, 1, 1); + // complete builtin type metadata jl_value_t *pointer_void = jl_apply_type((jl_value_t*)jl_pointer_type, jl_svec1(jl_void_type)); diff --git a/src/julia.h b/src/julia.h index d40fe57238fa8..5003e38761b88 100644 --- a/src/julia.h +++ b/src/julia.h @@ -515,6 +515,7 @@ extern JL_DLLEXPORT jl_datatype_t *jl_densearray_type; extern JL_DLLEXPORT jl_datatype_t *jl_array_type; extern JL_DLLEXPORT jl_typename_t *jl_array_typename; extern JL_DLLEXPORT jl_datatype_t *jl_weakref_type; +extern JL_DLLEXPORT jl_datatype_t *jl_abstractstring_type; extern JL_DLLEXPORT jl_datatype_t *jl_string_type; extern JL_DLLEXPORT jl_datatype_t *jl_errorexception_type; extern JL_DLLEXPORT jl_datatype_t *jl_argumenterror_type; @@ -763,9 +764,8 @@ STATIC_INLINE void jl_array_uint8_set(void *a, size_t i, uint8_t x) #define jl_data_ptr(v) ((jl_value_t**)v) #define jl_array_ptr_data(a) ((jl_value_t**)((jl_array_t*)a)->data) -#define jl_string_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data) -#define jl_string_len(s) (jl_array_len((jl_array_t*)(jl_data_ptr(s)[0]))) -#define jl_iostr_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data) +#define jl_string_data(s) ((char*)s + sizeof(void*)) +#define jl_string_len(s) (*(size_t*)s) #define jl_gf_mtable(f) (((jl_datatype_t*)jl_typeof(f))->name->mt) #define jl_gf_name(f) (jl_gf_mtable(f)->name) @@ -1156,6 +1156,7 @@ JL_DLLEXPORT jl_array_t *jl_alloc_array_3d(jl_value_t *atype, size_t nr, JL_DLLEXPORT jl_array_t 
*jl_pchar_to_array(const char *str, size_t len); JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len); JL_DLLEXPORT jl_value_t *jl_cstr_to_string(const char *str); +JL_DLLEXPORT jl_value_t *jl_alloc_string(size_t len); JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a); JL_DLLEXPORT jl_array_t *jl_alloc_vec_any(size_t n); JL_DLLEXPORT jl_value_t *jl_arrayref(jl_array_t *a, size_t i); // 0-indexed @@ -1616,6 +1617,7 @@ typedef struct { #define _JL_FORMAT_ATTR(type, str, arg) #endif +JL_DLLEXPORT void jl_uv_puts(uv_stream_t *stream, const char *str, size_t n); JL_DLLEXPORT int jl_printf(uv_stream_t *s, const char *format, ...) _JL_FORMAT_ATTR(printf, 2, 3); JL_DLLEXPORT int jl_vprintf(uv_stream_t *s, const char *format, va_list args) diff --git a/src/toplevel.c b/src/toplevel.c index ad08895d6d4a8..301b8ba8fe38a 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -678,12 +678,8 @@ JL_DLLEXPORT jl_value_t *jl_load(const char *fname) // load from filename given as a String object JL_DLLEXPORT jl_value_t *jl_load_(jl_value_t *str) { - jl_array_t *ary = - jl_array_cconvert_cstring((jl_array_t*)(jl_data_ptr(str)[0])); - JL_GC_PUSH1(&ary); - jl_value_t *res = jl_load((const char*)ary->data); - JL_GC_POP(); - return res; + // assume String has a hidden '\0' at the end + return jl_load((const char*)jl_string_data(str)); } // method definition ---------------------------------------------------------- diff --git a/test/reflection.jl b/test/reflection.jl index 59b572d399a9f..d7c1493b31709 100644 --- a/test/reflection.jl +++ b/test/reflection.jl @@ -160,7 +160,6 @@ not_const = 1 @test isimmutable(1) == true @test isimmutable([]) == false -@test isimmutable("abc") == true ## find bindings tests @test ccall(:jl_get_module_of_binding, Any, (Any, Any), Base, :sin)==Base diff --git a/test/replcompletions.jl b/test/replcompletions.jl index 6335342596c57..20aaf34ada5f7 100644 --- a/test/replcompletions.jl +++ b/test/replcompletions.jl @@ -360,13 +360,13 @@ 
s = "\"\"." c,r = test_complete(s) @test length(c)==1 @test r == (endof(s)+1):endof(s) -@test c[1] == "data" +@test c[1] == "len" s = "(\"\"*\"\")." c,r = test_complete(s) @test length(c)==1 @test r == (endof(s)+1):endof(s) -@test c[1] == "data" +@test c[1] == "len" s = "CompletionFoo.test_y_array[1]." c,r = test_complete(s) diff --git a/test/strings/basic.jl b/test/strings/basic.jl index d7556f339826e..55fa2d3e42730 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -1,10 +1,7 @@ # This file is a part of Julia. License is MIT: http://julialang.org/license # constructors -let d = [0x61,0x62,0x63,0x21] - @test String(d) == "abc!" - @test String(d).data === d # String(d) should not make a copy -end +@test String([0x61,0x62,0x63,0x21]) == "abc!" @test String("abc!") == "abc!" @test isempty(string()) From 79bb7bea96500a51138d6f7750ae0c178b3bde0f Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Tue, 20 Dec 2016 23:03:03 -0500 Subject: [PATCH 2/5] make tests independent of string representation --- test/core.jl | 6 +----- test/iobuffer.jl | 2 +- test/misc.jl | 2 +- test/mmap.jl | 38 +++++++++++++++++------------------ test/perf/shootout/revcomp.jl | 2 +- test/read.jl | 10 ++++----- test/strings/basic.jl | 4 ++-- test/unicode/utf8.jl | 2 +- 8 files changed, 31 insertions(+), 35 deletions(-) diff --git a/test/core.jl b/test/core.jl index d0a43d40effbd..4be22a56c1e0b 100644 --- a/test/core.jl +++ b/test/core.jl @@ -636,10 +636,6 @@ let @test !isdefined(a, :foo) @test !isdefined(2, :a) - @test isdefined("a",:data) - @test isdefined("a", 1) - @test !isdefined("a", 2) - @test_throws TypeError isdefined(2) end @@ -4007,7 +4003,7 @@ b = "aaa" c = [0x2, 0x1, 0x3] @test check_nul(a) -@test check_nul(b.data) +@test check_nul(Vector{UInt8}(b)) @test check_nul(c) d = [0x2, 0x1, 0x3] @test check_nul(d) diff --git a/test/iobuffer.jl b/test/iobuffer.jl index 53b7eb4df806b..5d429a39317a2 100644 --- a/test/iobuffer.jl +++ b/test/iobuffer.jl @@ -103,7 +103,7 @@ 
write(io,[1,2,3]) skip(io,1) @test write(io,UInt8(104)) == 1 skip(io,3) -@test write(io,"apples".data) == 3 +@test write(io,b"apples") == 3 skip(io,71) @test write(io,'y') == 1 @test readstring(io) == "happy" diff --git a/test/misc.jl b/test/misc.jl index a5b489255ec44..af9972d1f8210 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -492,7 +492,7 @@ end let s = "abcα🐨\0x\0" for T in (UInt8, UInt16, UInt32, Int32) - @test transcode(T, s) == transcode(T, s.data) + @test transcode(T, s) == transcode(T, Vector{UInt8}(s)) @test transcode(String, transcode(T, s)) == s end end diff --git a/test/mmap.jl b/test/mmap.jl index 14285d3851de6..f79a1154baf21 100644 --- a/test/mmap.jl +++ b/test/mmap.jl @@ -2,7 +2,7 @@ file = tempname() write(file, "Hello World\n") -t = "Hello World".data +t = b"Hello World" @test Mmap.mmap(file, Array{UInt8,3}, (11,1,1)) == reshape(t,(11,1,1)) gc(); gc() @test Mmap.mmap(file, Array{UInt8,3}, (1,11,1)) == reshape(t,(1,11,1)) @@ -16,7 +16,7 @@ gc(); gc() gc(); gc() @test Mmap.mmap(file, Array{UInt8,2}, (0,12)) == Array{UInt8}((0,0)) m = Mmap.mmap(file, Array{UInt8,3}, (1,2,1)) -@test m == reshape("He".data,(1,2,1)) +@test m == reshape(b"He",(1,2,1)) finalize(m); m=nothing; gc() # constructors @@ -49,7 +49,7 @@ s = open(f->f,file,"w") @test Mmap.mmap(file) == Array{UInt8}(0) # requested len=0 on empty file @test Mmap.mmap(file,Vector{UInt8},0) == Array{UInt8}(0) m = Mmap.mmap(file,Vector{UInt8},12) -m[:] = "Hello World\n".data +m[:] = b"Hello World\n" Mmap.sync!(m) finalize(m); m=nothing; gc() @test open(readstring,file) == "Hello World\n" @@ -115,10 +115,10 @@ write(file, "Hello World\n") s = open(file, "r") @test isreadonly(s) == true c = Mmap.mmap(s, Vector{UInt8}, (11,)) -@test c == "Hello World".data +@test c == b"Hello World" finalize(c); c=nothing; gc() c = Mmap.mmap(s, Vector{UInt8}, (UInt16(11),)) -@test c == "Hello World".data +@test c == b"Hello World" finalize(c); c=nothing; gc() @test_throws ArgumentError Mmap.mmap(s, Vector{UInt8}, 
(Int16(-11),)) @test_throws ArgumentError Mmap.mmap(s, Vector{UInt8}, (typemax(UInt),)) @@ -136,18 +136,18 @@ close(s) finalize(c); c=nothing; gc() c = Mmap.mmap(file) -@test c == "Hellx World\n".data +@test c == b"Hellx World\n" finalize(c); c=nothing; gc() c = Mmap.mmap(file, Vector{UInt8}, 3) -@test c == "Hel".data +@test c == b"Hel" finalize(c); c=nothing; gc() s = open(file, "r") c = Mmap.mmap(s, Vector{UInt8}, 6) -@test c == "Hellx ".data +@test c == b"Hellx " close(s) finalize(c); c=nothing; gc() c = Mmap.mmap(file, Vector{UInt8}, 5, 6) -@test c == "World".data +@test c == b"World" finalize(c); c=nothing; gc() s = open(file, "w") @@ -156,26 +156,26 @@ close(s) # test Mmap.mmap m = Mmap.mmap(file) -t = "Hello World\n" +tdata = b"Hello World\n" for i = 1:12 - @test m[i] == t.data[i] + @test m[i] == tdata[i] end @test_throws BoundsError m[13] finalize(m); m=nothing; gc() m = Mmap.mmap(file,Vector{UInt8},6) -@test m[1] == "H".data[1] -@test m[2] == "e".data[1] -@test m[3] == "l".data[1] -@test m[4] == "l".data[1] -@test m[5] == "o".data[1] -@test m[6] == " ".data[1] +@test m[1] == b"H"[1] +@test m[2] == b"e"[1] +@test m[3] == b"l"[1] +@test m[4] == b"l"[1] +@test m[5] == b"o"[1] +@test m[6] == b" "[1] @test_throws BoundsError m[7] finalize(m); m=nothing; gc() m = Mmap.mmap(file,Vector{UInt8},2,6) -@test m[1] == "W".data[1] -@test m[2] == "o".data[1] +@test m[1] == b"W"[1] +@test m[2] == b"o"[1] @test_throws BoundsError m[3] finalize(m); m = nothing; gc() diff --git a/test/perf/shootout/revcomp.jl b/test/perf/shootout/revcomp.jl index 5ef433c092b63..b9fc342cb9728 100644 --- a/test/perf/shootout/revcomp.jl +++ b/test/perf/shootout/revcomp.jl @@ -43,7 +43,7 @@ function revcomp(infile="revcomp-input.txt") input = open(infile, "r") buff = UInt8[] while true - line = readline(input).data + line = readuntil(input, UInt8('\n')) if isempty(line) # print_buff(buff) return diff --git a/test/read.jl b/test/read.jl index d753c8bdd33b2..42bce748e9538 100644 --- a/test/read.jl 
+++ b/test/read.jl @@ -296,7 +296,7 @@ for (name, f) in l @test readstring("$filename.to") == text verbose && println("$name write(::IOBuffer, ...)") - to = IOBuffer(copy(text.data), false, true) + to = IOBuffer(copy(Vector{UInt8}(text)), false, true) write(to, io()) @test String(take!(to)) == text @@ -365,14 +365,14 @@ test_read_nbyte() let s = "qwerty" - @test read(IOBuffer(s)) == s.data - @test read(IOBuffer(s), 10) == s.data - @test read(IOBuffer(s), 1) == s.data[1:1] + @test read(IOBuffer(s)) == Vector{UInt8}(s) + @test read(IOBuffer(s), 10) == Vector{UInt8}(s) + @test read(IOBuffer(s), 1) == Vector{UInt8}(s)[1:1] # Test growing output array x = UInt8[] n = readbytes!(IOBuffer(s), x, 10) - @test x == s.data + @test x == Vector{UInt8}(s) @test n == length(x) end diff --git a/test/strings/basic.jl b/test/strings/basic.jl index 55fa2d3e42730..41ac521e4fe89 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -438,8 +438,8 @@ foobaz(ch) = reinterpret(Char, typemax(UInt32)) # issue #18280: next/nextind must return past String's underlying data for s in ("Hello", "Σ", "こんにちは", "😊😁") - @test next(s, endof(s))[2] > endof(s.data) - @test nextind(s, endof(s)) > endof(s.data) + @test next(s, endof(s))[2] > sizeof(s) + @test nextind(s, endof(s)) > sizeof(s) end # Test cmp with AbstractStrings that don't index the same as UTF-8, which would include diff --git a/test/unicode/utf8.jl b/test/unicode/utf8.jl index 0a4665146b697..c2ea8b449b2e4 100644 --- a/test/unicode/utf8.jl +++ b/test/unicode/utf8.jl @@ -5,7 +5,7 @@ let ch = 0x10000 for hi = 0xd800:0xdbff for lo = 0xdc00:0xdfff - @test convert(String, String(Char[hi, lo]).data) == string(Char(ch)) + @test convert(String, Vector{UInt8}(String(Char[hi, lo]))) == string(Char(ch)) ch += 1 end end From ce24657073f44b127bb2281024e7db073ec4eb92 Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Wed, 21 Dec 2016 17:45:47 -0500 Subject: [PATCH 3/5] add the ability to share data between `String`s and `Vector{UInt8}`s --- 
base/c.jl | 2 +- base/filesystem.jl | 4 +-- base/intfuncs.jl | 10 +++---- base/iobuffer.jl | 28 +++++++++++------- base/iostream.jl | 10 +++++-- base/mpfr.jl | 6 ++-- base/strings/basic.jl | 4 +-- base/strings/io.jl | 10 ++++--- base/strings/string.jl | 16 +++++------ base/strings/util.jl | 5 ++-- src/array.c | 65 ++++++++++++++++++++++++++++++++++++------ src/gc.c | 52 ++++++++++++++++++++++++++++++--- src/julia.h | 3 +- src/julia_internal.h | 2 ++ src/sys.c | 13 ++++++++- 15 files changed, 175 insertions(+), 55 deletions(-) diff --git a/base/c.jl b/base/c.jl index eddb21bf8c6b3..b9dd62f8a4d6c 100644 --- a/base/c.jl +++ b/base/c.jl @@ -255,7 +255,7 @@ function transcode(::Type{UInt8}, src::Vector{UInt16}) a = src[i += 1] end - dst = Array{UInt8}(m) + dst = StringVector(m) a = src[1] i, j = 1, 0 while true diff --git a/base/filesystem.jl b/base/filesystem.jl index c43f8bf92836b..a9c2a6d3e006b 100644 --- a/base/filesystem.jl +++ b/base/filesystem.jl @@ -173,9 +173,9 @@ function readbytes!(f::File, b::Array{UInt8}, nb=length(b)) uv_error("read",ret) return ret end -read(io::File) = read!(io, Array{UInt8}(nb_available(io))) +read(io::File) = read!(io, Base.StringVector(nb_available(io))) readavailable(io::File) = read(io) -read(io::File, nb::Integer) = read!(io, Array{UInt8}(min(nb, nb_available(io)))) +read(io::File, nb::Integer) = read!(io, Base.StringVector(min(nb, nb_available(io)))) const SEEK_SET = Int32(0) const SEEK_CUR = Int32(1) diff --git a/base/intfuncs.jl b/base/intfuncs.jl index 309516af6daa1..003c5ff783a20 100644 --- a/base/intfuncs.jl +++ b/base/intfuncs.jl @@ -377,7 +377,7 @@ string(x::Union{Int8,Int16,Int32,Int64,Int128}) = dec(x) function bin(x::Unsigned, pad::Int, neg::Bool) i = neg + max(pad,sizeof(x)<<3-leading_zeros(x)) - a = Array{UInt8}(i) + a = StringVector(i) while i > neg a[i] = '0'+(x&0x1) x >>= 1 @@ -389,7 +389,7 @@ end function oct(x::Unsigned, pad::Int, neg::Bool) i = neg + max(pad,div((sizeof(x)<<3)-leading_zeros(x)+2,3)) - a = 
Array{UInt8}(i) + a = StringVector(i) while i > neg a[i] = '0'+(x&0x7) x >>= 3 @@ -401,7 +401,7 @@ end function dec(x::Unsigned, pad::Int, neg::Bool) i = neg + max(pad,ndigits0z(x)) - a = Array{UInt8}(i) + a = StringVector(i) while i > neg a[i] = '0'+rem(x,10) x = oftype(x,div(x,10)) @@ -413,7 +413,7 @@ end function hex(x::Unsigned, pad::Int, neg::Bool) i = neg + max(pad,(sizeof(x)<<1)-(leading_zeros(x)>>2)) - a = Array{UInt8}(i) + a = StringVector(i) while i > neg d = x & 0xf a[i] = '0'+d+39*(d>9) @@ -433,7 +433,7 @@ function base(b::Int, x::Unsigned, pad::Int, neg::Bool) 2 <= b <= 62 || throw(ArgumentError("base must be 2 ≤ base ≤ 62, got $b")) digits = b <= 36 ? base36digits : base62digits i = neg + max(pad,ndigits0z(x,b)) - a = Array{UInt8}(i) + a = StringVector(i) while i > neg a[i] = digits[1+rem(x,b)] x = div(x,b) diff --git a/base/iobuffer.jl b/base/iobuffer.jl index 9ba49c07706ab..a77c2f82bebf5 100644 --- a/base/iobuffer.jl +++ b/base/iobuffer.jl @@ -22,6 +22,9 @@ typealias IOBuffer AbstractIOBuffer{Vector{UInt8}} AbstractIOBuffer{T<:AbstractVector{UInt8}}(data::T, readable::Bool, writable::Bool, seekable::Bool, append::Bool, maxsize::Int) = AbstractIOBuffer{T}(data, readable, writable, seekable, append, maxsize) +# allocate Vector{UInt8}s for IOBuffer storage that can efficiently become Strings +StringVector(n::Integer) = Vector{UInt8}(_string_n(n)) + # IOBuffers behave like Files. They are typically readable and writable. They are seekable. (They can be appendable). 
""" @@ -34,7 +37,12 @@ last argument optionally specifies a size beyond which the buffer may not be gro """ IOBuffer(data::AbstractVector{UInt8}, readable::Bool=true, writable::Bool=false, maxsize::Int=typemax(Int)) = AbstractIOBuffer(data, readable, writable, true, false, maxsize) -IOBuffer(readable::Bool, writable::Bool) = IOBuffer(UInt8[], readable, writable) +function IOBuffer(readable::Bool, writable::Bool) + b = IOBuffer(StringVector(32), readable, writable) + b.data[:] = 0 + b.size = 0 + return b +end """ IOBuffer() -> IOBuffer @@ -48,7 +56,7 @@ IOBuffer() = IOBuffer(true, true) Create a fixed size IOBuffer. The buffer will not grow dynamically. """ -IOBuffer(maxsize::Int) = (x=IOBuffer(Array{UInt8}(maxsize), true, true, maxsize); x.size=0; x) +IOBuffer(maxsize::Int) = (x=IOBuffer(StringVector(maxsize), true, true, maxsize); x.size=0; x) # PipeBuffers behave like Unix Pipes. They are typically readable and writable, they act appendable, and are not seekable. @@ -63,7 +71,7 @@ optionally specifying a size beyond which the underlying `Array` may not be grow """ PipeBuffer(data::Vector{UInt8}=UInt8[], maxsize::Int=typemax(Int)) = AbstractIOBuffer(data,true,true,false,true,maxsize) -PipeBuffer(maxsize::Int) = (x = PipeBuffer(Array{UInt8}(maxsize),maxsize); x.size=0; x) +PipeBuffer(maxsize::Int) = (x = PipeBuffer(Vector{UInt8}(maxsize),maxsize); x.size=0; x) function copy(b::AbstractIOBuffer) ret = typeof(b)(b.writable ? copy(b.data) : b.data, @@ -263,10 +271,10 @@ function take!(io::AbstractIOBuffer) ismarked(io) && unmark(io) if io.seekable nbytes = io.size - data = copy!(Array{UInt8}(nbytes), 1, io.data, 1, nbytes) + data = copy!(StringVector(nbytes), 1, io.data, 1, nbytes) else nbytes = nb_available(io) - data = read!(io,Array{UInt8}(nbytes)) + data = read!(io,StringVector(nbytes)) end if io.writable io.ptr = 1 @@ -280,14 +288,14 @@ function take!(io::IOBuffer) data = io.data if io.writable maxsize = (io.maxsize == typemax(Int) ? 
0 : min(length(io.data),io.maxsize)) - io.data = Array{UInt8}(maxsize) + io.data = StringVector(maxsize) else data = copy(data) end resize!(data,io.size) else nbytes = nb_available(io) - a = Array{UInt8}(nbytes) + a = StringVector(nbytes) data = read!(io, a) end if io.writable @@ -357,9 +365,9 @@ function readbytes!(io::AbstractIOBuffer, b::Array{UInt8}, nb::Int) read_sub(io, b, 1, nr) return nr end -read(io::AbstractIOBuffer) = read!(io,Array{UInt8}(nb_available(io))) +read(io::AbstractIOBuffer) = read!(io,StringVector(nb_available(io))) readavailable(io::AbstractIOBuffer) = read(io) -read(io::AbstractIOBuffer, nb::Integer) = read!(io,Array{UInt8}(min(nb, nb_available(io)))) +read(io::AbstractIOBuffer, nb::Integer) = read!(io,StringVector(min(nb, nb_available(io)))) function search(buf::IOBuffer, delim::UInt8) p = pointer(buf.data, buf.ptr) @@ -381,7 +389,7 @@ end function readuntil(io::AbstractIOBuffer, delim::UInt8) lb = 70 - A = Array{UInt8}(lb) + A = StringVector(lb) n = 0 data = io.data for i = io.ptr : io.size diff --git a/base/iostream.jl b/base/iostream.jl index 5899db85c5d29..7cbb3787bd9ce 100644 --- a/base/iostream.jl +++ b/base/iostream.jl @@ -222,7 +222,11 @@ take!(s::IOStream) = ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios) function readuntil(s::IOStream, delim::UInt8) - ccall(:jl_readuntil, Array{UInt8,1}, (Ptr{Void}, UInt8), s.ios, delim) + ccall(:jl_readuntil, Array{UInt8,1}, (Ptr{Void}, UInt8, UInt8), s.ios, delim, 0) +end + +function readline(s::IOStream) + ccall(:jl_readuntil, Ref{String}, (Ptr{Void}, UInt8, UInt8), s.ios, '\n', 1) end function readbytes_all!(s::IOStream, b::Array{UInt8}, nb) @@ -278,7 +282,7 @@ function read(s::IOStream) sz -= pos end end - b = Array{UInt8}(sz<=0 ? 1024 : sz) + b = Array{UInt8,1}(sz<=0 ? 1024 : sz) nr = readbytes_all!(s, b, typemax(Int)) resize!(b, nr) end @@ -294,7 +298,7 @@ requested bytes, until an error or end-of-file occurs. If `all` is `false`, at m all stream types support the `all` option. 
""" function read(s::IOStream, nb::Integer; all::Bool=true) - b = Array{UInt8}(nb) + b = Array{UInt8,1}(nb) nr = readbytes!(s, b, nb, all=all) resize!(b, nr) end diff --git a/base/mpfr.jl b/base/mpfr.jl index 289f43a0bebd5..8349bdc8ab8f7 100644 --- a/base/mpfr.jl +++ b/base/mpfr.jl @@ -923,17 +923,17 @@ function string(x::BigFloat) # is, excluding the most significant, ceil(log(10, 2^precision(x))) k = ceil(Int32, precision(x) * 0.3010299956639812) lng = k + Int32(8) # Add space for the sign, the most significand digit, the dot and the exponent - buf = Array{UInt8}(lng + 1) + buf = Base.StringVector(lng + 1) # format strings are guaranteed to contain no NUL, so we don't use Cstring lng = ccall((:mpfr_snprintf,:libmpfr), Int32, (Ptr{UInt8}, Culong, Ptr{UInt8}, Ptr{BigFloat}...), buf, lng + 1, "%.Re", &x) if lng < k + 5 # print at least k decimal places lng = ccall((:mpfr_sprintf,:libmpfr), Int32, (Ptr{UInt8}, Ptr{UInt8}, Ptr{BigFloat}...), buf, "%.$(k)Re", &x) elseif lng > k + 8 - buf = Array{UInt8}(lng + 1) + buf = Base.StringVector(lng + 1) lng = ccall((:mpfr_snprintf,:libmpfr), Int32, (Ptr{UInt8}, Culong, Ptr{UInt8}, Ptr{BigFloat}...), buf, lng + 1, "%.Re", &x) end n = (1 <= x < 10 || -10 < x <= -1 || x == 0) ? 
lng - 4 : lng - return String(buf[1:n]) + return String(resize!(buf,n)) end print(io::IO, b::BigFloat) = print(io, string(b)) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index bc1322e84ad15..d5da3559f7f65 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -377,7 +377,7 @@ end ## string map, filter, has ## function map(f, s::AbstractString) - out = IOBuffer(Array{UInt8}(endof(s)),true,true) + out = IOBuffer(StringVector(endof(s)),true,true) truncate(out,0) for c in s c2 = f(c) @@ -390,7 +390,7 @@ function map(f, s::AbstractString) end function filter(f, s::AbstractString) - out = IOBuffer(Array{UInt8}(endof(s)),true,true) + out = IOBuffer(StringVector(endof(s)),true,true) truncate(out,0) for c in s if f(c) diff --git a/base/strings/io.jl b/base/strings/io.jl index 60a5b639dbe02..a1fb8faf2d223 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -45,8 +45,10 @@ println(io::IO, xs...) = print(io, xs..., '\n') ## conversion of general objects to strings ## function sprint(size::Integer, f::Function, args...; env=nothing) - s = IOBuffer(Array{UInt8}(size), true, true) - truncate(s,0) + s = IOBuffer(StringVector(size), true, true) + # specialized version of truncate(s,0) + s.size = 0 + s.ptr = 1 if env !== nothing f(IOContext(s, env), args...) 
else @@ -75,7 +77,7 @@ tostr_sizehint(x::Float32) = 12 function print_to_string(xs...; env=nothing) # specialized for performance reasons - s = IOBuffer(Array{UInt8}(tostr_sizehint(xs[1])), true, true) + s = IOBuffer(StringVector(tostr_sizehint(xs[1])), true, true) # specialized version of truncate(s,0) s.size = 0 s.ptr = 1 @@ -345,7 +347,7 @@ function unindent(str::AbstractString, indent::Int; tabwidth=8) pos = start(str) endpos = endof(str) # Note: this loses the type of the original string - buf = IOBuffer(Array{UInt8}(endpos), true, true) + buf = IOBuffer(StringVector(endpos), true, true) truncate(buf,0) cutting = true col = 0 # current column (0 based) diff --git a/base/strings/string.jl b/base/strings/string.jl index 30c98f3c9a045..73e758eed59e7 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -18,8 +18,7 @@ as long as the string exists. If you need to subsequently modify `v`, use `String(copy(v))` instead. """ function String(v::Array{UInt8,1}) - # TODO share data - unsafe_string(pointer(v), length(v)) + ccall(:jl_array_to_string, Ref{String}, (Any,), v) end """ @@ -44,8 +43,9 @@ function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}) ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p) end -# TODO share data -convert(::Type{Vector{UInt8}}, s::String) = UInt8[ unsafe_codeunit(s,i) for i=1:s.len ] +_string_n(n::Integer) = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n) + +convert(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s) convert(::Type{String}, s::String) = s convert(::Type{String}, v::Vector{UInt8}) = String(v) @@ -326,7 +326,7 @@ function string(a::String...) for str in a n += str.len end - out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n) + out = _string_n(n) offs = 1 for str in a unsafe_copy!(pointer(out,offs), pointer(str), str.len) @@ -359,7 +359,7 @@ function string(a::Union{String,Char}...) 
n += (d::String).len end end - out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n) + out = _string_n(n) offs = 1 p = pointer(out) for d in a @@ -398,7 +398,7 @@ function reverse(s::String) dat = convert(Vector{UInt8},s) n = length(dat) n <= 1 && return s - buf = Vector{UInt8}(n) + buf = StringVector(n) out = n pos = 1 @inbounds while out > 0 @@ -429,7 +429,7 @@ end function repeat(s::String, r::Integer) r < 0 && throw(ArgumentError("can't repeat a string $r times")) n = s.len - out = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n*r) + out = _string_n(n*r) for i=1:r unsafe_copy!(pointer(out, 1+(i-1)*n), pointer(s), n) end diff --git a/base/strings/util.jl b/base/strings/util.jl index 6f3dc0efaf113..770b242615804 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -338,8 +338,9 @@ function replace(str::String, pattern, repl, limit::Integer) i = a = start(str) r = search(str,pattern,i) j, k = first(r), last(r) - out = IOBuffer() - ensureroom(out, floor(Int, 1.2sizeof(str))) + out = IOBuffer(StringVector(floor(Int, 1.2sizeof(str))), true, true) + out.size = 0 + out.ptr = 1 while j != 0 if i == a || i <= k unsafe_write(out, pointer(str, i), UInt(j-i)) diff --git a/src/array.c b/src/array.c index 913e94b52bbe9..37e8406e231ab 100644 --- a/src/array.c +++ b/src/array.c @@ -37,7 +37,7 @@ STATIC_INLINE jl_value_t *jl_array_owner(jl_array_t *a) { if (a->flags.how == 3) { a = (jl_array_t*)jl_array_data_owner(a); - assert(a->flags.how != 3); + assert(jl_is_string(a) || a->flags.how != 3); } return (jl_value_t*)a; } @@ -96,9 +96,8 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, // No allocation or safepoint allowed after this a->flags.how = 0; data = (char*)a + doffs; - if (tot > 0 && !isunboxed) { + if (tot > 0 && !isunboxed) memset(data, 0, tot); - } } else { tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object @@ -235,6 +234,32 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, 
jl_array_t *data, return a; } +JL_DLLEXPORT jl_array_t *jl_string_to_array(jl_value_t *str) +{ + jl_ptls_t ptls = jl_get_ptls_states(); + jl_array_t *a; + + int ndimwords = jl_array_ndimwords(1); + int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT); + a = (jl_array_t*)jl_gc_alloc(ptls, tsz, jl_array_uint8_type); + a->flags.pooled = tsz <= GC_MAX_SZCLASS; + a->flags.ndims = 1; + a->offset = 0; + a->data = jl_string_data(str); + a->flags.isaligned = 0; + a->elsize = 1; + a->flags.ptrarray = 0; + jl_array_data_owner(a) = str; + a->flags.how = 3; + a->flags.isshared = 1; + size_t l = jl_string_len(str); +#ifdef STORE_ARRAY_LEN + a->length = l; +#endif + a->nrows = a->maxsize = l; + return a; +} + // own_buffer != 0 iff GC should call free() on this pointer eventually JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data, size_t nel, int own_buffer) @@ -369,7 +394,17 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len) JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a) { - return jl_pchar_to_string(jl_array_data(a), jl_array_len(a)); + if (a->flags.how == 3 && a->offset == 0 && a->elsize == 1 && + (jl_array_ndims(a) != 1 || + !(a->maxsize+sizeof(void*)+1 > GC_MAX_SZCLASS && jl_array_nrows(a)+sizeof(void*)+1 <= GC_MAX_SZCLASS))) { + jl_value_t *o = jl_array_data_owner(a); + if (jl_is_string(o)) { + a->flags.isshared = 1; + *(size_t*)o = jl_array_len(a); + return o; + } + } + return jl_pchar_to_string((const char*)jl_array_data(a), jl_array_len(a)); } JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len) @@ -595,6 +630,20 @@ static int NOINLINE array_resize_buffer(jl_array_t *a, size_t newlen) a->data = jl_gc_managed_realloc(olddata, nbytes, oldnbytes, a->flags.isaligned, (jl_value_t*)a); } + else if (a->flags.how == 3 && jl_is_string(jl_array_data_owner(a))) { + // if data is in a String, keep it that way + jl_value_t *s; + if 
(a->flags.isshared) { + s = jl_alloc_string(nbytes); + newbuf = 1; + } + else { + s = jl_gc_realloc_string(jl_array_data_owner(a), nbytes); + } + jl_array_data_owner(a) = s; + jl_gc_wb(a, s); + a->data = jl_string_data(s); + } else { newbuf = 1; if ( @@ -630,6 +679,9 @@ static void NOINLINE array_try_unshare(jl_array_t *a) if (a->flags.isshared) { if (a->flags.how != 3) jl_error("cannot resize array with shared data"); + // allow resizing when data is shared with a String + if (jl_is_string(jl_array_data_owner(a))) + return; assert(a->offset == 0); size_t len = jl_array_nrows(a); size_t es = a->elsize; @@ -674,7 +726,6 @@ STATIC_INLINE void jl_array_grow_at_beg(jl_array_t *a, size_t idx, size_t inc, char *data = (char*)a->data; char *newdata; if (a->offset >= inc) { - assert(!a->flags.isshared); newdata = data - nbinc; a->offset -= inc; if (idx > 0) { @@ -703,7 +754,6 @@ STATIC_INLINE void jl_array_grow_at_beg(jl_array_t *a, size_t idx, size_t inc, a->offset = newoffset; } else { - assert(!a->flags.isshared); a->offset = (a->maxsize - newnrows) / 2; newdata = data - oldoffsnb + a->offset * elsz; // We could use memcpy if resizing allocates a new buffer, @@ -822,7 +872,6 @@ STATIC_INLINE void jl_array_del_at_beg(jl_array_t *a, size_t idx, size_t dec, { // no error checking // assume inbounds, assume unshared - assert(!a->flags.isshared); size_t elsz = a->elsize; size_t offset = a->offset; offset += dec; @@ -857,7 +906,6 @@ STATIC_INLINE void jl_array_del_at_end(jl_array_t *a, size_t idx, size_t dec, { // no error checking // assume inbounds, assume unshared - assert(!a->flags.isshared); char *data = (char*)a->data; size_t elsz = a->elsize; size_t last = idx + dec; @@ -1041,6 +1089,7 @@ STATIC_INLINE int jl_has_implicit_byte(jl_array_t *a) // We should check the owner. 
if (a->flags.how == 3) { a = (jl_array_t*)jl_array_data_owner(a); + if (jl_is_string(a)) return 1; return a->elsize == 1 && jl_has_implicit_byte_owned(a); } return jl_has_implicit_byte_owned(a); diff --git a/src/gc.c b/src/gc.c index 753a1442c66bc..3f1f80c3b4f06 100644 --- a/src/gc.c +++ b/src/gc.c @@ -2084,11 +2084,11 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) return b; } -JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, - int isaligned, jl_value_t *owner) +static void *gc_managed_realloc_(void *d, size_t sz, size_t oldsz, + int isaligned, jl_value_t *owner, int8_t can_collect) { - jl_ptls_t ptls = jl_get_ptls_states(); - maybe_collect(ptls); + if (can_collect) + maybe_collect(jl_get_ptls_states()); size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT); if (allocsz < sz) // overflow in adding offs, size was "negative" @@ -2115,6 +2115,50 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, return b; } +JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, + int isaligned, jl_value_t *owner) +{ + return gc_managed_realloc_(d, sz, oldsz, isaligned, owner, 1); +} + +jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz) +{ + size_t len = jl_string_len(s); + if (sz <= len) return s; + jl_taggedvalue_t *v = jl_astaggedvalue(s); + size_t strsz = len + sizeof(size_t) + 1; + if (strsz <= GC_MAX_SZCLASS || + // TODO: because of issue #17971 we can't resize old objects + gc_marked(v->bits.gc)) { + // pool allocated; can't be grown in place so allocate a new object. 
+ jl_value_t *snew = jl_alloc_string(sz); + memcpy(jl_string_data(snew), jl_string_data(s), len); + return snew; + } + size_t newsz = sz + sizeof(size_t) + 1; + size_t offs = offsetof(bigval_t, header); + size_t allocsz = LLT_ALIGN(newsz + offs, JL_CACHE_BYTE_ALIGNMENT); + if (allocsz < sz) // overflow in adding offs, size was "negative" + jl_throw(jl_memory_exception); + bigval_t *hdr = bigval_header(v); + jl_ptls_t ptls = jl_get_ptls_states(); + maybe_collect(ptls); // don't want this to happen during jl_gc_managed_realloc + gc_big_object_unlink(hdr); + // TODO: this is not safe since it frees the old pointer. ideally we'd like + // the old pointer to be left alone if we can't grow in place. + // for now it's up to the caller to make sure there are no references to the + // old pointer. + bigval_t *newbig = + (bigval_t*)gc_managed_realloc_(hdr, allocsz, LLT_ALIGN(strsz+offs, JL_CACHE_BYTE_ALIGNMENT), + 1, s, 0); + newbig->sz = allocsz; + newbig->age = 0; + gc_big_object_link(newbig, &ptls->heap.big_objects); + jl_value_t *snew = jl_valueof(&newbig->header); + *(size_t*)snew = sz; + return snew; +} + // Perm gen allocator // 2M pool #define GC_PERM_POOL_SIZE (2 * 1024 * 1024) diff --git a/src/julia.h b/src/julia.h index 5003e38761b88..2ca168fa11bb0 100644 --- a/src/julia.h +++ b/src/julia.h @@ -130,7 +130,7 @@ typedef struct { 0 = data is inlined, or a foreign pointer we don't manage 1 = julia-allocated buffer that needs to be marked 2 = malloc-allocated pointer this array object manages - 3 = has a pointer to the Array that owns the data + 3 = has a pointer to the object that owns the data */ uint16_t how:2; uint16_t ndims:10; @@ -1601,7 +1601,6 @@ JL_DLLEXPORT int jl_tcp_bind(uv_tcp_t *handle, uint16_t port, uint32_t host, JL_DLLEXPORT int jl_sizeof_ios_t(void); JL_DLLEXPORT jl_array_t *jl_take_buffer(ios_t *s); -JL_DLLEXPORT jl_value_t *jl_readuntil(ios_t *s, uint8_t delim); typedef struct { void *data; diff --git a/src/julia_internal.h b/src/julia_internal.h 
index dca90eff5a7e5..38c94c1b9974c 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -192,6 +192,8 @@ STATIC_INLINE void *jl_gc_alloc_buf(jl_ptls_t ptls, size_t sz) return jl_gc_alloc(ptls, sz, (void*)jl_buff_tag); } +jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz); + jl_code_info_t *jl_type_infer(jl_method_instance_t **li, size_t world, int force); jl_generic_fptr_t jl_generate_fptr(jl_method_instance_t *li, void *F, size_t world); jl_llvm_functions_t jl_compile_linfo(jl_method_instance_t **pli, jl_code_info_t *src, size_t world, const jl_cgparams_t *params); diff --git a/src/sys.c b/src/sys.c index f130ce20c110c..c4d7efaafdf51 100644 --- a/src/sys.c +++ b/src/sys.c @@ -250,13 +250,18 @@ JL_DLLEXPORT jl_array_t *jl_take_buffer(ios_t *s) return a; } -JL_DLLEXPORT jl_value_t *jl_readuntil(ios_t *s, uint8_t delim) +JL_DLLEXPORT jl_value_t *jl_readuntil(ios_t *s, uint8_t delim, uint8_t str) { jl_array_t *a; // manually inlined common case char *pd = (char*)memchr(s->buf+s->bpos, delim, (size_t)(s->size - s->bpos)); if (pd) { size_t n = pd-(s->buf+s->bpos)+1; + if (str) { + jl_value_t *str = jl_pchar_to_string(s->buf + s->bpos, n); + s->bpos += n; + return str; + } a = jl_alloc_array_1d(jl_array_uint8_type, n); memcpy(jl_array_data(a), s->buf + s->bpos, n); s->bpos += n; @@ -277,6 +282,12 @@ JL_DLLEXPORT jl_value_t *jl_readuntil(ios_t *s, uint8_t delim) a->nrows = n; ((char*)a->data)[n] = '\0'; } + if (str) { + JL_GC_PUSH1(&a); + jl_value_t *st = jl_array_to_string(a); + JL_GC_POP(); + return st; + } } return (jl_value_t*)a; } From c01a2cfa1d60e76e0ad0a8ef890a9e924345e2b8 Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Sun, 25 Dec 2016 17:13:19 -0500 Subject: [PATCH 4/5] deprecate `unsafe_wrap` for `String` --- base/c.jl | 4 ---- base/deprecated.jl | 13 +++++++++++++ base/fft/FFTW.jl | 5 ++++- base/gmp.jl | 6 ++++-- base/path.jl | 4 +++- base/pointer.jl | 26 -------------------------- base/strings/string.jl | 3 --- base/strings/utf8proc.jl | 
21 +++++++++++++-------- doc/src/stdlib/strings.md | 1 - test/strings/basic.jl | 2 +- 10 files changed, 38 insertions(+), 47 deletions(-) diff --git a/base/c.jl b/base/c.jl index b9dd62f8a4d6c..3d7eab2954469 100644 --- a/base/c.jl +++ b/base/c.jl @@ -73,10 +73,6 @@ pointer(p::Cwstring) = convert(Ptr{Cwchar_t}, p) ==(x::Union{Cstring,Cwstring}, y::Ptr) = pointer(x) == y ==(x::Ptr, y::Union{Cstring,Cwstring}) = x == pointer(y) -# here, not in pointer.jl, to avoid bootstrapping problems in coreimg.jl -unsafe_wrap(::Type{String}, p::Cstring, own::Bool=false) = unsafe_wrap(String, convert(Ptr{UInt8}, p), own) -unsafe_wrap(::Type{String}, p::Cstring, len::Integer, own::Bool=false) = - unsafe_wrap(String, convert(Ptr{UInt8}, p), len, own) unsafe_string(s::Cstring) = unsafe_string(convert(Ptr{UInt8}, s)) # convert strings to String etc. to pass as pointers diff --git a/base/deprecated.jl b/base/deprecated.jl index 6fcedec8453e1..a265fef82d2d8 100644 --- a/base/deprecated.jl +++ b/base/deprecated.jl @@ -1499,4 +1499,17 @@ end # Calling promote_op is likely a bad idea, so deprecate its convenience wrapper promote_eltype_op @deprecate promote_eltype_op(op, As...) promote_op(op, map(eltype, As)...) +function unsafe_wrap(::Type{String}, p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer, own::Bool=false) + Base.depwarn("unsafe_wrap(String, ...) 
is deprecated; use `unsafe_string` instead.", :unsafe_wrap) + #ccall(:jl_array_to_string, Ref{String}, (Any,), + # ccall(:jl_ptr_to_array_1d, Vector{UInt8}, (Any, Ptr{UInt8}, Csize_t, Cint), + # Vector{UInt8}, p, len, own)) + unsafe_string(p, len) +end +unsafe_wrap(::Type{String}, p::Union{Ptr{UInt8},Ptr{Int8}}, own::Bool=false) = + unsafe_wrap(String, p, ccall(:strlen, Csize_t, (Ptr{UInt8},), p), own) +unsafe_wrap(::Type{String}, p::Cstring, own::Bool=false) = unsafe_wrap(String, convert(Ptr{UInt8}, p), own) +unsafe_wrap(::Type{String}, p::Cstring, len::Integer, own::Bool=false) = + unsafe_wrap(String, convert(Ptr{UInt8}, p), len, own) + # End deprecations scheduled for 0.6 diff --git a/base/fft/FFTW.jl b/base/fft/FFTW.jl index ea885ce670c2e..465646773d7af 100644 --- a/base/fft/FFTW.jl +++ b/base/fft/FFTW.jl @@ -282,7 +282,10 @@ sprint_plan_{T<:fftwDouble}(plan::FFTWPlan{T}) = sprint_plan_{T<:fftwSingle}(plan::FFTWPlan{T}) = ccall((:fftwf_sprint_plan,libfftwf), Ptr{UInt8}, (PlanPtr,), plan) function sprint_plan(plan::FFTWPlan) - unsafe_wrap(String, sprint_plan_(plan), true) + p = sprint_plan_(plan) + str = unsafe_string(p) + Libc.free(p) + return str end function show{T,K,inplace}(io::IO, p::cFFTWPlan{T,K,inplace}) diff --git a/base/gmp.jl b/base/gmp.jl index d720be1666de0..3cdf65b6fcdec 100644 --- a/base/gmp.jl +++ b/base/gmp.jl @@ -532,8 +532,10 @@ hex(n::BigInt, pad::Int) = base(16, n, pad) function base(b::Integer, n::BigInt) 2 <= b <= 62 || throw(ArgumentError("base must be 2 ≤ base ≤ 62, got $b")) - p = ccall((:__gmpz_get_str,:libgmp), Ptr{UInt8}, (Ptr{UInt8}, Cint, Ptr{BigInt}), C_NULL, b, &n) - unsafe_wrap(String, p, true) + nd = ndigits(n, b) + str = Base._string_n(n < 0 ? 
nd+1 : nd) + ccall((:__gmpz_get_str,:libgmp), Ptr{UInt8}, (Ptr{UInt8}, Cint, Ptr{BigInt}), str, b, &n) + return str end function base(b::Integer, n::BigInt, pad::Integer) diff --git a/base/path.jl b/base/path.jl index 097b73046ef3e..9b854bdbb4b5e 100644 --- a/base/path.jl +++ b/base/path.jl @@ -288,7 +288,9 @@ else # !windows function realpath(path::AbstractString) p = ccall(:realpath, Ptr{UInt8}, (Cstring, Ptr{UInt8}), path, C_NULL) systemerror(:realpath, p == C_NULL) - return unsafe_wrap(String, p, true) + str = unsafe_string(p) + Libc.free(p) + return str end end # os-test diff --git a/base/pointer.jl b/base/pointer.jl index 8dcbed8a03c46..688e69ebe8ba5 100644 --- a/base/pointer.jl +++ b/base/pointer.jl @@ -93,32 +93,6 @@ program, in the same manner as C. unsafe_store!(p::Ptr{Any}, x::ANY, i::Integer=1) = pointerset(p, x, Int(i), 1) unsafe_store!{T}(p::Ptr{T}, x, i::Integer=1) = pointerset(p, convert(T,x), Int(i), 1) -# unsafe pointer to string conversions (don't make a copy, unlike unsafe_string) -# (Cstring versions are in c.jl) -""" - unsafe_wrap(String, p::Ptr{UInt8}, [length,] own=false) - -Wrap a pointer `p` to an array of bytes in a `String` object, -interpreting the bytes as UTF-8 encoded characters *without making a -copy*. The optional `length` argument indicates the length in bytes of -the pointer's data; if it is omitted, the data is assumed to be -NUL-terminated. The `own` argument optionally specifies whether Julia -should take ownership of the memory, calling `free` on the pointer -when the array is no longer referenced. - -This function is labelled "unsafe" because it will crash if `p` is not -a valid memory address to data of the requested length. - -See also [`unsafe_string`](@ref), which takes a pointer -and makes a copy of the data. 
-""" -unsafe_wrap(::Type{String}, p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer, own::Bool=false) = - ccall(:jl_array_to_string, Ref{String}, (Any,), - ccall(:jl_ptr_to_array_1d, Vector{UInt8}, (Any, Ptr{UInt8}, Csize_t, Cint), - Vector{UInt8}, p, len, own)) -unsafe_wrap(::Type{String}, p::Union{Ptr{UInt8},Ptr{Int8}}, own::Bool=false) = - unsafe_wrap(String, p, ccall(:strlen, Csize_t, (Ptr{UInt8},), p), own) - # convert a raw Ptr to an object reference, and vice-versa """ unsafe_pointer_to_objref(p::Ptr) diff --git a/base/strings/string.jl b/base/strings/string.jl index 73e758eed59e7..1cc67be1c2d6b 100644 --- a/base/strings/string.jl +++ b/base/strings/string.jl @@ -30,9 +30,6 @@ Copy a string from the address of a C-style (NUL-terminated) string encoded as U This function is labelled "unsafe" because it will crash if `p` is not a valid memory address to data of the requested length. - -See also [`unsafe_wrap(String, p, [length])`](@ref), which takes a pointer -and wraps a string object around it without making a copy. 
""" function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer) p == C_NULL && throw(ArgumentError("cannot convert NULL to string")) diff --git a/base/strings/utf8proc.jl b/base/strings/utf8proc.jl index 00ae091051e3d..1ad2e37bf4cf2 100644 --- a/base/strings/utf8proc.jl +++ b/base/strings/utf8proc.jl @@ -102,14 +102,19 @@ const UTF8PROC_STRIPMARK = (1<<13) ############################################################################ -function utf8proc_map(s::String, flags::Integer) - p = Ref{Ptr{UInt8}}() - result = ccall(:utf8proc_map, Cssize_t, - (Ptr{UInt8}, Cssize_t, Ref{Ptr{UInt8}}, Cint), - s, sizeof(s), p, flags) - result < 0 && error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, - (Cssize_t,), result))) - unsafe_wrap(String, p[], result, true)::String +utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result))) + +function utf8proc_map(str::String, options::Integer) + nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint), + str, sizeof(str), C_NULL, 0, options) + nwords < 0 && utf8proc_error(nwords) + buffer = Base.StringVector(nwords*4) + nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint), + str, sizeof(str), buffer, nwords, options) + nwords < 0 && utf8proc_error(nwords) + nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options) + nbytes < 0 && utf8proc_error(nbytes) + return String(resize!(buffer, nbytes)) end utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags) diff --git a/doc/src/stdlib/strings.md b/doc/src/stdlib/strings.md index a9895595484c7..be655da87e526 100644 --- a/doc/src/stdlib/strings.md +++ b/doc/src/stdlib/strings.md @@ -10,7 +10,6 @@ Base.repr Core.String(::AbstractString) Base.transcode Base.unsafe_string -Base.unsafe_wrap(::Type{String}, ::Union{Ptr{Int8}, Ptr{UInt8}}, ::Integer, ::Bool) Base.codeunit(::AbstractString, ::Integer) Base.ascii Base.@r_str diff --git 
a/test/strings/basic.jl b/test/strings/basic.jl index 41ac521e4fe89..9048ba7e2036a 100644 --- a/test/strings/basic.jl +++ b/test/strings/basic.jl @@ -245,7 +245,7 @@ end cstrdup(s) = @static is_windows() ? ccall(:_strdup, Cstring, (Cstring,), s) : ccall(:strdup, Cstring, (Cstring,), s) let p = cstrdup("hello") - @test unsafe_string(p) == "hello" == unsafe_wrap(String, cstrdup(p), true) + @test unsafe_string(p) == "hello" Libc.free(p) end From 8c687da8e3e391cd2174f80f625ce171245c7930 Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Sat, 7 Jan 2017 18:05:23 -0500 Subject: [PATCH 5/5] (de)serialize TypeMapEntry lists iteratively to save stack space --- src/dump.c | 111 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 78 insertions(+), 33 deletions(-) diff --git a/src/dump.c b/src/dump.c index ae3740fd18e42..79fee8bc7c429 100644 --- a/src/dump.c +++ b/src/dump.c @@ -1002,6 +1002,24 @@ static void jl_serialize_value_(jl_serializer_state *s, jl_value_t *v) write_int32(s->s, jl_string_len(v)); ios_write(s->s, jl_string_data(v), jl_string_len(v)); } + else if (jl_typeis(v, jl_typemap_entry_type)) { + writetag(s->s, jl_typemap_entry_type); + size_t n = 0; + jl_typemap_entry_t *te = (jl_typemap_entry_t*)v; + while ((jl_value_t*)te != jl_nothing) { + n++; te = te->next; + } + write_int32(s->s, n); + te = (jl_typemap_entry_t*)v; + size_t i, nf = jl_datatype_nfields(jl_typemap_entry_type); + while ((jl_value_t*)te != jl_nothing) { + for (i = 1; i < nf; i++) { + if (jl_field_size(jl_typemap_entry_type, i) > 0) + jl_serialize_value(s, jl_get_nth_field((jl_value_t*)te, i)); + } + te = te->next; + } + } else { jl_datatype_t *t = (jl_datatype_t*)jl_typeof(v); void *data = jl_data_ptr(v); @@ -1821,6 +1839,59 @@ static jl_value_t *jl_deserialize_value_singleton(jl_serializer_state *s, jl_val return v; } +static void jl_deserialize_struct(jl_serializer_state *s, jl_value_t *v, size_t startfield) +{ + jl_datatype_t *dt = (jl_datatype_t*)jl_typeof(v); + size_t i, 
nf = jl_datatype_nfields(dt); + char *data = (char*)jl_data_ptr(v); + for (i = startfield; i < nf; i++) { + if (jl_field_size(dt, i) > 0) { + if (jl_field_isptr(dt, i)) { + jl_value_t **fld = (jl_value_t**)(data+jl_field_offset(dt, i)); + *fld = jl_deserialize_value(s, fld); + } + else { + jl_set_nth_field(v, i, jl_deserialize_value(s, NULL)); + } + } + } + if (s->mode == MODE_MODULE) { + if (dt == jl_typename_type) { + jl_typename_t *tn = (jl_typename_t*)v; + tn->cache = jl_emptysvec; // the cache is refilled later (tag 5) + tn->linearcache = jl_emptysvec; // the cache is refilled later (tag 5) + } + if (dt == jl_typemap_entry_type) { + if (((jl_typemap_entry_t*)v)->max_world == ~(size_t)0) { + // update world validity to reflect current state of the counter + ((jl_typemap_entry_t*)v)->min_world = jl_world_counter; + } + else { + // garbage entry - delete it :( + ((jl_typemap_entry_t*)v)->min_world = ((jl_typemap_entry_t*)v)->max_world - 1; + } + } + } +} + +static jl_value_t *jl_deserialize_typemap_entry(jl_serializer_state *s) +{ + int N = read_int32(s->s); int n = N; + jl_value_t *te = jl_nothing; + jl_value_t **pn = &te; + while (n > 0) { + jl_value_t *v = jl_gc_alloc(s->ptls, jl_datatype_size(jl_typemap_entry_type), jl_typemap_entry_type); + if (n == N && s->mode != MODE_AST) + arraylist_push(&backref_list, v); + jl_deserialize_struct(s, v, 1); + ((jl_typemap_entry_t*)v)->next = (jl_typemap_entry_t*)jl_nothing; + *pn = v; + pn = (jl_value_t**)&((jl_typemap_entry_t*)v)->next; + n--; + } + return te; +} + static jl_value_t *jl_deserialize_value_any(jl_serializer_state *s, jl_value_t *vtag, jl_value_t **loc) { int usetable = (s->mode != MODE_AST); @@ -1849,41 +1920,12 @@ static jl_value_t *jl_deserialize_value_any(jl_serializer_state *s, jl_value_t * } } jl_set_typeof(v, dt); - size_t i, nf = jl_datatype_nfields(dt); - if (nf == 0 && jl_datatype_size(dt)>0) { + if (jl_datatype_nfields(dt) == 0 && jl_datatype_size(dt)>0) { int nby = jl_datatype_size(dt); 
ios_read(s->s, (char*)jl_data_ptr(v), nby); } else { - char *data = (char*)jl_data_ptr(v); - for (i = 0; i < nf; i++) { - if (jl_field_size(dt, i) > 0) { - if (jl_field_isptr(dt, i)) { - jl_value_t **fld = (jl_value_t**)(data+jl_field_offset(dt, i)); - *fld = jl_deserialize_value(s, fld); - } - else { - jl_set_nth_field(v, i, jl_deserialize_value(s, NULL)); - } - } - } - if (s->mode == MODE_MODULE) { - if (dt == jl_typename_type) { - jl_typename_t *tn = (jl_typename_t*)v; - tn->cache = jl_emptysvec; // the cache is refilled later (tag 5) - tn->linearcache = jl_emptysvec; // the cache is refilled later (tag 5) - } - if (dt == jl_typemap_entry_type) { - if (((jl_typemap_entry_t*)v)->max_world == ~(size_t)0) { - // update world validity to reflect current state of the counter - ((jl_typemap_entry_t*)v)->min_world = jl_world_counter; - } - else { - // garbage entry - delete it :( - ((jl_typemap_entry_t*)v)->min_world = ((jl_typemap_entry_t*)v)->max_world - 1; - } - } - } + jl_deserialize_struct(s, v, 0); } return v; } @@ -1976,6 +2018,9 @@ static jl_value_t *jl_deserialize_value_(jl_serializer_state *s, jl_value_t *vta ios_read(s->s, jl_string_data(str), n); return str; } + else if (vtag == (jl_value_t*)jl_typemap_entry_type) { + return jl_deserialize_typemap_entry(s); + } else { assert(vtag == (jl_value_t*)jl_datatype_type || vtag == (jl_value_t*)SmallDataType_tag); return jl_deserialize_value_any(s, vtag, loc); @@ -2952,7 +2997,7 @@ void jl_init_serializer(void) jl_simplevector_type, jl_array_type, jl_typedslot_type, jl_expr_type, (void*)LongSymbol_tag, (void*)LongSvec_tag, (void*)LongExpr_tag, (void*)LiteralVal_tag, jl_string_type, - (void*)SmallInt64_tag, (void*)SmallDataType_tag, + (void*)SmallInt64_tag, (void*)SmallDataType_tag, jl_typemap_entry_type, (void*)Int32_tag, (void*)Array1d_tag, (void*)Singleton_tag, jl_module_type, jl_tvar_type, jl_method_instance_type, jl_method_type, (void*)CommonSym_tag, (void*)NearbyGlobal_tag, jl_globalref_type, @@ -3005,7 +3050,7 
@@ void jl_init_serializer(void) jl_typector_type, jl_typename_type, jl_builtin_type, jl_code_info_type, jl_task_type, jl_uniontype_type, jl_typetype_type, jl_typetype_tvar, jl_ANY_flag, jl_array_any_type, jl_intrinsic_type, jl_abstractslot_type, - jl_methtable_type, jl_typemap_level_type, jl_typemap_entry_type, + jl_methtable_type, jl_typemap_level_type, jl_voidpointer_type, jl_newvarnode_type, jl_array_symbol_type, jl_anytuple_type, jl_tparam0(jl_anytuple_type), jl_typeof(jl_emptytuple), jl_array_uint8_type,