Skip to content

Commit

Permalink
Merge pull request #759 from alyst/optimize_RDAIO
Browse files Browse the repository at this point in the history
Optimize RDAIO
  • Loading branch information
garborg committed Jan 6, 2015
2 parents 0f21922 + e6fc839 commit 99be77f
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 34 deletions.
92 changes: 60 additions & 32 deletions src/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,36 +132,46 @@ sxtype = uint8
##
##############################################################################

LONG_VECTOR_SUPPORT = (WORD_SIZE > 32) # disable long vectors support on 32-bit machines

if LONG_VECTOR_SUPPORT
typealias RVecLength Int64
else
typealias RVecLength Int
end

# abstract RDA format IO stream wrapper
abstract RDAIO
abstract RDAIO

type RDAXDRIO <: RDAIO # RDA XDR(binary) format IO stream wrapper
sub::IO # underlying IO stream
type RDAXDRIO{T<:IO} <: RDAIO # RDA XDR(binary) format IO stream wrapper
sub::T # underlying IO stream
buf::Vector{Uint8} # buffer for strings

RDAXDRIO( io::IO ) = new( io, Array(Uint8, 1024) )
RDAXDRIO( io::T ) = new( io, Array(Uint8, 1024) )
end
RDAXDRIO{T <: IO}(io::T) = RDAXDRIO{T}(io)

readint32(io::RDAXDRIO) = hton(read(io.sub, Int32))
readuint32(io::RDAXDRIO) = hton(read(io.sub, Uint32))
readfloat64(io::RDAIO) = hton(read(io.sub, Float64))
readint32(io::RDAXDRIO) = ntoh(read(io.sub, Int32))
readuint32(io::RDAXDRIO) = ntoh(read(io.sub, Uint32))
readfloat64(io::RDAXDRIO) = ntoh(read(io.sub, Float64))

readintorNA(io::RDAXDRIO) = readint32(io)
readintorNA(io::RDAXDRIO, n::Int64) = map!(hton, read(io.sub, Int32, n))
readintorNA(io::RDAXDRIO, n::RVecLength) = map!(ntoh, read(io.sub, Int32, n))

readfloatorNA(io::RDAXDRIO) = readfloat64(io)
readfloatorNA(io::RDAXDRIO, n::Int64) = map!(hton, read(io.sub, Float64, n))
readfloatorNA(io::RDAXDRIO, n::RVecLength) = map!(ntoh, read(io.sub, Float64, n))

function readnchars(io::RDAXDRIO, n::Int32) # a single character string
readbytes!(io.sub, io.buf, n)
bytestring(pointer(io.buf), n)
bytestring(pointer(io.buf), n)::ASCIIString
end

type RDAASCIIIO <: RDAIO # RDA ASCII format IO stream wrapper
sub::IO # underlying IO stream
type RDAASCIIIO{T<:IO} <: RDAIO # RDA ASCII format IO stream wrapper
sub::T # underlying IO stream

RDAASCIIIO( io::IO ) = new( io )
RDAASCIIIO( io::T ) = new( io )
end
RDAASCIIIO{T <: IO}(io::T) = RDAASCIIIO{T}(io)

readint32(io::RDAASCIIIO) = int32(readline(io.sub))
readuint32(io::RDAASCIIIO) = uint32(readline(io.sub))
Expand All @@ -171,25 +181,27 @@ function readintorNA(io::RDAASCIIIO)
str = chomp(readline(io.sub));
str == R_NA_STRING ? R_NA_INT32 : int32(str)
end
readintorNA(io::RDAASCIIIO, n::Int64) = Int32[readintorNA(io) for i in 1:n]
readintorNA(io::RDAASCIIIO, n::RVecLength) = Int32[readintorNA(io) for i in 1:n]

function readfloatorNA(io::RDAASCIIIO)
str = chomp(readline(io.sub));
str == R_NA_STRING ? R_NA_FLOAT64 : float64(str)
end
readfloatorNA(io::RDAASCIIIO, n::Int64) = Float64[readfloatorNA(io) for i in 1:n]
readfloatorNA(io::RDAASCIIIO, n::RVecLength) = Float64[readfloatorNA(io) for i in 1:n]

function readnchars(io::RDAASCIIIO, n::Int32) # reads N bytes-sized string
if (n==-1) return "" end
str = unescape_string(chomp(readline(io.sub)))
return length(str) == n ? str : error("Character string length mismatch")
length(str) == n || error("Character string length mismatch")
str
end

type RDANativeIO <: RDAIO # RDA native binary format IO stream wrapper (TODO)
sub::IO # underlying IO stream
type RDANativeIO{T<:IO} <: RDAIO # RDA native binary format IO stream wrapper (TODO)
sub::T # underlying IO stream

RDANativeIO( io::IO ) = new( io )
RDANativeIO( io::T ) = new( io )
end
RDANativeIO{T <: IO}(io::T) = RDANativeIO{T}(io)

function rdaio(io::IO, formatcode::AbstractString)
if formatcode == "X" RDAXDRIO(io)
Expand All @@ -199,18 +211,34 @@ function rdaio(io::IO, formatcode::AbstractString)
end
end

# reads the length of any data vector from a stream
# from R's serialize.c
function readlength(io::RDAIO)
len = convert(Int64, readint32(io))
if (len < -1) error("negative serialized length for vector")
elseif (len >= 0)
return len
else # big vectors, the next 2 ints encode the length
len1, len2 = convert(Int64, readint32(io)), convert(Int64, readint32(io))
# sanity check for now */
if (len1 > 65536) error("invalid upper part of serialized vector length") end
return (len1 << 32) + len2
if LONG_VECTOR_SUPPORT
# reads the length of any data vector from a stream
# from R's serialize.c
function readlength(io::RDAIO)
len = convert(RVecLength, readint32(io))
if (len < -1) error("negative serialized length for vector")
elseif (len >= 0)
return len
else # big vectors, the next 2 ints encode the length
len1, len2 = convert(RVecLength, readint32(io)), convert(RVecLength, readint32(io))
# sanity check for now
if (len1 > 65536) error("invalid upper part of serialized vector length") end
return (len1 << 32) + len2
end
end
else
# reads the length of any data vector from a stream
# fails when long (> 2^31-1) vector encountered
# from R's serialize.c
function readlength(io::RDAIO)
len = convert(RVecLength, readint32(io))
if (len >= 0)
return len
elseif (len < -1)
error("negative serialized length for vector")
else
error("negative serialized vector length:\nperhaps long vector from 64-bit version of R?")
end
end
end

Expand All @@ -232,7 +260,7 @@ function readcharacter(io::RDAIO) # a single character string
props.nchar==-1 ? "" : readnchars(io, props.nchar)
end

function readcharacter(io::RDAIO, n::Int64) # a single character string
function readcharacter(io::RDAIO, n::RVecLength) # a single character string
res = fill("", n)
na = falses(n)
for i in 1:n
Expand Down
6 changes: 4 additions & 2 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,10 @@ module TestRDA
df[3, :num] = NaN
df[:, :cplx] = @data [NA, complex128(1,NaN), NaN]
@test isequal(DataFrame(read_rda("$testdir/data/RDA/NAs.rda")["df"]), df)
# NA ASCII test disabled for now, because NaN are saved as NA. Investigating it from R side.
#@test isequal(DataFrame(read_rda("$testdir/data/RDA/NAs_ascii.rda")["df"]), df)
# ASCII format saves NaN as NA
df[3, :num] = NA
df[:, :cplx] = @data [NA, NA, NA]
@test isequal(DataFrame(read_rda("$testdir/data/RDA/NAs_ascii.rda")["df"]), df)

rda_names = names(DataFrame(read_rda("$testdir/data/RDA/names.rda")["df"]))
expected_names = [:_end, :x!, :x1, :_B_C_, :x, :x_1]
Expand Down

0 comments on commit 99be77f

Please sign in to comment.