Skip to content

Commit

Permalink
faster readcsv. fixes #3350
Browse files Browse the repository at this point in the history
  • Loading branch information
tanmaykm committed Jun 17, 2013
1 parent 3771bf2 commit 261e350
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 122 deletions.
217 changes: 95 additions & 122 deletions base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,103 +2,6 @@

# Sentinel delimiter meaning "no explicit delimiter was supplied". The readers in
# this file test `dlm == invalid_dlm` and, when it matches, fall back to their
# default splitting instead of splitting on a specific character.
const invalid_dlm = char(0xfffffffe)

# Read the next non-blank row from `io` and return its fields.
# Rows end at `eol`. Fields are separated by `dlm`, or produced by a plain
# `split(row)` when `dlm` is the `invalid_dlm` sentinel.
function dlm_readrow(io::IO, dlm, eol::Char)
    line = readuntil(io, eol)
    # a line consisting solely of the terminator is blank: keep reading
    while length(line) == 1 && line[1] == eol
        line = readuntil(io, eol)
    end
    fields = (dlm == invalid_dlm) ? split(line) : split(line, dlm, true)
    # strip the terminator still attached to the final field, if any
    if endswith(fields[end], eol)
        fields[end] = chop(fields[end])
    end
    return fields
end

# all strings: copy each row's fields into `a` unchanged, one row per line read.
# `row` must already hold the first row on entry.
function readdlm(a, io, dlm, nr, nc, row, eol)
    for r = 1:nr
        a[r,:] = row
        # fetch the following row unless this was the last one
        (r < nr) && (row = dlm_readrow(io, dlm, eol))
    end
    return a
end

# all numeric, with NaN for invalid data.
# Char targets take single-character fields verbatim; targets that are not
# floating point raise an error on an unparseable field instead of storing NaN.
# `a` is filled in place; the function itself yields no useful value.
function readdlm{T<:Number}(a::Array{T}, io, dlm, nr, nc, row, eol)
    parsed = Array(Float64,1)
    for i = 1:nr
        for j = 1:nc
            if T <: Char
                (length(row[j]) == 1) || error("file entry \"$(row[j])\" is not a Char")
                a[i,j] = row[j][1]
            elseif float64_isvalid(row[j], parsed)
                a[i,j] = parsed[1]
            elseif T <: FloatingPoint
                a[i,j] = NaN
            else
                error("file entry \"$(row[j])\" cannot be converted to $T")
            end
        end
        # `row` was pre-read for the first line; fetch the next unless done
        (i < nr) && (row = dlm_readrow(io, dlm, eol))
    end
end

# float64 or string: fields that parse as Float64 are stored as numbers, all
# other fields are stored as the field text itself.
function readdlm(a::Array{Any}, io, dlm, nr, nc, row, eol)
    return readdlm(a, io, dlm, nr, nc, row, eol, 1, 1)
end

# Same as above, but start filling at cell (i0, j0). `row` must already hold
# the fields of row `i0`; later rows are read from `io`.
function readdlm(a::Array{Any}, io, dlm, nr, nc, row, eol, i0, j0)
    buf = Array(Float64,1)
    firstcol = j0
    for i = i0:nr
        for j = firstcol:nc
            field = row[j]
            a[i,j] = float64_isvalid(field, buf) ? buf[1] : field
        end
        # only the starting row begins mid-way; the rest begin at column 1
        firstcol = 1
        (i < nr) && (row = dlm_readrow(io, dlm, eol))
    end
    return a
end

# float64 or cell depending on data: fill `a` with parsed numbers, and on the
# first field that is not a valid Float64 switch to an Array{Any,2} copy and let
# the Any reader continue from that cell. Returns whichever array was filled.
function readdlm_auto(a, io, dlm, nr, nc, row, eol)
    buf = Array(Float64, 1)
    for i = 1:nr
        for j = 1:nc
            field = row[j]
            if float64_isvalid(field, buf)
                a[i,j] = buf[1]
                continue
            end
            # non-numeric entry: widen, then finish the job starting at (i, j)
            widened = convert(Array{Any,2}, a)
            readdlm(widened, io, dlm, nr, nc, row, eol, i, j)
            return widened
        end
        (i < nr) && (row = dlm_readrow(io, dlm, eol))
    end
    return a
end

# Count lines using '\n' as the line terminator.
function countlines(nameorfile)
    return countlines(nameorfile, '\n')
end
function countlines(filename::String, eol::Char)
open(filename) do io
Expand Down Expand Up @@ -129,38 +32,108 @@ function countlines(io::IO, eol::Char)
nl
end

# Prepare to read the delimited file `fname`: reject an empty separator, count
# the file's rows, open it and read the first row to learn the column count.
# Yields the tuple (io, nr, nc, row); the stream is left open for the caller.
function readdlm_setup(fname::String, dlm, eol)
if length(dlm) == 0
error("readdlm: no separator characters specified")
end
# the file is scanned once here to count rows, then opened again for reading
nr = countlines(fname,eol)
io = open(fname)
# the first row fixes the number of columns
row = dlm_readrow(io, dlm, eol)
nc = length(row)
return (io, nr, nc, row)
# Convenience methods that supply the default delimiter (`invalid_dlm`) and/or
# the default line terminator ('\n').

# explicit element type
function readdlm(input, T::Type)
    return readdlm(input, invalid_dlm, T, '\n')
end
function readdlm(input, dlm::Char, T::Type)
    return readdlm(input, dlm, T, '\n')
end

# element type chosen from the data
function readdlm(input)
    return readdlm(input, invalid_dlm, '\n')
end
function readdlm(input, dlm::Char)
    return readdlm(input, dlm, '\n')
end

# Fully specified forms. Without a type, parsing starts as Float64 with automatic
# fallback enabled; with a type, the fallback is disabled.
function readdlm(input, dlm::Char, eol::Char)
    return readdlm_auto(input, dlm, Float64, eol, true)
end
function readdlm(input, dlm::Char, T::Type, eol::Char)
    return readdlm_auto(input, dlm, T, eol, false)
end
# Read all of `input` into memory and parse it as a delimited table:
# size the table, record where each field ends, then convert the fields into an
# array of element type `T`. `auto` is forwarded to `dlm_fill`.
function readdlm_auto(input, dlm::Char, T::Type, eol::Char, auto::Bool=false)
    text = readall(input)
    nr, nc = dlm_dims(text, eol, dlm)
    field_ends = zeros(Int, nr, nc)
    table = Array(T, nr, nc)
    dlm_offsets(text, dlm, eol, field_ends)
    return dlm_fill(table, field_ends, text, auto)
end

# Read `fname` as element type `T` with the default delimiter and '\n' rows.
function readdlm(fname::String, T::Type)
    return readdlm(fname, invalid_dlm, T, '\n')
end
# Return the position in the text buffer at which cell (row, col) begins.
#
# `offsets[r, c]` holds the position of the last byte of cell (r, c), or 0 when
# no end was recorded for that cell. A cell begins two positions after the end of
# the nearest preceding recorded cell (one position is taken by the separator),
# looking backwards in row-major order across `ncols` columns. Cell (1, 1)
# begins at position 1.
#
# Review history: the fallback here originally read
# `dlm_col_begin(csv, pp_row, pp_col)`, naming an undefined `csv`. As raised in
# review (quinnj, Jun 19, 2013) and confirmed by the author (tanmaykm: "a typo
# bug", to be fixed by #3442), the intended arguments were `ncols, offsets`.
function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int)
    while !((row == 1) && (col == 1))
        # step to the preceding cell in row-major order
        if col == 1
            row -= 1
            col = ncols
        else
            col -= 1
        end
        ret = offsets[row, col]
        # 0 normally marks an unrecorded cell to skip over. At (1, 1) there is
        # nothing further back: an end offset of 0 there means the first field is
        # empty and the separator sits at position 1, so the next cell starts at 2.
        ((ret != 0) || ((row == 1) && (col == 1))) && return ret + 2
    end
    return 1
end

# Read `fname` as element type `T`, splitting on `dlm`, with '\n' rows.
function readdlm(fname::String, dlm, T::Type)
    return readdlm(fname, dlm, T, '\n')
end
# Convert every field of the text buffer into an element of `cells`.
#
# cells   - destination array; its size fixes the number of rows and columns
# offsets - end position of each field within `sbuff` (see dlm_col_begin)
# sbuff   - the complete text being parsed
# auto    - for numeric `T`: on the first field that is not a valid Float64,
#           abandon `cells` and refill a fresh Array{Any} from the start
#
# Returns the filled array, which is `cells` unless the `auto` fallback fired.
# Conversion depends on `T`:
#   Char   - the field must be exactly one character long
#   Number - parsed as Float64; unparseable fields become NaN (or trigger `auto`)
#   String - the field text
#   Any    - a Float64 when the field parses as one, otherwise the field text
function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool)
    maxrow,maxcol = size(cells)
    tmp64 = Array(Float64,1)
    for row in 1:maxrow
        for col in 1:maxcol
            start_pos = dlm_col_begin(maxcol, offsets, row, col)
            end_pos = offsets[row,col]
            sval = SubString(sbuff, start_pos, end_pos)

            # Char is tested before Number so that it takes this branch
            if T <: Char
                (length(sval) != 1) && error("file entry \"$(sval)\" is not a Char")
                # store the character itself; the SubString cannot be stored in
                # an array of Char
                cells[row,col] = sval[1]
            elseif T <: Number
                if float64_isvalid(sval, tmp64)
                    cells[row,col] = tmp64[1]
                elseif auto
                    # restart with auto off so the refill cannot recurse again
                    return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false)
                else
                    # NOTE(review): NaN presumably only fits floating-point
                    # element types; confirm the intended result for others
                    cells[row,col] = NaN
                end
            elseif T <: String
                cells[row,col] = sval
            elseif T == Any
                cells[row,col] = float64_isvalid(sval, tmp64) ? tmp64[1] : sval
            else
                error("file entry \"$(sval)\" cannot be converted to $T")
            end
        end
    end
    return cells
end

# Read `fname` with '\n' rows, choosing the element type from the data; the
# delimiter defaults to the `invalid_dlm` sentinel.
function readdlm(fname::String)
    return readdlm(fname, invalid_dlm, '\n')
end
function readdlm(fname::String, dlm)
    return readdlm(fname, dlm, '\n')
end

# Read the delimited file `fname`, choosing the element type from the data:
# parsing starts into a Float64 array and `readdlm_auto` decides what to return.
function readdlm(fname::String, dlm, eol::Char)
(io, nr, nc, row) = readdlm_setup(fname, dlm, eol)
a = Array(Float64, nr, nc)
# readdlm_auto may hand back a different (Any) array than the one passed in,
# hence the reassignment
a = readdlm_auto(a, io, dlm, nr, nc, row, eol)
close(io)
return a
# Record in `offsets[row, col]` the position of the last byte of every field of
# `sbuff`. A field ends at a separator: either the line terminator `eol` or the
# delimiter `dlm` (any of `_default_delims` when `dlm` is the `invalid_dlm`
# sentinel). Scanning stops once the last cell of the last row is recorded.
# `offsets` is modified in place.
function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2})
    col = 0
    row = 1
    maxrow,maxcol = size(offsets)
    idx = 1
    nbytes = length(sbuff.data)
    # `<=` so that the final byte is examined too: it is normally the terminator
    # that closes the last field, and its offset would otherwise stay unrecorded
    while idx <= nbytes
        # after this call `idx` is the position following the character read
        val,idx = next(sbuff, idx)
        (val != eol) && ((dlm == invalid_dlm) ? !contains(_default_delims, val) : (val != dlm)) && continue
        col += 1
        # `idx-1` is the separator, so the field ended at `idx-2`
        # NOTE(review): this assumes the separator occupies a single byte
        offsets[row,col] = idx-2
        (row >= maxrow) && (col == maxcol) && break
        (val == eol) && (row += 1; col = 0)
    end
end
# Byte-wise variant of dlm_offsets for ASCII text: record in `offsets[row, col]`
# the position of the last byte of every field of `sbuff`. A field ends at the
# line terminator `eolc` or at the delimiter `dlmc` (any of `_default_delims`
# when `dlmc` is the `invalid_dlm` sentinel). `offsets` is modified in place.
function dlm_offsets(sbuff::ASCIIString, dlmc, eolc, offsets::Array{Int,2})
    dbuff = sbuff.data
    # Detect the sentinel before narrowing to a byte: once converted with uint8
    # the value no longer compares equal to `invalid_dlm`, so default-delimiter
    # mode would never be selected.
    use_default = (dlmc == invalid_dlm)
    dlm = use_default ? uint8(0) : uint8(dlmc)   # unused in default mode
    eol = uint8(eolc)
    col = 0
    row = 1
    maxrow,maxcol = size(offsets)
    for idx in 1:length(dbuff)
        val = dbuff[idx]
        (val != eol) && (use_default ? !contains(_default_delims, val) : (val != dlm)) && continue
        col += 1
        # `idx` is the separator itself, so the field ended one byte earlier
        offsets[row,col] = idx-1
        (row >= maxrow) && (col == maxcol) && break
        (val == eol) && (row += 1; col = 0)
    end
end

# Compute (nrows, ncols) for the text: the number of rows and the largest number
# of fields found in any row, each at least 1.
# The ASCII method scans raw bytes; the `invalid_dlm` sentinel is passed through
# unconverted so the generic method can still recognise default-delimiter mode.
dlm_dims(s::ASCIIString, eol, dlm) =
    dlm_dims(s.data, uint8(eol), (dlm == invalid_dlm) ? dlm : uint8(dlm))
function dlm_dims(dbuff, eol, dlm)
    ncols = nrows = col = 0
    use_default = (dlm == invalid_dlm)
    for val in dbuff
        (val != eol) && (use_default ? !contains(_default_delims, val) : (val != dlm)) && continue
        # every separator (delimiter or terminator) closes one field
        col += 1
        (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0)
    end
    # separators seen after the last terminator belong to one more row
    # (this line previously incremented an undefined `nrow`)
    (col > 0) && (nrows += 1)
    ncols = max(ncols, col, 1)
    nrows = max(nrows, 1)
    return (nrows, ncols)
end

# Read comma-separated data: `readdlm` with ',' as the delimiter.
function readcsv(io)
    return readdlm(io, ',')
end
Expand Down
5 changes: 5 additions & 0 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1212,6 +1212,11 @@ float64_isvalid(s::String, out::Array{Float64,1}) =
# Try to parse `s` as a Float32 via the C helper `jl_strtof`. The helper is handed
# `out` as its output location; the result is true when the helper returns 0.
function float32_isvalid(s::String, out::Array{Float32,1})
    status = ccall(:jl_strtof, Int32, (Ptr{Uint8},Ptr{Float32}), s, out)
    return status == 0
end

# Substring variants: parse the field in place, without copying it out of the
# parent string. The C helpers receive the parent string together with the
# substring's offset and length, plus `out` as the output location; the result
# is true when the helper returns 0.
# The offset and length are declared Int32 to match the `int` parameters of
# jl_substrtod / jl_substrtof; declaring them as Int passes a wider integer than
# the C functions expect on 64-bit platforms.
float64_isvalid(s::SubString, out::Array{Float64,1}) =
    ccall(:jl_substrtod, Int32, (Ptr{Uint8},Int32,Int32,Ptr{Float64}), s.string, s.offset, s.endof, out) == 0
float32_isvalid(s::SubString, out::Array{Float32,1}) =
    ccall(:jl_substrtof, Int32, (Ptr{Uint8},Int32,Int32,Ptr{Float32}), s.string, s.offset, s.endof, out) == 0

begin
local tmp::Array{Float64,1} = Array(Float64,1)
local tmpf::Array{Float32,1} = Array(Float32,1)
Expand Down
29 changes: 29 additions & 0 deletions src/builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,18 @@ DLLEXPORT void jl_print_int64(JL_STREAM *s, int64_t i)
JL_PRINTF(s, "%lld", i);
}

/*
 * Parse the `len` bytes of `str` starting at `offset` as a double.
 * The parsed value is always written to `*out`. Returns 0 when the parse
 * consumed exactly those `len` bytes without a range error, and 1 otherwise.
 */
DLLEXPORT int jl_substrtod(char *str, int offset, int len, double *out)
{
    char *start = str + offset;
    char *stop = NULL;

    errno = 0;
    *out = strtod(start, &stop);

    /* nothing was consumed */
    if (stop == start)
        return 1;
    /* the number does not end exactly where the field ends */
    if (stop != start + len)
        return 1;
    /* underflow to zero or overflow to +/-HUGE_VAL */
    if (errno == ERANGE && (*out == 0 || *out == HUGE_VAL || *out == -HUGE_VAL))
        return 1;
    return 0;
}

DLLEXPORT int jl_strtod(char *str, double *out)
{
char *p;
Expand All @@ -616,6 +628,23 @@ DLLEXPORT int jl_strtod(char *str, double *out)
return 0;
}

/*
 * Parse the `len` bytes of `str` starting at `offset` as a float.
 * The parsed value is always written to `*out`. Returns 0 when the parse
 * consumed exactly those `len` bytes without a range error, and 1 otherwise.
 */
DLLEXPORT int jl_substrtof(char *str, int offset, int len, float *out)
{
    char *start = str + offset;
    char *stop = NULL;

    errno = 0;
#if defined(_OS_WINDOWS_) && !defined(_COMPILER_MINGW_)
    /* this configuration parses with strtod and narrows the result */
    *out = (float)strtod(start, &stop);
#else
    *out = strtof(start, &stop);
#endif

    /* nothing was consumed */
    if (stop == start)
        return 1;
    /* the number does not end exactly where the field ends */
    if (stop != start + len)
        return 1;
    /* underflow to zero or overflow to +/-HUGE_VALF */
    if (errno == ERANGE && (*out == 0 || *out == HUGE_VALF || *out == -HUGE_VALF))
        return 1;
    return 0;
}

DLLEXPORT int jl_strtof(char *str, float *out)
{
char *p;
Expand Down

0 comments on commit 261e350

Please sign in to comment.