Skip to content

Commit

Permalink
include_columns and exclude_columns keyword arguments
Browse files Browse the repository at this point in the history
  • Loading branch information
jaakkor2 committed Feb 26, 2024
1 parent 83ea5c6 commit 5aab44d
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 7 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "JMPReader"
uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29"
authors = ["Jaakko Ruohio <jaakkor2@gmail.com>"]
version = "0.1.7-DEV"
version = "0.1.7"

[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Expand All @@ -15,6 +15,8 @@ CodecZlib = "0.7"
DataFrames = "1"
Dates = "1.6"
LibDeflate = "0.4"
Printf = "1"
Test = "1"
WeakRefStrings = "1.4"
julia = "1.6"

Expand Down
16 changes: 11 additions & 5 deletions src/JMPReader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ using DataFrames: DataFrame, select!
using CodecZlib: transcode, GzipDecompressor
using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space
using WeakRefStrings: StringVector
using Base.Threads: nthreads, @threads, @spawn
using Base.Iterators: partition

include("types.jl")
Expand All @@ -17,19 +16,26 @@ include("metadata.jl")
include("column.jl")

"""
function readjmp(fn::AbstractString)
function readjmp(fn::AbstractString; include_columns::Union{Nothing, Vector} = nothing, exclude_columns::Union{Nothing, Vector} = nothing)
Read a JMP file.
Included and excluded columns can be defined using keyword arguments `include_columns` and `exclude_columns`.
These are vectors defining columns with any combination of `Integer`, `OrdinalRange`, `String`, `Symbol`, `Regex`.
"""
function readjmp(fn::AbstractString)
function readjmp(fn::AbstractString;
include_columns::Union{Nothing, Vector} = nothing,
exclude_columns::Union{Nothing, Vector} = nothing)

isfile(fn) || throw(ArgumentError("\"$fn\" does not exist"))
a = read(fn)
check_magic(a, fn)
info = metadata(a)
colinds = filter_columns(info.column.names, include_columns, exclude_columns)

deflatebuffer = Vector{UInt8}()
alldata = [column_data(a, info, i, deflatebuffer) for i in 1:info.ncols]
names = info.column.names
alldata = [column_data(a, info, i, deflatebuffer) for i in colinds]
names = info.column.names[colinds]
df = DataFrame(alldata, names)

return df
Expand Down
26 changes: 25 additions & 1 deletion src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,28 @@ function rstripnull!(s::StringVector)
s.lengths[i] = length
end
nothing
end
end

"""
    filter_columns(names, include_columns, exclude_columns)

Return the sorted indices into `names` of the columns that remain after applying
the include and exclude rules.

Selection starts from every index `1:length(names)`. When `include_columns` is not
`nothing`, only indices produced by `filter_names(names, include_columns)` are
kept. When `exclude_columns` is not `nothing`, indices produced by
`filter_names(names, exclude_columns)` are removed. Rule indices that fall
outside `1:length(names)` have no effect on the result.
"""
function filter_columns(names, include_columns, exclude_columns)
    selected = 1:length(names)
    if include_columns !== nothing
        wanted = filter_names(names, include_columns)
        selected = intersect(selected, wanted)
    end
    if exclude_columns !== nothing
        unwanted = filter_names(names, exclude_columns)
        selected = setdiff(selected, unwanted)
    end
    return sort(selected)
end

"""
    filter_names(names, rules)

Translate the column selection `rules` into a `Vector{Int}` of indices into `names`.

Each element of `rules` contributes indices according to its type:

- `Integer`: the integer itself.
- `OrdinalRange`: every element of the range.
- `AbstractString`: the index of the first column whose name equals the string.
- `Symbol`: same as the string form of the symbol.
- `Regex`: the indices of all columns whose name matches the pattern.

Rules of any other type contribute nothing. A name or pattern that matches no
column also contributes nothing instead of raising an error. Indices are returned
in rule order, are not deduplicated, and are not checked against the bounds of
`names`.
"""
function filter_names(names, rules)
    idx = Int[]
    for rule in rules
        # `append!` copes with empty and arbitrarily long results without splatting
        append!(idx, _rule_indices(names, rule))
    end
    return idx
end

# Indices contributed by a single rule; one method per supported rule type.
_rule_indices(names, rule::Integer) = (Int(rule),)
_rule_indices(names, rule::OrdinalRange) = rule
_rule_indices(names, rule::Regex) = findall(name -> occursin(rule, name), names)
_rule_indices(names, rule::Symbol) = _rule_indices(names, String(rule))
function _rule_indices(names, rule::AbstractString)
    i = findfirst(==(rule), names)
    # `findfirst` returns `nothing` when no column has this name
    return i === nothing ? () : (i,)
end
# Unsupported rule types select no columns.
_rule_indices(names, rule) = ()
11 changes: 11 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,15 @@ end
df = readjmp(joinpath(@__DIR__, "singlecolumnsinglerow.jmp"))
@test size(df) == (1, 1)
@test df."Column 1" == [1]
end

@testset "column name filtering" begin
    colnames = ["foo", "foo_x", "foo_y", "bar", "bar_x", "bar_y", "baz"]
    # Each case is (include_columns, exclude_columns, expected column indices).
    cases = [
        ([1, 8], nothing, [1]),
        ([1, 7, "bar"], nothing, [1, 4, 7]),
        ([1, 7, "bar", 4], nothing, [1, 4, 7]),
        (nothing, [r"^foo"], [4, 5, 6, 7]),
        (nothing, [r"_x$", :bar], [1, 3, 6, 7]),
        ([3:10], [10:-3:1], [3, 5, 6]),
        (["foo_x", :baz, r"^bar", 3, 1:2], [4, r"x$"], [1, 3, 6, 7]),
    ]
    for (included, excluded, expected) in cases
        @test JMPReader.filter_columns(colnames, included, excluded) == expected
    end
end

0 comments on commit 5aab44d

Please sign in to comment.