include_columns and exclude_columns keyword arguments

jaakkor2 · Feb 26, 2024 · 5aab44d · 5aab44d
1 parent 83ea5c6
commit 5aab44d
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 7 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "JMPReader"
 uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29"
 authors = ["Jaakko Ruohio <jaakkor2@gmail.com>"]
-version = "0.1.7-DEV"
+version = "0.1.7"
 
 [deps]
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
@@ -15,6 +15,8 @@ CodecZlib = "0.7"
 DataFrames = "1"
 Dates = "1.6"
 LibDeflate = "0.4"
+Printf = "1"
+Test = "1"
 WeakRefStrings = "1.4"
 julia = "1.6"
 

diff --git a/src/JMPReader.jl b/src/JMPReader.jl
@@ -7,7 +7,6 @@ using DataFrames: DataFrame, select!
 using CodecZlib: transcode, GzipDecompressor
 using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space
 using WeakRefStrings: StringVector
-using Base.Threads: nthreads, @threads, @spawn
 using Base.Iterators: partition
 
 include("types.jl")
@@ -17,19 +16,26 @@ include("metadata.jl")
 include("column.jl")
 
 """
-    function readjmp(fn::AbstractString)
+    function readjmp(fn::AbstractString; include_columns::Union{Nothing, Vector} = nothing, exclude_columns::Union{Nothing, Vector} = nothing)
 
 Read a JMP file.
+
+Included and excluded columns can be defined using keyword arguments `include_columns` and `exclude_columns`.
+These are vectors defining columns with any combination of `Integer`, `OrdinalRange`, `String`, `Symbol`, `Regex`.
 """
-function readjmp(fn::AbstractString)
+function readjmp(fn::AbstractString;
+    include_columns::Union{Nothing, Vector} = nothing,
+    exclude_columns::Union{Nothing, Vector} = nothing)
+
     isfile(fn) || throw(ArgumentError("\"$fn\" does not exist"))
     a = read(fn)
     check_magic(a, fn)
     info = metadata(a)
+    colinds = filter_columns(info.column.names, include_columns, exclude_columns)
 
     deflatebuffer = Vector{UInt8}()
-    alldata = [column_data(a, info, i, deflatebuffer) for i in 1:info.ncols]
-    names = info.column.names
+    alldata = [column_data(a, info, i, deflatebuffer) for i in colinds]
+    names = info.column.names[colinds]
     df = DataFrame(alldata, names)
 
     return df

diff --git a/src/utils.jl b/src/utils.jl
@@ -71,4 +71,28 @@ function rstripnull!(s::StringVector)
         s.lengths[i] = length
     end
     nothing
-end
+end
+
+"""
+    filter_columns(names, include_columns, exclude_columns)
+
+Return the sorted indices of the columns in `names` that remain after applying the
+selection rules.
+
+Start from every index `1:length(names)`. When `include_columns` is not `nothing`, keep
+only the indices it selects; when `exclude_columns` is not `nothing`, drop the indices it
+selects. Both rule collections are resolved to indices by `filter_names`.
+"""
+function filter_columns(names, include_columns, exclude_columns)
+    selected = 1:length(names)
+    # Inclusion is applied before exclusion, so an excluded column always wins.
+    isnothing(include_columns) ||
+        (selected = intersect(selected, filter_names(names, include_columns)))
+    isnothing(exclude_columns) ||
+        (selected = setdiff(selected, filter_names(names, exclude_columns)))
+    return sort(selected)
+end
+
+"""
+    filter_names(names, rules)
+
+Resolve `rules` to a vector of column indices into `names`.
+
+Each element of `rules` may be
+- an `Integer`: taken as a column index,
+- an `OrdinalRange` of integers: every element is taken as a column index,
+- an `AbstractString` or `Symbol`: index of the first column with exactly that name,
+- a `Regex`: indices of all columns whose name matches.
+
+Indices outside `eachindex(names)` and names that match no column are skipped rather than
+raising an error. Rules of any other type are ignored. The result may contain duplicates
+and is in rule order, not sorted.
+"""
+function filter_names(names, rules)
+    # `Int` (not `UInt`) so that zero/negative inputs cannot raise an `InexactError`.
+    idx = Int[]
+    valid = eachindex(names)
+    for rule in rules
+        if rule isa Integer
+            rule in valid && push!(idx, Int(rule))
+        elseif rule isa OrdinalRange{<:Integer}
+            append!(idx, (Int(i) for i in rule if i in valid))
+        elseif rule isa Union{AbstractString, Symbol}
+            # `findfirst` returns `nothing` when no column has this name.
+            pos = findfirst(==(String(rule)), names)
+            isnothing(pos) || push!(idx, pos)
+        elseif rule isa Regex
+            append!(idx, findall(name -> occursin(rule, name), names))
+        end
+    end
+    return idx
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -60,4 +60,15 @@ end
     df = readjmp(joinpath(@__DIR__, "singlecolumnsinglerow.jmp"))
     @test size(df) == (1, 1)
     @test df."Column 1" == [1]
+end
+
+# Exercises `JMPReader.filter_columns` directly on a plain vector of names, so no JMP
+# file is needed. Arguments are (names, include_columns, exclude_columns).
+@testset "column name filtering" begin
+    names = ["foo", "foo_x", "foo_y", "bar", "bar_x", "bar_y", "baz"]
+    # Index 8 is past the 7 available columns and is dropped.
+    @test JMPReader.filter_columns(names, [1,8], nothing) == [1]
+    # Integer and name rules can be mixed; the result comes back sorted.
+    @test JMPReader.filter_columns(names, [1,7,"bar"], nothing) == [1,4,7]
+    # "bar" and 4 refer to the same column; it appears only once.
+    @test JMPReader.filter_columns(names, [1,7,"bar",4], nothing) == [1,4,7]
+    # Exclusion by regex removes every column whose name starts with "foo".
+    @test JMPReader.filter_columns(names, nothing, [r"^foo"]) == [4,5,6,7]
+    # Regex and Symbol rules combined in an exclusion list.
+    @test JMPReader.filter_columns(names, nothing, [r"_x$", :bar]) == [1,3,6,7]
+    # Ranges may overshoot the column count and may have a negative step.
+    @test JMPReader.filter_columns(names, [3:10], [10:-3:1]) == [3,5,6]
+    # All rule kinds together; exclusion is applied after inclusion.
+    @test JMPReader.filter_columns(names, ["foo_x", :baz, r"^bar", 3, 1:2], [4, r"x$"]) == [1,3,6,7]
 end