Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvement for direct loading of .nc files #23

Merged
merged 5 commits into from
Sep 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions src/core/aggregation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,7 @@ function dropagg(f, A::AbDimArray, dims)
DimensionalData.rebuild(r, Array(r.data))
end

# Return the position of the dimension of type `Dim` within `dims(A)`.
# The assertion fails when `hasdim(A, Dim)` is false.
function dimindex(A::AbDimArray, Dim)
    @assert hasdim(A, Dim)
    return findfirst(dims(A)) do d
        d isa Dim
    end
end

# Build an array of (Float64) ones with the size and dimensions of `A`,
# wrapped in the same base type as `A`.
function Base.ones(A::AbDimArray)
    unit_data = ones(size(A))
    return basetypeof(A)(unit_data, dims(A))
end

# Position of dimension `Dim` in `A`; the lookup is delegated to `DimensionalData.dimnum`.
function dimindex(A::AbDimArray, Dim)
    return DimensionalData.dimnum(A, Dim)
end

#########################################################################
# Other dimensions
Expand Down
2 changes: 2 additions & 0 deletions src/core/coredefs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,5 @@ function Base.summary(io::IO, A::ClimArray)
print(io, '\n')
end
end

# Build an array of (Float64) ones with the size and dimensions of `A`,
# wrapped in the same base type as `A`.
function Base.ones(A::AbDimArray)
    unit_data = ones(size(A))
    return basetypeof(A)(unit_data, dims(A))
end
49 changes: 42 additions & 7 deletions src/core/loading_nc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,13 @@ A = ClimArray(file, "tow_sw_all")
```
(of course you can just do `NCDataset("file.nc")` for single files).

If there are no missing values in the data (according to CF standards), the
returned array is automatically converted to a concrete type (i.e. `Union{Float32, Missing}`
becomes `Float32`).
Two performance optimizations are applied while loading the data:
1. If there are no missing values in the data (according to CF standards), the
   returned array is automatically converted to a concrete element type (e.g.
   `Union{Float32, Missing}` becomes `Float32`).
2. Dimensions that are sampled with a constant step size are automatically
   transformed to a standard Julia range (`AbstractRange`), which makes sub-selecting faster.


At the moment, support for auto-loading equal area space types does not exist,
see [Types of spatial coordinates](@ref). But
Expand All @@ -71,15 +75,16 @@ end

# TODO: Allow this function to take as input a tuple of indices, e.g. (:, :, 1:5)
# and only load this part, and correctly and instantly make it a ClimArray, which
# can solve "large memory" or "large data" problems.
# can solve "large memory" or "large data" problems. This functionality
# must also make sure to load the matching sub-ranges of the dimensions!

function ClimArray(ds::NCDatasets.AbstractDataset, var::String; eqarea = false)
svar = string(var)
cfvar = ds[svar]
attrib = Dict(cfvar.attrib)
A = cfvar |> Array
if eqarea
# TODO: I have to re-work this code to be more general and allow other dimensions
# as well!!!!
# TODO: This piece of code is specific to CDO output...
if haskey(ds, "ncells") # this is the equal area grid, so we make a Coord dimension
lon = ds["lon"] |> Array .|> wrap_lon
lat = ds["lat"] |> Array
Expand Down Expand Up @@ -116,9 +121,39 @@ Create a tuple of `Dimension`s from the `dnames` (tuple of strings).
function create_dims(ds::NCDatasets.AbstractDataset, dnames)
    # For every name, look up the entry of `COMMONNAMES`; these entries are applied
    # as callables to the dimension values at the end.
    true_dims = getindex.(Ref(COMMONNAMES), dnames)
    # Load the values of every dimension variable of `ds` as plain arrays.
    dim_values = Array.(getindex.(Ref(ds), dnames))
    # Replace vectors by ranges where `vector2range` can do so, before wrapping
    # them. (No early return here: it would make this step unreachable.)
    optimal_values = vector2range.(dim_values)
    return optimal_values .|> true_dims
end

#########################################################################
# Making vectors → ranges
#########################################################################
"""
    vector2range(x::Vector{<:Real})

Return a range equal to `x` if the entries of `x` have a constant, non-zero, finite
step; otherwise return `x` itself, unchanged.

`x` is also returned as is when it has fewer than two entries (no step can be deduced)
or when the candidate range does not reproduce `x` exactly.
"""
function vector2range(x::Vector{<:Real})
    length(x) < 2 && return x
    dx = x[2] - x[1]
    # a zero or non-finite step cannot define a range
    (iszero(dx) || !isfinite(dx)) && return x
    for i in 3:length(x)
        x[i] - x[i-1] ≠ dx && return x # if no constant step, return array as is
    end
    r = x[1]:dx:x[end]
    # fall back to the vector instead of failing when the range is not identical to it
    return r == x ? r : x
end

# Convert a `DateTime` vector to a range where possible. When
# `sampled_less_than_date(t)` is false the vector is handled through its `Date`
# representation; otherwise it is returned unchanged, with a warning.
function vector2range(t::Vector{<:DateTime})
    if sampled_less_than_date(t)
        # TODO: implement hourly sampling here
        @warn "Hourly sampling not yet implemented."
        return t
    else
        return vector2range(Date.(t))
    end
end

"""
    vector2range(t::Vector{<:Date})

Return a `Date` range equal to `t` when `t` is regularly sampled, using the period
`tsamp2period(temporal_sampling(t))`; otherwise return `t` itself, unchanged.

`t` is returned as is when it has fewer than three entries (the sampling is deduced from
the first three), when its sampling is `:other`, or when the candidate range does not
reproduce `t` exactly.
"""
function vector2range(t::Vector{<:Date})
    # `temporal_sampling` inspects `t[1]`, `t[2]` and `t[3]`
    length(t) < 3 && return t
    tsamp = temporal_sampling(t)
    # no period exists for irregular sampling: keep the vector
    tsamp == :other && return t
    period = tsamp2period(tsamp)
    r = t[1]:period:t[end]
    # only the first entries decide `tsamp`, so check the whole vector before committing
    return r == t ? r : t
end


#########################################################################
# Equal area related
#########################################################################
Expand Down
22 changes: 20 additions & 2 deletions src/physical_dimensions/temporal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ function time_in_days(t::AbstractArray{<:TimeType}, T = Float32)
end
time_in_days(t::AbstractArray{<:Real}) = t


"""
temporal_sampling(x) → symbol
Return the temporal sampling type of `x`, which is either an array of `Date`s or
Expand All @@ -100,13 +99,16 @@ Possible return values are:
- `:yearly`, where all dates have the same month+day, but different year.
- `:monthly`, where all dates have the same day, but different month.
- `:daily`, where the temporal difference between dates are exactly 1 day.
- `:hourly`, where the temporal difference between entries is exactly 1 hour.
- `:other`, which means that `x` doesn't fall into any of the above categories.

For vector input, only the first 3 entries of the temporal information are used
to deduce the sampling (while for ranges, checking the step is enough).
"""
function temporal_sampling(A::AbDimArray)
    # use the values stored in the `Time` dimension of `A`
    time_values = dims(A, Time).val
    return temporal_sampling(time_values)
end
function temporal_sampling(t::AbstractVector{<:TimeType})
#TODO: implement hourly!
sampled_less_than_date(t) && error("Hourly sampling not yet implemented")
d1 = daymonth(t[2]) .- daymonth(t[1])
d2 = daymonth(t[3]) .- daymonth(t[2])
samemonth = d1[2] == d1[2] == 0
Expand All @@ -128,8 +130,24 @@ temporal_sampling(t::AbstractVector) = error("Need `<:TimeType` elements.")
# For ranges the sampling follows from the type of the step alone, so the argument's
# values are never inspected. Any step type not listed explicitly maps to `:other`.
temporal_sampling(::StepRange{<:Any,Hour}) = :hourly
temporal_sampling(::StepRange{<:Any,Day}) = :daily
temporal_sampling(::StepRange{<:Any,Month}) = :monthly
temporal_sampling(::StepRange{<:Any,Year}) = :yearly
temporal_sampling(::StepRange{<:Any,<:Any}) = :other

"return true if hours or minutes are ≠ 0."
function sampled_less_than_date(t::AbstractVector{<:DateTime})
    # an entry counts as sub-daily when its hour or its minute is non-zero
    has_hour_or_minute(x) = Dates.hour(x) ≠ 0 || Dates.minute(x) ≠ 0
    return any(has_hour_or_minute, t)
end
# this method answers `false` for any vector of `Date`s without inspecting it
sampled_less_than_date(t::AbstractVector{<:Date}) = false

"return the appropriate subtype of Dates.Period."
function tsamp2period(tsamp)
    # One period per sampling symbol that `temporal_sampling` can report for regular
    # sampling; `:hourly` is included so that every such symbol has a period.
    tsamp == :hourly && return Hour(1)
    tsamp == :daily && return Day(1)
    tsamp == :monthly && return Month(1)
    tsamp == :yearly && return Year(1)
    # anything else (e.g. `:other`) has no well-defined period
    error("Don't know the period of $tsamp sampling!")
end

#########################################################################
# temporal statistics
#########################################################################
Expand Down Expand Up @@ -235,7 +253,7 @@ function timeagg(f, T::AbstractVector{<:TimeType}, a::Vector, w = nothing) # ver
mys = maxyearspan(T, tsamp)
t = view(T, 1:mys)
if tsamp == :monthly
dimw = daysinmonth.(t)
dimw = float.(daysinmonth.(t))
!isnothing(w) && (dimw .*= view(w, 1:mys))
return f(view(a, 1:mys), weights(dimw))
else
Expand Down
2 changes: 2 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ end
res = timeagg(mean, A, w)
@test all(res .≈ A[Time(5)])
@test dims(A, Lon) == dims(res, Lon)

# TODO: more tests needed here, e.g. for timeagg(mean, t, a, w)
end

@testset "Advanced temporal manipulation" begin
Expand Down