Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add copyto! and similar for Datasets #937

Merged
merged 9 commits into from
Jun 7, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 134 additions & 8 deletions src/HDF5.jl
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,8 @@ end
# This infers the Julia type from the HDF5.Datatype. Specific file formats should provide their own read(dset).
const DatasetOrAttribute = Union{Dataset,Attribute}

include("views.jl")

function Base.read(obj::DatasetOrAttribute)
dtype = datatype(obj)
T = get_jl_type(dtype)
Expand All @@ -957,6 +959,19 @@ function Base.read(obj::DatasetOrAttribute, ::Type{T}, I...) where T
return val
end

"""
read!(obj::DatasetOrAttribute, output_buffer::AbstractArray{T} [, I...]) where T

Read [part of] a dataset or attribute into a preallocated output buffer.
The output buffer must be convertible to a pointer and have a contiguous layout.
"""
function Base.read!(obj::DatasetOrAttribute, buf::AbstractArray{T}, I...) where T
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious why do we need this method? Seems like copyto! alone would be sufficient as per the AbstractArray interface.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We currently implement Base.read, so why not Base.read!?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should define this method, because we are moving more and more having the HDF5.Dataset type satisfying the AbstractArray interface. Users would expect copyto! as that is the standard name for this function, thus additionally adding Base.read! seems unnecessary.

I understand that we are defining methods on Base.read throughout the HDF5 module, but I don't see that we should extend that and add Base.read!

The definition in IO/Networking, also suggests that this wouldn't be the best overload, since it's used by default for reading from a filename or stream.
https://docs.julialang.org/en/v1/base/io-network/#Base.read!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Base.read! has been removed.

dtype = datatype(obj)
val = generic_read!(buf, obj, dtype, T, I...)
close(dtype)
return val
end

# `Type{String}` does not have a definite size, so the generic_read does not accept
# it even though it will return a `String`. This explicit overload allows that usage.
function Base.read(obj::DatasetOrAttribute, ::Type{String}, I...)
Expand All @@ -968,10 +983,26 @@ function Base.read(obj::DatasetOrAttribute, ::Type{String}, I...)
return val
end

"""
copyto!(output_buffer::AbstractArray{T}, obj::Union{DatasetOrAttribute}) where T
copyto!(output_buffer::AbstractArray{T}, @view(obj::Union{DatasetOrAttribute}, I...))

Copy [part of] a HDF5 dataset or attribute to a preallocated output buffer.
The output buffer must be convertible to a pointer and have a contiguous layout.
"""
function Base.copyto!(output_buffer::AbstractArray{T}, obj::DatasetOrAttribute, I...) where T
return Base.read!(obj, output_buffer, I...)
end

# Special handling for reading OPAQUE datasets and attributes
function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque})
function generic_read!(buf::Matrix{UInt8}, obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque})
generic_read(obj, filetype, Opaque, buf)
end
function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque}, buf::Union{Matrix{UInt8}, Nothing} = nothing)
sz = size(obj)
buf = Matrix{UInt8}(undef, sizeof(filetype), prod(sz))
if isnothing(buf)
buf = Matrix{UInt8}(undef, sizeof(filetype), prod(sz))
end
if obj isa Dataset
read_dataset(obj, filetype, buf, obj.xfer)
else
Expand All @@ -989,7 +1020,14 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque
end

# generic read function
function generic_read!(buf::Union{AbstractMatrix{UInt8}, AbstractArray{T}}, obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I...) where T
musm marked this conversation as resolved.
Show resolved Hide resolved
return _generic_read(obj, filetype, T, buf, I...)
end
function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I...) where T
return _generic_read(obj, filetype, T, nothing, I...)
end
function _generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T},
buf::Union{AbstractMatrix{UInt8}, AbstractArray{T}, Nothing}, I...) where T
!isconcretetype(T) && error("type $T is not concrete")
!isempty(I) && obj isa Attribute && error("HDF5 attributes do not support hyperslab selections")

Expand Down Expand Up @@ -1026,13 +1064,18 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I.
end
end

if do_normalize(T)
# The entire dataset is read into in a buffer matrix where the first dimension at
# any stage of normalization is the bytes for a single element of type `T`, and
# the second dimension of the matrix runs through all elements.
buf = Matrix{UInt8}(undef, sizeof(T), prod(sz))
if isnothing(buf)
if do_normalize(T)
# The entire dataset is read into in a buffer matrix where the first dimension at
# any stage of normalization is the bytes for a single element of type `T`, and
# the second dimension of the matrix runs through all elements.
buf = Matrix{UInt8}(undef, sizeof(T), prod(sz))
else
buf = Array{T}(undef, sz...)
end
else
buf = Array{T}(undef, sz...)
sizeof(buf) != prod(sz)*sizeof(T) &&
error("Provided array buffer of size, $(size(buf)), and element type, $(eltype(buf)), does not match the dataset of size, $sz, and type, $T")
end
memspace = isempty(I) ? dspace : dataspace(sz)

Expand Down Expand Up @@ -1062,6 +1105,89 @@ function generic_read(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I.
end
end


"""
similar(obj::DatasetOrAttribute, [::Type{T}], [I::Integer...])

Return a `Array{T}` or `Matrix{UInt8}` to that can contain [part of] the dataset.
"""
function Base.similar(obj::DatasetOrAttribute, I::Integer...)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question, why do we need to pass the indices here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Base.similar accepts an optional size value. These are not indices. I suppose we should rename the argument.
https://docs.julialang.org/en/v1/base/arrays/#Base.similar

Copy link
Member

@musm musm Jun 2, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was more specifically looking at the AbstractArray interface https://docs.julialang.org/en/v1/manual/interfaces/#man-interface-array, where there are 4 optional similar methods that we could define:

Optional methods Default definition Brief description
similar(A) similar(A, eltype(A), size(A)) Return a mutable array with the same shape and element type
similar(A, ::Type{S}) similar(A, S, size(A)) Return a mutable array with the same shape and the specified element type
similar(A, dims::Dims) similar(A, eltype(A), dims) Return a mutable array with the same element type and size dims
similar(A, ::Type{S}, dims::Dims) Array{S}(undef, dims) Return a mutable array with the specified element type and size

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When we do implement AbstractArray in 0.17, then we do not need this signature. We could just implement the version with Dims. However, since we are not implementing AbstractArray in 0.16, we probably do need to retain a method with the above signature since we do not have a similar(a::AbstractArray{T}, dims::Union{Integer, AbstractUnitRange}...) to cover that case for us.

I will rename this to Base.similar(obj::DatasetOrAttribute, dims::Dims). We probably sill need Base.similar, dims::Integer...) as well, until we actually do implement AbstractArray.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally I think read! makes more sense as long as we're using read.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@simonbyrne did you intend to comment on the other thread? Is there a specific part of #937 (comment) you disagree with?
The goal is not to remove copyto! and replace it with read!. Originally, @mkitti had read! as an alias for copyto!, which we require in order to satisfy part of the AbstractArray interface. My comment on read! was that this was both redundant with copyto! and unexpected since read! is used in base Julia for reading from an IO stream/filename, which is very different than what copyto! accomplishes.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh yes, sorry...

dtype = datatype(obj)
T = get_jl_type(dtype)
val = similar(obj, dtype, T, I...)
close(dtype)
return val
end

function Base.similar(obj::DatasetOrAttribute, ::Type{T}, I::Integer...) where T
dtype = datatype(obj)
val = similar(obj, dtype, T, I...)
close(dtype)
return val
end

function Base.similar(obj::DatasetOrAttribute, filetype::Datatype, ::Type{Opaque})
sz = size(obj)
return Matrix{UInt8}(undef, sizeof(filetype), prod(sz))
end

# Duplicated from generic_read. TODO: Deduplicate
function Base.similar(obj::DatasetOrAttribute, filetype::Datatype, ::Type{T}, I::Integer...) where T
!isconcretetype(T) && error("type $T is not concrete")
!isempty(I) && obj isa Attribute && error("HDF5 attributes do not support hyperslab selections")

I = Base.OneTo.(I)

memtype = Datatype(API.h5t_get_native_type(filetype)) # padded layout in memory

if sizeof(T) != sizeof(memtype)
error("""
Type size mismatch
sizeof($T) = $(sizeof(T))
sizeof($memtype) = $(sizeof(memtype))
""")
end

dspace = dataspace(obj)
stype = API.h5s_get_simple_extent_type(dspace)
stype == API.H5S_NULL && return EmptyArray{T}()

if !isempty(I)
indices = Base.to_indices(obj, I)
dspace = hyperslab(dspace, indices...)
end

scalar = false
if stype == API.H5S_SCALAR
sz = (1,)
scalar = true
elseif isempty(I)
sz = size(dspace)
else
sz = map(length, filter(i -> !isa(i, Int), indices))
if isempty(sz)
sz = (1,)
scalar = true
end
end

if do_normalize(T)
# The entire dataset is read into in a buffer matrix where the first dimension at
# any stage of normalization is the bytes for a single element of type `T`, and
# the second dimension of the matrix runs through all elements.
buf = Matrix{UInt8}(undef, sizeof(T), prod(sz))
buf = reshape(normalize_types(T, buf), sz...)
else
buf = Array{T}(undef, sz...)
end


close(memtype)
close(dspace)

return buf
end

# Array constructor for datasets
Array(x::Dataset) = read(x)

Expand Down
41 changes: 41 additions & 0 deletions src/views.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Permit Base.view for Dataset and Attribute to normalize Base.copyto! syntax

# Long term consideration:
# If Dataset and Attribute become an AbstractArray, perhaps these should just be SubArrays ?
"""
DatasetView

This is an experimental data type and may change. Use `view` rather than `DatasetView` directly.
"""
struct DatasetView
parent::Dataset
indices
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to store the dataspace?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is worthwhile eventually, but I think we should defer this to a larger overhaul of Dataset.

In particular, we should consider parameterizing Dataset and making it an AbstractArray or AbstractDiskArray for version 0.17.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #930

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm leaning towards removing the view feature from this pull request.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's a good idea, but I would suggest making it actually a functional part: i.e. calling read!(dataset, array, I...) would internally call read!(view(dataset, I...), array)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. I just think the view component is much more complicated than the other parts.

end

function Base.view(obj::Dataset, I...)
return DatasetView(obj, I)
end

Base.similar(view::DatasetView) = similar(view.parent, length.(view.indices)...)

"""
AttributeView

This is an experimental data type and may change. Use `view` rather than `AttributeView` directly.
"""
struct AttributeView
parent::Attribute
indices
end

function Base.view(obj::Attribute, I...)
return AttributeView(obj, I)
end

Base.similar(view::AttributeView) = similar(view.parent, length.(view.indices)...)

const DatasetOrAttributeView = Union{DatasetView, AttributeView}

function Base.copyto!(output_buffer::AbstractArray{T}, view::DatasetOrAttributeView) where T
return Base.read!(view.parent, output_buffer, view.indices...)
end
2 changes: 2 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ include("filter.jl")
include("chunkstorage.jl")
@debug "fileio"
include("fileio.jl")
@debug "views"
include("views.jl")
@debug "filter test utils"
include("filters/FilterTestUtils.jl")

Expand Down
39 changes: 39 additions & 0 deletions test/views.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
using HDF5
using Test

@testset "views and non-allocating methods" begin
fn = tempname()

@info "view.jl" fn
data = rand(UInt16, 16, 16)

h5open(fn, "w") do h5f
h5f["data"] = data
end

h5open(fn, "r") do h5f
buffer = similar(h5f["data"])
copyto!(buffer, h5f["data"])
@test isequal(buffer, data)

read!(h5f["data"], buffer)
@test isequal(buffer, data)

v = @view(h5f["data"][1:4, 1:4])

buffer = similar(v)
@test size(buffer) == (4,4)
copyto!(buffer, v)
@test isequal(buffer, @view(data[1:4, 1:4]))

buffer .= 1
read!(h5f["data"], buffer, 1:4, 1:4)
@test isequal(buffer, @view(data[1:4, 1:4]))

@test size(similar(h5f["data"], Int16)) == size(h5f["data"])
@test size(similar(h5f["data"], 5,6)) == (5, 6)
@test size(similar(h5f["data"], Int16, 8,7)) == (8,7)
end

rm(fn)
end