Skip to content

Commit

Permalink
VLen-UTF8 codec. Refactoring and bug fixes (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
keller-mark committed Sep 18, 2023
1 parent ff42619 commit 1a8e187
Show file tree
Hide file tree
Showing 46 changed files with 1,356 additions and 281 deletions.
7 changes: 6 additions & 1 deletion R/array-nested.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@ NestedArray <- R6::R6Class("NestedArray",
self$data <- data # TODO?
} else if(!is.raw(data) && (is.array(data) || is.vector(data)) && is.atomic(data)) {
# Create array from R atomic vector or array().

num_shape_elements <- compute_size(shape)
# Check that data array has same shape as expected
if(!is.null(dim(data)) && all(ensure_vec(dim(data)) == ensure_vec(shape))) {
Expand All @@ -114,6 +113,7 @@ NestedArray <- R6::R6Class("NestedArray",
astype_func <- self$dtype_obj$get_asrtype()
self$data <- array(data=as.array(astype_func(data)), dim=shape)
}
# TODO: account for order == "C"?
} else if(is.raw(data)) {
# Create array from a raw vector.

Expand Down Expand Up @@ -270,6 +270,11 @@ NestedArray <- R6::R6Class("NestedArray",
flatten_to_raw = function(order = NA) {
data_as_vec <- self$flatten(order = order)

if(self$dtype_obj$is_object) {
# The object_codec in filters will handle the conversion to raw.
return(data_as_vec)
}

endian <- self$dtype_obj$byte_order
# Normalize to only "little" or "big" since this is what writeBin accepts.
if(endian == "nr") {
Expand Down
39 changes: 20 additions & 19 deletions R/dtypes.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ is_structured_dtype <- function(dtype) {
#' @keywords internal
get_dtype_parts <- function(dtype) {
# TODO: support object dtype (without digits required in regex)
dtype_regex <- "^(\\||>|<)(b|i|u|f|c|m|M|S|U|V|O)(\\d+)"
dtype_regex <- "^(\\||>|<)(b|i|u|f|c|m|M|S|U|V|O)(\\d+)?"
if(stringr::str_detect(dtype, dtype_regex)) {
dtype_matches <- stringr::str_match(dtype, dtype_regex)
basic_type <- dtype_matches[1,3]
Expand All @@ -32,12 +32,19 @@ get_dtype_parts <- function(dtype) {
} else {
byte_multiplier <- 1
}
num_items <- as.integer(dtype_matches[1,4])
if(!is.null(dtype_matches[1,4])) {
num_items <- as.integer(dtype_matches[1,4])
num_bytes <- num_items * byte_multiplier
} else {
# Support object dtype
num_items <- NA
num_bytes <- NA
}
result <- list(
dtype_str = dtype,
byte_order = dtype_matches[1,2],
basic_type = dtype_matches[1,3],
num_bytes = num_items * byte_multiplier,
num_bytes = num_bytes,
num_items = num_items
)
return(result)
Expand All @@ -48,16 +55,15 @@ get_dtype_parts <- function(dtype) {

#' @keywords internal
check_dtype_support <- function(dtype_parts) {
if(!is_na(dtype_parts) && dtype_parts$basic_type %in% c("b", "i", "u", "f", "S", "U")) {
if(!is_na(dtype_parts) && dtype_parts$basic_type %in% c("b", "i", "u", "f", "S", "U", "O")) {
return(TRUE)
}
stop(paste("Unsupported dtype:", dtype_parts))
return(FALSE)
}

#' @keywords internal
get_dtype_rtype <- function(dtype) {
dtype_parts <- get_dtype_parts(dtype)
get_dtype_rtype <- function(basic_type) {

# Reference: https://github.com/gzuidhof/zarr.js/blob/292804/src/nestedArray/types.ts#L32
BASICTYPE_RTYPE_MAPPING <- list(
Expand All @@ -66,10 +72,11 @@ get_dtype_rtype <- function(dtype) {
"i" = integer(),
"f" = double(),
"S" = character(),
"U" = character()
"U" = character(),
"O" = character() # TODO: will object always be character?
)

return(BASICTYPE_RTYPE_MAPPING[[dtype_parts$basic_type]])
return(BASICTYPE_RTYPE_MAPPING[[basic_type]])
}

#' @keywords internal
Expand Down Expand Up @@ -100,7 +107,8 @@ get_dtype_signed <- function(dtype) {
"i" = TRUE,
"f" = TRUE,
"S" = FALSE, # TODO: is this correct?
"U" = FALSE # TODO: is this correct?
"U" = FALSE, # TODO: is this correct?
"O" = FALSE
)
return(DTYPE_SIGNED_MAPPING[[dtype_parts$basic_type]])
}
Expand All @@ -117,16 +125,12 @@ get_dtype_asrtype <- function(dtype) {
"i" = as.integer,
"f" = as.double,
"S" = as.character,
"U" = as.character
"U" = as.character,
"O" = as.character
)
return(DTYPE_RTYPE_MAPPING[[dtype_parts$basic_type]])
}

#' @keywords internal
get_typed_array_ctr <- function(dtype) {
rtype <- get_dtype_rtype(dtype)
return(function(dim) array(data = rtype, dim = dim))
}

# Reference: https://numpy.org/doc/stable/reference/arrays.dtypes.html

Expand Down Expand Up @@ -165,8 +169,6 @@ Dtype <- R6::R6Class("Dtype",
initialize = function(dtype, object_codec = NA) {
self$dtype <- dtype

# TODO: support dtype_str == "|O" for object dtypes / dont require numeric part of dtype string

dtype_parts <- get_dtype_parts(dtype)
check_dtype_support(dtype_parts)
self$byte_order <- get_dtype_endianness(dtype)
Expand All @@ -178,14 +180,13 @@ Dtype <- R6::R6Class("Dtype",
self$is_structured <- is_structured_dtype(dtype)
self$is_object <- (self$basic_type == "O")

# TODO: port code from normalize_dtype in zarr-python
self$object_codec <- object_codec
},
get_asrtype = function() {
return(get_dtype_asrtype(self$dtype))
},
get_rtype = function() {
return(get_dtype_rtype(self$dtype))
return(get_dtype_rtype(self$basic_type))
},
get_typed_array_ctr = function() {
rtype <- self$get_rtype()
Expand Down
2 changes: 1 addition & 1 deletion R/meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Metadata2 <- R6::R6Class("Metadata2",
if(is.list(s)) {
return(s)
} else {
return(jsonlite::fromJSON(rawToChar(s)))
return(jsonlite::fromJSON(rawToChar(s), simplifyVector = FALSE))
}
},
encode_metadata = function(meta) {
Expand Down
21 changes: 3 additions & 18 deletions R/normalize.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,17 +103,13 @@ normalize_shape <- function(shape) {
}

#' @keywords internal
normalize_dtype <- function(dtype, object_codec = NA, filters = NA) {
normalize_dtype <- function(dtype, object_codec = NA) {
# Reference: https://github.com/zarr-developers/zarr-python/blob/5dd4a0e6cdc04c6413e14f57f61d389972ea937c/zarr/util.py#L152

if(!is_na(object_codec) && !is_na(filters)) {
stop("expected only one of object_codec and filters to be specified in normalize_dtype")
}

if(is_na(dtype)) {
# np.dtype(None) returns 'float64'
if(!is_na(object_codec) || !is_na(filters)) {
stop("expected object_codec and filters to be NA due to NA dtype")
if(!is_na(object_codec)) {
stop("expected object_codec to be NA due to NA dtype")
}
return(Dtype$new("<f8"))
}
Expand All @@ -125,17 +121,6 @@ normalize_dtype <- function(dtype, object_codec = NA, filters = NA) {
}

if(is.character(dtype)) {
if(!is_na(filters)) {
dtype_init <- Dtype$new(dtype)
if(dtype_init$is_object) {
# Object ("|O") dtype should have one filter codec.
if(length(filters) == 1) {
return(Dtype$new(dtype, object_codec = filters[[1]]))
} else {
stop("expected filters list to have length 1 for object dtype")
}
}
}
# Forward the caller-supplied object_codec (which may be NA) to the Dtype constructor.
return(Dtype$new(dtype, object_codec = object_codec))
}
Expand Down
112 changes: 111 additions & 1 deletion R/numcodecs.R
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,114 @@ BloscCodec <- R6::R6Class("BloscCodec",
)
)

#' Variable-length UTF-8 codec for Zarr
#' @title VLenUtf8Codec Class
#' @docType class
#' @description
#' Class representing a VLenUtf8 object codec. It converts between an R
#' character vector and a raw buffer laid out as a 4-byte little-endian item
#' count followed, for each item, by a 4-byte little-endian byte length and
#' that many bytes of string data.
#'
#' @rdname VLenUtf8Codec
#' @export
VLenUtf8Codec <- R6::R6Class("VLenUtf8Codec",
  inherit = Codec,
  public = list(
    # Encode a vector of strings into a single raw buffer.
    #
    # vec_of_strings: character vector (or list of single strings) to encode.
    #   NA and NULL items are treated as missing and written as empty strings.
    # zarr_arr: accepted for interface compatibility; not used by this codec.
    #
    # Returns a raw vector: 4-byte item count, then for every item a 4-byte
    # byte length followed by the item's bytes (all integers little-endian).
    encode = function(vec_of_strings, zarr_arr) {
      # Kind: array to bytes
      # Reference: https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/vlen.pyx#L74

      num_strings <- length(vec_of_strings)

      # Preallocate so the list is not grown inside the loop.
      encoded_values <- vector("list", num_strings)
      encoded_lengths <- integer(num_strings)

      # First pass: convert every item to bytes and record its byte length.
      for(i in seq_len(num_strings)) {
        orig_str <- vec_of_strings[[i]]
        if(is.null(orig_str) || is.na(orig_str)) {
          # treat these as missing value, normalize
          orig_str <- ""
        }
        # enc2utf8 so that latin1/native-encoded strings are written as UTF-8.
        encoded_str <- charToRaw(enc2utf8(orig_str))
        encoded_values[[i]] <- encoded_str
        encoded_lengths[i] <- length(encoded_str)
      }

      # 4 bytes for the item count + 4 bytes per item length + the payloads.
      total_length <- 4 + 4 * num_strings + sum(encoded_lengths)
      out <- raw(total_length)

      # Write header. as.integer() guarantees writeBin emits an int32.
      out[1:4] <- writeBin(
        as.integer(num_strings),
        con = raw(),
        size = 4,
        endian = "little"
      )

      # Second pass: store each length prefix followed by its payload.
      pos <- 4
      for(i in seq_len(num_strings)) {
        item_len <- encoded_lengths[i]
        out[(pos + 1):(pos + 4)] <- writeBin(
          item_len,
          con = raw(),
          size = 4,
          endian = "little"
        )
        pos <- pos + 4
        # Guard the empty case: (pos + 1):(pos + 0) is a descending
        # two-element range, not an empty one, so it must not be used.
        if(item_len > 0L) {
          out[pos + seq_len(item_len)] <- encoded_values[[i]]
          pos <- pos + item_len
        }
      }

      return(out)
    },
    # Decode a raw buffer produced by encode() back into strings.
    #
    # buf: raw vector holding the item count header and the
    #   length-prefixed items.
    # zarr_arr: accepted for interface compatibility; not used by this codec.
    #
    # Returns a character vector with one element per encoded item, marked as
    # UTF-8. Stops with an error if the buffer ends before the data it
    # declares.
    decode = function(buf, zarr_arr) {
      # Kind: bytes to array
      # References:
      # - https://github.com/manzt/zarrita.js/blob/050d128265af14ff3c82e125315f3f527112887d/packages/core/src/codecs/vlen-utf8.ts
      # - https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/vlen.pyx#L132

      buf_len <- length(buf)
      if(buf_len < 4) {
        stop("vlen-utf8 buffer is too short to contain the item count header")
      }

      num_strings <- readBin(
        con = buf,
        what = integer(),
        size = 4,
        n = 1,
        signed = TRUE,
        endian = "little"
      )

      # character() rather than rep(NA, ...) so that the result is a
      # character vector even when there are zero items.
      vec_of_strings <- character(num_strings)

      pos <- 4
      for(i in seq_len(num_strings)) {
        if(pos + 4 > buf_len) {
          stop("vlen-utf8 buffer is truncated: missing item length")
        }
        num_bytes <- readBin(
          con = buf[(pos + 1):(pos + 4)],
          what = integer(),
          size = 4,
          n = 1,
          signed = TRUE,
          endian = "little"
        )
        pos <- pos + 4
        # Zero-length items stay "" (already the preallocated value); indexing
        # with (pos + 1):(pos + 0) would wrongly read two bytes.
        if(num_bytes > 0L) {
          if(pos + num_bytes > buf_len) {
            stop("vlen-utf8 buffer is truncated: missing item data")
          }
          vec_of_strings[i] <- rawToChar(buf[pos + seq_len(num_bytes)])
          pos <- pos + num_bytes
        }
      }

      # The payload bytes are UTF-8; mark them so R interprets them correctly
      # regardless of the session's native encoding.
      Encoding(vec_of_strings) <- "UTF-8"

      return(vec_of_strings)
    },
    # Return the codec configuration as stored in array metadata.
    get_config = function() {
      meta <- list(
        id = jsonlite::unbox("vlen-utf8")
      )
      return(meta)
    }
  )
)

#' Get a codec instance from the registry.
#'
#' @param config A codec config as a named list.
Expand All @@ -393,8 +501,10 @@ get_codec <- function(config) {
result <- do.call(GzipCodec$new, config)
} else if(codec_id == "lzma") {
result <- do.call(LzmaCodec$new, config)
} else if(codec_id == "blosc") {
} else if(codec_id == "blosc") {
result <- do.call(BloscCodec$new, config)
} else if(codec_id == "vlen-utf8") {
result <- do.call(VLenUtf8Codec$new, config)
} else {
stop(paste("Unknown codec", codec_id))
}
Expand Down
3 changes: 2 additions & 1 deletion R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,8 @@ compute_size <- function(shape) {
#' @param val The value to check
#' @return Whether the value is NA
is_na <- function(val) {
if(length(val) > 1) {
if(length(val) != 1) {
# Including when val is integer(0), character(0), etc.
return(FALSE)
} else {
return(is.na(val))
Expand Down
21 changes: 16 additions & 5 deletions R/zarr-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,18 @@ ZarrArray <- R6::R6Class("ZarrArray",
meta_bytes <- private$store$get_item(mkey)
meta <- private$store$metadata_class$decode_array_metadata(meta_bytes)
private$meta <- meta
private$shape <- meta$shape
private$chunks <- meta$chunks
if(is.list(meta$shape)) {
private$shape <- as.integer(meta$shape)
} else {
# meta$shape might be null.
private$shape <- meta$shape
}
if(is.list(meta$chunks)) {
private$chunks <- as.integer(meta$chunks)
} else {
# meta$chunks might be null.
private$chunks <- meta$chunks
}
private$fill_value <- meta$fill_value
private$order <- meta$order
if("dimension_separator" %in% names(meta) && !is.na(meta$dimension_separator) && !is.null(meta$dimension_separator)) {
Expand All @@ -98,19 +108,20 @@ ZarrArray <- R6::R6Class("ZarrArray",
} else {
private$compressor <- get_codec(meta$compressor)
}
object_codec <- NA
if(is_na(meta$filters) || is.null(meta$filters)) {
private$filters <- NA
object_codec <- NA
} else {
private$filters <- list()
for(config in meta$filters) {
append(private$filters, get_codec(config))
private$filters <- append(private$filters, get_codec(config))
}
if(length(private$filters) == 1) {
object_codec <- private$filters[[1]]
}
}
private$dtype <- normalize_dtype(meta$dtype, filters = private$filters)
private$dtype <- normalize_dtype(meta$dtype, object_codec = object_codec)
},
load_metadata = function() {
private$load_metadata_nosync()
Expand Down Expand Up @@ -600,7 +611,7 @@ ZarrArray <- R6::R6Class("ZarrArray",
# raise RuntimeError('cannot read object array without object codec')

# ensure correct chunk shape
return(as.raw(chunk))
return(chunk)
},
encode_chunk = function(chunk_as_raw) {
# Reference: https://github.com/zarr-developers/zarr-python/blob/5dd4a0e6cdc04c6413e14f57f61d389972ea937c/zarr/core.py#L2105
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ print(selection$data)
| `Unicode` | ✔ / ✔ | Converted to `character` in R. |
| `void *` | ❌ / ❌ | |
| Structured data types | ❌ / ❌ | |
| Object data types | ❌ / ❌ | [On roadmap](https://github.com/keller-mark/pizzarr/issues/22) |
| Object data type - [VLenUTF8](https://numcodecs.readthedocs.io/en/stable/vlen.html#vlenutf8) | ✔ / ✔ | Converted to `character` in R. |


Note: no effort is made to assess loss of precision due to conversion.
Expand Down
Loading

0 comments on commit 1a8e187

Please sign in to comment.