VLen-UTF8 codec. Refactoring and bug fixes (#55)

keller-mark · Sep 18, 2023 · 1a8e187 · 1a8e187
1 parent ff42619
commit 1a8e187
Show file tree

Hide file tree

Showing 46 changed files with 1,356 additions and 281 deletions.
diff --git a/R/array-nested.R b/R/array-nested.R
@@ -104,7 +104,6 @@ NestedArray <- R6::R6Class("NestedArray",
         self$data <- data # TODO?
       } else if(!is.raw(data) && (is.array(data) || is.vector(data)) && is.atomic(data)) {
         # Create array from R atomic vector or array().
-
         num_shape_elements <- compute_size(shape)
         # Check that data array has same shape as expected
         if(!is.null(dim(data)) && all(ensure_vec(dim(data)) == ensure_vec(shape))) {
@@ -114,6 +113,7 @@ NestedArray <- R6::R6Class("NestedArray",
           astype_func <- self$dtype_obj$get_asrtype()
           self$data <- array(data=as.array(astype_func(data)), dim=shape)
         }
+        # TODO: account for order == "C"?
       } else if(is.raw(data)) {
         # Create array from a raw vector.
 
@@ -270,6 +270,11 @@ NestedArray <- R6::R6Class("NestedArray",
     flatten_to_raw = function(order = NA) {
       data_as_vec <- self$flatten(order = order)
 
+      if(self$dtype_obj$is_object) {
+        # The object_codec in filters will handle the conversion to raw.
+        return(data_as_vec)
+      }
+
       endian <- self$dtype_obj$byte_order
       # Normalize to only "little" or "big" since this is what writeBin accepts.
       if(endian == "nr") {

diff --git a/R/dtypes.R b/R/dtypes.R
@@ -23,7 +23,7 @@ is_structured_dtype <- function(dtype) {
 #' @keywords internal
 get_dtype_parts <- function(dtype) {
   # TODO: support object dtype (without digits required in regex)
-  dtype_regex <- "^(\\||>|<)(b|i|u|f|c|m|M|S|U|V|O)(\\d+)"
+  dtype_regex <- "^(\\||>|<)(b|i|u|f|c|m|M|S|U|V|O)(\\d+)?"
   if(stringr::str_detect(dtype, dtype_regex)) {
     dtype_matches <- stringr::str_match(dtype, dtype_regex)
     basic_type <- dtype_matches[1,3]
@@ -32,12 +32,19 @@ get_dtype_parts <- function(dtype) {
     } else {
       byte_multiplier <- 1
     }
-    num_items <- as.integer(dtype_matches[1,4])
+    if(!is.null(dtype_matches[1,4])) {
+      num_items <- as.integer(dtype_matches[1,4])
+      num_bytes <- num_items * byte_multiplier
+    } else {
+      # Support object dtype
+      num_items <- NA
+      num_bytes <- NA
+    }
     result <- list(
       dtype_str = dtype,
       byte_order = dtype_matches[1,2],
       basic_type = dtype_matches[1,3],
-      num_bytes = num_items * byte_multiplier,
+      num_bytes = num_bytes,
       num_items = num_items
     )
     return(result)
@@ -48,16 +55,15 @@ get_dtype_parts <- function(dtype) {
 
 #' @keywords internal
 check_dtype_support <- function(dtype_parts) {
-  if(!is_na(dtype_parts) && dtype_parts$basic_type %in% c("b", "i", "u", "f", "S", "U")) {
+  if(!is_na(dtype_parts) && dtype_parts$basic_type %in% c("b", "i", "u", "f", "S", "U", "O")) {
     return(TRUE)
   }
   stop(paste("Unsupported dtype:", dtype_parts))
   return(FALSE)
 }
 
 #' @keywords internal
-get_dtype_rtype <- function(dtype) {
-  dtype_parts <- get_dtype_parts(dtype)
+get_dtype_rtype <- function(basic_type) {
 
   # Reference: https://github.com/gzuidhof/zarr.js/blob/292804/src/nestedArray/types.ts#L32
   BASICTYPE_RTYPE_MAPPING <- list(
@@ -66,10 +72,11 @@ get_dtype_rtype <- function(dtype) {
     "i" = integer(),
     "f" = double(),
     "S" = character(),
-    "U" = character()
+    "U" = character(),
+    "O" = character() # TODO: will object always be character?
   )
 
-  return(BASICTYPE_RTYPE_MAPPING[[dtype_parts$basic_type]])
+  return(BASICTYPE_RTYPE_MAPPING[[basic_type]])
 }
 
 #' @keywords internal
@@ -100,7 +107,8 @@ get_dtype_signed <- function(dtype) {
     "i" = TRUE,
     "f" = TRUE,
     "S" = FALSE, # TODO: is this correct?
-    "U" = FALSE  # TODO: is this correct?
+    "U" = FALSE,  # TODO: is this correct?
+    "O" = FALSE
   )
   return(DTYPE_SIGNED_MAPPING[[dtype_parts$basic_type]])
 }
@@ -117,16 +125,12 @@ get_dtype_asrtype <- function(dtype) {
     "i" = as.integer,
     "f" = as.double,
     "S" = as.character,
-    "U" = as.character
+    "U" = as.character,
+    "O" = as.character
   )
   return(DTYPE_RTYPE_MAPPING[[dtype_parts$basic_type]])
 }
 
-#' @keywords internal
-get_typed_array_ctr <- function(dtype) {
-  rtype <- get_dtype_rtype(dtype)
-  return(function(dim) array(data = rtype, dim = dim))
-}
 
 # Reference: https://numpy.org/doc/stable/reference/arrays.dtypes.html
 
@@ -165,8 +169,6 @@ Dtype <- R6::R6Class("Dtype",
     initialize = function(dtype, object_codec = NA) {
       self$dtype <- dtype
 
-      # TODO: support dtype_str == "|O" for object dtypes / dont require numeric part of dtype string
-
       dtype_parts <- get_dtype_parts(dtype)
       check_dtype_support(dtype_parts)
       self$byte_order <-  get_dtype_endianness(dtype)
@@ -178,14 +180,13 @@ Dtype <- R6::R6Class("Dtype",
       self$is_structured <- is_structured_dtype(dtype)
       self$is_object <- (self$basic_type == "O")
 
-      # TODO: port code from normalize_dtype in zarr-python
       self$object_codec <- object_codec
     },
     get_asrtype = function() {
       return(get_dtype_asrtype(self$dtype))
     },
     get_rtype = function() {
-      return(get_dtype_rtype(self$dtype))
+      return(get_dtype_rtype(self$basic_type))
     },
     get_typed_array_ctr = function() {
       rtype <- self$get_rtype()

diff --git a/R/meta.R b/R/meta.R
@@ -10,7 +10,7 @@ Metadata2 <- R6::R6Class("Metadata2",
             if(is.list(s)) {
                 return(s)
             } else {
-                return(jsonlite::fromJSON(rawToChar(s)))
+                return(jsonlite::fromJSON(rawToChar(s), simplifyVector = FALSE))
             }
         },
         encode_metadata = function(meta) {

diff --git a/R/normalize.R b/R/normalize.R
@@ -103,17 +103,13 @@ normalize_shape <- function(shape) {
 }
 
 #' @keywords internal
-normalize_dtype <- function(dtype, object_codec = NA, filters = NA) {
+normalize_dtype <- function(dtype, object_codec = NA) {
   # Reference: https://github.com/zarr-developers/zarr-python/blob/5dd4a0e6cdc04c6413e14f57f61d389972ea937c/zarr/util.py#L152
 
-  if(!is_na(object_codec) && !is_na(filters)) {
-    stop("expected only one of object_codec and filters to be specified in normalize_dtype")
-  }
-
   if(is_na(dtype)) {
     # np.dtype(None) returns 'float64'
-    if(!is_na(object_codec) || !is_na(filters)) {
-      stop("expected object_codec and filters to be NA due to NA dtype")
+    if(!is_na(object_codec)) {
+      stop("expected object_codec to be NA due to NA dtype")
     }
     return(Dtype$new("<f8"))
   }
@@ -125,17 +121,6 @@ normalize_dtype <- function(dtype, object_codec = NA, filters = NA) {
   }
 
   if(is.character(dtype)) {
-    if(!is_na(filters)) {
-      dtype_init <- Dtype$new(dtype)
-      if(dtype_init$is_object) {
-        # Object ("|O") dtype should have one filter codec.
-        if(length(filters) == 1) {
-          return(Dtype$new(dtype, object_codec = filters[[1]]))
-        } else {
-          stop("expected filters list to have length 1 for object dtype")
-        }
-      }
-    }
     # Filter list was NA but there could be non-NA object_codec parameter.
     return(Dtype$new(dtype, object_codec = object_codec))
   }

diff --git a/R/numcodecs.R b/R/numcodecs.R
@@ -374,6 +374,114 @@ BloscCodec <- R6::R6Class("BloscCodec",
   )
 )
 
+#' Variable-length UTF-8 codec for Zarr
+#' @title VLenUtf8Codec Class
+#' @docType class
+#' @description
+#' Class representing a VLenUtf8 compressor
+#'
+#' @rdname VLenUtf8Codec
+#' @export
+VLenUtf8Codec <- R6::R6Class("VLenUtf8Codec",
+  inherit = Codec,
+  public = list(
+    # Encode a character vector as raw bytes: a 4-byte little-endian item
+    # count, then per item a 4-byte little-endian byte length and its bytes.
+    # NA/NULL items are written as empty strings. zarr_arr is not used here.
+    encode = function(vec_of_strings, zarr_arr) {
+      # Kind: array to bytes
+      # Reference: https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/vlen.pyx#L74
+
+      num_strings <- length(vec_of_strings)
+
+      # first iteration to convert to bytes
+      encoded_values <- vector("list", num_strings)
+      for(i in seq_len(num_strings)) {
+        orig_str <- vec_of_strings[[i]]
+        if(is.null(orig_str) || is.na(orig_str)) {
+          # treat these as missing value, normalize
+          orig_str <- ""
+        }
+        # enc2utf8 so the bytes are UTF-8 whatever the native encoding is.
+        encoded_values[[i]] <- charToRaw(enc2utf8(orig_str))
+      }
+      encoded_lengths <- lengths(encoded_values)
+
+      # setup output: 4 header bytes + 4 length bytes per item + payload
+      total_length <- 4 + 4 * num_strings + sum(encoded_lengths)
+      out <- raw(total_length)
+
+      # write header
+      out[1:4] <- writeBin(num_strings, con = raw(), size = 4, endian = "little")
+
+      # second iteration, store data
+      pos <- 4
+      for(i in seq_len(num_strings)) {
+        l <- encoded_lengths[i]
+        out[(pos+1):(pos+4)] <- writeBin(l, con = raw(), size = 4, endian = "little")
+        pos <- pos + 4
+        if(l > 0) {
+          # Guard: for l == 0, (pos+1):(pos+l) is a descending 2-element
+          # range and assigning zero bytes to it is an error.
+          out[(pos+1):(pos+l)] <- encoded_values[[i]]
+          pos <- pos + l
+        }
+      }
+
+      return(out)
+    },
+    # Decode raw bytes laid out as written by encode() back into a
+    # character vector. zarr_arr is not used here.
+    decode = function(buf, zarr_arr) {
+      # Kind: bytes to array
+      # References:
+      # - https://github.com/manzt/zarrita.js/blob/050d128265af14ff3c82e125315f3f527112887d/packages/core/src/codecs/vlen-utf8.ts
+      # - https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/vlen.pyx#L132
+
+      num_strings <- readBin(
+        con = buf,
+        what = integer(),
+        size = 4,
+        n = 1,
+        signed = TRUE,
+        endian = "little"
+      )
+
+      # Preallocate with "" so zero-length items need no write.
+      vec_of_strings <- character(num_strings)
+
+      pos <- 4
+      for(i in seq_len(num_strings)) {
+        num_chars <- readBin(
+          con = buf[(pos+1):(pos+4)],
+          what = integer(),
+          size = 4,
+          n = 1,
+          signed = TRUE,
+          endian = "little"
+        )
+        pos <- pos + 4
+        if(num_chars > 0) {
+          # Guard: for 0 bytes the range below would run backwards.
+          item <- rawToChar(buf[(pos+1):(pos+num_chars)])
+          Encoding(item) <- "UTF-8"
+          vec_of_strings[i] <- item
+          pos <- pos + num_chars
+        }
+      }
+
+      return(vec_of_strings)
+    },
+    # Return the codec config (only the codec id).
+    get_config = function() {
+       meta <- list(
+         id = jsonlite::unbox("vlen-utf8")
+       )
+       return(meta)
+    }
+  )
+)
+
 #' Get a codec instance from the registry.
 #'
 #' @param config A codec config as a named list.
@@ -393,8 +501,10 @@ get_codec <- function(config) {
       result <- do.call(GzipCodec$new, config)
     } else if(codec_id == "lzma") {
       result <- do.call(LzmaCodec$new, config)
-    }  else if(codec_id == "blosc") {
+    } else if(codec_id == "blosc") {
       result <- do.call(BloscCodec$new, config)
+    } else if(codec_id == "vlen-utf8") {
+      result <- do.call(VLenUtf8Codec$new, config)
     } else {
       stop(paste("Unknown codec", codec_id))
     }

diff --git a/R/utils.R b/R/utils.R
@@ -232,7 +232,8 @@ compute_size <- function(shape) {
 #' @param val The value to check
 #' @return Whether the value is NA
 is_na <- function(val) {
-  if(length(val) > 1) {
+  if(length(val) != 1) {
+    # Including when val is integer(0), character(0), etc.
     return(FALSE)
   } else {
     return(is.na(val))

diff --git a/R/zarr-array.R b/R/zarr-array.R
@@ -83,8 +83,18 @@ ZarrArray <- R6::R6Class("ZarrArray",
       meta_bytes <- private$store$get_item(mkey)
       meta <- private$store$metadata_class$decode_array_metadata(meta_bytes)
       private$meta <- meta
-      private$shape <- meta$shape
-      private$chunks <- meta$chunks
+      if(is.list(meta$shape)) {
+        private$shape <- as.integer(meta$shape)
+      } else {
+        # meta$shape might be null.
+        private$shape <- meta$shape
+      }
+      if(is.list(meta$chunks)) {
+        private$chunks <- as.integer(meta$chunks)
+      } else {
+        # meta$chunks might be null.
+        private$chunks <- meta$chunks
+      }
       private$fill_value <- meta$fill_value
       private$order <- meta$order
       if("dimension_separator" %in% names(meta) && !is.na(meta$dimension_separator) && !is.null(meta$dimension_separator)) {
@@ -98,19 +108,20 @@ ZarrArray <- R6::R6Class("ZarrArray",
       } else {
         private$compressor <- get_codec(meta$compressor)
       }
+      object_codec <- NA
       if(is_na(meta$filters) || is.null(meta$filters)) {
         private$filters <- NA
         object_codec <- NA
       } else {
         private$filters <- list()
         for(config in meta$filters) {
-          append(private$filters, get_codec(config))
+          private$filters <- append(private$filters, get_codec(config))
         }
         if(length(private$filters) == 1) {
           object_codec <- private$filters[[1]]
         }
       }
-      private$dtype <- normalize_dtype(meta$dtype, filters = private$filters)
+      private$dtype <- normalize_dtype(meta$dtype, object_codec = object_codec)
     },
     load_metadata = function() {
       private$load_metadata_nosync()
@@ -600,7 +611,7 @@ ZarrArray <- R6::R6Class("ZarrArray",
           # raise RuntimeError('cannot read object array without object codec')
 
       # ensure correct chunk shape
-      return(as.raw(chunk))
+      return(chunk)
     },
     encode_chunk = function(chunk_as_raw) {
       # Reference: https://github.com/zarr-developers/zarr-python/blob/5dd4a0e6cdc04c6413e14f57f61d389972ea937c/zarr/core.py#L2105

diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ print(selection$data)
 | `Unicode`             |            ✔ / ✔             |  Converted to `character` in R.               |
 | `void *`              |            ❌ / ❌             |                                                                                                                                                                                 |
 | Structured data types |            ❌ / ❌             |   |
-| Object data types |            ❌ / ❌             |  [On roadmap](https://github.com/keller-mark/pizzarr/issues/22) |
+| Object data type - [VLenUTF8](https://numcodecs.readthedocs.io/en/stable/vlen.html#vlenutf8) |            ✔ / ✔             | Converted to `character` in R. |
 
 
 Note: no effort is made to assess loss of precision due to conversion.