Skip to content

Commit

Permalink
Add Enumerated Data Types
Browse files Browse the repository at this point in the history
TODO: Write a better commit message
  • Loading branch information
davisp committed Jun 14, 2023
1 parent ac8c852 commit 7937d8f
Show file tree
Hide file tree
Showing 68 changed files with 8,771 additions and 325 deletions.
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,12 @@ else()
elseif (CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
add_compile_options(-DNDEBUG -O3 -g3 -ggdb3 -gdwarf-3)
elseif (CMAKE_BUILD_TYPE MATCHES "Coverage")
add_compile_options(-DDEBUG -g3 -gdwarf-3 --coverage)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_compile_options(-DDEBUG -O0 -g3 --coverage -fprofile-instr-generate -fcoverage-mapping)
add_link_options(--coverage -fprofile-instr-generate -fcoverage-mapping)
else()
add_compile_options(-DDEBUG -g3 -gdwarf-3 --coverage)
endif()
endif()

# Use -Wno-literal-suffix on Linux with C++ sources.
Expand Down Expand Up @@ -341,6 +346,7 @@ list(APPEND TILEDB_C_API_RELATIVE_HEADERS
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/datatype/datatype_api_external.h"
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/dimension/dimension_api_external.h"
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/dimension_label/dimension_label_api_external.h"
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/enumeration/enumeration_api_experimental.h"
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/error/error_api_external.h"
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/filesystem/filesystem_api_enum.h"
"${CMAKE_SOURCE_DIR}/tiledb/api/c_api/filesystem/filesystem_api_external.h"
Expand Down
22 changes: 22 additions & 0 deletions PJD_TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Chores Left
===

* Testing
* Array schema evolution after schema is read from disk
* Throw error if enumeration has longer length than the integer width of the attribute
* Require the attribute type to be integral when setting an enumeration
* Don't allow signed integer values for attributes with enumerations? Does R need this?
* Require cell_val_num == 1 for attributes
* Don't forget to test drop and then add attribute with enumeration.
* ArraySchemaEvolution - serialization when adding the same enumeration to multiple attributes

* Missing APIs
* array schema evolution C API
* QueryCondition C API

* Miscellany
* Add Enumeration::dump(FILE* out)
* Document enumeration format changes in the storage format docs

* Future TODO:
* Update schema consolidation and vacuuming for enumerations
106 changes: 106 additions & 0 deletions examples/cpp_api/enumerations.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/**
* @file enumerations.cc
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2023 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* @section DESCRIPTION
*
* This example shows the use of enumerations in TileDB
*/

#include <iostream>
#include <tiledb/tiledb>
#include <tiledb/tiledb_experimental>

using namespace tiledb;

// Bucket that create_array() removes (if present) and re-creates through VFS.
std::string array_bucket = "s3://tiledb-davisp";
// URI passed to Array::create() when the array is first created.
// NOTE(review): appears to combine the REST namespace with the backing S3
// location — confirm against the TileDB Cloud URI conventions.
std::string create_uri = "tiledb://demo/s3://tiledb-davisp/enumerations";
// URI used by write_array() to open the array for writing.
std::string array_uri = "tiledb://demo/enumerations";

/**
 * Recreates the demo bucket and then creates a dense, one-dimensional array
 * whose single attribute is associated with a string enumeration.
 *
 * Any bucket already present at `array_bucket` is removed first. Progress
 * messages are written to stderr.
 *
 * @param cfg Configuration used to construct the Context.
 * @throws std::runtime_error if the bucket does not exist after the
 *     create_bucket call.
 */
void create_array(Config& cfg) {
  Context ctx(cfg);
  VFS vfs(ctx);

  // Drop a bucket left over from an earlier run before creating a new one.
  const bool stale_bucket = vfs.is_bucket(array_bucket);
  if (stale_bucket) {
    vfs.remove_bucket(array_bucket);
  }

  std::cerr << "Creating bucket." << std::endl;
  vfs.create_bucket(array_bucket);

  const bool bucket_created = vfs.is_bucket(array_bucket);
  if (!bucket_created) {
    throw std::runtime_error("Failed to create bucket.");
  }

  std::cerr << "Bucket exists." << std::endl;

  ArraySchema schema(ctx, TILEDB_DENSE);

  // Domain: a single int dimension named "dim" covering [1, 100].
  Domain domain(ctx);
  auto index_dim = Dimension::create<int>(ctx, "dim", {{1, 100}});
  domain.add_dimension(index_dim);
  schema.set_domain(domain);

  // The int attribute "attr" is added together with its enumeration values.
  auto attribute = Attribute::create<int>(ctx, "attr");
  std::vector<std::string> labels = {"fred", "wilma", "barney", "pebbles"};
  auto enumeration = Enumeration::create(ctx, labels);
  ArraySchemaExperimental::add_attribute(ctx, schema, attribute, enumeration);

  std::cerr << "Creating array" << std::endl;
  Array::create(create_uri, schema);
}

/**
 * Writes 100 int cells to the "attr" attribute of the array at `array_uri`.
 *
 * The i-th cell (i = 1..100) receives the value i % 4, so every written value
 * lies in [0, 3].
 *
 * @param cfg Configuration used to construct the Context.
 * @throws std::runtime_error if the submitted query does not report
 *     Query::Status::COMPLETE.
 */
void write_array(Config& cfg) {
  Context ctx(cfg);

  // Number of cells written; the counter is an int so that `i % 4` is stored
  // without an implicit size_t -> int narrowing conversion.
  constexpr int kNumCells = 100;

  std::vector<int> attr_data;
  attr_data.reserve(kNumCells);
  for (int i = 1; i <= kNumCells; ++i) {
    attr_data.push_back(i % 4);
  }

  Array array(ctx, array_uri, TILEDB_WRITE);
  Query query(ctx, array, TILEDB_WRITE);
  query.set_layout(TILEDB_ROW_MAJOR).set_data_buffer("attr", attr_data);
  if (query.submit() != Query::Status::COMPLETE) {
    throw std::runtime_error("Failed to write array.");
  }

  array.close();
}

/**
 * Entry point: configures the REST connection, then creates the example array
 * and writes data to it.
 *
 * @return 0 when both steps succeed, 1 when either step throws a
 *     std::exception (the message is printed to stderr).
 */
int main(int, char*[]) {
  Config cfg;
  // NOTE(review): these look like local demo credentials and a local REST
  // endpoint — confirm they are not meant for any shared deployment.
  cfg["rest.username"] = "demo";
  cfg["rest.password"] = "demodemodemodemo";
  cfg["rest.server_address"] = "http://localhost:8081";

  try {
    create_array(cfg);
    write_array(cfg);
  } catch (const std::exception& exc) {
    std::cerr << "XKCD: Exception: " << exc.what() << std::endl;
    // Report the failure to the caller instead of exiting with status 0.
    return 1;
  }

  return 0;
}
28 changes: 28 additions & 0 deletions format_spec/enumeration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
title: Enumerations
---

## Main Structure

```
my_array # array folder
| ...
|_ schema # ArraySchema directory named `__schema`
|_ enumerations # Enumeration directory named `__enumerations`
| |_ enumeration # enumeration data with names `__t1_t2_uuid_v`
```

Enumeration data is stored in a subdirectory of the [array schema](./array_schema.md)
directory. Enumerations are stored using [Generic Tiles](./generic_tile.md).
Data stored in the generic tile follows the current format as of version 19.
| **Field** | **Type** | **Description** |
| :--- | :--- | :--- |
| Version number | `uint32_t` | Format version number of the generic tile |
| Datatype | `uint8_t` | The datatype of the enumeration values |
| Cell Val Num | `uint32_t` | The cell val num of the enumeration values |
| Ordered | `bool` | Whether the enumeration values should be considered ordered |
| Data Size | `uint64_t` | The number of bytes used to store the values |
| Data | `uint8_t` * Data Size | The data for the enumeration values |
| Offsets Size | `uint64_t` | The number of bytes used to store offsets if cell_val_num is TILEDB_VAR_NUM |
| Offsets | `uint8_t` * Offsets Size | The offsets data for the enumeration if cell_val_num is TILEDB_VAR_NUM |
Loading

0 comments on commit 7937d8f

Please sign in to comment.