forked from rust-lang/rust
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Vector Stubbing for the Rust Standard Library * This PR includes stub implementations for the Vector module for the Standard Library. * It includes 3 abstractions - RMCVec, CVec and NobackVec. * It also includes an experimental CHashSet implementation. Co-authored-by: Chinmay <chindesh@amazon.com> Co-authored-by: Daniel Schwartz-Narbonne <danielsn@users.noreply.github.com>
- Loading branch information
Showing
13 changed files
with
2,071 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
||
#include <stdint.h> | ||
#include <assert.h> | ||
#include <stdlib.h> | ||
|
||
// This HashSet stub implementation is supposed to work with c_hashset.rs. | ||
// Please refer to that file for an introduction to the idea of a HashSet and | ||
// some other implemenntation details. Public methods defined in c_hashset.rs | ||
// act as wrappers around methods implemented here. | ||
|
||
// As noted before, this HashSet implementation is specifically for inputs which | ||
// are u16. The details below can be extended to larger sets if necessary. The | ||
// domain of the output is i16. | ||
// | ||
// The hash function that we choose is the identity function. | ||
// For all input x, hasher(x) = x. For our case, this satisfies all the | ||
// requirements of an ideal hash function, it is 1:1 and there exists only one | ||
// element in the input domain for each value in the output domain. | ||
// | ||
// An important thing to note here is that the hash function can be | ||
// appropriately modified depending on the type of the input value which is | ||
// stored in the hashset. As an example, if the HashSet needs to store a tuple | ||
// of integers, say <u32, u32>, the hash function can be modified to be: | ||
// | ||
// hash((x, y)) = prime * x + y; | ||
// | ||
// Although this value can be greater than the chosen output domain, the | ||
// function is still sound if the value wraps around because it guarantees a | ||
// unique output for a given pair of x and y. | ||
// | ||
// Another way to think about this problem could be through the lens of | ||
// uninterpreted functions where : if x == y => f(x) == f(y). Exploring this can | ||
// be future work. The idea would be to implement a HashSet similar to that seen | ||
// in functional programming languages. | ||
// | ||
// For the purpose of a HashSet, we dont necessarily need a SENTINEL outside the | ||
// range of the hashing function because of the way we design the HashSet | ||
// operations. | ||
const uint16_t SENTINEL = 1; | ||
|
||
uint16_t hasher(uint16_t value) { | ||
return value; | ||
} | ||
|
||
// We initialize all values of the domain to be 0 by initializing it with | ||
// calloc. This lets us get around the problem of looping through all elements | ||
// to initialize them individually with a special value. | ||
// | ||
// The domain array is to be interpreted such that | ||
// if domain[index] != 0, value such that hash(value) = index is present. | ||
// | ||
// However, this logic does not work for the value 0. For this, we choose the | ||
// SENTINEL value to initialize that element. | ||
typedef struct { | ||
uint16_t* domain; | ||
} hashset; | ||
|
||
// Ideally, this approach is much more suitable if we can work with arrays of | ||
// arbitrary size, specifically infinity. This would allow us to define hash | ||
// functions for any type because the output domain can be considered to be | ||
// infinite. However, CBMC currently does not handle unbounded arrays correctly. | ||
// Please see: https://github.com/diffblue/cbmc/issues/6261. Even in that case, | ||
// we might run into theoretical limitations of how solvers handle uninterpreted | ||
// symbols such as unbounded arrays. For the case of this API, the consumer can | ||
// request for an arbitrary number of HashSets which can be dynamically chosen. | ||
// As a result, the solver cannot know apriori how many unbounded arrays it | ||
// needs to initialize which might lead to errors. | ||
// | ||
// Firecracker uses HashSet<u32> (src/devices/src/virtio/vsock/unix/muxer.rs). | ||
// But for working with u32s, we run into the problem that the entire domain | ||
// cannot be allocated through malloc. We run into the error "array too large | ||
// for flattening". For that reason, we choose to work with u16 to demonstrate | ||
// the feasability of this approach. However, it should be extensible to other | ||
// integer types. | ||
// | ||
// Returns: pointer to a hashset instance which tracks the domain memory. This | ||
// pointer is used in later callbacks such as insert() and remove(). | ||
hashset* hashset_new() { | ||
hashset* set = (hashset *) malloc(sizeof(hashset)); | ||
// Initializes value all indexes to be 0, indicating that those elements are | ||
// not present in the HashSet. | ||
set->domain = calloc(UINT16_MAX, sizeof(uint16_t)); | ||
// For 0, choose another value to achieve the same. | ||
set->domain[0] = SENTINEL; | ||
return set; | ||
} | ||
|
||
// For insert, we need to first check if the value exists in the HashSet. If it | ||
// does, we immediately return a 0 (false) value back. | ||
// | ||
// If it doesnt, then we mark that element of the domain array with the value to | ||
// indicate that this element has been inserted. For element 0, we mark it with | ||
// the SENTINEL. | ||
// | ||
// To check if a value exists, we simply check if domain[hash] != 0 and | ||
// in the case of 0 if domain[0] != SENTINEL. | ||
// | ||
// Returns: an integer value 1 or 0. If the value is already present in the | ||
// hashset, this function returns a 0. If the value is sucessfully inserted, we | ||
// return a 1. | ||
uint32_t hashset_insert(hashset* s, uint16_t value) { | ||
uint16_t hash = hasher(value); | ||
|
||
if ((hash == 0 && s->domain[hash] != SENTINEL) || | ||
(hash !=0 && s->domain[hash] != 0)) { | ||
return 0; | ||
} | ||
|
||
s->domain[hash] = value; | ||
return 1; | ||
} | ||
|
||
// We perform a similar check here as described in hashset_insert(). We do not | ||
// duplicate code so as to not compute the hash twice. This can be improved. | ||
// | ||
// Returns: an integer value 1 or 0. If the value is present in the hashset, | ||
// this function returns a 1, otherwise 0. | ||
uint32_t hashset_contains(hashset* s, uint16_t value) { | ||
uint16_t hash = hasher(value); | ||
|
||
if ((hash == 0 && s->domain[hash] != SENTINEL) || | ||
(hash != 0 && s->domain[hash] != 0)) { | ||
return 1; | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
// We check if the element exists in the array. If it does not, we return a 0 | ||
// (false) value back. If it does, we mark it with 0 and in the case of 0, we | ||
// mark it with the SENTINEL and return 1. | ||
// | ||
// Returns: an integer value 1 or 0. If the value is not present in the hashset, | ||
// this function returns a 0. If the value is sucessfully removed from the | ||
// hashset, it returns a 1. | ||
uint32_t hashset_remove(hashset* s, uint16_t value) { | ||
uint16_t hash = hasher(value); | ||
|
||
if ((hash == 0 && s->domain[hash] == SENTINEL) || | ||
(hash !=0 && s->domain[hash] == 0)) { | ||
return 0; | ||
} | ||
|
||
if (hash == 0) { | ||
s->domain[hash] = SENTINEL; | ||
} else { | ||
s->domain[hash] = 0; | ||
} | ||
|
||
return 1; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
||
#include <stdint.h> | ||
#include <assert.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
|
||
// This Vector stub implementation is supposed to work with c_vec.rs. Please | ||
// refer to that file for a detailed explanation about the workings of this | ||
// abstraction. Public methods implemented in c_vec.rs act as wrappers around | ||
// methods implemented here. | ||
|
||
// __CPROVER_max_malloc_size is dependent on the number of offset bits used to | ||
// represent a pointer variable. By default, this is chosen to be 56, in which | ||
// case the max_malloc_size is 2 ** (offset_bits - 1). We could go as far as to | ||
// assign the default capacity to be the max_malloc_size but that would be overkill. | ||
// Instead, we choose a high-enough value 2 ** 10. Another reason to do | ||
// this is that it would be easier for the solver to reason about memory if multiple | ||
// Vectors are initialized by the abstraction consumer. | ||
// | ||
// For larger array sizes such as 2 ** (31 - 1) we encounter "array size too large | ||
// for flattening" error. | ||
#define DEFAULT_CAPACITY 1024 | ||
#define MAX_MALLOC_SIZE 18014398509481984 | ||
|
||
// A Vector is a dynamically growing array type with contiguous memory. We track | ||
// allocated memory, the length of the Vector and the capacity of the | ||
// allocation. | ||
|
||
// As can be seen from the pointer to mem (unint32_t*), we track memory in terms | ||
// of words. The current implementation works only if the containing type is | ||
// u32. This was specifically chosen due to a use case seen in the Firecracker | ||
// codebase. This structure is used to communicate over the FFI boundary. | ||
// Future work: | ||
// Ideally, the pointer to memory would be uint8_t* - representing that we treat | ||
// memory as an array of bytes. This would allow us to be generic over the type | ||
// of the element contained in the Vector. In that case, we would have to treat | ||
// every sizeof(T) bytes as an indivdual element and cast memory accordingly. | ||
typedef struct { | ||
uint32_t* mem; | ||
size_t len; | ||
size_t capacity; | ||
} vec; | ||
|
||
// The grow operation resizes the vector and copies its original contents into a | ||
// new allocation. This is one of the more expensive operations for the solver | ||
// to reason about and one way to get around this problem is to use a large | ||
// allocation size. We also implement sized_grow which takes a argument | ||
// definining the minimum number of additional elements that need to be fit into | ||
// the Vector memory. This aims to replicate behavior as seen in the Rust | ||
// standard library where the size of the vector is decided based on the | ||
// following equation: | ||
// new_capacity = max(capacity * 2, capacity + additional). | ||
// Please refer to method amortized_grow in raw_vec.rs in the Standard Library | ||
// for additional information. | ||
// The current implementation performance depends on CBMCs performance about | ||
// reasoning about realloc. If CBMC does better, do would we in the case of | ||
// this abstraction. | ||
// | ||
// One important callout to make here is that because we allocate a large enough | ||
// buffer, we cant reason about buffer overflow bugs. This is because the | ||
// allocated memory will (most-likely) always have enough space allocated after | ||
// the required vec capacity. | ||
// | ||
// Future work: | ||
// Ideally, we would like to get around the issue of resizing altogether since | ||
// CBMC supports unbounded arrays. In that case, we would allocate memory of | ||
// size infinity and work with that. For program verification, this would | ||
// optimize a lot of operations since the solver does not really have to worry | ||
// about the bounds of memory. The appropriate constant for capacity would be | ||
// __CPROVER_constant_infinity_uint but this is currently blocked due to | ||
// incorrect translation of the constant: https://github.com/diffblue/cbmc/issues/6261. | ||
// | ||
// Another way to approach this problem would be to implement optimizations in | ||
// the realloc operation of CBMC. Rather than allocating a new memory block and | ||
// copying over elements, we can track only the end pointer of the memory and | ||
// shift it to track the new length. Since this behavior is that of the | ||
// allocator, the consumer of the API is blind to it. | ||
void vec_grow_exact(vec* v, size_t new_cap) { | ||
uint32_t* new_mem = (uint32_t* ) realloc(v->mem, new_cap * sizeof(*v->mem)); | ||
|
||
v->mem = new_mem; | ||
v->capacity = new_cap; | ||
} | ||
|
||
void vec_grow(vec* v) { | ||
size_t new_cap = v->capacity * 2; | ||
if (new_cap > MAX_MALLOC_SIZE) { | ||
// Panic if the new size requirement is greater than max size that can | ||
// be allocated through malloc. | ||
assert(0); | ||
} | ||
|
||
vec_grow_exact(v, new_cap); | ||
} | ||
|
||
void vec_sized_grow(vec* v, size_t additional) { | ||
size_t min_cap = v->capacity + additional; | ||
size_t grow_cap = v->capacity * 2; | ||
|
||
// This resembles the Rust Standard Library behavior - amortized_grow in | ||
// alloc/raw_vec.rs | ||
// | ||
// Reference: https://doc.rust-lang.org/src/alloc/raw_vec.rs.html#421 | ||
size_t new_cap = min_cap > grow_cap ? min_cap : grow_cap; | ||
if (new_cap > MAX_MALLOC_SIZE) { | ||
// Panic if the new size requirement is greater than max size that can | ||
// be allocated through malloc. | ||
assert(0); | ||
} | ||
|
||
vec_grow_exact(v, new_cap); | ||
} | ||
|
||
vec* vec_new() { | ||
vec *v = (vec *) malloc(sizeof(vec)); | ||
// Default size is DEFAULT_CAPACITY. We compute the maximum number of | ||
// elements to ensure that allocation size is aligned. | ||
size_t max_elements = DEFAULT_CAPACITY / sizeof(*v->mem); | ||
v->mem = (uint32_t *) malloc(max_elements * sizeof(*v->mem)); | ||
v->len = 0; | ||
v->capacity = max_elements; | ||
// Return a pointer to the allocated vec structure, which is used in future | ||
// callbacks. | ||
return v; | ||
} | ||
|
||
vec* vec_with_capacity(size_t capacity) { | ||
vec *v = (vec *) malloc(sizeof(vec)); | ||
if (capacity > MAX_MALLOC_SIZE) { | ||
// Panic if the new size requirement is greater than max size that can | ||
// be allocated through malloc. | ||
assert(0); | ||
} | ||
|
||
v->mem = (uint32_t *) malloc(capacity * sizeof(*v->mem)); | ||
v->len = 0; | ||
v->capacity = capacity; | ||
return v; | ||
} | ||
|
||
void vec_push(vec* v, uint32_t elem) { | ||
// If we have already reached capacity, resize the Vector before | ||
// pushing in new elements. | ||
if (v->len == v->capacity) { | ||
// Ensure that we have capacity to hold atleast one more element | ||
vec_sized_grow(v, 1); | ||
} | ||
|
||
v->mem[v->len] = elem; | ||
v->len += 1; | ||
} | ||
|
||
uint32_t vec_pop(vec* v) { | ||
assert(v->len > 0); | ||
v->len -= 1; | ||
|
||
return v->mem[v->len]; | ||
} | ||
|
||
void vec_append(vec* v1, vec* v2) { | ||
// Reserve enough space before adding in new elements. | ||
vec_sized_grow(v1, v2->len); | ||
// Perform a memcpy of elements which is cheaper than pushing each element | ||
// at once. | ||
memcpy(v1->mem + v1->len, v2->mem, v2->len * sizeof(*v2->mem)); | ||
v1->len = v1->len + v2->len; | ||
} | ||
|
||
size_t vec_len(vec* v) { | ||
return v->len; | ||
} | ||
|
||
size_t vec_cap(vec* v) { | ||
return v->capacity; | ||
} | ||
|
||
void vec_free(vec* v) { | ||
free(v->mem); | ||
free(v); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Verification-friendly Vector stubs | ||
---------- | ||
|
||
|
Oops, something went wrong.