diff --git a/src/packages/rt-alloc.fdoc b/src/packages/rt-alloc.fdoc new file mode 100644 index 000000000..4206bbceb --- /dev/null +++ b/src/packages/rt-alloc.fdoc @@ -0,0 +1,722 @@ +@title Real Time Allocators +@tangler allocator.hpp = share/lib/rtl/rt/allocator.hpp +@tangler malloc_free.hpp = share/lib/rtl/rt/malloc_free.hpp +@tangler bump.hpp = share/lib/rtl/rt/bump.hpp +@tangler block.hpp = share/lib/rtl/rt/block.hpp +@tangler ts_allocator.hpp = share/lib/rtl/rt/ts_allocator.hpp +@tangler ring_allocator.hpp = share/lib/rtl/rt/ring_allocator.hpp +@tangler test01.cxx = $PWD/test01.cxx +@h1 Real Time Allocators +In programming systems including C and C++, user space allocation is usually handled +by a universal allocator such as @{malloc()}. The operation time and memory consumption +of these systems may be very good but it is rarely subject to bounds required +for real time performance assurances. In addition, such systems not only may but +generally must occasionally request more memory from the operating system, +and at unpredictable times. + +Our objective here is to produce a more manifestly performance bound allocation +system by observing that all programs are finite state machines, and by +calculating the memory required by a program we can allocate that memory +once and for all at start up. + +We then provide a collection of sub-allocators which can dispense and +retrieve some of this memory, which have predictable performance +characteristics, and which can be chosen by the application developer +to meet their needs in various circumstances. + +Before we start I will provide a single motivating use case where +complex real time processing is required: real time audio signal processing. +These systems can be required to do various tasks from the fairly simple, +such as mixing, to very difficult, such as frequency shifting. 
+Irrespective of the task, blocks of data are delivered as inputs and +synthesised blocks of data must be delivered as outputs with a bounded +maximum lag; ideally less than 0.1 seconds. Otherwise a disc jockey, for +example, would lose control of the sound. + +In such a system a lag due to, for example, memory allocation delays, +would be catastrophic: first it would lead to a total loss of sound +for a brief period, and then it would require discarding some input +in order to catch up. Together this would cause a very signiciant +glitch in the sound continuity which could actually damage not only +equipment but human ears as well. + +There are of course numerous applications requiring real time signal +processing including target acquisition and tracking in aviation, +and safety monitoring in a nuclear power plant. + +@h2 What does an allocator look like? +An allocator object in a C++ environment requires four core operations. +
    +
  1. Construction
  2. +
  3. Allocation method
  4. +
  5. Deallocation method
  6. +
  7. Destructor
  8. +
+ +As is usual in such systems the constructor chosen will determine the +performance characteristics, whilst the other methods will be virtual +functions so that the programmer can try different allocators +in different circumstances. In particular during development a developer +might use a generic alloctor with no interest in performance, +since it seems important to get the semantics of their application right first. + +Later, a generic allocator with profiling might be used to help decide +which kinds of allocators to use where and with what bounds, +and then delegating allocators that monitor correctness at some +expense, until the final performance allocators are used. + +@h2 What does suballocation imply? +In our system, we have to get memory from somewhere and there are two case: +
    +
  1. From outside our system
  2. +
  3. From another allocator
  4. +
+ +An allocator of the first kind is known as a root allocator. +We will later provide a root allocator called @{malloc_free} which use +the @{malloc} and @{free} functions from the C standard library +to obtain and release memory. Obviously in different application domains +other roots may be provided. + +An allocator of the second kind is known as a @{suballocator}. + +There are two kinds of allocators. +
    +
  1. A static allocator is given a fixed amount of storage +on construction and never requests any more.
  2. +
  3. A delegating suballocator may request more memory +during its lifetime. +
+ +In order to request more memory, a delegating allocator requires a set least +of references to other allocators, its delegates. Usually there will only +be one delegate, however we will present a fairly generally allocator later +which uses an array of delegates. + +@h2 Who allocates the allocator? +Since an allocator is an object, its store was allocated by another allocator. +There are three special case we need to think about: +
    +
  1. it was allocated in static storage,
  2. +
  3. it was allocated on the machine stack, or,
  4. +
  5. it was allocated by a foreign allocator.
  6. +
+Other than that, another of our allocators, possibly a root, allocated it, +called the parent. + +Now whoever allocated it may chose to share it with other components, +and may indeed disappear, in which case there is a danger that the parent allocator +is forgotten, in which case the child cannot be deallocated. + +However if we're using a reference counting system to manage it, +the allocator requires two things: +
    +
  1. a reference count
  2. +
  3. a reference to the parent
  4. +
+ +In this case, the smart pointers managing it will do their usual thing, +but when the last reference to it is about to be destroyed, it can +delete the object using the parent allocator refered to in the object itself. +Of course we could put that informating into the smart pointers instead +but that seems wasteful. + +@h2 A smart pointer to allocators, part 1. +We're now going to provide a reference counting smart pointer +specialised to allocators. We have to split the code into two parts +because there is a circular relation between the allocator reference +type and the allocator it refers to. + + +@tangle allocator.hpp +#ifndef ALLOCATOR +#define ALLOCATOR +#include + +// forward decl +struct allocator_t; + +// smart pointer to allocators +struct alloc_ref_t { + // Uninitialised references refer to nothing + alloc_ref_t() : allocator(nullptr) {} + + // rvalue move, does not change reference counts + // ensures the source is nullified + alloc_ref_t(alloc_ref_t &&p) { + if(p.allocator) { + allocator = p.allocator; + p.allocator = nullptr; + } + else allocator = nullptr; + } + + // lvalue copy has to be defered + // because it adds 1 to the reference count + alloc_ref_t(alloc_ref_t &p); + + // rvalue assign + // PROOF of correctness: + // If the source allocator and target allocator are the same + // then the reference count must be at least 2 + // so destroying the target (reducing the ref count by 1) will not + // destroy the allocator. 
+ void operator= (alloc_ref_t &&p) { + if (&p!=this) { // ignore self assign + this->~alloc_ref_t(); // destroy target + new(this) alloc_ref_t(::std::move(p)); // move source to target + } + } // rass + + // lvalue assign + void operator= (alloc_ref_t &p) { + if(&p!=this) { // ignore self assign + this->~alloc_ref_t(); // destroy target + new(this) alloc_ref_t(p); // copy source to target + } + } // lass + void *allocate(size_t); + void deallocate(void *, size_t); + + // destructor has to be defered + // uses parent allocator passed to allocator on construction to delete it + ~alloc_ref_t(); + +private: + allocator_t *allocator; + + // an reference is constructed from a pointer to an allocator + // the constructor of which is required to set the reference + // count to 1 + alloc_ref_t(allocator_t *p) : allocator(p) {} + + // only an allocator_t can construct a populated reference + friend class allocator_t; +}; +@ + +@h2 The allocator abstraction +Now we will present the allocator abstraction. +There are two concrete data types required for all allocator in our +design: a reference count, and a reference to the parent allocator. + +The parent may not exist, if the allocator had a foreign parent, +or it was not heap allocated, in which case the parent variable +will just be null. + +There is an onerous requirement here: when some code uses +a parent allocator to allocate the store for this allocator, +the parent must be passed to the allocator explicitly, so that the +same parent can be used to dispose of the store on deletion. +The pointer used for that has to be our reference counting +smart pointer, and that will increment the reference count +of the parent allocator to ensure it remains live whilst +its child does. + +In other words parents must always outlive their children. + +As a final note, there is an important method @{size()} +which reports the size in bytes of the object. 
In our
+system this will be a universal requirement for all
+polymorphic objects, and its definition is boilerplate.
+It's very unfortunate that C++ @{type_info} is deficient
+and doesn't provide a way to obtain this information.
+
+Some allocator systems such as @{malloc_free} record
+the size of each allocation internally when @{malloc()} is called,
+so that the size does not have to be provided when @{free()} is
+called. However most of our allocators do not store the size,
+and it must be provided to the @{deallocate()} method.
+
+As a final note which will alarm the astute
+reader: @{allocate()}, @{deallocate()}
+and the destructor do not have any specified semantics!
+
+You may think @{allocate()} should be required to return
+a pointer to some memory, but this is not so. The allocator
+may have run out of resources. It could return garbage
+or a @{nullptr}.
+
+Similarly @{deallocate} does not have to do anything
+in general. For example a bump allocator just
+bumps a pointer to perform an allocation and ignores
+any @{deallocate()} requests. Instead it may return
+a whole block of memory to its parent in the destructor.
+
+So, our objective is to provide a uniform interface,
+but the actual semantics depend on the concrete allocator
+actually constructed, and that includes data passed
+to the allocator, such as the size of a block that it
+may be programmed to manage. 
+ + + +@tangle allocator.hpp +struct allocator_t { + alloc_ref_t parent; + + // these constructors can't be used directly by the public + // even though they're public, because this class is abstract + + // parentless construction + allocator_t() : refcnt(1) {} + + // construction by a parent + allocator_t(alloc_ref_t p) : parent(p), refcnt(1) {} + + // destructor does nothing special but the parent will be destroyed + virtual ~allocator_t(){} + + // must report the size in bytes of the object + virtual size_t size()const=0; + +protected: + // this is the only way to create a populated alloc_ref_t + // since it's protected, only classes derived from allocator_t can do it + static alloc_ref_t create(allocator_t *p) { return alloc_ref_t(p); } + +private: + friend class alloc_ref_t; + + // these can only be accessed via an alloc_ref_t + ::std::atomic refcnt; + + // self destruct + void suicide(); + + // memory management primitives + virtual void *allocate(size_t)=0; + virtual void deallocate(void *, size_t)=0; +}; +@ + +The parent of the allocator is publically available so that it can be +used to create sibling objects; but it is immutable to ensure the correct +allocator is used to recover the object's store. + +@h2 The smart pointer, part 2. +Now we have declared the reference count we can complete the +smart pointer definition. The tricky bit is the destructor because +it will decrement the reference count, and if the result is zero, +it also has to delete the refered to allocator. We do that by asking +the allocator to suicide. 
+ +@tangle allocator.hpp +// lvalue copy +alloc_ref_t::alloc_ref_t(alloc_ref_t &p) { + if(p.allocator) ++p.allocator->refcnt; + allocator = p.allocator; +} + +template +void delete_csp_polymorphic_object (T *object, alloc_ref_t a) { + size_t n = object-> size(); + object->~T(); + a.deallocate(object, n); +} + +// NOTE: This method has the effect of saving the +// parent during the allocator destructor execution +void allocator_t::suicide() { + delete_csp_polymorphic_object(this, parent); +} + +// destructor +alloc_ref_t::~alloc_ref_t() { + if(allocator) { + if(allocator->refcnt.load() == 1) + allocator->suicide(); + else + --allocator->refcnt; + } +} + +void *alloc_ref_t::allocate(size_t n) { + return allocator? allocator->allocate(n) : nullptr; +} +void alloc_ref_t::deallocate(void *p, size_t n) { + if(allocator) allocator->deallocate(p,n); +} +@ + +@h2 Concrete objects +The @{delete_csp_polymorphic_object()} function above will work +for all our kind of polymorphic objects, but we need a method +for non-polymorphic objects too. + +@tangle allocator.hpp +template +void delete_concrete_object (T *object, alloc_ref_t a) { + object->~T(); + a.deallocate(object, sizeof(T)); +}; +@ + +@h2 Object Constructors +Now we need the usual wrapper method for an allocator. 
+ +@tangle allocator.hpp +void *operator new(size_t amt, alloc_ref_t& a) { return a.allocate (amt); } +#endif +@ + +@h1 The malloc free allocators +So here is our basic root alloctor: +@tangle malloc_free.hpp +#ifndef MALLOC_FREE +#define MALLOC_FREE +#include "allocator.hpp" +struct malloc_free_allocator_t : public allocator_t { + size_t size()const override { return sizeof(*this); } + ~malloc_free_allocator_t() override { } + + // factory function + static alloc_ref_t create() { + return allocator_t::create( new malloc_free_allocator_t() ); + } +private: + malloc_free_allocator_t () : allocator_t() {} // no parent + void *allocate(size_t n) override { + auto p = malloc(n); + return p; + } + void deallocate(void *p, size_t n) override { free(p); } +}; +#endif +@ + +@h1 Bump allocators +A bump allocator just bumps a pointer to some store on an allocation +request. It ignores deallocations. + +There are several kinds of bump alloctor. + +@h2 Static bump allocator +The static bump allocator is just passed a pointer to bump. +There is no overrun check, deallocation requests are ignored, +there is no parent, and the store is not freed + +@tangle bump.hpp +#ifndef BUMP +#define BUMP +#include "allocator.hpp" +struct static_bump_allocator_t : public allocator_t { + char *p; + static_bump_allocator_t (void *q) : allocator_t(), p((char *)q) {} + void *allocate(size_t n) override { void *q = p; p += n; return q; } + void deallocate(void *, size_t) override {} + size_t size()const override { return sizeof(*this); } + + // factory function + static alloc_ref_t create(void *p) { + return allocator_t::create (new static_bump_allocator_t (p)); + } +}; +@ + +@h2 Dynamic bump allocator +The dynamic bump allocator has a parent from which it requests a single +store on construction, and never again. + +There is no overrun check, deallocation requests are ignored, +but the destructor returns the store to the parent. 
+ +@tangle bump.hpp +struct dynamic_bump_allocator_t : public allocator_t { + void *base; + char *p; + size_t amt; + dynamic_bump_allocator_t (alloc_ref_t parent, size_t n) : + allocator_t(parent), amt(n) { + base=parent.allocate(amt); + p = (char*)base; + } + + void *allocate(size_t n) override { void *q = p; p += n; return q; } + void deallocate(void *, size_t) override {} + size_t size()const override { return sizeof(*this); } + ~dynamic_bump_allocator_t() { parent.deallocate(base, amt); } + + // factory function + static alloc_ref_t create(alloc_ref_t parent, size_t n) { + return allocator_t::create( new dynamic_bump_allocator_t(parent, n) ); + } +}; +#endif +@ + +@h1 Block allocators +A block allocator is one which can only supply a single block size, +and so ignores the size parameter of an allocation requrest. +However, deallocations put the argument block into a freelist +which can be used to service a subsequent allocation request. + +The simplest variant is a bump allocator with an added freelist. + +@tangle block.hpp +#ifndef BLOCK +#define BLOCK +#include "allocator.hpp" +struct static_block_allocator_t : public allocator_t { + char *p; + void *freelist; + size_t block_size; + + static_block_allocator_t (void *q, size_t n) : + allocator_t(), p((char *)q), freelist(nullptr), block_size(n) + {} + void *allocate(size_t n) override { + if(freelist) { + auto p = freelist; + freelist = *(void**)freelist; + return p; + } + else { + void *q = p; + p += block_size; + return q; + } + } + void deallocate(void *q, size_t) override { + *(void**)q = freelist; + freelist = q; + } + size_t size()const override { return sizeof(*this); } + + // factory function + static alloc_ref_t create(void *q, size_t n) { + return allocator_t::create( new static_block_allocator_t(q, n)); + } +}; +#endif +@ + +@h1 Thread Safe allocator +When we design allocators they're usually single thread only +for performance reasons. 
Even in many multi-threaded applications +only a single thread can access a given allocator. But if we want +to share an allocator, we can use a thread safety adaptor: + +@tangle ts_allocator.hpp +#ifndef TS_ALLOCATOR +#define TS_ALLOCATOR +#include "allocator.hpp" +struct ts_allocator_t : public allocator_t { + alloc_ref_t delegate; + ::std::atomic_flag lk; + + ts_allocator_t (alloc_ref_t parent, alloc_ref_t d) : + allocator_t(parent), delegate(d), lk(false) + {} + + void lock() { while(lk.test_and_set(::std::memory_order_acquire)); } + void unlock() { lk.clear(::std::memory_order_release); } + + void *allocate(size_t n) override { + lock(); + void *result = delegate.allocate(n); + unlock(); + return result; + } + void deallocate(void *p, size_t n) override { + lock(); + delegate.deallocate(p,n); + unlock(); + } + size_t size()const override { return sizeof(*this); } + + // factory function + static alloc_ref_t create(alloc_ref_t parent, alloc_ref_t delegate) { + return allocator_t::create( new ts_allocator_t(parent, delegate)); + } + +}; +#endif +@ +This allocator simply delegates to a specified allocator, but does so inside +the scope of a spinlock. Note that reference counts are already atomic. + +the destruction process is not protected because an allocator can only be +destroyed when there is only a single reference left. Similarly, +only a single thread can construct the allocator. + +@h1 Ring Buffers +A ring buffer is an array with sequential indexing modulo the array size. +An allocator can use an array of pointers to fixed sized blocks. +The principal advantage of a ring buffer is that the allocation by +popping the tail and deallocation by pushing at the head can both +be done without locks, by using the @{CAS} or @{compare_and_swap} +operation. + +We have an array @{a} of size @{n} and +two indices @{tail} and @{head}. 
The single threaded operation +to allocate a block is +@pre + void *pop() { + void *p = a[tail]; + tail = (tail + 1) % n; + return p; + } +@ + +and to deallocate + +@pre + void push(void *p) { + a[head] = p; + head = (head + 1) % n; + } +@ + +In general there is a danger the pop operation will advance past +the head and fetch garbage, or that the push operation will advance +past the tail, and overwrite a value leading to a leak. + +The first worry is dismissed by requiring the limit to the number +of blocks that can be allocated at once be the array size n: +the developer has to find a suitable bound for the code and ensure +that n is big enough. + +The second worry can be dismissed by requiring only blocks +allocated from this ring buffer can be deallocated. + +At worst, the head and tail can then be equal. If the buffer +is full, the next operation must be an allocation, whereas +if the buffer is empty, it must be a deallocation. Therefore +we do not need to know if the buffer is full or empty. + +Now the operations we exhibited are not thread safe. So here is how +we fix that. Consider @{pop} first. We first fetch the @{tail} variable +to a local variable. Next, we fetch the array value at this location. +Next, we create a new local variable, @{new_tail} which is the local +copy of tail plus 1 modulo n. + +Now we do a compare and swap operation on tail, with the local copy +of tail as the comparator, and the new tail as the value to be stored. +The operation will store the new tail value into tail if and only if +the current value is equal to the comparator and return true if it +succeeded or false otherwise. + +In the true case we're done and can return the fetched pointer, +otherwise another thread got in there ahead of us, and we loop around +and retry the whole operation. 
+ +Although this is a spin loop, the critical difference is that the compare +and store are uninterruptable, whereas a thread holding a lock can be +pre-empted leaving the lock held, and preventing any progress until +the lock holding thread is resumed. + +The @{push} operation is similar. We can observe that these operations +are independent and could occur simultaneously. + +In general, ring buffer operations as described can fail, and obviously +cannot be lock free: extra code is needed to prevent over or underflow. +However by specification neither of these things can happen in our application. + +It should be noted that whilst our operations are lock free, this only means +at least one thread is always making progress. A particular thread my spin +forever if it loses the race every time it tries. Therefore, the operations +are not in fact real time, since all real time operations must be bounded. + +There is, however, a way to obtain real time behaviour, by introducing +a delay in a thread after every success of sufficient length. In this +case eventually all contenders but the last will be delayed and the last +contender will then succeed. The delay can be simple counted spinloop. + +However in most applications, for most operations, +there is no need for an artificial delay to be introduced, +because most threads will have sufficient work to do before the +next request anyhow. Care needs to be taken if threads are running +very tight loops requiring lots of allocations and deallocations; +in particular a loop which simply allocates a block and then +immediately deallocates it would cause a problem. + +Such cases probably won't occur in real programs, but almost +certainly will occur in test cases! 
+ +@tangle ring_allocator.hpp +#ifndef RING_ALLOCATOR +#define RING_ALLOCATOR + +#include "allocator.hpp" +// client request entry: client needs n_blocks of size block_size +struct mem_req_t { + size_t block_size; + size_t n_blocks; + size_t array_bytes () const { return n_blocks * sizeof(void*); } + size_t memory_bytes () const { return n_blocks * block_size; } +}; + +struct ring_buffer_t : public allocator_t { + void **buffer; // the array of pointers + ::std::atomic head; + ::std::atomic tail; + mem_req_t mreq; + + // the constructor needs a parent to get memory arena from + // and a specification of the number and size of blocks + ring_buffer_t (alloc_ref_t parent, mem_req_t req) : mreq(req), head(0), tail(0) { + buffer = (void**) parent.allocate (req.array_bytes() + req.memory_bytes() ); + + // initialise buffer array with pointers to memory blocks + void *memory = (char*) buffer + req.array_bytes(); + for(size_t n = 0; n < req.n_blocks; ++n) { + buffer[n] = memory; + memory = (char*)memory + req.block_size; + } + } + + // the tail points at a populated spot + void *allocate(size_t) override { + size_t old_tail = tail.load(::std::memory_order_relaxed); + retry: + size_t new_tail = (old_tail + 1) % mreq.n_blocks; + if(!tail.compare_exchange_weak(old_tail, new_tail)) goto retry; + return buffer[old_tail]; + } + + // the head points at a free slot + void deallocate(void *p, size_t) override { + size_t old_head = head.load(::std::memory_order_relaxed); + retry: + size_t new_head= (old_head + 1) % mreq.n_blocks; + if(!head.compare_exchange_weak(old_head, new_head)) goto retry; + buffer[old_head] = p; + } + + // destructor returns store to parent + ~ring_buffer_t() { parent.deallocate(buffer, mreq.array_bytes() + mreq.memory_bytes() ); } + + // our size in bytes + size_t size() const override { return sizeof(*this); } + + // factory function + static alloc_ref_t create(alloc_ref_t parent, mem_req_t req) { + return allocator_t::create (new ring_buffer_t (parent, 
req)); + } + +}; +#endif +@ + +@h1 test +@tangle test01.cxx +#include +using namespace std; + +#include "allocator.hpp" +#include "malloc_free.hpp" +#include "bump.hpp" +#include "block.hpp" +#include "ts_allocator.hpp" +#include "ring_allocator.hpp" + +int main () { + cout << "Hello World" << endl; + auto a1 = malloc_free_allocator_t::create(); + auto a2 = dynamic_bump_allocator_t::create(a1, 1000); + unsigned char block_buffer[1000]; + auto a3 = static_block_allocator_t::create(block_buffer, 100); + auto a4 = ts_allocator_t::create(a3, a2); + auto a5 = ring_buffer_t::create( a1, mem_req_t { 100,100 }); +}