Skip to content

Commit

Permalink
Merge pull request #17471 from hrydgard/reduce-zero-initialization
Browse files Browse the repository at this point in the history
Reduce zero initialization and copying overhead of render commands
  • Loading branch information
hrydgard committed May 23, 2023
2 parents b9157fd + 234c1f0 commit 8d7c561
Show file tree
Hide file tree
Showing 14 changed files with 353 additions and 158 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ add_library(Common STATIC
Common/Data/Collections/FixedSizeQueue.h
Common/Data/Collections/Hashmaps.h
Common/Data/Collections/TinySet.h
Common/Data/Collections/FastVec.h
Common/Data/Collections/ThreadSafeList.h
Common/Data/Color/RGBAUtil.cpp
Common/Data/Color/RGBAUtil.h
Expand Down
1 change: 1 addition & 0 deletions Common/Common.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@
</ClCompile>
<ClCompile Include="ArmEmitter.cpp" />
<ClCompile Include="Buffer.cpp" />
<ClCompile Include="Data\Collections\FastVec.h" />
<ClCompile Include="Data\Color\RGBAUtil.cpp" />
<ClCompile Include="Data\Convert\SmallDataConvert.cpp" />
<ClCompile Include="Data\Encoding\Base64.cpp" />
Expand Down
5 changes: 4 additions & 1 deletion Common/Common.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,9 @@
<ClCompile Include="GPU\OpenGL\GLMemory.cpp">
<Filter>GPU\OpenGL</Filter>
</ClCompile>
<ClCompile Include="Data\Collections\FastVec.h">
<Filter>Data\Collections</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Crypto">
Expand Down Expand Up @@ -1086,4 +1089,4 @@
<Filter>ext\basis_universal</Filter>
</None>
</ItemGroup>
</Project>
</Project>
137 changes: 137 additions & 0 deletions Common/Data/Collections/FastVec.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#pragma once

// Yet another replacement for std::vector, this time for use in graphics queues.
// Its major difference is that you can append uninitialized structures and initialize them after.
// This is not allows by std::vector but is very useful for our sometimes oversized unions.
// Also, copies during resize are done by memcpy, not by any move constructor or similar.

#include <cstdlib>
#include <cstring>

template<class T>
class FastVec {
public:
FastVec() {}
FastVec(size_t initialCapacity) {
capacity_ = initialCapacity;
data_ = (T *)malloc(initialCapacity * sizeof(T));
}
~FastVec() { if (data_) free(data_); }

T &push_uninitialized() {
if (size_ < capacity_) {
size_++;
return data_[size_ - 1];
} else {
ExtendByOne();
return data_[size_ - 1];
}
}

void push_back(const T &t) {
T &dest = push_uninitialized();
dest = t;
}

// Move constructor
FastVec(FastVec &&other) {
data_ = other.data_;
size_ = other.size_;
capacity_ = other.capacity_;
other.data_ = nullptr;
other.size_ = 0;
other.capacity_ = 0;
}

FastVec &operator=(FastVec &&other) {
if (this != &other) {
delete[] data_;
data_ = other.data_;
size_ = other.size_;
capacity_ = other.capacity_;
other.data_ = nullptr;
other.size_ = 0;
other.capacity_ = 0;
}
return *this;
}

// No copy constructor.
FastVec(const FastVec &other) = delete;
FastVec &operator=(const FastVec &other) = delete;

size_t size() const { return size_; }
size_t capacity() const { return capacity_; }
void clear() { size_ = 0; }
bool empty() const { return size_ == 0; }

T *begin() { return data_; }
T *end() { return data_ + size_; }
const T *begin() const { return data_; }
const T *end() const { return data_ + size_; }

// Out of bounds (past size() - 1) is undefined behavior.
T &operator[] (const size_t index) { return data_[index]; }
const T &operator[] (const size_t index) const { return data_[index]; }
T &at(const size_t index) { return data_[index]; }
const T &at(const size_t index) const { return data_[index]; }

// These two are invalid if empty().
const T &back() const { return (*this)[size() - 1]; }
const T &front() const { return (*this)[0]; }

// Limited functionality for inserts and similar, add as needed.
T &insert(T *iter) {
int pos = iter - data_;
ExtendByOne();
if (pos + 1 < size_) {
memmove(data_ + pos + 1, data_ + pos, (size_ - pos) * sizeof(T));
}
return data_[pos];
}

void insert(T *destIter, const T *beginIter, const T *endIter) {
int pos = destIter - data_;
if (beginIter == endIter)
return;
size_t newItems = endIter - beginIter;
IncreaseCapacityTo(size_ + newItems);
memmove(data_ + pos + newItems, data_ + pos, (size_ - pos) * sizeof(T));
memcpy(data_ + pos, beginIter, newItems * sizeof(T));
size_ += newItems;
}

void resize(size_t size) {
if (size < size_) {
size_ = size;
} else {
// TODO
}
}

private:
void IncreaseCapacityTo(size_t newCapacity) {
if (newCapacity <= capacity_)
return;
T *oldData = data_;
data_ = (T *)malloc(sizeof(T) * newCapacity);
if (capacity_ != 0) {
memcpy(data_, oldData, sizeof(T) * size_);
free(oldData);
}
}

void ExtendByOne() {
size_t newCapacity = capacity_ * 2;
if (newCapacity < 16) {
newCapacity = 16;
}
IncreaseCapacityTo(newCapacity);
size_++;
capacity_ = newCapacity;
}

size_t size_ = 0;
size_t capacity_ = 0;
T *data_ = nullptr;
};
6 changes: 3 additions & 3 deletions Common/GPU/OpenGL/GLQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ static std::string GetStereoBufferLayout(const char *uniformName) {
else return "undefined";
}

void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls) {
void GLQueueRunner::RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLCalls) {
if (skipGLCalls) {
// Some bookkeeping still needs to be done.
for (size_t i = 0; i < steps.size(); i++) {
Expand Down Expand Up @@ -700,7 +700,7 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCal
CHECK_GL_ERROR_IF_DEBUG();
size_t renderCount = 0;
for (size_t i = 0; i < steps.size(); i++) {
const GLRStep &step = *steps[i];
GLRStep &step = *steps[i];

#if !defined(USING_GLES2)
if (useDebugGroups_)
Expand All @@ -711,7 +711,7 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCal
case GLRStepType::RENDER:
renderCount++;
if (IsVREnabled()) {
GLRStep vrStep = step;
GLRStep &vrStep = step;
PreprocessStepVR(&vrStep);
PerformRenderPass(vrStep, renderCount == 1, renderCount == totalRenderCount);
} else {
Expand Down
9 changes: 4 additions & 5 deletions Common/GPU/OpenGL/GLQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "Common/GPU/Shader.h"
#include "Common/GPU/thin3d.h"
#include "Common/Data/Collections/TinySet.h"

#include "Common/Data/Collections/FastVec.h"

struct GLRViewport {
float x, y, w, h, minZ, maxZ;
Expand Down Expand Up @@ -70,6 +70,7 @@ enum class GLRRenderCommand : uint8_t {
// type field, smashed right after each other?)
// Also, all GLenums are really only 16 bits.
struct GLRRenderData {
GLRRenderData(GLRRenderCommand _cmd) : cmd(_cmd) {}
GLRRenderCommand cmd;
union {
struct {
Expand Down Expand Up @@ -301,7 +302,7 @@ enum {
struct GLRStep {
GLRStep(GLRStepType _type) : stepType(_type) {}
GLRStepType stepType;
std::vector<GLRRenderData> commands;
FastVec<GLRRenderData> commands;
TinySet<const GLRFramebuffer *, 8> dependencies;
const char *tag;
union {
Expand All @@ -310,8 +311,6 @@ struct GLRStep {
GLRRenderPassAction color;
GLRRenderPassAction depth;
GLRRenderPassAction stencil;
// Note: not accurate.
int numDraws;
} render;
struct {
GLRFramebuffer *src;
Expand Down Expand Up @@ -355,7 +354,7 @@ class GLQueueRunner {
caps_ = caps;
}

void RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls);
void RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLCalls);

void RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCalls, bool keepSteps, bool useVR);

Expand Down
39 changes: 17 additions & 22 deletions Common/GPU/OpenGL/GLRenderManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,25 +129,24 @@ bool GLRenderManager::ThreadFrame() {
return false;
}

GLRRenderThreadTask task;
GLRRenderThreadTask *task = nullptr;

// In case of syncs or other partial completion, we keep going until we complete a frame.
while (true) {
// Pop a task of the queue and execute it.
// NOTE: We need to actually wait for a task, we can't just bail!

{
std::unique_lock<std::mutex> lock(pushMutex_);
while (renderThreadQueue_.empty()) {
pushCondVar_.wait(lock);
}
task = renderThreadQueue_.front();
task = std::move(renderThreadQueue_.front());
renderThreadQueue_.pop();
}

// We got a task! We can now have pushMutex_ unlocked, allowing the host to
// push more work when it feels like it, and just start working.
if (task.runType == GLRRunType::EXIT) {
if (task->runType == GLRRunType::EXIT) {
// Oh, host wanted out. Let's leave, and also let's notify the host.
// This is unlike Vulkan too which can just block on the thread existing.
std::unique_lock<std::mutex> lock(syncMutex_);
Expand All @@ -157,11 +156,13 @@ bool GLRenderManager::ThreadFrame() {
}

// Render the scene.
VLOG(" PULL: Frame %d RUN (%0.3f)", task.frame, time_now_d());
if (Run(task)) {
VLOG(" PULL: Frame %d RUN (%0.3f)", task->frame, time_now_d());
if (Run(*task)) {
// Swap requested, so we just bail the loop.
delete task;
break;
}
delete task;
};

return true;
Expand All @@ -174,9 +175,7 @@ void GLRenderManager::StopThread() {
run_ = false;

std::unique_lock<std::mutex> lock(pushMutex_);
GLRRenderThreadTask exitTask{};
exitTask.runType = GLRRunType::EXIT;
renderThreadQueue_.push(exitTask);
renderThreadQueue_.push(new GLRRenderThreadTask(GLRRunType::EXIT));
pushCondVar_.notify_one();
} else {
WARN_LOG(G3D, "GL submission thread was already paused.");
Expand Down Expand Up @@ -215,13 +214,11 @@ void GLRenderManager::BindFramebufferAsRenderTarget(GLRFramebuffer *fb, GLRRende
step->render.color = color;
step->render.depth = depth;
step->render.stencil = stencil;
step->render.numDraws = 0;
step->tag = tag;
steps_.push_back(step);

GLuint clearMask = 0;
GLRRenderData data;
data.cmd = GLRRenderCommand::CLEAR;
GLRRenderData data(GLRRenderCommand::CLEAR);
if (color == GLRRenderPassAction::CLEAR) {
clearMask |= GL_COLOR_BUFFER_BIT;
data.clear.clearColor = clearColor;
Expand Down Expand Up @@ -379,15 +376,14 @@ void GLRenderManager::Finish() {
frameData_[curFrame].deleter.Take(deleter_);

VLOG("PUSH: Finish, pushing task. curFrame = %d", curFrame);
GLRRenderThreadTask task;
task.frame = curFrame;
task.runType = GLRRunType::PRESENT;
GLRRenderThreadTask *task = new GLRRenderThreadTask(GLRRunType::PRESENT);
task->frame = curFrame;

{
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back().initSteps = std::move(initSteps_);
renderThreadQueue_.back().steps = std::move(steps_);
renderThreadQueue_.back()->initSteps = std::move(initSteps_);
renderThreadQueue_.back()->steps = std::move(steps_);
initSteps_.clear();
steps_.clear();
pushCondVar_.notify_one();
Expand Down Expand Up @@ -509,14 +505,13 @@ void GLRenderManager::FlushSync() {
{
VLOG("PUSH: Frame[%d].readyForRun = true (sync)", curFrame_);

GLRRenderThreadTask task;
task.frame = curFrame_;
task.runType = GLRRunType::SYNC;
GLRRenderThreadTask *task = new GLRRenderThreadTask(GLRRunType::SYNC);
task->frame = curFrame_;

std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back().initSteps = std::move(initSteps_);
renderThreadQueue_.back().steps = std::move(steps_);
renderThreadQueue_.back()->initSteps = std::move(initSteps_);
renderThreadQueue_.back()->steps = std::move(steps_);
pushCondVar_.notify_one();
steps_.clear();
}
Expand Down
Loading

0 comments on commit 8d7c561

Please sign in to comment.