Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce zero initialization and copying overhead of render commands #17471

Merged
merged 10 commits into from
May 23, 2023
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ add_library(Common STATIC
Common/Data/Collections/FixedSizeQueue.h
Common/Data/Collections/Hashmaps.h
Common/Data/Collections/TinySet.h
Common/Data/Collections/FastVec.h
Common/Data/Collections/ThreadSafeList.h
Common/Data/Color/RGBAUtil.cpp
Common/Data/Color/RGBAUtil.h
Expand Down
1 change: 1 addition & 0 deletions Common/Common.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@
</ClCompile>
<ClCompile Include="ArmEmitter.cpp" />
<ClCompile Include="Buffer.cpp" />
<ClCompile Include="Data\Collections\FastVec.h" />
<ClCompile Include="Data\Color\RGBAUtil.cpp" />
<ClCompile Include="Data\Convert\SmallDataConvert.cpp" />
<ClCompile Include="Data\Encoding\Base64.cpp" />
Expand Down
5 changes: 4 additions & 1 deletion Common/Common.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,9 @@
<ClCompile Include="GPU\OpenGL\GLMemory.cpp">
<Filter>GPU\OpenGL</Filter>
</ClCompile>
<ClCompile Include="Data\Collections\FastVec.h">
<Filter>Data\Collections</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Crypto">
Expand Down Expand Up @@ -1086,4 +1089,4 @@
<Filter>ext\basis_universal</Filter>
</None>
</ItemGroup>
</Project>
</Project>
137 changes: 137 additions & 0 deletions Common/Data/Collections/FastVec.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#pragma once

// Yet another replacement for std::vector, this time for use in graphics queues.
// Its major difference is that you can append uninitialized structures and initialize them after.
// This is not allows by std::vector but is very useful for our sometimes oversized unions.
// Also, copies during resize are done by memcpy, not by any move constructor or similar.

#include <cstdlib>
#include <cstring>

template<class T>
class FastVec {
public:
FastVec() {}
FastVec(size_t initialCapacity) {
capacity_ = initialCapacity;
data_ = (T *)malloc(initialCapacity * sizeof(T));
}
~FastVec() { if (data_) free(data_); }

T &push_uninitialized() {
if (size_ < capacity_) {
size_++;
return data_[size_ - 1];
} else {
ExtendByOne();
return data_[size_ - 1];
}
}

void push_back(const T &t) {
T &dest = push_uninitialized();
dest = t;
}

// Move constructor
FastVec(FastVec &&other) {
data_ = other.data_;
size_ = other.size_;
capacity_ = other.capacity_;
other.data_ = nullptr;
other.size_ = 0;
other.capacity_ = 0;
}

FastVec &operator=(FastVec &&other) {
if (this != &other) {
delete[] data_;
data_ = other.data_;
size_ = other.size_;
capacity_ = other.capacity_;
other.data_ = nullptr;
other.size_ = 0;
other.capacity_ = 0;
}
return *this;
}

// No copy constructor.
FastVec(const FastVec &other) = delete;
FastVec &operator=(const FastVec &other) = delete;

size_t size() const { return size_; }
size_t capacity() const { return capacity_; }
void clear() { size_ = 0; }
bool empty() const { return size_ == 0; }

T *begin() { return data_; }
T *end() { return data_ + size_; }
const T *begin() const { return data_; }
const T *end() const { return data_ + size_; }

// Out of bounds (past size() - 1) is undefined behavior.
T &operator[] (const size_t index) { return data_[index]; }
const T &operator[] (const size_t index) const { return data_[index]; }
T &at(const size_t index) { return data_[index]; }
const T &at(const size_t index) const { return data_[index]; }

// These two are invalid if empty().
const T &back() const { return (*this)[size() - 1]; }
const T &front() const { return (*this)[0]; }

// Limited functionality for inserts and similar, add as needed.
T &insert(T *iter) {
int pos = iter - data_;
ExtendByOne();
if (pos + 1 < size_) {
memmove(data_ + pos + 1, data_ + pos, (size_ - pos) * sizeof(T));
}
return data_[pos];
}

void insert(T *destIter, const T *beginIter, const T *endIter) {
int pos = destIter - data_;
if (beginIter == endIter)
return;
size_t newItems = endIter - beginIter;
IncreaseCapacityTo(size_ + newItems);
memmove(data_ + pos + newItems, data_ + pos, (size_ - pos) * sizeof(T));
memcpy(data_ + pos, beginIter, newItems * sizeof(T));
size_ += newItems;
}

void resize(size_t size) {
if (size < size_) {
size_ = size;
} else {
// TODO
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this should have an assert or something? I don't think anything's using this right now, though.

I do wonder if there might've been a way to coax the compiler to do less copying with enough move/emplace, but this is probably safer. I assume it's also faster in debug anyway...

-[Unknown]

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually tried a bunch of things, but when looking at the disassembly, still lots of zeroing and copying. So gave up and did it this way. And yes, it helps debug performance indeed.

}
}

private:
void IncreaseCapacityTo(size_t newCapacity) {
if (newCapacity <= capacity_)
return;
T *oldData = data_;
data_ = (T *)malloc(sizeof(T) * newCapacity);
if (capacity_ != 0) {
memcpy(data_, oldData, sizeof(T) * size_);
free(oldData);
}
}

void ExtendByOne() {
size_t newCapacity = capacity_ * 2;
if (newCapacity < 16) {
newCapacity = 16;
}
IncreaseCapacityTo(newCapacity);
size_++;
capacity_ = newCapacity;
}

size_t size_ = 0;
size_t capacity_ = 0;
T *data_ = nullptr;
};
6 changes: 3 additions & 3 deletions Common/GPU/OpenGL/GLQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ static std::string GetStereoBufferLayout(const char *uniformName) {
else return "undefined";
}

void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls) {
void GLQueueRunner::RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLCalls) {
if (skipGLCalls) {
// Some bookkeeping still needs to be done.
for (size_t i = 0; i < steps.size(); i++) {
Expand Down Expand Up @@ -700,7 +700,7 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCal
CHECK_GL_ERROR_IF_DEBUG();
size_t renderCount = 0;
for (size_t i = 0; i < steps.size(); i++) {
const GLRStep &step = *steps[i];
GLRStep &step = *steps[i];

#if !defined(USING_GLES2)
if (useDebugGroups_)
Expand All @@ -711,7 +711,7 @@ void GLQueueRunner::RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCal
case GLRStepType::RENDER:
renderCount++;
if (IsVREnabled()) {
GLRStep vrStep = step;
GLRStep &vrStep = step;
hrydgard marked this conversation as resolved.
Show resolved Hide resolved
PreprocessStepVR(&vrStep);
PerformRenderPass(vrStep, renderCount == 1, renderCount == totalRenderCount);
} else {
Expand Down
9 changes: 4 additions & 5 deletions Common/GPU/OpenGL/GLQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include "Common/GPU/Shader.h"
#include "Common/GPU/thin3d.h"
#include "Common/Data/Collections/TinySet.h"

#include "Common/Data/Collections/FastVec.h"

struct GLRViewport {
float x, y, w, h, minZ, maxZ;
Expand Down Expand Up @@ -70,6 +70,7 @@ enum class GLRRenderCommand : uint8_t {
// type field, smashed right after each other?)
// Also, all GLenums are really only 16 bits.
struct GLRRenderData {
GLRRenderData(GLRRenderCommand _cmd) : cmd(_cmd) {}
GLRRenderCommand cmd;
union {
struct {
Expand Down Expand Up @@ -301,7 +302,7 @@ enum {
struct GLRStep {
GLRStep(GLRStepType _type) : stepType(_type) {}
GLRStepType stepType;
std::vector<GLRRenderData> commands;
FastVec<GLRRenderData> commands;
TinySet<const GLRFramebuffer *, 8> dependencies;
const char *tag;
union {
Expand All @@ -310,8 +311,6 @@ struct GLRStep {
GLRRenderPassAction color;
GLRRenderPassAction depth;
GLRRenderPassAction stencil;
// Note: not accurate.
int numDraws;
} render;
struct {
GLRFramebuffer *src;
Expand Down Expand Up @@ -355,7 +354,7 @@ class GLQueueRunner {
caps_ = caps;
}

void RunInitSteps(const std::vector<GLRInitStep> &steps, bool skipGLCalls);
void RunInitSteps(const FastVec<GLRInitStep> &steps, bool skipGLCalls);

void RunSteps(const std::vector<GLRStep *> &steps, bool skipGLCalls, bool keepSteps, bool useVR);

Expand Down
39 changes: 17 additions & 22 deletions Common/GPU/OpenGL/GLRenderManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,25 +129,24 @@ bool GLRenderManager::ThreadFrame() {
return false;
}

GLRRenderThreadTask task;
GLRRenderThreadTask *task = nullptr;

// In case of syncs or other partial completion, we keep going until we complete a frame.
while (true) {
// Pop a task of the queue and execute it.
// NOTE: We need to actually wait for a task, we can't just bail!

{
std::unique_lock<std::mutex> lock(pushMutex_);
while (renderThreadQueue_.empty()) {
pushCondVar_.wait(lock);
}
task = renderThreadQueue_.front();
task = std::move(renderThreadQueue_.front());
renderThreadQueue_.pop();
}

// We got a task! We can now have pushMutex_ unlocked, allowing the host to
// push more work when it feels like it, and just start working.
if (task.runType == GLRRunType::EXIT) {
if (task->runType == GLRRunType::EXIT) {
// Oh, host wanted out. Let's leave, and also let's notify the host.
// This is unlike Vulkan too which can just block on the thread existing.
std::unique_lock<std::mutex> lock(syncMutex_);
Expand All @@ -157,11 +156,13 @@ bool GLRenderManager::ThreadFrame() {
}

// Render the scene.
VLOG(" PULL: Frame %d RUN (%0.3f)", task.frame, time_now_d());
if (Run(task)) {
VLOG(" PULL: Frame %d RUN (%0.3f)", task->frame, time_now_d());
if (Run(*task)) {
// Swap requested, so we just bail the loop.
delete task;
break;
}
delete task;
};

return true;
Expand All @@ -174,9 +175,7 @@ void GLRenderManager::StopThread() {
run_ = false;

std::unique_lock<std::mutex> lock(pushMutex_);
GLRRenderThreadTask exitTask{};
exitTask.runType = GLRRunType::EXIT;
renderThreadQueue_.push(exitTask);
renderThreadQueue_.push(new GLRRenderThreadTask(GLRRunType::EXIT));
pushCondVar_.notify_one();
} else {
WARN_LOG(G3D, "GL submission thread was already paused.");
Expand Down Expand Up @@ -215,13 +214,11 @@ void GLRenderManager::BindFramebufferAsRenderTarget(GLRFramebuffer *fb, GLRRende
step->render.color = color;
step->render.depth = depth;
step->render.stencil = stencil;
step->render.numDraws = 0;
step->tag = tag;
steps_.push_back(step);

GLuint clearMask = 0;
GLRRenderData data;
data.cmd = GLRRenderCommand::CLEAR;
GLRRenderData data(GLRRenderCommand::CLEAR);
if (color == GLRRenderPassAction::CLEAR) {
clearMask |= GL_COLOR_BUFFER_BIT;
data.clear.clearColor = clearColor;
Expand Down Expand Up @@ -379,15 +376,14 @@ void GLRenderManager::Finish() {
frameData_[curFrame].deleter.Take(deleter_);

VLOG("PUSH: Finish, pushing task. curFrame = %d", curFrame);
GLRRenderThreadTask task;
task.frame = curFrame;
task.runType = GLRRunType::PRESENT;
GLRRenderThreadTask *task = new GLRRenderThreadTask(GLRRunType::PRESENT);
task->frame = curFrame;

{
std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back().initSteps = std::move(initSteps_);
renderThreadQueue_.back().steps = std::move(steps_);
renderThreadQueue_.back()->initSteps = std::move(initSteps_);
renderThreadQueue_.back()->steps = std::move(steps_);
initSteps_.clear();
steps_.clear();
pushCondVar_.notify_one();
Expand Down Expand Up @@ -509,14 +505,13 @@ void GLRenderManager::FlushSync() {
{
VLOG("PUSH: Frame[%d].readyForRun = true (sync)", curFrame_);

GLRRenderThreadTask task;
task.frame = curFrame_;
task.runType = GLRRunType::SYNC;
GLRRenderThreadTask *task = new GLRRenderThreadTask(GLRRunType::SYNC);
task->frame = curFrame_;

std::unique_lock<std::mutex> lock(pushMutex_);
renderThreadQueue_.push(task);
renderThreadQueue_.back().initSteps = std::move(initSteps_);
renderThreadQueue_.back().steps = std::move(steps_);
renderThreadQueue_.back()->initSteps = std::move(initSteps_);
renderThreadQueue_.back()->steps = std::move(steps_);
pushCondVar_.notify_one();
steps_.clear();
}
Expand Down
Loading