From 5964060b403842a34dee45c15f186e6c04227777 Mon Sep 17 00:00:00 2001 From: Leonard Hecker Date: Mon, 14 Mar 2022 14:38:59 +0100 Subject: [PATCH] AtlasEngine: Reduce shader power draw with explicit branching (#12552) Many articles I read while writing this engine claimed that GPUs can't do branches like CPUs can. One common approach to branching in GPUs is apparently to "mask" out results, a technique called branch predication. The GPU will simply execute all instructions in your shader linearly, but if a branch isn't taken, it'll ignore the computation results. This is unfortunate for our shader, since most branches we have are only very seldom taken. The cursor, for instance, is only drawn on a single cell and underlines are seldom used. But apparently modern GPUs (2010s and later?) are actually entirely capable of branching, _if_ all lanes ("pixels") processed by a wave ("GPU core") take the same branch. On both my Nvidia GPU (RTX 3080) and Intel iGPU (Intel HD Graphics 530) this change has a positive impact on power draw. Most noticeably on the latter this reduces power draw from 900mW down to 600mW at 60 FPS. ## PR Checklist * [x] I work here * [x] Tests added/passed ## Validation Steps Performed It seems to work fine on Intel and Nvidia GPUs. Unfortunately I don't have an AMD GPU to test this on, but I suspect it can't be worse. 
--- src/renderer/atlas/AtlasEngine.cpp | 2 - src/renderer/atlas/AtlasEngine.h | 2 +- src/renderer/atlas/dwrite.hlsl | 4 +- src/renderer/atlas/shader_ps.hlsl | 73 +++++++++++++++++++----------- 4 files changed, 49 insertions(+), 32 deletions(-) diff --git a/src/renderer/atlas/AtlasEngine.cpp b/src/renderer/atlas/AtlasEngine.cpp index bd30b04faee..caac790ab78 100644 --- a/src/renderer/atlas/AtlasEngine.cpp +++ b/src/renderer/atlas/AtlasEngine.cpp @@ -791,8 +791,6 @@ void AtlasEngine::_createSwapChain() if (_api.hwnd) { - desc.AlphaMode = DXGI_ALPHA_MODE_IGNORE; - if (FAILED(dxgiFactory->CreateSwapChainForHwnd(_r.device.get(), _api.hwnd, &desc, nullptr, nullptr, _r.swapChain.put()))) { // Platform Update for Windows 7: diff --git a/src/renderer/atlas/AtlasEngine.h b/src/renderer/atlas/AtlasEngine.h index 74114ba54f7..11ad2026a43 100644 --- a/src/renderer/atlas/AtlasEngine.h +++ b/src/renderer/atlas/AtlasEngine.h @@ -565,7 +565,7 @@ namespace Microsoft::Console::Render alignas(sizeof(u32)) u32 backgroundColor = 0; alignas(sizeof(u32)) u32 cursorColor = 0; alignas(sizeof(u32)) u32 selectionColor = 0; - alignas(sizeof(u32)) u32 useClearType = 0; + alignas(sizeof(u32)) bool useClearType = 0; #pragma warning(suppress : 4324) // 'ConstBuffer': structure was padded due to alignment specifier }; diff --git a/src/renderer/atlas/dwrite.hlsl b/src/renderer/atlas/dwrite.hlsl index 561b395684b..0f5c683bc5b 100644 --- a/src/renderer/atlas/dwrite.hlsl +++ b/src/renderer/atlas/dwrite.hlsl @@ -79,7 +79,7 @@ float4 DWrite_GrayscaleBlend(float4 gammaRatios, float grayscaleEnhancedContrast float3 foregroundStraight = DWrite_UnpremultiplyColor(foregroundColor); float contrastBoost = isThinFont ? 
0.5f : 0.0f; float blendEnhancedContrast = contrastBoost + DWrite_ApplyLightOnDarkContrastAdjustment(grayscaleEnhancedContrast, foregroundStraight); - float intensity = DWrite_CalcColorIntensity(foregroundColor.rgb); + float intensity = DWrite_CalcColorIntensity(foregroundStraight); float contrasted = DWrite_EnhanceContrast(glyphAlpha, blendEnhancedContrast); return foregroundColor * DWrite_ApplyAlphaCorrection(contrasted, intensity, gammaRatios); } @@ -120,7 +120,7 @@ float4 DWrite_GrayscaleBlend(float4 gammaRatios, float grayscaleEnhancedContrast // overscale (meaning: the glyph is rasterized with 6x the required resolution in the X axis) and thus // only 7 different RGB combinations can exist in this texture (black/white and 5 states in between). // If you wanted to you could just store these in a A8 texture and restore the RGB values in this shader. -float4 DWrite_CleartypeBlend(float4 gammaRatios, float enhancedContrast, bool isThinFont, float4 backgroundColor, float4 foregroundColor, float4 glyphColor) +float4 DWrite_ClearTypeBlend(float4 gammaRatios, float enhancedContrast, bool isThinFont, float4 backgroundColor, float4 foregroundColor, float4 glyphColor) { float3 foregroundStraight = DWrite_UnpremultiplyColor(foregroundColor); float contrastBoost = isThinFont ? 
0.5f : 0.0f; diff --git a/src/renderer/atlas/shader_ps.hlsl b/src/renderer/atlas/shader_ps.hlsl index a6f74899a90..787959ad736 100644 --- a/src/renderer/atlas/shader_ps.hlsl +++ b/src/renderer/atlas/shader_ps.hlsl @@ -48,7 +48,7 @@ cbuffer ConstBuffer : register(b0) uint backgroundColor; uint cursorColor; uint selectionColor; - uint useClearType; + bool useClearType; }; StructuredBuffer cells : register(t0); Texture2D glyphs : register(t1); @@ -76,18 +76,13 @@ float4 alphaBlendPremultiplied(float4 bottom, float4 top) float4 main(float4 pos: SV_Position): SV_Target // clang-format on { - if (any(pos.xy < viewport.xy) || any(pos.xy >= viewport.zw)) + // We need to fill the entire render target with pixels, but only our "viewport" + // has cells we want to draw. The rest gets treated with the background color. + [branch] if (any(pos.xy < viewport.xy || pos.xy >= viewport.zw)) { return decodeRGBA(backgroundColor); } - // If you want to write test a before/after change simultaneously - // you can turn the image into a checkerboard by writing: - // if ((uint(pos.x) ^ uint(pos.y)) / 4 & 1) { return float4(1, 0, 0, 1); } - // This will generate a checkerboard of 4*4px red squares. - // Of course you wouldn't just return a red color there, but instead - // for instance run your new code and compare it with the old. - uint2 viewportPos = pos.xy - viewport.xy; uint2 cellIndex = viewportPos / cellSize; uint2 cellPos = viewportPos % cellSize; @@ -100,45 +95,69 @@ float4 main(float4 pos: SV_Position): SV_Target // Layer 1 (optional): // Colored cursors are drawn "in between" the background color and the text of a cell. - if ((cell.flags & CellFlags_Cursor) && cursorColor != INVALID_COLOR) + [branch] if (cell.flags & CellFlags_Cursor) { - // The cursor texture is stored at the top-left-most glyph cell. - // Cursor pixels are either entirely transparent or opaque. - // --> We can just use .a as a mask to flip cursor pixels on or off. 
- color = alphaBlendPremultiplied(color, decodeRGBA(cursorColor) * glyphs[cellPos].a); + [flatten] if (cursorColor != INVALID_COLOR) + { + // The cursor texture is stored at the top-left-most glyph cell. + // Cursor pixels are either entirely transparent or opaque. + // --> We can just use .a as a mask to flip cursor pixels on or off. + color = alphaBlendPremultiplied(color, decodeRGBA(cursorColor) * glyphs[cellPos].a); + } } // Layer 2: // Step 1: Underlines - if ((cell.flags & CellFlags_Underline) && cellPos.y >= underlinePos.x && cellPos.y < underlinePos.y) + [branch] if (cell.flags & CellFlags_Underline) { - color = alphaBlendPremultiplied(color, fg); + [flatten] if (cellPos.y >= underlinePos.x && cellPos.y < underlinePos.y) + { + color = alphaBlendPremultiplied(color, fg); + } } - if ((cell.flags & CellFlags_UnderlineDotted) && cellPos.y >= underlinePos.x && cellPos.y < underlinePos.y && (viewportPos.x / (underlinePos.y - underlinePos.x) & 3) == 0) + [branch] if (cell.flags & CellFlags_UnderlineDotted) { - color = alphaBlendPremultiplied(color, fg); + [flatten] if (cellPos.y >= underlinePos.x && cellPos.y < underlinePos.y && (viewportPos.x / (underlinePos.y - underlinePos.x) & 3) == 0) + { + color = alphaBlendPremultiplied(color, fg); + } } // Step 2: The cell's glyph, potentially drawn in the foreground color { float4 glyph = glyphs[decodeU16x2(cell.glyphPos) + cellPos]; - if (cell.flags & CellFlags_ColoredGlyph) + [branch] if (cell.flags & CellFlags_ColoredGlyph) { color = alphaBlendPremultiplied(color, glyph); } - else if (useClearType) - { - color = DWrite_CleartypeBlend(gammaRatios, enhancedContrast, false, color, fg, glyph); - } else { - color = alphaBlendPremultiplied(color, DWrite_GrayscaleBlend(gammaRatios, enhancedContrast, false, fg, glyph.a)); + float3 foregroundStraight = DWrite_UnpremultiplyColor(fg); + float blendEnhancedContrast = DWrite_ApplyLightOnDarkContrastAdjustment(enhancedContrast, foregroundStraight); + + [branch] if (useClearType) + { 
+ // See DWrite_ClearTypeBlend + float3 contrasted = DWrite_EnhanceContrast3(glyph.rgb, blendEnhancedContrast); + float3 alphaCorrected = DWrite_ApplyAlphaCorrection3(contrasted, foregroundStraight, gammaRatios); + color = float4(lerp(color.rgb, foregroundStraight, alphaCorrected * fg.a), 1.0f); + } + else + { + // See DWrite_GrayscaleBlend + float intensity = DWrite_CalcColorIntensity(foregroundStraight); + float contrasted = DWrite_EnhanceContrast(glyph.a, blendEnhancedContrast); + color = fg * DWrite_ApplyAlphaCorrection(contrasted, intensity, gammaRatios); + } } } // Step 3: Lines, but not "under"lines - if ((cell.flags & CellFlags_Strikethrough) && cellPos.y >= strikethroughPos.x && cellPos.y < strikethroughPos.y) + [branch] if (cell.flags & CellFlags_Strikethrough) { - color = alphaBlendPremultiplied(color, fg); + [flatten] if (cellPos.y >= strikethroughPos.x && cellPos.y < strikethroughPos.y) + { + color = alphaBlendPremultiplied(color, fg); + } } // Layer 3 (optional): @@ -153,7 +172,7 @@ float4 main(float4 pos: SV_Position): SV_Target // Layer 4: // The current selection is drawn semi-transparent on top. - if (cell.flags & CellFlags_Selected) + [branch] if (cell.flags & CellFlags_Selected) { color = alphaBlendPremultiplied(color, decodeRGBA(selectionColor)); }