diff --git a/game/commandline.cpp b/game/commandline.cpp index 1c1cf0f76..3d3d25ea0 100644 --- a/game/commandline.cpp +++ b/game/commandline.cpp @@ -99,15 +99,15 @@ CommandLine::CommandLine(int argc, const char** argv) { if(i 0) && (settings.vidResIndex==0); + settings.aaEnabled = (Gothic::options().aaPreset>0) && (settings.vidResIndex==0); + if(prevVidResIndex!=settings.vidResIndex) { resetSwapchain(); } @@ -291,10 +304,31 @@ void Renderer::prepareUniforms() { tonemapping.uboTone.set(1, sceneLinear, smpB); } - if(settings.fxaaEnabled) { + if(settings.aaEnabled) { auto smpB = Sampler::bilinear(); smpB.setClamping(ClampMode::ClampToEdge); - fxaa.ubo.set(0, fxaa.sceneTonemapped, smpB); + + cmaa2.detectEdges2x2Ubo.set(0, sceneLinear, smpB); + cmaa2.detectEdges2x2Ubo.set(1, cmaa2.workingEdges); + cmaa2.detectEdges2x2Ubo.set(2, cmaa2.shapeCandidates); + cmaa2.detectEdges2x2Ubo.set(5, cmaa2.deferredBlendItemListHeads); + cmaa2.detectEdges2x2Ubo.set(6, cmaa2.controlBuffer); + cmaa2.detectEdges2x2Ubo.set(7, cmaa2.indirectBuffer); + + cmaa2.processCandidatesUbo.set(0, sceneLinear, smpB); + cmaa2.processCandidatesUbo.set(1, cmaa2.workingEdges); + cmaa2.processCandidatesUbo.set(2, cmaa2.shapeCandidates); + cmaa2.processCandidatesUbo.set(3, cmaa2.deferredBlendLocationList); + cmaa2.processCandidatesUbo.set(4, cmaa2.deferredBlendItemList); + cmaa2.processCandidatesUbo.set(5, cmaa2.deferredBlendItemListHeads); + cmaa2.processCandidatesUbo.set(6, cmaa2.controlBuffer); + cmaa2.processCandidatesUbo.set(7, cmaa2.indirectBuffer); + + cmaa2.defferedColorApplyUbo.set(0, sceneLinear); + cmaa2.defferedColorApplyUbo.set(3, cmaa2.deferredBlendLocationList); + cmaa2.defferedColorApplyUbo.set(4, cmaa2.deferredBlendItemList); + cmaa2.defferedColorApplyUbo.set(5, cmaa2.deferredBlendItemListHeads); + cmaa2.defferedColorApplyUbo.set(6, cmaa2.controlBuffer); } shadow.ubo.set(0, wview->sceneGlobals().uboGlobal[SceneGlobals::V_Main]); @@ -560,26 +594,18 @@ void Renderer::draw(Tempest::Attachment& 
result, Encoder& cmd, ui wview->drawFog(cmd,fId); } - auto* tonemappingRt = &result; - if(settings.fxaaEnabled) { - assert(!fxaa.sceneTonemapped.isEmpty()); - tonemappingRt = &fxaa.sceneTonemapped; - } - - cmd.setFramebuffer({{*tonemappingRt, Tempest::Discard, Tempest::Preserve}}); - cmd.setDebugMarker("Tonemapping"); - drawTonemapping(cmd); - - if(settings.fxaaEnabled) { - cmd.setFramebuffer({ {result, Tempest::Discard, Tempest::Preserve} }); - cmd.setDebugMarker("Fxaa"); - drawFxaa(cmd); + if(settings.aaEnabled) { + cmd.setDebugMarker("CMAA2 & Tonemapping"); + drawCMAA2(result, cmd); + } else { + cmd.setDebugMarker("Tonemapping"); + drawTonemapping(result, cmd); } wview->postFrameupdate(); } -void Renderer::drawTonemapping(Encoder& cmd) { +void Renderer::drawTonemapping(Attachment& result, Encoder& cmd) { struct Push { float brightness = 0; float contrast = 1; @@ -596,34 +622,50 @@ void Renderer::drawTonemapping(Encoder& cmd) { if(mul>0) p.mul = mul; + cmd.setFramebuffer({ {result, Tempest::Discard, Tempest::Preserve} }); cmd.setUniforms(*tonemapping.pso, tonemapping.uboTone, &p, sizeof(p)); cmd.draw(Resources::fsqVbo()); } -void Renderer::drawFxaa(Encoder& cmd) { +void Renderer::drawCMAA2(Tempest::Attachment& result, Tempest::Encoder& cmd) { + const IVec3 inputGroupSize = cmaa2.detectEdges2x2->workGroupSize(); + const IVec3 outputGroupSize = inputGroupSize - IVec3(2, 2, 0); + const uint32_t groupCountX = uint32_t((sceneLinear.w() + outputGroupSize.x * 2 - 1) / (outputGroupSize.x * 2)); + const uint32_t groupCountY = uint32_t((sceneLinear.h() + outputGroupSize.y * 2 - 1) / (outputGroupSize.y * 2)); + + cmd.setFramebuffer({}); - struct PushConstantsFxaa { - float fxaaInverseSharpnessCoeff; - float fxaaQualitySubpix; - float fxaaQualityEdgeThreshold; - float fxaaQualityEdgeThresholdMin; - float fxaaConsoleEdgeSharpness; - float fxaaConsoleEdgeThreshold; - float fxaaConsoleEdgeThresholdMin; - } pushConstantsFxaa; + // detect edges + 
cmd.setUniforms(*cmaa2.detectEdges2x2, cmaa2.detectEdges2x2Ubo); + cmd.dispatch(groupCountX, groupCountY, 1); + + // process candidates pass + cmd.setUniforms(*cmaa2.processCandidates, cmaa2.processCandidatesUbo); + cmd.dispatchIndirect(cmaa2.indirectBuffer, 0); + + // deferred color apply + struct Push { + float brightness = 0; + float contrast = 1; + float gamma = 1.f/2.2f; + float mul = 1; + }; + Push p; + p.brightness = (settings.zVidBrightness - 0.5f)*0.1f; + p.contrast = std::max(1.5f - settings.zVidContrast, 0.01f); + p.gamma = p.gamma/std::max(2.0f*settings.zVidGamma, 0.01f); - // for now filled with default values (see Fxaa3_11.h) - pushConstantsFxaa.fxaaInverseSharpnessCoeff = 0.5f; - pushConstantsFxaa.fxaaQualitySubpix = 0.75f; - pushConstantsFxaa.fxaaQualityEdgeThreshold = 0.166f; - pushConstantsFxaa.fxaaQualityEdgeThresholdMin = 0.0833f; - pushConstantsFxaa.fxaaConsoleEdgeSharpness = 8.f; - pushConstantsFxaa.fxaaConsoleEdgeThreshold = 0.125f; - pushConstantsFxaa.fxaaConsoleEdgeThresholdMin = 0.05f; + static float mul = 0.f; + if(mul>0) + p.mul = mul; - cmd.setUniforms(*fxaa.pso, fxaa.ubo, &pushConstantsFxaa, sizeof(pushConstantsFxaa)); + cmd.setFramebuffer({{result, Tempest::Discard, Tempest::Preserve}}); + cmd.setUniforms(*tonemapping.pso, tonemapping.uboTone, &p, sizeof(p)); cmd.draw(Resources::fsqVbo()); + + cmd.setUniforms(*cmaa2.defferedColorApply, cmaa2.defferedColorApplyUbo, &p, sizeof(p)); + cmd.drawIndirect(cmaa2.indirectBuffer, 3*sizeof(uint32_t)); } void Renderer::stashSceneAux(Encoder& cmd, uint8_t fId) { diff --git a/game/graphics/renderer.h b/game/graphics/renderer.h index 037201e5c..c84d7563c 100644 --- a/game/graphics/renderer.h +++ b/game/graphics/renderer.h @@ -55,8 +55,8 @@ class Renderer final { void drawSky (Tempest::Encoder& cmd, uint8_t fId, WorldView& view); void drawAmbient (Tempest::Encoder& cmd, const WorldView& view); void draw (Tempest::Attachment& result, Tempest::Encoder& cmd, uint8_t fId); - void drawTonemapping 
(Tempest::Encoder& cmd); - void drawFxaa (Tempest::Encoder& cmd); + void drawTonemapping (Tempest::Attachment& result, Tempest::Encoder& cmd); + void drawCMAA2 (Tempest::Attachment& result, Tempest::Encoder& cmd); void drawReflections (Tempest::Encoder& cmd, uint8_t fId); void drawUnderwater (Tempest::Encoder& cmd, uint8_t fId); @@ -73,7 +73,7 @@ class Renderer final { bool zEnvMappingEnabled = false; bool zCloudShadowScale = false; bool giEnabled = false; - bool fxaaEnabled = false; + bool aaEnabled = false; float zVidBrightness = 0.5; float zVidContrast = 0.5; @@ -124,15 +124,28 @@ class Renderer final { } ssao; struct Tonemapping { - Tempest::RenderPipeline* pso = nullptr; - Tempest::DescriptorSet uboTone; + Tempest::RenderPipeline* pso = nullptr; + Tempest::DescriptorSet uboTone; } tonemapping; - struct Fxaa { - Tempest::RenderPipeline* pso = nullptr; - Tempest::DescriptorSet ubo; - Tempest::Attachment sceneTonemapped; - } fxaa; + struct Cmaa2 { + Tempest::ComputePipeline* detectEdges2x2 = nullptr; + Tempest::DescriptorSet detectEdges2x2Ubo; + + Tempest::ComputePipeline* processCandidates = nullptr; + Tempest::DescriptorSet processCandidatesUbo; + + Tempest::RenderPipeline* defferedColorApply = nullptr; + Tempest::DescriptorSet defferedColorApplyUbo; + + Tempest::StorageImage workingEdges; + Tempest::StorageBuffer shapeCandidates; + Tempest::StorageBuffer deferredBlendLocationList; + Tempest::StorageBuffer deferredBlendItemList; + Tempest::StorageImage deferredBlendItemListHeads; + Tempest::StorageBuffer controlBuffer; + Tempest::StorageBuffer indirectBuffer; + } cmaa2; struct { Tempest::StorageImage hiZ; diff --git a/game/graphics/shaders.cpp b/game/graphics/shaders.cpp index f8aa0814a..130f6418a 100644 --- a/game/graphics/shaders.cpp +++ b/game/graphics/shaders.cpp @@ -128,13 +128,18 @@ Shaders::Shaders() { tonemapping = postEffect("tonemapping", "tonemapping", RenderState::ZTestMode::Always); tonemappingUpscale = postEffect("tonemapping", "tonemapping_up", 
RenderState::ZTestMode::Always); - const auto fxaaZTestMode = RenderState::ZTestMode::Always; - fxaaPresets[uint32_t(FxaaPreset::OFF)] = Tempest::RenderPipeline(); - fxaaPresets[uint32_t(FxaaPreset::CONSOLE)] = postEffect("fxaa", "fxaa_quality_0", fxaaZTestMode); - fxaaPresets[uint32_t(FxaaPreset::PC_LOW)] = postEffect("fxaa", "fxaa_quality_1", fxaaZTestMode); - fxaaPresets[uint32_t(FxaaPreset::PC_MEDIUM)] = postEffect("fxaa", "fxaa_quality_2", fxaaZTestMode); - fxaaPresets[uint32_t(FxaaPreset::PC_HIGH)] = postEffect("fxaa", "fxaa_quality_3", fxaaZTestMode); - fxaaPresets[uint32_t(FxaaPreset::PC_EXTREME)] = postEffect("fxaa", "fxaa_quality_4", fxaaZTestMode); + cmaa2EdgeColor2x2Presets[uint32_t(AaPreset::OFF)] = Tempest::ComputePipeline(); + cmaa2EdgeColor2x2Presets[uint32_t(AaPreset::MEDIUM)] = computeShader("cmaa2_edges_color2x2_quality_0.comp.sprv"); + cmaa2EdgeColor2x2Presets[uint32_t(AaPreset::ULTRA)] = computeShader("cmaa2_edges_color2x2_quality_1.comp.sprv"); + + cmaa2ProcessCandidates = computeShader("cmaa2_process_candidates.comp.sprv"); + { + auto sh = GothicShader::get("cmaa2_deferred_color_apply_2x2.vert.sprv"); + auto vs = device.shader(sh.data,sh.len); + sh = GothicShader::get("cmaa2_deferred_color_apply_2x2.frag.sprv"); + auto fs = device.shader(sh.data,sh.len); + cmaa2DeferredColorApply2x2 = device.pipeline(Tempest::Points,RenderState(),vs,fs); + } hiZPot = computeShader("hiz_pot.comp.sprv"); hiZMip = computeShader("hiz_mip.comp.sprv"); diff --git a/game/graphics/shaders.h b/game/graphics/shaders.h index 4a356d19d..f23f2f8aa 100644 --- a/game/graphics/shaders.h +++ b/game/graphics/shaders.h @@ -54,7 +54,9 @@ class Shaders { Tempest::RenderPipeline tonemapping, tonemappingUpscale; // AA - Tempest::RenderPipeline fxaaPresets[uint32_t(FxaaPreset::PRESETS_COUNT)]; + Tempest::ComputePipeline cmaa2EdgeColor2x2Presets[uint32_t(AaPreset::PRESETS_COUNT)]; + Tempest::ComputePipeline cmaa2ProcessCandidates; + Tempest::RenderPipeline cmaa2DeferredColorApply2x2; 
// HiZ Tempest::ComputePipeline hiZPot, hiZMip; diff --git a/lib/Tempest b/lib/Tempest index 681995274..71d8312c2 160000 --- a/lib/Tempest +++ b/lib/Tempest @@ -1 +1 @@ -Subproject commit 681995274c724c78f9f33bbbbead0b6dc257a0dd +Subproject commit 71d8312c2d07432849c2af5223389f4f0af3af94 diff --git a/shader/CMakeLists.txt b/shader/CMakeLists.txt index 8f78b3702..22e172397 100644 --- a/shader/CMakeLists.txt +++ b/shader/CMakeLists.txt @@ -270,12 +270,12 @@ add_shader(probe_lighting.comp lighting/rt/probe_lighting.comp) add_shader(probe_ambient.vert copy.vert) add_shader(probe_ambient.frag lighting/rt/probe_ambient.frag) -add_shader(fxaa.vert copy.vert) -add_shader(fxaa_quality_0.frag antialiasing/fxaa.frag -DFXAA_QUALITY_SETTING=0) -add_shader(fxaa_quality_1.frag antialiasing/fxaa.frag -DFXAA_QUALITY_SETTING=1) -add_shader(fxaa_quality_2.frag antialiasing/fxaa.frag -DFXAA_QUALITY_SETTING=2) -add_shader(fxaa_quality_3.frag antialiasing/fxaa.frag -DFXAA_QUALITY_SETTING=3) -add_shader(fxaa_quality_4.frag antialiasing/fxaa.frag -DFXAA_QUALITY_SETTING=4) +add_shader(cmaa2_edges_color2x2_quality_0.comp antialiasing/cmaa2/edge_color2x2.comp -DCMAA2_STATIC_QUALITY_PRESET=0) +add_shader(cmaa2_edges_color2x2_quality_1.comp antialiasing/cmaa2/edge_color2x2.comp -DCMAA2_STATIC_QUALITY_PRESET=1) + +add_shader(cmaa2_process_candidates.comp antialiasing/cmaa2/process_candidates.comp) +add_shader(cmaa2_deferred_color_apply_2x2.vert antialiasing/cmaa2/deferred_color_apply_2x2.vert) +add_shader(cmaa2_deferred_color_apply_2x2.frag antialiasing/cmaa2/deferred_color_apply_2x2.frag) add_custom_command( OUTPUT ${HEADER} ${CPP} diff --git a/shader/antialiasing/Fxaa3_11.h b/shader/antialiasing/Fxaa3_11.h deleted file mode 100644 index 99a7fbfaf..000000000 --- a/shader/antialiasing/Fxaa3_11.h +++ /dev/null @@ -1,2061 +0,0 @@ -/*============================================================================ - - - NVIDIA FXAA 3.11 by TIMOTHY LOTTES - - 
------------------------------------------------------------------------------- -COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED. ------------------------------------------------------------------------------- -TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED -*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA -OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR -CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR -LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, -OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE -THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - ------------------------------------------------------------------------------- - INTEGRATION CHECKLIST ------------------------------------------------------------------------------- -(1.) -In the shader source, setup defines for the desired configuration. -When providing multiple shaders (for different presets), -simply setup the defines differently in multiple files. -Example, - - #define FXAA_PC 1 - #define FXAA_HLSL_5 1 - #define FXAA_QUALITY__PRESET 12 - -Or, - - #define FXAA_360 1 - -Or, - - #define FXAA_PS3 1 - -Etc. - -(2.) -Then include this file using the include directive, - - -"Fxaa3_11.h" - -(3.) -Then call the FXAA pixel shader from within your desired shader. -Look at the FXAA Quality FxaaPixelShader() for docs on inputs. -As for FXAA 3.11 all inputs for all shaders are the same -to enable easy porting between platforms. - - return FxaaPixelShader(...); - -(4.) -Insure pass prior to FXAA outputs RGBL (see next section). -Or use, - - #define FXAA_GREEN_AS_LUMA 1 - -(5.) 
-Setup engine to provide the following constants -which are used in the FxaaPixelShader() inputs, - - FxaaFloat2 fxaaQualityRcpFrame, - FxaaFloat4 fxaaConsoleRcpFrameOpt, - FxaaFloat4 fxaaConsoleRcpFrameOpt2, - FxaaFloat4 fxaaConsole360RcpFrameOpt2, - FxaaFloat fxaaQualitySubpix, - FxaaFloat fxaaQualityEdgeThreshold, - FxaaFloat fxaaQualityEdgeThresholdMin, - FxaaFloat fxaaConsoleEdgeSharpness, - FxaaFloat fxaaConsoleEdgeThreshold, - FxaaFloat fxaaConsoleEdgeThresholdMin, - FxaaFloat4 fxaaConsole360ConstDir - -Look at the FXAA Quality FxaaPixelShader() for docs on inputs. - -(6.) -Have FXAA vertex shader run as a full screen triangle, -and output "pos" and "fxaaConsolePosPos" -such that inputs in the pixel shader provide, - - // {xy} = center of pixel - FxaaFloat2 pos, - - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - FxaaFloat4 fxaaConsolePosPos, - -(7.) -Insure the texture sampler(s) used by FXAA are set to bilinear filtering. - - ------------------------------------------------------------------------------- - INTEGRATION - RGBL AND COLORSPACE ------------------------------------------------------------------------------- -FXAA3 requires RGBL as input unless the following is set, - - #define FXAA_GREEN_AS_LUMA 1 - -In which case the engine uses green in place of luma, -and requires RGB input is in a non-linear colorspace. - -RGB should be LDR (low dynamic range). -Specifically do FXAA after tonemapping. - -RGB data as returned by a texture fetch can be non-linear, -or linear when FXAA_GREEN_AS_LUMA is not set. -Note an "sRGB format" texture counts as linear, -because the result of a texture fetch is linear data. -Regular "RGBA8" textures in the sRGB colorspace are non-linear. - -If FXAA_GREEN_AS_LUMA is not set, -luma must be stored in the alpha channel prior to running FXAA. -This luma should be in a perceptual space (could be gamma 2.0). 
-Example pass before FXAA where output is gamma 2.0 encoded, - - color.rgb = ToneMap(color.rgb); // linear color output - color.rgb = sqrt(color.rgb); // gamma 2.0 color output - return color; - -To use FXAA, - - color.rgb = ToneMap(color.rgb); // linear color output - color.rgb = sqrt(color.rgb); // gamma 2.0 color output - color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma - return color; - -Another example where output is linear encoded, -say for instance writing to an sRGB formated render target, -where the render target does the conversion back to sRGB after blending, - - color.rgb = ToneMap(color.rgb); // linear color output - return color; - -To use FXAA, - - color.rgb = ToneMap(color.rgb); // linear color output - color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma - return color; - -Getting luma correct is required for the algorithm to work correctly. - - ------------------------------------------------------------------------------- - BEING LINEARLY CORRECT? ------------------------------------------------------------------------------- -Applying FXAA to a framebuffer with linear RGB color will look worse. -This is very counter intuitive, but happends to be true in this case. -The reason is because dithering artifacts will be more visible -in a linear colorspace. - - ------------------------------------------------------------------------------- - COMPLEX INTEGRATION ------------------------------------------------------------------------------- -Q. What if the engine is blending into RGB before wanting to run FXAA? - -A. In the last opaque pass prior to FXAA, - have the pass write out luma into alpha. - Then blend into RGB only. - FXAA should be able to run ok - assuming the blending pass did not any add aliasing. - This should be the common case for particles and common blending passes. - -A. Or use FXAA_GREEN_AS_LUMA. 
- -============================================================================*/ - -/*============================================================================ - - INTEGRATION KNOBS - -============================================================================*/ -// -// FXAA_PS3 and FXAA_360 choose the console algorithm (FXAA3 CONSOLE). -// FXAA_360_OPT is a prototype for the new optimized 360 version. -// -// 1 = Use API. -// 0 = Don't use API. -// -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_PS3 - #define FXAA_PS3 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_360 - #define FXAA_360 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_360_OPT - #define FXAA_360_OPT 0 -#endif -/*==========================================================================*/ -#ifndef FXAA_PC - // - // FXAA Quality - // The high quality PC algorithm. - // - #define FXAA_PC 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_PC_CONSOLE - // - // The console algorithm for PC is included - // for developers targeting really low spec machines. - // Likely better to just run FXAA_PC, and use a really low preset. 
- // - #define FXAA_PC_CONSOLE 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_GLSL_120 - #define FXAA_GLSL_120 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_GLSL_130 - #define FXAA_GLSL_130 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_HLSL_3 - #define FXAA_HLSL_3 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_HLSL_4 - #define FXAA_HLSL_4 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_HLSL_5 - #define FXAA_HLSL_5 0 -#endif -/*==========================================================================*/ -#ifndef FXAA_GREEN_AS_LUMA - // - // For those using non-linear color, - // and either not able to get luma in alpha, or not wanting to, - // this enables FXAA to run using green as a proxy for luma. - // So with this enabled, no need to pack luma in alpha. - // - // This will turn off AA on anything which lacks some amount of green. - // Pure red and blue or combination of only R and B, will get no AA. - // - // Might want to lower the settings for both, - // fxaaConsoleEdgeThresholdMin - // fxaaQualityEdgeThresholdMin - // In order to insure AA does not get turned off on colors - // which contain a minor amount of green. - // - // 1 = On. - // 0 = Off. - // - #define FXAA_GREEN_AS_LUMA 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_EARLY_EXIT - // - // Controls algorithm's early exit path. - // On PS3 turning this ON adds 2 cycles to the shader. - // On 360 turning this OFF adds 10ths of a millisecond to the shader. - // Turning this off on console will result in a more blurry image. - // So this defaults to on. - // - // 1 = On. - // 0 = Off. 
- // - #define FXAA_EARLY_EXIT 1 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_DISCARD - // - // Only valid for PC OpenGL currently. - // Probably will not work when FXAA_GREEN_AS_LUMA = 1. - // - // 1 = Use discard on pixels which don't need AA. - // For APIs which enable concurrent TEX+ROP from same surface. - // 0 = Return unchanged color on pixels which don't need AA. - // - #define FXAA_DISCARD 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_FAST_PIXEL_OFFSET - // - // Used for GLSL 120 only. - // - // 1 = GL API supports fast pixel offsets - // 0 = do not use fast pixel offsets - // - #ifdef GL_EXT_gpu_shader4 - #define FXAA_FAST_PIXEL_OFFSET 1 - #endif - #ifdef GL_NV_gpu_shader5 - #define FXAA_FAST_PIXEL_OFFSET 1 - #endif - #ifdef GL_ARB_gpu_shader5 - #define FXAA_FAST_PIXEL_OFFSET 1 - #endif - #ifndef FXAA_FAST_PIXEL_OFFSET - #define FXAA_FAST_PIXEL_OFFSET 0 - #endif -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_GATHER4_ALPHA - // - // 1 = API supports gather4 on alpha channel. - // 0 = API does not support gather4 on alpha channel. - // - //#if (FXAA_HLSL_5 == 1) - // #define FXAA_GATHER4_ALPHA 1 - //#endif - //#ifdef GL_ARB_gpu_shader5 - // #define FXAA_GATHER4_ALPHA 1 - //#endif - //#ifdef GL_NV_gpu_shader5 - // #define FXAA_GATHER4_ALPHA 1 - //#endif - //#ifndef FXAA_GATHER4_ALPHA - // #define FXAA_GATHER4_ALPHA 0 - //#endif - #define FXAA_GATHER4_ALPHA 0 -#endif - -/*============================================================================ - FXAA CONSOLE PS3 - TUNING KNOBS -============================================================================*/ -#ifndef FXAA_CONSOLE__PS3_EDGE_SHARPNESS - // - // Consoles the sharpness of edges on PS3 only. - // Non-PS3 tuning is done with shader input. - // - // Due to the PS3 being ALU bound, - // there are only two safe values here: 4 and 8. 
- // These options use the shaders ability to a free *|/ by 2|4|8. - // - // 8.0 is sharper - // 4.0 is softer - // 2.0 is really soft (good for vector graphics inputs) - // - #if 1 - #define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 8.0 - #endif - #if 0 - #define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 4.0 - #endif - #if 0 - #define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 2.0 - #endif -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_CONSOLE__PS3_EDGE_THRESHOLD - // - // Only effects PS3. - // Non-PS3 tuning is done with shader input. - // - // The minimum amount of local contrast required to apply algorithm. - // The console setting has a different mapping than the quality setting. - // - // This only applies when FXAA_EARLY_EXIT is 1. - // - // Due to the PS3 being ALU bound, - // there are only two safe values here: 0.25 and 0.125. - // These options use the shaders ability to a free *|/ by 2|4|8. - // - // 0.125 leaves less aliasing, but is softer - // 0.25 leaves more aliasing, and is sharper - // - #if 1 - #define FXAA_CONSOLE__PS3_EDGE_THRESHOLD 0.125 - #else - #define FXAA_CONSOLE__PS3_EDGE_THRESHOLD 0.25 - #endif -#endif - -/*============================================================================ - FXAA QUALITY - TUNING KNOBS ------------------------------------------------------------------------------- -NOTE the other tuning knobs are now in the shader function inputs! -============================================================================*/ -#ifndef FXAA_QUALITY__PRESET - // - // Choose the quality preset. - // This needs to be compiled into the shader as it effects code. - // Best option to include multiple presets is to - // in each shader define the preset, then include this file. 
- // - // OPTIONS - // ----------------------------------------------------------------------- - // 10 to 15 - default medium dither (10=fastest, 15=highest quality) - // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality) - // 39 - no dither, very expensive - // - // NOTES - // ----------------------------------------------------------------------- - // 12 = slightly faster then FXAA 3.9 and higher edge quality (default) - // 13 = about same speed as FXAA 3.9 and better than 12 - // 23 = closest to FXAA 3.9 visually and performance wise - // _ = the lowest digit is directly related to performance - // _ = the highest digit is directly related to style - // - #define FXAA_QUALITY__PRESET 12 -#endif - - -/*============================================================================ - - FXAA QUALITY - PRESETS - -============================================================================*/ - -/*============================================================================ - FXAA QUALITY - MEDIUM DITHER PRESETS -============================================================================*/ -#if (FXAA_QUALITY__PRESET == 10) - #define FXAA_QUALITY__PS 3 - #define FXAA_QUALITY__P0 1.5 - #define FXAA_QUALITY__P1 3.0 - #define FXAA_QUALITY__P2 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 11) - #define FXAA_QUALITY__PS 4 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 3.0 - #define FXAA_QUALITY__P3 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 12) - #define FXAA_QUALITY__PS 5 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 4.0 - #define FXAA_QUALITY__P4 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 13) - #define 
FXAA_QUALITY__PS 6 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 4.0 - #define FXAA_QUALITY__P5 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 14) - #define FXAA_QUALITY__PS 7 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 4.0 - #define FXAA_QUALITY__P6 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 15) - #define FXAA_QUALITY__PS 8 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 4.0 - #define FXAA_QUALITY__P7 12.0 -#endif - -/*============================================================================ - FXAA QUALITY - LOW DITHER PRESETS -============================================================================*/ -#if (FXAA_QUALITY__PRESET == 20) - #define FXAA_QUALITY__PS 3 - #define FXAA_QUALITY__P0 1.5 - #define FXAA_QUALITY__P1 2.0 - #define FXAA_QUALITY__P2 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 21) - #define FXAA_QUALITY__PS 4 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 22) - #define FXAA_QUALITY__PS 5 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 8.0 -#endif 
-/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 23) - #define FXAA_QUALITY__PS 6 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 24) - #define FXAA_QUALITY__PS 7 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 3.0 - #define FXAA_QUALITY__P6 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 25) - #define FXAA_QUALITY__PS 8 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 4.0 - #define FXAA_QUALITY__P7 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 26) - #define FXAA_QUALITY__PS 9 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 4.0 - #define FXAA_QUALITY__P8 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 27) - #define FXAA_QUALITY__PS 10 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 4.0 - #define FXAA_QUALITY__P9 8.0 -#endif 
-/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 28) - #define FXAA_QUALITY__PS 11 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 2.0 - #define FXAA_QUALITY__P9 4.0 - #define FXAA_QUALITY__P10 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 29) - #define FXAA_QUALITY__PS 12 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 2.0 - #define FXAA_QUALITY__P9 2.0 - #define FXAA_QUALITY__P10 4.0 - #define FXAA_QUALITY__P11 8.0 -#endif - -/*============================================================================ - FXAA QUALITY - EXTREME QUALITY -============================================================================*/ -#if (FXAA_QUALITY__PRESET == 39) - #define FXAA_QUALITY__PS 12 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.0 - #define FXAA_QUALITY__P2 1.0 - #define FXAA_QUALITY__P3 1.0 - #define FXAA_QUALITY__P4 1.0 - #define FXAA_QUALITY__P5 1.5 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 2.0 - #define FXAA_QUALITY__P9 2.0 - #define FXAA_QUALITY__P10 4.0 - #define FXAA_QUALITY__P11 8.0 -#endif - - - -/*============================================================================ - - API PORTING - -============================================================================*/ -#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1) - #define FxaaBool bool - #define FxaaDiscard discard - #define FxaaFloat float - #define FxaaFloat2 
vec2 - #define FxaaFloat3 vec3 - #define FxaaFloat4 vec4 - #define FxaaHalf float - #define FxaaHalf2 vec2 - #define FxaaHalf3 vec3 - #define FxaaHalf4 vec4 - #define FxaaInt2 ivec2 - #define FxaaSat(x) clamp(x, 0.0, 1.0) - #define FxaaTex sampler2D -#else - #define FxaaBool bool - #define FxaaDiscard clip(-1) - #define FxaaFloat float - #define FxaaFloat2 float2 - #define FxaaFloat3 float3 - #define FxaaFloat4 float4 - #define FxaaHalf half - #define FxaaHalf2 half2 - #define FxaaHalf3 half3 - #define FxaaHalf4 half4 - #define FxaaSat(x) saturate(x) -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_GLSL_120 == 1) - // Requires, - // #version 120 - // And at least, - // #extension GL_EXT_gpu_shader4 : enable - // (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9) - #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0) - #if (FXAA_FAST_PIXEL_OFFSET == 1) - #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) - #else - #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) - #endif - #if (FXAA_GATHER4_ALPHA == 1) - // use #extension GL_ARB_gpu_shader5 : enable - #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) - #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) - #define FxaaTexGreen4(t, p) textureGather(t, p, 1) - #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) - #endif -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_GLSL_130 == 1) - // Requires "#version 130" or better - #define FxaaTexTop(t, p) textureLod(t, p, 0.0) - #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) - #if (FXAA_GATHER4_ALPHA == 1) - // use #extension GL_ARB_gpu_shader5 : enable - #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) - #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) - #define FxaaTexGreen4(t, p) textureGather(t, p, 1) - #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) - #endif -#endif 
-/*--------------------------------------------------------------------------*/ -#if (FXAA_HLSL_3 == 1) || (FXAA_360 == 1) || (FXAA_PS3 == 1) - #define FxaaInt2 float2 - #define FxaaTex sampler2D - #define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0)) - #define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0)) -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_HLSL_4 == 1) - #define FxaaInt2 int2 - struct FxaaTex { SamplerState smpl; Texture2D tex; float4 UVMinMax; }; - #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw), 0.0) - #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw), 0.0, o) -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_HLSL_5 == 1) - #define FxaaInt2 int2 - struct FxaaTex { SamplerState smpl; Texture2D tex; float4 UVMinMax; }; - #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw), 0.0) - #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw), 0.0, o) - #define FxaaTexAlpha4(t, p) t.tex.GatherAlpha(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw)) - #define FxaaTexOffAlpha4(t, p, o) t.tex.GatherAlpha(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw), o) - #define FxaaTexGreen4(t, p) t.tex.GatherGreen(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw)) - #define FxaaTexOffGreen4(t, p, o) t.tex.GatherGreen(t.smpl, clamp(p, t.UVMinMax.xy, t.UVMinMax.zw), o) -#endif - - -/*============================================================================ - GREEN AS LUMA OPTION SUPPORT FUNCTION -============================================================================*/ -#if (FXAA_GREEN_AS_LUMA == 0) - - #if 1 - // allows the tonemapper to not output Luma in alpha channel to use a R10G10BA2 - FxaaFloat FxaaLuma(FxaaFloat4 rgba) - { - return dot(rgba.rgb, FxaaFloat3(0.299f, 0.587f, 0.114f)); - } 
- - #else - FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.w; } - - #endif - -#else - FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.y; } -#endif - - - - -/*============================================================================ - - FXAA3 QUALITY - PC - -============================================================================*/ -#if (FXAA_PC == 1) -/*--------------------------------------------------------------------------*/ -FxaaFloat4 FxaaPixelShader( - // - // Use noperspective interpolation here (turn off perspective interpolation). - // {xy} = center of pixel - FxaaFloat2 pos, - // - // Used only for FXAA Console, and not used on the 360 version. - // Use noperspective interpolation here (turn off perspective interpolation). - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - FxaaFloat4 fxaaConsolePosPos, - // - // Input color texture. - // {rgb_} = color in linear or perceptual color space - // if (FXAA_GREEN_AS_LUMA == 0) - // {___a} = luma in perceptual color space (not linear) - FxaaTex tex, - // - // Only used on the optimized 360 version of FXAA Console. - // For everything but 360, just use the same input here as for "tex". - // For 360, same texture, just alias with a 2nd sampler. - // This sampler needs to have an exponent bias of -1. - FxaaTex fxaaConsole360TexExpBiasNegOne, - // - // Only used on the optimized 360 version of FXAA Console. - // For everything but 360, just use the same input here as for "tex". - // For 360, same texture, just alias with a 3nd sampler. - // This sampler needs to have an exponent bias of -2. - FxaaTex fxaaConsole360TexExpBiasNegTwo, - // - // Only used on FXAA Quality. - // This must be from a constant/uniform. - // {x_} = 1.0/screenWidthInPixels - // {_y} = 1.0/screenHeightInPixels - FxaaFloat2 fxaaQualityRcpFrame, - // - // Only used on FXAA Console. - // This must be from a constant/uniform. - // This effects sub-pixel AA quality and inversely sharpness. 
- // Where N ranges between, - // N = 0.50 (default) - // N = 0.33 (sharper) - // {x___} = -N/screenWidthInPixels - // {_y__} = -N/screenHeightInPixels - // {__z_} = N/screenWidthInPixels - // {___w} = N/screenHeightInPixels - FxaaFloat4 fxaaConsoleRcpFrameOpt, - // - // Only used on FXAA Console. - // Not used on 360, but used on PS3 and PC. - // This must be from a constant/uniform. - // {x___} = -2.0/screenWidthInPixels - // {_y__} = -2.0/screenHeightInPixels - // {__z_} = 2.0/screenWidthInPixels - // {___w} = 2.0/screenHeightInPixels - FxaaFloat4 fxaaConsoleRcpFrameOpt2, - // - // Only used on FXAA Console. - // Only used on 360 in place of fxaaConsoleRcpFrameOpt2. - // This must be from a constant/uniform. - // {x___} = 8.0/screenWidthInPixels - // {_y__} = 8.0/screenHeightInPixels - // {__z_} = -4.0/screenWidthInPixels - // {___w} = -4.0/screenHeightInPixels - FxaaFloat4 fxaaConsole360RcpFrameOpt2, - // - // Only used on FXAA Quality. - // This used to be the FXAA_QUALITY__SUBPIX define. - // It is here now to allow easier tuning. - // Choose the amount of sub-pixel aliasing removal. - // This can effect sharpness. - // 1.00 - upper limit (softer) - // 0.75 - default amount of filtering - // 0.50 - lower limit (sharper, less sub-pixel aliasing removal) - // 0.25 - almost off - // 0.00 - completely off - FxaaFloat fxaaQualitySubpix, - // - // Only used on FXAA Quality. - // This used to be the FXAA_QUALITY__EDGE_THRESHOLD define. - // It is here now to allow easier tuning. - // The minimum amount of local contrast required to apply algorithm. - // 0.333 - too little (faster) - // 0.250 - low quality - // 0.166 - default - // 0.125 - high quality - // 0.063 - overkill (slower) - FxaaFloat fxaaQualityEdgeThreshold, - // - // Only used on FXAA Quality. - // This used to be the FXAA_QUALITY__EDGE_THRESHOLD_MIN define. - // It is here now to allow easier tuning. - // Trims the algorithm from processing darks. 
- // 0.0833 - upper limit (default, the start of visible unfiltered edges) - // 0.0625 - high quality (faster) - // 0.0312 - visible limit (slower) - // Special notes when using FXAA_GREEN_AS_LUMA, - // Likely want to set this to zero. - // As colors that are mostly not-green - // will appear very dark in the green channel! - // Tune by looking at mostly non-green content, - // then start at zero and increase until aliasing is a problem. - FxaaFloat fxaaQualityEdgeThresholdMin, - // - // Only used on FXAA Console. - // This used to be the FXAA_CONSOLE__EDGE_SHARPNESS define. - // It is here now to allow easier tuning. - // This does not effect PS3, as this needs to be compiled in. - // Use FXAA_CONSOLE__PS3_EDGE_SHARPNESS for PS3. - // Due to the PS3 being ALU bound, - // there are only three safe values here: 2 and 4 and 8. - // These options use the shaders ability to a free *|/ by 2|4|8. - // For all other platforms can be a non-power of two. - // 8.0 is sharper (default!!!) - // 4.0 is softer - // 2.0 is really soft (good only for vector graphics inputs) - FxaaFloat fxaaConsoleEdgeSharpness, - // - // Only used on FXAA Console. - // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD define. - // It is here now to allow easier tuning. - // This does not effect PS3, as this needs to be compiled in. - // Use FXAA_CONSOLE__PS3_EDGE_THRESHOLD for PS3. - // Due to the PS3 being ALU bound, - // there are only two safe values here: 1/4 and 1/8. - // These options use the shaders ability to a free *|/ by 2|4|8. - // The console setting has a different mapping than the quality setting. - // Other platforms can use other values. - // 0.125 leaves less aliasing, but is softer (default!!!) - // 0.25 leaves more aliasing, and is sharper - FxaaFloat fxaaConsoleEdgeThreshold, - // - // Only used on FXAA Console. - // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD_MIN define. - // It is here now to allow easier tuning. - // Trims the algorithm from processing darks. 
- // The console setting has a different mapping than the quality setting. - // This only applies when FXAA_EARLY_EXIT is 1. - // This does not apply to PS3, - // PS3 was simplified to avoid more shader instructions. - // 0.06 - faster but more aliasing in darks - // 0.05 - default - // 0.04 - slower and less aliasing in darks - // Special notes when using FXAA_GREEN_AS_LUMA, - // Likely want to set this to zero. - // As colors that are mostly not-green - // will appear very dark in the green channel! - // Tune by looking at mostly non-green content, - // then start at zero and increase until aliasing is a problem. - FxaaFloat fxaaConsoleEdgeThresholdMin, - // - // Extra constants for 360 FXAA Console only. - // Use zeros or anything else for other platforms. - // These must be in physical constant registers and NOT immedates. - // Immedates will result in compiler un-optimizing. - // {xyzw} = float4(1.0, -1.0, 0.25, -0.25) - FxaaFloat4 fxaaConsole360ConstDir -) { -/*--------------------------------------------------------------------------*/ - FxaaFloat2 posM; - posM.x = pos.x; - posM.y = pos.y; - #if (FXAA_GATHER4_ALPHA == 1) - #if (FXAA_DISCARD == 0) - FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); - #if (FXAA_GREEN_AS_LUMA == 0) - #define lumaM FxaaLuma(rgbyM) - #else - #define lumaM rgbyM.y - #endif - #endif - #if (FXAA_GREEN_AS_LUMA == 0) - FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM); - FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1)); - #else - FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM); - FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1)); - #endif - #if (FXAA_DISCARD == 1) - #define lumaM luma4A.w - #endif - #define lumaE luma4A.z - #define lumaS luma4A.x - #define lumaSE luma4A.y - #define lumaNW luma4B.w - #define lumaN luma4B.z - #define lumaW luma4B.x - #else - FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); - #if (FXAA_GREEN_AS_LUMA == 0) - #define lumaM FxaaLuma(rgbyM) - #else - #define lumaM rgbyM.y - #endif - FxaaFloat lumaS 
= FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy)); - #endif -/*--------------------------------------------------------------------------*/ - FxaaFloat maxSM = max(lumaS, lumaM); - FxaaFloat minSM = min(lumaS, lumaM); - FxaaFloat maxESM = max(lumaE, maxSM); - FxaaFloat minESM = min(lumaE, minSM); - FxaaFloat maxWN = max(lumaN, lumaW); - FxaaFloat minWN = min(lumaN, lumaW); - FxaaFloat rangeMax = max(maxWN, maxESM); - FxaaFloat rangeMin = min(minWN, minESM); - FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold; - FxaaFloat range = rangeMax - rangeMin; - FxaaFloat rangeMaxClamped = max(fxaaQualityEdgeThresholdMin, rangeMaxScaled); - FxaaBool earlyExit = range < rangeMaxClamped; -/*--------------------------------------------------------------------------*/ - if(earlyExit) - #if (FXAA_DISCARD == 1) - FxaaDiscard; - #else - return rgbyM; - #endif -/*--------------------------------------------------------------------------*/ - #if (FXAA_GATHER4_ALPHA == 0) - FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); - #else - FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy)); - FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); - #endif -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaNS = lumaN + lumaS; - 
FxaaFloat lumaWE = lumaW + lumaE; - FxaaFloat subpixRcpRange = 1.0/range; - FxaaFloat subpixNSWE = lumaNS + lumaWE; - FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS; - FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE; -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaNESE = lumaNE + lumaSE; - FxaaFloat lumaNWNE = lumaNW + lumaNE; - FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE; - FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE; -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaNWSW = lumaNW + lumaSW; - FxaaFloat lumaSWSE = lumaSW + lumaSE; - FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2); - FxaaFloat edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2); - FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW; - FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE; - FxaaFloat edgeHorz = abs(edgeHorz3) + edgeHorz4; - FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4; -/*--------------------------------------------------------------------------*/ - FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE; - FxaaFloat lengthSign = fxaaQualityRcpFrame.x; - FxaaBool horzSpan = edgeHorz >= edgeVert; - FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; -/*--------------------------------------------------------------------------*/ - if(!horzSpan) lumaN = lumaW; - if(!horzSpan) lumaS = lumaE; - if(horzSpan) lengthSign = fxaaQualityRcpFrame.y; - FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM; -/*--------------------------------------------------------------------------*/ - FxaaFloat gradientN = lumaN - lumaM; - FxaaFloat gradientS = lumaS - lumaM; - FxaaFloat lumaNN = lumaN + lumaM; - FxaaFloat lumaSS = lumaS + lumaM; - FxaaBool pairN = abs(gradientN) >= abs(gradientS); - FxaaFloat gradient = max(abs(gradientN), abs(gradientS)); - if(pairN) lengthSign = -lengthSign; - FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); 
-/*--------------------------------------------------------------------------*/ - FxaaFloat2 posB; - posB.x = posM.x; - posB.y = posM.y; - FxaaFloat2 offNP; - offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x; - offNP.y = ( horzSpan) ? 0.0 : fxaaQualityRcpFrame.y; - if(!horzSpan) posB.x += lengthSign * 0.5; - if( horzSpan) posB.y += lengthSign * 0.5; -/*--------------------------------------------------------------------------*/ - FxaaFloat2 posN; - posN.x = posB.x - offNP.x * FXAA_QUALITY__P0; - posN.y = posB.y - offNP.y * FXAA_QUALITY__P0; - FxaaFloat2 posP; - posP.x = posB.x + offNP.x * FXAA_QUALITY__P0; - posP.y = posB.y + offNP.y * FXAA_QUALITY__P0; - FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0; - FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN)); - FxaaFloat subpixE = subpixC * subpixC; - FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP)); -/*--------------------------------------------------------------------------*/ - if(!pairN) lumaNN = lumaSS; - FxaaFloat gradientScaled = gradient * 1.0/4.0; - FxaaFloat lumaMM = lumaM - lumaNN * 0.5; - FxaaFloat subpixF = subpixD * subpixE; - FxaaBool lumaMLTZero = lumaMM < 0.0; -/*--------------------------------------------------------------------------*/ - lumaEndN -= lumaNN * 0.5; - lumaEndP -= lumaNN * 0.5; - FxaaBool doneN = abs(lumaEndN) >= gradientScaled; - FxaaBool doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1; - FxaaBool doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1; -/*--------------------------------------------------------------------------*/ - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = 
abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 3) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 4) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 5) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = 
abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 6) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 7) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 8) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = 
abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 9) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 10) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 11) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP 
= abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 12) - if(doneNP) { - if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); - if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12; -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } 
-/*--------------------------------------------------------------------------*/ - FxaaFloat dstN = posM.x - posN.x; - FxaaFloat dstP = posP.x - posM.x; - if(!horzSpan) dstN = posM.y - posN.y; - if(!horzSpan) dstP = posP.y - posM.y; -/*--------------------------------------------------------------------------*/ - FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero; - FxaaFloat spanLength = (dstP + dstN); - FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; - FxaaFloat spanLengthRcp = 1.0/spanLength; -/*--------------------------------------------------------------------------*/ - FxaaBool directionN = dstN < dstP; - FxaaFloat dstMin = min(dstN, dstP); - FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP; - FxaaFloat subpixG = subpixF * subpixF; - FxaaFloat pixelOffset = (dstMin * (-spanLengthRcp)) + 0.5; - FxaaFloat subpixH = subpixG * fxaaQualitySubpix; -/*--------------------------------------------------------------------------*/ - FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0; - FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH); - if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; - if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; - #if ((FXAA_DISCARD == 1) || 1) - return FxaaTexTop(tex, posM); - #else - return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM); - #endif -} -/*==========================================================================*/ -#endif - - - - -/*============================================================================ - - FXAA3 CONSOLE - PC VERSION - ------------------------------------------------------------------------------- -Instead of using this on PC, I'd suggest just using FXAA Quality with - #define FXAA_QUALITY__PRESET 10 -Or - #define FXAA_QUALITY__PRESET 20 -Either are higher qualilty and almost as fast as this on modern PC GPUs. 
-============================================================================*/ -#if (FXAA_PC_CONSOLE == 1) -/*--------------------------------------------------------------------------*/ -FxaaFloat4 FxaaPixelShader( - // See FXAA Quality FxaaPixelShader() source for docs on Inputs! - FxaaFloat2 pos, - FxaaFloat4 fxaaConsolePosPos, - FxaaTex tex, - FxaaTex fxaaConsole360TexExpBiasNegOne, - FxaaTex fxaaConsole360TexExpBiasNegTwo, - FxaaFloat2 fxaaQualityRcpFrame, - FxaaFloat4 fxaaConsoleRcpFrameOpt, - FxaaFloat4 fxaaConsoleRcpFrameOpt2, - FxaaFloat4 fxaaConsole360RcpFrameOpt2, - FxaaFloat fxaaQualitySubpix, - FxaaFloat fxaaQualityEdgeThreshold, - FxaaFloat fxaaQualityEdgeThresholdMin, - FxaaFloat fxaaConsoleEdgeSharpness, - FxaaFloat fxaaConsoleEdgeThreshold, - FxaaFloat fxaaConsoleEdgeThresholdMin, - FxaaFloat4 fxaaConsole360ConstDir -) { -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaNw = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.xy)); - FxaaFloat lumaSw = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.xw)); - FxaaFloat lumaNe = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.zy)); - FxaaFloat lumaSe = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.zw)); -/*--------------------------------------------------------------------------*/ - FxaaFloat4 rgbyM = FxaaTexTop(tex, pos.xy); - #if (FXAA_GREEN_AS_LUMA == 0) - FxaaFloat lumaM = FxaaLuma(rgbyM); - #else - FxaaFloat lumaM = rgbyM.y; - #endif -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaMaxNwSw = max(lumaNw, lumaSw); - lumaNe += 1.0/384.0; - FxaaFloat lumaMinNwSw = min(lumaNw, lumaSw); -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaMaxNeSe = max(lumaNe, lumaSe); - FxaaFloat lumaMinNeSe = min(lumaNe, lumaSe); -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaMax = max(lumaMaxNeSe, lumaMaxNwSw); - FxaaFloat lumaMin = min(lumaMinNeSe, 
lumaMinNwSw); -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaMaxScaled = lumaMax * fxaaConsoleEdgeThreshold; -/*--------------------------------------------------------------------------*/ - FxaaFloat lumaMinM = min(lumaMin, lumaM); - FxaaFloat lumaMaxScaledClamped = max(fxaaConsoleEdgeThresholdMin, lumaMaxScaled); - FxaaFloat lumaMaxM = max(lumaMax, lumaM); - FxaaFloat dirSwMinusNe = lumaSw - lumaNe; - FxaaFloat lumaMaxSubMinM = lumaMaxM - lumaMinM; - FxaaFloat dirSeMinusNw = lumaSe - lumaNw; - if(lumaMaxSubMinM < lumaMaxScaledClamped) return rgbyM; -/*--------------------------------------------------------------------------*/ - FxaaFloat2 dir; - dir.x = dirSwMinusNe + dirSeMinusNw; - dir.y = dirSwMinusNe - dirSeMinusNw; -/*--------------------------------------------------------------------------*/ - FxaaFloat2 dir1 = normalize(dir.xy); - FxaaFloat4 rgbyN1 = FxaaTexTop(tex, pos.xy - dir1 * fxaaConsoleRcpFrameOpt.zw); - FxaaFloat4 rgbyP1 = FxaaTexTop(tex, pos.xy + dir1 * fxaaConsoleRcpFrameOpt.zw); -/*--------------------------------------------------------------------------*/ - FxaaFloat dirAbsMinTimesC = min(abs(dir1.x), abs(dir1.y)) * fxaaConsoleEdgeSharpness; - FxaaFloat2 dir2 = clamp(dir1.xy / dirAbsMinTimesC, -2.0, 2.0); -/*--------------------------------------------------------------------------*/ - FxaaFloat4 rgbyN2 = FxaaTexTop(tex, pos.xy - dir2 * fxaaConsoleRcpFrameOpt2.zw); - FxaaFloat4 rgbyP2 = FxaaTexTop(tex, pos.xy + dir2 * fxaaConsoleRcpFrameOpt2.zw); -/*--------------------------------------------------------------------------*/ - FxaaFloat4 rgbyA = rgbyN1 + rgbyP1; - FxaaFloat4 rgbyB = ((rgbyN2 + rgbyP2) * 0.25) + (rgbyA * 0.25); -/*--------------------------------------------------------------------------*/ - #if (FXAA_GREEN_AS_LUMA == 0) - FxaaBool twoTap = (FxaaLuma(rgbyB) < lumaMin) || (FxaaLuma(rgbyB) > lumaMax); - #else - FxaaBool twoTap = (rgbyB.y < lumaMin) || (rgbyB.y > lumaMax); - 
#endif - if(twoTap) rgbyB.xyz = rgbyA.xyz * 0.5; - return rgbyB; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - 360 PIXEL SHADER - ------------------------------------------------------------------------------- -This optimized version thanks to suggestions from Andy Luedke. -Should be fully tex bound in all cases. -As of the FXAA 3.11 release, I have still not tested this code, -however I fixed a bug which was in both FXAA 3.9 and FXAA 3.10. -And note this is replacing the old unoptimized version. -If it does not work, please let me know so I can fix it. -============================================================================*/ -#if (FXAA_360 == 1) -/*--------------------------------------------------------------------------*/ -[reduceTempRegUsage(4)] -float4 FxaaPixelShader( - // See FXAA Quality FxaaPixelShader() source for docs on Inputs! 
- FxaaFloat2 pos, - FxaaFloat4 fxaaConsolePosPos, - FxaaTex tex, - FxaaTex fxaaConsole360TexExpBiasNegOne, - FxaaTex fxaaConsole360TexExpBiasNegTwo, - FxaaFloat2 fxaaQualityRcpFrame, - FxaaFloat4 fxaaConsoleRcpFrameOpt, - FxaaFloat4 fxaaConsoleRcpFrameOpt2, - FxaaFloat4 fxaaConsole360RcpFrameOpt2, - FxaaFloat fxaaQualitySubpix, - FxaaFloat fxaaQualityEdgeThreshold, - FxaaFloat fxaaQualityEdgeThresholdMin, - FxaaFloat fxaaConsoleEdgeSharpness, - FxaaFloat fxaaConsoleEdgeThreshold, - FxaaFloat fxaaConsoleEdgeThresholdMin, - FxaaFloat4 fxaaConsole360ConstDir -) { -/*--------------------------------------------------------------------------*/ - float4 lumaNwNeSwSe; - #if (FXAA_GREEN_AS_LUMA == 0) - asm { - tfetch2D lumaNwNeSwSe.w___, tex, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe._w__, tex, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe.__w_, tex, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe.___w, tex, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false - }; - #else - asm { - tfetch2D lumaNwNeSwSe.y___, tex, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe._y__, tex, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe.__y_, tex, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe.___y, tex, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false - }; - #endif -/*--------------------------------------------------------------------------*/ - lumaNwNeSwSe.y += 1.0/384.0; - float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); - float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); - float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y); - float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y); -/*--------------------------------------------------------------------------*/ - float4 rgbyM = tex2Dlod(tex, float4(pos.xy, 0.0, 0.0)); - #if 
(FXAA_GREEN_AS_LUMA == 0) - float lumaMinM = min(lumaMin, rgbyM.w); - float lumaMaxM = max(lumaMax, rgbyM.w); - #else - float lumaMinM = min(lumaMin, rgbyM.y); - float lumaMaxM = max(lumaMax, rgbyM.y); - #endif - if((lumaMaxM - lumaMinM) < max(fxaaConsoleEdgeThresholdMin, lumaMax * fxaaConsoleEdgeThreshold)) return rgbyM; -/*--------------------------------------------------------------------------*/ - float2 dir; - dir.x = dot(lumaNwNeSwSe, fxaaConsole360ConstDir.yyxx); - dir.y = dot(lumaNwNeSwSe, fxaaConsole360ConstDir.xyxy); - dir = normalize(dir); -/*--------------------------------------------------------------------------*/ - float4 dir1 = dir.xyxy * fxaaConsoleRcpFrameOpt.xyzw; -/*--------------------------------------------------------------------------*/ - float4 dir2; - float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y)) * fxaaConsoleEdgeSharpness; - dir2 = saturate(fxaaConsole360ConstDir.zzww * dir.xyxy / dirAbsMinTimesC + 0.5); - dir2 = dir2 * fxaaConsole360RcpFrameOpt2.xyxy + fxaaConsole360RcpFrameOpt2.zwzw; -/*--------------------------------------------------------------------------*/ - float4 rgbyN1 = tex2Dlod(fxaaConsole360TexExpBiasNegOne, float4(pos.xy + dir1.xy, 0.0, 0.0)); - float4 rgbyP1 = tex2Dlod(fxaaConsole360TexExpBiasNegOne, float4(pos.xy + dir1.zw, 0.0, 0.0)); - float4 rgbyN2 = tex2Dlod(fxaaConsole360TexExpBiasNegTwo, float4(pos.xy + dir2.xy, 0.0, 0.0)); - float4 rgbyP2 = tex2Dlod(fxaaConsole360TexExpBiasNegTwo, float4(pos.xy + dir2.zw, 0.0, 0.0)); -/*--------------------------------------------------------------------------*/ - float4 rgbyA = rgbyN1 + rgbyP1; - float4 rgbyB = rgbyN2 + rgbyP2 * 0.5 + rgbyA; -/*--------------------------------------------------------------------------*/ - float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB; - rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? 
rgbyR : rgbyA; - return rgbyR; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT) - -============================================================================== -The code below does not exactly match the assembly. -I have a feeling that 12 cycles is possible, but was not able to get there. -Might have to increase register count to get full performance. -Note this shader does not use perspective interpolation. - -Use the following cgc options, - - --fenable-bx2 --fastmath --fastprecision --nofloatbindings - ------------------------------------------------------------------------------- - NVSHADERPERF OUTPUT ------------------------------------------------------------------------------- -For reference and to aid in debug, output of NVShaderPerf should match this, - -Shader to schedule: - 0: texpkb h0.w(TRUE), v5.zyxx, #0 - 2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x - 4: texpkb h0.w(TRUE), v5.xwxx, #0 - 6: addh h0.z(TRUE), -h2, h0.w - 7: texpkb h1.w(TRUE), v5, #0 - 9: addh h0.x(TRUE), h0.z, -h1.w - 10: addh h3.w(TRUE), h0.z, h1 - 11: texpkb h2.w(TRUE), v5.zwzz, #0 - 13: addh h0.z(TRUE), h3.w, -h2.w - 14: addh h0.x(TRUE), h2.w, h0 - 15: nrmh h1.xz(TRUE), h0_n - 16: minh_m8 h0.x(TRUE), |h1|, |h1.z| - 17: maxh h4.w(TRUE), h0, h1 - 18: divx h2.xy(TRUE), h1_n.xzzw, h0_n - 19: movr r1.zw(TRUE), v4.xxxy - 20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww - 22: minh h5.w(TRUE), h0, h1 - 23: texpkb h0(TRUE), r2.xzxx, #0 - 25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1 - 27: maxh h4.x(TRUE), h2.z, h2.w - 28: texpkb h1(TRUE), r0.zwzz, #0 - 30: addh_d2 h1(TRUE), h0, h1 - 31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 33: texpkb 
h0(TRUE), r0, #0 - 35: minh h4.z(TRUE), h2, h2.w - 36: fenct TRUE - 37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 39: texpkb h2(TRUE), r1, #0 - 41: addh_d2 h0(TRUE), h0, h2 - 42: maxh h2.w(TRUE), h4, h4.x - 43: minh h2.x(TRUE), h5.w, h4.z - 44: addh_d2 h0(TRUE), h0, h1 - 45: slth h2.x(TRUE), h0.w, h2 - 46: sgth h2.w(TRUE), h0, h2 - 47: movh h0(TRUE), h0 - 48: addx.c0 rc(TRUE), h2, h2.w - 49: movh h0(c0.NE.x), h1 - -IPU0 ------ Simplified schedule: -------- -Pass | Unit | uOp | PC: Op ------+--------+------+------------------------- - 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-; - | | | - 2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; - | TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; - | SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-; - | | | - 3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; - | TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; - | SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---; - | SCB1 | add | 10: ADDh h3.w, h0.---z, h1; - | | | - 4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | SCB0 | add | 14: ADDh h0.x, h2.w---, h0; - | SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-; - | | | - 5 | SCT1 | mov | 15: NRMh h1.xz, h0; - | SRB | nrm | 15: NRMh h1.xz, h0; - | SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|; - | SCB1 | max | 17: MAXh h4.w, h0, h1; - | | | - 6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0; - | SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy; - | SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-; - | SCB1 | min | 22: MINh h5.w, h0, h1; - | | | - 7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; - | TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; - | SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---; - | SCB1 | mad | 25: MADr r0.zw, h1.--xz, 
const, r1; - | | | - 8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; - | TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; - | SCB0/1 | add | 30: ADDh/2 h1, h0, h1; - | | | - 9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--; - | SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0; - | TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0; - | SCB1 | min | 35: MINh h4.z, h2, h2.--w-; - | | | - 10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--; - | SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0; - | TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0; - | SCB0/1 | add | 41: ADDh/2 h0, h0, h2; - | | | - 11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---; - | SCT1 | max | 42: MAXh h2.w, h4, h4.---x; - | SCB0/1 | add | 44: ADDh/2 h0, h0, h1; - | | | - 12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2; - | SCT1 | set | 46: SGTh h2.w, h0, h2; - | SCB0/1 | mul | 47: MOVh h0, h0; - | | | - 13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---; - | SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1; - -Pass SCT TEX SCB - 1: 0% 100% 25% - 2: 0% 100% 25% - 3: 0% 100% 50% - 4: 0% 100% 50% - 5: 0% 0% 50% - 6: 100% 0% 75% - 7: 0% 100% 75% - 8: 0% 100% 100% - 9: 0% 100% 25% - 10: 0% 100% 100% - 11: 50% 0% 100% - 12: 50% 0% 100% - 13: 25% 0% 100% - -MEAN: 17% 61% 67% - -Pass SCT0 SCT1 TEX SCB0 SCB1 - 1: 0% 0% 100% 0% 100% - 2: 0% 0% 100% 0% 100% - 3: 0% 0% 100% 100% 100% - 4: 0% 0% 100% 100% 100% - 5: 0% 0% 0% 100% 100% - 6: 100% 100% 0% 100% 100% - 7: 0% 0% 100% 100% 100% - 8: 0% 0% 100% 100% 100% - 9: 0% 0% 100% 0% 100% - 10: 0% 0% 100% 100% 100% - 11: 100% 100% 0% 100% 100% - 12: 100% 100% 0% 100% 100% - 13: 100% 0% 0% 100% 100% - -MEAN: 30% 23% 61% 76% 100% -Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 -Results 13 cycles, 3 r regs, 923,076,923 pixels/s -============================================================================*/ -#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0) -/*--------------------------------------------------------------------------*/ -#pragma 
regcount 7 -#pragma disablepc all -#pragma option O3 -#pragma option OutColorPrec=fp16 -#pragma texformat default RGBA8 -/*==========================================================================*/ -half4 FxaaPixelShader( - // See FXAA Quality FxaaPixelShader() source for docs on Inputs! - FxaaFloat2 pos, - FxaaFloat4 fxaaConsolePosPos, - FxaaTex tex, - FxaaTex fxaaConsole360TexExpBiasNegOne, - FxaaTex fxaaConsole360TexExpBiasNegTwo, - FxaaFloat2 fxaaQualityRcpFrame, - FxaaFloat4 fxaaConsoleRcpFrameOpt, - FxaaFloat4 fxaaConsoleRcpFrameOpt2, - FxaaFloat4 fxaaConsole360RcpFrameOpt2, - FxaaFloat fxaaQualitySubpix, - FxaaFloat fxaaQualityEdgeThreshold, - FxaaFloat fxaaQualityEdgeThresholdMin, - FxaaFloat fxaaConsoleEdgeSharpness, - FxaaFloat fxaaConsoleEdgeThreshold, - FxaaFloat fxaaConsoleEdgeThresholdMin, - FxaaFloat4 fxaaConsole360ConstDir -) { -/*--------------------------------------------------------------------------*/ -// (1) - half4 dir; - half4 lumaNe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zy, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - lumaNe.w += half(1.0/512.0); - dir.x = -lumaNe.w; - dir.z = -lumaNe.w; - #else - lumaNe.y += half(1.0/512.0); - dir.x = -lumaNe.y; - dir.z = -lumaNe.y; - #endif -/*--------------------------------------------------------------------------*/ -// (2) - half4 lumaSw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xw, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - dir.x += lumaSw.w; - dir.z += lumaSw.w; - #else - dir.x += lumaSw.y; - dir.z += lumaSw.y; - #endif -/*--------------------------------------------------------------------------*/ -// (3) - half4 lumaNw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xy, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - dir.x -= lumaNw.w; - dir.z += lumaNw.w; - #else - dir.x -= lumaNw.y; - dir.z += lumaNw.y; - #endif -/*--------------------------------------------------------------------------*/ -// (4) - half4 lumaSe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zw, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - dir.x 
+= lumaSe.w; - dir.z -= lumaSe.w; - #else - dir.x += lumaSe.y; - dir.z -= lumaSe.y; - #endif -/*--------------------------------------------------------------------------*/ -// (5) - half4 dir1_pos; - dir1_pos.xy = normalize(dir.xyz).xz; - half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__PS3_EDGE_SHARPNESS); -/*--------------------------------------------------------------------------*/ -// (6) - half4 dir2_pos; - dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0)); - dir1_pos.zw = pos.xy; - dir2_pos.zw = pos.xy; - half4 temp1N; - temp1N.xy = dir1_pos.zw - dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw; -/*--------------------------------------------------------------------------*/ -// (7) - temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0)); - half4 rgby1; - rgby1.xy = dir1_pos.zw + dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw; -/*--------------------------------------------------------------------------*/ -// (8) - rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0)); - rgby1 = (temp1N + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (9) - half4 temp2N; - temp2N.xy = dir2_pos.zw - dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw; - temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0)); -/*--------------------------------------------------------------------------*/ -// (10) - half4 rgby2; - rgby2.xy = dir2_pos.zw + dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw; - rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0)); - rgby2 = (temp2N + rgby2) * 0.5; -/*--------------------------------------------------------------------------*/ -// (11) - // compilier moves these scalar ops up to other cycles - #if (FXAA_GREEN_AS_LUMA == 0) - half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); - half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); - #else - half lumaMin = min(min(lumaNw.y, lumaSw.y), min(lumaNe.y, lumaSe.y)); - half lumaMax = max(max(lumaNw.y, 
lumaSw.y), max(lumaNe.y, lumaSe.y)); - #endif - rgby2 = (rgby2 + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (12) - #if (FXAA_GREEN_AS_LUMA == 0) - bool twoTapLt = rgby2.w < lumaMin; - bool twoTapGt = rgby2.w > lumaMax; - #else - bool twoTapLt = rgby2.y < lumaMin; - bool twoTapGt = rgby2.y > lumaMax; - #endif -/*--------------------------------------------------------------------------*/ -// (13) - if(twoTapLt || twoTapGt) rgby2 = rgby1; -/*--------------------------------------------------------------------------*/ - return rgby2; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT) - -============================================================================== -The code mostly matches the assembly. -I have a feeling that 14 cycles is possible, but was not able to get there. -Might have to increase register count to get full performance. -Note this shader does not use perspective interpolation. - -Use the following cgc options, - - --fenable-bx2 --fastmath --fastprecision --nofloatbindings - -Use of FXAA_GREEN_AS_LUMA currently adds a cycle (16 clks). -Will look at fixing this for FXAA 3.12. 
------------------------------------------------------------------------------- - NVSHADERPERF OUTPUT ------------------------------------------------------------------------------- -For reference and to aid in debug, output of NVShaderPerf should match this, - -Shader to schedule: - 0: texpkb h0.w(TRUE), v5.zyxx, #0 - 2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x - 4: texpkb h1.w(TRUE), v5.xwxx, #0 - 6: addh h0.x(TRUE), h1.w, -h2.y - 7: texpkb h2.w(TRUE), v5.zwzz, #0 - 9: minh h4.w(TRUE), h2.y, h2 - 10: maxh h5.x(TRUE), h2.y, h2.w - 11: texpkb h0.w(TRUE), v5, #0 - 13: addh h3.w(TRUE), -h0, h0.x - 14: addh h0.x(TRUE), h0.w, h0 - 15: addh h0.z(TRUE), -h2.w, h0.x - 16: addh h0.x(TRUE), h2.w, h3.w - 17: minh h5.y(TRUE), h0.w, h1.w - 18: nrmh h2.xz(TRUE), h0_n - 19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z| - 20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w - 21: movr r1.zw(TRUE), v4.xxxy - 22: maxh h2.w(TRUE), h0, h1 - 23: fenct TRUE - 24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz - 26: texpkb h0(TRUE), r0, #0 - 28: maxh h5.x(TRUE), h2.w, h5 - 29: minh h5.w(TRUE), h5.y, h4 - 30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz - 32: texpkb h2(TRUE), r1, #0 - 34: addh_d2 h2(TRUE), h0, h2 - 35: texpkb h1(TRUE), v4, #0 - 37: maxh h5.y(TRUE), h5.x, h1.w - 38: minh h4.w(TRUE), h1, h5 - 39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 41: texpkb h0(TRUE), r0, #0 - 43: addh_m8 h5.z(TRUE), h5.y, -h4.w - 44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 46: texpkb h3(TRUE), r2, #0 - 48: addh_d2 h0(TRUE), h0, h3 - 49: addh_d2 h3(TRUE), h0, h2 - 50: movh h0(TRUE), h3 - 51: slth h3.x(TRUE), h3.w, h5.w - 52: sgth h3.w(TRUE), h3, h5.x - 53: addx.c0 rc(TRUE), h3.x, h3 - 54: slth.c0 rc(TRUE), h5.z, h5 - 55: movh h0(c0.NE.w), h2 - 56: movh h0(c0.NE.x), h1 - -IPU0 ------ Simplified 
schedule: -------- -Pass | Unit | uOp | PC: Op ------+--------+------+------------------------- - 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--; - | | | - 2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; - | TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; - | SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---; - | | | - 3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---; - | SCB1 | min | 9: MINh h4.w, h2.---y, h2; - | | | - 4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; - | TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; - | SCB0 | add | 14: ADDh h0.x, h0.w---, h0; - | SCB1 | add | 13: ADDh h3.w,-h0, h0.---x; - | | | - 5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---; - | SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-; - | SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--; - | | | - 6 | SCT1 | mov | 18: NRMh h2.xz, h0; - | SRB | nrm | 18: NRMh h2.xz, h0; - | SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|; - | | | - 7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--; - | SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy; - | SCB1 | max | 22: MAXh h2.w, h0, h1; - | | | - 8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--; - | SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0; - | TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0; - | SCB0 | max | 28: MAXh h5.x, h2.w---, h5; - | SCB1 | min | 29: MINh h5.w, h5.---y, h4; - | | | - 9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--; - | SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0; - | TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0; - | SCB0/1 | add | 34: ADDh/2 h2, h0, h2; - | | | - 10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; - | TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; - | SCB0 | max | 37: MAXh 
h5.y, h5.-x--, h1.-w--; - | SCB1 | min | 38: MINh h4.w, h1, h5; - | | | - 11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--; - | SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0; - | TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0; - | SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--; - | SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-; - | | | - 12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0; - | TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0; - | SCB0/1 | add | 48: ADDh/2 h0, h0, h3; - | | | - 13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2; - | SCB0/1 | mul | 50: MOVh h0, h3; - | | | - 14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---; - | SCT1 | set | 52: SGTh h3.w, h3, h5.---x; - | SCB0 | set | 54: SLThc0 rc, h5.z---, h5; - | SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3; - | | | - 15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2; - | SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1; - -Pass SCT TEX SCB - 1: 0% 100% 25% - 2: 0% 100% 25% - 3: 0% 100% 50% - 4: 0% 100% 50% - 5: 50% 0% 25% - 6: 0% 0% 25% - 7: 100% 0% 25% - 8: 0% 100% 50% - 9: 0% 100% 100% - 10: 0% 100% 50% - 11: 0% 100% 75% - 12: 0% 100% 100% - 13: 100% 0% 100% - 14: 50% 0% 50% - 15: 100% 0% 100% - -MEAN: 26% 60% 56% - -Pass SCT0 SCT1 TEX SCB0 SCB1 - 1: 0% 0% 100% 100% 0% - 2: 0% 0% 100% 100% 0% - 3: 0% 0% 100% 100% 100% - 4: 0% 0% 100% 100% 100% - 5: 100% 100% 0% 100% 0% - 6: 0% 0% 0% 0% 100% - 7: 100% 100% 0% 0% 100% - 8: 0% 0% 100% 100% 100% - 9: 0% 0% 100% 100% 100% - 10: 0% 0% 100% 100% 100% - 11: 0% 0% 100% 100% 100% - 12: 0% 0% 100% 100% 100% - 13: 100% 100% 0% 100% 100% - 14: 100% 100% 0% 100% 100% - 15: 100% 100% 0% 100% 100% - -MEAN: 33% 33% 60% 86% 80% -Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 -Results 15 cycles, 3 r regs, 800,000,000 pixels/s -============================================================================*/ -#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1) -/*--------------------------------------------------------------------------*/ -#pragma regcount 7 
-#pragma disablepc all -#pragma option O2 -#pragma option OutColorPrec=fp16 -#pragma texformat default RGBA8 -/*==========================================================================*/ -half4 FxaaPixelShader( - // See FXAA Quality FxaaPixelShader() source for docs on Inputs! - FxaaFloat2 pos, - FxaaFloat4 fxaaConsolePosPos, - FxaaTex tex, - FxaaTex fxaaConsole360TexExpBiasNegOne, - FxaaTex fxaaConsole360TexExpBiasNegTwo, - FxaaFloat2 fxaaQualityRcpFrame, - FxaaFloat4 fxaaConsoleRcpFrameOpt, - FxaaFloat4 fxaaConsoleRcpFrameOpt2, - FxaaFloat4 fxaaConsole360RcpFrameOpt2, - FxaaFloat fxaaQualitySubpix, - FxaaFloat fxaaQualityEdgeThreshold, - FxaaFloat fxaaQualityEdgeThresholdMin, - FxaaFloat fxaaConsoleEdgeSharpness, - FxaaFloat fxaaConsoleEdgeThreshold, - FxaaFloat fxaaConsoleEdgeThresholdMin, - FxaaFloat4 fxaaConsole360ConstDir -) { -/*--------------------------------------------------------------------------*/ -// (1) - half4 rgbyNe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zy, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - half lumaNe = rgbyNe.w + half(1.0/512.0); - #else - half lumaNe = rgbyNe.y + half(1.0/512.0); - #endif -/*--------------------------------------------------------------------------*/ -// (2) - half4 lumaSw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xw, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - half lumaSwNegNe = lumaSw.w - lumaNe; - #else - half lumaSwNegNe = lumaSw.y - lumaNe; - #endif -/*--------------------------------------------------------------------------*/ -// (3) - half4 lumaNw = h4tex2Dlod(tex, half4(fxaaConsolePosPos.xy, 0, 0)); - #if (FXAA_GREEN_AS_LUMA == 0) - half lumaMaxNwSw = max(lumaNw.w, lumaSw.w); - half lumaMinNwSw = min(lumaNw.w, lumaSw.w); - #else - half lumaMaxNwSw = max(lumaNw.y, lumaSw.y); - half lumaMinNwSw = min(lumaNw.y, lumaSw.y); - #endif -/*--------------------------------------------------------------------------*/ -// (4) - half4 lumaSe = h4tex2Dlod(tex, half4(fxaaConsolePosPos.zw, 0, 0)); - #if (FXAA_GREEN_AS_LUMA 
== 0) - half dirZ = lumaNw.w + lumaSwNegNe; - half dirX = -lumaNw.w + lumaSwNegNe; - #else - half dirZ = lumaNw.y + lumaSwNegNe; - half dirX = -lumaNw.y + lumaSwNegNe; - #endif -/*--------------------------------------------------------------------------*/ -// (5) - half3 dir; - dir.y = 0.0; - #if (FXAA_GREEN_AS_LUMA == 0) - dir.x = lumaSe.w + dirX; - dir.z = -lumaSe.w + dirZ; - half lumaMinNeSe = min(lumaNe, lumaSe.w); - #else - dir.x = lumaSe.y + dirX; - dir.z = -lumaSe.y + dirZ; - half lumaMinNeSe = min(lumaNe, lumaSe.y); - #endif -/*--------------------------------------------------------------------------*/ -// (6) - half4 dir1_pos; - dir1_pos.xy = normalize(dir).xz; - half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__PS3_EDGE_SHARPNESS); -/*--------------------------------------------------------------------------*/ -// (7) - half4 dir2_pos; - dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0)); - dir1_pos.zw = pos.xy; - dir2_pos.zw = pos.xy; - #if (FXAA_GREEN_AS_LUMA == 0) - half lumaMaxNeSe = max(lumaNe, lumaSe.w); - #else - half lumaMaxNeSe = max(lumaNe, lumaSe.y); - #endif -/*--------------------------------------------------------------------------*/ -// (8) - half4 temp1N; - temp1N.xy = dir1_pos.zw - dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw; - temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0)); - half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe); - half lumaMin = min(lumaMinNwSw, lumaMinNeSe); -/*--------------------------------------------------------------------------*/ -// (9) - half4 rgby1; - rgby1.xy = dir1_pos.zw + dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw; - rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0)); - rgby1 = (temp1N + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (10) - half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0)); - #if (FXAA_GREEN_AS_LUMA == 0) - half lumaMaxM = max(lumaMax, rgbyM.w); - half lumaMinM = min(lumaMin, rgbyM.w); 
- #else - half lumaMaxM = max(lumaMax, rgbyM.y); - half lumaMinM = min(lumaMin, rgbyM.y); - #endif -/*--------------------------------------------------------------------------*/ -// (11) - half4 temp2N; - temp2N.xy = dir2_pos.zw - dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw; - temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0)); - half4 rgby2; - rgby2.xy = dir2_pos.zw + dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw; - half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__PS3_EDGE_THRESHOLD; -/*--------------------------------------------------------------------------*/ -// (12) - rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0)); - rgby2 = (temp2N + rgby2) * 0.5; -/*--------------------------------------------------------------------------*/ -// (13) - rgby2 = (rgby2 + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (14) - #if (FXAA_GREEN_AS_LUMA == 0) - bool twoTapLt = rgby2.w < lumaMin; - bool twoTapGt = rgby2.w > lumaMax; - #else - bool twoTapLt = rgby2.y < lumaMin; - bool twoTapGt = rgby2.y > lumaMax; - #endif - bool earlyExit = lumaRangeM < lumaMax; - bool twoTap = twoTapLt || twoTapGt; -/*--------------------------------------------------------------------------*/ -// (15) - if(twoTap) rgby2 = rgby1; - if(earlyExit) rgby2 = rgbyM; -/*--------------------------------------------------------------------------*/ - return rgby2; } -/*==========================================================================*/ -#endif \ No newline at end of file diff --git a/shader/antialiasing/cmaa2/cmaa2_common.glsl b/shader/antialiasing/cmaa2/cmaa2_common.glsl new file mode 100644 index 000000000..2a72ef579 --- /dev/null +++ b/shader/antialiasing/cmaa2/cmaa2_common.glsl @@ -0,0 +1,116 @@ +#ifndef CMAA2_COMMON_GLSL +#define CMAA2_COMMON_GLSL + +#extension GL_EXT_samplerless_texture_functions : enable + +#include "common.glsl" + +#define CMAA2_PROCESS_CANDIDATES_NUM_THREADS 128 +#define CMAA2_SUPPORT_HDR_COLOR_RANGE 1 +#define 
CMAA2_EDGE_DETECTION_LUMA_PATH 0 // We should use HDR luma from a separate buffer in the future
+#define CMAA2_EXTRA_SHARPNESS 0 // set to 1 to select the first (sharper) set of tuning constants below
+
+struct DispatchIndirectCommand { // x/y/z triple; presumably the compute indirect-dispatch argument layout - confirm against host code
+  uint x;
+  uint y;
+  uint z;
+  };
+
+struct DrawIndirectCommand { // presumably the indirect-draw argument layout - confirm against host code
+  uint vertexCount;
+  uint instanceCount;
+  uint firstVertex;
+  uint firstInstance;
+  };
+
+const float symmetryCorrectionOffset = 0.22;
+
+#if CMAA2_EXTRA_SHARPNESS
+const float dampeningEffect = 0.11;
+const float cmaa2LocalContrastAdaptationAmount = 0.15f; // terminating ';' was missing: this branch did not compile when enabled
+const float cmaa2SimpleShapeBlurinessAmount = 0.07f;    // terminating ';' was missing: this branch did not compile when enabled
+#else
+const float dampeningEffect = 0.15;
+const float cmaa2LocalContrastAdaptationAmount = 0.10f;
+const float cmaa2SimpleShapeBlurinessAmount = 0.10f;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// VARIOUS QUALITY SETTINGS
+//
+// Longest line search distance; must be even number; for high perf low quality start from ~32 - the bigger the number,
+// the nicer the gradients but more costly. Max supported is 128!
+const uint maxLineLength = 86;
+//
+
+layout(binding = 0) uniform texture2D sceneTonemapped; // NOTE(review): host code binds `sceneLinear` to slot 0, despite the name
+layout(binding = 1, r8) uniform image2D workingEdges; // one texel holds the 4-bit edge masks of two horizontally adjacent pixels (see storeEdge)
+
+layout(binding = 2) buffer UboWorkingShapeCandidates {
+  uint shapeCandidates[];
+  };
+
+layout(binding = 3) buffer UboWorkingDeferredBlendLocationList {
+  uint deferredBlendLocationList[]; // packed quad position: x in the high 16 bits, y in the low 16 bits (as decoded by the vertex shader)
+  };
+
+layout(binding = 4) buffer UboWorkingDeferredBlendItemList {
+  uvec2 deferredBlendItemList[]; // linked-list node: .x = next link (with header bits), .y = packed color
+  };
+
+layout(binding = 5, r32ui) uniform uimage2D deferredBlendItemListHeads; // per-quad head link of the blend item list
+
+layout(binding = 6) buffer UboWorkingControlBuffer {
+  uint iterator;
+  uint shapeCandidateCount;
+  uint blendColorSamplesCount;
+  uint blendLocationCount;
+  uint subsequentPassWorkloadSize;
+  } controlBuffer;
+
+
+vec4 unpackEdgesFlt(uint value) { // expands bits 0..3 of `value` into 0.0/1.0 flags in x, y, z, w
+  vec4 ret;
+  ret.x = float((value & 0x01) != 0);
+  ret.y = float((value & 0x02) != 0);
+  ret.z = float((value & 0x04) != 0);
+  ret.w = float((value & 0x08) != 0);
+  return ret;
+  }
+
+vec3 loadSourceColor(ivec2 pixelPos, ivec2 offset) { // rgb of mip 0 of the source image at pixelPos + offset
+  vec3 color = texelFetch(sceneTonemapped, pixelPos + offset, 0).xyz;
+  return color;
+  }
+
+void storeEdge(uvec2 pos, uvec4 outEdges) { // writes a 2x2 quad of edge masks: [0],[1] to row pos.y, [2],[3] to row pos.y+1
+  imageStore(workingEdges, ivec2(pos.x / 2, pos.y + 0), vec4(((outEdges[1] << 4) | outEdges[0]) / 255.0)); // low nibble = left pixel, high nibble = right pixel
+  imageStore(workingEdges, ivec2(pos.x / 2, pos.y + 1), vec4(((outEdges[3] << 4) | outEdges[2]) / 255.0)); // value is scaled by 1/255 to fit the r8 unorm texel
+  }
+
+uint loadEdge(ivec2 pixelPos, ivec2 offset) { // inverse of storeEdge: returns the 4-bit edge mask of pixel pixelPos + offset
+  uint a = uint((pixelPos.x + offset.x) % 2); // NOTE(review): GLSL leaves '%' undefined for negative operands - confirm x + offset.x is never < 0
+  uint edge = uint(imageLoad(workingEdges, ivec2((pixelPos.x + offset.x) / 2, pixelPos.y + offset.y)).x * 255. + 0.5); // +0.5 rounds the unorm value back to the stored integer
+  edge = (edge >> (a * 4)) & 0xF; // odd x selects the high nibble, even x the low nibble
+  return edge;
+  }
+
+vec3 internalUnpackColor(uint packedColor) { // decodes a color written by internalPackColor
+#if CMAA2_SUPPORT_HDR_COLOR_RANGE
+  return unpackR11G11B10F(packedColor); // helper is not defined in this file; presumably provided by common.glsl
+#else
+  return unpackUnorm4x8(packedColor).rgb;
+  // return unpackR11G11B10E4F(packedColor);
+#endif
+  }
+
+uint internalPackColor(vec3 color) { // encodes a color into 32 bits; format follows CMAA2_SUPPORT_HDR_COLOR_RANGE
+#if CMAA2_SUPPORT_HDR_COLOR_RANGE
+  return packR11G11B10F(color); // helper is not defined in this file; presumably provided by common.glsl
+#else
+  return packUnorm4x8(vec4(color,0));
+  // return packR11G11B10E4F(color);
+#endif
+  }
+
+#endif
diff --git a/shader/antialiasing/cmaa2/deferred_color_apply_2x2.frag b/shader/antialiasing/cmaa2/deferred_color_apply_2x2.frag
new file mode 100644
index 000000000..dce10d360
--- /dev/null
+++ b/shader/antialiasing/cmaa2/deferred_color_apply_2x2.frag
@@ -0,0 +1,68 @@
+#version 460
+
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_GOOGLE_include_directive : enable
+#extension GL_EXT_control_flow_attributes : enable
+
+#include "common.glsl"
+#include "cmaa2_common.glsl"
+#include "lighting/tonemapping.glsl"
+
+layout(push_constant, std140) uniform PushConstant {
+  float brightness;
+  float contrast;
+  float gamma;
+  float mulExposure;
+  } push;
+
+layout(location = 0) out vec4 result;
+
+layout(location = 0) in flat uint currentQuadOffsetXY; // index 0..3 of this pixel inside its 2x2 quad (set by the vertex shader)
+layout(location = 1) in flat uint inCounterIndexWithHeader; // head link of the quad's blend item list
+
+vec3 tonemapping(vec3 color) { // exposure multiply -> brightness offset (clamped at 0) -> contrast scale -> ACES curve -> pow(gamma)
+  // float exposure = scene.exposure;
+  float brightness = push.brightness;
+  float contrast = push.contrast;
+  float gamma = push.gamma;
+
+  color *= push.mulExposure;
+
+  // Brightness & Contrast
+  color = max(vec3(0), color + vec3(brightness));
+  color = color * vec3(contrast);
+
+  // Tonemapping
+  color = acesTonemap(color);
+
+  // Gamma
+  //color = srgbEncode(color);
+  color = pow(color, vec3(gamma));
+  return color;
+  }
+
+void main() { // averages all blend items queued for this pixel, tonemaps the result; discards when there are none
+  uint counterIndexWithHeader = inCounterIndexWithHeader;
+
+  vec4 outColors = vec4(0); // rgb = weighted color sum, a = weight sum
+  const uint maxLoops = 32u; // hard cap on list length walked per pixel
+  for(uint i = 0; (counterIndexWithHeader != 0xFFFFFFFF) && (i < maxLoops); i++) { // 0xFFFFFFFF terminates the list
+    uint offsetXY = (counterIndexWithHeader >> 30) & 0x03; // bits 30..31: which pixel of the quad the item belongs to
+    bool isComplexShape = bool((counterIndexWithHeader >> 26) & 0x01); // bit 26: complex-shape flag
+    uvec2 val = deferredBlendItemList[counterIndexWithHeader & ((1u << 26) - 1)]; // low 26 bits: item index
+
+    counterIndexWithHeader = val.x; // follow the link to the next item
+
+    if(offsetXY == currentQuadOffsetXY) {
+      vec3 color = internalUnpackColor(val.y);
+      float weight = 0.8 + 1.0 * float(isComplexShape); // 0.8 for simple shapes, 1.8 for complex ones
+      outColors += vec4(color * weight, weight);
+      }
+    }
+
+  if(outColors.a == 0)
+    discard;
+
+  // result = vec4(outColors.rgb/outColors.a, 1);
+  result = vec4(tonemapping(outColors.rgb/outColors.a), 1);
+  }
diff --git a/shader/antialiasing/cmaa2/deferred_color_apply_2x2.vert b/shader/antialiasing/cmaa2/deferred_color_apply_2x2.vert
new file mode 100644
index 000000000..566b7a80b
--- /dev/null
+++ b/shader/antialiasing/cmaa2/deferred_color_apply_2x2.vert
@@ -0,0 +1,30 @@
+#version 460
+
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_GOOGLE_include_directive : enable
+
+#include "common.glsl"
+#include "cmaa2_common.glsl"
+
+out gl_PerVertex {
+  vec4 gl_Position;
+  float gl_PointSize;
+  };
+
+layout(location = 0) out flat uint currentQuadOffsetXY;
+layout(location = 1) out flat uint counterIndexWithHeader;
+
+void main() { // emits one vertex per pixel of a 2x2 quad: four consecutive vertex indices per blend location
+  const uint currentCandidate = gl_VertexIndex/4; // index into deferredBlendLocationList
+  currentQuadOffsetXY = gl_VertexIndex%4; // which of the four quad pixels this vertex covers
+
+  const ivec2 viewportSize = textureSize(sceneTonemapped, 0);
+  const uint pixelID = deferredBlendLocationList[currentCandidate];
+  const ivec2 quadPos = ivec2((pixelID >> 16), pixelID & 0xFFFF); // x from the high 16 bits, y from the low 16 bits
+  const ivec2 qeOffsets[4] = ivec2[4](ivec2(0, 0), ivec2(1, 0), ivec2(0, 1), ivec2(1, 1));
+  const uvec2 pixelPos = quadPos * 2 + qeOffsets[currentQuadOffsetXY]; // quad coordinates -> full-resolution pixel coordinates
+
+  counterIndexWithHeader = imageLoad(deferredBlendItemListHeads, quadPos).r; // list head, walked by the fragment shader
+  gl_Position = vec4((vec2(pixelPos+0.5)/vec2(viewportSize))*2.0-1.0, 0, 1); // pixel center mapped to [-1, 1] clip space
+  gl_PointSize = 1;
+  }
diff --git a/shader/antialiasing/cmaa2/edge_color2x2.comp b/shader/antialiasing/cmaa2/edge_color2x2.comp
new file mode
100644 index 000000000..a48c605d4 --- /dev/null +++ b/shader/antialiasing/cmaa2/edge_color2x2.comp @@ -0,0 +1,206 @@ +#version 460 + +#extension GL_ARB_separate_shader_objects : enable +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_control_flow_attributes : enable + +#include "cmaa2_common.glsl" + +layout(binding = 7, std430) buffer UboWorkingExecuteIndirectBuffer { + DispatchIndirectCommand candidateCmd; + DrawIndirectCommand applyCmd; + }; + +#if CMAA2_EDGE_DETECTION_LUMA_PATH == 2 +layout(binding = 8) uniform sampler2D inputLumaReadonly; +#endif + +layout(local_size_x = 16, local_size_y = 8) in; + +const uint CMAA2_CS_OUTPUT_KERNEL_SIZE_X = (gl_WorkGroupSize.x-2); +const uint CMAA2_CS_OUTPUT_KERNEL_SIZE_Y = (gl_WorkGroupSize.y-2); + +shared vec4 shared2x2FracEdgesH[gl_WorkGroupSize.x][gl_WorkGroupSize.y]; +shared vec4 shared2x2FracEdgesV[gl_WorkGroupSize.x][gl_WorkGroupSize.y]; + +// presets (for HDR color buffer maybe use higher values) +#if CMAA2_STATIC_QUALITY_PRESET == 0 +const float cmaa2EdgeThreshold = 0.10f; +#elif CMAA2_STATIC_QUALITY_PRESET == 1 +const float cmaa2EdgeThreshold = 0.05f; +#else +#error CMAA2_STATIC_QUALITY_PRESET not set? 
+#endif + +float edgeDetectColorCalcDiff(vec3 colorA, vec3 colorB) { + const vec3 LumWeights = vec3(0.299, 0.587, 0.114); + vec3 diff = abs((colorA.rgb - colorB.rgb)); + return dot(diff.rgb, LumWeights.rgb); + } + +float rgbToLumaForEdges(vec3 linearRGB) { + float luma = dot(sqrt(linearRGB.rgb), vec3(0.299, 0.587, 0.114)); + return luma; + } + +vec2 computeEdgeLuma(int x, int y, float pixelLumas[3 * 3 - 1]) { + vec2 temp; + temp.x = abs(pixelLumas[x + y * 3] - pixelLumas[x + 1 + y * 3]); + temp.y = abs(pixelLumas[x + y * 3] - pixelLumas[x + (y + 1) * 3]); + return temp; + } + +void sharedLoadQuadHV(ivec2 at, out vec2 e00, out vec2 e10, out vec2 e01, out vec2 e11) { + vec4 valH = shared2x2FracEdgesH[at.x][at.y]; + e00.y = valH.x; + e10.y = valH.y; + e01.y = valH.z; + e11.y = valH.w; + vec4 valV = shared2x2FracEdgesV[at.x][at.y]; + e00.x = valV.x; + e10.x = valV.y; + e01.x = valV.z; + e11.x = valV.w; + } + +float computeLocalContrastV(int x, int y, in vec2 neighbourhood[4][4]) { + return max(max(neighbourhood[x + 1][y + 0].y, neighbourhood[x + 1][y + 1].y), max(neighbourhood[x + 2][y + 0].y, neighbourhood[x + 2][y + 1].y)) * cmaa2LocalContrastAdaptationAmount; + } + +float computeLocalContrastH(int x, int y, in vec2 neighbourhood[4][4]) { + return max(max(neighbourhood[x + 0][y + 1].x, neighbourhood[x + 1][y + 1].x), max(neighbourhood[x + 0][y + 2].x, neighbourhood[x + 1][y + 2].x)) * cmaa2LocalContrastAdaptationAmount; + } + +uint packEdges(vec4 edges) { + return uint(dot(edges, vec4(1, 2, 4, 8))); + } + +void main() { + // screen position in the input (expanded) kernel (shifted one 2x2 block up/left) + uvec2 pixelPos = gl_WorkGroupID.xy * uvec2(CMAA2_CS_OUTPUT_KERNEL_SIZE_X, CMAA2_CS_OUTPUT_KERNEL_SIZE_Y) + gl_LocalInvocationID.xy - uvec2(1, 1); + pixelPos *= uvec2(2, 2); + + const uvec2 qeOffsets[4] = uvec2[4](uvec2(0, 0), uvec2(1, 0), uvec2(0, 1), uvec2(1, 1)); + const ivec2 center2x2 = ivec2(gl_LocalInvocationID.xy); + const bool inOutputKernel = 
!(any(equal(gl_LocalInvocationID.xy, uvec2(gl_WorkGroupSize.x - 1, 0))) || + any(equal(gl_LocalInvocationID.xy, uvec2(0, gl_WorkGroupSize.y - 1)))); + + float pixelLumas[3 * 3 - 1]; + for(int i = 0; i < 3 * 3 - 1; i++) { + vec3 color = loadSourceColor(ivec2(pixelPos), ivec2(i % 3, i / 3)); + pixelLumas[i] = rgbToLumaForEdges(color); + } + + vec2 qe0 = computeEdgeLuma(0, 0, pixelLumas); + vec2 qe1 = computeEdgeLuma(1, 0, pixelLumas); + vec2 qe2 = computeEdgeLuma(0, 1, pixelLumas); + vec2 qe3 = computeEdgeLuma(1, 1, pixelLumas); + + shared2x2FracEdgesV[center2x2.x][center2x2.y] = vec4(qe0.x, qe1.x, qe2.x, qe3.x); + shared2x2FracEdgesH[center2x2.x][center2x2.y] = vec4(qe0.y, qe1.y, qe2.y, qe3.y); + + barrier(); + + if(inOutputKernel) { + uvec4 outEdges = uvec4(0); + vec2 topRow = shared2x2FracEdgesH[center2x2.x][center2x2.y-1].zw; + vec2 leftColumn = shared2x2FracEdgesV[center2x2.x-1][center2x2.y].yw; + + bool someNonZeroEdges = any(notEqual(vec4(qe0, qe1) + vec4(qe2, qe3) + vec4(topRow[0], topRow[1], leftColumn[0], leftColumn[1]), vec4(0))); + if(someNonZeroEdges) { + // Clear deferred color list heads to empty (if potentially needed - even though + // some edges might get culled by local contrast adaptation step below, + // it's still cheaper to just clear it without additional logic) + imageStore(deferredBlendItemListHeads, ivec2(pixelPos) / 2, uvec4(0xFFFFFFFF, 0, 0, 0)); + // local contrast adaptation + + vec4 ce[4]; + vec2 dummyd0, dummyd1, dummyd2; + vec2 neighbourhood[4][4]; + + // load & unpack kernel data from SLM + sharedLoadQuadHV(center2x2 + ivec2(-1,-1), dummyd0, dummyd1, dummyd2, neighbourhood[0][0]); + sharedLoadQuadHV(center2x2 + ivec2( 0,-1), dummyd0, dummyd1, neighbourhood[1][0], neighbourhood[2][0]); + sharedLoadQuadHV(center2x2 + ivec2( 1,-1), dummyd0, dummyd1, neighbourhood[3][0], dummyd2); + sharedLoadQuadHV(center2x2 + ivec2(-1, 0), dummyd0, neighbourhood[0][1], dummyd1, neighbourhood[0][2]); + sharedLoadQuadHV(center2x2 + ivec2( 1, 0), 
neighbourhood[3][1], dummyd0, neighbourhood[3][2], dummyd1); + sharedLoadQuadHV(center2x2 + ivec2(-1, 1), dummyd0, neighbourhood[0][3], dummyd1, dummyd2); + sharedLoadQuadHV(center2x2 + ivec2( 0, 1), neighbourhood[1][3], neighbourhood[2][3], dummyd0, dummyd1); + // TODO: optimize + neighbourhood[1][0].y = topRow[0]; + neighbourhood[2][0].y = topRow[1]; + neighbourhood[0][1].x = leftColumn[0]; + neighbourhood[0][2].x = leftColumn[1]; + // + neighbourhood[1][1] = qe0; + neighbourhood[2][1] = qe1; + neighbourhood[1][2] = qe2; + neighbourhood[2][2] = qe3; + + topRow[0] = float((topRow[0] - computeLocalContrastH( 0, -1, neighbourhood)) > cmaa2EdgeThreshold); + topRow[1] = float((topRow[1] - computeLocalContrastH( 1, -1, neighbourhood)) > cmaa2EdgeThreshold); + leftColumn[0] = float((leftColumn[0] - computeLocalContrastV(-1, 0, neighbourhood)) > cmaa2EdgeThreshold); + leftColumn[1] = float((leftColumn[1] - computeLocalContrastV(-1, 1, neighbourhood)) > cmaa2EdgeThreshold); + + ce[0].x = float((qe0.x - computeLocalContrastV(0, 0, neighbourhood)) > cmaa2EdgeThreshold); + ce[0].y = float((qe0.y - computeLocalContrastH(0, 0, neighbourhood)) > cmaa2EdgeThreshold); + ce[1].x = float((qe1.x - computeLocalContrastV(1, 0, neighbourhood)) > cmaa2EdgeThreshold); + ce[1].y = float((qe1.y - computeLocalContrastH(1, 0, neighbourhood)) > cmaa2EdgeThreshold); + ce[2].x = float((qe2.x - computeLocalContrastV(0, 1, neighbourhood)) > cmaa2EdgeThreshold); + ce[2].y = float((qe2.y - computeLocalContrastH(0, 1, neighbourhood)) > cmaa2EdgeThreshold); + ce[3].x = float((qe3.x - computeLocalContrastV(1, 1, neighbourhood)) > cmaa2EdgeThreshold); + ce[3].y = float((qe3.y - computeLocalContrastH(1, 1, neighbourhood)) > cmaa2EdgeThreshold); + + ce[0].z = leftColumn[0]; + ce[1].z = ce[0].x; + ce[2].z = leftColumn[1]; + ce[3].z = ce[2].x; + + ce[0].w = topRow[0]; + ce[1].w = topRow[1]; + ce[2].w = ce[0].y; + ce[3].w = ce[1].y; + + for(int i = 0; i < 4; i++) { + uvec2 localPixelPos = pixelPos + 
qeOffsets[i]; + + vec4 edges = ce[i]; + + // if there's at least one two edge corner, this is a candidate for simple or complex shape processing... + bool isCandidate = (edges.x * edges.y + edges.y * edges.z + edges.z * edges.w + edges.w * edges.x) != 0; + if(isCandidate) { + uint counterIndex; + counterIndex = atomicAdd(controlBuffer.shapeCandidateCount, 1); + // 14-17 bits are free + shapeCandidates[counterIndex] = (localPixelPos.x << 18) | localPixelPos.y; + } + + // Write out edges - we write out all, including empty pixels, to make sure shape detection edge tracing + // doesn't continue on previous frame's edges that no longer exist. + outEdges[i] = packEdges(edges); + } + } + + storeEdge(pixelPos, outEdges); + } + + barrier(); + if(gl_LocalInvocationIndex==0) { + //NOTE: gl_NumWorkGroups is not implemented in DX12 + const ivec2 texSize = textureSize(sceneTonemapped, 0); + const ivec2 outputGroupSize = ivec2(CMAA2_CS_OUTPUT_KERNEL_SIZE_X, CMAA2_CS_OUTPUT_KERNEL_SIZE_Y); + const ivec2 groupCount = ivec2(texSize + outputGroupSize - 1) / (outputGroupSize * 2); + + if((atomicAdd(controlBuffer.iterator,1)+1)==(groupCount.x*groupCount.y)) { + memoryBarrierBuffer(); + controlBuffer.iterator = 0; + + const uint shapeCandidateCount = min(controlBuffer.shapeCandidateCount, shapeCandidates.length()); + controlBuffer.subsequentPassWorkloadSize = shapeCandidateCount; + + candidateCmd.x = (shapeCandidateCount + CMAA2_PROCESS_CANDIDATES_NUM_THREADS - 1) / CMAA2_PROCESS_CANDIDATES_NUM_THREADS; + candidateCmd.y = 1; + candidateCmd.z = 1; + } + } + } diff --git a/shader/antialiasing/cmaa2/process_candidates.comp b/shader/antialiasing/cmaa2/process_candidates.comp new file mode 100644 index 000000000..9467d30df --- /dev/null +++ b/shader/antialiasing/cmaa2/process_candidates.comp @@ -0,0 +1,376 @@ +#version 460 + +#extension GL_ARB_separate_shader_objects : enable +#extension GL_GOOGLE_include_directive : enable +#extension GL_EXT_control_flow_attributes : enable + +#include 
"cmaa2_common.glsl" + +layout(local_size_x = CMAA2_PROCESS_CANDIDATES_NUM_THREADS) in; + +layout(binding = 7, std430) buffer UboWorkingExecuteIndirectBuffer { + DispatchIndirectCommand candidateCmd; + DrawIndirectCommand applyCmd; + }; + +// this reschedules final part of work from few to all threads to increase hardware thread occupancy +#define CMAA2_COLLECT_EXPAND_BLEND_ITEMS 1 + +#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS +const uint CMAA2_BLEND_ITEM_SLM_SIZE = 768; +shared uint g_groupSharedBlendItemCount; +shared uvec2 g_groupSharedBlendItems[CMAA2_BLEND_ITEM_SLM_SIZE]; +#endif + +void storeColorSample(uvec2 pixelPos, vec3 color, bool isComplexShape) { + uint counterIndex = atomicAdd(controlBuffer.blendColorSamplesCount, 1); + + uvec2 quadPos = pixelPos / uvec2(2, 2); + uint offsetXY = (pixelPos.y % 2) * 2 + (pixelPos.x % 2); + // 27-29 bits are free (we don't use msaaIndex) + uint header = (offsetXY << 30) | (uint(isComplexShape) << 26); + + uint counterIndexWithHeader = counterIndex | header; + + uint originalIndex = imageAtomicExchange(deferredBlendItemListHeads, ivec2(quadPos), counterIndexWithHeader); + deferredBlendItemList[counterIndex] = uvec2(originalIndex, internalPackColor(color)); + + if(originalIndex == 0xFFFFFFFF) { + const uint edgeListCounter = atomicAdd(controlBuffer.blendLocationCount, 1); + if(edgeListCounter < deferredBlendLocationList.length()) { + deferredBlendLocationList[edgeListCounter] = (quadPos.x << 16) | quadPos.y; + } + } + } + +void detectZsHorizontal(in vec4 edges, in vec4 edgesM1P0, in vec4 edgesP1P0, in vec4 edgesP2P0, out float invertedZScore, out float normalZScore) { + invertedZScore = edges.r * edges.g * edgesP1P0.a; + invertedZScore *= 2.0 + ((edgesM1P0.g + edgesP2P0.a)) - (edges.a + edgesP1P0.g) - 0.7 * (edgesP2P0.g + edgesM1P0.a + edges.b + edgesP1P0.r); + + normalZScore = edges.r * edges.a * edgesP1P0.g; + normalZScore *= 2.0 + ((edgesM1P0.a + edgesP2P0.g)) - (edges.g + edgesP1P0.a) - 0.7 * (edgesP2P0.a + edgesM1P0.g + 
edges.b + edgesP1P0.r); + } + +void findZLineLengths(out float lineLengthLeft, out float lineLengthRight, uvec2 screenPos, bool horizontal, bool invertedZShape, vec2 stepRight) { + uint maskLeft, bitsContinueLeft, maskRight, bitsContinueRight; + { + uint maskTraceLeft = 0x08; + uint maskTraceRight = 0x02; + + if(!horizontal) { + maskTraceLeft = 0x04; + maskTraceRight = 0x01; + } + if(invertedZShape) { + uint temp = maskTraceLeft; + maskTraceLeft = maskTraceRight; + maskTraceRight = temp; + } + maskLeft = maskTraceLeft; + bitsContinueLeft = maskTraceLeft; + maskRight = maskTraceRight; + bitsContinueRight = maskTraceRight; + } + + bool continueLeft = true; + bool continueRight = true; + lineLengthLeft = 1.0; + lineLengthRight = 1.0; + + for(;;) { + uint edgeLeft = loadEdge(ivec2(screenPos) - ivec2(stepRight * lineLengthLeft), ivec2(0, 0)); + uint edgeRight = loadEdge(ivec2(screenPos) + ivec2(stepRight * (lineLengthRight + 1.0)), ivec2(0, 0)); + + continueLeft = continueLeft && ((edgeLeft & maskLeft) == bitsContinueLeft); + continueRight = continueRight && ((edgeRight & maskRight) == bitsContinueRight); + + lineLengthLeft += continueLeft ? 1.0 : 0.0; + lineLengthRight += continueRight ? 
1.0 : 0.0; + + float maxLR = max(lineLengthRight, lineLengthLeft); + + if(!continueLeft && !continueRight) + maxLR = float(maxLineLength); + +#if CMAA2_EXTRA_SHARPNESS + if(maxLR >= min(float(maxLineLength), (1.20 * min(lineLengthRight, lineLengthLeft) - 0.20))) +#else + if(maxLR >= min(float(maxLineLength), (1.25 * min(lineLengthRight, lineLengthLeft) - 0.25))) +#endif + break; + } + } + +#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS +bool collectBlendZs(uvec2 screenPos, bool horizontal, bool invertedZShape, float shapeQualityScore, float lineLengthLeft, float lineLengthRight, vec2 stepRight) { + float leftOdd = symmetryCorrectionOffset * mod(lineLengthLeft, 2.0); + float rightOdd = symmetryCorrectionOffset * mod(lineLengthRight, 2.0); + + float dampenEffect = clamp(float(lineLengthLeft + lineLengthRight - shapeQualityScore) * dampeningEffect, 0.0, 1.0); + + float loopFrom = -floor((lineLengthLeft + 1.0) / 2.0) + 1.0; + float loopTo = floor((lineLengthRight + 1.0) / 2.0); + + uint itemIndex; + const uint blendItemCount = uint(loopTo - loopFrom + 1.0); + itemIndex = atomicAdd(g_groupSharedBlendItemCount, blendItemCount); + + if((itemIndex + blendItemCount) > CMAA2_BLEND_ITEM_SLM_SIZE) + return false; + + float totalLength = loopTo - loopFrom + 1.0 - leftOdd - rightOdd; + float lerpStep = 1.0 / totalLength; + + float lerpFromK = (0.5 - leftOdd - loopFrom) * lerpStep; + + // 14-17 bits are free + uint itemHeader = (screenPos.x << 18) | screenPos.y; + uint itemValStatic = (horizontal ? 1 : 0) << 31 | (invertedZShape ? 1 : 0) << 30; + + for(float i = loopFrom; i <= loopTo; i++) { + float lerpVal = lerpStep * i + lerpFromK; + + float secondPart = (i > 0.0) ? 
1.0 : 0.0; + float srcOffset = 1.0 - secondPart * 2.0; + + float lerpK = (lerpStep * i + lerpFromK) * srcOffset + secondPart; + lerpK *= dampenEffect; + + ivec2 encodedItem; + encodedItem.x = int(itemHeader); + encodedItem.y = int(itemValStatic | ((uint(i + 256.0) /*& 0x3FF*/) << 20) | ((uint(srcOffset + 256.0) /*& 0x3FF*/) << 10) | uint(clamp(lerpK, 0.0, 1.0) * 1023.0 + 0.5)); + g_groupSharedBlendItems[itemIndex++] = uvec2(encodedItem); + } + return true; + } +#endif + +void blendZs(uvec2 screenPos, bool horizontal, bool invertedZShape, float shapeQualityScore, float lineLengthLeft, float lineLengthRight, vec2 stepRight) { + vec2 blendDir = horizontal ? vec2(0, -1) : vec2(-1, 0); + + if(invertedZShape) + blendDir = -blendDir; + + float leftOdd = symmetryCorrectionOffset * mod(lineLengthLeft, 2.0); + float rightOdd = symmetryCorrectionOffset * mod(lineLengthRight, 2.0); + + float dampenEffect = clamp(float(lineLengthLeft + lineLengthRight - shapeQualityScore) * dampeningEffect, 0.0, 1.0); + + float loopFrom = -floor((lineLengthLeft + 1.0) / 2.0) + 1.0; + float loopTo = floor((lineLengthRight + 1.0) / 2.0); + + float totalLength = loopTo - loopFrom + 1.0 - leftOdd - rightOdd; + float lerpStep = 1.0 / totalLength; + + float lerpFromK = (0.5 - leftOdd - loopFrom) * lerpStep; + + for(float i = loopFrom; i <= loopTo; i++) { + float lerpVal = lerpStep * i + lerpFromK; + + float secondPart = (i > 0.0) ? 
1.0 : 0.0; + float srcOffset = 1.0 - secondPart * 2.0; + + float lerpK = (lerpStep * i + lerpFromK) * srcOffset + secondPart; + lerpK *= dampenEffect; + + vec2 pixelPos = vec2(screenPos) + stepRight * i; + + vec3 colorCenter = loadSourceColor(ivec2(pixelPos), ivec2(0, 0)); + vec3 colorFrom = loadSourceColor(ivec2(pixelPos + blendDir * srcOffset), ivec2(0, 0)); + + vec3 outputCol = mix(colorCenter, colorFrom, lerpK); + + storeColorSample(uvec2(pixelPos), outputCol, true); + } + } + +vec4 computeSimpleShapeBlendValues(vec4 edges, vec4 edgesLeft, vec4 edgesRight, vec4 edgesTop, vec4 edgesBottom, bool dontTestShapeValidity) { + float fromRight = edges.r; + float fromBelow = edges.g; + float fromLeft = edges.b; + float fromAbove = edges.a; + + float blurCoeff = cmaa2SimpleShapeBlurinessAmount; + + float numberOfEdges = dot(edges, vec4(1, 1, 1, 1)); + + float numberOfEdgesAllAround = dot(edgesLeft.bga + edgesRight.rga + edgesTop.rba + edgesBottom.rgb, vec3(1, 1, 1)); + + if(!dontTestShapeValidity) { + if(numberOfEdges == 1) + blurCoeff = 0; + + if(numberOfEdges == 2) + blurCoeff *= ((1.0 - fromBelow * fromAbove) * (1.0 - fromRight * fromLeft)); + } + + if(numberOfEdges == 2) { + blurCoeff *= 0.75; + + float k = 0.9; + fromRight += k * (edges.g * edgesTop.r * (1.0 - edgesLeft.g) + edges.a * edgesBottom.r * (1.0 - edgesLeft.a)); + fromBelow += k * (edges.b * edgesRight.g * (1.0 - edgesTop.b) + edges.r * edgesLeft.g * (1.0 - edgesTop.r)); + fromLeft += k * (edges.a * edgesBottom.b * (1.0 - edgesRight.a) + edges.g * edgesTop.b * (1.0 - edgesRight.g)); + fromAbove += k * (edges.r * edgesLeft.a * (1.0 - edgesBottom.r) + edges.b * edgesRight.a * (1.0 - edgesBottom.b)); + } + + blurCoeff *= clamp(1.30 - numberOfEdgesAllAround / 10.0, 0.f, 1.f); + + return vec4(fromLeft, fromAbove, fromRight, fromBelow) * blurCoeff; + } + +void main() { +#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS + if(gl_LocalInvocationID.x == 0) + g_groupSharedBlendItemCount = 0; + barrier(); +#endif + + const uint 
numCandidates = controlBuffer.subsequentPassWorkloadSize; + if(gl_GlobalInvocationID.x < numCandidates) { + uint pixelID = shapeCandidates[gl_GlobalInvocationID.x]; + uvec2 pixelPos = uvec2((pixelID >> 18), pixelID & 0x3FFF); + + uint edgesCenterPacked = loadEdge(ivec2(pixelPos), ivec2(0, 0)); + vec4 edges = unpackEdgesFlt(edgesCenterPacked); + vec4 edgesLeft = unpackEdgesFlt(loadEdge(ivec2(pixelPos) + ivec2(-1, 0), ivec2(0, 0))); + vec4 edgesRight = unpackEdgesFlt(loadEdge(ivec2(pixelPos) + ivec2(1, 0), ivec2(0, 0))); + vec4 edgesBottom = unpackEdgesFlt(loadEdge(ivec2(pixelPos) + ivec2(0, 1), ivec2(0, 0))); + vec4 edgesTop = unpackEdgesFlt(loadEdge(ivec2(pixelPos) + ivec2(0, -1), ivec2(0, 0))); + + { + vec4 blendVal = computeSimpleShapeBlendValues(edges, edgesLeft, edgesRight, edgesTop, edgesBottom, true); + + float fourWeightSum = dot(blendVal, vec4(1.0)); + float centerWeight = 1.0 - fourWeightSum; + + vec3 outColor = loadSourceColor(ivec2(pixelPos), ivec2(0, 0)) * centerWeight; + if(blendVal.x > 0.0) + outColor += blendVal.x * loadSourceColor(ivec2(pixelPos), ivec2(-1, 0)); + if(blendVal.y > 0.0) + outColor += blendVal.y * loadSourceColor(ivec2(pixelPos), ivec2(0, -1)); + if(blendVal.z > 0.0) + outColor += blendVal.z * loadSourceColor(ivec2(pixelPos), ivec2(1, 0)); + if(blendVal.w > 0.0) + outColor += blendVal.w * loadSourceColor(ivec2(pixelPos), ivec2(0, 1)); + + storeColorSample(pixelPos, outColor, false); + } + + { + float invertedZScore; + float normalZScore; + float maxScore; + bool horizontal = true; + bool invertedZ = false; + + { + vec4 edgesM1P0 = edgesLeft; + vec4 edgesP1P0 = edgesRight; + vec4 edgesP2P0 = unpackEdgesFlt(loadEdge(ivec2(pixelPos) + ivec2(2, 0), ivec2(0, 0))); + + detectZsHorizontal(edges, edgesM1P0, edgesP1P0, edgesP2P0, invertedZScore, normalZScore); + maxScore = max(invertedZScore, normalZScore); + + if(maxScore > 0.0) + invertedZ = invertedZScore > normalZScore; + } + + { + vec4 edgesM1P0 = edgesBottom; + vec4 edgesP1P0 = edgesTop; 
+ vec4 edgesP2P0 = unpackEdgesFlt(loadEdge(ivec2(pixelPos) + ivec2(0, -2), ivec2(0, 0))); + + detectZsHorizontal(edges.argb, edgesM1P0.argb, edgesP1P0.argb, edgesP2P0.argb, invertedZScore, normalZScore); + float vertScore = max(invertedZScore, normalZScore); + + if(vertScore > maxScore) { + maxScore = vertScore; + horizontal = false; + invertedZ = invertedZScore > normalZScore; + } + } + + if(maxScore > 0.0) { +#if CMAA2_EXTRA_SHARPNESS + float shapeQualityScore = round(clamp(4.0 - maxScore, 0.0, 3.0)); +#else + float shapeQualityScore = floor(clamp(4.0 - maxScore, 0.0, 3.0)); +#endif + + vec2 stepRight = horizontal ? vec2(1, 0) : vec2(0, -1); + float lineLengthLeft, lineLengthRight; + findZLineLengths(lineLengthLeft, lineLengthRight, pixelPos, horizontal, invertedZ, stepRight); + + lineLengthLeft -= shapeQualityScore; + lineLengthRight -= shapeQualityScore; + + if((lineLengthLeft + lineLengthRight) >= 5.0) { +#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS + if(!collectBlendZs(pixelPos, horizontal, invertedZ, shapeQualityScore, lineLengthLeft, lineLengthRight, stepRight)) +#endif + blendZs(pixelPos, horizontal, invertedZ, shapeQualityScore, lineLengthLeft, lineLengthRight, stepRight); + } + } + } + } + +#if CMAA2_COLLECT_EXPAND_BLEND_ITEMS + barrier(); + + uint totalItemCount = min(CMAA2_BLEND_ITEM_SLM_SIZE, g_groupSharedBlendItemCount); + + uint loops = (totalItemCount + (CMAA2_PROCESS_CANDIDATES_NUM_THREADS - 1) - gl_LocalInvocationID.x) / CMAA2_PROCESS_CANDIDATES_NUM_THREADS; + + for(uint loop = 0; loop < loops; loop++) { + uint index = loop * CMAA2_PROCESS_CANDIDATES_NUM_THREADS + gl_LocalInvocationID.x; + + uvec2 itemVal = g_groupSharedBlendItems[index]; + + uvec2 startingPos = uvec2((itemVal.x >> 18), itemVal.x & 0x3FFF); + + bool itemHorizontal = bool((itemVal.y >> 31) & 1); + bool itemInvertedZ = bool((itemVal.y >> 30) & 1); + float itemStepIndex = float((itemVal.y >> 20) & 0x3FF) - 256.0; + float itemSrcOffset = float((itemVal.y >> 10) & 0x3FF) - 256.0; + float 
itemLerpK = float(itemVal.y & 0x3FF) / 1023.0; + + vec2 itemStepRight = itemHorizontal ? vec2(1, 0) : vec2(0, -1); + vec2 itemBlendDir = itemHorizontal ? vec2(0, -1) : vec2(-1, 0); + if(itemInvertedZ) + itemBlendDir = -itemBlendDir; + + uvec2 itemPixelPos = startingPos + uvec2(itemStepRight * itemStepIndex); + + vec3 colorCenter = loadSourceColor(ivec2(itemPixelPos), ivec2(0, 0)); + vec3 colorFrom = loadSourceColor(ivec2(itemPixelPos + itemBlendDir * itemSrcOffset), ivec2(0, 0)); + + vec3 outputColor = mix(colorCenter, colorFrom, itemLerpK); + + storeColorSample(ivec2(itemPixelPos), outputColor, true); + } +#endif + + barrier(); + if(gl_LocalInvocationIndex==0) { + //NOTE: gl_NumWorkGroups is not implemented in DX12 + const uint shapeCandidateCount = controlBuffer.subsequentPassWorkloadSize; + const uint numWorkGroups = (shapeCandidateCount + CMAA2_PROCESS_CANDIDATES_NUM_THREADS - 1) / CMAA2_PROCESS_CANDIDATES_NUM_THREADS; + + if((atomicAdd(controlBuffer.iterator,1)+1)==numWorkGroups) { + memoryBarrierBuffer(); + controlBuffer.iterator = 0; + + const uint blendLocationCount = min(controlBuffer.blendLocationCount, deferredBlendLocationList.length()); + applyCmd.vertexCount = blendLocationCount * 4; // 4 points per quad + applyCmd.instanceCount = 1; + applyCmd.firstVertex = 0; + applyCmd.firstInstance = 0; + + controlBuffer.subsequentPassWorkloadSize = blendLocationCount; + controlBuffer.blendLocationCount = 0; + controlBuffer.shapeCandidateCount = 0; + controlBuffer.blendColorSamplesCount = 0; + } + } + } diff --git a/shader/antialiasing/fxaa.frag b/shader/antialiasing/fxaa.frag deleted file mode 100644 index ba9d325a4..000000000 --- a/shader/antialiasing/fxaa.frag +++ /dev/null @@ -1,88 +0,0 @@ -#version 450 - -#extension GL_GOOGLE_include_directive : enable - -#define FXAA_GLSL_130 1 - -#if FXAA_QUALITY_SETTING == 0 - #define FXAA_QUALITY_PRESET 10 - #define FXAA_PC_CONSOLE 1 -#elif FXAA_QUALITY_SETTING == 1 - #define FXAA_QUALITY_PRESET 10 - #define FXAA_PC 1 
-#elif FXAA_QUALITY_SETTING == 2 - #define FXAA_QUALITY_PRESET 15 - #define FXAA_PC 1 -#elif FXAA_QUALITY_SETTING == 3 - #define FXAA_QUALITY_PRESET 29 - #define FXAA_PC 1 -#elif FXAA_QUALITY_SETTING == 4 - #define FXAA_QUALITY_PRESET 39 - #define FXAA_PC 1 -#endif - -#include "Fxaa3_11.h" - -layout(push_constant, std140) uniform PushConstantsFxaa { - float fxaaInverseSharpnessCoeff; - float fxaaQualitySubpix; - float fxaaQualityEdgeThreshold; - float fxaaQualityEdgeThresholdMin; - float fxaaConsoleEdgeSharpness; - float fxaaConsoleEdgeThreshold; - float fxaaConsoleEdgeThresholdMin; - }; - -layout(binding = 0) uniform sampler2D aliasedInput; - -layout(location = 0) out vec4 outColor; - -void main() { - vec2 screenSize = textureSize(aliasedInput, 0); - vec2 screenSizeInv = 1.f / screenSize; - - vec2 uv = gl_FragCoord.xy * screenSizeInv; - - vec4 fxaaConsolePosPos = vec4(uv - screenSizeInv, uv + screenSizeInv); - - const vec4 fxaaConsoleRcpFrameOpt = vec4( - -fxaaInverseSharpnessCoeff * screenSizeInv.x, - -fxaaInverseSharpnessCoeff * screenSizeInv.y, - fxaaInverseSharpnessCoeff * screenSizeInv.x, - fxaaInverseSharpnessCoeff * screenSizeInv.y - ); - - const vec4 fxaaConsoleRcpFrameOpt2 = vec4( - -2.f * screenSizeInv.x, - -2.f * screenSizeInv.y, - 2.f * screenSizeInv.x, - 2.f * screenSizeInv.y - ); - - const vec4 fxaaConsole360RcpFrameOpt2 = vec4( - 8.f * screenSizeInv.x, - 8.f * screenSizeInv.y, - -4.f * screenSizeInv.x, - -4.f * screenSizeInv.y - ); - - const vec4 fxaaConsole360ConstDir = vec4(0.f, 0.f, 0.f, 0.f); - - outColor = FxaaPixelShader( - uv, - fxaaConsolePosPos, - aliasedInput, - aliasedInput, - aliasedInput, - screenSizeInv, - fxaaConsoleRcpFrameOpt, - fxaaConsoleRcpFrameOpt2, - fxaaConsole360RcpFrameOpt2, - fxaaQualitySubpix, - fxaaQualityEdgeThreshold, - fxaaQualityEdgeThresholdMin, - fxaaConsoleEdgeSharpness, - fxaaConsoleEdgeThreshold, - fxaaConsoleEdgeThresholdMin, - fxaaConsole360ConstDir); - } \ No newline at end of file diff --git 
a/shader/common.glsl b/shader/common.glsl index 4d88a8bea..b0fde0187 100644 --- a/shader/common.glsl +++ b/shader/common.glsl @@ -218,6 +218,39 @@ vec3 i_octahedral_32( uint data ) { return normalize(nor); } +// the next packing/unpacking methods are taken from +// https://github.com/Microsoft/DirectX-Graphics-Samples/blob/master/MiniEngine/Core/Shaders/PixelPacking_R11G11B10.hlsli +uint packR11G11B10F(vec3 rgb) { + rgb = min(rgb, uintBitsToFloat(0x477C0000)); + uint r = ((packHalf2x16(vec2(rgb.r, 0)) + 8) >> 4) & 0x000007FF; + uint g = ((packHalf2x16(vec2(rgb.g, 0)) + 8) << 7) & 0x003FF800; + uint b = ((packHalf2x16(vec2(rgb.b, 0)) + 16) << 17) & 0xFFC00000; + return r | g | b; + } + +vec3 unpackR11G11B10F(uint rgb) { + float r = unpackHalf2x16((rgb << 4) & 0x7FF0).r; + float g = unpackHalf2x16((rgb >> 7) & 0x7FF0).r; + float b = unpackHalf2x16((rgb >> 17) & 0x7FE0).r; + return vec3(r, g, b); + } + +// This is like R11G11B10F except that it moves one bit from each exponent to each mantissa. +uint packR11G11B10E4F(vec3 rgb) { + rgb = clamp(rgb, 0.0, uintBitsToFloat(0x3FFFFFFF)); + uint r = ((packHalf2x16(vec2(rgb.r, 0)) + 4) >> 3) & 0x000007FF; + uint g = ((packHalf2x16(vec2(rgb.g, 0)) + 4) << 8) & 0x003FF800; + uint b = ((packHalf2x16(vec2(rgb.b, 0)) + 8) << 18) & 0xFFC00000; + return r | g | b; + } + +vec3 unpackR11G11B10E4F(uint rgb) { + float r = unpackHalf2x16((rgb << 3) & 0x3FF8).r; + float g = unpackHalf2x16((rgb >> 8) & 0x3FF8).r; + float b = unpackHalf2x16((rgb >> 18) & 0x3FF0).r; + return vec3(r, g, b); + } + uint encodeNormal(vec3 n) { return octahedral_32(n); }