Skip to content

Commit

Permalink
Metal: simplify color_cell_compression_est by using vector types and …
Browse files Browse the repository at this point in the history
…folding two sides of weights branch into one

Quality 0: CPU ~268, GPU 150 -> 154
Quality 1: CPU ~48.3, GPU 58.5 -> 60.8
Quality 3: CPU ~33.3, GPU ~44.0
Shader load 13.2 -> 12.4 sec
  • Loading branch information
aras-p committed Jan 6, 2021
1 parent 4ec1065 commit dca2217
Showing 1 changed file with 42 additions and 168 deletions.
210 changes: 42 additions & 168 deletions src/shaders/metal/bc7e.metal
Original file line number Diff line number Diff line change
Expand Up @@ -1466,226 +1466,100 @@ static uint32_t color_cell_compression(uint32_t mode, const thread color_cell_co
return pResults->m_best_overall_err;
}

static uint32_t color_cell_compression_est(uint32_t mode, const thread color_cell_compressor_params* pParams, uint32_t best_err_so_far, const uint part_mask, const uint partition, const thread uchar4* pPixels)
static uint color_cell_compression_est(uint32_t mode, const thread color_cell_compressor_params* pParams, uint32_t best_err_so_far, const uint part_mask, const uint partition, const thread uchar4* pPixels)
{
assert((pParams->m_num_selector_weights == 4) || (pParams->m_num_selector_weights == 8));

float lr = 255, lg = 255, lb = 255;
float hr = 0, hg = 0, hb = 0;
float3 ll = 255;
float3 hh = 0;
for (uint i = 0, pm = part_mask; i < 16; ++i, pm >>= 2)
{
if ((pm & 3) != partition)
continue;
auto p = pPixels[i].rgb;

float r = p.r;
float g = p.g;
float b = p.b;

lr = min(lr, r);
lg = min(lg, g);
lb = min(lb, b);

hr = max(hr, r);
hg = max(hg, g);
hb = max(hb, b);
float3 p = float3(pPixels[i].rgb);
ll = min(ll, p);
hh = max(hh, p);
}

const uint32_t N = 1 << g_bc7_color_index_bitcount[mode];

uint32_t total_err = 0;

float sr = lr;
float sg = lg;
float sb = lb;
float3 ss = ll;
float3 di = hh - ll;
float3 fa = di;

float dir = hr - lr;
float dig = hg - lg;
float dib = hb - lb;

float far = dir;
float fag = dig;
float fab = dib;

float low = far * sr + fag * sg + fab * sb;
float high = far * hr + fag * hg + fab * hb;
float low = fa.r * ll.r + fa.g * ll.g + fa.b * ll.b;
float high = fa.r * hh.r + fa.g * hh.g + fa.b * hh.b;

float scale = ((float)N - 1) / (float)(high - low);
float inv_n = 1.0f / ((float)N - 1);

float total_errf = 0;

float err = 0;
// We don't handle perceptual very well here, but the difference is very slight (<.05 dB avg Luma PSNR across a large corpus) and the perf lost was high (2x slower).
if ((pParams->m_weights[0] != 1) || (pParams->m_weights[1] != 1) || (pParams->m_weights[2] != 1))
{
float wr = pParams->m_weights[0];
float wg = pParams->m_weights[1];
float wb = pParams->m_weights[2];

for (uint i = 0, pm = part_mask; i < 16; ++i, pm >>= 2)
{
if ((pm & 3) != partition)
continue;
auto c = pPixels[i];

float d = far * (float)c.r + fag * (float)c.g + fab * (float)c.b;

float s = clamp(floor((d - low) * scale + .5f) * inv_n, 0.0f, 1.0f);

float itr = sr + dir * s;
float itg = sg + dig * s;
float itb = sb + dib * s;

float dr = itr - (float)c.r;
float dg = itg - (float)c.g;
float db = itb - (float)c.b;

total_errf += wr * dr * dr + wg * dg * dg + wb * db * db;
}
}
else
{
float3 w = float3(pParams->m_weights.rgb);
for (uint i = 0, pm = part_mask; i < 16; ++i, pm >>= 2)
{
if ((pm & 3) != partition)
continue;
auto c = pPixels[i];

float d = far * (float)c.r + fag * (float)c.g + fab * (float)c.b;
float3 c = float3(pPixels[i].rgb);

float s = clamp(floor((d - low) * scale + .5f) * inv_n, 0.0f, 1.0f);

float itr = sr + dir * s;
float itg = sg + dig * s;
float itb = sb + dib * s;

float dr = itr - (float)c.r;
float dg = itg - (float)c.g;
float db = itb - (float)c.b;

total_errf += dr * dr + dg * dg + db * db;
float d = fa.r * c.r + fa.g * c.g + fa.b * c.b;
float s = saturate(floor((d - low) * scale + .5f) * inv_n);

float3 it = ss + di * s;
float3 dd = it - c;
err += w.r * dd.r * dd.r + w.g * dd.g * dd.g + w.b * dd.b * dd.b;
}
}

total_err = (int32_t)total_errf;

return total_err;
return (uint)err;
}

static uint32_t color_cell_compression_est_mode7(uint32_t mode, const thread color_cell_compressor_params* pParams, uint32_t best_err_so_far, const uint part_mask, const uint partition, const thread uchar4* pPixels)
static uint color_cell_compression_est_mode7(uint32_t mode, const thread color_cell_compressor_params* pParams, uint32_t best_err_so_far, const uint part_mask, const uint partition, const thread uchar4* pPixels)
{
assert((mode == 7) && (pParams->m_num_selector_weights == 4));

float lr = 255, lg = 255, lb = 255, la = 255;
float hr = 0, hg = 0, hb = 0, ha = 0;
float4 ll = 255;
float4 hh = 0;
for (uint i = 0, pm = part_mask; i < 16; ++i, pm >>= 2)
{
if ((pm & 3) != partition)
continue;
auto p = pPixels[i];
float r = p.r;
float g = p.g;
float b = p.b;
float a = p.a;

lr = min(lr, r);
lg = min(lg, g);
lb = min(lb, b);
la = min(la, a);

hr = max(hr, r);
hg = max(hg, g);
hb = max(hb, b);
ha = max(ha, a);
float4 p = float4(pPixels[i]);
ll = min(ll, p);
hh = max(hh, p);
}

const uint32_t N = 4;

uint32_t total_err = 0;

float sr = lr;
float sg = lg;
float sb = lb;
float sa = la;

float dir = hr - lr;
float dig = hg - lg;
float dib = hb - lb;
float dia = ha - la;
float4 ss = ll;
float4 di = hh - ll;
float4 fa = di;

float far = dir;
float fag = dig;
float fab = dib;
float faa = dia;

float low = far * sr + fag * sg + fab * sb + faa * sa;
float high = far * hr + fag * hg + fab * hb + faa * ha;
float low = fa.r * ll.r + fa.g * ll.g + fa.b * ll.b + fa.a * ll.a;
float high = fa.r * hh.r + fa.g * hh.g + fa.b * hh.b + fa.a * hh.a;

float scale = ((float)N - 1) / (float)(high - low);
float inv_n = 1.0f / ((float)N - 1);

float total_errf = 0;

float err = 0;
// We don't handle perceptual very well here, but the difference is very slight (<.05 dB avg Luma PSNR across a large corpus) and the perf lost was high (2x slower).
if ( (!pParams->m_perceptual) && ((pParams->m_weights[0] != 1) || (pParams->m_weights[1] != 1) || (pParams->m_weights[2] != 1) || (pParams->m_weights[3] != 1)) )
{
float wr = pParams->m_weights[0];
float wg = pParams->m_weights[1];
float wb = pParams->m_weights[2];
float wa = pParams->m_weights[3];

float4 w = pParams->m_perceptual ? float4(1,1,1,1) : float4(pParams->m_weights);
for (uint i = 0, pm = part_mask; i < 16; ++i, pm >>= 2)
{
if ((pm & 3) != partition)
continue;
const auto c = pPixels[i];
float4 c = float4(pPixels[i]);

float d = far * (float)c.r + fag * (float)c.g + fab * (float)c.b + faa * (float)c.a;

float s = clamp(floor((d - low) * scale + .5f) * inv_n, 0.0f, 1.0f);

float itr = sr + dir * s;
float itg = sg + dig * s;
float itb = sb + dib * s;
float ita = sa + dia * s;

float dr = itr - (float)c.r;
float dg = itg - (float)c.g;
float db = itb - (float)c.b;
float da = ita - (float)c.a;

total_errf += wr * dr * dr + wg * dg * dg + wb * db * db + wa * da * da;
}
}
else
{
for (uint i = 0, pm = part_mask; i < 16; ++i, pm >>= 2)
{
if ((pm & 3) != partition)
continue;
const auto c = pPixels[i];

float d = far * (float)c.r + fag * (float)c.g + fab * (float)c.b + faa * (float)c.a;

float s = clamp(floor((d - low) * scale + .5f) * inv_n, 0.0f, 1.0f);

float itr = sr + dir * s;
float itg = sg + dig * s;
float itb = sb + dib * s;
float ita = sa + dia * s;

float dr = itr - (float)c.r;
float dg = itg - (float)c.g;
float db = itb - (float)c.b;
float da = ita - (float)c.a;

total_errf += dr * dr + dg * dg + db * db + da * da;
float d = fa.r * c.r + fa.g * c.g + fa.b * c.b + fa.a * c.a;
float s = saturate(floor((d - low) * scale + .5f) * inv_n);

float4 it = ss + di * s;
float4 dd = it - c;
err += w.r * dd.r * dd.r + w.g * dd.g * dd.g + w.b * dd.b * dd.b + w.a * dd.a * dd.a;
}
}

total_err = (int32_t)total_errf;

return total_err;
return (uint)err;
}

static uint32_t estimate_partition(uint32_t mode, const uchar4 pixels[16], const constant bc7e_compress_block_params* pComp_params)
Expand Down

0 comments on commit dca2217

Please sign in to comment.