Skip to content

Commit

Permalink
Accelerated alphaMaskBlend
Browse files Browse the repository at this point in the history
  • Loading branch information
TellowKrinkle committed Jun 10, 2021
1 parent 708d812 commit 52fc831
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 21 deletions.
49 changes: 28 additions & 21 deletions src/PonscripterLabel_image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,29 +321,36 @@ void PonscripterLabel::alphaMaskBlend(SDL_Surface *mask_surface, int trans_mode,
mask_value >>= fmt->Bloss;

if (( trans_mode == ALPHA_BLEND_FADE_MASK || trans_mode == ALPHA_BLEND_CROSSFADE_MASK ) && mask_surface) {
int mask_off_base_y = rect.y % mask_surface->h;
int mask_off_base_x = rect.x % mask_surface->w;
for ( int i=0, my=mask_off_base_y ; i<rect.h ; i++, my++ ) {
if (my >= mask_surface->h) { my = 0; }
ONSBuf *mask_buffer = (ONSBuf *)mask_surface->pixels + mask_surface->w * my;
int offset=rect.x;
for ( int j=rect.w, mx=mask_off_base_x ; j ; j--, mx++ ) {
if (mx >= mask_surface->w) { mx = 0; }
Uint32 mask2 = 0;
Uint32 mask = *(mask_buffer + mx) & fmt->Bmask;
if ( mask_value > mask ){
mask2 = mask_value - mask;
if ( mask2 & overflow_mask ) mask2 = fmt->Bmask;
bool accelerated_ok = sizeof(ONSBuf) == 4 && fmt->Bmask == 0xff;
if (accelerated_ok) {
bool ok = AnimationInfo::gfx.alphaMaskBlend(dst, src1, src2, mask_surface, rect, mask_value);
accelerated_ok &= ok;
}
if (!accelerated_ok) {
int mask_off_base_y = rect.y % mask_surface->h;
int mask_off_base_x = rect.x % mask_surface->w;
for ( int i=0, my=mask_off_base_y ; i<rect.h ; i++, my++ ) {
if (my >= mask_surface->h) { my = 0; }
ONSBuf *mask_buffer = (ONSBuf *)mask_surface->pixels + mask_surface->w * my;
int offset=rect.x;
for ( int j=rect.w, mx=mask_off_base_x ; j ; j--, mx++ ) {
if (mx >= mask_surface->w) { mx = 0; }
Uint32 mask2 = 0;
Uint32 mask = *(mask_buffer + mx) & fmt->Bmask;
if ( mask_value > mask ){
mask2 = mask_value - mask;
if ( mask2 & overflow_mask ) mask2 = fmt->Bmask;
}
#ifndef BPP16
Uint32 mask1 = mask2 ^ fmt->Bmask;
#endif
BLEND_MASK_PIXEL();
++dst_buffer, ++src1_buffer, ++src2_buffer, ++offset;
}
#ifndef BPP16
Uint32 mask1 = mask2 ^ fmt->Bmask;
#endif
BLEND_MASK_PIXEL();
++dst_buffer, ++src1_buffer, ++src2_buffer, ++offset;
src1_buffer += rwidth;
src2_buffer += rwidth;
dst_buffer += rwidth;
}
src1_buffer += rwidth;
src2_buffer += rwidth;
dst_buffer += rwidth;
}
}
else{ // ALPHA_BLEND_CONST
Expand Down
6 changes: 6 additions & 0 deletions src/graphics_accelerated.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ void imageFilterBlend_Basic(Uint32 *dst_buffer, Uint32 *src_buffer,
BASIC_BLEND();
}

/// Portable stand-in for the accelerated mask blend.
///
/// No blending is performed here. The function always returns false, which
/// signals to the caller that the destination surface was left untouched.
bool alphaMaskBlend_Basic(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value)
{
    // Every argument is deliberately ignored; silence unused-parameter warnings.
    static_cast<void>(dst);
    static_cast<void>(s1);
    static_cast<void>(s2);
    static_cast<void>(mask_surface);
    static_cast<void>(rect);
    static_cast<void>(mask_value);

    const bool handled = false;
    return handled;
}

AcceleratedGraphicsFunctions AcceleratedGraphicsFunctions::accelerated() {
AcceleratedGraphicsFunctions out;

Expand All @@ -108,10 +112,12 @@ AcceleratedGraphicsFunctions AcceleratedGraphicsFunctions::accelerated() {
out._imageFilterAddTo = imageFilterAddTo_SSE2;
out._imageFilterSubFrom = imageFilterSubFrom_SSE2;
out._imageFilterBlend = imageFilterBlend_SSE2;
out._alphaMaskBlend = alphaMaskBlend_SSE2;
}
if (_M_SSE >= 0x301 || ecx & bit_SSSE3) {
printf("SSSE3 ");
out._imageFilterBlend = imageFilterBlend_SSSE3;
out._alphaMaskBlend = alphaMaskBlend_SSSE3;
}
printf("\n");
}
Expand Down
7 changes: 7 additions & 0 deletions src/graphics_accelerated.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,22 @@ void imageFilterMean_Basic(unsigned char *src1, unsigned char *src2, unsigned ch
void imageFilterAddTo_Basic(unsigned char *dst, unsigned char *src, int length);
void imageFilterSubFrom_Basic(unsigned char *dst, unsigned char *src, int length);
void imageFilterBlend_Basic(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length);
bool alphaMaskBlend_Basic(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value);

class AcceleratedGraphicsFunctions {
void (*_imageFilterMean)(unsigned char *src1, unsigned char *src2, unsigned char *dst, int length);
void (*_imageFilterAddTo)(unsigned char *dst, unsigned char *src, int length);
void (*_imageFilterSubFrom)(unsigned char *dst, unsigned char *src, int length);
void (*_imageFilterBlend)(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length);
bool (*_alphaMaskBlend)(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value);

public:
AcceleratedGraphicsFunctions() {
_imageFilterMean = imageFilterMean_Basic;
_imageFilterAddTo = imageFilterAddTo_Basic;
_imageFilterSubFrom = imageFilterSubFrom_Basic;
_imageFilterBlend = imageFilterBlend_Basic;
_alphaMaskBlend = alphaMaskBlend_Basic;
}
/// Returns a function table that was default-constructed, i.e. one that
/// dispatches to the *_Basic routines only.
static AcceleratedGraphicsFunctions basic() {
    AcceleratedGraphicsFunctions table;
    return table;
}
static AcceleratedGraphicsFunctions accelerated();
Expand All @@ -62,4 +65,8 @@ class AcceleratedGraphicsFunctions {
/// Forwards to whichever imageFilterBlend implementation this table selected.
void imageFilterBlend(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length)
{
    (*_imageFilterBlend)(dst_buffer, src_buffer, alphap, alpha, length);
}

/// Forwards to whichever alphaMaskBlend implementation this table selected.
/// @return the selected implementation's result unchanged.
bool alphaMaskBlend(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value)
{
    const bool handled = (*_alphaMaskBlend)(dst, s1, s2, mask_surface, rect, mask_value);
    return handled;
}
};
5 changes: 5 additions & 0 deletions src/graphics_sse2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,4 +117,9 @@ void imageFilterBlend_SSE2(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap
imageFilterBlend_SSE_Common(dst_buffer, src_buffer, alphap, alpha, length);
}

/// SSE2 entry point for the mask blend.
///
/// Delegates all work to alphaMaskBlend_SSE_Common and returns its result.
/// NOTE(review): presumably this wrapper exists so the shared helper is
/// instantiated under this file's instruction-set flags — confirm in the build.
bool alphaMaskBlend_SSE2(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value) {
    const bool blended = alphaMaskBlend_SSE_Common(dst, s1, s2, mask_surface, rect, mask_value);
    return blended;
}

#endif
1 change: 1 addition & 0 deletions src/graphics_sse2.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,6 @@ void imageFilterMean_SSE2(unsigned char *src1, unsigned char *src2, unsigned cha
void imageFilterAddTo_SSE2(unsigned char *dst, unsigned char *src, int length);
void imageFilterSubFrom_SSE2(unsigned char *dst, unsigned char *src, int length);
void imageFilterBlend_SSE2(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length);
bool alphaMaskBlend_SSE2(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value);

#endif
5 changes: 5 additions & 0 deletions src/graphics_ssse3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,9 @@ void imageFilterBlend_SSSE3(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alpha
imageFilterBlend_SSE_Common(dst_buffer, src_buffer, alphap, alpha, length);
}

/// SSSE3 entry point for the mask blend.
///
/// Delegates all work to alphaMaskBlend_SSE_Common and returns its result.
/// NOTE(review): presumably this wrapper exists so the shared helper is
/// instantiated under this file's instruction-set flags — confirm in the build.
bool alphaMaskBlend_SSSE3(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value) {
    const bool blended = alphaMaskBlend_SSE_Common(dst, s1, s2, mask_surface, rect, mask_value);
    return blended;
}

#endif
1 change: 1 addition & 0 deletions src/graphics_ssse3.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <SDL.h>

void imageFilterBlend_SSSE3(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length);
bool alphaMaskBlend_SSSE3(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value);

#endif

108 changes: 108 additions & 0 deletions src/graphics_x86_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@

#include "graphics_common.h"

/// Returns a pointer to the start of row `y` of `surface`, typed as `Px`.
///
/// The row offset is computed in bytes from `surface->pitch`, not from the
/// surface width.
template <typename Px>
static HELPER_FN Px *getPointerToRow(SDL_Surface *surface, int y) {
    char *const first_byte = static_cast<char*>(surface->pixels);
    return reinterpret_cast<Px*>(first_byte + surface->pitch * y);
}

/// 0x0000gg?? -> 0x00gg00gg
static HELPER_FN __m128i extractFromGTo16L(__m128i v) {
#ifdef __SSSE3__
Expand All @@ -51,6 +57,16 @@ static HELPER_FN __m128i extractG(__m128i v) {
#endif
}

/// Per 32-bit lane: 0x000000bb -> 0x00bb00bb
/// (duplicates the low byte into the low byte of both 16-bit halves).
static HELPER_FN __m128i extractBTo16L(__m128i v) {
#ifdef __SSSE3__
    // Byte shuffle: source byte 0/4/8/12 of each lane goes to output bytes 0 and 2;
    // a control byte of 0x80 writes zero to that output byte.
    const __m128i shuffle_ctl = _mm_setr_epi8(0, 0x80, 0, 0x80, 4, 0x80, 4, 0x80, 8, 0x80, 8, 0x80, 12, 0x80, 12, 0x80);
    return _mm_shuffle_epi8(v, shuffle_ctl);
#else
    // Shift-and-or fallback; only yields 0x00bb00bb when the upper three
    // bytes of each input lane are already zero.
    const __m128i upper_copy = _mm_slli_epi32(v, 16);
    return _mm_or_si128(upper_copy, v);
#endif
}

static HELPER_FN void imageFilterBlend_SSE_Common(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length) {
int n = length;

Expand Down Expand Up @@ -104,4 +120,96 @@ static HELPER_FN void imageFilterBlend_SSE_Common(Uint32 *dst_buffer, Uint32 *sr
BASIC_BLEND();
}

/// Blends one pixel of `s1` and `s2` according to a mask pixel.
///
/// The weight given to `s2` is `mask_value - (msk & 0xFF)`, floored at 0 and
/// capped at 0xFF; `s1` receives the complementary weight (weight ^ 0xFF).
/// Only the RBMASK and GMASK channels are present in the returned pixel.
static Uint32 blendMaskOnePixel(Uint32 s1, Uint32 s2, Uint32 msk, Uint32 mask_value) {
    const Uint32 threshold = msk & 0xFF;

    // Floor at zero (unsigned subtraction would wrap otherwise), then cap at 0xFF.
    const Uint32 raw_weight = (mask_value > threshold) ? (mask_value - threshold) : 0;
    const Uint32 weight_s2 = (raw_weight > 0xFF) ? 0xFF : raw_weight;
    const Uint32 weight_s1 = weight_s2 ^ 0xFF;

    // Red/blue are blended together in one multiply, green separately.
    const Uint32 red_blue = (((s1 & RBMASK) * weight_s1 + (s2 & RBMASK) * weight_s2) >> 8) & RBMASK;
    const Uint32 green = (((s1 & GMASK) * weight_s1 + (s2 & GMASK) * weight_s2) >> 8) & GMASK;
    return red_blue | green;
}

/// Mask-driven blend of `s1` and `s2` into `dst` over `rect`, four pixels at a time.
///
/// For every pixel in `rect` the weight of `s2` is derived from `mask_value`
/// minus the low byte of the corresponding mask pixel (clamped to 0..0xFF);
/// the mask is tiled, wrapping in both x and y. All surfaces are accessed as
/// rows of Uint32 pixels.
///
/// @param dst          destination surface, written inside `rect`
/// @param s1           first source surface, read inside `rect`
/// @param s2           second source surface, read inside `rect`
/// @param mask_surface tiled mask; must be non-null (dereferenced unconditionally)
/// @param rect         region to process, in surface pixel coordinates
/// @param mask_value   current blend threshold
/// @return false (nothing written) if the mask is narrower than 4 pixels, true otherwise.
///
/// NOTE(review): the scalar loop in PonscripterLabel::alphaMaskBlend clamps with
/// `mask2 & overflow_mask`, a value this function does not receive — confirm both
/// paths produce the same result for every trans_mode that reaches here.
static HELPER_FN bool alphaMaskBlend_SSE_Common(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value)
{
// The wrap handling below subtracts mask_width at most once per 4-pixel step,
// which is only sufficient when the mask is at least 4 pixels wide.
if (mask_surface->w < 4) {
return false;
}

int end_x = rect.x + rect.w;
int end_y = rect.y + rect.h;
int mask_height = mask_surface->h;
int mask_width = mask_surface->w;

// Starting position inside the tiled mask for the rect's top-left corner.
int mask_off_base_y = rect.y % mask_surface->h;
int mask_off_base_x = rect.x % mask_surface->w;
for (int y = rect.y, my = mask_off_base_y; y < end_y; y++, my++) {
// Wrap the mask vertically.
if (my >= mask_height) { my = 0; }
// Row base pointers; x below is an absolute column index into these rows.
Uint32* s1p = getPointerToRow<Uint32>(s1, y);
Uint32* s2p = getPointerToRow<Uint32>(s2, y);
Uint32* dstp = getPointerToRow<Uint32>(dst, y);
Uint32* mask_buf = getPointerToRow<Uint32>(mask_surface, my);

int x = rect.x, mx = mask_off_base_x;
// Scalar prologue: advance one pixel at a time until is_aligned() accepts the
// destination address (is_aligned is defined elsewhere; presumably a 16-byte
// alignment test), because the vector loop uses an aligned store.
while (!is_aligned(dstp + x, 16) && (x < rect.x + rect.w)) {
dstp[x] = blendMaskOnePixel(s1p[x], s2p[x], mask_buf[mx], mask_value);
x++, mx++;
if (mx >= mask_width) { mx = 0; }
}
__m128i mask_value_v = _mm_set1_epi32(mask_value);
__m128i mask_000000ff = _mm_set1_epi32(0x000000FF);
__m128i mask_00ff00ff = _mm_set1_epi32(0x00FF00FF);
// Vector body: runs while at least 4 pixels remain in the row.
while (x < (end_x - 3)) {
// Sources are not guaranteed aligned, so use unaligned loads.
__m128i s1v = _mm_loadu_si128((__m128i*)(s1p + x));
__m128i s2v = _mm_loadu_si128((__m128i*)(s2p + x));
__m128i mskv;
// Common case: the next 4 mask pixels are contiguous within the mask row.
if (__builtin_expect(mx + 3 < mask_width, true)) {
mskv = _mm_loadu_si128((__m128i*)(mask_buf + mx));
} else {
// The 4 mask pixels straddle the right edge: gather them one by one,
// wrapping to the start of the same mask row.
__attribute__((aligned(16))) Uint32 tmp[4];
for (int i = 0; i < 4; i++) {
if (mx + i < mask_width) {
tmp[i] = mask_buf[mx + i];
} else {
tmp[i] = mask_buf[mx + i - mask_width];
}
}
mskv = _mm_load_si128((__m128i*)tmp);
}
// Keep only the low byte of each mask pixel.
mskv = _mm_and_si128(mskv, mask_000000ff);
// Saturating unsigned 16-bit subtract: max(mask_value - mask, 0) per lane.
__m128i mask2 = _mm_subs_epu16(mask_value_v, mskv);
// NOTE(review): the min is a *signed* 16-bit compare, so this matches the scalar
// clamp only while mask_value stays below 0x8000 — confirm the caller's range.
mask2 = _mm_min_epi16(mask2, mask_000000ff); // min(mask2, 0xFF)
#ifdef __clang__
asm("":"+x"(mask2)::); // clang optimization makes things worse, block it
#endif
mask2 = extractBTo16L(mask2); // Spread alpha for multiplying (0x00aa00aa)
// Complementary weight for s1 in both 16-bit halves.
__m128i mask1 = _mm_xor_si128(mask2, mask_00ff00ff);
// out_rb = ((s1v & rbmask) * mask1 + (s2v & rbmask) * mask2) >> 8
__m128i s1v_rb = _mm_mullo_epi16(mask1, _mm_and_si128(s1v, mask_00ff00ff));
__m128i s2v_rb = _mm_mullo_epi16(mask2, _mm_and_si128(s2v, mask_00ff00ff));
__m128i out_rb = _mm_srli_epi16(_mm_add_epi16(s1v_rb, s2v_rb), 8);
// out_g = (((s1v & gmask) >> 8) * mask1 + ((s2v & gmask) >> 8) * mask2) & gmask
// (extractG is defined elsewhere in this header; the andnot keeps the high byte
// of every 16-bit lane of the weighted sum.)
__m128i s1v_g = _mm_mullo_epi16(mask1, extractG(s1v));
__m128i s2v_g = _mm_mullo_epi16(mask2, extractG(s2v));
__m128i out_g = _mm_andnot_si128(mask_00ff00ff, _mm_add_epi16(s1v_g, s2v_g));
// Aligned store; the prologue above established the alignment of dstp + x.
_mm_store_si128((__m128i*)(dstp + x), _mm_or_si128(out_rb, out_g));

x += 4;
mx += 4;
// Horizontal wrap; a single subtraction suffices because mask_width >= 4.
if (mx >= mask_width) { mx -= mask_width; }
}
// Scalar epilogue for the remaining 0-3 pixels of the row.
while (x < end_x) {
dstp[x] = blendMaskOnePixel(s1p[x], s2p[x], mask_buf[mx], mask_value);
x++, mx++;
if (mx >= mask_width) { mx = 0; }
}
}
return true;
}

#endif

0 comments on commit 52fc831

Please sign in to comment.