/*
 * SSE2 implementation of the 32-bit pixel blend.
 *
 * Processing happens in three phases:
 *   1. single pixels via BLEND_PIXEL() until dst_buffer is 16-byte aligned,
 *   2. groups of four pixels with SSE2 (aligned load/store on dst_buffer,
 *      unaligned load on src_buffer),
 *   3. whatever is left via BASIC_BLEND().
 *
 * For every pixel handled by the SSE2 loop:
 *   alpha1 = ((src >> 24) * alpha) >> 8
 *   alpha2 = alpha1 ^ 0xFF
 *   each of the three low 8-bit channels c:
 *       dst_c = (src_c * alpha1 + dst_c * alpha2) >> 8
 *   the top byte of the stored pixel is 0 (it is masked out of the result).
 *
 * dst_buffer  destination pixels, read and overwritten in place
 * src_buffer  source pixels; bits 24..31 of each pixel act as its alpha
 * alphap      not read by the SSE2 loop, only advanced (see note in the loop)
 * alpha       global alpha multiplier; the 16-bit multiply below presumably
 *             expects a value small enough that alpha * 255 fits in 16 bits
 *             (e.g. 0..255) -- TODO confirm against callers
 * length      number of pixels to process
 */
void imageFilterBlend_SSE2(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length)
{
	int n = length;

	// Compute first few values so we're on a 16-byte boundary in dst_buffer.
	// size_t instead of long: long is only 32 bits wide on LLP64 targets
	// (64-bit Windows), which makes a pointer -> long cast a truncating one.
	while ((((size_t)dst_buffer & 0xF) != 0) && (n > 0)) {
		BLEND_PIXEL();
		--n;
		++dst_buffer;
		++src_buffer;
	}

	// Do bulk of processing using SSE2 (process 4 32bit (BGRA) pixels)

	// Loop-invariant constants, built once instead of on every iteration:
	//   bmask2     = 0x00FF00FF  (two of the channels, one per 16-bit lane)
	//   bmask      = 0x000000FF
	//   green_mask = 0x0000FF00
	//   alpha_vec  = alpha broadcast to all four 32-bit lanes
	const __m128i bmask2     = _mm_set1_epi32(0x00FF00FF);
	const __m128i bmask      = _mm_srli_epi32(bmask2, 16);
	const __m128i green_mask = _mm_slli_epi32(bmask, 8);
	const __m128i alpha_vec  = _mm_set1_epi32(alpha);

	while (n >= 4) {
		// alpha1 = ((src_argb >> 24) * alpha) >> 8
		// (_mm_mullo_epi16 keeps only the low 16 bits of each product)
		__m128i buf = _mm_loadu_si128((__m128i*)src_buffer);
		__m128i tmp = _mm_srli_epi32(buf, 24);
		__m128i a = _mm_mullo_epi16(alpha_vec, tmp);
		a = _mm_srli_epi32(a, 8);

		// double-up alpha1 (0x000000vv -> 0x00vv00vv)
		tmp = _mm_slli_epi32(a, 16);
		a = _mm_or_si128(a, tmp);

		// rb = (src_argb & bmask2) * alpha1
		tmp = _mm_and_si128(buf, bmask2);
		__m128i rb = _mm_mullo_epi16(a, tmp);

		// g = ((src_argb >> 8) & bmask) * alpha1
		buf = _mm_srli_epi32(buf, 8);
		tmp = _mm_and_si128(buf, bmask);
		__m128i g = _mm_mullo_epi16(a, tmp);

		// alpha2 = alpha1 ^ bmask2  (i.e. 255 - alpha1 in each 16-bit lane)
		a = _mm_xor_si128(a, bmask2);

		// dst_buffer is 16-byte aligned here, so the aligned load is valid
		buf = _mm_load_si128((__m128i*)dst_buffer);

		// rb += (dst_argb & bmask2) * alpha2
		tmp = _mm_and_si128(buf, bmask2);
		tmp = _mm_mullo_epi16(a, tmp);
		rb = _mm_add_epi32(rb, tmp);

		// rb = (rb >> 8) & bmask2
		tmp = _mm_srli_epi32(rb, 8);
		rb = _mm_and_si128(tmp, bmask2);

		// g += ((dst_argb >> 8) & bmask) * alpha2
		buf = _mm_srli_epi32(buf, 8);
		tmp = _mm_and_si128(buf, bmask);
		tmp = _mm_mullo_epi16(a, tmp);
		g = _mm_add_epi32(g, tmp);

		// g = g & (bmask << 8): the weighted sum already sits in bits 8..15,
		// so masking replaces the ">> 8, << 8" pair
		g = _mm_and_si128(g, green_mask);

		// dst_argb = rb | g
		tmp = _mm_or_si128(rb, g);
		_mm_store_si128((__m128i*)dst_buffer, tmp);

		n -= 4;
		src_buffer += 4;
		dst_buffer += 4;
		// NOTE(review): alphap is never read in this loop and is not advanced
		// by the alignment loop above; it is presumably kept in step for the
		// macros -- confirm against the BLEND_PIXEL/BASIC_BLEND definitions.
		alphap += 16;
	}

	// If any pixels are left over, deal with them individually.
	// BASIC_BLEND() is entered with n one larger than the remaining pixel
	// count, same as the scalar callers that set n = length + 1.
	++n;
	BASIC_BLEND();
}
/*
 * Public entry point for the pixel blend: picks an implementation and runs it.
 *
 * - Built with USE_X86_GFX and MACOSX: imageFilterBlend_SSE2() is always used.
 * - Built with USE_X86_GFX, without MACOSX: imageFilterBlend_SSE2() is used
 *   when the CPUF_X86_SSE2 bit is set in cpufuncs, otherwise the scalar
 *   BASIC_BLEND() path runs.
 * - Built without USE_X86_GFX: only the scalar BASIC_BLEND() path exists.
 *
 * The scalar path enters BASIC_BLEND() with n = length + 1; the macro works
 * on the names visible at its expansion site.
 */
void imageFilterBlend(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length)
{
#if defined(USE_X86_GFX)

#ifndef MACOSX
	// No SSE2 reported for this CPU: do the scalar blend and leave.
	if (!(cpufuncs & CPUF_X86_SSE2)) {
		int n = length + 1;
		BASIC_BLEND();
		return;
	}
#endif // !MACOSX

	imageFilterBlend_SSE2(dst_buffer, src_buffer, alphap, alpha, length);

#else // no special gfx handling

	int n = length + 1;
	BASIC_BLEND();

#endif
}