// Adds `length` bytes of src into dst, choosing an implementation from the
// build configuration and, where compiled in, the runtime `cpufuncs` flags.
//
//   dst    - destination buffer; read and overwritten in place
//   src    - source buffer
//   length - number of bytes to process
//
// Dispatch order:
//   USE_PPC_GFX            : AltiVec routine if CPUF_PPC_ALTIVEC is set,
//                            otherwise the scalar loop.
//   USE_X86_GFX on MACOSX  : SSE2 routine, called without any cpufuncs check.
//   USE_X86_GFX elsewhere  : SSE2, then MMX, then the scalar loop.
//   neither defined        : scalar loop.
//
// The scalar loop is the BASIC_ADDTO() macro (defined elsewhere). It takes no
// arguments and is handed its count through a local `n` set to length + 1;
// presumably it also reads dst/src from this scope -- confirm against the
// macro definition.
void imageFilterAddTo(unsigned char *dst, unsigned char *src, int length)
{
#if !defined(USE_PPC_GFX) && defined(USE_X86_GFX) && defined(MACOSX)
    // No runtime dispatch in this configuration.
    imageFilterAddTo_SSE2(dst, src, length);
#else

#if defined(USE_PPC_GFX)
    if (cpufuncs & CPUF_PPC_ALTIVEC) {
        imageFilterAddTo_Altivec(dst, src, length);
        return;
    }
#elif defined(USE_X86_GFX)
    if (cpufuncs & CPUF_X86_SSE2) {
        imageFilterAddTo_SSE2(dst, src, length);
        return;
    }
    if (cpufuncs & CPUF_X86_MMX) {
        imageFilterAddTo_MMX(dst, src, length);
        return;
    }
#endif

    // No accelerated path was taken (or none is compiled in): scalar loop.
    {
        int n = length + 1;
        BASIC_ADDTO();
    }
#endif
}
// SSE2 implementation of the add-to filter: dst[i] = saturating dst[i] + src[i]
// for the 16-byte chunks handled here; the head and tail bytes are delegated
// to the ADDTO_PIXEL() / BASIC_ADDTO() macros (defined elsewhere).
//
//   dst    - destination buffer; read and overwritten in place
//   src    - source buffer; may have any alignment
//   length - number of bytes to process
//
// The locals must keep the names n/dst/src: the macros take no arguments, so
// they presumably pick these up from the enclosing scope -- confirm against
// the macro definitions.
void imageFilterAddTo_SSE2(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Head: go one byte at a time until dst sits on a 16-byte boundary (or we
    // run out of input), so the bulk loop can use aligned loads/stores on dst.
    while ((n > 0) && (((long)dst & 0xF) != 0)) {
        ADDTO_PIXEL();
        --n;
        ++dst;
        ++src;
    }

    // Bulk: 16 unsigned bytes per iteration, added with saturation.
    // src is loaded unaligned because only dst was aligned above.
    for (; n >= 16; n -= 16, src += 16, dst += 16) {
        const __m128i srcChunk = _mm_loadu_si128((__m128i *)src);
        const __m128i dstChunk = _mm_load_si128((__m128i *)dst);

        _mm_store_si128((__m128i *)dst, _mm_adds_epu8(srcChunk, dstChunk));
    }

    // Tail: hand the remaining (< 16) bytes to the scalar macro. n is bumped
    // by one first, mirroring the `n = length + 1` setup used by
    // imageFilterAddTo() before it invokes the same macro.
    ++n;
    BASIC_ADDTO();
}
// AltiVec implementation of the add-to filter: dst[i] = saturating
// dst[i] + src[i] for the 16-byte chunks handled here; the head and tail
// bytes are delegated to the ADDTO_PIXEL() / BASIC_ADDTO() macros (defined
// elsewhere).
//
//   dst    - destination buffer; read and overwritten in place
//   src    - source buffer; may have any alignment relative to dst
//   length - number of bytes to process
//
// The locals must keep the names n/dst/src: the macros take no arguments, so
// they presumably pick these up from the enclosing scope -- confirm against
// the macro definitions.
void imageFilterAddTo_Altivec(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while ((((long)dst & 0xF) > 0) && (n > 0)) {
        ADDTO_PIXEL();
        --n;
        ++dst;
        ++src;
    }

    // Do bulk of processing using Altivec (add 16 8-bit unsigned integers,
    // with saturation)
    while (n >= 16) {
        // vec_ld ignores the low four bits of the effective address, and only
        // dst is known to be aligned here. A plain vec_ld(0, src) would fetch
        // the wrong bytes whenever src and dst differ in alignment, so build
        // the source vector from the two aligned blocks that straddle src.
        // Offset 15 (not 16) keeps the second load inside src[0..15], so it
        // never touches memory past the bytes this iteration owns.
        vector unsigned char sHead = vec_ld(0, src);
        vector unsigned char sTail = vec_ld(15, src);
        vector unsigned char sPerm = vec_lvsl(0, src);
        vector unsigned char s = vec_perm(sHead, sTail, sPerm);

        vector unsigned char d = vec_ld(0, dst);
        vector unsigned char r = vec_adds(d, s);
        vec_st(r, 0, dst);

        n -= 16;
        src += 16;
        dst += 16;
    }

    // If any bytes are left over, deal with them individually. n is bumped by
    // one first, mirroring the `n = length + 1` setup used by
    // imageFilterAddTo() before it invokes the same macro.
    ++n;
    BASIC_ADDTO();
}