/*
 * In-place saturating byte add: dst[i] = min(dst[i] + src[i], 255) for the
 * SIMD portion, processed 16 bytes at a time with SSE2.
 *
 * dst    - destination buffer, read and overwritten
 * src    - source buffer, only read by the SIMD loop
 * length - number of bytes to process
 *
 * The scalar head and tail are delegated to the ADDTO_PIXEL() and
 * BASIC_ADDTO() macros, which are defined elsewhere in the file.
 * NOTE(review): those macros presumably operate on the locals `dst`, `src`
 * and `n` by name, so these identifiers must not be renamed - confirm
 * against the macro definitions.
 */
void imageFilterAddTo_SSE2(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Scalar head: step one byte at a time until dst reaches a 16-byte
    // boundary (or the input runs out), so aligned loads/stores can be used.
    for (; (n > 0) && (((long)dst & 0xF) != 0); ) {
        ADDTO_PIXEL();
        --n;
        ++dst;
        ++src;
    }

    // SIMD body: 16 unsigned bytes per iteration, added with saturation.
    // src may have any alignment, hence the unaligned load on that side.
    for (; n >= 16; n -= 16, src += 16, dst += 16) {
        __m128i *out = (__m128i *)dst;
        __m128i src_bytes = _mm_loadu_si128((__m128i *)src);
        __m128i dst_bytes = _mm_load_si128(out);

        _mm_store_si128(out, _mm_adds_epu8(src_bytes, dst_bytes));
    }

    // Scalar tail for the remaining (< 16) bytes.
    // NOTE(review): the increment presumably offsets a pre-decrement inside
    // BASIC_ADDTO()'s loop condition - confirm against the macro definition.
    ++n;
    BASIC_ADDTO();
}
void imageFilterAddTo_Altivec(unsigned char *dst, unsigned char *src, int length) { int n = length; // Compute first few values so we're on a 16-byte boundary in dst while( (((long)dst & 0xF) > 0) && (n > 0) ) { ADDTO_PIXEL(); --n; ++dst; ++src; } // Do bulk of processing using Altivec (add 16 8-bit unsigned integers, with saturation) while(n >= 16) { vector unsigned char s = vec_ld(0,src); vector unsigned char d = vec_ld(0,dst); vector unsigned char r = vec_adds(d, s); vec_st(r,0,dst); n -= 16; src += 16; dst += 16; } // If any bytes are left over, deal with them individually ++n; BASIC_ADDTO(); }