static inline __m128i v4_mul_color_sse2(__m128i x, __m128i y) { const __m128i zero = _mm_setzero_si128(); const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF); __m128i x_l = _mm_unpacklo_epi8(x, zero); __m128i x_h = _mm_unpackhi_epi8(x, zero); __m128i y_l = _mm_unpacklo_epi8(y, zero); __m128i y_h = _mm_unpackhi_epi8(y, zero); __m128i r_l = _mm_mullo_epi16(x_l, y_l); __m128i r_h = _mm_mullo_epi16(x_h, y_h); r_l = _mm_add_epi16(r_l, sym4_mask); r_h = _mm_add_epi16(r_h, sym4_mask); r_l = _mm_srli_epi16(r_l, 8); r_h = _mm_srli_epi16(r_h, 8); return _mm_packus_epi16(r_l, r_h); }
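/*
 * Illustrative scalar sketch (not from the original source) of the per-channel
 * 8-bit colour multiply that v4_mul_color_sse2() vectorizes: each byte is
 * widened, multiplied, a rounding constant is added and the result is shifted
 * back down by 8.  Note the SIMD routine above adds a per-lane constant
 * (sym4_mask) rather than a uniform 255, so this only approximates the idea.
 */
#include <stdint.h>
static inline uint8_t mul_color_channel_ref(uint8_t x, uint8_t y) {
    return (uint8_t)(((unsigned)x * y + 255u) >> 8);
}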
// Compute the sum of all pixel differences of this MB. static INLINE int sum_diff_16x1(__m128i acc_diff) { const __m128i k_1 = _mm_set1_epi16(1); const __m128i acc_diff_lo = _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8); const __m128i acc_diff_hi = _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8); const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi); const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1); const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8)); const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4)); return _mm_cvtsi128_si32(hgfedcba); }
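/*
 * Scalar reference (illustrative, not from the original source) for
 * sum_diff_16x1(): the accumulator holds 16 signed byte differences, and the
 * SSE2 code sign-extends them to 16 bits and reduces them to one 32-bit sum.
 */
#include <stdint.h>
static int sum_diff_16x1_ref(const int8_t diff[16]) {
    int sum = 0;
    for (int i = 0; i < 16; ++i) sum += diff[i];  /* sign-extended add */
    return sum;
}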
void SSE42_blend() { int n = width * height * 4; int dummy __attribute__((unused)); const uint8_t alpha2 = alpha/2; __m128i alpha_np = _mm_set1_epi16(alpha2 | ((uint16_t)(alpha2 ^ 0x7f) << 8)); for (size_t i=0; i < n; i += 32) { __m128i A0 = _mm_load_si128((__m128i*)(imgA + i)); __m128i B0 = _mm_load_si128((__m128i*)(imgB + i)); __m128i A1 = _mm_load_si128((__m128i*)(imgA + i + 16)); __m128i B1 = _mm_load_si128((__m128i*)(imgB + i + 16)); __m128i lo0 = _mm_unpacklo_epi8(A0, B0); __m128i hi0 = _mm_unpackhi_epi8(A0, B0); __m128i lo1 = _mm_unpacklo_epi8(A1, B1); __m128i hi1 = _mm_unpackhi_epi8(A1, B1); lo0 = _mm_maddubs_epi16(lo0, alpha_np); lo1 = _mm_maddubs_epi16(lo1, alpha_np); hi0 = _mm_maddubs_epi16(hi0, alpha_np); hi1 = _mm_maddubs_epi16(hi1, alpha_np); lo0 = _mm_srli_epi16(lo0, 7); lo1 = _mm_srli_epi16(lo1, 7); hi0 = _mm_srli_epi16(hi0, 7); hi1 = _mm_srli_epi16(hi1, 7); __m128i res0 = _mm_packus_epi16(lo0, hi0); __m128i res1 = _mm_packus_epi16(lo1, hi1); _mm_store_si128((__m128i*)(data + i + 0), res0); _mm_store_si128((__m128i*)(data + i + 16), res1); } }
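/*
 * Scalar sketch (illustrative only; buffer and alpha parameters are made
 * explicit here) of the blend performed above with _mm_maddubs_epi16: the
 * 8-bit alpha is halved to a 7-bit weight so that each pairwise sum
 * a*w + b*(127 - w) fits in a signed 16-bit lane, and every output byte is
 * that sum shifted right by 7.
 */
#include <stddef.h>
#include <stdint.h>
static void blend_ref(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                      size_t n, uint8_t alpha) {
    const unsigned w = alpha / 2;                 /* 7-bit weight, 0..127 */
    for (size_t i = 0; i < n; ++i)
        dst[i] = (uint8_t)((w * a[i] + (127u - w) * b[i]) >> 7);
}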
// Transpose back and store static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1, __m128i* p0, __m128i* q0, __m128i* q1) { __m128i t1; // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 t1 = *p0; *p0 = _mm_unpacklo_epi8(*p1, t1); *p1 = _mm_unpackhi_epi8(*p1, t1); // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 t1 = *q0; *q0 = _mm_unpacklo_epi8(t1, *q1); *q1 = _mm_unpackhi_epi8(t1, *q1); // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 t1 = *p0; *p0 = _mm_unpacklo_epi16(t1, *q0); *q0 = _mm_unpackhi_epi16(t1, *q0); // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 t1 = *p1; *p1 = _mm_unpacklo_epi16(t1, *q1); *q1 = _mm_unpackhi_epi16(t1, *q1); Store4x4(p0, r0, stride); r0 += 4 * stride; Store4x4(q0, r0, stride); Store4x4(p1, r8, stride); r8 += 4 * stride; Store4x4(q1, r8, stride); }
static void GF_FUNC_ALIGN VS_CC proc_8bit(int radius, float *kernel, const uint8_t *srcp, float *buff, float *dstp, int width, int height, int src_stride, int dst_stride) { int length = radius * 2 + 1; const uint8_t *p[17]; for (int i = -radius; i <= radius; i++) { p[i + radius] = srcp + abs(i) * src_stride; } __m128i zero = _mm_setzero_si128(); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x += 16) { __m128 sum[4]; sum[0] = _mm_setzero_ps(); sum[1] = _mm_setzero_ps(); sum[2] = _mm_setzero_ps(); sum[3] = _mm_setzero_ps(); for (int i = 0; i < length; i++) { __m128 f[4]; __m128i xmm0 = _mm_load_si128((__m128i *)(p[i] + x)); __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); f[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero)); f[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero)); f[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero)); f[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero)); __m128 k = _mm_set1_ps(kernel[i]); for (int j = 0; j < 4; j++) { sum[j] = _mm_add_ps(sum[j], _mm_mul_ps(k, f[j])); } } _mm_store_ps(buff + x, sum[0]); _mm_store_ps(buff + x + 4, sum[1]); _mm_store_ps(buff + x + 8, sum[2]); _mm_store_ps(buff + x + 12, sum[3]); } proc_horizontal(buff, radius, length, width, kernel, dstp); for (int i = 0; i < length - 1; i++) { p[i] = p[i + 1]; } p[length - 1] += (y < height - radius - 1 ? 1 : -1) * src_stride; dstp += dst_stride; } }
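/*
 * Scalar sketch (illustrative; the function name is hypothetical) of the
 * vertical pass above: each output in buff is a kernel-weighted sum of the
 * same column taken from (2*radius + 1) neighbouring rows, where rows[] plays
 * the role of the mirrored p[] pointer table set up in proc_8bit().
 */
#include <stdint.h>
static void vert_conv_row_ref(const uint8_t *const rows[], int length,
                              const float *kernel, float *buff, int width) {
    for (int x = 0; x < width; ++x) {
        float sum = 0.0f;
        for (int i = 0; i < length; ++i)
            sum += kernel[i] * (float)rows[i][x];
        buff[x] = sum;
    }
}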
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, int th) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *orig = p0, *end = p2; line_copy8(p0, srcp + stride, width, 1); line_copy8(p1, srcp, width, 1); uint8_t threshold = (uint8_t)th; __m128i zero = _mm_setzero_si128(); __m128i xth = _mm_set1_epi8((int8_t)threshold); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 1 ? 1 : -1); line_copy8(p2, srcp, width, 1); uint8_t *coordinates[] = COORDINATES; for (int x = 0; x < width; x += 16) { __m128i sumlo = zero; __m128i sumhi = zero; for (int i = 0; i < 8; i++) { __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x)); sumlo = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero)); sumhi = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero)); } sumlo = _mm_srai_epi16(sumlo, 3); sumhi = _mm_srai_epi16(sumhi, 3); sumlo = _mm_packus_epi16(sumlo, sumhi); __m128i src = _mm_load_si128((__m128i *)(p1 + x)); __m128i limit = _mm_adds_epu8(src, xth); sumlo = _mm_max_epu8(sumlo, src); sumlo = _mm_min_epu8(sumlo, limit); _mm_store_si128((__m128i *)(dstp + x), sumlo); } dstp += stride; p0 = p1; p1 = p2; p2 = (p2 == end) ? orig : p2 + bstride; } }
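/*
 * Assumption-laden scalar sketch (illustrative only) of one output pixel of
 * proc_8bit_sse2(), assuming the COORDINATES macro names the 8 neighbours of
 * the centre pixel: the neighbour average is clamped so it never falls below
 * the source pixel and never exceeds source + threshold (the upper bound is
 * computed with 8-bit saturation, mirroring _mm_adds_epu8).
 */
#include <stdint.h>
static inline uint8_t clamp_neighbour_avg_ref(const uint8_t nb[8],
                                              uint8_t src, uint8_t th) {
    unsigned sum = 0;
    for (int i = 0; i < 8; ++i) sum += nb[i];
    unsigned avg = sum >> 3;                      /* average of 8 neighbours */
    unsigned hi = (unsigned)src + th;             /* saturating upper bound  */
    if (hi > 255) hi = 255;
    if (avg < src) avg = src;
    if (avg > hi)  avg = hi;
    return (uint8_t)avg;
}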
void fb_sqrm_low(dig_t *c, const dig_t *a) { __m128i t0, t1, m0, m1, m2, m3, m4, m5, m6, m8, m9, mask; align dig_t x[2]; t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100); mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); m0 = _mm_load_si128((__m128i *)(a)); m1 = _mm_and_si128(m0, mask); m1 = _mm_shuffle_epi8(t0, m1); m2 = _mm_srli_epi64(m0, 4); m2 = _mm_and_si128(m2, mask); m2 = _mm_shuffle_epi8(t0, m2); m3 = _mm_unpacklo_epi8(m1, m2); m4 = _mm_unpackhi_epi8(m1, m2); m0 = _mm_load_si128((__m128i *)(a+2)); m1 = _mm_and_si128(m0, mask); m1 = _mm_shuffle_epi8(t0, m1); m2 = _mm_srli_epi64(m0, 4); m2 = _mm_and_si128(m2, mask); m2 = _mm_shuffle_epi8(t0, m2); m5 = _mm_unpacklo_epi8(m1, m2); m6 = _mm_unpackhi_epi8(m1, m2); m0 = m3; m1 = m4; m2 = m5; m3 = m6; REDUCE(); _mm_store_si128((__m128i *) c + 0, m0); _mm_store_si128((__m128i *) x, m1); c[2] = x[0]; c[3] = x[1] & 0x07FFFFFFFFFFFFFF; }
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
                                  __m128i* const p0, __m128i* const q0,
                                  __m128i* const q1, __m128i* const q2,
                                  const __m128i* const mask, int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  __m128i a, not_hev;

  // compute hev mask
  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
  GetBaseDelta(p1, p0, q0, q1, &a);

  {  // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    __m128i f = _mm_and_si128(a, m);  // insieme: dropped const
    DoSimpleFilter(p0, q0, &f);
  }
  {  // do strong filter on pixels with not hev
    const __m128i k9 = _mm_set1_epi16(0x0900);
    const __m128i k63 = _mm_set1_epi16(63);

    const __m128i m = _mm_and_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9);    // Filter (lo) * 9
    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9);    // Filter (hi) * 9

    __m128i a2_lo = _mm_add_epi16(f9_lo, k63);    // Filter * 9 + 63   // insieme: dropped const
    __m128i a2_hi = _mm_add_epi16(f9_hi, k63);    // Filter * 9 + 63   // insieme: dropped const

    __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo);  // Filter * 18 + 63  // insieme: dropped const
    __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi);  // Filter * 18 + 63  // insieme: dropped const

    __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63  // insieme: dropped const
    __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63  // insieme: dropped const

    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */
static void unshuffle8(uint8_t* dest, uint8_t* orig, size_t size)
{
  size_t i, j, k;
  size_t neblock, numof16belem;
  __m128i xmm0[8], xmm1[8];

  neblock = size / 8;
  numof16belem = neblock / 16;
  for (i = 0, k = 0; i < numof16belem; i++, k += 8) {
    /* Load the first 128 bytes in 8 XMM registers */
    for (j = 0; j < 8; j++) {
      xmm0[j] = ((__m128i *)orig)[j*numof16belem+i];
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      /* Compute the low 32 bytes */
      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
      /* Compute the hi 32 bytes */
      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Store the result vectors in proper order */
    ((__m128i *)dest)[k+0] = xmm1[0];
    ((__m128i *)dest)[k+1] = xmm1[4];
    ((__m128i *)dest)[k+2] = xmm1[2];
    ((__m128i *)dest)[k+3] = xmm1[6];
    ((__m128i *)dest)[k+4] = xmm1[1];
    ((__m128i *)dest)[k+5] = xmm1[5];
    ((__m128i *)dest)[k+6] = xmm1[3];
    ((__m128i *)dest)[k+7] = xmm1[7];
  }
}
void SoundSSE::unpack_8bit_stereo(unsigned char *input, int size, float *output[2]) { #ifndef CL_DISABLE_SSE2 int sse_size = (size/16)*16; __m128i zero = _mm_setzero_si128(); __m128 constant1 = _mm_set1_ps(1.0f/128.0f); __m128i constant2 = _mm_set1_epi16(128); for (int i = 0; i < sse_size; i+=16) { __m128i isamples = _mm_loadu_si128((__m128i*)(input+i)); __m128i isamples0 = _mm_sub_epi16(_mm_unpacklo_epi8(isamples, zero), constant2); __m128i isamples1 = _mm_sub_epi16(_mm_unpackhi_epi8(isamples, zero), constant2); __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples0), 16)); __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples0), 16)); __m128 samples2 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples1), 16)); __m128 samples3 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples1), 16)); samples0 = _mm_mul_ps(samples0, constant1); samples1 = _mm_mul_ps(samples1, constant1); samples2 = _mm_mul_ps(samples2, constant1); samples3 = _mm_mul_ps(samples3, constant1); __m128 tmp0, tmp1, tmp2, tmp3; tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0)); tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1)); tmp2 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(2,0,2,0)); tmp3 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(3,1,3,1)); _mm_storeu_ps(output[0]+i/2, tmp0); _mm_storeu_ps(output[1]+i/2, tmp1); _mm_storeu_ps(output[0]+i/2+4, tmp2); _mm_storeu_ps(output[1]+i/2+4, tmp3); } #else const int sse_size = 0; #endif // unpack remaining for (int i = sse_size; i < size; i+=2) { int value = input[i]; output[0][i/2] = ((float) (value - 128)) / 128.0f; value = input[i+1]; output[1][i/2] = ((float) (value - 128)) / 128.0f; } }
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes. */ static void unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 8; size_t i; int j; __m128i xmm0[8], xmm1[8]; for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { /* Load 16 elements (128 bytes) into 8 XMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 8; j++) { xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ for (j = 0; j < 4; j++) { /* Compute the low 32 bytes */ xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); /* Compute the hi 32 bytes */ xmm1[4 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]); } /* Shuffle 2-byte words */ for (j = 0; j < 4; j++) { /* Compute the low 32 bytes */ xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); /* Compute the hi 32 bytes */ xmm0[4 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]); } /* Shuffle 4-byte dwords */ for (j = 0; j < 4; j++) { /* Compute the low 32 bytes */ xmm1[j] = _mm_unpacklo_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); /* Compute the hi 32 bytes */ xmm1[4 + j] = _mm_unpackhi_epi32(xmm0[j * 2], xmm0[j * 2 + 1]); } /* Store the result vectors in proper order */ _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[4]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm1[2]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm1[6]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (4 * sizeof(__m128i))), xmm1[1]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (5 * sizeof(__m128i))), xmm1[5]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (6 * sizeof(__m128i))), xmm1[3]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (7 * sizeof(__m128i))), xmm1[7]); } }
__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2) { __m128i lhs = {0}, rhs = {0}; lhs.m128i_i8[ 8] = _MM1.m64_i8[4]; lhs.m128i_i8[ 9] = _MM1.m64_i8[5]; lhs.m128i_i8[10] = _MM1.m64_i8[6]; lhs.m128i_i8[11] = _MM1.m64_i8[7]; rhs.m128i_i8[ 8] = _MM2.m64_i8[4]; rhs.m128i_i8[ 9] = _MM2.m64_i8[5]; rhs.m128i_i8[10] = _MM2.m64_i8[6]; rhs.m128i_i8[11] = _MM2.m64_i8[7]; lhs = _mm_unpackhi_epi8(lhs, rhs); _MM1.m64_i64 = lhs.m128i_i64[0]; return _MM1; }
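/*
 * Scalar description (illustrative) of what the emulation above computes:
 * punpckhbw interleaves the high four bytes of the two 64-bit operands,
 * taking one byte from the first operand, then one from the second.
 */
#include <stdint.h>
static void punpckhbw_ref(uint8_t out[8], const uint8_t a[8], const uint8_t b[8]) {
    for (int k = 0; k < 4; ++k) {
        out[2 * k + 0] = a[4 + k];
        out[2 * k + 1] = b[4 + k];
    }
}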
static INLINE unsigned int masked_sad8xh_ssse3( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); for (y = 0; y < height; y += 2) { const __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); const __m128i m = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); const __m128i m_inv = _mm_sub_epi8(mask_max, m); const __m128i data_l = _mm_unpacklo_epi8(a0, b0); const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpacklo_epi8(a1, b1); const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packus_epi16(pred_l, pred_r); res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); return (sad + 31) >> 6; }
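/*
 * Scalar mirror (illustrative; constants assumed) of one row of the masked
 * SAD above: each prediction byte is the 64-weighted blend
 * round((m*a + (64 - m)*b) / 2^ROUND_BITS) and the absolute differences
 * against src are accumulated.  ROUND_BITS is assumed to be 6 here, i.e. the
 * mask maximum is 64, matching mask_max in the SSSE3 code; the final
 * (sad + 31) >> 6 normalisation is left to the caller.
 */
#include <stdint.h>
#include <stdlib.h>
static unsigned masked_sad_row_ref(const uint8_t *src, const uint8_t *a,
                                   const uint8_t *b, const uint8_t *m, int width) {
    const int round_bits = 6;                 /* assumed AOM_BLEND_A64_ROUND_BITS */
    const int mask_max = 1 << round_bits;     /* 64 */
    unsigned sad = 0;
    for (int x = 0; x < width; ++x) {
        const int pred =
            (m[x] * a[x] + (mask_max - m[x]) * b[x] + (mask_max / 2)) >> round_bits;
        sad += (unsigned)abs(pred - src[x]);
    }
    return sad;
}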
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ static void shuffle16_sse2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 16; size_t j; int k, l; uint8_t* dest_for_jth_element; __m128i xmm0[16], xmm1[16]; for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) { /* Fetch 16 elements (256 bytes). */ for (k = 0; k < 16; k++) { xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i)))); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l += 2) { xmm1[k * 2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l + 1]); xmm1[k * 2 + 1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l + 1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k % 2) == 0) l += 2; xmm0[k * 2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l + 2]); xmm0[k * 2 + 1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l + 2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k % 4) == 0) l += 4; xmm1[k * 2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l + 4]); xmm1[k * 2 + 1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l + 4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { xmm0[k * 2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k + 8]); xmm0[k * 2 + 1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k + 8]); } /* Store the result vectors */ dest_for_jth_element = dest + j; for (k = 0; k < 16; k++) { _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm0[k]); } } }
static void accumulate_and_store_16(const __m128i sum_0_u16, const __m128i sum_1_u16, const uint8_t *pred, uint16_t *count, uint32_t *accumulator) { const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); const __m128i zero = _mm_setzero_si128(); __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); _mm_storeu_si128((__m128i *)count, count_0_u16); count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); }
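/*
 * Scalar sketch (illustrative) of accumulate_and_store_16(): for each of the
 * 16 pixels, the per-pixel weight is added to a saturating 16-bit count and
 * weight * pred is added to a 32-bit accumulator.
 */
#include <stdint.h>
static void accumulate_and_store_16_ref(const uint16_t sum[16], const uint8_t pred[16],
                                        uint16_t count[16], uint32_t accumulator[16]) {
    for (int i = 0; i < 16; ++i) {
        const uint32_t c = (uint32_t)count[i] + sum[i];
        count[i] = (uint16_t)(c > 0xFFFFu ? 0xFFFFu : c);     /* _mm_adds_epu16 */
        accumulator[i] += (uint16_t)(sum[i] * pred[i]);       /* low 16 bits, like _mm_mullo_epi16 */
    }
}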
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y) {
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride;  // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // convert RGBA RGBA RGBA RGBA to RRRR GGGG BBBB AAAA (AoS to SoA)
  __m128i p1234 = _mm_unpacklo_epi8(p12, p34);
  __m128i p34xx = _mm_unpackhi_epi64(p1234, _mm_setzero_si128());
  __m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

  // extend to 16bit
  __m128i pRG = _mm_unpacklo_epi8(p1234_8bit, _mm_setzero_si128());
  __m128i pBA = _mm_unpackhi_epi8(p1234_8bit, _mm_setzero_si128());

  // convert weights to integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);    // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, weighti);  // 32->2x16bit

  // outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
  __m128i outRG = _mm_madd_epi16(pRG, weighti);
  // outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
  __m128i outBA = _mm_madd_epi16(pBA, weighti);

  // horizontal add that will produce the output values (in 32bit)
  __m128i out = _mm_hadd_epi32(outRG, outBA);
  out = _mm_srli_epi32(out, 8);  // divide by 256

  // convert 32bit->8bit
  out = _mm_packus_epi32(out, _mm_setzero_si128());
  out = _mm_packus_epi16(out, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(out);
}
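/*
 * Scalar sketch (illustrative; assumes CalcWeights() yields the standard
 * bilinear weights (1-fx)(1-fy), fx(1-fy), (1-fx)fy and fx*fy, scaled by 256
 * as in the SIMD path) of the interpolation above: the four neighbouring
 * RGBA pixels are blended per channel and divided by 256.
 */
#include <math.h>
#include <stdint.h>
static void bilinear_rgba_ref(const uint8_t *p00, const uint8_t *p10,
                              const uint8_t *p01, const uint8_t *p11,
                              float x, float y, uint8_t out[4]) {
    const float fx = x - floorf(x), fy = y - floorf(y);
    const int w1 = (int)((1 - fx) * (1 - fy) * 256.0f);
    const int w2 = (int)(fx * (1 - fy) * 256.0f);
    const int w3 = (int)((1 - fx) * fy * 256.0f);
    const int w4 = (int)(fx * fy * 256.0f);
    for (int c = 0; c < 4; ++c)
        out[c] = (uint8_t)((w1 * p00[c] + w2 * p10[c] +
                            w3 * p01[c] + w4 * p11[c]) >> 8);
}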
/* Routine optimized for shuffling a buffer for a type size of 16 bytes. */ static void shuffle16(uint8_t* dest, uint8_t* src, size_t size) { size_t i, j, k, l; size_t numof16belem; __m128i xmm0[16], xmm1[16]; numof16belem = size / (16*16); for (i = 0, j = 0; i < numof16belem; i++, j += 16*16) { /* Fetch elements in groups of 256 bytes */ for (k = 0; k < 16; k++) { xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16)); } /* Transpose bytes */ for (k = 0, l = 0; k < 8; k++, l +=2) { xmm1[k*2] = _mm_unpacklo_epi8(xmm0[l], xmm0[l+1]); xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[l], xmm0[l+1]); } /* Transpose words */ for (k = 0, l = -2; k < 8; k++, l++) { if ((k%2) == 0) l += 2; xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]); xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]); } /* Transpose double words */ for (k = 0, l = -4; k < 8; k++, l++) { if ((k%4) == 0) l += 4; xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]); xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]); } /* Transpose quad words */ for (k = 0; k < 8; k++) { xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]); xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]); } /* Store the result vectors */ for (k = 0; k < 16; k++) { ((__m128i *)dest)[k*numof16belem+i] = xmm0[k]; } } }
/**
 * Optimized 8 bit zoomer for resizing by a factor of 2. Doesn't flip.
 * Used internally by _zoomSurfaceY() below.
 * This is an SSE2 version written with Intel intrinsics.
 * source and dest. widths must be multiples of 16 bytes for 128-bit access
 * and it would help if they were aligned properly... :(
 */
static int zoomSurface2X_SSE2(SDL_Surface *src, SDL_Surface *dst)
{
	__m128i dataSrc;
	__m128i dataDst;
	Uint8 *pixelSrc = (Uint8*)src->pixels;
	Uint8 *pixelDstRow = (Uint8*)dst->pixels;
	int sx, sy;
	static bool proclaimed = false;

	if (!proclaimed)
	{
		proclaimed = true;
		Log(LOG_INFO) << "Using SSE2 2X zoom routine.";
	}

	for (sy = 0; sy < src->h; ++sy, pixelDstRow += dst->pitch*2)
	{
		__m128i *pixelDst = (__m128i*)pixelDstRow;
		__m128i *pixelDst2 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch);

		for (sx = 0; sx < src->w; sx += 16, pixelSrc += 16)
		{
			dataSrc = *((__m128i*) pixelSrc);
			dataDst = _mm_unpacklo_epi8(dataSrc, dataSrc);

#undef WRITE_DST
#define WRITE_DST *(pixelDst++) = dataDst; \
	*(pixelDst2++) = dataDst;

			WRITE_DST;

			dataDst = _mm_unpackhi_epi8(dataSrc, dataSrc);

			WRITE_DST;
		}
	}
	return 0;
}
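/*
 * Scalar sketch (illustrative) of the 2X zoom above: unpacking a register
 * with itself duplicates every byte horizontally, and each source row is
 * written to two destination rows.
 */
#include <stdint.h>
static void zoom2x_row_ref(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, int src_w) {
    for (int x = 0; x < src_w; ++x) {
        dst0[2 * x] = dst0[2 * x + 1] = src[x];   /* horizontal duplication */
        dst1[2 * x] = dst1[2 * x + 1] = src[x];   /* vertical duplication   */
    }
}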
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */
static void unshuffle2(uint8_t* dest, uint8_t* orig, size_t size)
{
  size_t i, k;
  size_t neblock, numof16belem;
  __m128i xmm1[2], xmm2[2];

  neblock = size / 2;
  numof16belem = neblock / 16;
  for (i = 0, k = 0; i < numof16belem; i++, k += 2) {
    /* Load the first 32 bytes in 2 XMM registers */
    xmm1[0] = ((__m128i *)orig)[0*numof16belem+i];
    xmm1[1] = ((__m128i *)orig)[1*numof16belem+i];
    /* Shuffle bytes */
    /* Compute the low 32 bytes */
    xmm2[0] = _mm_unpacklo_epi8(xmm1[0], xmm1[1]);
    /* Compute the hi 32 bytes */
    xmm2[1] = _mm_unpackhi_epi8(xmm1[0], xmm1[1]);
    /* Store the result vectors in proper order */
    ((__m128i *)dest)[k+0] = xmm2[0];
    ((__m128i *)dest)[k+1] = xmm2[1];
  }
}
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes. */ static void unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 2; size_t i; int j; __m128i xmm0[2], xmm1[2]; for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) { /* Load 16 elements (32 bytes) into 2 XMM registers. */ const uint8_t* const src_for_ith_element = src + i; for (j = 0; j < 2; j++) { xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements))); } /* Shuffle bytes */ /* Compute the low 32 bytes */ xmm1[0] = _mm_unpacklo_epi8(xmm0[0], xmm0[1]); /* Compute the hi 32 bytes */ xmm1[1] = _mm_unpackhi_epi8(xmm0[0], xmm0[1]); /* Store the result vectors in proper order */ _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm1[0]); _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm1[1]); } }
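/*
 * Scalar sketch (illustrative) of the 2-byte unshuffle implemented by the two
 * routines above, assuming the whole buffer is processed in one go: the
 * source holds plane 0 (first bytes of every element) followed by plane 1,
 * and the destination re-interleaves them into 2-byte elements.
 */
#include <stddef.h>
#include <stdint.h>
static void unshuffle2_ref(uint8_t *dest, const uint8_t *src, size_t nelem) {
    for (size_t i = 0; i < nelem; ++i) {
        dest[2 * i + 0] = src[i];             /* plane 0 */
        dest[2 * i + 1] = src[nelem + i];     /* plane 1 */
    }
}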
void SoundSSE::unpack_8bit_mono(unsigned char *input, int size, float *output) { #ifndef CL_DISABLE_SSE2 int sse_size = (size/16)*16; __m128i zero = _mm_setzero_si128(); __m128 constant1 = _mm_set1_ps(1.0f/128.0f); __m128i constant2 = _mm_set1_epi16(128); for (int i = 0; i < sse_size; i+=16) { __m128i isamples = _mm_loadu_si128((__m128i*)(input+i)); __m128i isamples0 = _mm_sub_epi16(_mm_unpacklo_epi8(isamples, zero), constant2); __m128i isamples1 = _mm_sub_epi16(_mm_unpackhi_epi8(isamples, zero), constant2); __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples0), 16)); __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples0), 16)); __m128 samples2 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples1), 16)); __m128 samples3 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples1), 16)); samples0 = _mm_mul_ps(samples0, constant1); samples1 = _mm_mul_ps(samples1, constant1); samples2 = _mm_mul_ps(samples2, constant1); samples3 = _mm_mul_ps(samples3, constant1); _mm_storeu_ps(output+i+0, samples0); _mm_storeu_ps(output+i+4, samples1); _mm_storeu_ps(output+i+8, samples2); _mm_storeu_ps(output+i+12, samples3); } #else const int sse_size = 0; #endif // unpack remaining for (int i = sse_size; i < size; i++) { int value = input[i]; output[i] = ((float) (value - 128)) / 128.0f; } }
/* Transpose bytes within elements for 32 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) { char* in_b = (char*) in; char* out_b = (char*) out; __m128i a0, b0, c0, d0, a1, b1, c1, d1; size_t ii; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); a0 = _mm_unpacklo_epi64(a1, c1); b0 = _mm_unpackhi_epi64(a1, c1); c0 = _mm_unpacklo_epi64(b1, d1); d0 = _mm_unpackhi_epi64(b1, d1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); } return bshuf_trans_byte_elem_remainder(in, out, size, 4, size - size % 16); }
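/*
 * Scalar sketch (illustrative; ignores the tail handled by the remainder
 * routine) of the transpose above: for 32-bit elements, byte j of element i
 * moves to plane j at position i, i.e. an AoS-to-planar byte transpose.
 */
#include <stddef.h>
#include <stdint.h>
static void trans_byte_elem_32_ref(const uint8_t *in, uint8_t *out, size_t nelem) {
    for (size_t i = 0; i < nelem; ++i)
        for (size_t j = 0; j < 4; ++j)
            out[j * nelem + i] = in[4 * i + j];
}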
void ThumbnailProvider::shrink8x8SSE(const ImageBH& srcImage, ThumbnailBH::ThumbnailImage& destImage) { int scaleFactor = 8; int averagedPixels = scaleFactor * scaleFactor; ASSERT(srcImage.width % scaleFactor == 0); ASSERT(srcImage.height % scaleFactor == 0); destImage.setResolution(srcImage.width / scaleFactor, srcImage.height / scaleFactor); int height = srcImage.height; int width = srcImage.width; static const __m128i zero = _mm_setzero_si128(); __m128i* summs = reinterpret_cast<__m128i*>(SystemCall::alignedMalloc(16 * destImage.width, 16)); memset(summs, 0, destImage.width * 16); const ImageBH::Pixel* pSrc; ThumbnailBH::ThumbnailImage::PixelType* pDest; __m128i* pSumms; __m128i tmp; __m128i lower; __m128i upper; for(int y = 0; y < height; ++y) { if(y % scaleFactor == 0) { pDest = destImage[y / scaleFactor]; } pSrc = srcImage[y]; pSumms = summs; for(int x = 0; x < width; x += scaleFactor, pSrc += scaleFactor, ++pSumms) { tmp = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc)); lower = _mm_unpacklo_epi8(tmp, zero); upper = _mm_unpackhi_epi8(tmp, zero); *pSumms = _mm_add_epi16(*pSumms, lower); *pSumms = _mm_add_epi16(*pSumms, upper); tmp = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + scaleFactor / 2)); lower = _mm_unpacklo_epi8(tmp, zero); upper = _mm_unpackhi_epi8(tmp, zero); *pSumms = _mm_add_epi16(*pSumms, lower); *pSumms = _mm_add_epi16(*pSumms, upper); } if(y % scaleFactor == scaleFactor - 1) { pSumms = summs; for(int i = 0; i < destImage.width; ++i, ++pSumms, ++pDest) { short* ptr = reinterpret_cast<short*>(pSumms); short sumY = ptr[offsetof(ImageBH::Pixel, y)] + ptr[offsetof(ImageBH::Pixel, y) + sizeof(ImageBH::Pixel)]; short sumCb = ptr[offsetof(ImageBH::Pixel, cb)] + ptr[offsetof(ImageBH::Pixel, cb) + sizeof(ImageBH::Pixel)]; short sumCr = ptr[offsetof(ImageBH::Pixel, cr)] + ptr[offsetof(ImageBH::Pixel, cr) + sizeof(ImageBH::Pixel)]; pDest->y = static_cast<char>(sumY / averagedPixels); pDest->cb = static_cast<char>(sumCb / averagedPixels); pDest->cr = static_cast<char>(sumCr / averagedPixels); } memset(summs, 0, destImage.width * 16); } } SystemCall::alignedFree(summs); }
/* Transpose bytes within elements for 64 bit elements. */ int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) { char* in_b = (char*) in; char* out_b = (char*) out; __m128i a0, b0, c0, d0, e0, f0, g0, h0; __m128i a1, b1, c1, d1, e1, f1, g1, h1; size_t ii; for (ii=0; ii + 15 < size; ii += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]); b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]); c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]); d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]); e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]); f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]); g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]); h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpackhi_epi8(a0, b0); c1 = _mm_unpacklo_epi8(c0, d0); d1 = _mm_unpackhi_epi8(c0, d0); e1 = _mm_unpacklo_epi8(e0, f0); f1 = _mm_unpackhi_epi8(e0, f0); g1 = _mm_unpacklo_epi8(g0, h0); h1 = _mm_unpackhi_epi8(g0, h0); a0 = _mm_unpacklo_epi8(a1, b1); b0 = _mm_unpackhi_epi8(a1, b1); c0 = _mm_unpacklo_epi8(c1, d1); d0 = _mm_unpackhi_epi8(c1, d1); e0 = _mm_unpacklo_epi8(e1, f1); f0 = _mm_unpackhi_epi8(e1, f1); g0 = _mm_unpacklo_epi8(g1, h1); h0 = _mm_unpackhi_epi8(g1, h1); a1 = _mm_unpacklo_epi32(a0, c0); b1 = _mm_unpackhi_epi32(a0, c0); c1 = _mm_unpacklo_epi32(b0, d0); d1 = _mm_unpackhi_epi32(b0, d0); e1 = _mm_unpacklo_epi32(e0, g0); f1 = _mm_unpackhi_epi32(e0, g0); g1 = _mm_unpacklo_epi32(f0, h0); h1 = _mm_unpackhi_epi32(f0, h0); a0 = _mm_unpacklo_epi64(a1, e1); b0 = _mm_unpackhi_epi64(a1, e1); c0 = _mm_unpacklo_epi64(b1, f1); d0 = _mm_unpackhi_epi64(b1, f1); e0 = _mm_unpacklo_epi64(c1, g1); f0 = _mm_unpackhi_epi64(c1, g1); g0 = _mm_unpacklo_epi64(d1, h1); h0 = _mm_unpackhi_epi64(d1, h1); _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0); _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0); _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0); _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0); _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0); _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0); _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0); _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0); } return bshuf_trans_byte_elem_remainder(in, out, size, 8, size - size % 16); }
/* For data organized into a row for each bit (8 * elem_size rows), transpose * the bytes. */ int64_t bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size, const size_t elem_size) { char* in_b = (char*) in; char* out_b = (char*) out; size_t nrows = 8 * elem_size; size_t nbyte_row = size / 8; size_t ii, jj; __m128i a0, b0, c0, d0, e0, f0, g0, h0; __m128i a1, b1, c1, d1, e1, f1, g1, h1; __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs; CHECK_MULT_EIGHT(size); for (ii = 0; ii + 7 < nrows; ii += 8) { for (jj = 0; jj + 15 < nbyte_row; jj += 16) { a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]); b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]); c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]); d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]); e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]); f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]); g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]); h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]); a1 = _mm_unpacklo_epi8(a0, b0); b1 = _mm_unpacklo_epi8(c0, d0); c1 = _mm_unpacklo_epi8(e0, f0); d1 = _mm_unpacklo_epi8(g0, h0); e1 = _mm_unpackhi_epi8(a0, b0); f1 = _mm_unpackhi_epi8(c0, d0); g1 = _mm_unpackhi_epi8(e0, f0); h1 = _mm_unpackhi_epi8(g0, h0); a0 = _mm_unpacklo_epi16(a1, b1); b0 = _mm_unpacklo_epi16(c1, d1); c0 = _mm_unpackhi_epi16(a1, b1); d0 = _mm_unpackhi_epi16(c1, d1); e0 = _mm_unpacklo_epi16(e1, f1); f0 = _mm_unpacklo_epi16(g1, h1); g0 = _mm_unpackhi_epi16(e1, f1); h0 = _mm_unpackhi_epi16(g1, h1); a1 = _mm_unpacklo_epi32(a0, b0); b1 = _mm_unpackhi_epi32(a0, b0); c1 = _mm_unpacklo_epi32(c0, d0); d1 = _mm_unpackhi_epi32(c0, d0); e1 = _mm_unpacklo_epi32(e0, f0); f1 = _mm_unpackhi_epi32(e0, f0); g1 = _mm_unpacklo_epi32(g0, h0); h1 = _mm_unpackhi_epi32(g0, h0); /* We don't have a storeh instruction for integers, so interpret */ /* as a float. Have a storel (_mm_storel_epi64). 
*/ as = (__m128 *) &a1; bs = (__m128 *) &b1; cs = (__m128 *) &c1; ds = (__m128 *) &d1; es = (__m128 *) &e1; fs = (__m128 *) &f1; gs = (__m128 *) &g1; hs = (__m128 *) &h1; _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as); _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs); _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs); _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds); _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es); _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs); _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs); _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs); _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as); _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs); _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs); _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds); _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es); _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs); _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs); _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs); } for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) { out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj]; out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj]; out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj]; out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj]; out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj]; out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj]; out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj]; out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj]; } } return size * elem_size; }
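/*
 * Scalar sketch (illustrative) of bshuf_trans_byte_bitrow_sse2() above: the
 * data is an (8 * elem_size) x (size / 8) byte matrix stored row-major, and
 * the routine simply transposes it.
 */
#include <stddef.h>
#include <stdint.h>
static void trans_byte_bitrow_ref(const uint8_t *in, uint8_t *out,
                                  size_t size, size_t elem_size) {
    const size_t nrows = 8 * elem_size;
    const size_t nbyte_row = size / 8;
    for (size_t ii = 0; ii < nrows; ++ii)
        for (size_t jj = 0; jj < nbyte_row; ++jj)
            out[jj * nrows + ii] = in[ii * nbyte_row + jj];
}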
mlib_status mlib_VideoColorYUV2ARGB422_aligned( mlib_u8 *argb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 argb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* 1.1644 * 8192 */ const __m128i c0 = _mm_set1_epi16(0x2543); const mlib_s32 ic0 = 0x2543; /* 2.0184 * 8192 */ const __m128i c1 = _mm_set1_epi16(0x4097); const mlib_s32 ic1 = 0x4097; /* abs( -0.3920 * 8192 ) */ const __m128i c4 = _mm_set1_epi16(0xc8b); const mlib_s32 ic4 = 0xc8b; /* abs( -0.8132 * 8192 ) */ const __m128i c5 = _mm_set1_epi16(0x1a06); const mlib_s32 ic5 = 0x1a06; /* 1.5966 * 8192 */ const __m128i c8 = _mm_set1_epi16(0x3317); const mlib_s32 ic8 = 0x3317; /* -276.9856 * 32 */ const __m128i coff0 = _mm_set1_epi16(0xdd60); const mlib_s32 icoff0 = (mlib_s32)0xffffdd60; /* 135.6352 * 32 */ const __m128i coff1 = _mm_set1_epi16(0x10f4); const mlib_s32 icoff1 = 0x10f4; /* -222.9952 * 32 */ const __m128i coff2 = _mm_set1_epi16(0xe420); const mlib_s32 icoff2 = (mlib_s32)0xffffe420; /* loop variable */ mlib_s32 jH, iW; /* pointers */ mlib_u8 *pY, *pU, *pV, *pD, *pdd, *ptemp; __m128i *py, *pu, *pv; /* variables */ __m128i sy1, sy2, sy3, sy4, su1, su2, sv1, sv2; __m128i du0, du1, dv1, dv2; __m128i db1, db2, db3, db4, dr1, dr2, dr3, dr4, dg1, dg2, dg3, dg4; __m128i ddy1, ddy2, ddy3, ddy4, dzrl, dzrh, dgbl, dgbh, drgbh, drgbl; __m128i db_h, db_l, dg_h, dg_l, dr_h, dr_l, temp, bak; const __m128i x_zero = _mm_setzero_si128(); const __m128i x_mask = _mm_set1_epi32(0xff); /* for 4-pixel computing */ mlib_s32 iu, iv, ig, ir, ib, iTemp; mlib_s32 iu0, iu1, iv1, iv2; pY = (mlib_u8 *)y; pU = (mlib_u8 *)u; pV = (mlib_u8 *)v; pD = (mlib_u8 *)argb; for (jH = 0; jH < height; jH++) { py = (__m128i *)pY; pu = (__m128i *)pU; pv = (__m128i *)pV; pdd = pD; iW = 0; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ /* 32 pixels */ for (; iW <= width - 32; iW += 32) { /* load y u v, and expand */ temp = _mm_load_si128(pu); su1 = _mm_unpacklo_epi8(x_zero, temp); su2 = _mm_unpackhi_epi8(x_zero, temp); pu++; temp = _mm_load_si128(pv); sv1 = _mm_unpacklo_epi8(x_zero, temp); sv2 = _mm_unpackhi_epi8(x_zero, temp); pv++; temp = _mm_load_si128(py); sy1 = _mm_unpacklo_epi8(x_zero, temp); sy2 = _mm_unpackhi_epi8(x_zero, temp); py++; temp = _mm_load_si128(py); sy3 = _mm_unpacklo_epi8(x_zero, temp); sy4 = _mm_unpackhi_epi8(x_zero, temp); py++; /* pre-calc d[r/g/b][1234] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du0 = _mm_mulhi_epu16(su2, c1); db_h = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); du1 = _mm_mulhi_epu16(su2, c4); dv1 = _mm_mulhi_epu16(sv2, c5); temp = _mm_add_epi16(du1, dv1); dg_h = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); dv2 = _mm_mulhi_epu16(sv2, c8); dr_h = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); ddy2 = _mm_mulhi_epu16(sy2, c0); ddy3 = _mm_mulhi_epu16(sy3, c0); ddy4 = _mm_mulhi_epu16(sy4, c0); /* db1/2/3/4 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(db_l, db_l); db2 = _mm_add_epi16(ddy2, bak); bak = _mm_unpacklo_epi16(db_h, db_h); db3 = _mm_add_epi16(ddy3, bak); bak = _mm_unpackhi_epi16(db_h, db_h); db4 = _mm_add_epi16(ddy4, bak); /* dg1/2/3/4 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dg_l, dg_l); dg2 = _mm_add_epi16(ddy2, bak); bak = _mm_unpacklo_epi16(dg_h, dg_h); 
dg3 = _mm_add_epi16(ddy3, bak); bak = _mm_unpackhi_epi16(dg_h, dg_h); dg4 = _mm_add_epi16(ddy4, bak); /* dr1/2/3/4 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dr_l, dr_l); dr2 = _mm_add_epi16(ddy2, bak); bak = _mm_unpacklo_epi16(dr_h, dr_h); dr3 = _mm_add_epi16(ddy3, bak); bak = _mm_unpackhi_epi16(dr_h, dr_h); dr4 = _mm_add_epi16(ddy4, bak); db1 = _mm_srai_epi16(db1, 5); db2 = _mm_srai_epi16(db2, 5); db3 = _mm_srai_epi16(db3, 5); db4 = _mm_srai_epi16(db4, 5); dg1 = _mm_srai_epi16(dg1, 5); dg2 = _mm_srai_epi16(dg2, 5); dg3 = _mm_srai_epi16(dg3, 5); dg4 = _mm_srai_epi16(dg4, 5); dr1 = _mm_srai_epi16(dr1, 5); dr2 = _mm_srai_epi16(dr2, 5); dr3 = _mm_srai_epi16(dr3, 5); dr4 = _mm_srai_epi16(dr4, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, db2); db2 = _mm_packus_epi16(db3, db4); dr1 = _mm_packus_epi16(dr1, dr2); dr2 = _mm_packus_epi16(dr3, dr4); dg1 = _mm_packus_epi16(dg1, dg2); dg2 = _mm_packus_epi16(dg3, dg4); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dzrh = _mm_unpackhi_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); dgbh = _mm_unpackhi_epi8(dg1, db1); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); drgbl = _mm_unpacklo_epi16(dzrh, dgbh); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrh, dgbh); SAVE_ARGB1(drgbh); /* create rgb sequences : db/dr/dg[2] */ dzrl = _mm_unpacklo_epi8(x_zero, dr2); dzrh = _mm_unpackhi_epi8(x_zero, dr2); dgbl = _mm_unpacklo_epi8(dg2, db2); dgbh = _mm_unpackhi_epi8(dg2, db2); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); drgbl = _mm_unpacklo_epi16(dzrh, dgbh); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrh, dgbh); SAVE_ARGB1(drgbh); } /* 16 pixels */ if (iW <= width - 16) { /* load y u v, and expand */ temp = _mm_loadl_epi64(pu); su1 = _mm_unpacklo_epi8(x_zero, temp); pu = (__m128i *) (((__m64 *)pu) + 1); temp = _mm_loadl_epi64(pv); sv1 = _mm_unpacklo_epi8(x_zero, temp); pv = (__m128i *) (((__m64 *)pv) + 1); temp = _mm_load_si128(py); sy1 = _mm_unpacklo_epi8(x_zero, temp); sy2 = _mm_unpackhi_epi8(x_zero, temp); py++; /* pre-calc d[r/g/b][12] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); ddy2 = _mm_mulhi_epu16(sy2, c0); /* db1/2 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(db_l, db_l); db2 = _mm_add_epi16(ddy2, bak); /* dg1/2 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dg_l, dg_l); dg2 = _mm_add_epi16(ddy2, bak); /* dr1/2 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dr_l, dr_l); dr2 = _mm_add_epi16(ddy2, bak); db1 = _mm_srai_epi16(db1, 5); db2 = _mm_srai_epi16(db2, 5); dg1 = _mm_srai_epi16(dg1, 5); dg2 = _mm_srai_epi16(dg2, 5); dr1 = _mm_srai_epi16(dr1, 5); dr2 = _mm_srai_epi16(dr2, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, db2); dr1 = _mm_packus_epi16(dr1, dr2); dg1 = _mm_packus_epi16(dg1, dg2); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dzrh = _mm_unpackhi_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); dgbh = _mm_unpackhi_epi8(dg1, db1); drgbl = 
_mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); drgbl = _mm_unpacklo_epi16(dzrh, dgbh); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrh, dgbh); SAVE_ARGB1(drgbh); iW += 16; } /* 8 pixels */ if (iW <= width - 8) { /* load y u v, and expand */ iTemp = *((mlib_s32 *)pu); temp = _mm_cvtsi32_si128(iTemp); su1 = _mm_unpacklo_epi8(x_zero, temp); pu = (__m128i *) (((mlib_s32 *)pu) + 1); iTemp = *((mlib_s32 *)pv); temp = _mm_cvtsi32_si128(iTemp); sv1 = _mm_unpacklo_epi8(x_zero, temp); pv = (__m128i *) (((mlib_s32 *)pv) + 1); temp = _mm_loadl_epi64(py); sy1 = _mm_unpacklo_epi8(x_zero, temp); py = (__m128i *) (((__m64 *)py) + 1); /* pre-calc d[r/g/b][1] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); /* db1 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); /* dg1 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); /* dr1 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); db1 = _mm_srai_epi16(db1, 5); dg1 = _mm_srai_epi16(dg1, 5); dr1 = _mm_srai_epi16(dr1, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, x_zero); dr1 = _mm_packus_epi16(dr1, x_zero); dg1 = _mm_packus_epi16(dg1, x_zero); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); iW += 8; } /* 4 pixels */ if (iW <= width - 4) { /* load y u v, and expand */ iTemp = *((mlib_s16 *)pu); temp = _mm_cvtsi32_si128(iTemp); su1 = _mm_unpacklo_epi8(x_zero, temp); pu = (__m128i *) (((mlib_s16 *)pu) + 1); iTemp = *((mlib_s16 *)pv); temp = _mm_cvtsi32_si128(iTemp); sv1 = _mm_unpacklo_epi8(x_zero, temp); pv = (__m128i *) (((mlib_s16 *)pv) + 1); iTemp = *((mlib_s32 *)py); temp = _mm_cvtsi32_si128(iTemp); sy1 = _mm_unpacklo_epi8(x_zero, temp); py = (__m128i *) (((mlib_s32 *)py) + 1); /* pre-calc d[r/g/b][1] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); /* db1 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); /* dg1 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); /* dr1 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); db1 = _mm_srai_epi16(db1, 5); dg1 = _mm_srai_epi16(dg1, 5); dr1 = _mm_srai_epi16(dr1, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, x_zero); dr1 = _mm_packus_epi16(dr1, x_zero); dg1 = _mm_packus_epi16(dg1, x_zero); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); iW += 4; } /* 2 pixels */ if (iW <= width - 2) { /* load y u v, and expand */ iu = *((mlib_u8 *)pu); pu = (__m128i *) (((mlib_u8 *)pu) + 1); iv = *((mlib_u8 *)pv); pv = (__m128i *) (((mlib_u8 *)pv) + 1); iTemp = *((mlib_s16 *)py); temp = _mm_cvtsi32_si128(iTemp); sy1 = _mm_unpacklo_epi8(x_zero, temp); py = (__m128i *) (((mlib_s16 *)py) + 1); /* pre-calc d[r/g/b][1] */ iu0 = (iu * ic1) >> 
8; ib = icoff0 + iu0; iu1 = (iu * ic4) >> 8; iv1 = (iv * ic5) >> 8; iTemp = iu1 + iv1; ig = icoff1 - iTemp; iv2 = (iv * ic8) >> 8; ir = iv2 + icoff2; ddy1 = _mm_mulhi_epu16(sy1, c0); /* db1 */ temp = _mm_set1_epi16(ib); db1 = _mm_add_epi16(ddy1, temp); /* dg1 */ temp = _mm_set1_epi16(ig); dg1 = _mm_add_epi16(ddy1, temp); /* dr1 */ temp = _mm_set1_epi16(ir); dr1 = _mm_add_epi16(ddy1, temp); db1 = _mm_srai_epi16(db1, 5); dg1 = _mm_srai_epi16(dg1, 5); dr1 = _mm_srai_epi16(dr1, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, x_zero); dr1 = _mm_packus_epi16(dr1, x_zero); dg1 = _mm_packus_epi16(dg1, x_zero); /* create rgb sequences : db/dr/dg */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); /* lower half of drgl & dbzl */ drgbl = _mm_unpacklo_epi16(dzrl, dgbl); ptemp = (mlib_u8*)(&drgbl); pdd += 1; ptemp += 1; *((mlib_s16*)pdd) = *((mlib_s16*)ptemp); pdd += 2; ptemp += 2; *((mlib_u8*)pdd) = *((mlib_u8*)ptemp); pdd += 2; ptemp += 2; *((mlib_s16*)pdd) = *((mlib_s16*)ptemp); pdd += 2; ptemp += 2; *((mlib_u8*)pdd) = *((mlib_u8*)ptemp); pdd += 1; iW += 2; } pY += y_stride; pU += uv_stride; pV += uv_stride; pD += argb_stride; }
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest) { if(dest.empty())dest.create(src1.size(),CV_8U); const int imsize = (src1.size().area()/16); uchar* s1 = src1.data; uchar* s2 = src2.data; uchar* a = alpha.data; uchar* d = dest.data; const __m128i zero = _mm_setzero_si128(); const __m128i amax = _mm_set1_epi8(char(255)); int i=0; if(s1==d) { for(;i<imsize;++i) { __m128i ms1h = _mm_load_si128((__m128i*)(s1)); __m128i ms2h = _mm_load_si128((__m128i*)(s2)); __m128i mah = _mm_load_si128((__m128i*)(a)); __m128i imah = _mm_sub_epi8(amax,mah); __m128i ms1l = _mm_unpacklo_epi8(ms1h, zero); ms1h = _mm_unpackhi_epi8(ms1h, zero); __m128i ms2l = _mm_unpacklo_epi8(ms2h, zero); ms2h = _mm_unpackhi_epi8(ms2h, zero); __m128i mal = _mm_unpacklo_epi8(mah, zero); mah = _mm_unpackhi_epi8(mah, zero); __m128i imal = _mm_unpacklo_epi8(imah, zero); imah = _mm_unpackhi_epi8(imah, zero); ms1l = _mm_mullo_epi16(ms1l,mal); ms2l = _mm_mullo_epi16(ms2l,imal); ms1l = _mm_add_epi16(ms1l,ms2l); //ms1l = _mm_srli_epi16(ms1l,8); ms1l = _mm_srai_epi16(ms1l,8); ms1h = _mm_mullo_epi16(ms1h,mah); ms2h = _mm_mullo_epi16(ms2h,imah); ms1h = _mm_add_epi16(ms1h,ms2h); //ms1h = _mm_srli_epi16(ms1h,8); ms1h = _mm_srai_epi16(ms1h,8); _mm_stream_si128((__m128i*)s1,_mm_packs_epi16(ms1l,ms1h)); s1+=16; s2+=16; a+=16; } } else { for(;i<imsize;++i) { __m128i ms1h = _mm_load_si128((__m128i*)(s1)); __m128i ms2h = _mm_load_si128((__m128i*)(s2)); __m128i mah = _mm_load_si128((__m128i*)(a)); __m128i imah = _mm_sub_epi8(amax,mah); __m128i ms1l = _mm_unpacklo_epi8(ms1h, zero); ms1h = _mm_unpackhi_epi8(ms1h, zero); __m128i ms2l = _mm_unpacklo_epi8(ms2h, zero); ms2h = _mm_unpackhi_epi8(ms2h, zero); __m128i mal = _mm_unpacklo_epi8(mah, zero); mah = _mm_unpackhi_epi8(mah, zero); __m128i imal = _mm_unpacklo_epi8(imah, zero); imah = _mm_unpackhi_epi8(imah, zero); ms1l = _mm_mullo_epi16(ms1l,mal); ms2l = _mm_mullo_epi16(ms2l,imal); ms1l = _mm_add_epi16(ms1l,ms2l); //ms1l = _mm_srli_epi16(ms1l,8); ms1l = _mm_srai_epi16(ms1l,8); ms1h = _mm_mullo_epi16(ms1h,mah); ms2h = _mm_mullo_epi16(ms2h,imah); ms1h = _mm_add_epi16(ms1h,ms2h); //ms1h = _mm_srli_epi16(ms1h,8); ms1h = _mm_srai_epi16(ms1h,8); _mm_store_si128((__m128i*)d,_mm_packs_epi16(ms1l,ms1h)); s1+=16; s2+=16; a+=16; d+=16; } } { uchar* s1 = src1.data; uchar* s2 = src2.data; uchar* a = alpha.data; uchar* d = dest.data; for(int n=i*16;n<src1.size().area();n++) { d[n] = (a[n]*s1[n] + (255-a[n])*s2[n])>>8; } } }
void pixops_crossfade_sse2(void* dst, intptr_t dstStride, const void* src, intptr_t srcStride, uint32_t w, uint32_t h, uint32_t alpha) { uint8_t* pDstRow = static_cast<uint8_t*>(dst); const uint8_t* pSrcRow = static_cast<const uint8_t*>(src); __m128i a = _mm_shuffle_epi32(_mm_cvtsi32_si128(expand16(alpha )), _MM_SHUFFLE(0, 0, 0, 0)); __m128i ia = _mm_shuffle_epi32(_mm_cvtsi32_si128(expand16(256 - alpha)), _MM_SHUFFLE(0, 0, 0, 0)); for (uint32_t y = h; y > 0; y--, pDstRow += dstStride, pSrcRow += srcStride) { uint32_t* pDst = reinterpret_cast<uint32_t*>(pDstRow); const uint32_t* pSrc = reinterpret_cast<const uint32_t*>(pSrcRow); uint32_t x = w; for (;;) { while (x < 4 || !SimdUtils::isAligned(dst, 16)) { __m128i d = _mm_cvtsi32_si128(*pDst); __m128i s = _mm_cvtsi32_si128(*pSrc); d = _mm_unpacklo_epi8(d, _mm_setzero_si128()); s = _mm_unpacklo_epi8(s, _mm_setzero_si128()); d = _mm_mullo_epi16(d, ia); s = _mm_mullo_epi16(s, a); d = _mm_add_epi16(d, s); d = _mm_srli_epi16(d, 8); d = _mm_packus_epi16(d, d); *pDst = _mm_cvtsi128_si32(d); pDst++; pSrc++; x--; } if (x == 0) break; while (x >= 8) { __m128i d0 = _mm_load_si128(reinterpret_cast<__m128i*>(pDst + 0)); __m128i d2 = _mm_load_si128(reinterpret_cast<__m128i*>(pDst + 4)); __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + 0)); __m128i s2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + 4)); __m128i d1 = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); __m128i d3 = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); __m128i s1 = _mm_unpackhi_epi8(s0, _mm_setzero_si128()); __m128i s3 = _mm_unpackhi_epi8(s2, _mm_setzero_si128()); d0 = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); d2 = _mm_unpacklo_epi8(d2, _mm_setzero_si128()); s0 = _mm_unpacklo_epi8(s0, _mm_setzero_si128()); s2 = _mm_unpacklo_epi8(s2, _mm_setzero_si128()); d0 = _mm_mullo_epi16(d0, ia); d1 = _mm_mullo_epi16(d1, ia); d2 = _mm_mullo_epi16(d2, ia); d3 = _mm_mullo_epi16(d3, ia); s0 = _mm_mullo_epi16(s0, a); s1 = _mm_mullo_epi16(s1, a); s2 = _mm_mullo_epi16(s2, a); s3 = _mm_mullo_epi16(s3, a); d0 = _mm_add_epi16(d0, s0); d1 = _mm_add_epi16(d1, s1); d2 = _mm_add_epi16(d2, s2); d3 = _mm_add_epi16(d3, s3); d0 = _mm_srli_epi16(d0, 8); d1 = _mm_srli_epi16(d1, 8); d2 = _mm_srli_epi16(d2, 8); d3 = _mm_srli_epi16(d3, 8); d0 = _mm_packus_epi16(d0, d1); d2 = _mm_packus_epi16(d2, d3); _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 0), d0); _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 4), d2); pDst += 8; pSrc += 8; x -= 8; } while (x >= 4) { __m128i d0 = _mm_load_si128(reinterpret_cast<__m128i*>(pDst)); __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc)); __m128i d1 = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); __m128i s1 = _mm_unpackhi_epi8(s0, _mm_setzero_si128()); d0 = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); s0 = _mm_unpacklo_epi8(s0, _mm_setzero_si128()); d0 = _mm_mullo_epi16(d0, ia); d1 = _mm_mullo_epi16(d1, ia); s0 = _mm_mullo_epi16(s0, a); s1 = _mm_mullo_epi16(s1, a); d0 = _mm_add_epi16(d0, s0); d1 = _mm_add_epi16(d1, s1); d0 = _mm_srli_epi16(d0, 8); d1 = _mm_srli_epi16(d1, 8); d0 = _mm_packus_epi16(d0, d1); _mm_store_si128(reinterpret_cast<__m128i*>(pDst), d0); pDst += 4; pSrc += 4; x -= 4; } if (x == 0) break; } } }
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i three = _mm_set1_epi16(3); // Load, combine and tranpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); // b0_extra = (a0 != 0); const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one); const __m128i b0_base = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); const __m128i b0 = _mm_add_epi16(b0_base, b0_extra); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); { // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); // b = abs(b) = (b ^ sign) - sign A_b0 = _mm_xor_si128(A_b0, sign_A_b0); A_b2 = _mm_xor_si128(A_b2, sign_A_b2); B_b0 = _mm_xor_si128(B_b0, sign_B_b0); B_b2 = _mm_xor_si128(B_b2, sign_B_b2); A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); } // b = abs(b) + 3 A_b0 = _mm_add_epi16(A_b0, three); A_b2 = _mm_add_epi16(A_b2, three); B_b0 = _mm_add_epi16(B_b0, three); B_b2 = _mm_add_epi16(B_b2, three); // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 // b = (abs(b) + 3) >> 3 A_b0 = _mm_srai_epi16(A_b0, 3); A_b2 = _mm_srai_epi16(A_b2, 3); B_b0 = _mm_srai_epi16(B_b0, 3); B_b2 = _mm_srai_epi16(B_b2, 3); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b0 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b0); } return sum[0] + sum[1] + sum[2] + sum[3]; }
/**
 * Optimized 8 bit zoomer for resizing by a factor of 4. Doesn't flip.
 * 32-bit version.
 * Used internally by _zoomSurfaceY() below.
 * source and dest. widths must be multiples of 4 bytes for 32-bit access
 */
static int zoomSurface4X_32bit(SDL_Surface *src, SDL_Surface *dst)
{
    Uint32 dataSrc;
    Uint32 dataDst;
    Uint8 *pixelSrc = (Uint8*)src->pixels;
    Uint8 *pixelDstRow = (Uint8*)dst->pixels;
    int sx, sy;
    static bool proclaimed = false;

    if (!proclaimed)
    {
        proclaimed = true;
        Log(LOG_INFO) << "Using 32-bit 4X zoom routine.";
    }

    for (sy = 0; sy < src->h; ++sy, pixelDstRow += dst->pitch*4)
    {
        Uint32 *pixelDst = (Uint32*)pixelDstRow;
        Uint32 *pixelDst2 = (Uint32*)(pixelDstRow + dst->pitch);
        Uint32 *pixelDst3 = (Uint32*)(pixelDstRow + 2*dst->pitch);
        Uint32 *pixelDst4 = (Uint32*)(pixelDstRow + 3*dst->pitch);

        for (sx = 0; sx < src->w; sx += 4, pixelSrc += 4)
        {
            dataSrc = *((Uint32*) pixelSrc);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
            // boo
            dataSrc = SDL_Swap32(dataSrc);
            for (int i = 0; i < 4; ++i)
            {
                dataDst = SDL_Swap32( (dataSrc & 0xFF) | ((dataSrc & 0xFF) << 8) |
                    ((dataSrc & 0xFF) << 16) | ((dataSrc & 0xFF) << 24) );
#else
            for (int i = 0; i < 4; ++i)
            {
                dataDst = (dataSrc & 0xFF) | ((dataSrc & 0xFF) << 8) |
                    ((dataSrc & 0xFF) << 16) | ((dataSrc & 0xFF) << 24);
#endif
                *pixelDst = dataDst;
                *pixelDst2 = dataDst;
                *pixelDst3 = dataDst;
                *pixelDst4 = dataDst;
                pixelDst++; // forward 4 bytes!
                pixelDst2++;
                pixelDst3++;
                pixelDst4++;
                dataSrc >>= 8;
            }
        }
    }

    return 0;
}
#endif

/**
 * Optimized 8 bit zoomer for resizing by a factor of 2 along the X axis;
 * the vertical scaling is driven by a row-step table. Doesn't flip.
 * 32-bit version.
 * Used internally by _zoomSurfaceY() below.
 * source and dest. widths must be multiples of 4 bytes for 32-bit access
 */
static int zoomSurface2X_XAxis_32bit(SDL_Surface *src, SDL_Surface *dst)
{
    Uint32 dataSrc;
    Uint32 dataDst;
    Uint8 *pixelSrc;
    Uint8 *pixelDstRow = (Uint8*)dst->pixels;
    Uint8 *pixelSrcRow = (Uint8*)src->pixels;
    int sx;
    int dsty;
    static bool proclaimed = false;
    static Uint32 *say = 0;
    Uint32 *csay;
    int csy;

    if (!proclaimed)
    {
        proclaimed = true;
        Log(LOG_INFO) << "Using mediocre scaling routine due to screen height.";
    }

    if ((say = (Uint32 *) realloc(say, (dst->h + 1) * sizeof(Uint32))) == NULL)
    {
        say = 0;
        return (-1);
    }

    csy = 0;
    csay = say;
    for (int y = 0; y < dst->h; y++)
    {
        csy += src->h;
        *csay = 0;
        while (csy >= dst->h)
        {
            csy -= dst->h;
            (*csay)++;
        }
        (*csay) *= src->pitch;
        csay++;
    }

    for (dsty = 0; dsty < dst->h; ++dsty, pixelDstRow += dst->pitch)
    {
        if (!say[dsty]) continue;

        Uint32 *pixelDst = (Uint32*)pixelDstRow;
        pixelSrc = pixelSrcRow;
        pixelSrcRow += say[dsty];

        for (sx = 0; sx < src->w; sx += 4, pixelSrc += 4)
        {
            dataSrc = *((Uint32*) pixelSrc);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
            // boo
            dataSrc = SDL_Swap32(dataSrc);
            for (int i = 0; i < 2; ++i)
            {
                dataDst = SDL_Swap32( (dataSrc & 0xFF) | ((dataSrc & 0xFFFF) << 8) |
                    ((dataSrc & 0xFF00) << 16) );
#else
            for (int i = 0; i < 2; ++i)
            {
                dataDst = (dataSrc & 0xFF) | ((dataSrc & 0xFFFF) << 8) |
                    ((dataSrc & 0xFF00) << 16);
#endif
                int j = 0;
                do
                {
                    if (dsty + j >= dst->h) break;
                    *(pixelDst + (dst->pitch/sizeof(Uint32))*j) = dataDst;
                } while (say[dsty + ++j] == 0); // fill in all relevant rows

                dataSrc >>= 16;
                pixelDst++; // forward 4 bytes!
            }
        }
    }

    return 0;
}
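// Both *_XAxis_32bit routines drive their vertical scaling with the say[]
// table built above: a Bresenham-style accumulator adds src->h once per
// destination row and counts whole multiples of dst->h; that count (scaled by
// src->pitch) is how far the source pointer advances at that destination row,
// and rows whose entry stays 0 are filled by the do/while loop with the
// previously expanded source row.  Below is a minimal standalone sketch of
// the same table computation; the helper name buildRowStepTable is
// hypothetical and the pitch scaling is left out.
static void buildRowStepTable(Uint32 *step, int srcH, int dstH)
{
    int acc = 0;
    for (int y = 0; y < dstH; ++y)
    {
        acc += srcH;
        step[y] = 0;
        while (acc >= dstH)
        {
            acc -= dstH;
            step[y]++; // one more source row is consumed at destination row y
        }
    }
}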
/**
 * Optimized 8 bit zoomer for resizing by a factor of 4 along the X axis;
 * the vertical scaling is driven by a row-step table. Doesn't flip.
 * 32-bit version.
 * Used internally by _zoomSurfaceY() below.
 * source and dest. widths must be multiples of 4 bytes for 32-bit access
 */
static int zoomSurface4X_XAxis_32bit(SDL_Surface *src, SDL_Surface *dst)
{
    Uint32 dataSrc;
    Uint32 dataDst;
    Uint8 *pixelSrc;
    Uint8 *pixelDstRow = (Uint8*)dst->pixels;
    Uint8 *pixelSrcRow = (Uint8*)src->pixels;
    int sx;
    int dsty;
    static bool proclaimed = false;
    static Uint32 *say = 0;
    Uint32 *csay;
    int csy;

    if (!proclaimed)
    {
        proclaimed = true;
        Log(LOG_INFO) << "Using mediocre scaling routine due to screen height.";
    }

    if ((say = (Uint32 *) realloc(say, (dst->h + 1) * sizeof(Uint32))) == NULL)
    {
        say = 0;
        return (-1);
    }

    csy = 0;
    csay = say;
    for (int y = 0; y < dst->h; y++)
    {
        csy += src->h;
        *csay = 0;
        while (csy >= dst->h)
        {
            csy -= dst->h;
            (*csay)++;
        }
        (*csay) *= src->pitch;
        csay++;
    }

    for (dsty = 0; dsty < dst->h; ++dsty, pixelDstRow += dst->pitch)
    {
        if (!say[dsty]) continue;

        Uint32 *pixelDst = (Uint32*)pixelDstRow;
        pixelSrc = pixelSrcRow;
        pixelSrcRow += say[dsty];

        for (sx = 0; sx < src->w; sx += 4, pixelSrc += 4)
        {
            dataSrc = *((Uint32*) pixelSrc);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
            // boo
            dataSrc = SDL_Swap32(dataSrc);
            for (int i = 0; i < 4; ++i)
            {
                dataDst = SDL_Swap32( (dataSrc & 0xFF) | ((dataSrc & 0xFF) << 8) |
                    ((dataSrc & 0xFF) << 16) | ((dataSrc & 0xFF) << 24) );
#else
            for (int i = 0; i < 4; ++i)
            {
                dataDst = (dataSrc & 0xFF) | ((dataSrc & 0xFF) << 8) |
                    ((dataSrc & 0xFF) << 16) | ((dataSrc & 0xFF) << 24);
#endif
                int j = 0;
                do
                {
                    if (dsty + j >= dst->h) break;
                    *(pixelDst + (dst->pitch/sizeof(Uint32))*j) = dataDst;
                } while (say[dsty + ++j] == 0); // fill in all relevant rows

                dataSrc >>= 8;
                pixelDst++; // forward 4 bytes!
            }
        }
    }

    return 0;
}

#ifdef __SSE2__
/**
 * Optimized 8 bit zoomer for resizing by a factor of 4. Doesn't flip.
 * Used internally by _zoomSurfaceY() below.
 * This is an SSE2 version written with Intel intrinsics.
 * source and dest. widths must be multiples of 16 bytes for 128-bit access
 * and it would help if they were aligned properly... :(
 */
static int zoomSurface4X_SSE2(SDL_Surface *src, SDL_Surface *dst)
{
    __m128i dataSrc;
    __m128i dataDst;
    Uint8 *pixelSrc = (Uint8*)src->pixels;
    Uint8 *pixelDstRow = (Uint8*)dst->pixels;
    int sx, sy;
    static bool proclaimed = false;

    if (!proclaimed)
    {
        proclaimed = true;
        Log(LOG_INFO) << "Using SSE2 4X zoom routine.";
    }

    for (sy = 0; sy < src->h; ++sy, pixelDstRow += dst->pitch*4)
    {
        __m128i *pixelDst = (__m128i*)pixelDstRow;
        __m128i *pixelDst2 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch);
        __m128i *pixelDst3 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch*2);
        __m128i *pixelDst4 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch*3);

        for (sx = 0; sx < src->w; sx += 16, pixelSrc += 16)
        {
            dataSrc = *((__m128i*) pixelSrc);
            __m128i halfDone = _mm_unpacklo_epi8(dataSrc, dataSrc);
            dataDst = _mm_unpacklo_epi8(halfDone, halfDone);
/*
#define WRITE_DST if ((char*)pixelDst4 + 128 > (char*)dst->pixels+(dst->w*dst->pitch)) { Log(LOG_ERROR) << "HELL"; exit(0); } \
*/
#define WRITE_DST   *(pixelDst++) = dataDst; \
                    *(pixelDst2++) = dataDst; \
                    *(pixelDst3++) = dataDst; \
                    *(pixelDst4++) = dataDst;

            WRITE_DST;

            dataDst = _mm_unpackhi_epi8(halfDone, halfDone);

            WRITE_DST;

            halfDone = _mm_unpackhi_epi8(dataSrc, dataSrc);
            dataDst = _mm_unpacklo_epi8(halfDone, halfDone);

            WRITE_DST;

            dataDst = _mm_unpackhi_epi8(halfDone, halfDone);

            WRITE_DST;
        }
    }

    return 0;
}
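// A minimal scalar sketch of what one iteration of the SSE2 loop above
// produces: each of the 16 source bytes is replicated four times horizontally
// by the two levels of _mm_unpack*_epi8(x, x), and WRITE_DST stores the same
// 64 expanded bytes to four consecutive destination rows.  The helper name
// zoom4X_scalar_block is hypothetical and only documents the data movement;
// it assumes the same Uint8 type and row pitch as the code above.
static void zoom4X_scalar_block(const Uint8 *src, Uint8 *dstRow, int dstPitch)
{
    for (int x = 0; x < 16; ++x)        // 16 source bytes per loop iteration
    {
        const Uint8 v = src[x];
        for (int r = 0; r < 4; ++r)     // four destination rows
        {
            Uint8 *out = dstRow + r * dstPitch + x * 4;
            out[0] = out[1] = out[2] = out[3] = v;  // four destination columns
        }
    }
}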