// Re-arranges, in place, the 16-bit lanes held in B0..B3.
// On return, lane p (0..7) of output register r (0..3) holds lane
// (2 * r + p / 4) of input register (p % 4).
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
  // Selector {0, 2, 1, 3}: swaps the two middle elements of a group of four.
  enum { kSwapMiddle = _MM_SHUFFLE(3, 1, 2, 0) };

  // Gather the low 64-bit halves (regs[0], regs[1]) and the high halves
  // (regs[2], regs[3]) of each input pair.
  __m128i regs[4] = {
    _mm_unpacklo_epi64(B0, B1),
    _mm_unpacklo_epi64(B2, B3),
    _mm_unpackhi_epi64(B0, B1),
    _mm_unpackhi_epi64(B2, B3)
  };

  // Interleave the 16-bit lanes of the two 64-bit halves of every register.
  for (int k = 0; k < 4; ++k) {
    __m128i v = _mm_shuffle_epi32(regs[k], kSwapMiddle);
    v = _mm_shufflehi_epi16(v, kSwapMiddle);
    regs[k] = _mm_shufflelo_epi16(v, kSwapMiddle);
  }

  // Final 32-bit interleave writes the results back to the arguments.
  B0 = _mm_unpacklo_epi32(regs[0], regs[1]);
  B1 = _mm_unpackhi_epi32(regs[0], regs[1]);
  B2 = _mm_unpacklo_epi32(regs[2], regs[3]);
  B3 = _mm_unpackhi_epi32(regs[2], regs[3]);
}
// Converts num_pixels 32-bit pixels from src into dst by exchanging bytes 0
// and 2 of every pixel (bytes 1 and 3 are copied unchanged). Eight pixels
// are handled per iteration; whatever remains is delegated to
// VP8LConvertBGRAToRGBA_C.
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i kByte0And2 = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* src128 = (const __m128i*)src;
  __m128i* dst128 = (__m128i*)dst;

  for (; num_pixels >= 8; num_pixels -= 8) {
    // Both input vectors are read before anything is written.
    const __m128i px_lo = _mm_loadu_si128(src128++);
    const __m128i px_hi = _mm_loadu_si128(src128++);
    // Bytes 0 and 2 of each pixel, everything else zeroed.
    const __m128i swap_lo = _mm_and_si128(px_lo, kByte0And2);
    const __m128i swap_hi = _mm_and_si128(px_hi, kByte0And2);
    // Bytes 1 and 3 of each pixel, which stay where they are.
    const __m128i keep_lo = _mm_andnot_si128(kByte0And2, px_lo);
    const __m128i keep_hi = _mm_andnot_si128(kByte0And2, px_hi);
    // Exchange the two 16-bit halves of every 32-bit pixel.
    const __m128i swapped_lo = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(swap_lo, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i swapped_hi = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(swap_hi, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));
    _mm_storeu_si128(dst128++, _mm_or_si128(swapped_lo, keep_lo));
    _mm_storeu_si128(dst128++, _mm_or_si128(swapped_hi, keep_hi));
  }

  // Left-overs (fewer than 8 pixels) go through the plain-C version.
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)src128, num_pixels,
                            (uint8_t*)dst128);
  }
}
// // multiplies two complex vectors and returns the real and imaginary parts // as two 32 bit integers. // FORCE_INLINE int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2, struct complex16* x, int len1, struct complex16* y, int len2 ) { const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16); const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000); const __m128i xmm4 = _mm_set1_epi32(0x00010000); __m128i* Xs = (__m128i*) x; __m128i* Ys = (__m128i*) y; __m128i* Res = (__m128i*) re; __m128i* Ims = (__m128i*) im; for (int i = 0; i < len1 / wlen; i++){ __m128i mx = _mm_loadu_si128(&Xs[i]); __m128i my = _mm_loadu_si128(&Ys[i]); __m128i ms2 = _mm_xor_si128(my, xmm5); ms2 = _mm_add_epi32(ms2, xmm4); ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1)); ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1)); _mm_storeu_si128(&Res[i], _mm_madd_epi16(my, mx)); _mm_storeu_si128(&Ims[i], _mm_madd_epi16(ms2, mx)); } for (int i = (len1 / wlen) * wlen; i < len1; i++){ re[i] = x[i].re * y[i].re + x[i].im * y[i].im ; im[i] = x[i].im * y[i].re - x[i].re * y[i].im ; } return 0; }
// 16x16 high-bitdepth D207 predictor. Only the `left` column is read
// (16 samples, loaded with aligned loads); `above` and `bd` are unused.
// Rows are emitted four at a time through d207_store_4x16.
void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i left_lo = _mm_load_si128((const __m128i *)left);
  const __m128i left_hi = _mm_load_si128((const __m128i *)(left + 8));
  // Replicate the last left sample (lane 7 of left_hi) into all lanes.
  const __m128i last_hi_half = _mm_shufflehi_epi16(left_hi, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_hi_half, last_hi_half);
  // The left column advanced by one and by two samples, padded with `last`.
  const __m128i plus1_lo = _mm_alignr_epi8(left_hi, left_lo, 2);
  const __m128i plus1_hi = _mm_alignr_epi8(last, left_hi, 2);
  const __m128i plus2_lo = _mm_alignr_epi8(left_hi, left_lo, 4);
  const __m128i plus2_hi = _mm_alignr_epi8(last, left_hi, 4);
  const __m128i three_tap_lo = avg3_epu16(&left_lo, &plus1_lo, &plus2_lo);
  const __m128i three_tap_hi = avg3_epu16(&left_hi, &plus1_hi, &plus2_hi);
  const __m128i two_tap_lo = _mm_avg_epu16(left_lo, plus1_lo);
  const __m128i two_tap_hi = _mm_avg_epu16(left_hi, plus1_hi);
  // Interleave the 2-tap and 3-tap averages.
  const __m128i seg0 = _mm_unpacklo_epi16(two_tap_lo, three_tap_lo);
  const __m128i seg1 = _mm_unpackhi_epi16(two_tap_lo, three_tap_lo);
  const __m128i seg2 = _mm_unpacklo_epi16(two_tap_hi, three_tap_hi);
  const __m128i seg3 = _mm_unpackhi_epi16(two_tap_hi, three_tap_hi);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &seg0, &seg1, &seg2);
  d207_store_4x16(&dst, stride, &seg1, &seg2, &seg3);
  d207_store_4x16(&dst, stride, &seg2, &seg3, &last);
  d207_store_4x16(&dst, stride, &seg3, &last, &last);
}
// 16x16 high-bitdepth D45 predictor. Only the `above` row is read
// (16 samples, aligned loads); `left` and `bd` are unused. The first row is
// the 3-tap average of the above row; the remaining 15 rows are produced by
// d45_store_16.
void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i above_lo = _mm_load_si128((const __m128i *)above);
  const __m128i above_hi = _mm_load_si128((const __m128i *)(above + 8));
  // Replicate the last above sample (lane 7 of above_hi) into all lanes.
  const __m128i last_hi_half = _mm_shufflehi_epi16(above_hi, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_hi_half, last_hi_half);
  // The above row advanced by one and by two samples, padded with `last`.
  const __m128i plus1_lo = _mm_alignr_epi8(above_hi, above_lo, 2);
  const __m128i plus1_hi = _mm_alignr_epi8(last, above_hi, 2);
  const __m128i plus2_lo = _mm_alignr_epi8(above_hi, above_lo, 4);
  const __m128i plus2_hi = _mm_alignr_epi8(last, above_hi, 4);
  __m128i row_lo = avg3_epu16(&above_lo, &plus1_lo, &plus2_lo);
  __m128i row_hi = avg3_epu16(&above_hi, &plus1_hi, &plus2_hi);
  int remaining;
  (void)left;
  (void)bd;

  // Row 0 is stored directly.
  _mm_store_si128((__m128i *)dst, row_lo);
  _mm_store_si128((__m128i *)(dst + 8), row_hi);
  dst += stride;

  // Rows 1..15: one d45_store_16 call per row.
  for (remaining = 15; remaining > 0; --remaining) {
    d45_store_16(&dst, stride, &row_lo, &row_hi, &last);
  }
}
static void TransformColor(const VP8LMultipliers* const m, uint32_t* argb_data, int num_pixels) { const __m128i mults_rb = _mm_set_epi16( CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_), CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); const __m128i mults_b2 = _mm_set_epi16( CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0); const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0 const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0 const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2 const __m128i H = _mm_add_epi8(G, D); // x dr x db const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db const __m128i out = _mm_sub_epi8(in, I); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C VP8LTransformColor_C(m, argb_data + i, num_pixels - i); }
static inline __m128i byteswap32( __m128i v ) { //rotate each 32 bit quantity by 16 bits // 0xB1 = 10110001 = 2,3,0,1 v = _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, 0xB1 ), 0xB1 ); return byteswap16( v ); }
/* Byte-shuffle for 2-byte elements.
   For every processed element index e, byte 0 of the element goes to
   dest[e] and byte 1 goes to dest[total_elements + e]. Elements are consumed
   16 at a time while the running index is below vectorizable_elements. */
static void shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements, const size_t total_elements) {
  enum { kTypeSize = 2 };
  size_t elem;

  for (elem = 0; elem < vectorizable_elements; elem += sizeof(__m128i)) {
    const uint8_t* const in = src + elem * kTypeSize;
    uint8_t* const out = dest + elem;
    __m128i split[2];
    int half;

    /* Each 16-byte load is rearranged so that its even-offset bytes fill
       the low 8 bytes and its odd-offset bytes fill the high 8 bytes. */
    for (half = 0; half < 2; half++) {
      __m128i v = _mm_loadu_si128((const __m128i*)(in + half * sizeof(__m128i)));
      __m128i halves_swapped;
      v = _mm_shufflelo_epi16(v, 0xd8);
      v = _mm_shufflehi_epi16(v, 0xd8);
      v = _mm_shuffle_epi32(v, 0xd8);
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi8(v, halves_swapped);
      v = _mm_shuffle_epi32(v, 0xd8);
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi16(v, halves_swapped);
      split[half] = _mm_shuffle_epi32(v, 0xd8);
    }

    /* Join the low halves (byte 0 of each element) and the high halves
       (byte 1 of each element), then store one vector per byte plane. */
    _mm_storeu_si128((__m128i*)out,
                     _mm_unpacklo_epi64(split[0], split[1]));
    _mm_storeu_si128((__m128i*)(out + total_elements),
                     _mm_unpackhi_epi64(split[0], split[1]));
  }
}
// Processes `width` pixels of `ptr` in place. When `inverse` is zero the
// row is handled two pixels at a time with SSE2; any remaining pixels, and
// the whole row when `inverse` is non-zero, go to WebPMultARGBRowC.
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int done = 0;
  if (!inverse) {
    const int kPixelsPerIter = 2;
    const __m128i kZero = _mm_setzero_si128();
    // Per pixel (four 16-bit lanes): the top lane is treated differently
    // from the three lower ones in each constant below.
    const __m128i kHalf = _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7,
                                        0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kDup = _mm_set_epi16(0, 0x0101, 0x0101, 0x0101,
                                       0, 0x0101, 0x0101, 0x0101);
    const __m128i kTopOne = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int simd_end = width & ~(kPixelsPerIter - 1);
    while (done < simd_end) {
      __m128i* const px = (__m128i*)&ptr[done];
      // Two pixels widened to eight 16-bit lanes.
      const __m128i wide = _mm_unpacklo_epi8(_mm_loadl_epi64(px), kZero);
      // Lane 3 of each pixel broadcast to all four lanes of that pixel.
      const __m128i top_lo = _mm_shufflelo_epi16(wide, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i top = _mm_shufflehi_epi16(top_lo, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i top_low3 = _mm_srli_epi64(top, 16);
      const __m128i factor_hi = _mm_mullo_epi16(top, kDup);
      const __m128i factor_lo = _mm_or_si128(top_low3, kTopOne);
      const __m128i part_hi = _mm_mulhi_epu16(wide, factor_hi);
      const __m128i part_lo = _mm_mullo_epi16(wide, factor_lo);
      const __m128i sum = _mm_adds_epu16(part_hi, part_lo);
      const __m128i rounded = _mm_adds_epu16(sum, kHalf);
      const __m128i scaled = _mm_srli_epi16(rounded, 8);
      _mm_storel_epi64(px, _mm_packus_epi16(scaled, kZero));
      done += kPixelsPerIter;
    }
  }
  width -= done;
  if (width > 0) WebPMultARGBRowC(ptr + done, width, inverse);
}
// Packs `len` interleaved 4-byte pixels into `out`.
// The channel pointers must address the same interleaved buffer, in one of
// two layouts (checked with assert):
//   - b, g, r, a at consecutive addresses: the pixels are copied verbatim;
//   - r, g, b, a at consecutive addresses: bytes 0 and 2 of every pixel are
//     exchanged while copying.
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
  if (g != r + 1) {
    // Input already has the byte order expected in `out`.
    assert(g == b + 1);
    assert(r == b + 2);
    assert(a == b + 3);
    memcpy(out, b, len * 4);
  } else {
    // RGBA input order. Need to swap R and B.
    int px = 0;
    const int simd_len = len & ~3;  // pixels handled four at a time
    const __m128i kByte0And2 = _mm_set1_epi32(0x00ff00ffu);
    assert(b == r + 2);
    assert(a == r + 3);
    while (px < simd_len) {
      const __m128i rgba = _mm_loadu_si128((const __m128i*)(r + 4 * px));
      const __m128i to_swap = _mm_and_si128(rgba, kByte0And2);    // bytes 0, 2
      const __m128i to_keep = _mm_andnot_si128(kByte0And2, rgba); // bytes 1, 3
      // Exchange the two 16-bit halves of every pixel.
      const __m128i swapped = _mm_shufflehi_epi16(
          _mm_shufflelo_epi16(to_swap, _MM_SHUFFLE(2, 3, 0, 1)),
          _MM_SHUFFLE(2, 3, 0, 1));
      _mm_storeu_si128((__m128i*)(out + px), _mm_or_si128(swapped, to_keep));
      px += 4;
    }
    // Remaining (len % 4) pixels, one at a time.
    while (px < len) {
      out[px] = MakeARGB32(a[4 * px], r[4 * px], g[4 * px], b[4 * px]);
      ++px;
    }
  }
}
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, const uint32_t* const src, int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend #define MK_CST_16(HI, LO) \ _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_)); const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0); #undef MK_CST_16 #undef CST const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0 const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0)); const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0 const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1 const __m128i E = _mm_add_epi8(in, D); // x r' x b' const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0 const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0 const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0 const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0 const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b'' const __m128i out = _mm_or_si128(J, A); _mm_storeu_si128((__m128i*)&dst[i], out); } // Fall-back to C-version for left-overs. if (i != num_pixels) { VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); } }
/* Byte-shuffle for 2-byte elements, working on whole 32-byte groups.
   With n = size / 32, group g contributes the even-offset bytes of
   src[32g .. 32g+31] to dest[16g .. 16g+15] and the odd-offset bytes to
   dest[16(n+g) .. 16(n+g)+15]. Bytes past the last whole group are ignored.
   dest is written with aligned 16-byte stores. */
static void shuffle2(uint8_t* dest, uint8_t* src, size_t size) {
  const size_t groups = size / (16*2);
  __m128i* const out = (__m128i *)dest;
  size_t g;

  for (g = 0; g < groups; g++) {
    const uint8_t* const in = src + g * (16*2);
    __m128i split[2];
    size_t half;

    /* Each 16-byte load is rearranged so that its even-offset bytes fill
       the low 8 bytes and its odd-offset bytes fill the high 8 bytes. */
    for (half = 0; half < 2; half++) {
      __m128i v = _mm_loadu_si128((__m128i*)(in + half*16));
      __m128i halves_swapped;
      v = _mm_shufflelo_epi16(v, 0xd8);
      v = _mm_shufflehi_epi16(v, 0xd8);
      v = _mm_shuffle_epi32(v, 0xd8);
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi8(v, halves_swapped);
      v = _mm_shuffle_epi32(v, 0xd8);
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi16(v, halves_swapped);
      split[half] = _mm_shuffle_epi32(v, 0xd8);
    }

    /* Low halves form the byte-0 plane, high halves the byte-1 plane. */
    _mm_store_si128(&out[g], _mm_unpacklo_epi64(split[0], split[1]));
    _mm_store_si128(&out[groups + g], _mm_unpackhi_epi64(split[0], split[1]));
  }
}
// Codegen test wrapper: returns _mm_shufflehi_epi16(A, 0).
// NOTE(review): the DAG- and ASM-prefixed comments inside the body look like
// FileCheck directives consumed by the test harness — keep them verbatim.
__m128i test_mm_shufflehi_epi16(__m128i A) {
  // DAG-LABEL: test_mm_shufflehi_epi16
  // DAG: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  //
  // ASM-LABEL: test_mm_shufflehi_epi16
  // ASM: pshufhw $0,
  return _mm_shufflehi_epi16(A, 0);
}
// 32x32 high-bitdepth D63 predictor. Only the `above` row is read
// (32 samples, aligned loads); `left` and `bd` are unused. Rows alternate
// between the 2-tap and 3-tap averages of the above row; after every pair of
// rows both averages advance by one sample, padded with the last above
// sample.
void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i a0 = _mm_load_si128((const __m128i *)above);
  const __m128i a1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i a2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i a3 = _mm_load_si128((const __m128i *)(above + 24));
  // Replicate the last above sample (lane 7 of a3) into all lanes.
  const __m128i last_hi_half = _mm_shufflehi_epi16(a3, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_hi_half, last_hi_half);
  // The above row advanced by one sample ...
  const __m128i p1_0 = _mm_alignr_epi8(a1, a0, 2);
  const __m128i p1_1 = _mm_alignr_epi8(a2, a1, 2);
  const __m128i p1_2 = _mm_alignr_epi8(a3, a2, 2);
  const __m128i p1_3 = _mm_alignr_epi8(last, a3, 2);
  // ... and by two samples.
  const __m128i p2_0 = _mm_alignr_epi8(a1, a0, 4);
  const __m128i p2_1 = _mm_alignr_epi8(a2, a1, 4);
  const __m128i p2_2 = _mm_alignr_epi8(a3, a2, 4);
  const __m128i p2_3 = _mm_alignr_epi8(last, a3, 4);
  __m128i odd0 = avg3_epu16(&a0, &p1_0, &p2_0);
  __m128i odd1 = avg3_epu16(&a1, &p1_1, &p2_1);
  __m128i odd2 = avg3_epu16(&a2, &p1_2, &p2_2);
  __m128i odd3 = avg3_epu16(&a3, &p1_3, &p2_3);
  __m128i even0 = _mm_avg_epu16(a0, p1_0);
  __m128i even1 = _mm_avg_epu16(a1, p1_1);
  __m128i even2 = _mm_avg_epu16(a2, p1_2);
  __m128i even3 = _mm_avg_epu16(a3, p1_3);
  int pairs_left;
  (void)left;
  (void)bd;

  // First 15 row pairs: store, then advance both averages by one sample.
  for (pairs_left = 15; pairs_left > 0; --pairs_left) {
    _mm_store_si128((__m128i *)dst, even0);
    _mm_store_si128((__m128i *)(dst + 8), even1);
    _mm_store_si128((__m128i *)(dst + 16), even2);
    _mm_store_si128((__m128i *)(dst + 24), even3);
    dst += stride;
    _mm_store_si128((__m128i *)dst, odd0);
    _mm_store_si128((__m128i *)(dst + 8), odd1);
    _mm_store_si128((__m128i *)(dst + 16), odd2);
    _mm_store_si128((__m128i *)(dst + 24), odd3);
    dst += stride;
    // Each register is updated before the one that feeds it is overwritten.
    even0 = _mm_alignr_epi8(even1, even0, 2);
    even1 = _mm_alignr_epi8(even2, even1, 2);
    even2 = _mm_alignr_epi8(even3, even2, 2);
    even3 = _mm_alignr_epi8(last, even3, 2);
    odd0 = _mm_alignr_epi8(odd1, odd0, 2);
    odd1 = _mm_alignr_epi8(odd2, odd1, 2);
    odd2 = _mm_alignr_epi8(odd3, odd2, 2);
    odd3 = _mm_alignr_epi8(last, odd3, 2);
  }

  // Last row pair: nothing left to advance afterwards.
  _mm_store_si128((__m128i *)dst, even0);
  _mm_store_si128((__m128i *)(dst + 8), even1);
  _mm_store_si128((__m128i *)(dst + 16), even2);
  _mm_store_si128((__m128i *)(dst + 24), even3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, odd0);
  _mm_store_si128((__m128i *)(dst + 8), odd1);
  _mm_store_si128((__m128i *)(dst + 16), odd2);
  _mm_store_si128((__m128i *)(dst + 24), odd3);
}
// Inverse-DCT row pass working on two rows at once: one held in xmm0 (using
// the coefficient table at esi) and one held in xmm4 (using the table at
// ecx). The statements for the two rows are interleaved, so their order must
// not be changed.
//
// NOTE(review): pmaddwd, paddd, psubd, psrad, movdqa and packssdw are not
// intrinsics; they are helpers defined outside this block. Presumably each
// mirrors the SSE2 instruction of the same name and updates its first
// argument in place, in which case the results end up in xmm0 and xmm4 —
// confirm against their definitions. M128_round_inv_row is likewise defined
// elsewhere.
//
// xmm1..xmm3 and xmm5..xmm7 are overwritten as scratch registers.
static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx,const uint8_t * const esi,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
 // 0xD8 selects 16-bit lanes in the order 0,2,1,3 within a 64-bit half.
 xmm0=_mm_shufflelo_epi16(xmm0, 0xD8 );
 // Broadcast 32-bit element 0 of the row.
 xmm1=_mm_shuffle_epi32( xmm0, 0 );
 pmaddwd (xmm1, esi);
 // Broadcast 32-bit element 1 of the row.
 xmm3=_mm_shuffle_epi32( xmm0, 0x55);
 xmm0=_mm_shufflehi_epi16( xmm0, 0xD8 );
 pmaddwd( xmm3, esi+32 );
 // Broadcast 32-bit elements 2 and 3 of the row.
 xmm2=_mm_shuffle_epi32( xmm0, 0xAA );
 xmm0=_mm_shuffle_epi32( xmm0, 0xFF );
 pmaddwd( xmm2, esi+16 );
 // Second row (xmm4): same lane re-ordering as above.
 xmm4=_mm_shufflehi_epi16( xmm4, 0xD8 );
 paddd (xmm1, M128_round_inv_row);
 xmm4=_mm_shufflelo_epi16 (xmm4, 0xD8 );
 pmaddwd (xmm0, esi+48 );
 xmm5=_mm_shuffle_epi32( xmm4, 0 );
 xmm6=_mm_shuffle_epi32( xmm4, 0xAA );
 pmaddwd (xmm5, ecx );
 paddd (xmm1, xmm2 );
 movdqa (xmm2, xmm1 );
 xmm7=_mm_shuffle_epi32( xmm4, 0x55 );
 pmaddwd (xmm6, ecx+16 );
 paddd (xmm0, xmm3 );
 xmm4=_mm_shuffle_epi32( xmm4, 0xFF );
 psubd (xmm2, xmm0 );
 pmaddwd (xmm7, ecx+32 );
 paddd (xmm0, xmm1 );
 psrad (xmm2, 12 );
 paddd (xmm5, M128_round_inv_row);
 pmaddwd (xmm4, ecx+48 );
 paddd (xmm5, xmm6 );
 movdqa (xmm6, xmm5 );
 psrad (xmm0, 12 );
 // 0x1B reverses the order of the four 32-bit elements.
 xmm2=_mm_shuffle_epi32( xmm2, 0x1B );
 packssdw (xmm0, xmm2 );
 paddd (xmm4, xmm7 );
 psubd (xmm6, xmm4 );
 paddd (xmm4, xmm5 );
 psrad (xmm6, 12 );
 psrad (xmm4, 12 );
 // 0x1B reverses the order of the four 32-bit elements.
 xmm6=_mm_shuffle_epi32( xmm6, 0x1B );
 packssdw (xmm4, xmm6 );
}
// // multiplies two complex vectors and returns the real and imaginary parts // as two 32 bit integers. // int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2, struct complex16* x, int len1, struct complex16* y, int len2 ) { const int wlen = 4;// sizeof(vcs) / sizeof(complex16); const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF); //0x0000FFFF0000FFFF0000FFFF0000FFFF const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000); const __m128i xmm4 = _mm_set1_epi32(0x00010000); for (int i = 0; i < len1 / wlen; i++){ /* vcs *vx = (vcs *)(x + wlen*i); vcs *vy = (vcs *)(y + wlen*i); vi *reout = (vi *)(re + wlen*i); vi *imout = (vi *)(im + wlen*i); vcs vs2 = conj0(*vy); vs2 = permutate_low<1, 0, 3, 2>(vs2); vs2 = permutate_high<1, 0, 3, 2>(vs2); *reout = (vcs)muladd(*vx, *vy); *imout = (vcs)muladd(*vx, vs2);*/ __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i)); __m128i my = _mm_loadu_si128((__m128i *)(y + wlen*i)); //__m128i ms1 = _mm_sign_epi16(my, conj); __m128i ms2 = _mm_xor_si128(my, xmm5); ms2 = _mm_add_epi32(ms2, xmm4); ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1)); ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1)); __m128i mre = _mm_madd_epi16(my, mx); __m128i mim = _mm_madd_epi16(ms2, mx); _mm_storeu_si128((__m128i *) (re + wlen*i), mre); _mm_storeu_si128((__m128i *) (im + wlen*i), mim); } for (int i = (len1 / wlen) * wlen; i < len1; i++){ re[i] = x[i].re * y[i].re + x[i].im * y[i].im ; im[i] = x[i].im * y[i].re - x[i].re * y[i].im ; }; return 0; }
// Subtracts, in place and modulo 256, byte 1 of every 32-bit pixel from
// bytes 0 and 2 of the same pixel. Four pixels are processed per iteration;
// the remaining pixels (possibly none) are handed to
// VP8LSubtractGreenFromBlueAndRed_C.
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  int pos = 0;
  while (pos + 4 <= num_pixels) {
    __m128i* const px = (__m128i*)&argb_data[pos];
    const __m128i argb = _mm_loadu_si128(px);
    // Bytes 1 and 3 moved down to bytes 0 and 2: 0 a 0 g
    const __m128i ag_low = _mm_srli_epi16(argb, 8);
    // Copy the green lane over the alpha lane: 0 g 0 g
    const __m128i gg_lo = _mm_shufflelo_epi16(ag_low, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i gg = _mm_shufflehi_epi16(gg_lo, _MM_SHUFFLE(2, 2, 0, 0));
    _mm_storeu_si128(px, _mm_sub_epi8(argb, gg));
    pos += 4;
  }
  // Finish off the tail with plain C.
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + pos, num_pixels - pos);
}
void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth, const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch) { #if defined(_M_ARM) // Ensure that this function is reported as not implemented for ARM builds because // the instructions below are not present for that architecture. UNIMPLEMENTED(); return; #else __m128i brMask = _mm_set1_epi32(0x00ff00ff); for (size_t z = 0; z < depth; z++) { for (size_t y = 0; y < height; y++) { const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch); uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch); size_t x = 0; // Make output writes aligned for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++) { uint32_t rgba = source[x]; dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00); } for (; x + 3 < width; x += 4) { __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x])); // Mask out g and a, which don't change __m128i gaComponents = _mm_andnot_si128(brMask, sourceData); // Mask out b and r __m128i brComponents = _mm_and_si128(sourceData, brMask); // Swap b and r __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); __m128i result = _mm_or_si128(gaComponents, brSwapped); _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result); } // Perform leftover writes for (; x < width; x++) { uint32_t rgba = source[x]; dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00); } } } #endif }
// 8x8 high-bitdepth D63 predictor. Only the `above` row is read (8 samples,
// aligned load); `left` and `bd` are unused. The 2-tap and 3-tap averages of
// the above row are handed to d63_store_4x8, called twice.
void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i row = _mm_load_si128((const __m128i *)above);
  // Replicate the last above sample (lane 7) into all lanes.
  const __m128i last_hi_half = _mm_shufflehi_epi16(row, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_hi_half, last_hi_half);
  // The above row advanced by one and by two samples, padded with `last`.
  const __m128i row_plus1 = _mm_alignr_epi8(last, row, 2);
  const __m128i row_plus2 = _mm_alignr_epi8(last, row, 4);
  __m128i three_tap = avg3_epu16(&row, &row_plus1, &row_plus2);
  __m128i two_tap = _mm_avg_epu16(row, row_plus1);
  int call;
  (void)left;
  (void)bd;
  for (call = 0; call < 2; ++call) {
    d63_store_4x8(&dst, stride, &two_tap, &three_tap, &last);
  }
}
// For every 32-bit pixel of src, adds byte 1 (modulo 256) to bytes 0 and 2
// and writes the result to dst. Four pixels are processed per iteration;
// left-over pixels go to VP8LAddGreenToBlueAndRed_C.
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int pos = 0;
  while (pos + 4 <= num_pixels) {
    const __m128i argb = _mm_loadu_si128((const __m128i*)&src[pos]);
    // Bytes 1 and 3 moved down to bytes 0 and 2: 0 a 0 g
    const __m128i ag_low = _mm_srli_epi16(argb, 8);
    // Copy the green lane over the alpha lane: 0 g 0 g
    const __m128i gg_lo = _mm_shufflelo_epi16(ag_low, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i gg = _mm_shufflehi_epi16(gg_lo, _MM_SHUFFLE(2, 2, 0, 0));
    _mm_storeu_si128((__m128i*)&dst[pos], _mm_add_epi8(argb, gg));
    pos += 4;
  }
  // Finish off the tail with plain C.
  if (pos != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + pos, num_pixels - pos, dst + pos);
  }
}
/* Copies `size` bytes of 4-byte pixels from source to dest in 16-byte
   groups, exchanging bytes 0 and 2 of every pixel. Only size / 16 whole
   groups are processed; the trailing size % 16 bytes are not written.
   The aligned load and the streaming store both require 16-byte aligned
   addresses. */
void unpack_rgba8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	const Uint32 groups = size / 16;
	Uint32 g;

	for (g = 0; g < groups; g++)
	{
		const __m128i pixels = _mm_load_si128((__m128i*)&source[g * 16]);
		/* Bytes 0 and 2 of each pixel, to be exchanged. */
		__m128i moving = _mm_and_si128(pixels, _mm_set1_epi16(0x00FF));
		/* Bytes 1 and 3 of each pixel, kept in place. */
		const __m128i kept = _mm_and_si128(pixels, _mm_set1_epi16(0xFF00));

		/* Swap the two 16-bit halves of every 32-bit pixel. */
		moving = _mm_shufflelo_epi16(moving, _MM_SHUFFLE(2, 3, 0, 1));
		moving = _mm_shufflehi_epi16(moving, _MM_SHUFFLE(2, 3, 0, 1));

		_mm_stream_si128((__m128i*)&dest[g * 16], _mm_or_si128(moving, kept));
	}
}
// 32x32 high-bitdepth D207 predictor. Only the `left` column is read
// (32 samples, aligned loads); `above` and `bd` are unused. The interleaved
// 2-tap / 3-tap averages of the left column are handed to d207_store_4x32
// through a five-register window that slides by one register per call and
// is padded with the replicated last left sample.
void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i l0 = _mm_load_si128((const __m128i *)left);
  const __m128i l1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i l2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i l3 = _mm_load_si128((const __m128i *)(left + 24));
  // Replicate the last left sample (lane 7 of l3) into all lanes.
  const __m128i last_hi_half = _mm_shufflehi_epi16(l3, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_hi_half, last_hi_half);
  // The left column advanced by one sample ...
  const __m128i p1_0 = _mm_alignr_epi8(l1, l0, 2);
  const __m128i p1_1 = _mm_alignr_epi8(l2, l1, 2);
  const __m128i p1_2 = _mm_alignr_epi8(l3, l2, 2);
  const __m128i p1_3 = _mm_alignr_epi8(last, l3, 2);
  // ... and by two samples.
  const __m128i p2_0 = _mm_alignr_epi8(l1, l0, 4);
  const __m128i p2_1 = _mm_alignr_epi8(l2, l1, 4);
  const __m128i p2_2 = _mm_alignr_epi8(l3, l2, 4);
  const __m128i p2_3 = _mm_alignr_epi8(last, l3, 4);
  const __m128i three_tap0 = avg3_epu16(&l0, &p1_0, &p2_0);
  const __m128i three_tap1 = avg3_epu16(&l1, &p1_1, &p2_1);
  const __m128i three_tap2 = avg3_epu16(&l2, &p1_2, &p2_2);
  const __m128i three_tap3 = avg3_epu16(&l3, &p1_3, &p2_3);
  const __m128i two_tap0 = _mm_avg_epu16(l0, p1_0);
  const __m128i two_tap1 = _mm_avg_epu16(l1, p1_1);
  const __m128i two_tap2 = _mm_avg_epu16(l2, p1_2);
  const __m128i two_tap3 = _mm_avg_epu16(l3, p1_3);
  // Interleave the 2-tap and 3-tap averages.
  const __m128i seg0 = _mm_unpacklo_epi16(two_tap0, three_tap0);
  const __m128i seg1 = _mm_unpackhi_epi16(two_tap0, three_tap0);
  const __m128i seg2 = _mm_unpacklo_epi16(two_tap1, three_tap1);
  const __m128i seg3 = _mm_unpackhi_epi16(two_tap1, three_tap1);
  const __m128i seg4 = _mm_unpacklo_epi16(two_tap2, three_tap2);
  const __m128i seg5 = _mm_unpackhi_epi16(two_tap2, three_tap2);
  const __m128i seg6 = _mm_unpacklo_epi16(two_tap3, three_tap3);
  const __m128i seg7 = _mm_unpackhi_epi16(two_tap3, three_tap3);
  (void)above;
  (void)bd;
  d207_store_4x32(&dst, stride, &seg0, &seg1, &seg2, &seg3, &seg4);
  d207_store_4x32(&dst, stride, &seg1, &seg2, &seg3, &seg4, &seg5);
  d207_store_4x32(&dst, stride, &seg2, &seg3, &seg4, &seg5, &seg6);
  d207_store_4x32(&dst, stride, &seg3, &seg4, &seg5, &seg6, &seg7);
  d207_store_4x32(&dst, stride, &seg4, &seg5, &seg6, &seg7, &last);
  d207_store_4x32(&dst, stride, &seg5, &seg6, &seg7, &last, &last);
  d207_store_4x32(&dst, stride, &seg6, &seg7, &last, &last, &last);
  d207_store_4x32(&dst, stride, &seg7, &last, &last, &last, &last);
}
// Transforms the eight 16-bit lanes of *row in place in three stages.
// Each stage adds a permuted copy of the row to a copy whose lanes have been
// multiplied by +1 or -1; the sign pattern alternates at 64-bit, then
// 32-bit, then 16-bit granularity.
// NOTE(review): the exact lane order selected by KVZ_PERMUTE is defined
// outside this block — presumably each stage pairs a lane with its partner
// at the matching granularity; confirm against the macro.
static INLINE void hor_transform_row_avx2(__m128i* row){

  const __m128i plus = _mm_set1_epi16(1);
  const __m128i minus = _mm_set1_epi16(-1);
  __m128i v = *row;
  __m128i signs;
  __m128i partner;

  // Stage 1: +1 in the low 64 bits, -1 in the high 64 bits.
  signs = _mm_unpacklo_epi64(plus, minus);
  partner = _mm_shuffle_epi32(v, KVZ_PERMUTE(2, 3, 0, 1));
  v = _mm_add_epi16(_mm_sign_epi16(v, signs), partner);

  // Stage 2: signs alternate every 32 bits.
  signs = _mm_unpacklo_epi32(plus, minus);
  partner = _mm_shuffle_epi32(v, KVZ_PERMUTE(1, 0, 3, 2));
  v = _mm_add_epi16(_mm_sign_epi16(v, signs), partner);

  // Stage 3: signs alternate every 16 bits.
  signs = _mm_unpacklo_epi16(plus, minus);
  partner = _mm_shufflelo_epi16(v, KVZ_PERMUTE(1,0,3,2));
  partner = _mm_shufflehi_epi16(partner, KVZ_PERMUTE(1,0,3,2));
  v = _mm_add_epi16(_mm_sign_epi16(v, signs), partner);

  *row = v;
}
// // This was v_mul_complex16_shift but I changed the name for consistency with v_conj_mul // and the fact that the old v_mul_complex16 was never called // FORCE_INLINE int __ext_v_mul_complex16(struct complex16* out, int lenout, struct complex16* x, int len1, struct complex16* y, int len2, int shift) { const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16); const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF); const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000); const __m128i xmm4 = _mm_set1_epi32(0x00010000); __m128i* Xs = (__m128i*) x; __m128i* Ys = (__m128i*) y; __m128i* Outs = (__m128i*) out; for (int i = 0; i < len1 / wlen; i++){ __m128i mx = _mm_loadu_si128(&Xs[i]); __m128i my = _mm_loadu_si128(&Ys[i]); __m128i ms1 = _mm_xor_si128(mx, xmm5); ms1 = _mm_add_epi32(ms1, xmm4); __m128i ms2 = _mm_shufflehi_epi16(mx, _MM_SHUFFLE(2, 3, 0, 1)); ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1)); __m128i mre = _mm_srai_epi32(_mm_madd_epi16(ms1, my), shift); __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, my), shift); mre = _mm_and_si128(mre,xmm6); mim = _mm_and_si128(mim,xmm6); mim = _mm_slli_epi32(mim,0x10); _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim)); } for (int i = (len1 / wlen) * wlen; i < len1; i++){ out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift; out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift; } return 0; }
void Image::loadRGBAUByteDataSSE2(GLsizei width, GLsizei height, int inputPitch, const void *input, size_t outputPitch, void *output) const { const unsigned int *source = NULL; unsigned int *dest = NULL; __m128i brMask = _mm_set1_epi32(0x00ff00ff); for (int y = 0; y < height; y++) { source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputPitch); dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch); int x = 0; // Make output writes aligned for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++) { unsigned int rgba = source[x]; dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00); } for (; x + 3 < width; x += 4) { __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x])); // Mask out g and a, which don't change __m128i gaComponents = _mm_andnot_si128(brMask, sourceData); // Mask out b and r __m128i brComponents = _mm_and_si128(sourceData, brMask); // Swap b and r __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1)); __m128i result = _mm_or_si128(gaComponents, brSwapped); _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result); } // Perform leftover writes for (; x < width; x++) { unsigned int rgba = source[x]; dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00); } } }
// 16x16 high-bitdepth D63 predictor. Only the `above` row is read
// (16 samples, aligned loads); `left` and `bd` are unused. Rows alternate
// between the 2-tap and 3-tap averages of the above row; after every pair of
// rows both averages advance by one sample, padded with the last above
// sample.
void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i a0 = _mm_load_si128((const __m128i *)above);
  const __m128i a1 = _mm_load_si128((const __m128i *)(above + 8));
  // Replicate the last above sample (lane 7 of a1) into all lanes.
  const __m128i last_hi_half = _mm_shufflehi_epi16(a1, 0xff);
  const __m128i last = _mm_unpackhi_epi64(last_hi_half, last_hi_half);
  // The above row advanced by one and by two samples.
  const __m128i p1_0 = _mm_alignr_epi8(a1, a0, 2);
  const __m128i p1_1 = _mm_alignr_epi8(last, a1, 2);
  const __m128i p2_0 = _mm_alignr_epi8(a1, a0, 4);
  const __m128i p2_1 = _mm_alignr_epi8(last, a1, 4);
  __m128i odd0 = avg3_epu16(&a0, &p1_0, &p2_0);
  __m128i odd1 = avg3_epu16(&a1, &p1_1, &p2_1);
  __m128i even0 = _mm_avg_epu16(a0, p1_0);
  __m128i even1 = _mm_avg_epu16(a1, p1_1);
  int pairs_left;
  (void)left;
  (void)bd;

  // First 7 row pairs: store, then advance both averages by one sample.
  for (pairs_left = 7; pairs_left > 0; --pairs_left) {
    _mm_store_si128((__m128i *)dst, even0);
    _mm_store_si128((__m128i *)(dst + 8), even1);
    dst += stride;
    _mm_store_si128((__m128i *)dst, odd0);
    _mm_store_si128((__m128i *)(dst + 8), odd1);
    dst += stride;
    // Each register is updated before the one that feeds it is overwritten.
    even0 = _mm_alignr_epi8(even1, even0, 2);
    even1 = _mm_alignr_epi8(last, even1, 2);
    odd0 = _mm_alignr_epi8(odd1, odd0, 2);
    odd1 = _mm_alignr_epi8(last, odd1, 2);
  }

  // Last row pair: nothing left to advance afterwards.
  _mm_store_si128((__m128i *)dst, even0);
  _mm_store_si128((__m128i *)(dst + 8), even1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, odd0);
  _mm_store_si128((__m128i *)(dst + 8), odd1);
}
// Simple quantization static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], int n, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(2047); const __m128i zero = _mm_set1_epi16(0); __m128i sign0, sign8; __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) sign0 = _mm_srai_epi16(in0, 15); sign8 = _mm_srai_epi16(in8, 15); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); // if (coeff > 2047) coeff = 2047 coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); // out = (coeff * iQ + B) >> QFIX; { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); __m128i 
coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // expand bias from 16b to 32b __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = (coeff * iQ + B) >> QFIX; out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); // if (coeff <= mtx->zthresh_) {in=0; out=0;} { __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); in0 = _mm_and_si128(in0, cmp0); in8 = _mm_and_si128(in8, cmp8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); out0 = _mm_and_si128(out0, cmp0); out8 = _mm_and_si128(out8, cmp8); } // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. 
{ __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not { int32_t tmp[4]; _mm_storeu_si128((__m128i*)tmp, packed_out); if (n) { tmp[0] &= ~0xff; } return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); } }
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags) { if (!pcszText || !*pcszText) return; // Find out actual size of text int nChars = _tcslen(pcszText); uFlags |= DT_NOCLIP; int iX = rc.left; int iY = rc.top; int iXW = (rc.right - iX); int iYH = (rc.bottom - iY); RECT rcMin = rc; if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) { int iMinXW = rcMin.right - rcMin.left; int iMinYH = rcMin.bottom - rcMin.top; if (iMinXW < iXW) { if (uFlags & DT_CENTER) { iX += (iXW - iMinXW)/2; uFlags &= ~DT_CENTER; } else if (uFlags & DT_RIGHT) { iX += (iXW - iMinXW); uFlags &= ~DT_RIGHT; } iXW = iMinXW; } if (iMinYH < iYH) { if (uFlags & DT_SINGLELINE) { if (uFlags & DT_VCENTER) { iY += (iYH - iMinYH)/2; uFlags &= ~DT_VCENTER; } else if (uFlags & DT_BOTTOM) { iY += (iYH - iMinYH); uFlags &= ~DT_BOTTOM; } } iYH = iMinYH; } } iXW += 2; // NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look bette! 
iYH += 2; // Ensure we have a big enough DIB to draw the text to if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH); if (!hbmpTextDIB) return; // Select color ieBGRA clr; switch (eColor) { case eFileName: clr = clrFileName; break; case eComment: clr = clrComment; break; case eFileInfo: clr = clrFileInfo; break; default: clr = ieBGRA(0,0,0); break; } clr.A = 0xFF - clrBkg.A; // Draw the text to in-memory DIB RECT rcTextDIB = { 0, 0, iXW, iYH }; FillRect(hdcTextDIB, &rcTextDIB, hbrBkg); rcTextDIB.left++; rcTextDIB.top++; DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags); // Modify DIB: #ifndef __X64__ if (g_bSSE2) #endif { __m128i r0, r1, r2, r3, r4, r5, r6, r7; r7 = _mm_setzero_si128(); // 0 r6 = _mm_set1_epi32(clr.dw); // CA CR CG CB CA CR CG CB CA CR CG CB CA CR CG CB r6 = _mm_unpacklo_epi8(r7, r6); // CA<<8 CR<<8 CG<<8 CB<<8 CA<<8 CR<<8 CG<<8 CB<<8 r5 = _mm_set1_epi16(1); // 1 1 1 1 1 1 1 1 r4 = _mm_set1_epi32(0xFF); // FF FF FF FF r3 = _mm_set1_epi32(clrBkg.dw); // DA 0 0 0 DA 0 0 0 DA 0 0 0 DA 0 0 0 ieBGRA *py = pTextDIB; for (int y = iYH; y--; py += iTextDIBXW) { ieBGRA *px = py; for (int x_4 = (iXW+3)>>2; x_4--; px += 4) { r0 = _mm_load_si128((__m128i *)px); r1 = r0; r2 = r0; // X3 R3 G3 B3 X2 R2 G2 B2 X1 R1 G1 B1 X0 R0 G0 B0 r0 = _mm_srli_epi32(r0, 16); // 0 0 X3 R3 0 0 X2 R2 0 0 X1 R1 0 0 X0 R0 r1 = _mm_srli_epi32(r1, 8); // 0 X3 R3 G3 0 X2 R2 G2 0 X1 R1 G1 0 X0 R0 G0 r0 = _mm_max_epu8(r0, r2); r0 = _mm_max_epu8(r0, r1); // x x x A3 x x x A2 x x x A1 x x x A0 r0 = _mm_and_si128(r0, r4); // 0 A3 0 A2 0 A1 0 A0 r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0)); r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0)); // A3 A3 A2 A2 A1 A1 A0 A0 r1 = r0; r0 = _mm_unpacklo_epi32(r0, r0); // A1 A1 A1 A1 A0 A0 A0 A0 r1 = _mm_unpackhi_epi32(r1, r1); // A3 A3 A3 A3 A2 A2 A2 A2 r0 = _mm_add_epi16(r0, r5); // A1' A1' A1' A1' A0' A0' A0' A0' r1 = _mm_add_epi16(r1, r5); // A3' A3' A3' A3' A2' A2' A2' A2' r0 = _mm_mulhi_epu16(r0, r6); // xA1" xR1 
xG1 xB1 xA0" xR0 xG0 xB0 r1 = _mm_mulhi_epu16(r1, r6); // xA3" xR3 xG3 xB3 xA2" xR2 xG2 xB2 r0 = _mm_packus_epi16(r0, r1); // xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0 r0 = _mm_adds_epu8(r0, r3); // xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0 _mm_store_si128((__m128i *)px, r0); } } } #ifndef __X64__ else {
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352); const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. 
-> 00 01 02 03 00 00 00 00 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Unpack and shuffle // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); // 00 01 10 11 02 03 12 13 // 20 21 30 31 22 23 32 33 const __m128i shuf01_p = _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); const __m128i shuf23_p = _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 const __m128i a01 = _mm_add_epi16(s01, s32); const __m128i a32 = _mm_sub_epi16(s01, s32); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ] const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); v01 = _mm_unpacklo_epi32(s_lo, s_hi); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. 
} // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
/* Alpha-composites two images of 32-bit pixels, four pixels per SSE2
 * iteration.
 *
 * For every 8-bit channel of every pixel the vector path stores the low
 * eight bits of
 *     (((A1 + 1) * (S1 - S2)) >> 8) + S2
 * where S1 / S2 are the channel values from pSrc1 / pSrc2 and A1 is the top
 * byte of the pSrc1 pixel.
 *
 * The *Step arguments are row strides in bytes.  Images narrower than four
 * pixels, rows whose destination is not 4-byte aligned, and the unaligned
 * head / leftover tail of each row are delegated to generic->alphaComp_argb.
 * Returns PRIMITIVES_SUCCESS, or the first failing status reported by the
 * generic routine.
 */
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1, UINT32 src1Step,
    const BYTE* pSrc2, UINT32 src2Step,
    BYTE* pDst, UINT32 dstStep,
    UINT32 width, UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*) pSrc1;
	const UINT32* sptr2 = (const UINT32*) pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 row;
	__m128i zero, one, lowByteMask;

	if ((width <= 0) || (height <= 0))
		return PRIMITIVES_SUCCESS;

	/* Pointless if too small. */
	if (width < 4)
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
		                               pDst, dstStep, width, height);

	dptr = (UINT32*) pDst;

	/* Distance, in pixels, from the end of one row to the start of the next. */
	linebytes = width * sizeof(UINT32);
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump = (dstStep - linebytes) / sizeof(UINT32);

	zero = _mm_set1_epi32(0);
	one = _mm_set1_epi16(1);
	lowByteMask = _mm_set1_epi16(0x00ffU);

	for (row = 0; row < height; ++row)
	{
		const ULONG_PTR misalign = (ULONG_PTR) dptr & 0x0f;
		int remaining = width;
		int quads;
		int leadIn;

		/* Pixels to process the slow way before dptr reaches a 16-byte
		 * boundary: offsets 0, 4, 8, 12 need 0, 3, 2, 1 pixels.  Any other
		 * offset can never reach a boundary in whole pixels, so the entire
		 * row is done the slow way. */
		if (misalign & 0x03)
			leadIn = width;
		else
			leadIn = (int)((16 - misalign) >> 2) & 3;

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
			                                 (const BYTE*) sptr2, src2Step,
			                                 (BYTE*) dptr, dstStep, leadIn, 1);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr += leadIn;
			remaining -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		quads = remaining >> 2;
		remaining -= quads << 2;

		for (; quads > 0; --quads)
		{
			__m128i s1, s2;
			__m128i s1Hi, s2Hi, outHi;
			__m128i s1Lo, s2Lo, outLo;

			s1 = LOAD_SI128(sptr1);
			sptr1 += 4;
			s2 = LOAD_SI128(sptr2);
			sptr2 += 4;

			/* Upper two pixels, each channel widened to 16 bits. */
			s1Hi = _mm_unpackhi_epi8(s1, zero);
			s2Hi = _mm_unpackhi_epi8(s2, zero);
			/* Broadcast word 3 (the top byte) of each source-1 pixel to
			 * all four of that pixel's lanes. */
			outHi = _mm_shufflelo_epi16(s1Hi, 0xff);
			outHi = _mm_shufflehi_epi16(outHi, 0xff);
			/* (alpha + 1) * (s1 - s2), low word kept, then >> 8 and + s2. */
			outHi = _mm_adds_epi16(outHi, one);
			outHi = _mm_mullo_epi16(outHi, _mm_subs_epi16(s1Hi, s2Hi));
			outHi = _mm_srai_epi16(outHi, 8);
			outHi = _mm_adds_epi16(outHi, s2Hi);

			/* Lower two pixels, same recipe. */
			s1Lo = _mm_unpacklo_epi8(s1, zero);
			s2Lo = _mm_unpacklo_epi8(s2, zero);
			outLo = _mm_shufflelo_epi16(s1Lo, 0xff);
			outLo = _mm_shufflehi_epi16(outLo, 0xff);
			outLo = _mm_adds_epi16(outLo, one);
			outLo = _mm_mullo_epi16(outLo, _mm_subs_epi16(s1Lo, s2Lo));
			outLo = _mm_srai_epi16(outLo, 8);
			outLo = _mm_adds_epi16(outLo, s2Lo);

			/* Must mask off remainders or pack gets confused: only the low
			 * byte of each lane is meaningful, and the saturating pack
			 * would otherwise clamp. */
			outHi = _mm_and_si128(outHi, lowByteMask);
			outLo = _mm_and_si128(outLo, lowByteMask);

			/* dptr is 16-byte aligned here thanks to the lead-in. */
			_mm_store_si128((__m128i*) dptr, _mm_packus_epi16(outLo, outHi));
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (remaining)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
			                                 (const BYTE*) sptr2, src2Step,
			                                 (BYTE*) dptr, dstStep, remaining, 1);

			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += remaining;
			sptr2 += remaining;
			dptr += remaining;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}