static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
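// The trailing-pixel fallback above calls into libwebp's C path. As a hedged
// sketch of what that per-pixel conversion amounts to (the real
// VP8LConvertBGRAToRGBA_C lives in libwebp; this stand-in only illustrates
// the byte swap, assuming little-endian uint32 pixels):
static void ConvertBGRAToRGBA_C_sketch(const uint32_t* src, int num_pixels,
                                       uint8_t* dst) {
  for (int i = 0; i < num_pixels; ++i) {
    const uint32_t argb = src[i];    // little-endian memory order: B G R A
    *dst++ = (uint8_t)(argb >> 16);  // R
    *dst++ = (uint8_t)(argb >> 8);   // G
    *dst++ = (uint8_t)(argb >> 0);   // B
    *dst++ = (uint8_t)(argb >> 24);  // A
  }
}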
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
  __m128i T0 = _mm_unpacklo_epi64(B0, B1);
  __m128i T1 = _mm_unpacklo_epi64(B2, B3);
  __m128i T2 = _mm_unpackhi_epi64(B0, B1);
  __m128i T3 = _mm_unpackhi_epi64(B2, B3);
  T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
  T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
  T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
  T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
  T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
  T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
  T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
  T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
  T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
  T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
  T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
  T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
  B0 = _mm_unpacklo_epi32(T0, T1);
  B1 = _mm_unpackhi_epi32(T0, T1);
  B2 = _mm_unpacklo_epi32(T2, T3);
  B3 = _mm_unpackhi_epi32(T2, T3);
}
void Coefs(unsigned char *current_part_ptr, int current_part_stride,
           unsigned char *ref_part_ptr, int ref_part_stride,
           unsigned char *coef_buf, int n) {
  static const unsigned short c_32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
  int i;
  __m128i v_row0_0, v_row0_1;
  __m128i v_temp_0, v_temp_1;
  __m128i v_result;
  __m128i vZero = _mm_setzero_si128();
  __m128i v_32 = _mm_loadu_si128((__m128i*)c_32);  // rounding constant
  __m128i* coef_ptr = (__m128i*)coef_buf;

  v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
  v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
  v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr + 8), 3);
  ref_part_ptr += ref_part_stride;
  // row0: 0 1 2 3 4 5 6 7
  // row1: 2 3 4 5 6 7 8 9
  v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
  v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

  for (i = 0; i < n; i++) {
    v_row0_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[0]);
    v_row0_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[1]);
    v_result = v_32;
    v_result = _mm_add_epi16(v_result, v_row0_0);
    v_result = _mm_add_epi16(v_result, v_row0_1);

    v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
    v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
    v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr + 8), 3);
    ref_part_ptr += ref_part_stride;
    v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
    v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

    v_temp_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[2]);
    v_temp_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[3]);
    v_result = _mm_add_epi16(v_result, v_temp_0);
    v_result = _mm_add_epi16(v_result, v_temp_1);
    v_result = _mm_srli_epi16(v_result, 6);  // (sum + 32) >> 6

    // NB: aligned store; current_part_ptr must be 16-byte aligned.
    _mm_store_si128((__m128i*)current_part_ptr, v_result);
    current_part_ptr += current_part_stride;
  }
}
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);            // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);           // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);   // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);           // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);             // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);           // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
    int i = 0;
    const int len_max = len & ~3;  // max length processed in main loop
    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
    assert(b == r + 2);
    assert(a == r + 3);
    for (; i < len_max; i += 4) {
      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128i F = _mm_or_si128(E, C);
      _mm_storeu_si128((__m128i*)(out + i), F);
    }
    for (; i < len; ++i) {
      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
    }
  } else {
    assert(g == b + 1);
    assert(r == b + 2);
    assert(a == b + 3);
    memcpy(out, b, len * 4);
  }
}
inline COLORREF MakeColor2(COLORREF a, COLORREF b, int alpha) {
#ifdef USE_SSE2
  // (a * alpha + b * (256 - alpha)) / 256 -> ((a - b) * alpha) / 256 + b
  __m128i xmm0, xmm1, xmm2, xmm3;
  COLORREF color;
  xmm0 = _mm_setzero_si128();
  xmm1 = _mm_cvtsi32_si128( a );
  xmm2 = _mm_cvtsi32_si128( b );
  xmm3 = _mm_cvtsi32_si128( alpha );

  xmm1 = _mm_unpacklo_epi8( xmm1, xmm0 );  // a:a:a:a
  xmm2 = _mm_unpacklo_epi8( xmm2, xmm0 );  // b:b:b:b
  xmm3 = _mm_shufflelo_epi16( xmm3, 0 );   // alpha:alpha:alpha:alpha

  xmm1 = _mm_sub_epi16( xmm1, xmm2 );      // (a - b)
  xmm1 = _mm_mullo_epi16( xmm1, xmm3 );    // (a - b) * alpha
  xmm1 = _mm_srli_epi16( xmm1, 8 );        // ((a - b) * alpha) / 256
  xmm1 = _mm_add_epi8( xmm1, xmm2 );       // ((a - b) * alpha) / 256 + b
  xmm1 = _mm_packus_epi16( xmm1, xmm0 );
  color = _mm_cvtsi128_si32( xmm1 );

  return color;
#else
  const int ap = alpha;
  const int bp = 256 - ap;
  BYTE valR = (BYTE)((GetRValue(a) * ap + GetRValue(b) * bp) / 256);
  BYTE valG = (BYTE)((GetGValue(a) * ap + GetGValue(b) * bp) / 256);
  BYTE valB = (BYTE)((GetBValue(a) * ap + GetBValue(b) * bp) / 256);
  return RGB(valR, valG, valB);
#endif
}
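// Hedged usage sketch (the function name below is illustrative): alpha is a
// 0..256 blend factor, so alpha = 128 gives the per-channel midpoint, up to
// the /256 truncation. Blending pure red over pure blue should yield
// RGB(127, 0, 127) on both the SSE2 and scalar paths.
static COLORREF MakeColor2_halfblend_example(void) {
  return MakeColor2(RGB(255, 0, 0), RGB(0, 0, 255), 128);  // RGB(127, 0, 127)
}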
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void shuffle2(uint8_t* dest, uint8_t* src, size_t size) {
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[2], xmm1[2];

  numof16belem = size / (16*2);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
    /* Fetch and transpose bytes, words and double words in groups of
       32 bytes */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
    }
    /* Store the result vectors */
    for (k = 0; k < 2; k++) {
      ((__m128i *)dest)[k*numof16belem+i] = xmm1[k];
    }
  }
}
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
                          const size_t vectorizable_elements,
                          const size_t total_elements) {
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (32 bytes) then transpose bytes, words and
       double words. */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
    }
  }
}
//
// multiplies two complex vectors and returns the real and imaginary parts
// as two 32 bit integers.
//
FORCE_INLINE
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
                                     struct complex16* x, int len1,
                                     struct complex16* y, int len2) {
  const unum8 wlen = 4;  // sizeof(vcs) / sizeof(complex16);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Res = (__m128i*) re;
  __m128i* Ims = (__m128i*) im;

  for (int i = 0; i < len1 / wlen; i++) {
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);

    __m128i ms2 = _mm_xor_si128(my, xmm5);
    ms2 = _mm_add_epi32(ms2, xmm4);
    ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    _mm_storeu_si128(&Res[i], _mm_madd_epi16(my, mx));
    _mm_storeu_si128(&Ims[i], _mm_madd_epi16(ms2, mx));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++) {
    re[i] = x[i].re * y[i].re + x[i].im * y[i].im;
    im[i] = x[i].im * y[i].re - x[i].re * y[i].im;
  }

  return 0;
}
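// The xor/add pair above is two's-complement negation of each imaginary half:
// xor with 0xFFFF0000 flips the bits of the high 16-bit lane, and adding
// 0x00010000 supplies the +1 that completes -im, while the low (real) half
// passes through untouched. A hedged scalar view of one 32-bit (re, im) pair:
static inline uint32_t conj_pair_sketch(uint32_t pair) {
  // high 16 bits: ~im + 1 == -im; low 16 bits (re) unchanged
  return (pair ^ 0xFFFF0000u) + 0x00010000u;
}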
static void TransformColor(const VP8LMultipliers* const m, uint32_t* argb_data,
                           int num_pixels) {
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);          // r 0   b   0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);   // x db2 0   0
    const __m128i G = _mm_srli_epi32(F, 16);          // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);             // x dr  x  db
    const __m128i I = _mm_and_si128(H, mask_rb);      // 0 dr  0  db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
static inline __m128i byteswap32(__m128i v) {
  // rotate each 32 bit quantity by 16 bits
  // 0xB1 = 10110001 = 2,3,0,1
  v = _mm_shufflehi_epi16(_mm_shufflelo_epi16(v, 0xB1), 0xB1);
  return byteswap16(v);
}
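// byteswap16 is referenced but not defined here. Under the usual contract the
// name implies -- swap the two bytes inside each 16-bit lane -- it can be
// written with a pair of 16-bit shifts (a sketch, assuming that contract):
static inline __m128i byteswap16(__m128i v) {
  return _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
}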
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
__m128i test_mm_shufflelo_epi16(__m128i A) {
  // DAG-LABEL: test_mm_shufflelo_epi16
  // DAG: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
  //
  // ASM-LABEL: test_mm_shufflelo_epi16
  // ASM: pshuflw $0,
  return _mm_shufflelo_epi16(A, 0);
}
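// As the expected shufflevector mask spells out, an immediate of 0 replicates
// word 0 across the four low 16-bit lanes and leaves the four high lanes
// untouched. A small standalone check of that semantics (illustrative):
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const __m128i v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
  short out[8];
  _mm_storeu_si128((__m128i*)out, _mm_shufflelo_epi16(v, 0));
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // 10 10 10 10 14 15 16 17
  return 0;
}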
void Convert444to420(LPBYTE input, int width, int pitch, int height,
                     int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
  LPBYTE lumPlane = output[0];
  LPBYTE uPlane = output[1];
  LPBYTE vPlane = output[2];
  int chrPitch = width >> 1;

  if (bSSE2Available) {
    __m128i lumMask = _mm_set1_epi32(0x0000FF00);
    __m128i uvMask = _mm_set1_epi16(0x00FF);

    for (int y = startY; y < endY; y += 2) {
      int yPos = y * pitch;
      int chrYPos = ((y >> 1) * chrPitch);
      int lumYPos = y * width;

      for (int x = 0; x < width; x += 4) {
        LPBYTE lpImagePos = input + yPos + (x * 4);
        int chrPos = chrYPos + (x >> 1);
        int lumPos0 = lumYPos + x;
        int lumPos1 = lumPos0 + width;

        __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
        __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos + pitch));

        // pack lum vals
        {
          __m128i packVal = _mm_packs_epi32(
              _mm_srli_si128(_mm_and_si128(line1, lumMask), 1),
              _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
          packVal = _mm_packus_epi16(packVal, packVal);

          *(LPUINT)(lumPlane + lumPos0) = packVal.m128i_u32[0];
          *(LPUINT)(lumPlane + lumPos1) = packVal.m128i_u32[1];
        }

        // do average, pack UV vals
        {
          __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask),
                                         _mm_and_si128(line2, uvMask));
          __m128i avgVal = _mm_srai_epi16(
              _mm_add_epi64(addVal,
                            _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))),
              2);
          avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
          avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
          avgVal = _mm_packus_epi16(avgVal, avgVal);

          DWORD packedVals = avgVal.m128i_u32[0];
          *(LPWORD)(uPlane + chrPos) = WORD(packedVals);
          *(LPWORD)(vPlane + chrPos) = WORD(packedVals >> 16);
        }
      }
    }
  } else {
#ifdef _WIN64
    for (int y = startY; y < endY; y += 2
static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx, const uint8_t * const esi,
                                        __m128i &xmm0, __m128i &xmm1, __m128i &xmm2, __m128i &xmm3,
                                        __m128i &xmm4, __m128i &xmm5, __m128i &xmm6, __m128i &xmm7)
{
  xmm0 = _mm_shufflelo_epi16(xmm0, 0xD8);
  xmm1 = _mm_shuffle_epi32(xmm0, 0);
  pmaddwd(xmm1, esi);
  xmm3 = _mm_shuffle_epi32(xmm0, 0x55);
  xmm0 = _mm_shufflehi_epi16(xmm0, 0xD8);
  pmaddwd(xmm3, esi+32);
  xmm2 = _mm_shuffle_epi32(xmm0, 0xAA);
  xmm0 = _mm_shuffle_epi32(xmm0, 0xFF);
  pmaddwd(xmm2, esi+16);
  xmm4 = _mm_shufflehi_epi16(xmm4, 0xD8);
  paddd(xmm1, M128_round_inv_row);
  xmm4 = _mm_shufflelo_epi16(xmm4, 0xD8);
  pmaddwd(xmm0, esi+48);
  xmm5 = _mm_shuffle_epi32(xmm4, 0);
  xmm6 = _mm_shuffle_epi32(xmm4, 0xAA);
  pmaddwd(xmm5, ecx);
  paddd(xmm1, xmm2);
  movdqa(xmm2, xmm1);
  xmm7 = _mm_shuffle_epi32(xmm4, 0x55);
  pmaddwd(xmm6, ecx+16);
  paddd(xmm0, xmm3);
  xmm4 = _mm_shuffle_epi32(xmm4, 0xFF);
  psubd(xmm2, xmm0);
  pmaddwd(xmm7, ecx+32);
  paddd(xmm0, xmm1);
  psrad(xmm2, 12);
  paddd(xmm5, M128_round_inv_row);
  pmaddwd(xmm4, ecx+48);
  paddd(xmm5, xmm6);
  movdqa(xmm6, xmm5);
  psrad(xmm0, 12);
  xmm2 = _mm_shuffle_epi32(xmm2, 0x1B);
  packssdw(xmm0, xmm2);
  paddd(xmm4, xmm7);
  psubd(xmm6, xmm4);
  paddd(xmm4, xmm5);
  psrad(xmm6, 12);
  psrad(xmm4, 12);
  xmm6 = _mm_shuffle_epi32(xmm6, 0x1B);
  packssdw(xmm4, xmm6);
}
inline Pixel GetPixelSSE(const Image* img, float x, float y) {
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride;  // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // extend to 16bit
  p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
  p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

  // convert floating point weights to 16bit integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);               // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, _mm_setzero_si128()); // 32->16bit

  // prepare the weights
  __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
  __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
  w12 = _mm_unpacklo_epi16(w12, w12);  // w2 w2 w2 w2 w1 w1 w1 w1
  w34 = _mm_unpacklo_epi16(w34, w34);  // w4 w4 w4 w4 w3 w3 w3 w3

  // multiply each pixel with its weight (2 pixel per SSE mul)
  __m128i L12 = _mm_mullo_epi16(p12, w12);
  __m128i L34 = _mm_mullo_epi16(p34, w34);

  // sum the results
  __m128i L1234 = _mm_add_epi16(L12, L34);
  __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i L = _mm_add_epi16(L1234, Lhi);

  // convert back to 8bit
  __m128i L8 = _mm_srli_epi16(L, 8);  // divide by 256
  L8 = _mm_packus_epi16(L8, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(L8);
}
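// CalcWeights is not shown. For bilinear filtering it presumably packs the
// four corner weights (1-fx)(1-fy), fx(1-fy), (1-fx)fy, fx*fy into one __m128
// in lanes 0..3 (w1..w4), matching the shuffles above; CONST_256 is likewise
// assumed to be _mm_set1_ps(256.0f). A hedged sketch under those assumptions:
static inline __m128 CalcWeights(float x, float y) {
  const float fx = x - (int)x;  // horizontal fraction
  const float fy = y - (int)y;  // vertical fraction
  const float fx1 = 1.0f - fx;
  const float fy1 = 1.0f - fy;
  return _mm_setr_ps(fx1 * fy1, fx * fy1, fx1 * fy, fx * fy);  // w1 w2 w3 w4
}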
//
// multiplies two complex vectors and returns the real and imaginary parts
// as two 32 bit integers.
//
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
                                     struct complex16* x, int len1,
                                     struct complex16* y, int len2) {
  const int wlen = 4;  // sizeof(vcs) / sizeof(complex16);
  const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);  // 0x0000FFFF0000FFFF0000FFFF0000FFFF
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  for (int i = 0; i < len1 / wlen; i++) {
    /* vcs *vx = (vcs *)(x + wlen*i);
       vcs *vy = (vcs *)(y + wlen*i);
       vi *reout = (vi *)(re + wlen*i);
       vi *imout = (vi *)(im + wlen*i);
       vcs vs2 = conj0(*vy);
       vs2 = permutate_low<1, 0, 3, 2>(vs2);
       vs2 = permutate_high<1, 0, 3, 2>(vs2);
       *reout = (vcs)muladd(*vx, *vy);
       *imout = (vcs)muladd(*vx, vs2); */
    __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
    __m128i my = _mm_loadu_si128((__m128i *)(y + wlen*i));

    //__m128i ms1 = _mm_sign_epi16(my, conj);
    __m128i ms2 = _mm_xor_si128(my, xmm5);
    ms2 = _mm_add_epi32(ms2, xmm4);
    ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128i mre = _mm_madd_epi16(my, mx);
    __m128i mim = _mm_madd_epi16(ms2, mx);

    _mm_storeu_si128((__m128i *)(re + wlen*i), mre);
    _mm_storeu_si128((__m128i *)(im + wlen*i), mim);
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++) {
    re[i] = x[i].re * y[i].re + x[i].im * y[i].im;
    im[i] = x[i].im * y[i].re - x[i].re * y[i].im;
  }

  return 0;
}
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, int N)
{
    opus_int i, dataSize16;
    opus_int32 sum;
    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for (i = 0; i < dataSize16; i += 16) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32(acc1, acc2);

    if (N - i >= 8) {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
    sum += _mm_cvtsi128_si32(acc1);

    for (; i < N; i++) {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
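// The tail reduction leans on 0x0E == _MM_SHUFFLE(0, 0, 3, 2): it moves
// 16-bit words 2-3 (the second 32-bit partial sum) down to words 0-1 so the
// final add leaves the grand total in lane 0. An equivalent formulation with
// 32-bit shuffles (a sketch; the observable result should be identical):
static inline opus_int32 hsum_epi32_sketch(__m128i acc) {
    acc = _mm_add_epi32(acc, _mm_unpackhi_epi64(acc, acc));  // lanes 0+2, 1+3
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x55));  // lane0 += lane1
    return _mm_cvtsi128_si32(acc);
}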
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);  // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_sub_epi8(in, C);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
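// The C fallback does per-pixel what the vector loop does four pixels at a
// time: subtract green (mod 256) from both red and blue. A hedged scalar
// sketch (the real VP8LSubtractGreenFromBlueAndRed_C lives in libwebp):
static void SubtractGreenFromBlueAndRed_C_sketch(uint32_t* argb_data,
                                                 int num_pixels) {
  for (int i = 0; i < num_pixels; ++i) {
    const uint32_t argb = argb_data[i];
    const uint32_t green = (argb >> 8) & 0xff;
    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
    const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
    argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
  }
}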
void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
#if defined(_M_ARM)
    // Ensure that this function is reported as not implemented for ARM builds
    // because the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#else
    __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (size_t z = 0; z < depth; z++)
    {
        for (size_t y = 0; y < height; y++)
        {
            const uint32_t *source =
                OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
            uint32_t *dest =
                OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);

            size_t x = 0;

            // Make output writes aligned
            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
            {
                uint32_t rgba = source[x];
                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }

            for (; x + 3 < width; x += 4)
            {
                __m128i sourceData =
                    _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
                // Mask out g and a, which don't change
                __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
                // Mask out b and r
                __m128i brComponents = _mm_and_si128(sourceData, brMask);
                // Swap b and r
                __m128i brSwapped = _mm_shufflehi_epi16(
                    _mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)),
                    _MM_SHUFFLE(2, 3, 0, 1));
                __m128i result = _mm_or_si128(gaComponents, brSwapped);
                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
            }

            // Perform leftover writes
            for (; x < width; x++)
            {
                uint32_t rgba = source[x];
                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }
        }
    }
#endif
}
void store(uint16_t *p) const {
    assert(((uintptr_t)p & 7) == 0);  // assert aligned
    //_mm_packus_epi32 (pack with unsigned saturation) is not in SSE2 (2001) for some reason, requires SSE 4.1 (2007)
    //_mm_storel_epi64((__m128i*)p,_mm_packus_epi32 (a,a));

    //a:      AAAABBBBCCCCDDDD  input vector
    //slli:   AA__BB__CC__DD__  bitshift left by 16
    //srli:   __________AA__BB  byteshift right by 10
    //_or_:   AA__BB__CCAADDBB  OR together
    //shuf:   AA__BB__AABBCCDD  reshuffle low half: {[2], [0], [3], [1]} : 10 00 11 01 : 0x8D (I may have gotten this wrong)
    //storel: AABBCCDD          store low half
    __m128i shifted = _mm_slli_epi32(vec, 16);
    _mm_storel_epi64((__m128i*)p,
        _mm_shufflelo_epi16(_mm_or_si128(shifted, _mm_srli_si128(shifted, 10)), 0x8D));
}
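// Net effect: the shuffle trick narrows four 32-bit lanes to their low 16
// bits without SSE4.1's _mm_packus_epi32. Note that it truncates rather than
// saturates, so it is only valid when each lane already fits in 16 bits.
// A hedged scalar equivalent under that assumption:
static void store_scalar_sketch(uint16_t* p, const uint32_t lanes[4]) {
    for (int i = 0; i < 4; ++i) {
        p[i] = (uint16_t)lanes[i];  // low 16 bits of each lane, in order
    }
}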
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);  // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}
void unpack_rgba8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
  __m128i t0, t1, t2;
  Uint32 i;

  for (i = 0; i < (size / 16); i++)
  {
    t0 = _mm_load_si128((__m128i*)&source[i * 16]);
    t1 = _mm_and_si128(t0, _mm_set1_epi16(0x00FF));
    t2 = _mm_and_si128(t0, _mm_set1_epi16(0xFF00));
    t1 = _mm_shufflelo_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
    t1 = _mm_shufflehi_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
    t1 = _mm_or_si128(t1, t2);
    _mm_stream_si128((__m128i*)&dest[i * 16], t1);
  }
}
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));
  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));
  __m128i diff_hi = _mm_sub_epi16(current, original);

  // Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);
  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  // Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);
  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  // Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);
  row3 = _mm_add_epi16(row2, row3);
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
static INLINE void hor_transform_row_avx2(__m128i* row)
{
  __m128i mask_pos = _mm_set1_epi16(1);
  __m128i mask_neg = _mm_set1_epi16(-1);
  __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1, 0, 3, 2));
  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);
}
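// The three sign/shuffle/add stages above are butterflies at strides 4, 2
// and 1, i.e. an 8-point Hadamard transform across one row of 16-bit
// differences. A hedged scalar equivalent (lane-for-lane, assuming the same
// stage order):
static void hor_transform_row_scalar_sketch(int16_t row[8]) {
  for (int stride = 4; stride >= 1; stride >>= 1) {
    for (int i = 0; i < 8; i += 2 * stride) {
      for (int j = i; j < i + stride; ++j) {
        const int16_t a = row[j];
        const int16_t b = row[j + stride];
        row[j] = a + b;
        row[j + stride] = a - b;
      }
    }
  }
}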
//
// This was v_mul_complex16_shift but I changed the name for consistency with
// v_conj_mul and the fact that the old v_mul_complex16 was never called
//
FORCE_INLINE
int __ext_v_mul_complex16(struct complex16* out, int lenout,
                          struct complex16* x, int len1,
                          struct complex16* y, int len2, int shift) {
  const unum8 wlen = 4;  // sizeof(vcs) / sizeof(complex16);
  const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Outs = (__m128i*) out;

  for (int i = 0; i < len1 / wlen; i++) {
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);

    __m128i ms1 = _mm_xor_si128(mx, xmm5);
    ms1 = _mm_add_epi32(ms1, xmm4);

    __m128i ms2 = _mm_shufflehi_epi16(mx, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128i mre = _mm_srai_epi32(_mm_madd_epi16(ms1, my), shift);
    __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, my), shift);

    mre = _mm_and_si128(mre, xmm6);
    mim = _mm_and_si128(mim, xmm6);
    mim = _mm_slli_epi32(mim, 0x10);

    _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++) {
    out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift;
    out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift;
  }

  return 0;
}
void Image::loadRGBAUByteDataSSE2(GLsizei width, GLsizei height, int inputPitch,
                                  const void *input, size_t outputPitch, void *output) const
{
    const unsigned int *source = NULL;
    unsigned int *dest = NULL;
    __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (int y = 0; y < height; y++)
    {
        source = reinterpret_cast<const unsigned int*>(
            static_cast<const unsigned char*>(input) + y * inputPitch);
        dest = reinterpret_cast<unsigned int*>(
            static_cast<unsigned char*>(output) + y * outputPitch);
        int x = 0;

        // Make output writes aligned
        for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
        {
            unsigned int rgba = source[x];
            dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
        }

        for (; x + 3 < width; x += 4)
        {
            __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
            // Mask out g and a, which don't change
            __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
            // Mask out b and r
            __m128i brComponents = _mm_and_si128(sourceData, brMask);
            // Swap b and r
            __m128i brSwapped = _mm_shufflehi_epi16(
                _mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)),
                _MM_SHUFFLE(2, 3, 0, 1));
            __m128i result = _mm_or_si128(gaComponents, brSwapped);
            _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
        }

        // Perform leftover writes
        for (; x < width; x++)
        {
            unsigned int rgba = source[x];
            dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
        }
    }
}
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so
  //                  that we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);

    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);

    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);

    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
  if (!pcszText || !*pcszText) return;

  // Find out actual size of text
  int nChars = _tcslen(pcszText);
  uFlags |= DT_NOCLIP;

  int iX = rc.left;
  int iY = rc.top;
  int iXW = (rc.right - iX);
  int iYH = (rc.bottom - iY);

  RECT rcMin = rc;
  if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
    int iMinXW = rcMin.right - rcMin.left;
    int iMinYH = rcMin.bottom - rcMin.top;
    if (iMinXW < iXW) {
      if (uFlags & DT_CENTER) {
        iX += (iXW - iMinXW)/2;
        uFlags &= ~DT_CENTER;
      } else if (uFlags & DT_RIGHT) {
        iX += (iXW - iMinXW);
        uFlags &= ~DT_RIGHT;
      }
      iXW = iMinXW;
    }
    if (iMinYH < iYH) {
      if (uFlags & DT_SINGLELINE) {
        if (uFlags & DT_VCENTER) {
          iY += (iYH - iMinYH)/2;
          uFlags &= ~DT_VCENTER;
        } else if (uFlags & DT_BOTTOM) {
          iY += (iYH - iMinYH);
          uFlags &= ~DT_BOTTOM;
        }
      }
      iYH = iMinYH;
    }
  }

  iXW += 2;  // NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look better!
  iYH += 2;

  // Ensure we have a big enough DIB to draw the text to
  if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH);
  if (!hbmpTextDIB) return;

  // Select color
  ieBGRA clr;
  switch (eColor) {
  case eFileName: clr = clrFileName; break;
  case eComment:  clr = clrComment;  break;
  case eFileInfo: clr = clrFileInfo; break;
  default:        clr = ieBGRA(0,0,0); break;
  }
  clr.A = 0xFF - clrBkg.A;

  // Draw the text to in-memory DIB
  RECT rcTextDIB = { 0, 0, iXW, iYH };
  FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);
  rcTextDIB.left++;
  rcTextDIB.top++;
  DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

  // Modify DIB:
#ifndef __X64__
  if (g_bSSE2)
#endif
  {
    __m128i r0, r1, r2, r3, r4, r5, r6, r7;
    r7 = _mm_setzero_si128();        // 0
    r6 = _mm_set1_epi32(clr.dw);     // CA CR CG CB  CA CR CG CB  CA CR CG CB  CA CR CG CB
    r6 = _mm_unpacklo_epi8(r7, r6);  // CA<<8 CR<<8 CG<<8 CB<<8  CA<<8 CR<<8 CG<<8 CB<<8
    r5 = _mm_set1_epi16(1);          // 1 1 1 1  1 1 1 1
    r4 = _mm_set1_epi32(0xFF);       // FF FF FF FF
    r3 = _mm_set1_epi32(clrBkg.dw);  // DA 0 0 0  DA 0 0 0  DA 0 0 0  DA 0 0 0

    ieBGRA *py = pTextDIB;
    for (int y = iYH; y--; py += iTextDIBXW) {
      ieBGRA *px = py;
      for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {
        r0 = _mm_load_si128((__m128i *)px);
        r1 = r0;
        r2 = r0;                      // X3 R3 G3 B3  X2 R2 G2 B2  X1 R1 G1 B1  X0 R0 G0 B0
        r0 = _mm_srli_epi32(r0, 16);  // 0 0 X3 R3  0 0 X2 R2  0 0 X1 R1  0 0 X0 R0
        r1 = _mm_srli_epi32(r1, 8);   // 0 X3 R3 G3  0 X2 R2 G2  0 X1 R1 G1  0 X0 R0 G0
        r0 = _mm_max_epu8(r0, r2);
        r0 = _mm_max_epu8(r0, r1);    // x x x A3  x x x A2  x x x A1  x x x A0
        r0 = _mm_and_si128(r0, r4);   // 0 A3  0 A2  0 A1  0 A0
        r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
        r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));  // A3 A3 A2 A2  A1 A1 A0 A0
        r1 = r0;
        r0 = _mm_unpacklo_epi32(r0, r0);  // A1 A1 A1 A1  A0 A0 A0 A0
        r1 = _mm_unpackhi_epi32(r1, r1);  // A3 A3 A3 A3  A2 A2 A2 A2
        r0 = _mm_add_epi16(r0, r5);       // A1' A1' A1' A1'  A0' A0' A0' A0'
        r1 = _mm_add_epi16(r1, r5);       // A3' A3' A3' A3'  A2' A2' A2' A2'
        r0 = _mm_mulhi_epu16(r0, r6);     // xA1" xR1 xG1 xB1  xA0" xR0 xG0 xB0
        r1 = _mm_mulhi_epu16(r1, r6);     // xA3" xR3 xG3 xB3  xA2" xR2 xG2 xB2
        r0 = _mm_packus_epi16(r0, r1);    // xA3" xR3 xG3 xB3 ... xA0" xR0 xG0 xB0
        r0 = _mm_adds_epu8(r0, r3);       // xA3 xR3 xG3 xB3 ... xA0 xR0 xG0 xB0
        _mm_store_si128((__m128i *)px, r0);
      }
    }
  }
#ifndef __X64__
  else {
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1, UINT32 src1Step,
    const BYTE* pSrc2, UINT32 src2Step,
    BYTE* pDst, UINT32 dstStep,
    UINT32 width, UINT32 height)
{
    const UINT32* sptr1 = (const UINT32*) pSrc1;
    const UINT32* sptr2 = (const UINT32*) pSrc2;
    UINT32* dptr;
    int linebytes, src1Jump, src2Jump, dstJump;
    UINT32 y;
    __m128i xmm0, xmm1;

    if ((width <= 0) || (height <= 0))
        return PRIMITIVES_SUCCESS;

    if (width < 4)  /* pointless if too small */
    {
        return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
                                       pDst, dstStep, width, height);
    }

    dptr = (UINT32*) pDst;
    linebytes = width * sizeof(UINT32);
    src1Jump = (src1Step - linebytes) / sizeof(UINT32);
    src2Jump = (src2Step - linebytes) / sizeof(UINT32);
    dstJump = (dstStep - linebytes) / sizeof(UINT32);
    xmm0 = _mm_set1_epi32(0);
    xmm1 = _mm_set1_epi16(1);

    for (y = 0; y < height; ++y)
    {
        int pixels = width;
        int count;
        /* Get to the 16-byte boundary now. */
        int leadIn = 0;

        switch ((ULONG_PTR) dptr & 0x0f)
        {
            case 0:
                leadIn = 0;
                break;

            case 4:
                leadIn = 3;
                break;

            case 8:
                leadIn = 2;
                break;

            case 12:
                leadIn = 1;
                break;

            default:
                /* We'll never hit a 16-byte boundary, so do the whole
                 * thing the slow way.
                 */
                leadIn = width;
                break;
        }

        if (leadIn)
        {
            pstatus_t status;
            status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
                                             (const BYTE*) sptr2, src2Step,
                                             (BYTE*) dptr, dstStep, leadIn, 1);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr1 += leadIn;
            sptr2 += leadIn;
            dptr += leadIn;
            pixels -= leadIn;
        }

        /* Use SSE registers to do 4 pixels at a time. */
        count = pixels >> 2;
        pixels -= count << 2;

        while (count--)
        {
            __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
            /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
            xmm2 = LOAD_SI128(sptr1);
            sptr1 += 4;
            /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
            xmm3 = LOAD_SI128(sptr2);
            sptr2 += 4;
            /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
            xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
            /* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
            xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
            /* subtract */
            xmm6 = _mm_subs_epi16(xmm4, xmm5);
            /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
            /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
            /* Add one to alphas */
            xmm4 = _mm_adds_epi16(xmm4, xmm1);
            /* Multiply and take low word */
            xmm4 = _mm_mullo_epi16(xmm4, xmm6);
            /* Shift 8 right */
            xmm4 = _mm_srai_epi16(xmm4, 8);
            /* Add xmm5 */
            xmm4 = _mm_adds_epi16(xmm4, xmm5);
            /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */

            /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
            xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
            /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
            xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
            /* subtract */
            xmm7 = _mm_subs_epi16(xmm5, xmm6);
            /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
            /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
            /* Add one to alphas */
            xmm5 = _mm_adds_epi16(xmm5, xmm1);
            /* Multiply and take low word */
            xmm5 = _mm_mullo_epi16(xmm5, xmm7);
            /* Shift 8 right */
            xmm5 = _mm_srai_epi16(xmm5, 8);
            /* Add xmm6 */
            xmm5 = _mm_adds_epi16(xmm5, xmm6);
            /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */

            /* Must mask off remainders or pack gets confused */
            xmm3 = _mm_set1_epi16(0x00ffU);
            xmm4 = _mm_and_si128(xmm4, xmm3);
            xmm5 = _mm_and_si128(xmm5, xmm3);
            /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
            xmm5 = _mm_packus_epi16(xmm5, xmm4);
            _mm_store_si128((__m128i*) dptr, xmm5);
            dptr += 4;
        }

        /* Finish off the remainder. */
        if (pixels)
        {
            pstatus_t status;
            status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
                                             (const BYTE*) sptr2, src2Step,
                                             (BYTE*) dptr, dstStep, pixels, 1);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr1 += pixels;
            sptr2 += pixels;
            dptr += pixels;
        }

        /* Jump to next row. */
        sptr1 += src1Jump;
        sptr2 += src2Jump;
        dptr += dstJump;
    }

    return PRIMITIVES_SUCCESS;
}