/* Codegen test: an immediate 64-bit shift-left of an __m64 must lower to
 * the MMX psllq instruction (checked by FileCheck via the CHECK line). */
__m64 test69(__m64 a) {
  // CHECK: psllq
  return _mm_slli_si64(a, 3);
}
/*
 * Convert packed UYVY 4:2:2 video into planar YUV 4:2:2.
 *
 * Input layout per 4 bytes: U0 Y0 V0 Y1 (luma at odd byte offsets).
 * Output layout: width*height bytes of Y, followed by width*height/2
 * bytes of U, followed by width*height/2 bytes of V (full vertical
 * chroma resolution, horizontally subsampled by 2).
 *
 * width must be a multiple of 8 pixels (16 input bytes are consumed per
 * inner-loop iteration).
 *
 * Fix vs. the previous version: the luma loop used to OR in chroma bytes
 * masked with a chroma mask and then AND the result with the complementary
 * luma mask — since the two masks are disjoint, that whole merge was dead
 * code and has been removed.  Output is bit-identical.
 */
void uyvy_to_yuv422(int width, int height, int shift_picture_down,
                    const uint8_t *input, uint8_t *output)
{
    /* _mm_set_pi8 lists bytes high-to-low, so this puts 0xFF in bytes
     * 0,2,4,6 — exactly where the Y samples land after the qword is
     * shifted right by one byte. */
    __m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
    const uint8_t *orig_input = input;
    uint8_t *y_comp = output;
    uint8_t *u_comp = output + width * height;
    uint8_t *v_comp = u_comp + (int)((width * height) / 2); /* 4:2:2 */
    int i, j;

    /* When preparing video for PAL DV50 encoding, the video must be shifted
     * down by one line to change the field order to be bottom-field-first. */
    int start_line = 0;
    if (shift_picture_down) {
        memset(y_comp, 0x10, width);     /* write one line of black Y */
        y_comp += width;
        memset(u_comp, 0x80, width / 2); /* write one line of black U */
        u_comp += width / 2;
        memset(v_comp, 0x80, width / 2); /* write one line of black V */
        v_comp += width / 2;
        start_line = 1;
    }

    /* Do the Y component: consume 16 bytes of UYVY (8 pixels) per pass. */
    for (j = start_line; j < height; j++) {
        for (i = 0; i < width * 2; i += 16) {
            /* Shift each 8-byte group right by one byte so Y0..Y3 (resp.
             * Y4..Y7) occupy the low byte of each 16-bit lane, mask off
             * the chroma, then saturate-pack both halves into 8 Y bytes. */
            __m64 m0 = _mm_and_si64(_mm_srli_si64(*(__m64 *)input, 8), luma_mask);
            __m64 m1 = _mm_and_si64(_mm_srli_si64(*(__m64 *)(input + 8), 8), luma_mask);
            *(__m64 *)y_comp = _mm_packs_pu16(m0, m1);
            y_comp += 8;
            input += 16;
        }
    }

    /* Do the chroma components; every line is processed for YUV 4:2:2. */
    input = orig_input;
    for (j = start_line; j < height; j++) {
        for (i = 0; i < width * 2; i += 16) {
            /* Two interleave passes gather U0..U3 into the low dword of m3
             * and V0..V3 into the low dword of m4. */
            __m64 m1 = _mm_unpacklo_pi8(*(__m64 *)input, *(__m64 *)(input + 8));
            __m64 m2 = _mm_unpackhi_pi8(*(__m64 *)input, *(__m64 *)(input + 8));
            __m64 m3 = _mm_unpacklo_pi8(m1, m2);
            __m64 m4 = _mm_unpackhi_pi8(m1, m2);
            memcpy(u_comp, &m3, 4);
            memcpy(v_comp, &m4, 4);
            u_comp += 4;
            v_comp += 4;
            input += 16;
        }
    }
    _mm_empty(); /* Clear aliased fp register state */
}
void sha384Process(register sha384Param* sp) { #ifdef OPTIMIZE_SSE2 # if defined(_MSC_VER) || defined (__INTEL_COMPILER) static const __m64 MASK = { 0x00FF00FF00FF00FF00 }; # elif defined(__GNUC__) static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF }; # else # error # endif __m64 a, b, c, d, e, f, g, h, temp; register __m64 *w; register const __m64 *k; register byte t; w = (__m64*) sp->data; t = 16; while (t--) { temp = *w; *(w++) = _m_pxor( _mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8), _m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27) ); } t = 64; while (t--) { temp = _mm_add_si64(_mm_add_si64(sig1(w[-2]), w[-7]), _mm_add_si64(sig0(w[-15]), w[-16])); *(w++) = temp; } w = (__m64*) sp->h; a = w[0]; b = w[1]; c = w[2]; d = w[3]; e = w[4]; f = w[5]; g = w[6]; h = w[7]; w = (__m64*) sp->data; k = (__m64*) SHA2_64BIT_K; #else register uint64_t a, b, c, d, e, f, g, h, temp; register uint64_t *w; register const uint64_t *k; register byte t; # if WORDS_BIGENDIAN w = sp->data + 16; # else w = sp->data; t = 16; while (t--) { temp = swapu64(*w); *(w++) = temp; } # endif t = 64; while (t--) { temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16]; *(w++) = temp; } w = sp->data; a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3]; e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7]; k = SHA2_64BIT_K; #endif ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]); ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]); ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]); ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]); ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]); ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]); ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]); ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]); ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]); ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]); ROUND(g,h,a,b,c,d,e,f,w[10],k[10]); ROUND(f,g,h,a,b,c,d,e,w[11],k[11]); ROUND(e,f,g,h,a,b,c,d,w[12],k[12]); ROUND(d,e,f,g,h,a,b,c,w[13],k[13]); ROUND(c,d,e,f,g,h,a,b,w[14],k[14]); ROUND(b,c,d,e,f,g,h,a,w[15],k[15]); ROUND(a,b,c,d,e,f,g,h,w[16],k[16]); ROUND(h,a,b,c,d,e,f,g,w[17],k[17]); 
ROUND(g,h,a,b,c,d,e,f,w[18],k[18]); ROUND(f,g,h,a,b,c,d,e,w[19],k[19]); ROUND(e,f,g,h,a,b,c,d,w[20],k[20]); ROUND(d,e,f,g,h,a,b,c,w[21],k[21]); ROUND(c,d,e,f,g,h,a,b,w[22],k[22]); ROUND(b,c,d,e,f,g,h,a,w[23],k[23]); ROUND(a,b,c,d,e,f,g,h,w[24],k[24]); ROUND(h,a,b,c,d,e,f,g,w[25],k[25]); ROUND(g,h,a,b,c,d,e,f,w[26],k[26]); ROUND(f,g,h,a,b,c,d,e,w[27],k[27]); ROUND(e,f,g,h,a,b,c,d,w[28],k[28]); ROUND(d,e,f,g,h,a,b,c,w[29],k[29]); ROUND(c,d,e,f,g,h,a,b,w[30],k[30]); ROUND(b,c,d,e,f,g,h,a,w[31],k[31]); ROUND(a,b,c,d,e,f,g,h,w[32],k[32]); ROUND(h,a,b,c,d,e,f,g,w[33],k[33]); ROUND(g,h,a,b,c,d,e,f,w[34],k[34]); ROUND(f,g,h,a,b,c,d,e,w[35],k[35]); ROUND(e,f,g,h,a,b,c,d,w[36],k[36]); ROUND(d,e,f,g,h,a,b,c,w[37],k[37]); ROUND(c,d,e,f,g,h,a,b,w[38],k[38]); ROUND(b,c,d,e,f,g,h,a,w[39],k[39]); ROUND(a,b,c,d,e,f,g,h,w[40],k[40]); ROUND(h,a,b,c,d,e,f,g,w[41],k[41]); ROUND(g,h,a,b,c,d,e,f,w[42],k[42]); ROUND(f,g,h,a,b,c,d,e,w[43],k[43]); ROUND(e,f,g,h,a,b,c,d,w[44],k[44]); ROUND(d,e,f,g,h,a,b,c,w[45],k[45]); ROUND(c,d,e,f,g,h,a,b,w[46],k[46]); ROUND(b,c,d,e,f,g,h,a,w[47],k[47]); ROUND(a,b,c,d,e,f,g,h,w[48],k[48]); ROUND(h,a,b,c,d,e,f,g,w[49],k[49]); ROUND(g,h,a,b,c,d,e,f,w[50],k[50]); ROUND(f,g,h,a,b,c,d,e,w[51],k[51]); ROUND(e,f,g,h,a,b,c,d,w[52],k[52]); ROUND(d,e,f,g,h,a,b,c,w[53],k[53]); ROUND(c,d,e,f,g,h,a,b,w[54],k[54]); ROUND(b,c,d,e,f,g,h,a,w[55],k[55]); ROUND(a,b,c,d,e,f,g,h,w[56],k[56]); ROUND(h,a,b,c,d,e,f,g,w[57],k[57]); ROUND(g,h,a,b,c,d,e,f,w[58],k[58]); ROUND(f,g,h,a,b,c,d,e,w[59],k[59]); ROUND(e,f,g,h,a,b,c,d,w[60],k[60]); ROUND(d,e,f,g,h,a,b,c,w[61],k[61]); ROUND(c,d,e,f,g,h,a,b,w[62],k[62]); ROUND(b,c,d,e,f,g,h,a,w[63],k[63]); ROUND(a,b,c,d,e,f,g,h,w[64],k[64]); ROUND(h,a,b,c,d,e,f,g,w[65],k[65]); ROUND(g,h,a,b,c,d,e,f,w[66],k[66]); ROUND(f,g,h,a,b,c,d,e,w[67],k[67]); ROUND(e,f,g,h,a,b,c,d,w[68],k[68]); ROUND(d,e,f,g,h,a,b,c,w[69],k[69]); ROUND(c,d,e,f,g,h,a,b,w[70],k[70]); ROUND(b,c,d,e,f,g,h,a,w[71],k[71]); ROUND(a,b,c,d,e,f,g,h,w[72],k[72]); 
ROUND(h,a,b,c,d,e,f,g,w[73],k[73]); ROUND(g,h,a,b,c,d,e,f,w[74],k[74]); ROUND(f,g,h,a,b,c,d,e,w[75],k[75]); ROUND(e,f,g,h,a,b,c,d,w[76],k[76]); ROUND(d,e,f,g,h,a,b,c,w[77],k[77]); ROUND(c,d,e,f,g,h,a,b,w[78],k[78]); ROUND(b,c,d,e,f,g,h,a,w[79],k[79]); #ifdef OPTIMIZE_SSE2 w = (__m64*) sp->h; w[0] = _mm_add_si64(w[0], a); w[1] = _mm_add_si64(w[1], b); w[2] = _mm_add_si64(w[2], c); w[3] = _mm_add_si64(w[3], d); w[4] = _mm_add_si64(w[4], e); w[5] = _mm_add_si64(w[5], f); w[6] = _mm_add_si64(w[6], g); w[7] = _mm_add_si64(w[7], h); _mm_empty(); #else sp->h[0] += a; sp->h[1] += b; sp->h[2] += c; sp->h[3] += d; sp->h[4] += e; sp->h[5] += f; sp->h[6] += g; sp->h[7] += h; #endif }
/*
 * mlib_m_ImageMaximum_U8_3 - per-channel maximum of a 3-channel U8 image.
 *
 * Scans the interleaved pixels 24 bytes (8 channel-triples) at a time into
 * three 8-byte accumulators, handles the row tail with edge masks, then
 * merges the accumulators so that res32[0..2] receive the maxima of the
 * three channels.  Relies on the MLIB_M_IMAGE_MAXIMUM_U8* macros (defined
 * elsewhere) for the masked byte-wise max operations.
 */
void
mlib_m_ImageMaximum_U8_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
	/* src address */
	__m64 *sp, *sl;

	/* src data */
	__m64 sd;

	/* max values */
	__m64 max0, max1, max2, max3;

	/* edge mask */
	mlib_s32 emask;

	/* loop variables */
	mlib_s32 n1;

	/* height of image */
	mlib_s32 height = mlib_ImageGetHeight(img);

	/* elements to next row */
	mlib_s32 slb = mlib_ImageGetStride(img);

	mlib_s32 width = mlib_ImageGetWidth(img) * 3;
	mlib_u8 *dend;

	/* No row padding: treat the whole image as one long row. */
	if (slb == width) {
		width *= height;
		height = 1;
	}

	sp = sl = (__m64 *) mlib_ImageGetData(img);

	/* Seed the accumulators with the smallest U8 value. */
	max1 = _mm_set1_pi8(MLIB_U8_MIN);
	max2 = _mm_set1_pi8(MLIB_U8_MIN);
	max3 = _mm_set1_pi8(MLIB_U8_MIN);

	for (; height > 0; height--) {
		n1 = width;
		dend = (mlib_u8 *)sp + width;

		/* Full 24-byte groups: one qword into each accumulator so the
		 * channel phase of each accumulator stays fixed. */
		for (; n1 > 23; n1 -= 24) {
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max1, max1, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max2, max2, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max3, max3, sd);
		}

		/* Row tail: up to three more qwords, each masked so bytes past
		 * the end of the row are ignored. */
		if (n1 > 0) {
			emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8_M32(max1, max1, sd, emask);

			n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
			if (n1 > 0) {
				emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
				sd = (*sp++);
				MLIB_M_IMAGE_MAXIMUM_U8_M32(max2, max2, sd, emask);

				n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
				if (n1 > 0) {
					/* n1 <= 7 here, so no full-qword case. */
					emask = (0xFF << (8 - n1));
					sd = *sp;
					MLIB_M_IMAGE_MAXIMUM_U8_M32(max3, max3, sd, emask);
				}
			}
		}

		sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
	}

	/* Merge the three phase-shifted accumulators: the shifts realign the
	 * channels and the 64-bit masks keep only valid lanes, collapsing all
	 * partial maxima into the low three bytes of max0. */
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max1, _mm_srli_si64(max2, 8),
	    mmx_write_64(0x00ffffffffffffffll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max2, 16),
	    mmx_write_64(0x0000000000ff0000ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max3, 16),
	    mmx_write_64(0x0000ffffffffffffll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max3, 8),
	    mmx_write_64(0x0000000000ffff00ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
	    mmx_write_64(0x000000ffff000000ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
	    mmx_write_64(0x0000000000ffffffll));

	/* Extract the three channel maxima from bytes 0, 1 and 2 of max0. */
	res32[0] = _mm_cvtsi64_si32(_mm_and_si64(max0,
	    mmx_write_64(0x00000000000000ffll)));
	res32[1] = _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 8),
	    mmx_write_64(0x00000000000000ffll)));
	res32[2] = _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 16),
	    mmx_write_64(0x00000000000000ffll)));

	_mm_empty();
}
/* Codegen test: _mm_slli_si64 with an immediate count must lower to a call
 * of the @llvm.x86.mmx.pslli.q intrinsic (checked by FileCheck). */
__m64 test_mm_slli_si64(__m64 a) {
  // CHECK-LABEL: test_mm_slli_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q
  return _mm_slli_si64(a, 3);
}
/* Blend one 4-pixel block against a neighbouring row for Scale2x-style
 * doubling.  Byte lanes where the shifted middle row matches the neighbour
 * row (and are not suppressed by keep_center) take the neighbour pixel;
 * all other lanes keep the duplicated centre pixel. */
static __m64 scale2_mix(__m64 shifted_left, __m64 shifted_right,
                        __m64 keep_center, __m64 row,
                        unsigned center32, unsigned neighbor32)
{
    __m64 zero = _mm_setzero_si64();

    /* Even output lanes come from the left-shifted comparison, odd lanes
     * from the right-shifted one. */
    __m64 sel_even = _mm_andnot_si64(keep_center, _mm_cmpeq_pi8(shifted_left, row));
    sel_even = _mm_unpacklo_pi8(_mm_srli_si64(sel_even, 16), zero);
    __m64 sel_odd = _mm_andnot_si64(keep_center, _mm_cmpeq_pi8(shifted_right, row));
    sel_odd = _mm_unpacklo_pi8(zero, _mm_srli_si64(sel_odd, 16));
    __m64 sel = _mm_or_si64(sel_even, sel_odd);

    __m64 ctr = _m_from_int(center32);
    ctr = _mm_unpacklo_pi8(ctr, ctr);   /* duplicate each centre byte   */
    __m64 nbr = _m_from_int(neighbor32);
    nbr = _mm_unpacklo_pi8(nbr, nbr);   /* duplicate each neighbour byte */

    return _mm_or_si64(_mm_and_si64(sel, nbr), _mm_andnot_si64(sel, ctr));
}

/* Double one scanline horizontally into two output rows (Scale2x-style)
 * using MMX, reading three consecutive lines from an 8-line ring buffer. */
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
    const unsigned char *above  = src + ((y - 1) & 7) * sc2lines_width;
    const unsigned char *centre = src + ((y + 0) & 7) * sc2lines_width;
    const unsigned char *below  = src + ((y + 1) & 7) * sc2lines_width;

    for (unsigned i = 0; i < nPix; i += 4) {
        if (*(unsigned *)(above + i) ^ *(unsigned *)(below + i)) {
            /* NOTE(review): these loads start 2 bytes before the current
             * block (…+ i - 2); presumably the ring-buffer lines carry
             * padding on both sides — confirm with the caller. */
            __m64 mm = *(__m64 *)(centre + i - 2);
            __m64 md = _mm_slli_si64(mm, 8);   /* middle row, shifted left  */
            __m64 mf = _mm_srli_si64(mm, 8);   /* middle row, shifted right */

            /* Keep the centre pixel wherever the left and right neighbours
             * agree, or wherever the rows above and below agree. */
            __m64 keep = _mm_or_si64(_mm_cmpeq_pi8(md, mf),
                                     _mm_cmpeq_pi8(*(__m64 *)(above + i - 2),
                                                   *(__m64 *)(below + i - 2)));

            unsigned ctr = *(unsigned *)(centre + i);
            *(__m64 *)(dst1 + 2 * i) =
                scale2_mix(md, mf, keep, *(__m64 *)(above + i - 2),
                           ctr, *(unsigned *)(above + i));
            *(__m64 *)(dst2 + 2 * i) =
                scale2_mix(md, mf, keep, *(__m64 *)(below + i - 2),
                           ctr, *(unsigned *)(below + i));
        } else {
            /* Rows above and below are identical here: plain pixel doubling
             * into both output rows. */
            __m64 dup = _m_from_int(*(unsigned *)(centre + i));
            dup = _mm_unpacklo_pi8(dup, dup);
            *(__m64 *)(dst1 + 2 * i) = dup;
            *(__m64 *)(dst2 + 2 * i) = dup;
        }
    }
}