void pix_invert :: processGrayMMX(imageStruct &image)
{
    int i = (image.xsize * image.ysize) / 8; // 8 pixels at a time
    vector64i offset;
    vector64i *input = (vector64i*)image.data;

    offset.c[0] = 255; offset.c[1] = 255; offset.c[2] = 255; offset.c[3] = 255;
    offset.c[4] = 255; offset.c[5] = 255; offset.c[6] = 255; offset.c[7] = 255;

    while (i--) {
        //*((unsigned long *)base) = ~*((unsigned long *)base);
        input[0].v = _mm_xor_si64(input[0].v, offset.v);
        input++;
    }
    _mm_empty();
}
void pix_invert :: processRGBAMMX(imageStruct &image)
{
    int i = (image.xsize * image.ysize) / 2; // 2 pixels at a time
    vector64i offset;
    vector64i *input = (vector64i*)image.data;

    // invert R, G and B; XOR with 0 leaves the alpha channel untouched
    offset.c[0 + chRed]   = 255;
    offset.c[0 + chGreen] = 255;
    offset.c[0 + chBlue]  = 255;
    offset.c[0 + chAlpha] = 0;
    offset.c[4 + chRed]   = 255;
    offset.c[4 + chGreen] = 255;
    offset.c[4 + chBlue]  = 255;
    offset.c[4 + chAlpha] = 0;

    while (i--) {
        //*((unsigned long *)base) = ~*((unsigned long *)base);
        input[0].v = _mm_xor_si64(input[0].v, offset.v);
        input++;
    }
    _mm_empty();
}
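/*
 * Both routines above rely on the identity x ^ 0xFF == ~x for a byte, so a
 * single pxor inverts eight samples at once. A minimal standalone sketch of
 * the same trick; invert_gray8 and its arguments are hypothetical names,
 * not part of Gem's pix_invert:
 */
#include <mmintrin.h>

static void invert_gray8(unsigned char *data, int npixels)
{
    __m64 mask = _mm_set1_pi8((char)0xFF);  /* eight 0xFF bytes */
    __m64 *p = (__m64 *)data;
    int i = npixels / 8;                    /* 8 pixels per 64-bit word */

    while (i--) {
        *p = _mm_xor_si64(*p, mask);        /* bytewise complement */
        p++;
    }
    _mm_empty();                            /* reset FPU state after MMX */
}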
__m64
unsigned_add3 (const __m64 *a, const __m64 *b, __m64 *result, unsigned int count)
{
    __m64 _a, _b, one, sum, carry, onesCarry;
    unsigned int i;

    carry = _mm_setzero_si64 ();

    /* build the constant 1: pcmpeqb gives all-ones, and 0 - (-1) == 1 */
    one = _mm_cmpeq_pi8 (carry, carry);
    one = _mm_sub_si64 (carry, one);

    for (i = 0; i < count; i++) {
        _a = a[i];
        _b = b[i];

        sum = _mm_add_si64 (_a, _b);
        sum = _mm_add_si64 (sum, carry);
        result[i] = sum;

        /* carry out of bit 0: ((a ^ b) & carry_in) | (a & b), masked to 1 */
        onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry);
        onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry);
        onesCarry = _mm_and_si64 (onesCarry, one);

        /* add the top 63 bits separately; bit 63 of that sum is the
           carry out of the full 64-bit addition */
        _a = _mm_srli_si64 (_a, 1);
        _b = _mm_srli_si64 (_b, 1);
        carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry);
        carry = _mm_srli_si64 (carry, 63);
    }

    return carry;
}
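/*
 * The routine above returns the carry out of a multiword addition without a
 * 64-bit unsigned compare (MMX has none). The identity it uses: the carry
 * out of bit 0 is ((a & b) | ((a ^ b) & carry_in)) & 1, and adding the top
 * 63 bits as (a >> 1) + (b >> 1) + that puts the carry out of bit 63 into
 * bit 63. A scalar cross-check of the same identity; carry_out64 is a
 * hypothetical name:
 */
#include <stdint.h>

static unsigned carry_out64(uint64_t a, uint64_t b, unsigned carry_in)
{
    uint64_t c0 = ((a & b) | ((a ^ b) & (uint64_t)carry_in)) & 1;
    return (unsigned)(((a >> 1) + (b >> 1) + c0) >> 63);
}

/* e.g. carry_out64(~0ULL, 1, 0) == 1 and carry_out64(1, 2, 0) == 0 */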
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 ker[5][5];
    __m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
    __m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h, tmph;
    __m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l, tmpl;
    __m64 *sp, *dp;
    mlib_s32 shift, ind, ker_sum = 0;
    mlib_s32 row, wid4, i, j;

    /* a 5x5 kernel consumes a 2-sample border on every side */
    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);
    if (pbuff == NULL)
        return (MLIB_FAILURE);  /* guard the line-buffer allocation */

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * 2 * wid4;
    }

    /* prime the intermediate sums from the first four source rows */
    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (__m64 *) sl;
        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);    /* rebias u16 samples to s16 */
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < wid4; i++) {
            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        sp = (__m64 *) sl;
        dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < width / 4; i++) {
            CONV_5x5(hi, i);
            dp[i] = rr;
        }

        if (width & 3) {
            /* merge the last partial group of four samples into dst */
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5(hi, i);
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        /* rotate the ring of line buffers for the next row */
        ind = (pbuff_arr == buff_arr) ? 10 : -10;

        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
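/*
 * The _mm_xor_si64(d, mask8000) lines above implement the usual trick for
 * filtering unsigned 16-bit data with signed MMX arithmetic: assuming
 * mask8000 (set up by the GET_KERN macro, not shown) holds 0x8000 in every
 * 16-bit lane, the XOR flips the top bit, i.e. subtracts the bias 32768 and
 * maps [0, 65535] order-preservingly onto [-32768, 32767]; XOR-ing again
 * restores the unsigned value. A scalar sketch with hypothetical names:
 */
#include <stdint.h>

static int16_t u16_to_biased_s16(uint16_t x)
{
    return (int16_t)(x ^ 0x8000);              /* x - 32768 (mod 2^16) */
}

static uint16_t biased_s16_to_u16(int16_t x)
{
    return (uint16_t)((uint16_t)x ^ 0x8000);   /* inverse: x + 32768 */
}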
__m64 test56(__m64 a, __m64 b) {
    // CHECK: pxor
    return _mm_xor_si64(a, b);
}
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8 *blkPtr,
                                    size_t blkCnt, size_t byteCntAdd)
{
    __m64 kw[12];                         /* key schedule words : chaining vars + tweak */
    __m64 X0, X1, X2, X3, X4, X5, X6, X7; /* local copy of vars, for speed */
    __m64 w[8];                           /* local copy of input block */
    __m64 z1;
    __m64 z3;
    __m64 z5;
    __m64 z7;

/* ts and ks are assumed to alias into kw as in the reference Skein code
   (tweak in kw[0..2], key schedule in kw[3..11]); the defines were not
   part of this excerpt */
#define ts (kw)
#define ks (kw + 3)

    ts[0] = ctx->T[0];
    ts[1] = ctx->T[1];

    do {
        /* update the tweak with the number of bytes processed */
        ts[0] = _mm_add_si64(ts[0], _mm_set_pi32(0, byteCntAdd));

        /* precompute the key schedule for this block */
        z1 = SKEIN_KS_PARITY;
        ks[0] = ctx->X[0]; z1 = _mm_xor_si64(z1, ks[0]);
        ks[1] = ctx->X[1]; z1 = _mm_xor_si64(z1, ks[1]);
        ks[2] = ctx->X[2]; z1 = _mm_xor_si64(z1, ks[2]);
        ks[3] = ctx->X[3]; z1 = _mm_xor_si64(z1, ks[3]);
        ks[4] = ctx->X[4]; z1 = _mm_xor_si64(z1, ks[4]);
        ks[5] = ctx->X[5]; z1 = _mm_xor_si64(z1, ks[5]);
        ks[6] = ctx->X[6]; z1 = _mm_xor_si64(z1, ks[6]);
        ks[7] = ctx->X[7]; z1 = _mm_xor_si64(z1, ks[7]);
        ks[8] = z1;                 /* XOR parity of the key words */
        ts[2] = _mm_xor_si64(ts[0], ts[1]);

        /* load the input block, keeping a copy for the feedforward below */
        X0 = ((__m64 *) blkPtr)[0];
        X1 = ((__m64 *) blkPtr)[1];
        X2 = ((__m64 *) blkPtr)[2];
        X3 = ((__m64 *) blkPtr)[3];
        X4 = ((__m64 *) blkPtr)[4];
        X5 = ((__m64 *) blkPtr)[5];
        X6 = ((__m64 *) blkPtr)[6];
        X7 = ((__m64 *) blkPtr)[7];
        w[0] = X0; w[1] = X1; w[2] = X2; w[3] = X3;
        w[4] = X4; w[5] = X5; w[6] = X6; w[7] = X7;

        /* first key injection */
        X0 = _mm_add_si64(X0, ks[0]);
        X1 = _mm_add_si64(X1, ks[1]);
        X2 = _mm_add_si64(X2, ks[2]);
        X3 = _mm_add_si64(X3, ks[3]);
        X4 = _mm_add_si64(X4, ks[4]);
        X5 = _mm_add_si64(X5, _mm_add_si64(ks[5], ts[0]));
        X6 = _mm_add_si64(X6, _mm_add_si64(ks[6], ts[1]));
        X7 = _mm_add_si64(X7, ks[7]);

        blkPtr += 64;

/* four parallel MIX operations: add, rotate (shift pair + OR), XOR */
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)  \
    X##p0 = _mm_add_si64(X##p0, X##p1);         \
    X##p2 = _mm_add_si64(X##p2, X##p3);         \
    X##p4 = _mm_add_si64(X##p4, X##p5);         \
    X##p6 = _mm_add_si64(X##p6, X##p7);         \
    z1 = X##p1;                                 \
    X##p1 = _m_psrlqi(X##p1, 64 - ROT##_0);     \
    z1 = _m_psllqi(z1, ROT##_0);                \
    X##p1 = _mm_or_si64(X##p1, z1);             \
    z3 = X##p3;                                 \
    X##p3 = _m_psrlqi(X##p3, 64 - ROT##_1);     \
    z3 = _m_psllqi(z3, ROT##_1);                \
    X##p3 = _mm_or_si64(X##p3, z3);             \
    z5 = X##p5;                                 \
    z5 = _m_psllqi(z5, ROT##_2);                \
    X##p5 = _m_psrlqi(X##p5, 64 - ROT##_2);     \
    X##p5 = _mm_or_si64(X##p5, z5);             \
    z7 = X##p7;                                 \
    X##p7 = _m_psrlqi(X##p7, 64 - ROT##_3);     \
    z7 = _m_psllqi(z7, ROT##_3);                \
    X##p7 = _mm_or_si64(X##p7, z7);             \
    X##p1 = _mm_xor_si64(X##p1, X##p0);         \
    X##p3 = _mm_xor_si64(X##p3, X##p2);         \
    X##p5 = _mm_xor_si64(X##p5, X##p4);         \
    X##p7 = _mm_xor_si64(X##p7, X##p6);

#define I512(R)                                                                 \
    X0 = _mm_add_si64(X0, ks[((R)+1) % 9]); /* inject the key schedule value */ \
    X1 = _mm_add_si64(X1, ks[((R)+2) % 9]);                                     \
    X2 = _mm_add_si64(X2, ks[((R)+3) % 9]);                                     \
    X3 = _mm_add_si64(X3, ks[((R)+4) % 9]);                                     \
    X4 = _mm_add_si64(X4, ks[((R)+5) % 9]);                                     \
    X5 = _mm_add_si64(X5, _mm_add_si64(ks[((R)+6) % 9], ts[((R)+1) % 3]));      \
    X6 = _mm_add_si64(X6, _mm_add_si64(ks[((R)+7) % 9], ts[((R)+2) % 3]));      \
    X7 = _mm_add_si64(X7, _mm_add_si64(ks[((R)+8) % 9], _mm_set_pi32(0, (R)+1)));

#define R512_8_rounds(R)                        \
    R512(0,1,2,3,4,5,6,7, R_512_0, 8*(R) + 1);  \
    R512(2,1,4,7,6,5,0,3, R_512_1, 8*(R) + 2);  \
    R512(4,1,6,3,0,5,2,7, R_512_2, 8*(R) + 3);  \
    R512(6,1,0,7,2,5,4,3, R_512_3, 8*(R) + 4);  \
    I512(2*(R));                                \
    R512(0,1,2,3,4,5,6,7, R_512_4, 8*(R) + 5);  \
    R512(2,1,4,7,6,5,0,3, R_512_5, 8*(R) + 6);  \
    R512(4,1,6,3,0,5,2,7, R_512_6, 8*(R) + 7);  \
    R512(6,1,0,7,2,5,4,3, R_512_7, 8*(R) + 8);  \
    I512(2*(R) + 1);

        R512_8_rounds(0);
        R512_8_rounds(1);
        R512_8_rounds(2);
        R512_8_rounds(3);
        R512_8_rounds(4);
        R512_8_rounds(5);
        R512_8_rounds(6);
        R512_8_rounds(7);
        R512_8_rounds(8);

        /* feedforward: XOR the input block back in, update chaining vars */
        ctx->X[0] = _mm_xor_si64(X0, w[0]);
        ctx->X[1] = _mm_xor_si64(X1, w[1]);
        ctx->X[2] = _mm_xor_si64(X2, w[2]);
        ctx->X[3] = _mm_xor_si64(X3, w[3]);
        ctx->X[4] = _mm_xor_si64(X4, w[4]);
        ctx->X[5] = _mm_xor_si64(X5, w[5]);
        ctx->X[6] = _mm_xor_si64(X6, w[6]);
        ctx->X[7] = _mm_xor_si64(X7, w[7]);

        /* clear the "first block" flag in the tweak */
        ts[1] = _mm_and_si64(ts[1], _mm_set_pi32(~(((uint32) 64) << 24), ~0));
    } while (--blkCnt);

    ctx->T[0] = ts[0];
    ctx->T[1] = ts[1];
}
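/*
 * The R512 macro above rotates each lane with a shift pair plus OR because
 * MMX has no 64-bit rotate instruction. The idiom in isolation; rot64 is a
 * hypothetical name, and rot must stay in 1..63 so that neither shift count
 * reaches 64:
 */
#include <mmintrin.h>

static __m64 rot64(__m64 x, int rot)
{
    __m64 hi = _m_psllqi(x, rot);       /* bits shifted toward the top */
    __m64 lo = _m_psrlqi(x, 64 - rot);  /* bits that wrap around */
    return _mm_or_si64(hi, lo);
}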
__m64 test_mm_xor_si64(__m64 a, __m64 b) {
    // CHECK-LABEL: test_mm_xor_si64
    // CHECK: call x86_mmx @llvm.x86.mmx.pxor
    return _mm_xor_si64(a, b);
}