Example No. 1
void pix_invert :: processGrayMMX(imageStruct &image)
{
  int i = (image.xsize * image.ysize) / 8; // 8 pixels at a time
  vector64i offset;
  vector64i *input = (vector64i*)image.data;

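  // fill all eight mask bytes with 0xFF: XORing a byte with 0xFF inverts it (x ^ 0xFF == 255 - x)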
  offset.c[0]=255;
  offset.c[1]=255;
  offset.c[2]=255;
  offset.c[3]=255;
  offset.c[4]=255;
  offset.c[5]=255;
  offset.c[6]=255;
  offset.c[7]=255;

  while (i--) {
    //*((unsigned long *)base) = ~*((unsigned long *)base);
    input[0].v= _mm_xor_si64(input[0].v, offset.v);
    input++;
  }
  _mm_empty();
}
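For comparison, the same all-0xFF mask can be built with a single _mm_set1_pi8 call; a minimal self-contained sketch (the function name, the raw-buffer interface and the multiple-of-8 pixel count are assumptions, not part of the Gem source):

#include <mmintrin.h>
#include <stddef.h>

/* Sketch only: invert 8-bit gray pixels, eight at a time, with one pxor per
 * vector. Assumes npixels is a multiple of 8. */
static void invert_gray8(unsigned char *data, size_t npixels)
{
  __m64 ones = _mm_set1_pi8((char)0xFF);   /* all 64 bits set */
  __m64 *p = (__m64 *)data;
  size_t i = npixels / 8;                  /* 8 pixels per __m64 */

  while (i--) {
    *p = _mm_xor_si64(*p, ones);           /* x ^ 0xFF == 255 - x */
    p++;
  }
  _mm_empty();                             /* leave MMX state before FP code */
}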
Example No. 2
void pix_invert :: processRGBAMMX(imageStruct &image)
{
  int i = (image.xsize * image.ysize) / 2; // 2 pixels at a time
  vector64i offset;
  vector64i *input = (vector64i*)image.data;

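  // per-channel mask for two RGBA pixels: 0xFF inverts R/G/B under XOR, 0x00 leaves alpha unchanged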
  offset.c[0+chRed]=255;
  offset.c[0+chGreen]=255;
  offset.c[0+chBlue]=255;
  offset.c[0+chAlpha]=0;

  offset.c[4+chRed]=255;
  offset.c[4+chGreen]=255;
  offset.c[4+chBlue]=255;
  offset.c[4+chAlpha]=0;

  while (i--) {
    //*((unsigned long *)base) = ~*((unsigned long *)base);
    input[0].v= _mm_xor_si64(input[0].v, offset.v);
    input++;
  }
  _mm_empty();
}
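The two-pixel mask can likewise be produced in one call; a small sketch, assuming the channel order chRed=0, chGreen=1, chBlue=2, chAlpha=3 (Gem resolves these offsets per platform, so the ordering is an assumption):

#include <mmintrin.h>

/* Sketch only: build the two-pixel RGBA inversion mask in one call. With the
 * assumed channel order, the alpha bytes (indices 3 and 7) stay 0x00 and are
 * left untouched by pxor. */
static __m64 rgba_invert_mask(void)
{
  /* _mm_set_pi8 lists bytes from the most significant (index 7) down to index 0 */
  return _mm_set_pi8(0x00, (char)0xFF, (char)0xFF, (char)0xFF,
                     0x00, (char)0xFF, (char)0xFF, (char)0xFF);
}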
Example No. 3
__m64
unsigned_add3 (const __m64 * a, const __m64 * b,
	       __m64 * result, unsigned int count)
{
  __m64 _a, _b, one, sum, carry, onesCarry;

  unsigned int i;

  carry = _mm_setzero_si64 ();

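  /* build the 64-bit constant 1 without a memory load: cmpeq of a register
     with itself gives all ones (-1), and 0 - (-1) == 1 */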
  one = _mm_cmpeq_pi8 (carry, carry);
  one = _mm_sub_si64 (carry, one);

  for (i = 0; i < count; i++)
    {
      _a = a[i];
      _b = b[i];

      sum = _mm_add_si64 (_a, _b);
      sum = _mm_add_si64 (sum, carry);

      result[i] = sum;

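      /* MMX has no 64-bit unsigned compare, so the carry out of the 64-bit
         add is recovered arithmetically: the carry out of bit 0 is the
         majority of the low bits of _a, _b and carry; adding it to
         (_a >> 1) + (_b >> 1) and shifting right by 63 gives the carry out
         of the full sum. */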
      onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry);
      onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry);
      onesCarry = _mm_and_si64 (onesCarry, one);

      _a = _mm_srli_si64 (_a, 1);
      _b = _mm_srli_si64 (_b, 1);

      carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry);
      carry = _mm_srli_si64 (carry, 63);
    }

  return carry;
}
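A small driver for unsigned_add3 above; the operand values, the low-limb-first ordering and the printed result are illustrative assumptions of this sketch, not part of the original test:

#include <mmintrin.h>
#include <stdio.h>

/* Sketch only: add two 128-bit numbers held as two 64-bit limbs each
   (limb 0 = least significant) and report the final carry. */
int main (void)
{
  union { __m64 v; unsigned long long u; } carry;
  __m64 a[2], b[2], r[2];

  a[0] = _mm_set_pi32 (-1, -1);       /* low limb: 2^64 - 1        */
  a[1] = _mm_setzero_si64 ();
  b[0] = _mm_set_pi32 (0, 1);         /* adding 1 overflows limb 0 */
  b[1] = _mm_setzero_si64 ();

  carry.v = unsigned_add3 (a, b, r, 2);
  _mm_empty ();

  printf ("carry out = %llu\n", carry.u);   /* limb 1 absorbs the carry, so 0 */
  return 0;
}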
Example No. 4
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker[5][5];
	__m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
	__m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h,
	    tmph;
	__m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l,
	    tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift, ind, ker_sum = 0;
	mlib_s32 row, wid4, i, j;

	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

	if (pbuff == NULL)
		return (MLIB_FAILURE);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * 2 * wid4;
	}

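	/* prime the row buffers with partial sums of the first four source rows */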
	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (__m64 *) sl;
		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < wid4; i++) {
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < width / 4; i++) {
			CONV_5x5(hi, i);

			dp[i] = rr;
		}

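		/* leftover 1-3 columns: compute a full vector, then merge it into dp[i] under a mask */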
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5(hi, i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

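		/* rotate the row-buffer pointers so the oldest rows are reused for the next output row */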
		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example No. 5
__m64 test56(__m64 a, __m64 b) {
  // CHECK: pxor
  return _mm_xor_si64(a, b);
}
Example No. 6
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,size_t blkCnt,size_t byteCntAdd)
{
    __m64  kw[12];                         /* key schedule words : chaining vars + tweak */
    __m64  X0,X1,X2,X3,X4,X5,X6,X7;        /* local copy of vars, for speed */
    __m64  w[8];                          /* local copy of input block */
    __m64  z1;
    __m64  z3;
    __m64  z5;
    __m64  z7;

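    /* ts[] (tweak words) and ks[] (key schedule words) alias into kw[] via
       macros defined elsewhere in the Skein sources; SKEIN_KS_PARITY is the
       key-schedule parity constant from the same headers. */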
    ts[0] = ctx->T[0];
    ts[1] = ctx->T[1];
    do {
        ts[0] = _mm_add_si64(ts[0],_mm_set_pi32(0,byteCntAdd));

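        /* extended key word: ks[8] = SKEIN_KS_PARITY XOR all eight key words
           (the Threefish key-schedule parity rule) */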
        z1 = SKEIN_KS_PARITY;
        ks[0] = ctx->X[0];
        z1 = _mm_xor_si64(z1,ks[0]);
        ks[1] = ctx->X[1];
        z1 = _mm_xor_si64(z1,ks[1]);
        ks[2] = ctx->X[2];
        z1 = _mm_xor_si64(z1,ks[2]);
        ks[3] = ctx->X[3];
        z1 = _mm_xor_si64(z1,ks[3]);
        ks[4] = ctx->X[4];
        z1 = _mm_xor_si64(z1,ks[4]);
        ks[5] = ctx->X[5];
        z1 = _mm_xor_si64(z1,ks[5]);
        ks[6] = ctx->X[6];
        z1 = _mm_xor_si64(z1,ks[6]);
        ks[7] = ctx->X[7];
        z1 = _mm_xor_si64(z1,ks[7]);
        ks[8] = z1;

        ts[2] = _mm_xor_si64(ts[0],ts[1]);

        X0 = ((__m64 *) blkPtr)[0];
        X1 = ((__m64 *) blkPtr)[1];
        X2 = ((__m64 *) blkPtr)[2];
        X3 = ((__m64 *) blkPtr)[3];
        X4 = ((__m64 *) blkPtr)[4];
        X5 = ((__m64 *) blkPtr)[5];
        X6 = ((__m64 *) blkPtr)[6];
        X7 = ((__m64 *) blkPtr)[7];

        w[0] = X0;
        w[1] = X1;
        w[2] = X2;
        w[3] = X3;
        w[4] = X4;
        w[5] = X5;
        w[6] = X6;
        w[7] = X7;

        X0 = _mm_add_si64(X0,ks[0]);
        X1 = _mm_add_si64(X1,ks[1]);
        X2 = _mm_add_si64(X2,ks[2]);
        X3 = _mm_add_si64(X3,ks[3]);
        X4 = _mm_add_si64(X4,ks[4]);
        X5 = _mm_add_si64(X5,_mm_add_si64(ks[5],ts[0]));
        X6 = _mm_add_si64(X6,_mm_add_si64(ks[6],ts[1]));
        X7 = _mm_add_si64(X7,ks[7]);

        blkPtr += 64;

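/* R512: one Threefish-512 MIX layer on four word pairs -- 64-bit add, rotate
   the odd word left by the ROT constant (built from two shifts and an OR,
   since MMX has no rotate), then XOR it into the even word. */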
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    X##p0 = _mm_add_si64(X##p0,X##p1); \
      X##p2 = _mm_add_si64(X##p2,X##p3); \
        X##p4 = _mm_add_si64(X##p4,X##p5); \
          X##p6 = _mm_add_si64(X##p6,X##p7); \
    z1 = X##p1; \
    X##p1 = _m_psrlqi(X##p1,64-ROT##_0); \
    z1 = _m_psllqi(z1,ROT##_0); \
    X##p1 = _mm_or_si64(X##p1,z1); \
      z3 = X##p3; \
      X##p3 = _m_psrlqi(X##p3,64-ROT##_1); \
      z3 = _m_psllqi(z3,ROT##_1); \
      X##p3 = _mm_or_si64(X##p3,z3); \
        z5 = X##p5; \
        z5 = _m_psllqi(z5,ROT##_2); \
        X##p5 = _m_psrlqi(X##p5,64-ROT##_2); \
        X##p5 = _mm_or_si64(X##p5,z5); \
          z7 = X##p7; \
          X##p7 = _m_psrlqi(X##p7,64-ROT##_3); \
          z7 = _m_psllqi(z7,ROT##_3); \
          X##p7 = _mm_or_si64(X##p7,z7); \
    X##p1 = _mm_xor_si64(X##p1,X##p0); \
      X##p3 = _mm_xor_si64(X##p3,X##p2); \
        X##p5 = _mm_xor_si64(X##p5,X##p4); \
          X##p7 = _mm_xor_si64(X##p7,X##p6); \

#define I512(R)                                                     \
    X0 = _mm_add_si64(X0,ks[((R)+1) % 9]);   /* inject the key schedule value */  \
    X1 = _mm_add_si64(X1,ks[((R)+2) % 9]);                                        \
    X2 = _mm_add_si64(X2,ks[((R)+3) % 9]);                                        \
    X3 = _mm_add_si64(X3,ks[((R)+4) % 9]);                                        \
    X4 = _mm_add_si64(X4,ks[((R)+5) % 9]);                                        \
    X5 = _mm_add_si64(X5,_mm_add_si64(ks[((R)+6) % 9],ts[((R)+1) % 3]));          \
    X6 = _mm_add_si64(X6,_mm_add_si64(ks[((R)+7) % 9],ts[((R)+2) % 3]));          \
    X7 = _mm_add_si64(X7,_mm_add_si64(ks[((R)+8) % 9],_mm_set_pi32(0,(R)+1)));     \

#define R512_8_rounds(R) \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);

        R512_8_rounds( 0);
        R512_8_rounds( 1);
        R512_8_rounds( 2);
        R512_8_rounds( 3);
        R512_8_rounds( 4);
        R512_8_rounds( 5);
        R512_8_rounds( 6);
        R512_8_rounds( 7);
        R512_8_rounds( 8);

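        /* Skein feed-forward: XOR the original input block back into the
           chaining variables */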
        ctx->X[0] = _mm_xor_si64(X0,w[0]);
        ctx->X[1] = _mm_xor_si64(X1,w[1]);
        ctx->X[2] = _mm_xor_si64(X2,w[2]);
        ctx->X[3] = _mm_xor_si64(X3,w[3]);
        ctx->X[4] = _mm_xor_si64(X4,w[4]);
        ctx->X[5] = _mm_xor_si64(X5,w[5]);
        ctx->X[6] = _mm_xor_si64(X6,w[6]);
        ctx->X[7] = _mm_xor_si64(X7,w[7]);

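        /* clear the SKEIN_T1_FLAG_FIRST bit of the tweak after the first block */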
        ts[1] = _mm_and_si64(ts[1],_mm_set_pi32(~(((uint32)  64 ) << 24),~0));
    } while (--blkCnt);
    ctx->T[0] = ts[0];
    ctx->T[1] = ts[1];
}
Example No. 7
__m64 test_mm_xor_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_xor_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.pxor
  return _mm_xor_si64(a, b);
}
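For completeness, a minimal standalone use of the intrinsic outside a compiler test; the input values and the expected output are purely illustrative:

#include <mmintrin.h>
#include <stdio.h>

/* Sketch only: _mm_xor_si64 computes the bitwise XOR of two 64-bit MMX
 * values (a single pxor, as the tests above check). */
int main(void)
{
  union { __m64 v; unsigned long long u; } r;
  __m64 a = _mm_set_pi32(0x00FF00FF, 0x0F0F0F0F);
  __m64 b = _mm_set_pi32(-1, 0);        /* high 32 bits all ones */

  r.v = _mm_xor_si64(a, b);
  _mm_empty();                          /* clear MMX state before x87/FPU use */

  printf("%016llx\n", r.u);             /* expect ff00ff000f0f0f0f */
  return 0;
}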