Example 1
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  long datasize =   image.xsize * image.ysize * image.csize;
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0); // number of 8-byte __m64 chunks, rounded up
  __m64*leftPix =  (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;

  __m64 l, r, b;
  __m64 mask = _mm_setr_pi8((unsigned char)0x00,
			    (unsigned char)0xFF,
			    (unsigned char)0x00,
			    (unsigned char)0xFF,
			    (unsigned char)0x00,
			    (unsigned char)0xFF,
			    (unsigned char)0x00,
			    (unsigned char)0xFF);
  __m64 zeros = _mm_set1_pi8((unsigned char)0x00);
  //format is U Y V Y (UYVY): the mask selects the two luma (Y) bytes of each 32-bit group
  if (m_direction) {
    // keep the brighter pixel: select right wherever left's luma <= right's luma
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];
      b=_mm_subs_pu8(l, r);        // per-byte max(l-r, 0)
      b=_mm_and_si64(b, mask);     // keep only the Y bytes
      b=_mm_cmpeq_pi32(b, zeros);  // all-ones where left luma <= right luma
      r=_mm_and_si64(r, b);        // right where the mask is set
      l=_mm_andnot_si64(b, l);     // left where the mask is clear

      leftPix[datasize]=_mm_or_si64(l, r); // branchless select
    }
  } else {
    // keep the darker pixel: the same select with the subtraction operands swapped
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];
      b=_mm_subs_pu8(r, l);
      b=_mm_and_si64(b, mask);
      b=_mm_cmpeq_pi32(b, zeros);
      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);

      leftPix[datasize]=_mm_or_si64(l, r);
    }
  }
  _mm_empty();
}
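Both loops implement the same branchless select: the saturating subtraction and pcmpeqd produce an all-ones/all-zeros mask per 32-bit UYVY group, and (r & mask) | (l & ~mask) then picks one of the two source pixels without a branch. The following scalar sketch of one iteration is an illustration only (not part of the original source) and assumes little-endian byte order, so the two Y bytes of a U Y V Y group sit at bit offsets 8 and 24:

#include <stdint.h>

/* scalar equivalent of one loop iteration for a single 32-bit UYVY group;
   sel_brighter corresponds to m_direction in processYUV_MMX */
static uint32_t select_by_luma(uint32_t left, uint32_t right, int sel_brighter)
{
  unsigned lY0 = (left  >>  8) & 0xFF, lY1 = (left  >> 24) & 0xFF;
  unsigned rY0 = (right >>  8) & 0xFF, rY1 = (right >> 24) & 0xFF;
  if (sel_brighter)
    /* subs_pu8(l,r) masked to Y is zero iff both left lumas <= right lumas */
    return (lY0 <= rY0 && lY1 <= rY1) ? right : left;
  else
    return (rY0 <= lY0 && rY1 <= lY1) ? right : left;
}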
Example 2
void pix_background :: processRGBAMMX(imageStruct &image)
{
  long i,pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;

  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
				m_Yrange, m_Urange, m_Vrange, m_Arange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  __m64 newpix, oldpix, m1;

  while(i--){
    /* 7 ops, 3 memory ops */
    /* this does not appear to be any faster than the generic code:
     * even with only the 3 memory ops and a single _mm_subs_pu8()
     * it runs at the same speed, and adding the remaining
     * instructions changes little
     */
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8     (m1, oldpix);
    oldpix= _mm_subs_pu8     (oldpix, newpix);
    m1    = _mm_or_si64      (m1, oldpix); // |oldpix-newpix|
    m1    = _mm_adds_pu8     (m1, offset); // so that thresh=0 works correctly
    m1    = _mm_subs_pu8     (m1, thresh); // m1>thresh -> saturation -> 0
    m1    = _mm_cmpeq_pi32   (m1, _mm_setzero_si64()); // all-ones where |oldpix-newpix|<thresh
    m1    = _mm_andnot_si64(m1, newpix); // keep newpix where it changed, zero elsewhere

    *data++ = m1; 
  }
  _mm_empty();
}
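MMX lacks an unsigned byte comparison, so the loop derives |old - new| from two saturating subtractions OR-ed together, then turns the threshold test into a saturating add/subtract followed by pcmpeqd against zero. A scalar sketch of that per-byte logic (an illustration only, ignoring the saturation corner case at |d| = 255):

#include <stdint.h>

/* |a-b| without overflow: one of the two saturating differences is zero */
static uint8_t absdiff_u8(uint8_t a, uint8_t b)
{
  uint8_t d1 = (a > b) ? (uint8_t)(a - b) : 0; /* _mm_subs_pu8(a, b) */
  uint8_t d2 = (b > a) ? (uint8_t)(b - a) : 0; /* _mm_subs_pu8(b, a) */
  return d1 | d2;
}

/* saturate(|d| + 1 - thresh) == 0 exactly when |d| < thresh; the +1 offset
   makes thresh == 0 classify every pixel as foreground. pcmpeqd then requires
   all four bytes of a 32-bit group to pass for it to count as background. */
static int group_is_background(const uint8_t absdiff[4], const uint8_t thresh[4])
{
  for (int i = 0; i < 4; ++i)
    if (absdiff[i] >= thresh[i])
      return 0; /* some channel moved beyond its range: foreground */
  return 1;     /* all channels within range: background */
}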
Example 3
void pix_background :: processYUVMMX(imageStruct &image)
{
  long pixsize;

  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  
  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    // return;
  }
  m_reset=0;

  int i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
			  m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 black =_mm_set_pi8((unsigned char)0x00,
				 (unsigned char)0x80,
				 (unsigned char)0x00,
				 (unsigned char)0x80,
				 (unsigned char)0x00,
				 (unsigned char)0x80,
				 (unsigned char)0x00,
				 (unsigned char)0x80);

  __m64 newpix, oldpix, m1;

  while(i--){
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8     (m1, oldpix);
    oldpix= _mm_subs_pu8     (oldpix, newpix);
    m1    = _mm_or_si64      (m1, oldpix); // |oldpix-newpix|
    m1    = _mm_adds_pu8     (m1, offset); // to make thresh=0 work correctly
    m1    = _mm_subs_pu8     (m1, thresh);  // m1>thresh -> saturation -> 0
    m1    = _mm_cmpeq_pi32   (m1, _mm_setzero_si64()); // all-ones where |oldpix-newpix|<thresh

    oldpix= black;
    oldpix= _mm_and_si64     (oldpix, m1);  // YUV black where unchanged

    m1    = _mm_andnot_si64  (m1, newpix);  // newpix where changed
    m1    = _mm_or_si64      (m1, oldpix);  // composite foreground over black

    *data++ = m1; 
  }
  _mm_empty();
}
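The YUV variant differs from the RGBA one only in the final step: background groups are overwritten with UYVY black (neutral chroma 0x80, luma 0x00) instead of plain zeros. A sketch of that composite, again an illustration assuming little-endian 32-bit groups:

#include <stdint.h>

/* mask is 0xFFFFFFFF for background groups, 0 for foreground groups */
static uint32_t composite_uyvy(uint32_t newpix, uint32_t mask)
{
  const uint32_t uyvy_black = 0x00800080u; /* bytes U=0x80 Y=0x00 V=0x80 Y=0x00 */
  return (newpix & ~mask) | (uyvy_black & mask);
}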
Example 4
__m64 test86(__m64 a, __m64 b) {
  // CHECK: pcmpeqd
  return _mm_cmpeq_pi32(a, b);
}
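For reference, _mm_cmpeq_pi32 (pcmpeqd on an MMX register) compares the two 32-bit lanes independently and yields all-ones per lane on equality, all-zeros otherwise. A scalar model of the semantics (illustration only):

#include <stdint.h>

static uint64_t cmpeq_pi32_model(uint64_t a, uint64_t b)
{
  uint64_t lo = ((uint32_t)a         == (uint32_t)b)         ? 0xFFFFFFFFull : 0;
  uint64_t hi = ((uint32_t)(a >> 32) == (uint32_t)(b >> 32)) ? 0xFFFFFFFFull : 0;
  return (hi << 32) | lo;
}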
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them;
   it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
   __m128 x = *((__m128 *)xptr);
   __m128 *s = (__m128 *)sptr, *c = (__m128 *)cptr;
   __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
#else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm4 = emm2;

   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

   /* get the polynomial selection mask for the sine */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   mm4 = mm2;
   mm5 = mm3;

   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

   /* get the polynomial selection mask for the sine */

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


   /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
   __m128 z = _mm_mul_ps(x,x);
   y = *(__m128*)_ps_coscof_p0;

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2,ysin2);
   y = _mm_sub_ps(y, ysin1);

   xmm1 = _mm_add_ps(ysin1,ysin2);
   xmm2 = _mm_add_ps(y,y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
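A hypothetical caller (not part of the original source, and assuming v4sfu is the usual union exposing float f[4] that this file pairs with __m128):

#include <math.h>
#include <stdio.h>

int main(void)
{
  v4sfu x, s, c;
  for (int i = 0; i < 4; ++i)
    x.f[i] = (float)i * (float)M_PI / 4.0f;  /* 0, pi/4, pi/2, 3pi/4 */
  sincos_ps(&x, &s, &c);  /* one call produces both sine and cosine */
  for (int i = 0; i < 4; ++i)
    printf("sin(%d*pi/4)=% .5f  cos=% .5f\n", i, s.f[i], c.f[i]);
  return 0;
}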
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   __m128i emm0, emm2;
#else
   __m64 mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);

   /* get the swap sign flag */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynomial selection mask */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);


   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);

   /* get the swap sign flag in mm0:mm1 and the
      polynomial selection mask in mm2:mm3 */

   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);

   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif
   /* The magic pass: "Extended precision modular arithmetic"
      x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}
Example 7
__m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_cmpeq_pi32
  // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d
  return _mm_cmpeq_pi32(a, b);
}