unsigned int interpolhline_mmx_2(unsigned char* image){

	__m64 mm_A = _mm_set_pi16(image[1],image[0],image[-1],image[-2]); 
	__m64 mm_B = _mm_set_pi16(image[2],image[1],image[0],image[-1]); 
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_E = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_F = _mm_set_pi16(image[6],image[5],image[4],image[3]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	
	__m64 result =_mm_packs_pu16(mm_clip1,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(result);

	 empty(); 
	return ret;

}
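For reference, the vector code above evaluates the H.264-style 6-tap half-pel filter, (A + F + 16 + 20*(C + D) - 5*(B + E)) >> 5 clipped to [0,255], for four adjacent output positions at once. A minimal scalar sketch of one output sample (the helper name is hypothetical, for illustration only):

static unsigned char halfpel_h(const unsigned char *image)
{
    /* image points at the output position, as in interpolhline_mmx_2 */
    int A = image[-2], B = image[-1], C = image[0],
        D = image[1],  E = image[2],  F = image[3];
    int v = (A + F + 16 + 5 * (((C + D) << 2) - (B + E))) >> 5;
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}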
 __m64 interpolhline64_mmx(unsigned char* image){

 
	__m64 mm_A = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_B = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_C = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_D = _mm_set_pi16(image[6],image[5],image[4],image[3]);
	__m64 mm_E = _mm_set_pi16(image[7],image[6],image[5],image[4]);
	__m64 mm_F = _mm_set_pi16(image[8],image[7],image[6],image[5]);


	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_AF);//A + F + (((C + D) << 2)-(B + E))*5
	 

	return(mm_result);


}
unsigned int interpolvline_mmx_3(unsigned char* image,	int PicWidthInPix){

	__m64 mm_A = _mm_set_pi16(image[-2 * PicWidthInPix + 3],image[-2 * PicWidthInPix + 2],image[-2 * PicWidthInPix + 1],image[-2 * PicWidthInPix]); 
	__m64 mm_B = _mm_set_pi16(image[-1 * PicWidthInPix + 3],image[-1 * PicWidthInPix + 2],image[-1 * PicWidthInPix + 1],image[-1 * PicWidthInPix]); 
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[1 * PicWidthInPix + 3],image[1 * PicWidthInPix + 2],image[1 * PicWidthInPix + 1],image[1 * PicWidthInPix]);
	__m64 mm_E = _mm_set_pi16(image[2 * PicWidthInPix + 3],image[2 * PicWidthInPix + 2],image[2 * PicWidthInPix + 1],image[2 * PicWidthInPix]);
	__m64 mm_F = _mm_set_pi16(image[3 * PicWidthInPix + 3],image[3 * PicWidthInPix + 2],image[3 * PicWidthInPix + 1],image[3 * PicWidthInPix]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	
	
	__m64 test =  _mm_avg_pu8(mm_clip1,mm_D);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	__m64 test1 =_mm_packs_pu16(test,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(test1);

	empty(); 
	return ret;

}
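The filter body here is the same as the plain vertical case; the extra _mm_avg_pu8 at the end rounds the clipped half-pel value together with the integer sample one row below (the mm_D row), i.e. a quarter-pel output. Because both operands' 16-bit lanes are already within [0,255] at that point, the byte-wise average coincides with the intended per-sample average. A scalar sketch of that last step (hypothetical names):

/* halfpel_v: the clipped 6-tap vertical result for this column,
   d: image[1 * PicWidthInPix + i], the sample one row below. */
static unsigned char quarterpel(int halfpel_v, int d)
{
    return (unsigned char)((halfpel_v + d + 1) >> 1); /* per-lane effect of _mm_avg_pu8 */
}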
Example #4
void weak_horizontal_chroma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha	, const unsigned char beta, const unsigned char tc0)
{
	
	__m64 mp1 = _mm_set_pi16( 0,0,pix[-2*xstride + 1], pix[-2*xstride]); 
	__m64 mp0 = _mm_set_pi16( 0,0,pix[-1*xstride + 1], pix[-1*xstride]); 
	__m64 mq0 = _mm_set_pi16( 0,0,pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16( 0,0,pix[xstride + 1], pix[xstride]);
	

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); //abs(q0 - p0)
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); //abs(p1 - p0)
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //abs(q1 - q0)

	//To calculate the mask
 	__m64 malpha = _mm_set_pi16(0,0,alpha,alpha);
	__m64 malphab = _mm_set_pi16(0,0,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(0,0,beta,beta);
	__m64 mbetab = _mm_set_pi16(0,0,-beta,-beta);

	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2);


	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0) 
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 +  inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);



	//Clip3
	__m64 m_tc0 = _mm_set_pi16(0,0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(0,0,-tc0,-tc0);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

	__m64 result_p0 = _mm_add_pi16(merror2,mp0); //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);//_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);
	__m64 mrshift = _mm_set_pi16(0,0,0,1);

	*((unsigned short* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned short* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));


	empty();
}
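In scalar terms this is the weak (one-pixel) chroma deblocking update: if the edge passes the alpha/beta tests, a clipped delta is added to p0 and subtracted from q0. A minimal sketch for one sample pair (illustrative helper names; the MMX code handles two samples at once and clips to [0,255] via the saturating pack):

#include <stdlib.h>   /* abs */

static int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

static void weak_chroma_sample(unsigned char *pix, int xstride,
                               int alpha, int beta, int tc0)
{
    int p1 = pix[-2 * xstride], p0 = pix[-xstride];
    int q0 = pix[0],            q1 = pix[xstride];

    if (abs(q0 - p0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta) {
        int delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;
        if (delta < -tc0) delta = -tc0;   /* CLIP3(-tc0, tc0, delta) */
        else if (delta > tc0) delta = tc0;
        pix[-xstride] = (unsigned char)clip_u8(p0 + delta);
        pix[0]        = (unsigned char)clip_u8(q0 - delta);
    }
}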
Example #5
int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int32_t satd;
  __m64 sums;
  __m64 a;
  __m64 b;
  __m64 c;
  __m64 d;
  a = od_load_convert_subtract_x4(src + 0*systride, ref + 0*rystride);
  b = od_load_convert_subtract_x4(src + 1*systride, ref + 1*rystride);
  c = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride);
  d = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride);
  /*Vertical 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_transpose16x4(&a, &b, &c, &d);
  /*Horizontal 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  /*Use the fact that (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) to merge the
     final butterfly stage with the calculating the absolute values and the
     first stage of accumulation.
    Calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    An offset must be added to the final sum before rounding to account for
     subtracting 0x7FFF.*/
  a = _mm_sub_pi16(_mm_max_pi16(a, b), _mm_adds_pi16(_mm_add_pi16(a, b),
   _mm_set1_pi16(0x7FFF)));
  c = _mm_sub_pi16(_mm_max_pi16(c, d), _mm_adds_pi16(_mm_add_pi16(c, d),
   _mm_set1_pi16(0x7FFF)));
  /*Take the sum of all the absolute values.*/
  sums = _mm_add_pi16(a, c);
  /*Sum the elements of the vector.*/
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(0, 1, 2, 3)));
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(2, 3, 0, 1)));
  sums = _mm_unpacklo_pi16(sums, _mm_setzero_si64());
  satd = _mm_cvtsi64_si32(sums);
  /*Subtract the offset (8) and round.*/
  satd = (satd + 1 - 8) >> 1;
#if defined(OD_CHECKASM)
  {
    int32_t c_satd;
    c_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride);
    if (satd != c_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
       4, 4, satd, c_satd);
    }
  }
#endif
  return satd;
}
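The identity quoted in the comment is standard algebra: assume without loss of generality that |a| >= |b|. If a and b share a sign (or b = 0), then |a+b| = |a| + |b| and |a-b| = |a| - |b|; with opposite signs the two expressions swap. Either way,

    (|a+b| + |a-b|) / 2 = max(|a|, |b|),

which is why the last butterfly stage, the absolute values and the first accumulation step can be fused as above.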
Example #6
unsigned int mmx_hash_bucket_data(unsigned char *key, int size, int NoOfItems)
{
	char *p, *end;
	__m64 v1, v2, s;
	int val;

	if (size < 8) return(fnv_data2bucket(key, size, NoOfItems));

	p=key;
	end=key+size;
	_mm_empty();                            // emms
	v1=_mm_set1_pi32(FNV_INIT_VAL);

	while ((end-p) > 7)
	{
		v2=_mm_setr_pi32(*p,*(p+4));
		v1=_mm_add_pi16(v1, v2);
		v1=_mm_slli_pi32(v1, 3);
		p+=8;
	}

	val=_mm_cvtsi64_si32(v1);
	_mm_empty();                            // emms

	if (val < 0) val=1-val;
	val = val % NoOfItems;
	return(val);
}
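A hypothetical call site (key, length and bucket count are illustrative; strlen is from <string.h>):

/* distribute a string key across 1024 buckets */
const char *key = "example-key-longer-than-8-bytes";
int bucket = mmx_hash_bucket_data((unsigned char *)key, (int)strlen(key), 1024);
/* keys shorter than 8 bytes take the fnv_data2bucket() fallback above */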
Example #7
long dotp(short a[], short b[])
{
  int i;
  __m64 mm0, mm1, mm2, mm3, mm4;
  short suml[4];   // don't init sum from C - this confuses the GCC!
  short sumh[4];
  
  /* mmx - Intel Pentium-MMX and above */

  mm2 = _m_psubw(mm2, mm2);   // set mm2 to 0
  mm4 = _m_psubw(mm4, mm4);
  for (i = 0; i < NLMS_LEN; i += 4, a += 4, b += 4) {
    mm0 = _m_from_WORDs(a);
    mm3 = mm0;
    mm1 = _m_from_WORDs(b);
    
    /* Intel notation: first operand is destination */
    /* GNU as notation: first operand is source */
    // mm0 = _mm_mullo_pi16 (mm0, mm1);
    mm3 = _mm_mulhi_pi16 (mm3, mm1);
    // mm2 = _mm_add_pi16(mm2, mm0);
    mm4 = _mm_add_pi16(mm4, mm3);
  }
  _m_from_WORDs(suml) = mm2;
  _m_from_WORDs(sumh) = mm4;
  _mm_empty();
  return suml[0] + suml[1] + suml[2] + suml[3] 
   + 65536 * (sumh[0] + sumh[1] + sumh[2] + sumh[3]);
}
__m64 interpolhline64_3_mmx(__m64* temp){


	__m64 res,res1;
	__m64 ptr = _mm_setzero_si64();
	__m64 mm_16 = _mm_set_pi16(16,16,16,16);
	
	short A = _mm_extract_pi16(temp[0],0);
	short B = _mm_extract_pi16(temp[1],0);
	short C = _mm_extract_pi16(temp[2],0);
	short D = _mm_extract_pi16(temp[3],0);
	short E = _mm_extract_pi16(temp[4],0);
	short F = _mm_extract_pi16(temp[5],0);

	unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),0);

	 A = _mm_extract_pi16(temp[0],1);
	 B = _mm_extract_pi16(temp[1],1);
	 C = _mm_extract_pi16(temp[2],1);
	 D = _mm_extract_pi16(temp[3],1);
	 E = _mm_extract_pi16(temp[4],1);
	 F = _mm_extract_pi16(temp[5],1);

	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),1);

	 A = _mm_extract_pi16(temp[0],2);
	 B = _mm_extract_pi16(temp[1],2);
	 C = _mm_extract_pi16(temp[2],2);
	 D = _mm_extract_pi16(temp[3],2);
	 E = _mm_extract_pi16(temp[4],2);
	 F = _mm_extract_pi16(temp[5],2);

	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),2);

	 A = _mm_extract_pi16(temp[0],3);
	 B = _mm_extract_pi16(temp[1],3);
	 C = _mm_extract_pi16(temp[2],3);
	 D = _mm_extract_pi16(temp[3],3);
	 E = _mm_extract_pi16(temp[4],3);
	 F = _mm_extract_pi16(temp[5],3);

	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),3);
	

	res = _mm_add_pi16(temp[3],mm_16);
	res1 = _mm_srai_pi16(res,5);
	res1 = _mm_max_pi16(res1,_mm_set_pi16(0,0,0,0));
	res1 = _mm_min_pi16(res1,_mm_set_pi16(255,255,255,255)); //Clip

	res =  _mm_avg_pu16(ptr,res1);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	
	return res;

}
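This variant works lane by lane because it applies the second 6-tap pass to 16-bit intermediates from a first pass (of the interpolhline64_mmx kind above, which deliberately omits the +16 / >> 5 normalisation); the term 20 * (C + D) alone can reach roughly 4e5, well outside the signed 16-bit range, so each lane is extracted and filtered with 32-bit scalar arithmetic:

    out = Clip255( (A + F - 5*(B + E) + 20*(C + D) + 512) >> 10 )

CLIP255_16 is assumed to clamp the normalised value to [0,255]. The final _mm_avg_pu16 then rounds this together with the normalised D-row value, (temp[3] + 16) >> 5 clipped, which corresponds to one of the quarter-pel positions.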
Example #9
OD_SIMD_INLINE void od_mc_butterfly_2x2_16x4(__m64 *t0, __m64 *t1,
 __m64 *t2, __m64 *t3) {
  __m64 a;
  __m64 b;
  __m64 c;
  __m64 d;
  /*a = t0 + t1, c = (t0 + t1) - (t1 + t1) = t0 - t1
    b = t2 + t3, d = (t2 + t3) - (t3 + t3) = t2 - t3*/
  a = _mm_add_pi16(*t0, *t1);
  c = _mm_add_pi16(*t1, *t1);
  c = _mm_sub_pi16(a, c);
  b = _mm_add_pi16(*t2, *t3);
  d = _mm_add_pi16(*t3, *t3);
  d = _mm_sub_pi16(b, d);
  *t0 = a;
  *t1 = b;
  *t2 = c;
  *t3 = d;
}
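Composed twice, as the SATD routine above does for its vertical pass, this produces every +/- combination of the four inputs (a 4-point Hadamard transform, up to output ordering):

    1st call: (t0+t1, t2+t3, t0-t1, t2-t3)
    2nd call: (t0+t1+t2+t3, t0-t1+t2-t3, t0+t1-t2-t3, t0-t1-t2+t3)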
Example #10
//n_2
 __m64 interpolvline64_mmx(unsigned char* image, const unsigned short PicWidthInPix){

 
	__m64 mm_A = _mm_set_pi16(image[1 * PicWidthInPix],image[0],image[-1 * PicWidthInPix],image[-2 * PicWidthInPix]); 
	__m64 mm_B = _mm_set_pi16(image[2 * PicWidthInPix],image[1 * PicWidthInPix],image[0],image[-1 * PicWidthInPix]); 
	__m64 mm_C = _mm_set_pi16(image[3 * PicWidthInPix],image[2 * PicWidthInPix],image[1 * PicWidthInPix],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[4 * PicWidthInPix],image[3 * PicWidthInPix],image[2 * PicWidthInPix],image[1 * PicWidthInPix]);
	__m64 mm_E = _mm_set_pi16(image[5 * PicWidthInPix],image[4 * PicWidthInPix],image[3 * PicWidthInPix],image[2 * PicWidthInPix]);
	__m64 mm_F = _mm_set_pi16(image[6 * PicWidthInPix],image[5 * PicWidthInPix],image[4 * PicWidthInPix],image[3 * PicWidthInPix]);


	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_AF);//A + F + (((C + D) << 2)-(B + E))*5

	 empty();  
	return(mm_result);
}
Example #11
__m64 test34(__m64 a, __m64 b) {
  // CHECK: paddw
  return _mm_add_pi16(a, b);
}
Example #12
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to YV12.
VOID Yuy2ToYv12_mmx(PBYTE pbDstY, PBYTE pbDstU, PBYTE pbDstV, INT iDstYStride, INT iDstUvStride,
					PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iSrcXDif;
	INT iDstYDif;
	INT iDstUvDif;
	M64 m0, m1, m2, m3, m4, m5, m6, m7;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}

	iSrcXDif = iSrcXStride - (uWidth * 2);
	iDstYDif = iDstYStride - uWidth;
	iDstUvDif = iDstUvStride - (uWidth / 2);

	m7 = g_mWord00FF;
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			m0 = ((PM64) pbSrcX)[0];
			m1 = ((PM64) pbSrcX)[1];
			m2 = ((PM64) (pbSrcX + iSrcXStride))[0];
			m3 = ((PM64) (pbSrcX + iSrcXStride))[1];

			m4 = m0;
			m5 = m2;

			m4 = _mm_srli_pi16(m4, 8);
			m5 = _mm_srli_pi16(m5, 8);

			m4 = _mm_and_si64(m4, m7);
			m5 = _mm_and_si64(m5, m7);

			m4 = _mm_add_pi16(m4, m5);

			m5 = m1;
			m6 = m3;

			m5 = _mm_srli_pi16(m5, 8);
			m6 = _mm_srli_pi16(m6, 8);
			m5 = _mm_and_si64(m5, m7);
			m6 = _mm_and_si64(m6, m7);

			m5 = _mm_add_pi16(m5, m6);

			m4 = _mm_add_pi16(m4, g_mWord0001);
			m5 = _mm_add_pi16(m5, g_mWord0001);

			m4 = _mm_srli_pi16(m4, 1);
			m5 = _mm_srli_pi16(m5, 1);

			m0 = _mm_and_si64(m0, m7);
			m1 = _mm_and_si64(m1, m7);
			m2 = _mm_and_si64(m2, m7);
			m3 = _mm_and_si64(m3, m7);

			m0 = _mm_packs_pu16(m0, m1);
			m2 = _mm_packs_pu16(m2, m3);

			((PM64) pbDstY)[0] = m0;
			((PM64) (pbDstY + iDstYStride))[0] = m2;

			m4 = _mm_packs_pu16(m4, m5);
			m5 = m4;

			m4 = _mm_srli_si64(m4, 8);

			m5 = _mm_and_si64(m5, m7);
			m4 = _mm_and_si64(m4, m7);

			m5 = _mm_packs_pu16(m5, m5);
			m4 = _mm_packs_pu16(m4, m4);

			((PDWORD) pbDstU)[0] = _mm_cvtsi64_si32(m5);
			((PDWORD) pbDstV)[0] = _mm_cvtsi64_si32(m4);

			pbSrcX += 16;
			pbDstY += 8;
			pbDstU += 4;
			pbDstV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			pbDstY[0] = pbSrcX[0];
			pbDstU[0] = (pbSrcX[1] + pbSrcX[iSrcXStride + 1] + 1) / 2;
			pbDstY[1] = pbSrcX[2];
			pbDstV[0] = (pbSrcX[3] + pbSrcX[iSrcXStride + 3] + 1) / 2;

			pbDstY[iDstYStride + 0] = pbSrcX[iSrcXStride + 0];
			pbDstY[iDstYStride + 1] = pbSrcX[iSrcXStride + 2];

			pbSrcX += 4;
			pbDstY += 2;
			pbDstU++;
			pbDstV++;
		}

		pbSrcX += iSrcXDif + iSrcXStride;
		pbDstY += iDstYDif + iDstYStride;
		pbDstU += iDstUvDif;
		pbDstV += iDstUvDif;
	}

	_mm_empty();
}
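The inner loop relies on the YUY2 byte order, which per group of eight source bytes is:

    byte:  0   1   2   3   4   5   6   7
    YUY2:  Y0  U0  Y1  V0  Y2  U1  Y3  V1

Masking with g_mWord00FF (presumably 0x00FF in every 16-bit lane) keeps the luma bytes, while the shift-by-8 plus mask isolates the interleaved U/V bytes; those are averaged between the two source rows with the +1 rounding constant, packed back to bytes, and split into the separate U and V planes that YV12 requires.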
Example #13
/*  Fill a surface with a gradient which is generated by bilinearly 
    interpolating between four corner color values.  Can take
    a source surface and multiply it into the gradient, but if 
    'src' is NULL, it will generate the gradient without multiplying  */
static void fillBlend(
    SDL_Surface *dst, SDL_Surface *src, BROGUE_DRAW_COLOR *color)
{
    int x, y;
    int lr, lg, lb, rr, rg, rb;
    int ldr, ldg, ldb, rdr, rdg, rdb;
    int w, h;
    BROGUE_DRAW_COLOR ul = color[0];
    BROGUE_DRAW_COLOR ur = color[1];
    BROGUE_DRAW_COLOR bl = color[2];
    BROGUE_DRAW_COLOR br = color[3];
#if defined(__MMX__)
    int mmx = SDL_HasMMX();
#endif
    
    w = dst->w;
    h = dst->h;

    if (src != NULL)
    {
	assert(dst->w == src->w);
	assert(dst->h == src->h);
    }

    lr = clamp(ul.red * 0xFFFF, 0, 0xFFFF);
    lg = clamp(ul.green * 0xFFFF, 0, 0xFFFF);
    lb = clamp(ul.blue * 0xFFFF, 0, 0xFFFF);

    rr = clamp(ur.red * 0xFFFF, 0, 0xFFFF);
    rg = clamp(ur.green * 0xFFFF, 0, 0xFFFF);
    rb = clamp(ur.blue * 0xFFFF, 0, 0xFFFF);

    ldr = (clamp(bl.red * 0xFFFF, 0, 0xFFFF) - lr) / h;
    ldg = (clamp(bl.green * 0xFFFF, 0, 0xFFFF) - lg) / h;
    ldb = (clamp(bl.blue * 0xFFFF, 0, 0xFFFF) - lb) / h;

    rdr = (clamp(br.red * 0xFFFF, 0, 0xFFFF) - rr) / h;
    rdg = (clamp(br.green * 0xFFFF, 0, 0xFFFF) - rg) / h;
    rdb = (clamp(br.blue * 0xFFFF, 0, 0xFFFF) - rb) / h;

    for (y = 0; y < h; y++)
    {
	unsigned char *pix;
	int dr, dg, db;
	int rpp, gpp, bpp, raccum, gaccum, baccum;

	pix = (unsigned char *)dst->pixels + dst->pitch * y;

	dr = rr - lr;
	dg = rg - lg;
	db = rb - lb;

	rpp = dr / w;
	gpp = dg / w;
	bpp = db / w;

	raccum = lr;
	gaccum = lg;
	baccum = lb;

	lr += ldr;
	lg += ldg;
	lb += ldb;

	rr += rdr;
	rg += rdg;
	rb += rdb;

	if (src != NULL)
	{
	    unsigned char *src_pix = (unsigned char *)src->pixels 
		+ src->pitch * y;
	    x = w;

#if defined(__MMX__)
	    /*  MMX is significantly faster.  Use it if the CPU supports it  */
	    if (mmx)
	    {
		__m64 mmx_zero = _m_from_int(0);

		long long ll_color = 
		    ((long long)0xFFFF << 48)
		    | ((long long)raccum << 32)
		    | ((long long)gaccum << 16)
		    | ((long long)baccum);
		__m64 mmx_color = *(__m64 *)&ll_color;

		long long ll_pp = 
		    ((long long)(rpp & 0xFFFF) << 32)
		    | ((long long)(gpp & 0xFFFF) << 16)
		    | ((long long)(bpp & 0xFFFF));
		__m64 mmx_pp = *(__m64 *)&ll_pp;

		while (x >= 2)
		{
		    __m64 src_pair = *(__m64 *)src_pix;
		
		    /*  Separate the left pixel and right pixel  */
		    __m64 left_pix = _mm_unpacklo_pi8(src_pair, mmx_zero);
		    __m64 right_pix = _mm_unpackhi_pi8(src_pair, mmx_zero);
		
		    /*  Multiply the left source by the gradient color  */
		    left_pix = _mm_mullo_pi16(left_pix, 
					      _mm_srli_pi16(mmx_color, 8));
		    /*  Advance the gradient color for the next pixel */
		    mmx_color = _mm_add_pi16(mmx_color, mmx_pp);

		    /*  Multiply the right source by the gradient color  */
		    right_pix = _mm_mullo_pi16(right_pix,
					       _mm_srli_pi16(mmx_color, 8));
		    /*  Advance the gradient  */
		    mmx_color = _mm_add_pi16(mmx_color, mmx_pp); 

		    /*  Recombine the pixels  */
		    __m64 result_pix = _mm_packs_pu16(
			_mm_srli_pi16(left_pix, 8),
			_mm_srli_pi16(right_pix, 8));
		    
		    *(__m64 *)pix = result_pix;

		    src_pix += 8;
		    pix += 8;
		    x -= 2;
		}

		/*  Extract the accumulated gradient value for the potential
		    odd remaining pixel  */
		short *s_color = (short *)&mmx_color;
		raccum = s_color[2];
		gaccum = s_color[1];
		baccum = s_color[0];
	    }
#endif

	    /*  The equivalent slow loop for odd pixels or CPUs without MMX  */
	    while (x > 0)
	    {
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
		pix[3] = src_pix[3];
		pix[2] = (src_pix[2] * raccum) >> 16;
		pix[1] = (src_pix[1] * gaccum) >> 16;
		pix[0] = (src_pix[0] * baccum) >> 16;
#else
		pix[0] = src_pix[0];
		pix[1] = (src_pix[1] * raccum) >> 16;
		pix[2] = (src_pix[2] * gaccum) >> 16;
		pix[3] = (src_pix[3] * baccum) >> 16;
#endif

		raccum += rpp;
		gaccum += gpp;
		baccum += bpp;

		src_pix += 4;
		pix += 4;
		x--;
	    }
	}
	else
	{
Example #14
//mbl test with Pocket_PC		
void weak_horizontal_luma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0){

	__m64 mp2 = _mm_set_pi16(pix[-3*xstride + 3], pix[-3*xstride + 2], pix[-3*xstride + 1], pix[-3*xstride]);
	__m64 mp1 = _mm_set_pi16(pix[-2*xstride + 3], pix[-2*xstride + 2], pix[-2*xstride + 1], pix[-2*xstride]); 
	__m64 mp0 = _mm_set_pi16(pix[-1*xstride + 3], pix[-1*xstride + 2], pix[-1*xstride + 1], pix[-1*xstride]); 
	__m64 mq0 = _mm_set_pi16(pix[3], pix[2], pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16(pix[xstride + 3], pix[xstride + 2], pix[xstride + 1], pix[xstride]);
	__m64 mq2 = _mm_set_pi16(pix[2*xstride + 3], pix[2*xstride + 2], pix[2*xstride + 1], pix[2*xstride]);
	
	__m64 mrshift = _mm_set_pi16(0,0,0,1);
	__m64 maddp0_q0 =  _mm_avg_pu8(mp0,mq0); //addp0_q0 = (p0 + q0 + 1) >> 1;
	__m64 maddp2 = _mm_add_pi16(maddp0_q0,mp2); //addp2 = (p2 + addp0_q0);
	__m64 maddq2 = _mm_add_pi16(maddp0_q0,mq2); //addq2 = (q2 + addp0_q0);
	__m64 maddp2_s = _mm_srl_pi16(maddp2,mrshift); //addp2 = (p2 + addp0_q0) >> 1;
	__m64 maddq2_s = _mm_srl_pi16(maddq2,mrshift); //addq2 = (q2 + addp0_q0) >> 1;
	__m64 mp1_c = _mm_sub_pi16(maddp2_s, mp1); //addp2 - p1
	__m64 mq1_c = _mm_sub_pi16(maddq2_s, mq1); // addq2 - q1



	//To calculate the mask
 	__m64 malpha = _mm_set_pi16(alpha,alpha,alpha,alpha);
	__m64 malphab = _mm_set_pi16(-alpha,-alpha,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(beta,beta,beta,beta);
	__m64 mbetab = _mm_set_pi16(-beta,-beta,-beta,-beta);
	

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); //abs(q0 - p0)
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); //abs(p1 - p0)
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //abs(q1 - q0)
	__m64 mdiff_p2_p0 = _mm_sub_pi16(mp2,mp0); //abs(p2 - p0 ))
	__m64 mdiff_q2_q0 = _mm_sub_pi16(mq2,mq0); //abs(q2 - q0)

 	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
	__m64 mask3 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p2_p0), _mm_cmpgt_pi16(mdiff_p2_p0,mbetab));
	__m64 mask4 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q2_q0), _mm_cmpgt_pi16(mdiff_q2_q0,mbetab));
	
	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2); //(abs(q0 - p0) < alpha) && (abs(p1 - p0) < beta) && (abs(q1 - q0) < beta)
	__m64 second_mask = _mm_and_si64 (first_mask,mask3);
	__m64 third_mask = _mm_and_si64 (first_mask,mask4);


	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0) 
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 +  inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);



	//Clip3
	__m64 m_tc0 = _mm_set_pi16(tc0,tc0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(-tc0,-tc0,-tc0,-tc0);
	__m64 mres_c1 = _mm_min_pi16(_mm_max_pi16(mp1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addp2 - p1 );
	__m64 mres_c2 = _mm_min_pi16(_mm_max_pi16(mq1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addq2 - q1 );
	__m64 merror0 = _mm_and_si64 (mres_c1,second_mask);
	__m64 merror1 = _mm_and_si64 (mres_c2,third_mask);


	__m64 m_1 = _mm_set_pi16(1,1,1,1);
	__m64 m_and1 = _mm_and_si64 (mask3, m_1); //tc++; if abs( p2 - p0 ) < beta 
	__m64 m_and2 = _mm_and_si64 (mask4, m_1); //tc++; if abs( q2 - q0  ) < beta 
	__m64 m_tc = _mm_add_pi16(m_and2,_mm_add_pi16(m_tc0,m_and1));
	__m64 m_tcn =_mm_sub_pi16(_mm_sub_pi16(m_tcb0,m_and1),m_and2);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcn),m_tc); //CLIP3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3), with tc = tc0 + mask3 + mask4
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);


	__m64 result_p1 = _mm_add_pi16(merror0,mp1); //_mm_shuffle_pi16(_mm_add_pi16(merror0,mp1), 0x1B);
	__m64 result_q1 = _mm_add_pi16(merror1,mq1); //_mm_shuffle_pi16(_mm_add_pi16(merror1,mq1), 0x1B);
	__m64 result_p0 = _mm_add_pi16(merror2,mp0); //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);//_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);

	*((unsigned int* )(&pix[-2*xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p1,mrshift));
	*((unsigned int* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned int* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));
	*((unsigned int* )(&pix[xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q1,mrshift));

	empty();  
}
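Relative to the chroma sketch earlier, the luma path adds the p1/q1 correction terms (gated by abs(p2 - p0) < beta and abs(q2 - q0) < beta, i.e. mask3/mask4, and by the same edge test as the rest of the filter) and extends the clipping threshold for the p0/q0 delta to tc = tc0 + ap + aq. A scalar sketch of just those additions (illustrative names; clip_u8 and abs as in the chroma sketch):

static void weak_luma_extras(unsigned char *pix, int xstride,
                             int beta, int tc0, int *tc_out)
{
    int p2 = pix[-3 * xstride], p1 = pix[-2 * xstride], p0 = pix[-xstride];
    int q0 = pix[0], q1 = pix[xstride], q2 = pix[2 * xstride];
    int avg01 = (p0 + q0 + 1) >> 1;        /* maddp0_q0 */
    int ap = abs(p2 - p0) < beta;          /* mask3 */
    int aq = abs(q2 - q0) < beta;          /* mask4 */

    if (ap) {                              /* p1 correction, clipped to +/-tc0 */
        int d = ((p2 + avg01) >> 1) - p1;
        d = d < -tc0 ? -tc0 : d > tc0 ? tc0 : d;
        pix[-2 * xstride] = (unsigned char)clip_u8(p1 + d);
    }
    if (aq) {                              /* q1 correction, clipped to +/-tc0 */
        int d = ((q2 + avg01) >> 1) - q1;
        d = d < -tc0 ? -tc0 : d > tc0 ? tc0 : d;
        pix[xstride] = (unsigned char)clip_u8(q1 + d);
    }
    *tc_out = tc0 + ap + aq;  /* p0/q0 then use a delta clipped to +/-tc */
}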
Example #15
__m64 test_mm_add_pi16(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_add_pi16
  // CHECK: call x86_mmx @llvm.x86.mmx.padd.w
  return _mm_add_pi16(a, b);
}