Example #1
///////////////////////////////////////////////transforms/////////////////////////////////////////////////////////////////////
void Haar::transrows(char** dest, char** sour, unsigned int w, unsigned int h) const
{
        unsigned int w2 = w / 2;

        __m64 m00FF;
        m00FF.m64_u64 = 0x00FF00FF00FF00FF;     // MSVC-specific __m64 field access: every 16-bit lane = 0x00FF

        for (unsigned int y = 0; y < h; y++) {

                __m64 *mlo = (__m64 *) & dest[y][0];
                __m64 *mhi = (__m64 *) & dest[y][w2];
                __m64 *msour = (__m64 *) & sour[y][0];

                for (unsigned int k = 0; k < w2 / 8; k++) {   // 16 source bytes -> 8 low/high output pairs per iteration

                        __m64 even = _mm_packs_pu16(_mm_and_si64(*msour, m00FF), _mm_and_si64(*(msour + 1), m00FF));       //even coeffs
                        __m64 odd = _mm_packs_pu16(_mm_srli_pi16(*msour, 8), _mm_srli_pi16(*(msour + 1), 8));              //odd coeffs

                        addsub(even, odd, mlo++, mhi++);
                        msour += 2;
                }

                if (w2 % 8) {   // scalar tail for the columns the 8-wide loop did not cover
                        for (unsigned int k = w2 - (w2 % 8); k < w2; k++) {
                                dest[y][k] = char(((int)sour[y][2*k] + (int)sour[y][2*k+1]) / 2);
                                dest[y][k+w2] = char(((int)sour[y][2*k] - (int)sour[y][2*k+1]) / 2);
                        }
                }
        }
        _mm_empty();
}
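
The vector loop above is just a packed version of the scalar tail: each pair of source bytes produces one average (written to the low half of the row) and one difference (written to the high half). A minimal scalar reference for a full row, assuming the addsub() helper computes the same (even + odd) / 2 and (even - odd) / 2 pairs as the tail loop:

// Scalar reference for one row of the Haar step above (hypothetical helper).
static void haar_row_ref(char* dest, const char* sour, unsigned int w)
{
        const unsigned int w2 = w / 2;
        for (unsigned int k = 0; k < w2; k++) {
                int a = sour[2 * k];
                int b = sour[2 * k + 1];
                dest[k]      = char((a + b) / 2);       // low-pass half
                dest[k + w2] = char((a - b) / 2);       // high-pass half
        }
}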
Example #2
void weak_horizontal_chroma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0)
{
	
	__m64 mp1 = _mm_set_pi16( 0,0,pix[-2*xstride + 1], pix[-2*xstride]); 
	__m64 mp0 = _mm_set_pi16( 0,0,pix[-1*xstride + 1], pix[-1*xstride]); 
	__m64 mq0 = _mm_set_pi16( 0,0,pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16( 0,0,pix[xstride + 1], pix[xstride]);
	

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); // q0 - p0; |q0 - p0| is tested via the two-sided compares below
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); // p0 - p1
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); // q0 - q1

	//To calculate the mask
 	__m64 malpha = _mm_set_pi16(0,0,alpha,alpha);
	__m64 malphab = _mm_set_pi16(0,0,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(0,0,beta,beta);
	__m64 mbetab = _mm_set_pi16(0,0,-beta,-beta);

	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2);


	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0) 
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 +  inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);



	//Clip3
	__m64 m_tc0 = _mm_set_pi16(0,0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(0,0,-tc0,-tc0);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcb0),m_tc0); // delta = CLIP3(-tc0, tc0, inter5)
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

	__m64 result_p0 = _mm_add_pi16(merror2,mp0); //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);//_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);
	__m64 mrshift = _mm_set_pi16(0,0,0,1);

	*((unsigned short* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned short* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));


	empty();
}
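
Note that the absolute-value tests from the deblocking conditions are done without an abs: each |x| < t becomes the lane-wise pair (t > x) && (x > -t), computed with two _mm_cmpgt_pi16 and an _mm_and_si64. A scalar sketch of the same test:

// Scalar equivalent of one mask lane above: true iff |diff| < t.
static inline int in_open_range(int diff, int t)
{
	return (t > diff) && (diff > -t);
}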
Example #3
void pix_multiply :: processRGBA_MMX(imageStruct &image, imageStruct &right)
{
    int datasize =   image.xsize * image.ysize * image.csize;
    __m64*leftPix =  (__m64*)image.data;
    __m64*rightPix = (__m64*)right.data;

    datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0); // round up to whole __m64 chunks

    __m64 l0, r0, l1, r1;
    __m64 null64 = _mm_setzero_si64();
    while(datasize--)    {
        l1=leftPix [datasize];
        r1=rightPix[datasize];

        l0=_mm_unpacklo_pi8(l1, null64);
        r0=_mm_unpacklo_pi8(r1, null64);
        l1=_mm_unpackhi_pi8(l1, null64);
        r1=_mm_unpackhi_pi8(r1, null64);

        l0=_mm_mullo_pi16  (l0, r0);
        l1=_mm_mullo_pi16  (l1, r1);

        l0=_mm_srli_pi16(l0, 8);
        l1=_mm_srli_pi16(l1, 8);

        leftPix[datasize]=_mm_packs_pu16(l0, l1);
    }
    _mm_empty();
}
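
Per channel this is a fixed-point multiply: widen each byte to 16 bits, multiply, shift right by 8 (dividing by 256 rather than the exact 255), and repack with unsigned saturation. A one-line scalar sketch of what each byte lane computes:

// Scalar per-byte equivalent of the widen/mullo/srli/pack sequence above.
static inline unsigned char mul8(unsigned char a, unsigned char b)
{
    return (unsigned char)((a * b) >> 8);   // /256 approximation of a*b/255
}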
Example #4
unsigned int interpolhline_mmx_2(unsigned char* image){

	__m64 mm_A = _mm_set_pi16(image[1],image[0],image[-1],image[-2]); 
	__m64 mm_B = _mm_set_pi16(image[2],image[1],image[0],image[-1]); 
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_E = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_F = _mm_set_pi16(image[6],image[5],image[4],image[3]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	
	__m64 result =_mm_packs_pu16(mm_clip1,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(result);

	 empty(); 
	return ret;

}
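
The arithmetic is the H.264 six-tap half-pel filter (A - 5B + 20C + 20D - 5E + F + 16) >> 5, with 20(C + D) - 5(B + E) factored as 5 * (((C + D) << 2) - (B + E)) to save multiplies. A minimal scalar sketch of the per-pixel computation, with the [0, 255] clamp written out:

// Scalar form of the six-tap filter computed above.
static inline unsigned char sixtap(int A, int B, int C, int D, int E, int F)
{
	int v = (A + F + 16 + 5 * (((C + D) << 2) - (B + E))) >> 5;
	if (v < 0) v = 0;
	if (v > 255) v = 255;
	return (unsigned char)v;
}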
Example #5
unsigned int interpolvline_mmx_3(unsigned char* image,	int PicWidthInPix){

	__m64 mm_A = _mm_set_pi16(image[-2 * PicWidthInPix + 3],image[-2 * PicWidthInPix + 2],image[-2 * PicWidthInPix + 1],image[-2 * PicWidthInPix]); 
	__m64 mm_B = _mm_set_pi16(image[-1 * PicWidthInPix + 3],image[-1 * PicWidthInPix + 2],image[-1 * PicWidthInPix + 1],image[-1 * PicWidthInPix]); 
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[1 * PicWidthInPix + 3],image[1 * PicWidthInPix + 2],image[1 * PicWidthInPix + 1],image[1 * PicWidthInPix]);
	__m64 mm_E = _mm_set_pi16(image[2 * PicWidthInPix + 3],image[2 * PicWidthInPix + 2],image[2 * PicWidthInPix + 1],image[2 * PicWidthInPix]);
	__m64 mm_F = _mm_set_pi16(image[3 * PicWidthInPix + 3],image[3 * PicWidthInPix + 2],image[3 * PicWidthInPix + 1],image[3 * PicWidthInPix]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	
	
	__m64 test =  _mm_avg_pu8(mm_clip1,mm_D);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	__m64 test1 =_mm_packs_pu16(test,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(test1);

	empty(); 
	return ret;

}
Example #6
void pix_subtract :: processYUV_MMX (imageStruct &image, imageStruct &right){
  int datasize =   image.xsize * image.ysize * image.csize;
  __m64*leftPix =  (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;

  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
  __m64 null64 = _mm_setzero_si64();
  __m64 offset = _mm_setr_pi16(0x80, 0x00, 0x80, 0x00);
  __m64 l0, l1, r0, r1;
  while (datasize--) {
    l1=leftPix[datasize];
    r1=rightPix[datasize];

    l0=_mm_unpacklo_pi8 (l1, null64);
    r0=_mm_unpacklo_pi8 (r1, null64);
    l1=_mm_unpackhi_pi8 (l1, null64);
    r1=_mm_unpackhi_pi8 (r1, null64);

    l0=_mm_adds_pu16(l0, offset);
    l1=_mm_adds_pu16(l1, offset);

    l0=_mm_subs_pu16(l0, r0);
    l1=_mm_subs_pu16(l1, r1);

    leftPix[datasize]=_mm_packs_pu16(l0, l1);
  }
  _mm_empty();
}
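
The offset vector adds 0x80 only to the even 16-bit lanes, which hold the chroma bytes in the packed UYVY-style layout implied by the mask pattern (an assumption; the luma lanes get 0). U and V are stored biased around 128, so the subtraction is re-centered instead of saturating to zero. Per lane the operation is roughly:

// Scalar sketch of one 16-bit lane (the final pack clamps to 255).
static inline unsigned short sub_lane(unsigned short l, unsigned short r, int is_chroma)
{
    int v = (is_chroma ? l + 0x80 : l) - r;    // _mm_adds_pu16 then _mm_subs_pu16
    return (unsigned short)(v < 0 ? 0 : v);    // unsigned saturation at 0
}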
Example #7
void weighted_merge_planar_mmx(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 zero = _mm_setzero_si64();
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);

  int wMod8 = (width/8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *(reinterpret_cast<const __m64*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m64 px2 = *(reinterpret_cast<const __m64*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      __m64 p0123 = _mm_unpacklo_pi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m64 p4567 = _mm_unpackhi_pi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m64 p01 = _mm_unpacklo_pi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m64 p23 = _mm_unpackhi_pi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m64 p45 = _mm_unpacklo_pi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m64 p67 = _mm_unpackhi_pi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      p01 = _mm_madd_pi16(p01, mask);
      p23 = _mm_madd_pi16(p23, mask);
      p45 = _mm_madd_pi16(p45, mask);
      p67 = _mm_madd_pi16(p67, mask);

      p01 = _mm_add_pi32(p01, round_mask);
      p23 = _mm_add_pi32(p23, round_mask);
      p45 = _mm_add_pi32(p45, round_mask);
      p67 = _mm_add_pi32(p67, round_mask);

      p01 = _mm_srli_pi32(p01, 15);
      p23 = _mm_srli_pi32(p23, 15);
      p45 = _mm_srli_pi32(p45, 15);
      p67 = _mm_srli_pi32(p67, 15);

      p0123 = _mm_packs_pi32(p01, p23);
      p4567 = _mm_packs_pi32(p45, p67);

      __m64 result = _mm_packs_pu16(p0123, p4567);

      *reinterpret_cast<__m64*>(p1+x) = result;
    }

    for (int x = wMod8; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
  _mm_empty();
}
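
Each _mm_madd_pi16 multiplies an interleaved (p1, p2) pair of 16-bit lanes by (invweight, weight) and sums the two products into one 32-bit lane, so the rounding constant 0x4000 and the shift by 15 implement the same 15-bit fixed-point blend as the scalar tail loop. One lane in scalar form:

// Scalar form of one madd/round/shift lane, identical to the tail loop above.
static inline BYTE blend15(BYTE a, BYTE b, int invweight, int weight)
{
  return (BYTE)((a * invweight + b * weight + 16384) >> 15);
}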
Example #8
/* do the processing for all colourspaces */
void pix_motionblur :: processMMX(imageStruct &image)
{
  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  int pixsize=image.ysize*image.xsize*image.csize;
  pixsize=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*pixels=(__m64*)image.data;
  __m64*old=(__m64*)m_savedImage.data;

  __m64 newGain = _mm_set1_pi16(static_cast<short>(m_blur0));
  __m64 oldGain = _mm_set1_pi16(static_cast<short>(m_blur1));
  __m64 null64 = _mm_setzero_si64();

  __m64 newpix1, newpix2, oldpix1, oldpix2;

  while(pixsize--) {
    newpix1=pixels[pixsize];
    oldpix1=old[pixsize];

    newpix2 = _mm_unpackhi_pi8(newpix1, null64);
    newpix1 = _mm_unpacklo_pi8(newpix1, null64);
    oldpix2 = _mm_unpackhi_pi8(oldpix1, null64);
    oldpix1 = _mm_unpacklo_pi8(oldpix1, null64);

    newpix1 = _mm_mullo_pi16(newpix1, newGain);
    newpix2 = _mm_mullo_pi16(newpix2, newGain);
    oldpix1 = _mm_mullo_pi16(oldpix1, oldGain);
    oldpix2 = _mm_mullo_pi16(oldpix2, oldGain);

    newpix1 = _mm_adds_pu16 (newpix1, oldpix1);
    newpix2 = _mm_adds_pu16 (newpix2, oldpix2);

    newpix1 = _mm_srli_pi16(newpix1, 8);
    newpix2 = _mm_srli_pi16(newpix2, 8);
    newpix1 = _mm_packs_pu16(newpix1, newpix2);
    pixels[pixsize]=newpix1;
    old   [pixsize]=newpix1;
  }
  _mm_empty();
}
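
The blend uses 8.8 fixed-point gains: each channel is widened to 16 bits, scaled with _mm_mullo_pi16, combined with a saturating add, and shifted back down by 8. A faithful scalar sketch of one channel (note the low-16-bit multiply and the 0xFFFF saturation, matching the intrinsics exactly):

// Scalar sketch of one 8-bit channel of the motion-blur mix above.
static inline unsigned char blur_mix(unsigned char cur, unsigned char prev,
                                     unsigned short gain_new, unsigned short gain_old)
{
  unsigned int a = ((unsigned int)cur  * gain_new) & 0xFFFF;   // _mm_mullo_pi16
  unsigned int b = ((unsigned int)prev * gain_old) & 0xFFFF;
  unsigned int v = a + b;
  if (v > 0xFFFF) v = 0xFFFF;                                  // _mm_adds_pu16
  return (unsigned char)(v >> 8);                              // _mm_srli_pi16(.., 8)
}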
Example #9
/* *********************************************************** */
mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= 2;
	height -= 2;
	dl += dll + 1;

	wid4 = (width + 7) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #10
void pix_multiply :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
    int datasize =   image.xsize * image.ysize * image.csize;
    __m64*leftPix =  (__m64*)image.data;
    __m64*rightPix = (__m64*)right.data;

    datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);

    __m64 l0, r0, l1, r1;
    __m64 mask= _mm_setr_pi8((unsigned char)0xFF,
                             (unsigned char)0x00,
                             (unsigned char)0xFF,
                             (unsigned char)0x00,
                             (unsigned char)0xFF,
                             (unsigned char)0x00,
                             (unsigned char)0xFF,
                             (unsigned char)0x00);
    __m64 yuvclamp0 = _mm_setr_pi8((unsigned char)0x00,
                                   (unsigned char)0x10,
                                   (unsigned char)0x00,
                                   (unsigned char)0x10,
                                   (unsigned char)0x00,
                                   (unsigned char)0x10,
                                   (unsigned char)0x00,
                                   (unsigned char)0x10);
    __m64 yuvclamp1 = _mm_setr_pi8((unsigned char)0x00,
                                   (unsigned char)0x24,
                                   (unsigned char)0x00,
                                   (unsigned char)0x24,
                                   (unsigned char)0x00,
                                   (unsigned char)0x24,
                                   (unsigned char)0x00,
                                   (unsigned char)0x24);
    __m64 yuvclamp2 = _mm_setr_pi8((unsigned char)0x00,
                                   (unsigned char)0x14,
                                   (unsigned char)0x00,
                                   (unsigned char)0x14,
                                   (unsigned char)0x00,
                                   (unsigned char)0x14,
                                   (unsigned char)0x00,
                                   (unsigned char)0x14);

    __m64 null64 = _mm_setzero_si64();
    while(datasize--)    {
        r1=rightPix[datasize];
        l1=leftPix [datasize];

        r1=_mm_or_si64(r1, mask);

        l0=_mm_unpacklo_pi8(l1, null64);
        r0=_mm_unpacklo_pi8(r1, null64);
        l1=_mm_unpackhi_pi8(l1, null64);
        r1=_mm_unpackhi_pi8(r1, null64);

        l0=_mm_mullo_pi16  (l0, r0);
        l1=_mm_mullo_pi16  (l1, r1);

        l0=_mm_srli_pi16(l0, 8);
        l1=_mm_srli_pi16(l1, 8);

        l0=_mm_packs_pu16(l0, l1);

        l0=_mm_subs_pu8(l0, yuvclamp0);
        l0=_mm_adds_pu8(l0, yuvclamp1);
        l0=_mm_subs_pu8(l0, yuvclamp2);


        leftPix[datasize]=l0;
    }
    _mm_empty();
}
Example #11
//mbl test with Pocket_PC		
void weak_horizontal_luma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0){

	__m64 mp2 = _mm_set_pi16(pix[-3*xstride + 3], pix[-3*xstride + 2], pix[-3*xstride + 1], pix[-3*xstride]);
	__m64 mp1 = _mm_set_pi16(pix[-2*xstride + 3], pix[-2*xstride + 2], pix[-2*xstride + 1], pix[-2*xstride]); 
	__m64 mp0 = _mm_set_pi16(pix[-1*xstride + 3], pix[-1*xstride + 2], pix[-1*xstride + 1], pix[-1*xstride]); 
	__m64 mq0 = _mm_set_pi16(pix[3], pix[2], pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16(pix[xstride + 3], pix[xstride + 2], pix[xstride + 1], pix[xstride]);
	__m64 mq2 = _mm_set_pi16(pix[2*xstride + 3], pix[2*xstride + 2], pix[2*xstride + 1], pix[2*xstride]);
	
	__m64 mrshift = _mm_set_pi16(0,0,0,1);
	__m64 maddp0_q0 =  _mm_avg_pu8(mp0,mq0); //addp0_q0 = (p0 + q0 + 1) >> 1;
	__m64 maddp2 = _mm_add_pi16(maddp0_q0,mp2); //addp2 = (p2 + addp0_q0);
	__m64 maddq2 = _mm_add_pi16(maddp0_q0,mq2); //addq2 = (q2 + addp0_q0);
	__m64 maddp2_s = _mm_srl_pi16(maddp2,mrshift); //addp2 = (p2 + addp0_q0) >> 1;
	__m64 maddq2_s = _mm_srl_pi16(maddq2,mrshift); //addq2 = (q2 + addp0_q0) >> 1;
	__m64 mp1_c = _mm_sub_pi16(maddp2_s, mp1); //addp2 - p1
	__m64 mq1_c = _mm_sub_pi16(maddq2_s, mq1); // addq2 - q1



	//To calculate the mask
 	__m64 malpha = _mm_set_pi16(alpha,alpha,alpha,alpha);
	__m64 malphab = _mm_set_pi16(-alpha,-alpha,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(beta,beta,beta,beta);
	__m64 mbetab = _mm_set_pi16(-beta,-beta,-beta,-beta);
	

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); // q0 - p0; magnitudes are tested via the two-sided compares below
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); // p0 - p1
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); // q0 - q1
	__m64 mdiff_p2_p0 = _mm_sub_pi16(mp2,mp0); // p2 - p0
	__m64 mdiff_q2_q0 = _mm_sub_pi16(mq2,mq0); // q2 - q0

 	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
	__m64 mask3 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p2_p0), _mm_cmpgt_pi16(mdiff_p2_p0,mbetab));
	__m64 mask4 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q2_q0), _mm_cmpgt_pi16(mdiff_q2_q0,mbetab));
	
	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2); //(abs(q0 - p0) < alpha) && (abs(p1 - p0) < beta) && (abs(q1 - q0) < beta)
	__m64 second_mask = _mm_and_si64 (first_mask,mask3);
	__m64 third_mask = _mm_and_si64 (first_mask,mask4);


	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0) 
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 +  inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);



	//Clip3
	__m64 m_tc0 = _mm_set_pi16(tc0,tc0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(-tc0,-tc0,-tc0,-tc0);
	__m64 mres_c1 = _mm_min_pi16(_mm_max_pi16(mp1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addp2 - p1 );
	__m64 mres_c2 = _mm_min_pi16(_mm_max_pi16(mq1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addq2 - q1 );
	__m64 merror0 = _mm_and_si64 (mres_c1,second_mask);
	__m64 merror1 = _mm_and_si64 (mres_c2,third_mask);


	__m64 m_1 = _mm_set_pi16(1,1,1,1);
	__m64 m_and1 = _mm_and_si64 (mask3, m_1); //tc++; if abs( p2 - p0 ) < beta 
	__m64 m_and2 = _mm_and_si64 (mask4, m_1); //tc++; if abs( q2 - q0  ) < beta 
	__m64 m_tc = _mm_add_pi16(m_and2,_mm_add_pi16(m_tc0,m_and1));
	__m64 m_tcn =_mm_sub_pi16(_mm_sub_pi16(m_tcb0,m_and1),m_and2);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcn),m_tc); // delta = CLIP3(-tc, tc, inter5)
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);


	__m64 result_p1 = _mm_add_pi16(merror0,mp1); //_mm_shuffle_pi16(_mm_add_pi16(merror0,mp1), 0x1B);
	__m64 result_q1 = _mm_add_pi16(merror1,mq1); //_mm_shuffle_pi16(_mm_add_pi16(merror1,mq1), 0x1B);
	__m64 result_p0 = _mm_add_pi16(merror2,mp0); //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);//_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);

	*((unsigned int* )(&pix[-2*xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p1,mrshift));
	*((unsigned int* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned int* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));
	*((unsigned int* )(&pix[xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q1,mrshift));

	empty();  
}
Example #12
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB16.
VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv, r, g, b;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 2);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0

			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);

			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0 b6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);

			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);

			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);

			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);

			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);

			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb

			// Write out row 0.
			((PM64) pbDstX)[0] = b0;
			((PM64) pbDstX)[1] = b1;

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);

			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0 b6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);

			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);

			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);

			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);

			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);

			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb

			// Write out row 1.
			((PM64) (pbDstX + iDstXStride))[0] = b0;
			((PM64) (pbDstX + iDstXStride))[1] = b1;

			pbDstX += 16;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b);

			pbDstX += 4;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
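
After the per-channel shifts the three components occupy disjoint bit ranges (r in bits 11-15, g in 5-10, b in 0-4), so two ORs assemble a finished RGB565 pixel. The scalar tail relies on a _MakeRgb16 macro from the same codebase; a plausible equivalent, assuming 8-bit inputs:

// Presumed scalar equivalent of the 5-6-5 packing (the real _MakeRgb16
// macro may differ in argument order or clamping).
static inline unsigned short make_rgb16(int r, int g, int b)
{
	return (unsigned short)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}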
Example #13
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 0.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 1.
			*((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[1]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
Example #14
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);

	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);

	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);
	
#if 1
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif

	// Required to reset FP:
	_mm_empty();
}
Example #15
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind = 0;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];

            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);

            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] =
                _mm_or_si64(_mm_and_si64(mask, res_hi),
                            _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
Example #16
__m64 test77(__m64 a, __m64 b) {
  // CHECK: packuswb
  return _mm_packs_pu16(a, b);
}
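
The intrinsic under test narrows two __m64 vectors of signed 16-bit values into eight unsigned bytes with saturation: anything below 0 becomes 0, anything above 255 becomes 255 (hence the packuswb instruction checked for). A standalone usage sketch:

#include <mmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  __m64 lo = _mm_set_pi16(300, 256, 255, -5); /* lanes, low to high: -5, 255, 256, 300 */
  __m64 hi = _mm_set_pi16(128, 64, 1, 0);
  __m64 packed = _mm_packs_pu16(lo, hi);      /* compiles to packuswb */
  unsigned char out[8];
  memcpy(out, &packed, 8);
  _mm_empty();                                /* leave MMX state, as the examples above do */
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]);                    /* prints: 0 255 255 255 0 1 64 128 */
  printf("\n");
  return 0;
}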
Example #17
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;
	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 ker[5][5];
	__m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
	__m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	width -= (KSIZE - 1);
	height -= (KSIZE - 1);
	width *= NCHAN;
	dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d4, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];

			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);
		s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		UNPACK_SRC(d4, hi);

		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);

			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
Example #18
__m64 test_mm_packs_pu16(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_packs_pu16
  // CHECK: call x86_mmx @llvm.x86.mmx.packuswb
  return _mm_packs_pu16(a, b);
}
Example #19
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
	__m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
	const uint8_t *orig_input = input;
	uint8_t *y_comp = output;
	uint8_t *u_comp = output + width * height;
	uint8_t *v_comp = u_comp + (int)((width * height)/2);	// 4:2:2
	int i, j;

	// When preparing video for PAL DV50 encoding, the video must be shifted
	// down by one line to change the field order to be bottom-field-first
	int start_line = 0;
	if (shift_picture_down) {
		memset(y_comp, 0x10, width);		// write one line of black Y
		y_comp += width;
		memset(u_comp, 0x80, width/2);		// write one line of black U,V
		u_comp += width/2;
		memset(v_comp, 0x80, width/2);		// write one line of black U,V
		v_comp += width/2;
		start_line = 1;
	}

	/* Do the y component */
	for (j = start_line; j < height; j++)
	{
		// Consume 16 bytes of UYVY data per iteration (8 pixels worth)
		for (i = 0; i < width*2; i += 16)
		{
			//__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
			//__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
			//__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
			//*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);
			// Shift each UYVY quad right by 8 bits so the Y bytes land in
			// the even positions, then mask to leave one Y per 16-bit lane
			__m64 m0 = _mm_and_si64 (_mm_srli_si64(*(__m64 *)input, 8), luma_mask);
			__m64 m1 = _mm_and_si64 (_mm_srli_si64(*(__m64 *)(input+8), 8), luma_mask);
			*(__m64 *)y_comp = _mm_packs_pu16 (m0, m1);

			y_comp += 8;
			input += 16;
		}
	}
	/* Do the chroma components */
	input = orig_input;
	for (j = start_line; j < height; j++)
	{
		/* Process every line for yuv 4:2:2 */
		for (i = 0; i < width*2; i += 16)
		{
			// Two unpack passes gather the four U samples into the low
			// 32 bits of m3 and the four V samples into the low 32 bits of m4
			__m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
			__m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));

			__m64 m3 = _mm_unpacklo_pi8 (m1, m2);
			__m64 m4 = _mm_unpackhi_pi8 (m1, m2);
			//*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
			//*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
			memcpy (u_comp, &m3, 4);	// copy just the 4 chroma samples from the low halves
			memcpy (v_comp, &m4, 4);
			u_comp += 4;
			v_comp += 4;
			input += 16;
		}
	}
	_mm_empty();        // Clear aliased fp register state
}
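
For reference, the same UYVY to planar 4:2:2 split in plain C, assuming the U0 Y0 V0 Y1 byte order implied by the masks above and no shift_picture_down line:

// Plain-C reference for the SIMD split above (hypothetical helper).
static void uyvy_to_yuv422_ref(int width, int height, const uint8_t *in,
                               uint8_t *y, uint8_t *u, uint8_t *v)
{
	int n;
	for (n = 0; n < width * height / 2; n++) {
		*u++ = in[0];
		*y++ = in[1];
		*v++ = in[2];
		*y++ = in[3];
		in += 4;
	}
}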
Example #20
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to RGB24.
VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcXDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcXDif = iSrcXStride - (uWidth * 2);

	mz = _mm_setzero_si64();
	for (y = iHeight; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			y0 = ((PM64) pbSrcX)[0];
			y1 = ((PM64) pbSrcX)[1];

			u0 = y0;
			v0 = y1;

			y0 = _mm_and_si64(y0, g_mWord00FF);
			y1 = _mm_and_si64(y1, g_mWord00FF);

			u0 = _mm_srli_pi16(u0, 8);
			v0 = _mm_srli_pi16(v0, 8);

			uv_temp1 = _mm_srli_pi32(u0, 16);
			u0 = _mm_slli_pi32(u0, 16);
			u0 = _mm_srli_pi32(u0, 16);

			uv_temp2 = _mm_srli_pi32(v0, 16);
			v0 = _mm_slli_pi32(v0, 16);
			v0 = _mm_srli_pi32(v0, 16);

			u0 = _mm_packs_pi32(u0, v0);
			v0 = _mm_packs_pi32(uv_temp1, uv_temp2);
			// Calculate coefficient.
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 0.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcX += 16;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcX[1]];
			guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]];
			rv = g_iRVTab[pbSrcX[3]];

			yy = g_iYYTab[pbSrcX[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcX[2]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcX += 4;
		}

		pbDstX += iDstXDif;
		pbSrcX += iSrcXDif;
	}

	_mm_empty();
}
Example #21
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to YV12.
VOID Yuy2ToYv12_mmx(PBYTE pbDstY, PBYTE pbDstU, PBYTE pbDstV, INT iDstYStride, INT iDstUvStride,
					PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iSrcXDif;
	INT iDstYDif;
	INT iDstUvDif;
	M64 m0, m1, m2, m3, m4, m5, m6, m7;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}

	iSrcXDif = iSrcXStride - (uWidth * 2);
	iDstYDif = iDstYStride - uWidth;
	iDstUvDif = iDstUvStride - (uWidth / 2);

	m7 = g_mWord00FF;
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			m0 = ((PM64) pbSrcX)[0];
			m1 = ((PM64) pbSrcX)[1];
			m2 = ((PM64) (pbSrcX + iSrcXStride))[0];
			m3 = ((PM64) (pbSrcX + iSrcXStride))[1];

			m4 = m0;
			m5 = m2;

			m4 = _mm_srli_pi16(m4, 8);
			m5 = _mm_srli_pi16(m5, 8);

			m4 = _mm_and_si64(m4, m7);
			m5 = _mm_and_si64(m5, m7);

			m4 = _mm_add_pi16(m4, m5);

			m5 = m1;
			m6 = m3;

			m5 = _mm_srli_pi16(m5, 8);
			m6 = _mm_srli_pi16(m6, 8);
			m5 = _mm_and_si64(m5, m7);
			m6 = _mm_and_si64(m6, m7);

			m5 = _mm_add_pi16(m5, m6);

			m4 = _mm_add_pi16(m4, g_mWord0001);
			m5 = _mm_add_pi16(m5, g_mWord0001);

			m4 = _mm_srli_pi16(m4, 1);
			m5 = _mm_srli_pi16(m5, 1);

			m0 = _mm_and_si64(m0, m7);
			m1 = _mm_and_si64(m1, m7);
			m2 = _mm_and_si64(m2, m7);
			m3 = _mm_and_si64(m3, m7);

			m0 = _mm_packs_pu16(m0, m1);
			m2 = _mm_packs_pu16(m2, m3);

			((PM64) pbDstY)[0] = m0;
			((PM64) (pbDstY + iDstYStride))[0] = m2;

			m4 = _mm_packs_pu16(m4, m5);
			m5 = m4;

			m4 = _mm_srli_si64(m4, 8);

			m5 = _mm_and_si64(m5, m7);
			m4 = _mm_and_si64(m4, m7);

			m5 = _mm_packs_pu16(m5, m5);
			m4 = _mm_packs_pu16(m4, m4);

			((PDWORD) pbDstU)[0] = _mm_cvtsi64_si32(m5);
			((PDWORD) pbDstV)[0] = _mm_cvtsi64_si32(m4);

			pbSrcX += 16;
			pbDstY += 8;
			pbDstU += 4;
			pbDstV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			pbDstY[0] = pbSrcX[0];
			pbDstU[0] = (pbSrcX[1] + pbSrcX[iSrcXStride + 1] + 1) / 2;
			pbDstY[1] = pbSrcX[2];
			pbDstV[0] = (pbSrcX[3] + pbSrcX[iSrcXStride + 3] + 1) / 2;

			pbDstY[iDstYStride + 0] = pbSrcX[iSrcXStride + 0];
			pbDstY[iDstYStride + 1] = pbSrcX[iSrcXStride + 2];

			pbSrcX += 4;
			pbDstY += 2;
			pbDstU++;
			pbDstV++;
		}

		pbSrcX += iSrcXDif + iSrcXStride;
		pbDstY += iDstYDif + iDstYStride;
		pbDstU += iDstUvDif;
		pbDstV += iDstUvDif;
	}

	_mm_empty();
}
Example #22
/*  Fill a surface with a gradient which is generated by bilinearly 
    interpolating between four corner color values.  Can take
    a source surface and multiply it into the gradient, but if 
    'src' is NULL, it will generate the gradient without multiplying  */
static void fillBlend(
    SDL_Surface *dst, SDL_Surface *src, BROGUE_DRAW_COLOR *color)
{
    int x, y;
    int lr, lg, lb, rr, rg, rb;
    int ldr, ldg, ldb, rdr, rdg, rdb;
    int w, h;
    BROGUE_DRAW_COLOR ul = color[0];
    BROGUE_DRAW_COLOR ur = color[1];
    BROGUE_DRAW_COLOR bl = color[2];
    BROGUE_DRAW_COLOR br = color[3];
#if defined(__MMX__)
    int mmx = SDL_HasMMX();
#endif
    
    w = dst->w;
    h = dst->h;

    if (src != NULL)
    {
	assert(dst->w == src->w);
	assert(dst->h == src->h);
    }

    lr = clamp(ul.red * 0xFFFF, 0, 0xFFFF);
    lg = clamp(ul.green * 0xFFFF, 0, 0xFFFF);
    lb = clamp(ul.blue * 0xFFFF, 0, 0xFFFF);

    rr = clamp(ur.red * 0xFFFF, 0, 0xFFFF);
    rg = clamp(ur.green * 0xFFFF, 0, 0xFFFF);
    rb = clamp(ur.blue * 0xFFFF, 0, 0xFFFF);

    ldr = (clamp(bl.red * 0xFFFF, 0, 0xFFFF) - lr) / h;
    ldg = (clamp(bl.green * 0xFFFF, 0, 0xFFFF) - lg) / h;
    ldb = (clamp(bl.blue * 0xFFFF, 0, 0xFFFF) - lb) / h;

    rdr = (clamp(br.red * 0xFFFF, 0, 0xFFFF) - rr) / h;
    rdg = (clamp(br.green * 0xFFFF, 0, 0xFFFF) - rg) / h;
    rdb = (clamp(br.blue * 0xFFFF, 0, 0xFFFF) - rb) / h;

    for (y = 0; y < h; y++)
    {
	unsigned char *pix;
	int dr, dg, db;
	int rpp, gpp, bpp, raccum, gaccum, baccum;

	pix = (unsigned char *)dst->pixels + dst->pitch * y;

	dr = rr - lr;
	dg = rg - lg;
	db = rb - lb;

	rpp = dr / w;
	gpp = dg / w;
	bpp = db / w;

	raccum = lr;
	gaccum = lg;
	baccum = lb;

	lr += ldr;
	lg += ldg;
	lb += ldb;

	rr += rdr;
	rg += rdg;
	rb += rdb;

	if (src != NULL)
	{
	    unsigned char *src_pix = (unsigned char *)src->pixels 
		+ src->pitch * y;
	    x = w;

#if defined(__MMX__)
	    /*  MMX is significantly faster.  Use it if the CPU supports it  */
	    if (mmx)
	    {
		__m64 mmx_zero = _m_from_int(0);

		long long ll_color = 
		    ((long long)0xFFFF << 48)
		    | ((long long)raccum << 32)
		    | ((long long)gaccum << 16)
		    | ((long long)baccum);
		__m64 mmx_color = *(__m64 *)&ll_color;

		long long ll_pp = 
		    ((long long)(rpp & 0xFFFF) << 32)
		    | ((long long)(gpp & 0xFFFF) << 16)
		    | ((long long)(bpp & 0xFFFF));
		__m64 mmx_pp = *(__m64 *)&ll_pp;

		while (x >= 2)
		{
		    __m64 src_pair = *(__m64 *)src_pix;
		
		    /*  Separate the left pixel and right pixel  */
		    __m64 left_pix = _mm_unpacklo_pi8(src_pair, mmx_zero);
		    __m64 right_pix = _mm_unpackhi_pi8(src_pair, mmx_zero);
		
		    /*  Multiply the left source by the gradient color  */
		    left_pix = _mm_mullo_pi16(left_pix, 
					      _mm_srli_pi16(mmx_color, 8));
		    /*  Advance the gradient color for the next pixel */
		    mmx_color = _mm_add_pi16(mmx_color, mmx_pp);

		    /*  Multiply the right source by the gradient color  */
		    right_pix = _mm_mullo_pi16(right_pix,
					       _mm_srli_pi16(mmx_color, 8));
		    /*  Advance the gradient  */
		    mmx_color = _mm_add_pi16(mmx_color, mmx_pp); 

		    /*  Recombine the pixels  */
		    __m64 result_pix = _mm_packs_pu16(
			_mm_srli_pi16(left_pix, 8),
			_mm_srli_pi16(right_pix, 8));
		    
		    *(__m64 *)pix = result_pix;

		    src_pix += 8;
		    pix += 8;
		    x -= 2;
		}

		/*  Extract the accumulated gradient value for the potential
		    odd remaining pixel  */
		short *s_color = (short *)&mmx_color;
		raccum = s_color[2];
		gaccum = s_color[1];
		baccum = s_color[0];
	    }
#endif

	    /*  The equivalent slow loop for odd pixels or CPUs without MMX  */
	    while (x > 0)
	    {
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
		pix[3] = src_pix[3];
		pix[2] = (src_pix[2] * raccum) >> 16;
		pix[1] = (src_pix[1] * gaccum) >> 16;
		pix[0] = (src_pix[0] * baccum) >> 16;
#else
		pix[0] = src_pix[0];
		pix[1] = (src_pix[1] * raccum) >> 16;
		pix[2] = (src_pix[2] * gaccum) >> 16;
		pix[3] = (src_pix[3] * baccum) >> 16;
#endif

		raccum += rpp;
		gaccum += gpp;
		baccum += bpp;

		src_pix += 4;
		pix += 4;
		x--;
	    }
	}
	else
	{