static void replace_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height)
{
  int mod8_width = width / 8 * 8;
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < mod8_width; x += 8) {
      __m64 s = *reinterpret_cast<const __m64*>(src+x);
      __m64 l = *reinterpret_cast<const __m64*>(luma+x);
      __m64 s_chroma = _mm_and_si64(s, chroma_mask);
      __m64 l_luma = _mm_and_si64(l, luma_mask);
      __m64 result = _mm_or_si64(s_chroma, l_luma);
      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = mod8_width; x < width; x += 2) {
      src[x] = luma[x];
    }
    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
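// A scalar reference can make the packed kernel above easier to follow. This is a
// minimal sketch, assuming the usual YUY2 layout (even bytes = luma, odd bytes = chroma);
// it is not taken from the original source.
static void replace_luma_yuy2_c(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height)
{
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x += 2) {
      src[x] = luma[x]; // replace the luma sample; src[x+1] (chroma) is left untouched
    }
    src += pitch;
    luma += luma_pitch;
  }
}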
///////////////////////////////////////////////transforms/////////////////////////////////////////////////////////////////////
void Haar::transrows(char** dest, char** sour, unsigned int w, unsigned int h) const
{
  unsigned int w2 = w / 2;
  __m64 m00FF;
  m00FF.m64_u64 = 0x00FF00FF00FF00FF;
  for (unsigned int y = 0; y < h; y++) {
    __m64 *mlo = (__m64 *) & dest[y][0];
    __m64 *mhi = (__m64 *) & dest[y][w2];
    __m64 *msour = (__m64 *) & sour[y][0];
    for (unsigned int k = 0; k < w2 / 8; k++) { //k<w2/8 k=8*k
      __m64 even = _mm_packs_pu16(_mm_and_si64(*msour, m00FF), _mm_and_si64(*(msour + 1), m00FF)); //even coeffs
      __m64 odd = _mm_packs_pu16(_mm_srli_pi16(*msour, 8), _mm_srli_pi16(*(msour + 1), 8));        //odd coeffs
      addsub(even, odd, mlo++, mhi++);
      msour += 2;
    }
    if (w2 % 8) {
      for (unsigned int k = w2 - (w2 % 8); k < w2; k++) {
        dest[y][k] = char(((int)sour[y][2*k] + (int)sour[y][2*k+1]) / 2);
        dest[y][k+w2] = char(((int)sour[y][2*k] - (int)sour[y][2*k+1]) / 2);
      }
    }
  }
  _mm_empty();
}
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height, int weight, int invweight)
{
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  int wMod8 = (width/8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *reinterpret_cast<const __m64*>(src+x);  //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m64 px2 = *reinterpret_cast<const __m64*>(luma+x); //v1 y3 u1 y2 v0 y1 u0 y0

      __m64 src_lo = _mm_unpacklo_pi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m64 src_hi = _mm_unpackhi_pi16(px1, px2);

      src_lo = _mm_and_si64(src_lo, luma_mask); //00 y1 00 Y1 00 y0 00 Y0
      src_hi = _mm_and_si64(src_hi, luma_mask);

      src_lo = _mm_madd_pi16(src_lo, mask);
      src_hi = _mm_madd_pi16(src_hi, mask);

      src_lo = _mm_add_pi32(src_lo, round_mask);
      src_hi = _mm_add_pi32(src_hi, round_mask);

      src_lo = _mm_srli_pi32(src_lo, 15);
      src_hi = _mm_srli_pi32(src_hi, 15);

      __m64 result_luma = _mm_packs_pi32(src_lo, src_hi);

      __m64 result_chroma = _mm_and_si64(px1, chroma_mask);
      __m64 result = _mm_or_si64(result_chroma, result_luma);

      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = wMod8; x < width; x += 2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
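// The routine above blends only the luma bytes with 15-bit fixed-point weights; the scalar
// tail shows the exact formula. A hypothetical caller-side sketch of how such weights are
// usually derived (an assumption for illustration, not part of the original source):
static void merge_luma_with_strength(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height, double strength)
{
  int weight = (int)(strength * 32768.0 + 0.5); // 0.0..1.0 -> 0..32768
  int invweight = 32768 - weight;               // weight + invweight == 32768, matching (+16384) >> 15
  weighted_merge_luma_yuy2_mmx(src, luma, pitch, luma_pitch, width, height, weight, invweight);
}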
mlib_status mlib_m_sconv3x3_16nw_1( mlib_image *dst, mlib_image *src, mlib_s32 *hkernel, mlib_s32 *vkernel, mlib_s32 scalef_expon) { GET_SRC_DST_PARAMETERS(mlib_s16); __m64 hker0, hker1, hker2, vker0, vker1, vker2; __m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl; __m64 *sp0, *sp1, *sp2, *dp; __m64 zero, _rnd; mlib_s32 shift, kerh_sum; mlib_s32 i, j; width -= 2; height -= 2; width *= NCHAN; dl += dll + NCHAN; GET_KERN(); zero = _mm_setzero_si64(); for (j = 0; j < height; j++) { sp0 = (__m64 *) sl; sp1 = (__m64 *) (sl + sll); sp2 = (__m64 *) (sl + 2 * sll); dp = (__m64 *) dl; PREP_V(); for (i = 0; i < width / 4; i++) { CONV_3x3(); dp[i] = rr; } if (width & 3) { __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)]; CONV_3x3(); dp[i] = _mm_or_si64(_mm_and_si64(mask, rr), _mm_andnot_si64(mask, dp[i])); } sl += sll; dl += dll; } _mm_empty(); return (MLIB_SUCCESS); }
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  long datasize = image.xsize * image.ysize * image.csize;
  datasize = datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
  __m64*leftPix = (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;

  __m64 l, r, b;
  __m64 mask = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF);
  __m64 zeros = _mm_set1_pi8((unsigned char)0x00);

  //format is U Y V Y
  if (m_direction) {
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];
      b=_mm_subs_pu8(l, r);
      b=_mm_and_si64(b, mask);
      b=_mm_cmpeq_pi32(b, zeros);
      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);
      leftPix[datasize]=_mm_or_si64(l, r);
    }
  } else {
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];
      b=_mm_subs_pu8(r, l);
      b=_mm_and_si64(b, mask);
      b=_mm_cmpeq_pi32(b, zeros);
      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);
      leftPix[datasize]=_mm_or_si64(l, r);
    }
  }
  _mm_empty();
}
__m64 unsigned_add3 (const __m64 * a, const __m64 * b, __m64 * result, unsigned int count) { __m64 _a, _b, one, sum, carry, onesCarry; unsigned int i; carry = _mm_setzero_si64 (); one = _mm_cmpeq_pi8 (carry, carry); one = _mm_sub_si64 (carry, one); for (i = 0; i < count; i++) { _a = a[i]; _b = b[i]; sum = _mm_add_si64 (_a, _b); sum = _mm_add_si64 (sum, carry); result[i] = sum; onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry); onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry); onesCarry = _mm_and_si64 (onesCarry, one); _a = _mm_srli_si64 (_a, 1); _b = _mm_srli_si64 (_b, 1); carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry); carry = _mm_srli_si64 (carry, 63); } return carry; }
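// unsigned_add3 treats each __m64 element as one 64-bit limb and carries from one limb to
// the next, emulating a wide unsigned addition without a 64-bit add-with-carry. A hypothetical
// usage sketch (illustration only): add two 128-bit numbers stored least-significant limb first.
static void add_128_demo(void)
{
  __m64 a[2], b[2], r[2], carry;
  a[0] = _mm_set_pi32(-1, -1);       /* low limb  = 0xFFFFFFFFFFFFFFFF */
  a[1] = _mm_set_pi32(0, 0);         /* high limb = 0 */
  b[0] = _mm_set_pi32(0, 1);         /* low limb  = 1 */
  b[1] = _mm_set_pi32(0, 0);         /* high limb = 0 */
  carry = unsigned_add3(a, b, r, 2); /* r becomes {0, 1}, i.e. 2^64; carry out is 0 */
  _mm_empty();                       /* leave the MMX state clean for FP code */
  (void)carry;
}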
void weak_horizontal_chroma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0)
{
  __m64 mp1 = _mm_set_pi16( 0,0,pix[-2*xstride + 1], pix[-2*xstride]);
  __m64 mp0 = _mm_set_pi16( 0,0,pix[-1*xstride + 1], pix[-1*xstride]);
  __m64 mq0 = _mm_set_pi16( 0,0,pix[1], pix[0]);
  __m64 mq1 = _mm_set_pi16( 0,0,pix[xstride + 1], pix[xstride]);

  __m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0);  //abs(q0 - p0)
  __m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1);  //abs(p1 - p0)
  __m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //abs(q1 - q0)

  //To calculate the mask
  __m64 malpha = _mm_set_pi16(0,0,alpha,alpha);
  __m64 malphab = _mm_set_pi16(0,0,-alpha,-alpha);
  __m64 mbeta = _mm_set_pi16(0,0,beta,beta);
  __m64 mbetab = _mm_set_pi16(0,0,-beta,-beta);

  __m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
  __m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
  __m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));

  __m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2);

  __m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0);            //(q0 - p0)
  __m64 mlshift = _mm_set_pi16(0,0,0,2);
  __m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);  //inter_1 = (q0 - p0) << 2;
  __m64 minter_2 = _mm_sub_pi16(mp1, mq1);              //(p1 - q1)
  __m64 madd4 = _mm_set_pi16(4,4,4,4);
  __m64 minter_3 = _mm_add_pi16(minter_2, madd4);       //inter_2 = (p1 - q1) + 4;
  __m64 minter_4 = _mm_add_pi16(minter_3,minter_1);     //(inter_1 + inter_2)
  __m64 mrshift3 = _mm_set_pi16(0,0,0,3);
  __m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);

  //Clip3
  __m64 m_tc0 = _mm_set_pi16(0,0,tc0,tc0);
  __m64 m_tcb0 = _mm_set_pi16(0,0,-tc0,-tc0);
  __m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, delta);
  __m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

  __m64 result_p0 = _mm_add_pi16(merror2,mp0);  //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
  __m64 result_q0 = _mm_sub_pi16(mq0, merror2); //_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);

  __m64 mrshift = _mm_set_pi16(0,0,0,1);
  *((unsigned short* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
  *((unsigned short* )(&pix[0]))        = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));

  empty();
}
void pix_background :: processGrayMMX(imageStruct &image){
  int i;
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;
  if(m_Yrange==0)return;

  __m64*npixes=(__m64*)image.data;
  __m64*opixes=(__m64*)m_savedImage.data;
  __m64 newpix, oldpix, m1;

  unsigned char thresh=m_Yrange-1;
  __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                            thresh,thresh,thresh,thresh);

  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
  while(i--){
    newpix=npixes[i];
    oldpix=opixes[i];
    m1 = _mm_subs_pu8 (newpix, oldpix);
    oldpix= _mm_subs_pu8 (oldpix, newpix);
    m1 = _mm_or_si64 (m1, oldpix);              // |oldpix-newpix|
    m1 = _mm_subs_pu8 (m1, thresh8);
    m1 = _mm_cmpgt_pi8(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh8
    npixes[i] = _mm_and_si64(m1, newpix);
  }
  _mm_empty();
}
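// The loop above compares each byte of the new frame with the stored background and blanks
// pixels that have not moved past the threshold. A per-pixel scalar sketch of the same test
// (hypothetical, illustration only):
static inline unsigned char background_gray_c(unsigned char newpix, unsigned char oldpix, unsigned char thresh)
{
  int diff = (newpix > oldpix) ? (newpix - oldpix) : (oldpix - newpix); /* |oldpix - newpix| */
  return (diff > thresh) ? newpix : 0; /* keep moving pixels, zero the static ones */
}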
__m64 test53(__m64 a, __m64 b) {
  // CHECK: pand
  return _mm_and_si64(a, b);
}
mlib_status mlib_m_conv5x5_8nw_4( mlib_image *dst, mlib_image *src, mlib_s32 *kern, mlib_s32 scalef_expon) { __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr; __m64 *buff0, *buff1, *buff2, *buff3; GET_SRC_DST_PARAMETERS(mlib_u8); __m64 ker[5][5]; __m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc; __m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo; __m64 zero = _m_zero; mlib_s32 shift, ind; mlib_s32 *sp; mlib_s32 row, wid4, i, j; width -= (KSIZE - 1); height -= (KSIZE - 1); width *= NCHAN; dl += ((KSIZE - 1) / 2) * (dll + NCHAN); wid4 = (width + 7) / 4; pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4); GET_KERN(); for (i = 0; i < 10; i++) { buff_arr[i] = pbuff + i * wid4; } ind = 0; for (j = 1; j <= 4; j++) { buff0 = buff_arr[ind]; buff1 = buff_arr[ind + 1]; buff2 = buff_arr[ind + 2]; buff3 = buff_arr[ind + 3]; sp = (mlib_s32 *)sl; *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d1, lo); *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d2, lo); *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d3, lo); *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d4, lo); for (i = 0; i < wid4; i++) { *(mlib_s32 *)&s0 = sp[i]; PREP_5x5(); } sl += sll; ind += j; } for (row = 0; row < height; row++) { __m64 *sp = (__m64 *) sl; __m64 *dp = (__m64 *) dl; buff0 = pbuff_arr[0]; buff1 = pbuff_arr[2]; buff2 = pbuff_arr[5]; buff3 = pbuff_arr[9]; s0 = (*sp++); UNPACK_SRC(d1, lo); UNPACK_SRC(d2, hi); s0 = (*sp++); UNPACK_SRC(d3, lo); UNPACK_SRC(d4, hi); for (i = 0; i < width / 8; i++) { s0 = sp[i]; CONV_5x5(lo, 2 * i); CONV_5x5(hi, 2 * i + 1); dp[i] = _mm_packs_pu16(res_lo, res_hi); } if (width & 7) { __m64 mask; mask = ((__m64 *) mlib_mask64_arr)[width & 7]; s0 = sp[i]; CONV_5x5(lo, 2 * i); CONV_5x5(hi, 2 * i + 1); res_hi = _mm_packs_pu16(res_lo, res_hi); dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi), _mm_andnot_si64(mask, dp[i])); } ind = (pbuff_arr == buff_arr) ? 10 : -10; pbuff_arr[ind + 0] = pbuff_arr[1]; pbuff_arr[ind + 1] = pbuff_arr[3]; pbuff_arr[ind + 2] = pbuff_arr[4]; pbuff_arr[ind + 3] = pbuff_arr[6]; pbuff_arr[ind + 4] = pbuff_arr[7]; pbuff_arr[ind + 5] = pbuff_arr[8]; pbuff_arr[ind + 6] = pbuff_arr[0]; pbuff_arr[ind + 7] = pbuff_arr[2]; pbuff_arr[ind + 8] = pbuff_arr[5]; pbuff_arr[ind + 9] = pbuff_arr[9]; pbuff_arr += ind; sl += sll; dl += dll; } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
//mbl test with Pocket_PC
void weak_horizontal_luma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0){

  __m64 mp2 = _mm_set_pi16(pix[-3*xstride + 3], pix[-3*xstride + 2], pix[-3*xstride + 1], pix[-3*xstride]);
  __m64 mp1 = _mm_set_pi16(pix[-2*xstride + 3], pix[-2*xstride + 2], pix[-2*xstride + 1], pix[-2*xstride]);
  __m64 mp0 = _mm_set_pi16(pix[-1*xstride + 3], pix[-1*xstride + 2], pix[-1*xstride + 1], pix[-1*xstride]);
  __m64 mq0 = _mm_set_pi16(pix[3], pix[2], pix[1], pix[0]);
  __m64 mq1 = _mm_set_pi16(pix[xstride + 3], pix[xstride + 2], pix[xstride + 1], pix[xstride]);
  __m64 mq2 = _mm_set_pi16(pix[2*xstride + 3], pix[2*xstride + 2], pix[2*xstride + 1], pix[2*xstride]);

  __m64 mrshift = _mm_set_pi16(0,0,0,1);
  __m64 maddp0_q0 = _mm_avg_pu8(mp0,mq0);        //addp0_q0 = (p0 + q0 + 1) >> 1;
  __m64 maddp2 = _mm_add_pi16(maddp0_q0,mp2);    //addp2 = (p2 + addp0_q0);
  __m64 maddq2 = _mm_add_pi16(maddp0_q0,mq2);    //addq2 = (q2 + addp0_q0);
  __m64 maddp2_s = _mm_srl_pi16(maddp2,mrshift); //addp2 = (p2 + addp0_q0) >> 1;
  __m64 maddq2_s = _mm_srl_pi16(maddq2,mrshift); //addq2 = (q2 + addp0_q0) >> 1;
  __m64 mp1_c = _mm_sub_pi16(maddp2_s, mp1);     //addp2 - p1
  __m64 mq1_c = _mm_sub_pi16(maddq2_s, mq1);     //addq2 - q1

  //To calculate the mask
  __m64 malpha = _mm_set_pi16(alpha,alpha,alpha,alpha);
  __m64 malphab = _mm_set_pi16(-alpha,-alpha,-alpha,-alpha);
  __m64 mbeta = _mm_set_pi16(beta,beta,beta,beta);
  __m64 mbetab = _mm_set_pi16(-beta,-beta,-beta,-beta);

  __m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0);  //abs(q0 - p0)
  __m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1);  //abs(p1 - p0)
  __m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //abs(q1 - q0)
  __m64 mdiff_p2_p0 = _mm_sub_pi16(mp2,mp0);  //abs(p2 - p0)
  __m64 mdiff_q2_q0 = _mm_sub_pi16(mq2,mq0);  //abs(q2 - q0)

  __m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
  __m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
  __m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
  __m64 mask3 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p2_p0), _mm_cmpgt_pi16(mdiff_p2_p0,mbetab));
  __m64 mask4 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q2_q0), _mm_cmpgt_pi16(mdiff_q2_q0,mbetab));

  __m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2); //(abs(q0 - p0) < alpha) && (abs(p1 - p0) < beta) && (abs(q1 - q0) < beta)
  __m64 second_mask = _mm_and_si64 (first_mask,mask3);
  __m64 third_mask = _mm_and_si64 (first_mask,mask4);

  __m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0);           //(q0 - p0)
  __m64 mlshift = _mm_set_pi16(0,0,0,2);
  __m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift); //inter_1 = (q0 - p0) << 2;
  __m64 minter_2 = _mm_sub_pi16(mp1, mq1);             //(p1 - q1)
  __m64 madd4 = _mm_set_pi16(4,4,4,4);
  __m64 minter_3 = _mm_add_pi16(minter_2, madd4);      //inter_2 = (p1 - q1) + 4;
  __m64 minter_4 = _mm_add_pi16(minter_3,minter_1);    //(inter_1 + inter_2)
  __m64 mrshift3 = _mm_set_pi16(0,0,0,3);
  __m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);

  //Clip3
  __m64 m_tc0 = _mm_set_pi16(tc0,tc0,tc0,tc0);
  __m64 m_tcb0 = _mm_set_pi16(-tc0,-tc0,-tc0,-tc0);
  __m64 mres_c1 = _mm_min_pi16(_mm_max_pi16(mp1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addp2 - p1 );
  __m64 mres_c2 = _mm_min_pi16(_mm_max_pi16(mq1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addq2 - q1 );

  __m64 merror0 = _mm_and_si64 (mres_c1,second_mask);
  __m64 merror1 = _mm_and_si64 (mres_c2,third_mask);

  __m64 m_1 = _mm_set_pi16(1,1,1,1);
  __m64 m_and1 = _mm_and_si64 (mask3, m_1); //tc++; if abs( p2 - p0 ) < beta
  __m64 m_and2 = _mm_and_si64 (mask4, m_1); //tc++; if abs( q2 - q0 ) < beta
  __m64 m_tc = _mm_add_pi16(m_and2,_mm_add_pi16(m_tc0,m_and1));
  __m64 m_tcn =_mm_sub_pi16(_mm_sub_pi16(m_tcb0,m_and1),m_and2);
  __m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcn),m_tc); //CLIP3(-tc, tc, delta);
  __m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

  __m64 result_p1 = _mm_add_pi16(merror0,mp1);  //_mm_shuffle_pi16(_mm_add_pi16(merror0,mp1), 0x1B);
  __m64 result_q1 = _mm_add_pi16(merror1,mq1);  //_mm_shuffle_pi16(_mm_add_pi16(merror1,mq1), 0x1B);
  __m64 result_p0 = _mm_add_pi16(merror2,mp0);  //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
  __m64 result_q0 = _mm_sub_pi16(mq0, merror2); //_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);

  *((unsigned int* )(&pix[-2*xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p1,mrshift));
  *((unsigned int* )(&pix[-xstride]))   = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
  *((unsigned int* )(&pix[0]))          = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));
  *((unsigned int* )(&pix[xstride]))    = _mm_cvtsi64_si32(_mm_packs_pu16(result_q1,mrshift));

  empty();
}
/* *********************************************************** */ mlib_status mlib_m_sconv3x3_8nw_1( mlib_image *dst, mlib_image *src, mlib_s32 *hkernel, mlib_s32 *vkernel, mlib_s32 scalef_expon) { __m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc; __m64 *buff0, *buff1, *buffT; GET_SRC_DST_PARAMETERS(mlib_u8); __m64 hker0, hker1, hker2, vker0, vker1, vker2; __m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo; __m64 zero = _m_zero; mlib_s32 shift; mlib_s32 *sp; mlib_s32 row, wid4, i, j; width -= 2; height -= 2; dl += dll + 1; wid4 = (width + 7) / 4; if (wid4 > BUFF_LINE) { pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4); } GET_KERN(); buff0 = pbuff; buff1 = buff0 + wid4; for (j = 0; j < 2; j++) { sp = (mlib_s32 *)sl; *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d1, lo); for (i = 0; i < wid4; i++) { *(mlib_s32 *)&s0 = sp[i]; PREP_3x3_1ch(lo, i); } sl += sll; buffT = buff1; buff1 = buff0; buff0 = buffT; } for (row = 0; row < height; row++) { __m64 *sp = (__m64 *) sl; __m64 *dp = (__m64 *) dl; s0 = (*sp++); UNPACK_SRC(d1, lo); for (i = 0; i < width / 8; i++) { CONV_3x3_1ch(hi, 2 * i); s0 = sp[i]; CONV_3x3_1ch(lo, 2 * i + 1); dp[i] = _mm_packs_pu16(res_hi, res_lo); } if (width & 7) { __m64 mask; mask = ((__m64 *) mlib_mask64_arr)[width & 7]; CONV_3x3_1ch(hi, 2 * i); s0 = sp[i]; CONV_3x3_1ch(lo, 2 * i + 1); res_hi = _mm_packs_pu16(res_hi, res_lo); dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi), _mm_andnot_si64(mask, dp[i])); } buffT = buff1; buff1 = buff0; buff0 = buffT; sl += sll; dl += dll; } _mm_empty(); if (pbuff != buff_loc) mlib_free(pbuff); return (MLIB_SUCCESS); }
mlib_status mlib_ImageMinFilter7x7_S16( void *dst, void *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) #endif /* MAX_FILTER */ { mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl; __m64 *dp0, *dp1; __m64 aa, bb, cc, dd, ee, ff, r0, r1; __m64 g0, g1, g2, g3, g4, g5, g6, gg; __m64 h0, h1, h2, h3, h4, h5, h6, hh; __m64 e_mask; mlib_s32 i, j, wid8, tail; wid = (wid - KSIZE1) * SSIZE; wid8 = (wid + 7) & ~7; pbuff = mlib_malloc(KSIZE1 * wid8); buff0 = pbuff; buff1 = buff0 + wid8; buff2 = buff1 + wid8; buff3 = buff2 + wid8; buff4 = buff3 + wid8; buff5 = buff4 + wid8; sl = (mlib_u8 *)src; dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE); tail = wid & 7; e_mask = ((__m64 *) mlib_mask64_arr)[tail]; for (j = 0; j < 3; j++) { sp0 = buff4; sp1 = buff5; sp6 = sl; sp7 = sl + slb; sl += 2 * slb; for (i = 0; i < wid; i += 8) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); h0 = *(__m64 *) sp7; h1 = *(__m64 *) (sp7 + SSIZE); h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE); h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE); h6 = *(__m64 *) (sp7 + 6 * SSIZE); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); *(__m64 *) sp0 = gg; *(__m64 *) sp1 = hh; sp0 += 8; sp1 += 8; sp6 += 8; sp7 += 8; } if (j < 2) { buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT; buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT; } } for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = buff4; sp5 = buff5; sp6 = sl; sp7 = sl + slb; /* * line0: aa * line1: bb * line2: cc * line3: dd * line4: ee * line5: ff * line4: g0 g1 g2 g3 g4 g5 g6 * line5: h0 h1 h2 h3 h4 h5 h6 */ for (i = 0; i <= wid - 8; i += 8) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); h0 = *(__m64 *) sp7; h1 = *(__m64 *) (sp7 + SSIZE); h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE); h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE); h6 = *(__m64 *) (sp7 + 6 * SSIZE); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, hh); *(__m64 *) sp0 = gg; *(__m64 *) sp1 = hh; (*dp0++) = r0; (*dp1++) = r1; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; sp5 += 8; sp6 += 8; sp7 += 8; } if (tail) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * 
SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); h0 = *(__m64 *) sp7; h1 = *(__m64 *) (sp7 + SSIZE); h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE); h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE); h6 = *(__m64 *) (sp7 + 6 * SSIZE); gg = C_COMP(g0, g1); hh = C_COMP(h0, h1); g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3); g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5); gg = C_COMP(gg, g2); hh = C_COMP(hh, h2); gg = C_COMP(gg, g4); hh = C_COMP(hh, h4); gg = C_COMP(gg, g6); hh = C_COMP(hh, h6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, hh); *(__m64 *) sp0 = gg; *(__m64 *) sp1 = hh; *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1), _mm_andnot_si64(e_mask, *dp1)); } buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT; buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT; sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - KSIZE1 - 1)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = buff4; sp5 = buff5; sp6 = sl; for (i = 0; i <= wid - 8; i += 8) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); gg = C_COMP(g0, g1); g2 = C_COMP(g2, g3); g4 = C_COMP(g4, g5); gg = C_COMP(gg, g2); gg = C_COMP(gg, g4); gg = C_COMP(gg, g6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); (*dp0++) = r0; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; sp5 += 8; sp6 += 8; } if (tail) { g0 = *(__m64 *) sp6; g1 = *(__m64 *) (sp6 + SSIZE); g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE); g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE); g6 = *(__m64 *) (sp6 + 6 * SSIZE); gg = C_COMP(g0, g1); g2 = C_COMP(g2, g3); g4 = C_COMP(g4, g5); gg = C_COMP(gg, g2); gg = C_COMP(gg, g4); gg = C_COMP(gg, g6); aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5; bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); ff = C_COMP(ff, gg); bb = C_COMP(bb, dd); bb = C_COMP(bb, ff); r0 = C_COMP(aa, bb); *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); } } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to YV12.
VOID Yuy2ToYv12_mmx(PBYTE pbDstY, PBYTE pbDstU, PBYTE pbDstV, INT iDstYStride, INT iDstUvStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
  UINT x;
  INT y;
  INT iSrcXDif;
  INT iDstYDif;
  INT iDstUvDif;
  M64 m0, m1, m2, m3, m4, m5, m6, m7;

  if (iHeight < 0)
  {
    iHeight = -iHeight;
    pbSrcX += (iHeight - 1) * iSrcXStride;
    iSrcXStride = -iSrcXStride;
  }

  iSrcXDif = iSrcXStride - (uWidth * 2);
  iDstYDif = iDstYStride - uWidth;
  iDstUvDif = iDstUvStride - (uWidth / 2);

  m7 = g_mWord00FF;
  for (y = iHeight / 2; y; y--)
  {
    for (x = uWidth / 8; x; x--)
    {
      m0 = ((PM64) pbSrcX)[0];
      m1 = ((PM64) pbSrcX)[1];
      m2 = ((PM64) (pbSrcX + iSrcXStride))[0];
      m3 = ((PM64) (pbSrcX + iSrcXStride))[1];

      m4 = m0;
      m5 = m2;
      m4 = _mm_srli_pi16(m4, 8);
      m5 = _mm_srli_pi16(m5, 8);
      m4 = _mm_and_si64(m4, m7);
      m5 = _mm_and_si64(m5, m7);
      m4 = _mm_add_pi16(m4, m5);

      m5 = m1;
      m6 = m3;
      m5 = _mm_srli_pi16(m5, 8);
      m6 = _mm_srli_pi16(m6, 8);
      m5 = _mm_and_si64(m5, m7);
      m6 = _mm_and_si64(m6, m7);
      m5 = _mm_add_pi16(m5, m6);

      m4 = _mm_add_pi16(m4, g_mWord0001);
      m5 = _mm_add_pi16(m5, g_mWord0001);
      m4 = _mm_srli_pi16(m4, 1);
      m5 = _mm_srli_pi16(m5, 1);

      m0 = _mm_and_si64(m0, m7);
      m1 = _mm_and_si64(m1, m7);
      m2 = _mm_and_si64(m2, m7);
      m3 = _mm_and_si64(m3, m7);
      m0 = _mm_packs_pu16(m0, m1);
      m2 = _mm_packs_pu16(m2, m3);

      ((PM64) pbDstY)[0] = m0;
      ((PM64) (pbDstY + iDstYStride))[0] = m2;

      m4 = _mm_packs_pu16(m4, m5);
      m5 = m4;
      m4 = _mm_srli_si64(m4, 8);
      m5 = _mm_and_si64(m5, m7);
      m4 = _mm_and_si64(m4, m7);
      m5 = _mm_packs_pu16(m5, m5);
      m4 = _mm_packs_pu16(m4, m4);

      ((PDWORD) pbDstU)[0] = _mm_cvtsi64_si32(m5);
      ((PDWORD) pbDstV)[0] = _mm_cvtsi64_si32(m4);

      pbSrcX += 16;
      pbDstY += 8;
      pbDstU += 4;
      pbDstV += 4;
    }

    for (x = (uWidth & 7) / 2; x; x--)
    {
      pbDstY[0] = pbSrcX[0];
      pbDstU[0] = (pbSrcX[1] + pbSrcX[iSrcXStride + 1] + 1) / 2;
      pbDstY[1] = pbSrcX[2];
      pbDstV[0] = (pbSrcX[3] + pbSrcX[iSrcXStride + 3] + 1) / 2;
      pbDstY[iDstYStride + 0] = pbSrcX[iSrcXStride + 0];
      pbDstY[iDstYStride + 1] = pbSrcX[iSrcXStride + 2];
      pbSrcX += 4;
      pbDstY += 2;
      pbDstU++;
      pbDstV++;
    }

    pbSrcX += iSrcXDif + iSrcXStride;
    pbDstY += iDstYDif + iDstYStride;
    pbDstU += iDstUvDif;
    pbDstV += iDstUvDif;
  }
  _mm_empty();
}
mlib_status mlib_m_conv3x3_16nw_4( mlib_image *dst, const mlib_image *src, const mlib_s32 *kern, mlib_s32 scalef_expon) { __m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc; __m64 *buff0, *buff1, *buff2, *buffT; GET_SRC_DST_PARAMETERS(mlib_s16); __m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9; __m64 d0, d1, d2, rr, tmpa, tmpb; __m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph; __m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl; __m64 *sp, *dp; mlib_s32 shift; mlib_s32 row, wid4, i, j; width -= 2; height -= 2; width *= NCHAN; dl += dll + NCHAN; wid4 = (width + 3) / 4; if (wid4 > BUFF_LINE) { pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4); } GET_KERN(); buff0 = pbuff; buff1 = buff0 + 2 * wid4; buff2 = buff1 + 2 * wid4; for (j = 0; j < 2; j++) { sp = (__m64 *) sl; d1 = (*sp++); d2 = (*sp++); for (i = 0; i < wid4; i++) { PREP_3x3(i); } sl += sll; if (j == 0) { buffT = buff1; buff1 = buff0; buff0 = buffT; } } for (row = 0; row < height; row++) { sp = (__m64 *) sl; dp = (__m64 *) dl; d1 = (*sp++); d2 = (*sp++); for (i = 0; i < width / 4; i++) { CONV_3x3(i); dp[i] = rr; } if (width & 3) { __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)]; CONV_3x3(i); dp[i] = _mm_or_si64(_mm_and_si64(mask, rr), _mm_andnot_si64(mask, dp[i])); } buffT = buff1; buff1 = buff0; buff0 = buffT; sl += sll; dl += dll; } _mm_empty(); if (pbuff != buff_loc) mlib_free(pbuff); return (MLIB_SUCCESS); }
mlib_status mlib_ImageMinFilter5x5_U8( void *dst, void *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) #endif /* MAX_FILTER */ { mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT; mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl; __m64 *dp0, *dp1; __m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff, r0, r1; __m64 e_mask, mask8080; mlib_s32 i, j, wid8, tail; wid = (wid - KSIZE1) * SSIZE; wid8 = (wid + 7) & ~7; pbuff = mlib_malloc(4 * wid8); buff0 = pbuff; buff1 = buff0 + wid8; buff2 = buff1 + wid8; buff3 = buff2 + wid8; sl = (mlib_u8 *)src; dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE); tail = wid & 7; e_mask = ((__m64 *) mlib_mask64_arr)[tail]; mask8080 = mmx_from_int_dup(0x80808080); for (j = 0; j < 2; j++) { sp0 = buff0; sp1 = buff1; sp4 = sl; sp5 = sl + slb; sl += 2 * slb; for (i = 0; i < wid; i += 8) { e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); f0 = *(__m64 *) sp5; f1 = *(__m64 *) (sp5 + SSIZE); f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE); f4 = *(__m64 *) (sp5 + 4 * SSIZE); ee = C_COMP(e0, e1); ff = C_COMP(f0, f1); e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3); ee = C_COMP(ee, e4); ff = C_COMP(ff, f4); ee = C_COMP(ee, e2); ff = C_COMP(ff, f2); *(__m64 *) sp0 = ee; *(__m64 *) sp1 = ff; sp0 += 8; sp1 += 8; sp4 += 8; sp5 += 8; } buffT = buff0; buff0 = buff2; buff2 = buffT; buffT = buff1; buff1 = buff3; buff3 = buffT; } for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = sl; sp5 = sl + slb; /* * line0: aa * line1: bb * line2: cc * line3: dd * line4: e0 e1 e2 e3 e4 * line5: f0 f1 f2 f3 f4 */ for (i = 0; i <= wid - 8; i += 8) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); f0 = *(__m64 *) sp5; f1 = *(__m64 *) (sp5 + SSIZE); f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE); f4 = *(__m64 *) (sp5 + 4 * SSIZE); ee = C_COMP(e0, e1); ff = C_COMP(f0, f1); e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3); ee = C_COMP(ee, e4); ff = C_COMP(ff, f4); ee = C_COMP(ee, e2); ff = C_COMP(ff, f2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, ff); *(__m64 *) sp0 = ee; *(__m64 *) sp1 = ff; (*dp0++) = r0; (*dp1++) = r1; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; sp5 += 8; } if (tail) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); f0 = *(__m64 *) sp5; f1 = *(__m64 *) (sp5 + SSIZE); f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE); f4 = *(__m64 *) (sp5 + 4 * SSIZE); ee = C_COMP(e0, e1); ff = C_COMP(f0, f1); e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3); ee = C_COMP(ee, e4); ff = C_COMP(ff, f4); ee = C_COMP(ee, e2); ff = C_COMP(ff, f2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); r1 = C_COMP(bb, ff); *(__m64 *) sp0 = ee; *(__m64 *) sp1 = ff; *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1), _mm_andnot_si64(e_mask, *dp1)); } buffT = buff0; buff0 = buff2; buff2 = buffT; buffT 
= buff1; buff1 = buff3; buff3 = buffT; sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - KSIZE1 - 1)) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = buff0; sp1 = buff1; sp2 = buff2; sp3 = buff3; sp4 = sl; for (i = 0; i <= wid - 8; i += 8) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); ee = C_COMP(e0, e1); e2 = C_COMP(e2, e3); ee = C_COMP(ee, e4); ee = C_COMP(ee, e2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); (*dp0++) = r0; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8; } if (tail) { aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2; dd = *(__m64 *) sp3; e0 = *(__m64 *) sp4; e1 = *(__m64 *) (sp4 + SSIZE); e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE); e4 = *(__m64 *) (sp4 + 4 * SSIZE); ee = C_COMP(e0, e1); e2 = C_COMP(e2, e3); ee = C_COMP(ee, e4); ee = C_COMP(ee, e2); bb = C_COMP(bb, cc); dd = C_COMP(dd, ee); bb = C_COMP(bb, dd); r0 = C_COMP(aa, bb); *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0), _mm_andnot_si64(e_mask, *dp0)); } } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,size_t blkCnt,size_t byteCntAdd) { __m64 kw[12]; /* key schedule words : chaining vars + tweak */ __m64 X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ __m64 w[8]; /* local copy of input block */ __m64 z1; __m64 z3; __m64 z5; __m64 z7; ts[0] = ctx->T[0]; ts[1] = ctx->T[1]; do { ts[0] = _mm_add_si64(ts[0],_mm_set_pi32(0,byteCntAdd)); z1 = SKEIN_KS_PARITY; ks[0] = ctx->X[0]; z1 = _mm_xor_si64(z1,ks[0]); ks[1] = ctx->X[1]; z1 = _mm_xor_si64(z1,ks[1]); ks[2] = ctx->X[2]; z1 = _mm_xor_si64(z1,ks[2]); ks[3] = ctx->X[3]; z1 = _mm_xor_si64(z1,ks[3]); ks[4] = ctx->X[4]; z1 = _mm_xor_si64(z1,ks[4]); ks[5] = ctx->X[5]; z1 = _mm_xor_si64(z1,ks[5]); ks[6] = ctx->X[6]; z1 = _mm_xor_si64(z1,ks[6]); ks[7] = ctx->X[7]; z1 = _mm_xor_si64(z1,ks[7]); ks[8] = z1; ts[2] = _mm_xor_si64(ts[0],ts[1]); X0 = 0[(__m64 *) blkPtr]; X1 = 1[(__m64 *) blkPtr]; X2 = 2[(__m64 *) blkPtr]; X3 = 3[(__m64 *) blkPtr]; X4 = 4[(__m64 *) blkPtr]; X5 = 5[(__m64 *) blkPtr]; X6 = 6[(__m64 *) blkPtr]; X7 = 7[(__m64 *) blkPtr]; w[0] = X0; w[1] = X1; w[2] = X2; w[3] = X3; w[4] = X4; w[5] = X5; w[6] = X6; w[7] = X7; X0 = _mm_add_si64(X0,ks[0]); X1 = _mm_add_si64(X1,ks[1]); X2 = _mm_add_si64(X2,ks[2]); X3 = _mm_add_si64(X3,ks[3]); X4 = _mm_add_si64(X4,ks[4]); X5 = _mm_add_si64(X5,_mm_add_si64(ks[5],ts[0])); X6 = _mm_add_si64(X6,_mm_add_si64(ks[6],ts[1])); X7 = _mm_add_si64(X7,ks[7]); blkPtr += 64; #define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ X##p0 = _mm_add_si64(X##p0,X##p1); \ X##p2 = _mm_add_si64(X##p2,X##p3); \ X##p4 = _mm_add_si64(X##p4,X##p5); \ X##p6 = _mm_add_si64(X##p6,X##p7); \ z1 = X##p1; \ X##p1 = _m_psrlqi(X##p1,64-ROT##_0); \ z1 = _m_psllqi(z1,ROT##_0); \ X##p1 = _mm_or_si64(X##p1,z1); \ z3 = X##p3; \ X##p3 = _m_psrlqi(X##p3,64-ROT##_1); \ z3 = _m_psllqi(z3,ROT##_1); \ X##p3 = _mm_or_si64(X##p3,z3); \ z5 = X##p5; \ z5 = _m_psllqi(z5,ROT##_2); \ X##p5 = _m_psrlqi(X##p5,64-ROT##_2); \ X##p5 = _mm_or_si64(X##p5,z5); \ z7 = X##p7; \ X##p7 = _m_psrlqi(X##p7,64-ROT##_3); \ z7 = _m_psllqi(z7,ROT##_3); \ X##p7 = _mm_or_si64(X##p7,z7); \ X##p1 = _mm_xor_si64(X##p1,X##p0); \ X##p3 = _mm_xor_si64(X##p3,X##p2); \ X##p5 = _mm_xor_si64(X##p5,X##p4); \ X##p7 = _mm_xor_si64(X##p7,X##p6); \ #define I512(R) \ X0 = _mm_add_si64(X0,ks[((R)+1) % 9]); /* inject the key schedule value */ \ X1 = _mm_add_si64(X1,ks[((R)+2) % 9]); \ X2 = _mm_add_si64(X2,ks[((R)+3) % 9]); \ X3 = _mm_add_si64(X3,ks[((R)+4) % 9]); \ X4 = _mm_add_si64(X4,ks[((R)+5) % 9]); \ X5 = _mm_add_si64(X5,_mm_add_si64(ks[((R)+6) % 9],ts[((R)+1) % 3])); \ X6 = _mm_add_si64(X6,_mm_add_si64(ks[((R)+7) % 9],ts[((R)+2) % 3])); \ X7 = _mm_add_si64(X7,_mm_add_si64(ks[((R)+8) % 9],_mm_set_pi32(0,(R)+1))); \ #define R512_8_rounds(R) \ R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ I512(2*(R)); \ R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ I512(2*(R)+1); R512_8_rounds( 0); R512_8_rounds( 1); R512_8_rounds( 2); R512_8_rounds( 3); R512_8_rounds( 4); R512_8_rounds( 5); R512_8_rounds( 6); R512_8_rounds( 7); R512_8_rounds( 8); ctx->X[0] = _mm_xor_si64(X0,w[0]); ctx->X[1] = _mm_xor_si64(X1,w[1]); ctx->X[2] = _mm_xor_si64(X2,w[2]); ctx->X[3] = _mm_xor_si64(X3,w[3]); ctx->X[4] = _mm_xor_si64(X4,w[4]); ctx->X[5] = _mm_xor_si64(X5,w[5]); ctx->X[6] = 
_mm_xor_si64(X6,w[6]); ctx->X[7] = _mm_xor_si64(X7,w[7]); ts[1] = _mm_and_si64(ts[1],_mm_set_pi32(~(((uint32) 64 ) << 24),~0)); } while (--blkCnt); ctx->T[0] = ts[0]; ctx->T[1] = ts[1]; }
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) { __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; #ifdef USE_SSE2 __m128i emm0, emm2, emm4; #else __m64 mm0, mm1, mm2, mm3, mm4, mm5; #endif sign_bit_sin = x; /* take the absolute value */ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); #ifdef USE_SSE2 /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); #else /* store the integer part of y in mm2:mm3 */ xmm3 = _mm_movehl_ps(xmm3, y); mm2 = _mm_cvttps_pi32(y); mm3 = _mm_cvttps_pi32(xmm3); /* j=(j+1) & (~1) (see the cephes sources) */ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1); mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1); y = _mm_cvtpi32x2_ps(mm2, mm3); mm4 = mm2; mm5 = mm3; /* get the swap sign flag for the sine */ mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4); mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4); mm0 = _mm_slli_pi32(mm0, 29); mm1 = _mm_slli_pi32(mm1, 29); __m128 swap_sign_bit_sin; COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin); /* get the polynom selection mask for the sine */ mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2); mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); __m128 poly_mask; COPY_MM_TO_XMM(mm2, mm3, poly_mask); #endif /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); #ifdef USE_SSE2 emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); #else /* get the sign flag for the cosine */ mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2); mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2); mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4); mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4); mm4 = _mm_slli_pi32(mm4, 29); mm5 = _mm_slli_pi32(mm5, 29); __m128 sign_bit_cos; COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos); _mm_empty(); /* good-bye mmx */ #endif sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m128 z = _mm_mul_ps(x,x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = 
_mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *(__m128*)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m128 y2 = *(__m128*)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2,ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1,ysin2); xmm2 = _mm_add_ps(y,y2); /* update the sign */ *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
/* almost the same as sin_ps */ __m128 cos_ps(v4sfu *xPtr) { // any x __m128 x=*((__m128 *)xPtr); __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y; #ifdef USE_SSE2 __m128i emm0, emm2; #else __m64 mm0, mm1, mm2, mm3; #endif /* take the absolute value */ x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); /* scale by 4/Pi */ y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); #ifdef USE_SSE2 /* store the integer part of y in mm0 */ emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2); /* get the swap sign flag */ emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); /* get the polynom selection mask */ emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 sign_bit = _mm_castsi128_ps(emm0); __m128 poly_mask = _mm_castsi128_ps(emm2); #else /* store the integer part of y in mm0:mm1 */ xmm2 = _mm_movehl_ps(xmm2, y); mm2 = _mm_cvttps_pi32(y); mm3 = _mm_cvttps_pi32(xmm2); /* j=(j+1) & (~1) (see the cephes sources) */ mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1); mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1); y = _mm_cvtpi32x2_ps(mm2, mm3); mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2); mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2); /* get the swap sign flag in mm0:mm1 and the polynom selection mask in mm2:mm3 */ mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4); mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4); mm0 = _mm_slli_pi32(mm0, 29); mm1 = _mm_slli_pi32(mm1, 29); mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2); mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2); mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64()); mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64()); __m128 sign_bit, poly_mask; COPY_MM_TO_XMM(mm0, mm1, sign_bit); COPY_MM_TO_XMM(mm2, mm3, poly_mask); _mm_empty(); /* good-bye mmx */ #endif /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(__m128*)_ps_coscof_p0; __m128 z = _mm_mul_ps(x,x); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *(__m128*)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m128 y2 = *(__m128*)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm_and_ps(xmm3, y2); //, xmm3); y = _mm_andnot_ps(xmm3, y); y = _mm_add_ps(y,y2); /* update the sign */ y = _mm_xor_ps(y, sign_bit); return y; }
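// Both routines above cast their v4sfu arguments straight to __m128, so callers must pass
// 16-byte-aligned groups of four floats. A minimal usage sketch under that assumption
// (illustration only, not from the original source):
static void sincos_demo(void)
{
  alignas(16) float angles[4] = {0.0f, 0.5f, 1.0f, 2.0f};
  alignas(16) float sines[4], cosines[4];
  sincos_ps((v4sfu *)angles, (v4sfu *)sines, (v4sfu *)cosines); /* sines[i] = sin(angles[i]), cosines[i] = cos(angles[i]) */
}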
mlib_status mlib_m_sconv5x5_8nw_2( mlib_image *dst, mlib_image *src, mlib_s32 *hkernel, mlib_s32 *vkernel, mlib_s32 scalef_expon) { __m64 *pbuff, *buff_arr[5]; __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT; GET_SRC_DST_PARAMETERS(mlib_u8); __m64 hker0, hker1, hker2, hker3, hker4; __m64 vker0, vker1, vker2, vker3, vker4; __m64 s0, d0, d1, d2, prev0; __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo; __m64 zero = _m_zero; mlib_s32 shift, ind; mlib_s32 *sp; mlib_s32 row, wid4, i, j; width -= 4; height -= 4; width *= NCHAN; dl += 2 * (dll + NCHAN); wid4 = 2 * ((width + 7) / 8); pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4); GET_KERN(); for (i = 0; i < 5; i++) { buff_arr[i] = pbuff + i * wid4; } for (j = 0; j < 4; j++) { buff4 = buff_arr[j]; sp = (mlib_s32 *)sl; *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d1, lo); *(mlib_s32 *)&s0 = (*sp++); UNPACK_SRC(d2, lo); for (i = 0; i < wid4; i++) { *(mlib_s32 *)&s0 = sp[i]; PREP_5x5(lo, i); } sl += sll; ind++; } buff0 = buff_arr[0]; buff1 = buff_arr[1]; buff2 = buff_arr[2]; buff3 = buff_arr[3]; buff4 = buff_arr[4]; for (row = 0; row < height; row++) { __m64 *sp = (__m64 *) sl; __m64 *dp = (__m64 *) dl; s0 = (*sp++); UNPACK_SRC(d1, lo); UNPACK_SRC(d2, hi); for (i = 0; i < width / 8; i++) { s0 = sp[i]; CONV_5x5(lo, 2 * i); CONV_5x5(hi, 2 * i + 1); dp[i] = _mm_packs_pu16(res_lo, res_hi); } if (width & 7) { __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7]; s0 = sp[i]; CONV_5x5(lo, 2 * i); CONV_5x5(hi, 2 * i + 1); res_hi = _mm_packs_pu16(res_lo, res_hi); dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi), _mm_andnot_si64(mask, dp[i])); } buffT = buff0; buff0 = buff1; buff1 = buff2; buff2 = buff3; buff3 = buff4; buff4 = buffT; sl += sll; dl += dll; } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
mlib_status mlib_m_conv5x5_u16nw_2( mlib_image *dst, mlib_image *src, mlib_s32 *kern, mlib_s32 scalef_expon) { __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr; __m64 *buff0, *buff1, *buff2, *buff3; GET_SRC_DST_PARAMETERS(mlib_s16); __m64 ker[5][5]; __m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000; __m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h, tmph; __m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l, tmpl; __m64 *sp, *dp; mlib_s32 shift, ind, ker_sum = 0; mlib_s32 row, wid4, i, j; width -= 4; height -= 4; width *= NCHAN; dl += 2 * (dll + NCHAN); wid4 = (width + 7) / 4; pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4); GET_KERN(); for (i = 0; i < 10; i++) { buff_arr[i] = pbuff + i * 2 * wid4; } ind = 0; for (j = 1; j <= 4; j++) { buff0 = buff_arr[ind]; buff1 = buff_arr[ind + 1]; buff2 = buff_arr[ind + 2]; buff3 = buff_arr[ind + 3]; sp = (__m64 *) sl; d1 = (*sp++); d1 = _mm_xor_si64(d1, mask8000); d2 = (*sp++); d2 = _mm_xor_si64(d2, mask8000); for (i = 0; i < wid4; i++) { PREP_5x5(); } sl += sll; ind += j; } for (row = 0; row < height; row++) { sp = (__m64 *) sl; dp = (__m64 *) dl; buff0 = pbuff_arr[0]; buff1 = pbuff_arr[2]; buff2 = pbuff_arr[5]; buff3 = pbuff_arr[9]; d1 = (*sp++); d1 = _mm_xor_si64(d1, mask8000); d2 = (*sp++); d2 = _mm_xor_si64(d2, mask8000); for (i = 0; i < width / 4; i++) { CONV_5x5(hi, i); dp[i] = rr; } if (width & 3) { __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)]; CONV_5x5(hi, i); dp[i] = _mm_or_si64(_mm_and_si64(mask, rr), _mm_andnot_si64(mask, dp[i])); } ind = (pbuff_arr == buff_arr) ? 10 : -10; pbuff_arr[ind + 0] = pbuff_arr[1]; pbuff_arr[ind + 1] = pbuff_arr[3]; pbuff_arr[ind + 2] = pbuff_arr[4]; pbuff_arr[ind + 3] = pbuff_arr[6]; pbuff_arr[ind + 4] = pbuff_arr[7]; pbuff_arr[ind + 5] = pbuff_arr[8]; pbuff_arr[ind + 6] = pbuff_arr[0]; pbuff_arr[ind + 7] = pbuff_arr[2]; pbuff_arr[ind + 8] = pbuff_arr[5]; pbuff_arr[ind + 9] = pbuff_arr[9]; pbuff_arr += ind; sl += sll; dl += dll; } _mm_empty(); mlib_free(pbuff); return (MLIB_SUCCESS); }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Convert YUY2 to RGB24. VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight) { UINT x; INT y; INT iDstXDif; INT iSrcXDif; INT yy, bu, guv, rv; M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz; M64 r0, g0, b0, r1, g1, b1; M64 rgb0, rgb1, rgb2, rgb3; M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1; if (iHeight < 0) { iHeight = -iHeight; pbSrcX += (iHeight - 1) * iSrcXStride; iSrcXStride = -iSrcXStride; } iDstXDif = iDstXStride - (uWidth * 3); iSrcXDif = iSrcXStride - (uWidth * 2); mz = _mm_setzero_si64(); for (y = iHeight; y; y--) { for (x = uWidth / 8; x; x--) { y0 = ((PM64) pbSrcX)[0]; y1 = ((PM64) pbSrcX)[1]; u0 = y0; v0 = y1; y0 = _mm_and_si64(y0, g_mWord00FF); y1 = _mm_and_si64(y1, g_mWord00FF); u0 = _mm_srli_pi16(u0, 8); v0 = _mm_srli_pi16(v0, 8); uv_temp1 = _mm_srli_pi32(u0, 16); u0 = _mm_slli_pi32(u0, 16); u0 = _mm_srli_pi32(u0, 16); uv_temp2 = _mm_srli_pi32(v0, 16); v0 = _mm_slli_pi32(v0, 16); v0 = _mm_srli_pi32(v0, 16); u0 = _mm_packs_pi32(u0, v0); v0 = _mm_packs_pi32(uv_temp1, uv_temp2); // Calculate coefficient. u0 = _mm_subs_pi16(u0, g_mSub80); v0 = _mm_subs_pi16(v0, g_mSub80); gu0 = _mm_mullo_pi16(u0, g_mUGMul); gv0 = _mm_mullo_pi16(v0, g_mVGMul); bu0 = _mm_mullo_pi16(u0, g_mUBMul); rv0 = _mm_mullo_pi16(v0, g_mVRMul); guv0 = _mm_adds_pi16(gu0, gv0); guv1 = _mm_unpackhi_pi16(guv0, guv0); // guv3 guv3 guv2 guv2 guv0 = _mm_unpacklo_pi16(guv0, guv0); // guv1 guv1 guv0 guv0 bu1 = _mm_unpackhi_pi16(bu0, bu0); // bu3 bu3 bu2 bu2 bu0 = _mm_unpacklo_pi16(bu0, bu0); // bu1 bu1 bu0 bu0 rv1 = _mm_unpackhi_pi16(rv0, rv0); // rv3 rv3 rv2 rv2 rv0 = _mm_unpacklo_pi16(rv0, rv0); // rv1 rv1 rv0 rv0 // Process for row 0. y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); g0 = _mm_srai_pi16(g0, SCALEBITS); g0 = _mm_packs_pu16(g0, g1); // g7 g6 ...g1 g0 b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); b0 = _mm_srai_pi16(b0, SCALEBITS); b0 = _mm_packs_pu16(b0, b1); r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r0 = _mm_packs_pu16(r0, r1); r1 = _mm_unpackhi_pi8(b0, r0); // r7 b7 r6 b6 r5 b5 r4 b4 r0 = _mm_unpacklo_pi8(b0, r0); // r3 b3 r2 b2 r1 b1 r0 b0 g1 = _mm_unpackhi_pi8(g0, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); // 0 g3 0 g2 0 g1 0 g0 rgb0 = _mm_unpacklo_pi8(r0, g0); // 0 r1 g1 b1 0 r0 g0 b0 rgb1 = _mm_unpackhi_pi8(r0, g0); // 0 r3 g3 b3 0 r2 g2 b2 rgb2 = _mm_unpacklo_pi8(r1, g1); // 0 r5 g5 b5 0 r4 g4 b4 rgb3 = _mm_unpackhi_pi8(r1, g1); // 0 r7 g7 b7 0 r6 g6 b6 // Write out row 0. 
*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32); *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0); *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32); *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1); *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32); *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2); *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32); *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3); pbDstX += 24; pbSrcX += 16; } for (x = (uWidth & 7) / 2; x; x--) { bu = g_iBUTab[pbSrcX[1]]; guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]]; rv = g_iRVTab[pbSrcX[3]]; yy = g_iYYTab[pbSrcX[0]]; pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT); yy = g_iYYTab[pbSrcX[2]]; pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT); pbDstX += 6; pbSrcX += 4; } pbDstX += iDstXDif; pbSrcX += iSrcXDif; } _mm_empty(); }
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
  // MMX temporaries:
  const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
  const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
  const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
  const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
  const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
  const __m64 fgColors01 = _mm_setr_pi32(
    rt_mapcolor<argb_t>(dcol.colormap, source[0]),
    rt_mapcolor<argb_t>(dcol.colormap, source[1])
  );

  const __m64 finalColors01 = _mm_packs_pu16(
    _mm_srli_pi16(
      _mm_adds_pi16(
        _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
        _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
      ),
      8
    ),
    _mm_srli_pi16(
      _mm_adds_pi16(
        _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
        _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
      ),
      8
    )
  );

#if 1
  const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
  // NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
  const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
  const __m64 fgColors23 = _mm_setr_pi32(
    rt_mapcolor<argb_t>(dcol.colormap, source[2]),
    rt_mapcolor<argb_t>(dcol.colormap, source[3])
  );

  const __m64 finalColors23 = _mm_packs_pu16(
    _mm_srli_pi16(
      _mm_adds_pi16(
        _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
        _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
      ),
      8
    ),
    _mm_srli_pi16(
      _mm_adds_pi16(
        _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
        _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
      ),
      8
    )
  );

#if 1
  dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
  dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
  dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
  dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
  // NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
  *((__m64 *)&dest[0]) = finalColors01;
  *((__m64 *)&dest[2]) = finalColors23;
#endif

  // Required to reset FP:
  _mm_empty();
}
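// Written out per channel, the packed arithmetic above is a plain alpha blend in 8.8 fixed
// point. A scalar sketch (hypothetical, illustration only):
static inline unsigned char blend_channel_c(unsigned char bg, unsigned char fg, int bga, int fga)
{
  return (unsigned char)((bg * bga + fg * fga) >> 8); /* bga and fga are 0..256 weights */
}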
mlib_status mlib_ImageErode4_U16( void *dst, void *src, mlib_s32 dlb, mlib_s32 slb, mlib_s32 wid, mlib_s32 hgt) #endif /* DILATE_FILTER */ { mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl; __m64 *dp0, *dp1; __m64 a1, b0, b1, b2, c0, c1, c2, d1, vv, h0, h1, r0, r1; __m64 mask, mask80; mlib_s32 i, j, tail; sl = (mlib_u8 *)src; /* dst ptrs skip top j and left col */ dl = (mlib_u8 *)dst + dlb + SSIZE; wid = (wid - 2) * SSIZE; tail = wid & 7; mask = ((__m64 *) mlib_mask64_arr)[tail]; mask80 = mmx_from_int_dup(0x80008000); for (j = 0; j <= (hgt - 2 - 2); j += 2) { dp0 = (void *)dl; dp1 = (void *)(dl + dlb); sp0 = sl; sp1 = sp0 + slb; sp2 = sp1 + slb; sp3 = sp2 + slb; /* * line0: a1 * line1: b0 b1 b2 * line2: c0 c1 c2 * line3: d1 */ for (i = 0; i <= wid - 8; i += 8) { a1 = *(__m64 *) (sp0 + SSIZE); b0 = *(__m64 *) (sp1); b1 = *(__m64 *) (sp1 + SSIZE); b2 = *(__m64 *) (sp1 + 2 * SSIZE); c0 = *(__m64 *) (sp2); c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); d1 = *(__m64 *) (sp3 + SSIZE); vv = C_COMP(b1, c1); h0 = C_COMP(b0, b2); h1 = C_COMP(c0, c2); r0 = C_COMP(vv, a1); r1 = C_COMP(vv, d1); r0 = C_COMP(r0, h0); r1 = C_COMP(r1, h1); (*dp0++) = r0; (*dp1++) = r1; sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; } if (tail) { a1 = *(__m64 *) (sp0 + SSIZE); b0 = *(__m64 *) (sp1); b1 = *(__m64 *) (sp1 + SSIZE); b2 = *(__m64 *) (sp1 + 2 * SSIZE); c0 = *(__m64 *) (sp2); c1 = *(__m64 *) (sp2 + SSIZE); c2 = *(__m64 *) (sp2 + 2 * SSIZE); d1 = *(__m64 *) (sp3 + SSIZE); vv = C_COMP(b1, c1); h0 = C_COMP(b0, b2); h1 = C_COMP(c0, c2); r0 = C_COMP(vv, a1); r1 = C_COMP(vv, d1); r0 = C_COMP(r0, h0); r1 = C_COMP(r1, h1); *dp0 = _mm_or_si64(_mm_and_si64(mask, r0), _mm_andnot_si64(mask, *dp0)); *dp1 = _mm_or_si64(_mm_and_si64(mask, r1), _mm_andnot_si64(mask, *dp1)); } sl += 2 * slb; dl += 2 * dlb; } /* last line */ if (j == (hgt - 3)) { dp0 = (void *)dl; sp0 = sl; sp1 = sp0 + slb; sp2 = sp1 + slb; for (i = 0; i <= wid - 8; i += 8) { a1 = *(__m64 *) (sp0 + SSIZE); b0 = *(__m64 *) (sp1); b1 = *(__m64 *) (sp1 + SSIZE); b2 = *(__m64 *) (sp1 + 2 * SSIZE); c1 = *(__m64 *) (sp2 + SSIZE); vv = C_COMP(b1, c1); h0 = C_COMP(b0, b2); r0 = C_COMP(vv, a1); r0 = C_COMP(r0, h0); (*dp0++) = r0; sp0 += 8; sp1 += 8; sp2 += 8; } if (tail) { a1 = *(__m64 *) (sp0 + SSIZE); b0 = *(__m64 *) (sp1); b1 = *(__m64 *) (sp1 + SSIZE); b2 = *(__m64 *) (sp1 + 2 * SSIZE); c1 = *(__m64 *) (sp2 + SSIZE); vv = C_COMP(b1, c1); h0 = C_COMP(b0, b2); r0 = C_COMP(vv, a1); r0 = C_COMP(r0, h0); *dp0 = _mm_or_si64(_mm_and_si64(mask, r0), _mm_andnot_si64(mask, *dp0)); } } _mm_empty(); return (MLIB_SUCCESS); }
void pix_background :: processYUVMMX(imageStruct &image)
{
  long pixsize;

  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    // return;
  }
  m_reset=0;

  int i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
                                 m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 black =_mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80);

  __m64 newpix, oldpix, m1;

  while(i--){
    newpix=*data;
    oldpix=*saved++;

    m1    = newpix;
    m1    = _mm_subs_pu8   (m1, oldpix);
    oldpix= _mm_subs_pu8   (oldpix, newpix);
    m1    = _mm_or_si64    (m1, oldpix);                // |oldpix-newpix|
    m1    = _mm_adds_pu8   (m1, offset);                // to make thresh=0 work correctly
    m1    = _mm_subs_pu8   (m1, thresh);                // m1>thresh -> saturation -> 0
    m1    = _mm_cmpeq_pi32 (m1, _mm_setzero_si64());    // |oldpix-newpix|>thresh
    oldpix= black;
    oldpix= _mm_and_si64   (oldpix, m1);
    m1    = _mm_andnot_si64(m1, newpix);
    m1    = _mm_or_si64    (m1, oldpix);
    *data++ = m1;
  }
  _mm_empty();
}
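/*
 * A minimal scalar sketch of the per-pixel test done by the MMX loop above.
 * A 32-bit group of packed YUV bytes is replaced by "black" (Y=0x00,
 * U=V=0x80) only if every byte lies within its per-channel range of the
 * saved background; otherwise the new pixel is kept.  The helper name, the
 * assumed U/Y/V byte order inside a group, and the standalone parameters
 * (yrange/urange/vrange for m_Yrange/m_Urange/m_Vrange) are illustrative
 * assumptions, not part of the GEM source.
 */
static inline unsigned char absdiff_u8(unsigned char a, unsigned char b)
{
  return (a > b) ? (unsigned char)(a - b) : (unsigned char)(b - a);
}

static void background_yuv_scalar(unsigned char *data, const unsigned char *saved,
                                  long nbytes,
                                  unsigned char yrange, unsigned char urange,
                                  unsigned char vrange)
{
  // per-byte thresholds and "black" pattern for one 32-bit group,
  // in low-to-high byte order (assumed packing: Y V Y U per group)
  const unsigned char thresh[4] = { yrange, vrange, yrange, urange };
  const unsigned char black[4]  = { 0x80, 0x00, 0x80, 0x00 };

  for (long i = 0; i + 4 <= nbytes; i += 4) {
    int within = 1;
    for (int k = 0; k < 4; k++) {
      // matches the saturating |diff|+1-thresh == 0 test in the MMX code
      if (absdiff_u8(data[i + k], saved[i + k]) >= thresh[k])
        within = 0;
    }
    if (within) {
      for (int k = 0; k < 4; k++)
        data[i + k] = black[k];     // pixel matches the background: blank it
    }
    // otherwise the new pixel is left in place
  }
}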
void
mlib_m_ImageMaximum_U8_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
/* src address */
    __m64 *sp, *sl;

/* src data */
    __m64 sd;

/* max values */
    __m64 max0, max1, max2, max3;

/* edge mask */
    mlib_s32 emask;

/* loop variables */
    mlib_s32 n1;

/* height of image */
    mlib_s32 height = mlib_ImageGetHeight(img);

/* elements to next row */
    mlib_s32 slb = mlib_ImageGetStride(img);

    mlib_s32 width = mlib_ImageGetWidth(img) * 3;
    mlib_u8 *dend;

    if (slb == width) {
        width *= height;
        height = 1;
    }

    sp = sl = (__m64 *) mlib_ImageGetData(img);

    max1 = _mm_set1_pi8(MLIB_U8_MIN);
    max2 = _mm_set1_pi8(MLIB_U8_MIN);
    max3 = _mm_set1_pi8(MLIB_U8_MIN);

    for (; height > 0; height--) {
        n1 = width;
        dend = (mlib_u8 *)sp + width;

        for (; n1 > 23; n1 -= 24) {
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8(max1, max1, sd);
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8(max2, max2, sd);
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8(max3, max3, sd);
        }

        if (n1 > 0) {
            emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8_M32(max1, max1, sd, emask);
            n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);

            if (n1 > 0) {
                emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
                sd = (*sp++);
                MLIB_M_IMAGE_MAXIMUM_U8_M32(max2, max2, sd, emask);
                n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);

                if (n1 > 0) {
                    emask = (0xFF << (8 - n1));
                    sd = *sp;
                    MLIB_M_IMAGE_MAXIMUM_U8_M32(max3, max3, sd, emask);
                }
            }
        }

        sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
    }

    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max1, _mm_srli_si64(max2, 8),
        mmx_write_64(0x00ffffffffffffffll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max2, 16),
        mmx_write_64(0x0000000000ff0000ll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max3, 16),
        mmx_write_64(0x0000ffffffffffffll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max3, 8),
        mmx_write_64(0x0000000000ffff00ll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
        mmx_write_64(0x000000ffff000000ll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
        mmx_write_64(0x0000000000ffffffll));

    res32[0] = _mm_cvtsi64_si32(_mm_and_si64(max0,
        mmx_write_64(0x00000000000000ffll)));
    res32[1] = _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 8),
        mmx_write_64(0x00000000000000ffll)));
    res32[2] = _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 16),
        mmx_write_64(0x00000000000000ffll)));

    _mm_empty();
}
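/*
 * A scalar reference for the channel-wise maximum computed above, assuming
 * 3-channel U8 data with a row stride of slb bytes.  The helper name is a
 * hypothetical illustration; the MMX version differs only in how it
 * vectorizes the scan and merges the three interleaved accumulators.
 */
static void image_maximum_u8_3_scalar(mlib_s32 *res32, const mlib_image *img)
{
    mlib_s32 height = mlib_ImageGetHeight(img);
    mlib_s32 width  = mlib_ImageGetWidth(img);
    mlib_s32 slb    = mlib_ImageGetStride(img);
    const mlib_u8 *row = (const mlib_u8 *)mlib_ImageGetData(img);

    res32[0] = res32[1] = res32[2] = MLIB_U8_MIN;

    for (mlib_s32 y = 0; y < height; y++, row += slb) {
        for (mlib_s32 x = 0; x < width; x++) {
            for (int c = 0; c < 3; c++) {
                if (row[3 * x + c] > res32[c])
                    res32[c] = row[3 * x + c];   /* per-channel running max */
            }
        }
    }
}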
mlib_status
mlib_m_sconv5x5_u16nw_3(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, rr, rh, rl;
    __m64 aa, bb, cc, zero, ker_off, mask8000;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
    mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
    mlib_s32 i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        dp = (__m64 *) dl;

        PREP_V();

        for (i = 0; i < width / 4; i++) {
            CONV_5x5();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();

    return (MLIB_SUCCESS);
}
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
    __m64 chroma_mask = _mm_set_pi8(255, 0, 255, 0, 255, 0, 255, 0);
    __m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
    const uint8_t *orig_input = input;
    uint8_t *y_comp = output;
    uint8_t *u_comp = output + width * height;
    uint8_t *v_comp = u_comp + (int)((width * height)/2);   // 4:2:2
    int i, j;

    // When preparing video for PAL DV50 encoding, the video must be shifted
    // down by one line to change the field order to be bottom-field-first
    int start_line = 0;
    if (shift_picture_down) {
        memset(y_comp, 0x10, width);        // write one line of black Y
        y_comp += width;
        memset(u_comp, 0x80, width/2);      // write one line of black U
        u_comp += width/2;
        memset(v_comp, 0x80, width/2);      // write one line of black V
        v_comp += width/2;
        start_line = 1;
    }

    /* Do the y component */
    for (j = start_line; j < height; j++)
    {
        // Consume 16 bytes of UYVY data per iteration (8 pixels worth)
        for (i = 0; i < width*2; i += 16)
        {
            //__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
            //__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
            //__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
            //*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);

            __m64 m0 = *(__m64 *)input;
            __m64 m2 = _mm_srli_si64(m0, 8);
            __m64 m3 = _mm_slli_si64(m0, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);  // keep only the luma byte of each 16-bit lane
            m0 = m2;
            __m64 m1 = *(__m64 *)(input+8);
            m2 = _mm_srli_si64(m1, 8);
            m3 = _mm_slli_si64(m1, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);
            m1 = m2;
            *(__m64 *)y_comp = _mm_packs_pu16 (m0, m1);

            y_comp += 8;
            input += 16;
        }
    }

    /* Do the chroma components */
    input = orig_input;
    for (j = start_line; j < height; j++)
    {
        /* Process every line for yuv 4:2:2 */
        for (i = 0; i < width*2; i += 16)
        {
            __m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));

            __m64 m3 = _mm_unpacklo_pi8 (m1, m2);
            __m64 m4 = _mm_unpackhi_pi8 (m1, m2);
            //*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
            //*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
            memcpy (u_comp, &m3, 4);
            memcpy (v_comp, &m4, 4);
            u_comp += 4;
            v_comp += 4;
            input += 16;
        }
    }
    _mm_empty();    // Clear aliased fp register state
}
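/*
 * Usage sketch for uyvy_to_yuv422 above.  The frame size, the helper name
 * convert_one_frame, and the placeholder encode step are illustrative
 * assumptions; only the buffer layout follows from the function itself
 * (width*height luma bytes followed by two width/2 x height chroma planes,
 * i.e. width*height*2 bytes in total).
 */
#include <stdint.h>
#include <stdlib.h>

int convert_one_frame(const uint8_t *uyvy_frame)
{
    const int width = 720, height = 576;                    /* assumed PAL frame */
    uint8_t *planar = (uint8_t *)malloc((size_t)width * height * 2);
    if (planar == NULL)
        return -1;

    /* shift_picture_down = 1 reorders PAL DV50 material to bottom-field-first */
    uyvy_to_yuv422(width, height, 1, uyvy_frame, planar);

    /* ... encode or write out the planar 4:2:2 frame here ... */

    free(planar);
    return 0;
}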
mlib_status
mlib_m_sconv7x7_16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
    __m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
    __m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh, rl;
    __m64 zero, _rnd;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
    mlib_s32 shift, kerh_sum;
    mlib_s32 i, j;

    width -= KSIZE1;
    height -= KSIZE1;
    width *= NCHAN;
    dl += (KSIZE / 2) * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        sp5 = (__m64 *) (sl + 5 * sll);
        sp6 = (__m64 *) (sl + 6 * sll);
        dp = (__m64 *) dl;

        PREP_V(v1);
        PREP_V(v2);
        PREP_V(v3);
        PREP_V(v4);
        PREP_V(v5);
        PREP_V(v6);

        for (i = 0; i < width / 4; i++) {
            CONV_7x7();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_7x7();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();

    return (MLIB_SUCCESS);
}
__m64 test_mm_and_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_and_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.pand
  return _mm_and_si64(a, b);
}