uint32_t probe(uint32_t key)
{
    /* create a vector with all values initialized to key */
    __m128i keyVector = _mm_set1_epi32(key);

    /* find the appropriate buckets using multiplicative hashing */
    __m128i bucketIds = _mm_mullo_epi32(keyVector, hashes.vec128);
    bucketIds = _mm_srli_epi32(bucketIds, hashShift);
    size_t b0 = _mm_extract_epi32(bucketIds, 0);
    size_t b1 = _mm_extract_epi32(bucketIds, 1);

    __m128i keys;
    __m128i values0, values1;

    /* load keys, compare with lookup key (to produce a bitmask).
     * AND the result with the corresponding values. */
    keys = _mm_load_si128((const __m128i *) buckets[b0].keys);
    keys = _mm_cmpeq_epi32(keys, keyVector);
    values0 = _mm_load_si128((const __m128i *) buckets[b0].values);
    values0 = _mm_and_si128(values0, keys);

    keys = _mm_load_si128((const __m128i *) buckets[b1].keys);
    keys = _mm_cmpeq_epi32(keys, keyVector);
    values1 = _mm_load_si128((const __m128i *) buckets[b1].values);
    values1 = _mm_and_si128(values1, keys);

    /* OR all of the (key AND value) pairs to get result */
    union QuadInt qi;
    qi.vec128 = _mm_or_si128(values0, values1);
    qi.vec64[0] = _mm_or_si64(qi.vec64[0], qi.vec64[1]);

    return qi.arr[0] | qi.arr[1];
}
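/* A minimal sketch of the declarations the probe above assumes. The
 * original surrounding source is not shown here, so the exact field
 * names, the 4-entry bucket width, and the global hashes/hashShift
 * values are illustrative assumptions, not the author's definitions. */
#include <stdint.h>
#include <smmintrin.h>  /* SSE4.1: _mm_mullo_epi32, _mm_extract_epi32 */

union QuadInt {
    __m128i  vec128;    /* one 128-bit register ...                  */
    __m64    vec64[2];  /* ... viewed as two 64-bit halves ...       */
    uint32_t arr[4];    /* ... or as four 32-bit lanes               */
};

struct Bucket {         /* hypothetical bucket layout; the arrays    */
    uint32_t keys[4];   /* must be 16-byte aligned for _mm_load_si128 */
    uint32_t values[4];
};

static struct Bucket *buckets;   /* hash table storage (assumed)     */
static union QuadInt  hashes;    /* per-lane multiplicative constants */
static uint32_t       hashShift;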
static void replace_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch,
                                  int luma_pitch, int width, int height)
{
  int mod8_width = width / 8 * 8;
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < mod8_width; x += 8) {
      __m64 s = *reinterpret_cast<const __m64*>(src + x);
      __m64 l = *reinterpret_cast<const __m64*>(luma + x);

      __m64 s_chroma = _mm_and_si64(s, chroma_mask);
      __m64 l_luma   = _mm_and_si64(l, luma_mask);

      __m64 result = _mm_or_si64(s_chroma, l_luma);
      *reinterpret_cast<__m64*>(src + x) = result;
    }

    for (int x = mod8_width; x < width; x += 2) {
      src[x] = luma[x];
    }
    src  += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
void pix_diff :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64 *leftPix  = (__m64*)image.data;
  __m64 *rightPix = (__m64*)right.data;
  datasize = datasize / sizeof(__m64) + (datasize % sizeof(__m64) != 0);

  __m64 mask = _mm_setr_pi8(0x40, 0x00, 0x40, 0x00, 0x40, 0x00, 0x40, 0x00);
  __m64 l, r, b;
  while (datasize--) {
    l = leftPix [datasize];
    r = rightPix[datasize];
    l = _mm_adds_pu8(l, mask);   // bias the chroma bytes of the left image up
    r = _mm_subs_pu8(r, mask);   // and of the right image down
    b = l;
    b = _mm_subs_pu8(b, r);
    r = _mm_subs_pu8(r, l);
    b = _mm_or_si64 (b, r);      // |l - r| via saturated subtraction in both orders
    leftPix[datasize] = b;
  }
  _mm_empty();
}
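/* The function above relies on a recurring MMX idiom: unsigned saturating
 * subtraction gives subs_pu8(a,b) = max(a-b, 0) per byte, so OR-ing the
 * two orders yields |a-b| with no compare instruction. A standalone
 * sketch of the idiom (ours, not from the original source): */
#include <mmintrin.h>

static inline __m64 absdiff_pu8(__m64 a, __m64 b)
{
    __m64 d0 = _mm_subs_pu8(a, b);  /* max(a-b, 0) per byte */
    __m64 d1 = _mm_subs_pu8(b, a);  /* max(b-a, 0) per byte */
    return _mm_or_si64(d0, d1);     /* one operand is 0, so OR = |a-b| */
}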
mlib_status
mlib_m_sconv3x3_16nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, vker0, vker1, vker2;
    __m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl;
    __m64 *sp0, *sp1, *sp2, *dp;
    __m64 zero, _rnd;
    mlib_s32 shift, kerh_sum;
    mlib_s32 i, j;

    width -= 2;
    height -= 2;
    width *= NCHAN;
    dl += dll + NCHAN;

    GET_KERN();
    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        dp = (__m64 *) dl;

        PREP_V();

        for (i = 0; i < width / 4; i++) {
            CONV_3x3();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_3x3();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
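/* The tail handling above (and in the other mediaLib kernels below) uses
 * the classic MMX blend: there is no masked store, so the final partial
 * vector is merged into the destination as (mask & new) | (~mask & old).
 * A generic sketch of the idiom (the helper name is ours, not mediaLib's): */
#include <mmintrin.h>

static inline void masked_store_si64(__m64 *dst, __m64 mask, __m64 val)
{
    /* keep val where mask bits are set, keep the old *dst elsewhere */
    *dst = _mm_or_si64(_mm_and_si64(mask, val),
                       _mm_andnot_si64(mask, *dst));
}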
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  long datasize = image.xsize * image.ysize * image.csize;
  datasize = datasize / sizeof(__m64) + (datasize % sizeof(__m64) != 0);
  __m64 *leftPix  = (__m64*)image.data;
  __m64 *rightPix = (__m64*)right.data;

  __m64 l, r, b;
  __m64 mask = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF);
  __m64 zeros = _mm_set1_pi8((unsigned char)0x00);

  //format is U Y V Y
  if (m_direction) {
    while (datasize--) {
      l = leftPix [datasize];
      r = rightPix[datasize];
      b = _mm_subs_pu8(l, r);      // saturates to 0 where l <= r
      b = _mm_and_si64(b, mask);   // keep only the Y (luma) bytes
      b = _mm_cmpeq_pi32(b, zeros);
      r = _mm_and_si64(r, b);
      l = _mm_andnot_si64(b, l);
      leftPix[datasize] = _mm_or_si64(l, r);
    }
  } else {
    while (datasize--) {
      l = leftPix [datasize];
      r = rightPix[datasize];
      b = _mm_subs_pu8(r, l);
      b = _mm_and_si64(b, mask);
      b = _mm_cmpeq_pi32(b, zeros);
      r = _mm_and_si64(r, b);
      l = _mm_andnot_si64(b, l);
      leftPix[datasize] = _mm_or_si64(l, r);
    }
  }
  _mm_empty();
}
void pix_movement :: processGrayMMX(imageStruct &image)
{
  // assume that the pix_size does not change !
  bool doclear = (image.xsize*image.ysize != buffer.xsize*buffer.ysize);
  buffer.xsize = image.xsize;
  buffer.ysize = image.ysize;
  buffer.reallocate();
  if (doclear) {
    buffer.setWhite();
  }
  buffer2.xsize = image.xsize;
  buffer2.ysize = image.ysize;
  buffer2.reallocate();

  int pixsize = image.ysize * image.xsize / sizeof(__m64);
  unsigned char thresh = threshold;

  __m64 *rp = (__m64*)image.data;    // read pointer
  __m64 *wp = (__m64*)buffer.data;   // write pointer to the copy
  __m64 *wp2= (__m64*)buffer2.data;  // write pointer to the diff-image
  __m64 m1, m2, grey;
  __m64 thresh8 = _mm_set_pi8(thresh, thresh, thresh, thresh,
                              thresh, thresh, thresh, thresh);

  // there is still one problem with the threshold: is the cmpgt only for signed ?
  while (pixsize--) {
    grey = rp[pixsize]; // image.data
    m2   = wp[pixsize]; // buffer.data

    //m0 = _mm_cmpgt_pi8(grey, m2);   // (grey>m2)
    //m1 = _mm_subs_pu8 (grey, m2);   // (grey-m2)
    //m2 = _mm_subs_pu8 (m2, grey);   // (m2-grey)
    //m1 = _mm_and_si64 (m1, m0);     // (m2-grey)&(grey>m2) ((??))
    //m0 = _mm_andnot_si64(m0, m2);   // !(grey>m2)&(grey-m2) ((??))
    //m2 = _mm_or_si64  (m2, m0);     // [(a-b)&(a>b)]|[(b-a)&!(a>b)]=abs(a-b)

    // this is better: use saturated arithmetic!
    m1 = _mm_subs_pu8 (grey, m2);     // (grey-m2)
    m2 = _mm_subs_pu8 (m2, grey);     // (m2-grey)
    wp[pixsize] = grey;               // buffer.data
    m2 = _mm_or_si64  (m2, m1);       // |grey-m2|

    m2 = _mm_subs_pu8 (m2, thresh8);
    m2 = _mm_cmpgt_pi8(m2, _mm_setzero_si64());
    wp2[pixsize] = m2;                // output.data
  }
  _mm_empty();
  image.data = buffer2.data;
}
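/* Answering the question in the comment above: _mm_cmpgt_pi8 is indeed a
 * signed byte compare, so a saturated difference in 0x80..0xFF would test
 * as negative and be dropped. A sketch of an unsigned-safe alternative
 * (our suggestion, not part of the original): test "x != 0" instead of
 * "x > 0", which is equivalent here because the threshold subtraction
 * already saturates everything at-or-below threshold to zero. */
#include <mmintrin.h>

static inline __m64 nonzero_mask_pu8(__m64 x)
{
    __m64 zero = _mm_setzero_si64();
    __m64 eq   = _mm_cmpeq_pi8(x, zero);          /* 0xFF where x == 0 */
    __m64 ones = _mm_cmpeq_pi8(zero, zero);       /* all-ones constant */
    return _mm_andnot_si64(eq, ones);             /* 0xFF where x != 0 */
}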
void pix_background :: processRGBAMMX(imageStruct &image)
{
  long i, pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if (m_savedImage.xsize != image.xsize ||
      m_savedImage.ysize != image.ysize ||
      m_savedImage.format != image.format) m_reset = 1;

  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset) {
    memcpy(m_savedImage.data, image.data, pixsize);
  }
  m_reset = 0;

  i = pixsize / sizeof(__m64) + (pixsize % sizeof(__m64) != 0);
  __m64 *data  = (__m64*)image.data;
  __m64 *saved = (__m64*)m_savedImage.data;
  const __m64 thresh = _mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
                                   m_Yrange, m_Urange, m_Vrange, m_Arange);
  const __m64 offset = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  __m64 newpix, oldpix, m1;

  while (i--) {
    /* 7ops, 3memops */
    /* i have the feeling that this is not faster at all!
     * even if i have the 3memops + ONLY 1 _mm_subs_pu8()
     * i am equally slow as the generic code;
     * adding the other instruction does not change much */
    newpix = *data;
    oldpix = *saved++;
    m1     = newpix;
    m1     = _mm_subs_pu8 (m1, oldpix);
    oldpix = _mm_subs_pu8 (oldpix, newpix);
    m1     = _mm_or_si64  (m1, oldpix);              // |oldpix-newpix|
    m1     = _mm_adds_pu8 (m1, offset);
    m1     = _mm_subs_pu8 (m1, thresh);
    m1     = _mm_cmpeq_pi32(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh
    m1     = _mm_andnot_si64(m1, newpix);
    *data++ = m1;
  }
  _mm_empty();
}
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch,
                                         int luma_pitch, int width, int height,
                                         int weight, int invweight)
{
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  int wMod8 = (width/8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *reinterpret_cast<const __m64*>(src+x);  //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m64 px2 = *reinterpret_cast<const __m64*>(luma+x); //v1 y3 u1 y2 v0 y1 u0 y0

      __m64 src_lo = _mm_unpacklo_pi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m64 src_hi = _mm_unpackhi_pi16(px1, px2);

      src_lo = _mm_and_si64(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si64(src_hi, luma_mask);

      src_lo = _mm_madd_pi16(src_lo, mask);
      src_hi = _mm_madd_pi16(src_hi, mask);

      src_lo = _mm_add_pi32(src_lo, round_mask);
      src_hi = _mm_add_pi32(src_hi, round_mask);

      src_lo = _mm_srli_pi32(src_lo, 15);
      src_hi = _mm_srli_pi32(src_hi, 15);

      __m64 result_luma = _mm_packs_pi32(src_lo, src_hi);

      __m64 result_chroma = _mm_and_si64(px1, chroma_mask);
      __m64 result = _mm_or_si64(result_chroma, result_luma);

      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = wMod8; x < width; x += 2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src  += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
void pix_background :: processGrayMMX(imageStruct &image)
{
  int i;
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if (m_savedImage.xsize != image.xsize ||
      m_savedImage.ysize != image.ysize ||
      m_savedImage.format != image.format) m_reset = 1;

  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  if (m_reset) {
    memcpy(m_savedImage.data, image.data, pixsize);
  }
  m_reset = 0;
  if (m_Yrange == 0) return;

  __m64 *npixes = (__m64*)image.data;
  __m64 *opixes = (__m64*)m_savedImage.data;
  __m64 newpix, oldpix, m1;

  unsigned char thresh = m_Yrange - 1;
  __m64 thresh8 = _mm_set_pi8(thresh, thresh, thresh, thresh,
                              thresh, thresh, thresh, thresh);
  i = pixsize / sizeof(__m64) + (pixsize % sizeof(__m64) != 0);
  while (i--) {
    newpix = npixes[i];
    oldpix = opixes[i];
    m1     = _mm_subs_pu8 (newpix, oldpix);
    oldpix = _mm_subs_pu8 (oldpix, newpix);
    m1     = _mm_or_si64  (m1, oldpix);   // |oldpix-newpix|
    m1     = _mm_subs_pu8 (m1, thresh8);
    m1     = _mm_cmpgt_pi8(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh8
    npixes[i] = _mm_and_si64(m1, newpix);
  }
  _mm_empty();
}
int crypto_hash(unsigned char *out, const unsigned char *in,
                unsigned long long inlen)
{
  Skein_512_Ctxt_t ctx;

  memcpy(ctx.X, IV, sizeof(ctx.X));
  ctx.T[0] = _mm_set_pi32(0, 0);
  ctx.T[1] = _mm_set_pi32(((uint32) 112) << 24, 0);  /* FIRST flag | MSG type */

  if (inlen > 64) {
    size_t n = (inlen - 1) / 64;
    Skein_512_Process_Block(&ctx, in, n, 64);
    inlen -= n * 64;
    in += n * 64;
  }

  /* final (possibly partial) message block */
  memset(ctx.b, 0, sizeof(ctx.b));
  if (inlen)
    memcpy(ctx.b, in, inlen);
  ctx.T[1] = _mm_or_si64(ctx.T[1], _mm_set_pi32(((uint32) 128) << 24, 0)); /* OR in FINAL flag */
  Skein_512_Process_Block(&ctx, ctx.b, 1, inlen);

  /* output transform */
  memset(ctx.b, 0, sizeof(ctx.b));
  ctx.T[0] = _mm_set_pi32(0, 0);
  ctx.T[1] = _mm_set_pi32(((uint32) 255) << 24, 0);
  Skein_512_Process_Block(&ctx, ctx.b, 1, sizeof(uint64));

  ((__m64 *) out)[0] = ctx.X[0];
  ((__m64 *) out)[1] = ctx.X[1];
  ((__m64 *) out)[2] = ctx.X[2];
  ((__m64 *) out)[3] = ctx.X[3];
  return 0;
}
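/* How the magic tweak constants above decompose: Skein keeps its control
 * bits in the top byte of tweak word T[1], and this implementation holds
 * T[1] as an __m64 whose upper 32 bits are built with value << 24. A
 * sketch of the encoding per the Skein spec (the macro name is ours): */
#include <stdint.h>

#define SKEIN_T1_HI(first, final, type) \
    (((uint32_t)(first) << 30) | ((uint32_t)(final) << 31) | \
     ((uint32_t)(type) << 24))

/* SKEIN_T1_HI(1, 0, 48) == 112u << 24  -- first MSG block               */
/* 128 << 24 is the bare FINAL bit; it is OR-ed into the T[1] value that */
/* already carries the MSG type, rather than rebuilt from scratch.       */
/* SKEIN_T1_HI(1, 1, 63) == 255u << 24  -- the single OUT block          */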
__m64 unsigned_add3(const __m64 *a, const __m64 *b,
                    __m64 *result, unsigned int count)
{
  __m64 _a, _b, one, sum, carry, onesCarry;
  unsigned int i;

  carry = _mm_setzero_si64();

  /* build the constant 1: all-ones, then 0 - (-1) = 1 */
  one = _mm_cmpeq_pi8(carry, carry);
  one = _mm_sub_si64(carry, one);

  for (i = 0; i < count; i++) {
    _a = a[i];
    _b = b[i];

    sum = _mm_add_si64(_a, _b);
    sum = _mm_add_si64(sum, carry);
    result[i] = sum;

    /* carry out of bit 0: (a&b) | ((a^b)&carry_in), keep only bit 0 */
    onesCarry = _mm_and_si64(_mm_xor_si64(_a, _b), carry);
    onesCarry = _mm_or_si64(_mm_and_si64(_a, _b), onesCarry);
    onesCarry = _mm_and_si64(onesCarry, one);

    /* carry out of the full 64-bit add: ((a>>1)+(b>>1)+c0) >> 63 */
    _a = _mm_srli_si64(_a, 1);
    _b = _mm_srli_si64(_b, 1);
    carry = _mm_add_si64(_mm_add_si64(_a, _b), onesCarry);
    carry = _mm_srli_si64(carry, 63);
  }

  return carry;
}
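/* A small usage sketch (ours, not from the original source): add two
 * 128-bit numbers held as two 64-bit limbs each, little-endian limb
 * order, and capture the carry out of the top limb. */
#include <stdio.h>
#include <mmintrin.h>

int main(void)
{
    __m64 a[2], b[2], r[2];
    a[0] = _mm_set_pi32(-1, -1);      /* low limb = 2^64 - 1 */
    a[1] = _mm_setzero_si64();
    b[0] = _mm_set_pi32(0, 1);        /* add 1: ripples into the high limb */
    b[1] = _mm_setzero_si64();

    __m64 carry = unsigned_add3(a, b, r, 2);
    printf("carry out of top limb: %d\n", _mm_cvtsi64_si32(carry));
    _mm_empty();
    return 0;
}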
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB16.
VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride,
                     PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV,
                     INT iSrcYStride, INT iSrcUvStride,
                     UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    INT iDstXDif;
    INT iSrcYDif;
    INT iSrcUvDif;
    INT yy, bu, guv, rv, r, g, b;

    M64 y0, y1, u0, v0, mz;
    M64 r0, g0, b0, r1, g1, b1;
    M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

    if (iHeight < 0)
    {
        iHeight = -iHeight;
        pbDstX += (iHeight - 1) * iDstXStride;
        iDstXStride = -iDstXStride;
    }

    iDstXDif = iDstXStride - (uWidth * 2);
    iSrcYDif = iSrcYStride - uWidth;
    iSrcUvDif = iSrcUvStride - (uWidth / 2);

    mz = _mm_setzero_si64();
    for (y = iHeight / 2; y; y--)
    {
        for (x = uWidth / 8; x; x--)
        {
            // Calculate coefficient.
            u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));  // [    | u3 u2 u1 u0]
            v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));  // [    | v3 v2 v1 v0]

            u0 = _mm_unpacklo_pi8(u0, mz);              // u3 u2 u1 u0
            v0 = _mm_unpacklo_pi8(v0, mz);              // v3 v2 v1 v0
            u0 = _mm_subs_pi16(u0, g_mSub80);
            v0 = _mm_subs_pi16(v0, g_mSub80);

            gu0 = _mm_mullo_pi16(u0, g_mUGMul);
            gv0 = _mm_mullo_pi16(v0, g_mVGMul);
            bu0 = _mm_mullo_pi16(u0, g_mUBMul);
            rv0 = _mm_mullo_pi16(v0, g_mVRMul);

            guv0 = _mm_adds_pi16(gu0, gv0);

            guv1 = _mm_unpackhi_pi16(guv0, guv0);       // guv3 guv3 guv2 guv2
            guv0 = _mm_unpacklo_pi16(guv0, guv0);       // guv1 guv1 guv0 guv0

            bu1 = _mm_unpackhi_pi16(bu0, bu0);          // bu3 bu3 bu2 bu2
            bu0 = _mm_unpacklo_pi16(bu0, bu0);          // bu1 bu1 bu0 bu0
            rv1 = _mm_unpackhi_pi16(rv0, rv0);          // rv3 rv3 rv2 rv2
            rv0 = _mm_unpacklo_pi16(rv0, rv0);          // rv1 rv1 rv0 rv0

            // Process for row 0.
            y0 = *((PM64) pbSrcY);                      // [YYYY | YYYY]; row 0
            y1 = _mm_unpackhi_pi8(y0, mz);              // y7 y6 y5 y4
            y0 = _mm_unpacklo_pi8(y0, mz);              // y3 y2 y1 y0
            y1 = _mm_subs_pi16(y1, g_mSub10);
            y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);
            y0 = _mm_mullo_pi16(y0, g_mYYMul);

            b1 = _mm_adds_pi16(y1, bu1);
            b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);          // 8 bits (0 - 7)
            b0 = _mm_srai_pi16(b0, SCALEBITS);
            b1 = _mm_packs_pu16(b1, mz);                // 0 0 0 0 b7 b6 b5 b4
            b0 = _mm_packs_pu16(b0, mz);                // 0 0 0 0 b3 b2 b1 b0
            b1 = _mm_unpacklo_pi8(b1, mz);              // 0 b7 0 b6 0 b5 0 b4
            b0 = _mm_unpacklo_pi8(b0, mz);
            b1 = _mm_srli_pi16(b1, 3);
            b0 = _mm_srli_pi16(b0, 3);                  // 5 bits (0 - 4)

            g1 = _mm_subs_pi16(y1, guv1);               // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);               // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);          // 8 bits (0 - 7)
            g0 = _mm_srai_pi16(g0, SCALEBITS);
            g1 = _mm_packs_pu16(g1, mz);                // 0 0 0 0 g7 g6 g5 g4
            g0 = _mm_packs_pu16(g0, mz);                // 0 0 0 0 g3 g2 g1 g0
            g1 = _mm_unpacklo_pi8(g1, mz);              // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);
            g1 = _mm_srli_pi16(g1, 2);                  // 6 bits (0 - 5)
            g0 = _mm_srli_pi16(g0, 2);
            g1 = _mm_slli_pi16(g1, 5);                  // 6 bits (5 - 10)
            g0 = _mm_slli_pi16(g0, 5);                  // 6 bits (5 - 10)

            r1 = _mm_adds_pi16(y1, rv1);
            r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);
            r0 = _mm_srai_pi16(r0, SCALEBITS);
            r1 = _mm_packs_pu16(r1, mz);                // 0 0 0 0 r7 r6 r5 r4
            r0 = _mm_packs_pu16(r0, mz);                // 0 0 0 0 r3 r2 r1 r0
            r1 = _mm_unpacklo_pi8(r1, mz);              // 0 r7 0 r6 0 r5 0 r4
            r0 = _mm_unpacklo_pi8(r0, mz);
            r1 = _mm_srli_pi16(r1, 3);                  // 5 bits (0 - 4)
            r0 = _mm_srli_pi16(r0, 3);
            r1 = _mm_slli_pi16(r1, 11);                 // 5 bits (11 - 15)
            r0 = _mm_slli_pi16(r0, 11);                 // 5 bits (11 - 15)

            // Combine RGB.
            b0 = _mm_or_si64(g0, b0);
            b0 = _mm_or_si64(r0, b0);                   // 16 bits rgb
            b1 = _mm_or_si64(g1, b1);
            b1 = _mm_or_si64(r1, b1);                   // 16 bits rgb

            // Write out row 0.
            ((PM64) pbDstX)[0] = b0;
            ((PM64) pbDstX)[1] = b1;

            // Process for row 1.
            y0 = *((PM64) (pbSrcY + iSrcYStride));      // [YYYY | YYYY]; row 1
            y1 = _mm_unpackhi_pi8(y0, mz);              // y7 y6 y5 y4
            y0 = _mm_unpacklo_pi8(y0, mz);              // y3 y2 y1 y0
            y1 = _mm_subs_pi16(y1, g_mSub10);
            y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);
            y0 = _mm_mullo_pi16(y0, g_mYYMul);

            b1 = _mm_adds_pi16(y1, bu1);
            b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);          // 8 bits (0 - 7)
            b0 = _mm_srai_pi16(b0, SCALEBITS);
            b1 = _mm_packs_pu16(b1, mz);                // 0 0 0 0 b7 b6 b5 b4
            b0 = _mm_packs_pu16(b0, mz);                // 0 0 0 0 b3 b2 b1 b0
            b1 = _mm_unpacklo_pi8(b1, mz);              // 0 b7 0 b6 0 b5 0 b4
            b0 = _mm_unpacklo_pi8(b0, mz);
            b1 = _mm_srli_pi16(b1, 3);
            b0 = _mm_srli_pi16(b0, 3);                  // 5 bits (0 - 4)

            g1 = _mm_subs_pi16(y1, guv1);               // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);               // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);          // 8 bits (0 - 7)
            g0 = _mm_srai_pi16(g0, SCALEBITS);
            g1 = _mm_packs_pu16(g1, mz);                // 0 0 0 0 g7 g6 g5 g4
            g0 = _mm_packs_pu16(g0, mz);                // 0 0 0 0 g3 g2 g1 g0
            g1 = _mm_unpacklo_pi8(g1, mz);              // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);
            g1 = _mm_srli_pi16(g1, 2);                  // 6 bits (0 - 5)
            g0 = _mm_srli_pi16(g0, 2);
            g1 = _mm_slli_pi16(g1, 5);                  // 6 bits (5 - 10)
            g0 = _mm_slli_pi16(g0, 5);                  // 6 bits (5 - 10)

            r1 = _mm_adds_pi16(y1, rv1);
            r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);
            r0 = _mm_srai_pi16(r0, SCALEBITS);
            r1 = _mm_packs_pu16(r1, mz);                // 0 0 0 0 r7 r6 r5 r4
            r0 = _mm_packs_pu16(r0, mz);                // 0 0 0 0 r3 r2 r1 r0
            r1 = _mm_unpacklo_pi8(r1, mz);              // 0 r7 0 r6 0 r5 0 r4
            r0 = _mm_unpacklo_pi8(r0, mz);
            r1 = _mm_srli_pi16(r1, 3);                  // 5 bits (0 - 4)
            r0 = _mm_srli_pi16(r0, 3);
            r1 = _mm_slli_pi16(r1, 11);                 // 5 bits (11 - 15)
            r0 = _mm_slli_pi16(r0, 11);                 // 5 bits (11 - 15)

            // Combine RGB.
            b0 = _mm_or_si64(g0, b0);
            b0 = _mm_or_si64(r0, b0);                   // 16 bits rgb
            b1 = _mm_or_si64(g1, b1);
            b1 = _mm_or_si64(r1, b1);                   // 16 bits rgb

            // Write out row 1.
            ((PM64) (pbDstX + iDstXStride))[0] = b0;
            ((PM64) (pbDstX + iDstXStride))[1] = b1;

            pbDstX += 16;
            pbSrcY += 8;
            pbSrcU += 4;
            pbSrcV += 4;
        }

        // Scalar tail: the remaining (uWidth & 7) pixels, two at a time.
        for (x = (uWidth & 7) / 2; x; x--)
        {
            bu = g_iBUTab[pbSrcU[0]];
            guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
            rv = g_iRVTab[pbSrcV[0]];

            yy = g_iYYTab[pbSrcY[0]];
            b = _Clip(((yy + bu) >> SCALEBITS_OUT));
            g = _Clip(((yy - guv) >> SCALEBITS_OUT));
            r = _Clip(((yy + rv) >> SCALEBITS_OUT));
            ((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b);

            yy = g_iYYTab[pbSrcY[1]];
            b = _Clip(((yy + bu) >> SCALEBITS_OUT));
            g = _Clip(((yy - guv) >> SCALEBITS_OUT));
            r = _Clip(((yy + rv) >> SCALEBITS_OUT));
            ((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b);

            yy = g_iYYTab[pbSrcY[iSrcYStride]];
            b = _Clip(((yy + bu) >> SCALEBITS_OUT));
            g = _Clip(((yy - guv) >> SCALEBITS_OUT));
            r = _Clip(((yy + rv) >> SCALEBITS_OUT));
            ((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b);

            yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
            b = _Clip(((yy + bu) >> SCALEBITS_OUT));
            g = _Clip(((yy - guv) >> SCALEBITS_OUT));
            r = _Clip(((yy + rv) >> SCALEBITS_OUT));
            ((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b);

            pbDstX += 4;
            pbSrcY += 2;
            pbSrcU++;
            pbSrcV++;
        }

        pbDstX += iDstXDif + iDstXStride;
        pbSrcY += iSrcYDif + iSrcYStride;
        pbSrcU += iSrcUvDif;
        pbSrcV += iSrcUvDif;
    }
    _mm_empty();
}
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 ker[5][5];
    __m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
    __m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h, tmph;
    __m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l, tmpl;
    __m64 *sp, *dp;
    mlib_s32 shift, ind, ker_sum = 0;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * 2 * wid4;
    }

    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (__m64 *) sl;
        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < wid4; i++) {
            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        sp = (__m64 *) sl;
        dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < width / 4; i++) {
            CONV_5x5(hi, i);
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5(hi, i);
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        ind = (pbuff_arr == buff_arr) ? 10 : -10;
        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind = 0;    /* initialized: original left ind indeterminate */
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];
            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
__m64 test55(__m64 a, __m64 b) {
  // CHECK: por
  return _mm_or_si64(a, b);
}
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 ker[5][5];
    __m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
    __m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= (KSIZE - 1);
    height -= (KSIZE - 1);
    width *= NCHAN;
    dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (mlib_s32 *)sl;
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d3, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d4, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];
            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);
        s0 = (*sp++);
        UNPACK_SRC(d3, lo);
        UNPACK_SRC(d4, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask;

            mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        ind = (pbuff_arr == buff_arr) ? 10 : -10;
        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
__m64 test_mm_or_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_or_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.por
  return _mm_or_si64(a, b);
}
mlib_status
mlib_ImageMinFilter7x7_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* MAX_FILTER */
{
    mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
    __m64 *dp0, *dp1;
    __m64 aa, bb, cc, dd, ee, ff, r0, r1;
    __m64 g0, g1, g2, g3, g4, g5, g6, gg;
    __m64 h0, h1, h2, h3, h4, h5, h6, hh;
    __m64 e_mask;
    mlib_s32 i, j, wid8, tail;

    wid = (wid - KSIZE1) * SSIZE;
    wid8 = (wid + 7) & ~7;
    pbuff = mlib_malloc(KSIZE1 * wid8);
    buff0 = pbuff;
    buff1 = buff0 + wid8;
    buff2 = buff1 + wid8;
    buff3 = buff2 + wid8;
    buff4 = buff3 + wid8;
    buff5 = buff4 + wid8;

    sl = (mlib_u8 *)src;
    dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

    tail = wid & 7;
    e_mask = ((__m64 *) mlib_mask64_arr)[tail];

    for (j = 0; j < 3; j++) {
        sp0 = buff4;
        sp1 = buff5;
        sp6 = sl;
        sp7 = sl + slb;
        sl += 2 * slb;

        for (i = 0; i < wid; i += 8) {
            g0 = *(__m64 *) sp6;               g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);
            h0 = *(__m64 *) sp7;               h1 = *(__m64 *) (sp7 + SSIZE);
            h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE);
            h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE);
            h6 = *(__m64 *) (sp7 + 6 * SSIZE);

            gg = C_COMP(g0, g1); hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2); hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4); hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6); hh = C_COMP(hh, h6);

            *(__m64 *) sp0 = gg;
            *(__m64 *) sp1 = hh;

            sp0 += 8; sp1 += 8; sp6 += 8; sp7 += 8;
        }

        if (j < 2) {
            buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT;
            buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT;
        }
    }

    for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0; sp1 = buff1; sp2 = buff2;
        sp3 = buff3; sp4 = buff4; sp5 = buff5;
        sp6 = sl;
        sp7 = sl + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: cc
         * line3: dd
         * line4: ee
         * line5: ff
         * line4: g0 g1 g2 g3 g4 g5 g6
         * line5: h0 h1 h2 h3 h4 h5 h6
         */

        for (i = 0; i <= wid - 8; i += 8) {
            g0 = *(__m64 *) sp6;               g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);
            h0 = *(__m64 *) sp7;               h1 = *(__m64 *) (sp7 + SSIZE);
            h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE);
            h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE);
            h6 = *(__m64 *) (sp7 + 6 * SSIZE);

            gg = C_COMP(g0, g1); hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2); hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4); hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6); hh = C_COMP(hh, h6);

            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, hh);

            *(__m64 *) sp0 = gg;
            *(__m64 *) sp1 = hh;

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
            sp4 += 8; sp5 += 8; sp6 += 8; sp7 += 8;
        }

        if (tail) {
            g0 = *(__m64 *) sp6;               g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);
            h0 = *(__m64 *) sp7;               h1 = *(__m64 *) (sp7 + SSIZE);
            h2 = *(__m64 *) (sp7 + 2 * SSIZE); h3 = *(__m64 *) (sp7 + 3 * SSIZE);
            h4 = *(__m64 *) (sp7 + 4 * SSIZE); h5 = *(__m64 *) (sp7 + 5 * SSIZE);
            h6 = *(__m64 *) (sp7 + 6 * SSIZE);

            gg = C_COMP(g0, g1); hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3); h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5); h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2); hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4); hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6); hh = C_COMP(hh, h6);

            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, hh);

            *(__m64 *) sp0 = gg;
            *(__m64 *) sp1 = hh;

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1),
                _mm_andnot_si64(e_mask, *dp1));
        }

        buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT;
        buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT;

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - KSIZE1 - 1)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0; sp1 = buff1; sp2 = buff2;
        sp3 = buff3; sp4 = buff4; sp5 = buff5;
        sp6 = sl;

        for (i = 0; i <= wid - 8; i += 8) {
            g0 = *(__m64 *) sp6;               g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            g2 = C_COMP(g2, g3);
            g4 = C_COMP(g4, g5);
            gg = C_COMP(gg, g2);
            gg = C_COMP(gg, g4);
            gg = C_COMP(gg, g6);

            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);

            (*dp0++) = r0;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
            sp4 += 8; sp5 += 8; sp6 += 8;
        }

        if (tail) {
            g0 = *(__m64 *) sp6;               g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE); g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE); g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            g2 = C_COMP(g2, g3);
            g4 = C_COMP(g4, g5);
            gg = C_COMP(gg, g2);
            gg = C_COMP(gg, g4);
            gg = C_COMP(gg, g6);

            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1; cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3; ee = *(__m64 *) sp4; ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
        }
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageErode4_U16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* DILATE_FILTER */
{
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
    __m64 *dp0, *dp1;
    __m64 a1, b0, b1, b2, c0, c1, c2, d1, vv, h0, h1, r0, r1;
    __m64 mask, mask80;
    mlib_s32 i, j, tail;

    sl = (mlib_u8 *)src;
    /* dst ptrs skip top j and left col */
    dl = (mlib_u8 *)dst + dlb + SSIZE;

    wid = (wid - 2) * SSIZE;
    tail = wid & 7;
    mask = ((__m64 *) mlib_mask64_arr)[tail];
    mask80 = mmx_from_int_dup(0x80008000);

    for (j = 0; j <= (hgt - 2 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = sl;
        sp1 = sp0 + slb;
        sp2 = sp1 + slb;
        sp3 = sp2 + slb;

        /*
         * line0:     a1
         * line1:  b0 b1 b2
         * line2:  c0 c1 c2
         * line3:     d1
         */

        for (i = 0; i <= wid - 8; i += 8) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c0 = *(__m64 *) (sp2);
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d1 = *(__m64 *) (sp3 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            h1 = C_COMP(c0, c2);

            r0 = C_COMP(vv, a1);
            r1 = C_COMP(vv, d1);
            r0 = C_COMP(r0, h0);
            r1 = C_COMP(r1, h1);

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
        }

        if (tail) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c0 = *(__m64 *) (sp2);
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d1 = *(__m64 *) (sp3 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            h1 = C_COMP(c0, c2);

            r0 = C_COMP(vv, a1);
            r1 = C_COMP(vv, d1);
            r0 = C_COMP(r0, h0);
            r1 = C_COMP(r1, h1);

            *dp0 = _mm_or_si64(_mm_and_si64(mask, r0),
                _mm_andnot_si64(mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(mask, r1),
                _mm_andnot_si64(mask, *dp1));
        }

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - 3)) {
        dp0 = (void *)dl;
        sp0 = sl;
        sp1 = sp0 + slb;
        sp2 = sp1 + slb;

        for (i = 0; i <= wid - 8; i += 8) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c1 = *(__m64 *) (sp2 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            r0 = C_COMP(vv, a1);
            r0 = C_COMP(r0, h0);

            (*dp0++) = r0;

            sp0 += 8; sp1 += 8; sp2 += 8;
        }

        if (tail) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c1 = *(__m64 *) (sp2 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            r0 = C_COMP(vv, a1);
            r0 = C_COMP(r0, h0);

            *dp0 = _mm_or_si64(_mm_and_si64(mask, r0),
                _mm_andnot_si64(mask, *dp0));
        }
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to UYVY.
VOID Yuy2ToUyvy_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride,
                    UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    PM64 p, q;
    M64 m0, m1, m2, m3, m4, m5, m6, m7;

    if (iHeight < 0)
    {
        iHeight = -iHeight;
        pbSrcX += (iHeight - 1) * iSrcXStride;
        iSrcXStride = -iSrcXStride;
    }

    uWidth *= 2;
    for (y = iHeight; y; y--)
    {
        p = (PM64) pbSrcX;
        q = (PM64) pbDstX;
        for (x = uWidth / 32; x; x--)
        {
            m0 = p[0]; m1 = p[1]; m2 = p[2]; m3 = p[3];
            m4 = m0;   m5 = m1;   m6 = m2;   m7 = m3;

            m0 = _mm_slli_pi16(m0, 8);
            m1 = _mm_slli_pi16(m1, 8);
            m2 = _mm_slli_pi16(m2, 8);
            m3 = _mm_slli_pi16(m3, 8);

            m4 = _mm_srli_pi16(m4, 8);
            m5 = _mm_srli_pi16(m5, 8);
            m6 = _mm_srli_pi16(m6, 8);
            m7 = _mm_srli_pi16(m7, 8);

            m0 = _mm_or_si64(m0, m4);
            m1 = _mm_or_si64(m1, m5);
            m2 = _mm_or_si64(m2, m6);
            m3 = _mm_or_si64(m3, m7);

            q[0] = m0; q[1] = m1; q[2] = m2; q[3] = m3;
            p += 4;
            q += 4;
        }

        for (x = uWidth & 31; x; x--)
        {
            ((PBYTE) q)[0] = ((PBYTE) p)[1];
            ((PBYTE) q)[1] = ((PBYTE) p)[0];
            ((PBYTE) q)[2] = ((PBYTE) p)[3];
            ((PBYTE) q)[3] = ((PBYTE) p)[2];
            p = (PM64) ((PBYTE) p + 4);  // byte-granular advance; a cast is
            q = (PM64) ((PBYTE) q + 4);  // not a valid assignment target
        }

        pbSrcX += iSrcXStride;
        pbDstX += iDstXStride;
    }
    _mm_empty();
}
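/* The core of the conversion above is a 16-bit byte swap done with two
 * shifts and a POR, applied to four registers at a time. A standalone
 * sketch of the idiom (ours, not from the original source): */
#include <mmintrin.h>

static inline __m64 bswap16_pi16(__m64 v)
{
    return _mm_or_si64(_mm_slli_pi16(v, 8),   /* move low bytes up   */
                       _mm_srli_pi16(v, 8));  /* move high bytes down */
}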
void lines_scale2(const unsigned char *src, unsigned y,
                  unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
    const unsigned char
        *u = src + ((y-1) & 7)*sc2lines_width,
        *m = src + ((y+0) & 7)*sc2lines_width,
        *l = src + ((y+1) & 7)*sc2lines_width;

    for (unsigned i = 0; i < nPix; i += 4) {
        if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {
            __m64 mm = *(__m64*)(m+i-2);
            __m64 uu = *(__m64*)(u+i-2);
            __m64 ll = *(__m64*)(l+i-2);
            __m64 md = _mm_slli_si64(mm,8);
            __m64 mf = _mm_srli_si64(mm,8);
            __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf),
                                        _mm_cmpeq_pi8(uu,ll));
            __m64 e0, e1, v1, v2;

            e0 = _mm_cmpeq_pi8(md,uu);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0,16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

            e1 = _mm_cmpeq_pi8(mf,uu);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1,16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m+i));
            v2 = _m_from_int(*(unsigned*)(u+i));
            v1 = _mm_unpacklo_pi8(v1,v1);
            v2 = _mm_unpacklo_pi8(v2,v2);
            *(__m64*)(dst1 + 2*i) =
                _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

            e0 = _mm_cmpeq_pi8(md,ll);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0,16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

            e1 = _mm_cmpeq_pi8(mf,ll);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1,16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m+i));
            v2 = _m_from_int(*(unsigned*)(l+i));
            v1 = _mm_unpacklo_pi8(v1,v1);
            v2 = _mm_unpacklo_pi8(v2,v2);
            *(__m64*)(dst2 + 2*i) =
                _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );
        }
        else {
            __m64 v1 = _m_from_int(*(unsigned*)(m+i));
            v1 = _mm_unpacklo_pi8(v1,v1);
            *(__m64*)(dst1 + 2*i) = v1;
            *(__m64*)(dst2 + 2*i) = v1;
        }
    }
}
mlib_status
mlib_ImageMinFilter5x5_U8(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* MAX_FILTER */
{
    mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl;
    __m64 *dp0, *dp1;
    __m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff, r0, r1;
    __m64 e_mask, mask8080;
    mlib_s32 i, j, wid8, tail;

    wid = (wid - KSIZE1) * SSIZE;
    wid8 = (wid + 7) & ~7;
    pbuff = mlib_malloc(4 * wid8);
    buff0 = pbuff;
    buff1 = buff0 + wid8;
    buff2 = buff1 + wid8;
    buff3 = buff2 + wid8;

    sl = (mlib_u8 *)src;
    dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE);

    tail = wid & 7;
    e_mask = ((__m64 *) mlib_mask64_arr)[tail];
    mask8080 = mmx_from_int_dup(0x80808080);

    for (j = 0; j < 2; j++) {
        sp0 = buff0;
        sp1 = buff1;
        sp4 = sl;
        sp5 = sl + slb;
        sl += 2 * slb;

        for (i = 0; i < wid; i += 8) {
            e0 = *(__m64 *) sp4;               e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);
            f0 = *(__m64 *) sp5;               f1 = *(__m64 *) (sp5 + SSIZE);
            f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE);
            f4 = *(__m64 *) (sp5 + 4 * SSIZE);

            ee = C_COMP(e0, e1); ff = C_COMP(f0, f1);
            e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3);
            ee = C_COMP(ee, e4); ff = C_COMP(ff, f4);
            ee = C_COMP(ee, e2); ff = C_COMP(ff, f2);

            *(__m64 *) sp0 = ee;
            *(__m64 *) sp1 = ff;

            sp0 += 8; sp1 += 8; sp4 += 8; sp5 += 8;
        }

        buffT = buff0; buff0 = buff2; buff2 = buffT;
        buffT = buff1; buff1 = buff3; buff3 = buffT;
    }

    for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = sl;
        sp5 = sl + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: cc
         * line3: dd
         * line4: e0 e1 e2 e3 e4
         * line5: f0 f1 f2 f3 f4
         */

        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2; dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;               e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);
            f0 = *(__m64 *) sp5;               f1 = *(__m64 *) (sp5 + SSIZE);
            f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE);
            f4 = *(__m64 *) (sp5 + 4 * SSIZE);

            ee = C_COMP(e0, e1); ff = C_COMP(f0, f1);
            e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3);
            ee = C_COMP(ee, e4); ff = C_COMP(ff, f4);
            ee = C_COMP(ee, e2); ff = C_COMP(ff, f2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, ff);

            *(__m64 *) sp0 = ee;
            *(__m64 *) sp1 = ff;

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8; sp1 += 8; sp2 += 8;
            sp3 += 8; sp4 += 8; sp5 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2; dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;               e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);
            f0 = *(__m64 *) sp5;               f1 = *(__m64 *) (sp5 + SSIZE);
            f2 = *(__m64 *) (sp5 + 2 * SSIZE); f3 = *(__m64 *) (sp5 + 3 * SSIZE);
            f4 = *(__m64 *) (sp5 + 4 * SSIZE);

            ee = C_COMP(e0, e1); ff = C_COMP(f0, f1);
            e2 = C_COMP(e2, e3); f2 = C_COMP(f2, f3);
            ee = C_COMP(ee, e4); ff = C_COMP(ff, f4);
            ee = C_COMP(ee, e2); ff = C_COMP(ff, f2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, ff);

            *(__m64 *) sp0 = ee;
            *(__m64 *) sp1 = ff;

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1),
                _mm_andnot_si64(e_mask, *dp1));
        }

        buffT = buff0; buff0 = buff2; buff2 = buffT;
        buffT = buff1; buff1 = buff3; buff3 = buffT;

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - KSIZE1 - 1)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = sl;

        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2; dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;               e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            e2 = C_COMP(e2, e3);
            ee = C_COMP(ee, e4);
            ee = C_COMP(ee, e2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);

            (*dp0++) = r0;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8; sp4 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0; bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2; dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;               e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE); e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            e2 = C_COMP(e2, e3);
            ee = C_COMP(ee, e4);
            ee = C_COMP(ee, e2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
        }
    }

    _mm_empty();
    mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
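/* C_COMP is the per-element comparison these filter kernels are compiled
 * with (minimum or maximum, selected by MAX_FILTER). For the unsigned-byte
 * MIN case it can be built from saturating arithmetic alone; a sketch of
 * one plausible definition (the real mediaLib macro may differ):
 *
 *   subs_pu8(a,b) = max(a-b, 0), hence a - max(a-b, 0) = min(a, b).
 */
#include <mmintrin.h>

#define C_COMP_MIN_U8(a, b) _mm_sub_pi8((a), _mm_subs_pu8((a), (b)))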
void pix_multiply :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64 *leftPix  = (__m64*)image.data;
  __m64 *rightPix = (__m64*)right.data;
  datasize = datasize / sizeof(__m64) + (datasize % sizeof(__m64) != 0);

  __m64 l0, r0, l1, r1;
  __m64 mask = _mm_setr_pi8((unsigned char)0xFF, (unsigned char)0x00,
                            (unsigned char)0xFF, (unsigned char)0x00,
                            (unsigned char)0xFF, (unsigned char)0x00,
                            (unsigned char)0xFF, (unsigned char)0x00);
  __m64 yuvclamp0 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10);
  __m64 yuvclamp1 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24);
  __m64 yuvclamp2 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14);
  __m64 null64 = _mm_setzero_si64();

  while (datasize--) {
    r1 = rightPix[datasize];
    l1 = leftPix [datasize];
    r1 = _mm_or_si64(r1, mask);   // force right chroma to 0xFF, so the left
                                  // chroma passes through (x*255>>8 ~= x)
    l0 = _mm_unpacklo_pi8(l1, null64);
    r0 = _mm_unpacklo_pi8(r1, null64);
    l1 = _mm_unpackhi_pi8(l1, null64);
    r1 = _mm_unpackhi_pi8(r1, null64);

    l0 = _mm_mullo_pi16(l0, r0);
    l1 = _mm_mullo_pi16(l1, r1);
    l0 = _mm_srli_pi16(l0, 8);
    l1 = _mm_srli_pi16(l1, 8);
    l0 = _mm_packs_pu16(l0, l1);

    l0 = _mm_subs_pu8(l0, yuvclamp0);
    l0 = _mm_adds_pu8(l0, yuvclamp1);
    l0 = _mm_subs_pu8(l0, yuvclamp2);

    leftPix[datasize] = l0;
  }
  _mm_empty();
}
/* *********************************************************** */

mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
    __m64 *buff0, *buff1, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, vker0, vker1, vker2;
    __m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 2;
    height -= 2;
    dl += dll + 1;

    wid4 = (width + 7) / 4;

    if (wid4 > BUFF_LINE) {
        pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);
    }

    GET_KERN();

    buff0 = pbuff;
    buff1 = buff0 + wid4;

    for (j = 0; j < 2; j++) {
        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];
            PREP_3x3_1ch(lo, i);
        }

        sl += sll;

        buffT = buff1;
        buff1 = buff0;
        buff0 = buffT;
    }

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);

        for (i = 0; i < width / 8; i++) {
            CONV_3x3_1ch(hi, 2 * i);
            s0 = sp[i];
            CONV_3x3_1ch(lo, 2 * i + 1);
            dp[i] = _mm_packs_pu16(res_hi, res_lo);
        }

        if (width & 7) {
            __m64 mask;

            mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            CONV_3x3_1ch(hi, 2 * i);
            s0 = sp[i];
            CONV_3x3_1ch(lo, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_hi, res_lo);

            dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff1;
        buff1 = buff0;
        buff0 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();

    if (pbuff != buff_loc)
        mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
void pix_background :: processYUVMMX(imageStruct &image)
{
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if (m_savedImage.xsize != image.xsize ||
      m_savedImage.ysize != image.ysize ||
      m_savedImage.format != image.format) m_reset = 1;

  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset) {
    memcpy(m_savedImage.data, image.data, pixsize);
    // return;
  }
  m_reset = 0;

  int i = pixsize / sizeof(__m64) + (pixsize % sizeof(__m64) != 0);
  __m64 *data  = (__m64*)image.data;
  __m64 *saved = (__m64*)m_savedImage.data;
  const __m64 thresh = _mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
                                   m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 offset = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 black  = _mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
                                   (unsigned char)0x00, (unsigned char)0x80,
                                   (unsigned char)0x00, (unsigned char)0x80,
                                   (unsigned char)0x00, (unsigned char)0x80);
  __m64 newpix, oldpix, m1;

  while (i--) {
    newpix = *data;
    oldpix = *saved++;
    m1     = newpix;
    m1     = _mm_subs_pu8 (m1, oldpix);
    oldpix = _mm_subs_pu8 (oldpix, newpix);
    m1     = _mm_or_si64  (m1, oldpix);  // |oldpix-newpix|
    m1     = _mm_adds_pu8 (m1, offset);  // to make thresh=0 work correctly
    m1     = _mm_subs_pu8 (m1, thresh);  // m1>thresh -> saturation -> 0
    m1     = _mm_cmpeq_pi32(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh
    oldpix = black;
    oldpix = _mm_and_si64   (oldpix, m1);
    m1     = _mm_andnot_si64(m1, newpix);
    m1     = _mm_or_si64    (m1, oldpix);
    *data++ = m1;
  }
  _mm_empty();
}
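/* The last four vector instructions above form the branchless select:
 * result = (mask & black) | (~mask & newpix). A generic sketch of the
 * pattern (the helper name is ours); on later hardware SSE4.1 replaces
 * this whole sequence with a single blend instruction: */
#include <mmintrin.h>

static inline __m64 select_si64(__m64 mask, __m64 a, __m64 b)
{
    /* per bit: mask ? a : b */
    return _mm_or_si64(_mm_and_si64(mask, a), _mm_andnot_si64(mask, b));
}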
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
    __m64 *buff0, *buff1, *buff2, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
    __m64 d0, d1, d2, rr, tmpa, tmpb;
    __m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
    __m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
    __m64 *sp, *dp;
    mlib_s32 shift;
    mlib_s32 row, wid4, i, j;

    width -= 2;
    height -= 2;
    width *= NCHAN;
    dl += dll + NCHAN;

    wid4 = (width + 3) / 4;

    if (wid4 > BUFF_LINE) {
        pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);
    }

    GET_KERN();

    buff0 = pbuff;
    buff1 = buff0 + 2 * wid4;
    buff2 = buff1 + 2 * wid4;

    for (j = 0; j < 2; j++) {
        sp = (__m64 *) sl;

        d1 = (*sp++);
        d2 = (*sp++);

        for (i = 0; i < wid4; i++) {
            PREP_3x3(i);
        }

        sl += sll;

        if (j == 0) {
            buffT = buff1;
            buff1 = buff0;
            buff0 = buffT;
        }
    }

    for (row = 0; row < height; row++) {
        sp = (__m64 *) sl;
        dp = (__m64 *) dl;

        d1 = (*sp++);
        d2 = (*sp++);

        for (i = 0; i < width / 4; i++) {
            CONV_3x3(i);
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_3x3(i);
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff1;
        buff1 = buff0;
        buff0 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();

    if (pbuff != buff_loc)
        mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv5x5_u16nw_3(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, rr, rh, rl;
    __m64 aa, bb, cc, zero, ker_off, mask8000;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
    mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
    mlib_s32 i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        dp = (__m64 *) dl;

        PREP_V();

        for (i = 0; i < width / 4; i++) {
            CONV_5x5();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
void uyvy_to_yuv422(int width, int height, int shift_picture_down,
                    const uint8_t *input, uint8_t *output)
{
    __m64 chroma_mask = _mm_set_pi8(255, 0, 255, 0, 255, 0, 255, 0);
    __m64 luma_mask   = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
    const uint8_t *orig_input = input;
    uint8_t *y_comp = output;
    uint8_t *u_comp = output + width * height;
    uint8_t *v_comp = u_comp + (int)((width * height)/2);   // 4:2:2
    int i, j;

    // When preparing video for PAL DV50 encoding, the video must be shifted
    // down by one line to change the field order to be bottom-field-first
    int start_line = 0;
    if (shift_picture_down) {
        memset(y_comp, 0x10, width);      // write one line of black Y
        y_comp += width;
        memset(u_comp, 0x80, width/2);    // write one line of black U,V
        u_comp += width/2;
        memset(v_comp, 0x80, width/2);    // write one line of black U,V
        v_comp += width/2;
        start_line = 1;
    }

    /* Do the y component */
    for (j = start_line; j < height; j++) {
        // Consume 16 bytes of UYVY data per iteration (8 pixels worth)
        for (i = 0; i < width*2; i += 16) {
            //__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
            //__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
            //__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
            //*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);
            __m64 m0 = *(__m64 *)input;
            __m64 m2 = _mm_srli_si64(m0, 8);
            __m64 m3 = _mm_slli_si64(m0, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);
            m0 = m2;
            __m64 m1 = *(__m64 *)(input+8);
            m2 = _mm_srli_si64(m1, 8);
            m3 = _mm_slli_si64(m1, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);
            m1 = m2;
            *(__m64 *)y_comp = _mm_packs_pu16 (m0, m1);
            y_comp += 8;
            input += 16;
        }
    }

    /* Do the chroma components */
    input = orig_input;
    for (j = start_line; j < height; j++) {
        /* Process every line for yuv 4:2:2 */
        for (i = 0; i < width*2; i += 16) {
            __m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));

            __m64 m3 = _mm_unpacklo_pi8 (m1, m2);
            __m64 m4 = _mm_unpackhi_pi8 (m1, m2);
            //*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
            //*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
            memcpy (u_comp, &m3, 4);
            memcpy (v_comp, &m4, 4);
            u_comp += 4;
            v_comp += 4;
            input += 16;
        }
    }
    _mm_empty();    // Clear aliased fp register state
}
mlib_status
mlib_m_sconv7x7_16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
    __m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
    __m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh, rl;
    __m64 zero, _rnd;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
    mlib_s32 shift, kerh_sum;
    mlib_s32 i, j;

    width -= KSIZE1;
    height -= KSIZE1;
    width *= NCHAN;
    dl += (KSIZE / 2) * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        sp5 = (__m64 *) (sl + 5 * sll);
        sp6 = (__m64 *) (sl + 6 * sll);
        dp = (__m64 *) dl;

        PREP_V(v1);
        PREP_V(v2);
        PREP_V(v3);
        PREP_V(v4);
        PREP_V(v5);
        PREP_V(v6);

        for (i = 0; i < width / 4; i++) {
            CONV_7x7();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_7x7();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageMinFilter3x3_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* MAX_FILTER */
{
    mlib_u8 *buff, *buff1;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
    __m64 *dp0, *dp1;
    __m64 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1;
    __m64 e_mask;
    mlib_s32 i, j, wid8, tail;

    wid = (wid - 2) * SSIZE;
    wid8 = (wid + 7) & ~7;
    buff = mlib_malloc(2 * wid8);
    buff1 = buff + wid8;

    sl = (mlib_u8 *)src;
    /* dst ptrs skip top j and left col */
    dl = (mlib_u8 *)dst + dlb + SSIZE;

    tail = wid & 7;
    e_mask = ((__m64 *) mlib_mask64_arr)[tail];

    sp0 = buff;
    sp1 = buff1;
    sp2 = sl;
    sp3 = sp2 + slb;
    sl += 2 * slb;

    for (i = 0; i < wid; i += 8) {
        c0 = *(__m64 *) sp2;
        c1 = *(__m64 *) (sp2 + SSIZE);
        c2 = *(__m64 *) (sp2 + 2 * SSIZE);
        d0 = *(__m64 *) sp3;
        d1 = *(__m64 *) (sp3 + SSIZE);
        d2 = *(__m64 *) (sp3 + 2 * SSIZE);

        cc = C_COMP(c0, c1);
        dd = C_COMP(d0, d1);
        cc = C_COMP(cc, c2);
        dd = C_COMP(dd, d2);

        *(__m64 *) sp0 = cc;
        *(__m64 *) sp1 = dd;

        sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
    }

    for (j = 0; j <= (hgt - 2 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;
        sp3 = sp2 + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: c0 c1 c2
         * line3: d0 d1 d2
         */

        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d0 = *(__m64 *) sp3;
            d1 = *(__m64 *) (sp3 + SSIZE);
            d2 = *(__m64 *) (sp3 + 2 * SSIZE);

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            *(__m64 *) sp0 = cc;
            *(__m64 *) sp1 = dd;

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d0 = *(__m64 *) sp3;
            d1 = *(__m64 *) (sp3 + SSIZE);
            d2 = *(__m64 *) (sp3 + 2 * SSIZE);

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            *(__m64 *) sp0 = cc;
            *(__m64 *) sp1 = dd;

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1),
                _mm_andnot_si64(e_mask, *dp1));
        }

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - 3)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;

        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);

            cc = C_COMP(c0, c1);
            cc = C_COMP(cc, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            (*dp0++) = r0;

            sp0 += 8; sp1 += 8; sp2 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);

            c1 = C_COMP(c0, c1);
            cc = C_COMP(c1, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
        }
    }

    _mm_empty();
    mlib_free(buff);

    return (MLIB_SUCCESS);
}