static void replace_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height)
{
  int mod8_width = width / 8 * 8;
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < mod8_width; x += 8) {
      __m64 s = *reinterpret_cast<const __m64*>(src+x);
      __m64 l = *reinterpret_cast<const __m64*>(luma+x);

      __m64 s_chroma = _mm_and_si64(s, chroma_mask);
      __m64 l_luma = _mm_and_si64(l, luma_mask);

      __m64 result = _mm_or_si64(s_chroma, l_luma);
      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = mod8_width; x < width; x += 2) {
      src[x] = luma[x];
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
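/* For reference, a minimal scalar sketch (an illustrative addition, not part
   of the original source) of what replace_luma_yuy2_mmx computes. In YUY2 the
   bytes alternate Y U Y V, so even offsets hold luma and odd offsets hold
   chroma; the MMX loop keeps the chroma byte of each src 16-bit word
   (mask 0xFF00) and takes the luma byte from the luma plane (mask 0x00FF),
   which is exactly the per-pixel copy in the tail loop above. */
static void replace_luma_yuy2_c(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height)
{
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x += 2)
      src[x] = luma[x]; /* overwrite Y, leave U/V untouched */
    src += pitch;
    luma += luma_pitch;
  }
}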
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height, int weight, int invweight)
{
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  int wMod8 = (width / 8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *reinterpret_cast<const __m64*>(src+x);  //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m64 px2 = *reinterpret_cast<const __m64*>(luma+x); //v1 y3 u1 y2 v0 y1 u0 y0

      __m64 src_lo = _mm_unpacklo_pi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m64 src_hi = _mm_unpackhi_pi16(px1, px2);

      src_lo = _mm_and_si64(src_lo, luma_mask); //00 y1 00 Y1 00 y0 00 Y0
      src_hi = _mm_and_si64(src_hi, luma_mask);

      src_lo = _mm_madd_pi16(src_lo, mask);
      src_hi = _mm_madd_pi16(src_hi, mask);

      src_lo = _mm_add_pi32(src_lo, round_mask);
      src_hi = _mm_add_pi32(src_hi, round_mask);

      src_lo = _mm_srli_pi32(src_lo, 15);
      src_hi = _mm_srli_pi32(src_hi, 15);

      __m64 result_luma = _mm_packs_pi32(src_lo, src_hi);

      __m64 result_chroma = _mm_and_si64(px1, chroma_mask);
      __m64 result = _mm_or_si64(result_chroma, result_luma);

      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = wMod8; x < width; x += 2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
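/* A hedged usage sketch (an addition, not from the original source): the
   rounding constant 0x4000 and the >> 15 imply the weights are Q15 fixed
   point, so a caller blending a fraction f of the luma plane into src would
   be expected to pass weight = (int)(f * 32768 + 0.5) and
   invweight = 32768 - weight. */
static void blend_half_luma(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int w, int h)
{
  int weight = 16384;             /* f = 0.5 in Q15 */
  int invweight = 32768 - weight; /* weights sum to 1.0 in Q15 */
  weighted_merge_luma_yuy2_mmx(src, luma, pitch, luma_pitch, w, h, weight, invweight);
}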
int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int32_t satd;
  __m64 sums;
  __m64 a;
  __m64 b;
  __m64 c;
  __m64 d;
  a = od_load_convert_subtract_x4(src + 0*systride, ref + 0*rystride);
  b = od_load_convert_subtract_x4(src + 1*systride, ref + 1*rystride);
  c = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride);
  d = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride);
  /*Vertical 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_transpose16x4(&a, &b, &c, &d);
  /*Horizontal 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  /*Use the fact that (abs(a + b) + abs(a - b))/2 = max(abs(a), abs(b)) to
     merge the final butterfly stage with computing the absolute values and
     the first stage of accumulation.
    Calculates (abs(a + b) + abs(a - b))/2 - 0x7FFF.
    An offset must be added to the final sum before rounding to account for
     subtracting 0x7FFF.*/
  a = _mm_sub_pi16(_mm_max_pi16(a, b),
   _mm_adds_pi16(_mm_add_pi16(a, b), _mm_set1_pi16(0x7FFF)));
  c = _mm_sub_pi16(_mm_max_pi16(c, d),
   _mm_adds_pi16(_mm_add_pi16(c, d), _mm_set1_pi16(0x7FFF)));
  /*Take the sum of all the absolute values.*/
  sums = _mm_add_pi16(a, c);
  /*Sum the elements of the vector.*/
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(0, 1, 2, 3)));
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(2, 3, 0, 1)));
  sums = _mm_unpacklo_pi16(sums, _mm_setzero_si64());
  satd = _mm_cvtsi64_si32(sums);
  /*Subtract the offset (8) and round.*/
  satd = (satd + 1 - 8) >> 1;
#if defined(OD_CHECKASM)
  {
    int32_t c_satd;
    c_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride);
    if (satd != c_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
       4, 4, satd, c_satd);
    }
  }
#endif
  return satd;
}
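/*A plain-C sketch of the same 4x4 SATD, added for illustration. The real
   scalar reference in this codebase is od_mc_compute_satd8_4x4_c; this is a
   generic reconstruction, not a copy of it: apply an unnormalized 4-point
   Hadamard transform to columns and rows of the difference block, sum the
   absolute values, then divide by 4 with rounding, which matches the offset
   bookkeeping in the MMX version above (each merged-stage lane holds
   (|a + b| + |a - b|)/2, so the lane sum is half the coefficient sum, and
   the final (x + 1) >> 1 halves it again).*/
static int32_t satd_4x4_ref(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int d[4][4];
  int32_t sum;
  int i;
  int j;
  sum = 0;
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++) d[i][j] = src[i*systride + j] - ref[i*rystride + j];
  }
  /*Vertical transform (columns).*/
  for (i = 0; i < 4; i++) {
    int a = d[0][i] + d[1][i], b = d[0][i] - d[1][i];
    int c = d[2][i] + d[3][i], e = d[2][i] - d[3][i];
    d[0][i] = a + c; d[2][i] = a - c;
    d[1][i] = b + e; d[3][i] = b - e;
  }
  /*Horizontal transform (rows), accumulating absolute values.*/
  for (i = 0; i < 4; i++) {
    int a = d[i][0] + d[i][1], b = d[i][0] - d[i][1];
    int c = d[i][2] + d[i][3], e = d[i][2] - d[i][3];
    int h0 = a + c, h1 = b + e, h2 = a - c, h3 = b - e;
    sum += (h0 < 0 ? -h0 : h0) + (h1 < 0 ? -h1 : h1)
     + (h2 < 0 ? -h2 : h2) + (h3 < 0 ? -h3 : h3);
  }
  return (sum + 2) >> 2;
}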
// Clang FileCheck-style test: verifies that _mm_set1_pi16 lowers to four IR
// insertelement operations, one per 16-bit lane of the <4 x i16> vector.
__m64 test_mm_set1_pi16(short a) {
  // CHECK-LABEL: test_mm_set1_pi16
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  return _mm_set1_pi16(a);
}
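// Usage sketch for the intrinsic under test (an illustrative addition):
// _mm_set1_pi16 broadcasts one 16-bit value into all four lanes of an __m64,
// which is why the IR above contains four insertelement operations.
__m64 broadcast_example(void) {
  __m64 v = _mm_set1_pi16(0x1234); // lanes: 0x1234, 0x1234, 0x1234, 0x1234
  return _mm_add_pi16(v, v);       // lanes: 0x2468 in every position
}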
/* do the processing for all colourspaces */
void pix_motionblur :: processMMX(imageStruct &image)
{
  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  int pixsize=image.ysize*image.xsize*image.csize;
  /* number of __m64 chunks, rounded up */
  pixsize=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*pixels=(__m64*)image.data;
  __m64*old=(__m64*)m_savedImage.data;

  __m64 newGain = _mm_set1_pi16(static_cast<short>(m_blur0));
  __m64 oldGain = _mm_set1_pi16(static_cast<short>(m_blur1));
  __m64 null64 = _mm_setzero_si64();

  __m64 newpix1, newpix2, oldpix1, oldpix2;

  while(pixsize--) {
    newpix1=pixels[pixsize];
    oldpix1=old[pixsize];

    /* widen the 8 bytes to 16 bits so they can be multiplied */
    newpix2 = _mm_unpackhi_pi8(newpix1, null64);
    newpix1 = _mm_unpacklo_pi8(newpix1, null64);
    oldpix2 = _mm_unpackhi_pi8(oldpix1, null64);
    oldpix1 = _mm_unpacklo_pi8(oldpix1, null64);

    /* out = (new*m_blur0 + old*m_blur1) >> 8, per channel */
    newpix1 = _mm_mullo_pi16(newpix1, newGain);
    newpix2 = _mm_mullo_pi16(newpix2, newGain);
    oldpix1 = _mm_mullo_pi16(oldpix1, oldGain);
    oldpix2 = _mm_mullo_pi16(oldpix2, oldGain);

    newpix1 = _mm_adds_pu16 (newpix1, oldpix1);
    newpix2 = _mm_adds_pu16 (newpix2, oldpix2);

    newpix1 = _mm_srli_pi16(newpix1, 8);
    newpix2 = _mm_srli_pi16(newpix2, 8);

    /* pack back to bytes; the result is also the new history frame */
    newpix1 = _mm_packs_pu16(newpix1, newpix2);
    pixels[pixsize]=newpix1;
    old   [pixsize]=newpix1;
  }
  _mm_empty();
}
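/* Scalar equivalent of the inner blend, added here for clarity. It assumes
   m_blur0 and m_blur1 are gains in 0..256 with m_blur0 + m_blur1 == 256 for
   a level-preserving blend; the rest of the class is not shown, so treat
   that convention as an assumption. */
static inline unsigned char motionblur_px(unsigned char cur, unsigned char prev,
                                          int blur0, int blur1)
{
  int v = (cur*blur0 + prev*blur1) >> 8; /* Q8 fixed-point mix */
  return (unsigned char)(v > 255 ? 255 : v);
}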
void DrawAAPMMX(PixelBlock& w, int x, int y, Color c)
{
  if(!Rect(w.GetSize()).Contains(Rect(x, y, x + 6, y + 11)))
    return;
  dword *a = w.PointAdr(x, y);
  int d = w.LineDelta();
  __m64 zero = _mm_setzero_si64();
  __m64 mc = _mm_unpacklo_pi8(_mm_cvtsi32_si64(c.GetRaw()), zero);
  __m64 mask = _mm_set1_pi16(0xff);
  const byte *s = aa_packed;
  dword *t = a;
  __m64 alpha;
  __m64 h;
  __m64 m;
  for(;;) {
    dword c = *s++;
    if(c == 0)
      break;
    t += (c >> 3) & 15;
    switch(c & 7) { // intentional fall-through: blend a run of (c & 7) pixels
    case 7: AAPMMX_(6);
    case 6: AAPMMX_(5);
    case 5: AAPMMX_(4);
    case 4: AAPMMX_(3);
    case 3: AAPMMX_(2);
    case 2: AAPMMX_(1);
    case 1: AAPMMX_(0);
    }
    t += c & 7;
    s += c & 7;
    if(c & 0x80) {
      a += d;
      t = a;
    }
  }
  _mm_empty();
}
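/* Reading the loop above: aa_packed appears to be a run-length encoded
   coverage map, where each control byte holds a horizontal skip
   ((c >> 3) & 15), a run length (c & 7) of coverage bytes that follow, and a
   row-advance flag (0x80), terminated by 0. AAPMMX_ itself is defined
   elsewhere in the source and is not shown; a scalar sketch of the standard
   anti-aliased coverage blend such a macro would perform (purely an
   assumption, not the actual U++ definition; dword/byte are U++ typedefs): */
static inline dword aa_blend_px(dword dst, dword color, unsigned cov /* 0..255 */)
{
  dword out = 0;
  for(int sh = 0; sh < 32; sh += 8) {
    unsigned dc = (dst >> sh) & 0xff;
    unsigned sc = (color >> sh) & 0xff;
    unsigned v = (dc*(255 - cov) + sc*cov + 127) / 255; /* lerp by coverage */
    out |= v << sh;
  }
  return out;
}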
#include <assert.h>

#ifndef NDEBUG
#define unreachable() assert(0)
#else
#define unreachable __builtin_unreachable
#endif

// requires w%16 == 0
__attribute__((hot))
void maxblend_sse(void *restrict dest, const void *restrict src, int w, int h)
{
  //FIXME: use src_stride
  //FIXME: deal with w%16 != 0
  __m64 *mbdst = dest;
  const __m64 *mbsrc = src;
  // bias so the signed _mm_max_pi16 acts as an unsigned 16-bit max
  const __m64 off = _mm_set1_pi16(0x8000);

  _mm_prefetch(mbdst, _MM_HINT_NTA);
  _mm_prefetch(mbsrc, _MM_HINT_NTA);
  for(unsigned int i=0; i < 2*w*h/sizeof(__m64); i+=4, mbdst+=4, mbsrc+=4) {
    _mm_prefetch(mbdst + 4, _MM_HINT_NTA);
    _mm_prefetch(mbsrc + 4, _MM_HINT_NTA);

    __m64 v1, v2, v3, v4, t1, t2, t3, t4;

    v1 = mbdst[0], t1 = mbsrc[0];
    v1 = _mm_add_pi16(v1, off);
    t1 = _mm_add_pi16(t1, off);
    v1 = _mm_max_pi16(v1, t1);
    v1 = _mm_sub_pi16(v1, off);
    mbdst[0]=v1;

    // the snippet was truncated here; the remaining three unrolled blocks
    // follow the v1 pattern exactly
    v2 = mbdst[1], t2 = mbsrc[1];
    v2 = _mm_add_pi16(v2, off);
    t2 = _mm_add_pi16(t2, off);
    v2 = _mm_max_pi16(v2, t2);
    v2 = _mm_sub_pi16(v2, off);
    mbdst[1]=v2;

    v3 = mbdst[2], t3 = mbsrc[2];
    v3 = _mm_add_pi16(v3, off);
    t3 = _mm_add_pi16(t3, off);
    v3 = _mm_max_pi16(v3, t3);
    v3 = _mm_sub_pi16(v3, off);
    mbdst[2]=v3;

    v4 = mbdst[3], t4 = mbsrc[3];
    v4 = _mm_add_pi16(v4, off);
    t4 = _mm_add_pi16(t4, off);
    v4 = _mm_max_pi16(v4, t4);
    v4 = _mm_sub_pi16(v4, off);
    mbdst[3]=v4;
  }
  _mm_empty();
}
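/* The 0x8000 bias above exists because MMX has no unsigned 16-bit max:
   adding 0x8000 (equivalently, flipping the top bit) maps unsigned
   [0, 65535] monotonically onto signed [-32768, 32767], so the signed
   _mm_max_pi16 picks the larger unsigned value. Scalar sketch of the same
   idea, added for illustration: */
static inline unsigned short umax16_via_smax(unsigned short a, unsigned short b)
{
  short sa = (short)(a ^ 0x8000); /* bias == XOR of the sign bit */
  short sb = (short)(b ^ 0x8000);
  return (unsigned short)((sa > sb ? sa : sb) ^ 0x8000);
}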
void reverb::comb_allpass4(signed short *sp, signed short *dp,
                           const comb_param &comb_delay, const int comb_gain,
                           const int allpass_delay, const int allpass_gain,
                           const int *rvol, const unsigned int sz)
{
#ifdef use_intrinsics
  __m64 cg=_mm_set1_pi16(comb_gain),
        ag=_mm_set1_pi16(allpass_gain),
        rv[2];
  rv[0]=_mm_set1_pi16(rvol[0]);
  rv[1]=_mm_set1_pi16(rvol[1]);

  for (unsigned int i=0; i<(sz>>4); i++, sp+=2<<2, dp+=2<<2) {
    __m64 dv[2];

    for (int c=0; c<2; c++) {
      // Comb
      __m64 v=_mm_setzero_si64();

      for (int f=0; f<4; f++) {
        int yck=(yp-comb_delay[c][f])&(max_delay-1);
        __m64 xv=*(__m64 *)(&x[c][yck]),
              yv=*(__m64 *)(&y[c][f][yck]);
        yv=_mm_mulhi_pi16(yv,cg);
        yv=_mm_adds_pi16(yv,yv);
        yv=_mm_adds_pi16(xv,yv);
        *((__m64 *)&y[c][f][yp])=yv;
        yv=_mm_srai_pi16(yv,2);
        v=_mm_adds_pi16(v,yv);
      }

      // Allpass
      if (allpass_delay) {
        *((__m64 *)&ax[c][yp])=v;
        int ypa=(yp-allpass_delay)&(max_delay-1);
        __m64 ayv=*(__m64 *)&ay[c][ypa],
              xv=*(__m64 *)&x[c][yp],
              axv=*(__m64 *)&ax[c][ypa];
        ayv=_mm_subs_pi16(ayv,xv);
        ayv=_mm_mulhi_pi16(ayv,ag);
        ayv=_mm_adds_pi16(ayv,ayv);
        v=_mm_adds_pi16(ayv,axv);
        *((__m64 *)&ay[c][yp])=v;
      }

      // Output
      dv[c]=_mm_mulhi_pi16(v,rv[c]);
      dv[c]=_mm_adds_pi16(dv[c],dv[c]);
    }

    __m64 dv1=_mm_unpacklo_pi16(dv[0],dv[1]),
          dv2=_mm_unpackhi_pi16(dv[0],dv[1]),
          d1=*(__m64 *)&dp[0],
          d2=*(__m64 *)&dp[4],
          s1=*(__m64 *)&sp[0],
          s2=*(__m64 *)&sp[4];
    d1=_mm_adds_pi16(d1,s1);
    d2=_mm_adds_pi16(d2,s2);
    d1=_mm_adds_pi16(d1,dv1);
    d2=_mm_adds_pi16(d2,dv2);
    *(__m64 *)&dp[0]=d1;
    *(__m64 *)&dp[4]=d2;

    yp=(yp+4)&(max_delay-1);
  }
  _mm_empty();
#endif
}
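/* A recurring idiom above, noted as an aside: the gains look like signed Q15
   fractions, and _mm_mulhi_pi16(v, g) yields (v*g) >> 16, which is half the
   intended Q15 product, so every multiply is followed by a saturating
   doubling _mm_adds_pi16(r, r). Scalar sketch of one comb tap under that
   assumption (the names are illustrative, not from the original class): */
static inline short comb_tap(short x_in, short y_delayed, short gain_q15)
{
  int r = ((int)y_delayed * (int)gain_q15) >> 16; /* what mulhi computes */
  r += r;                                         /* recover the Q15 product */
  r += x_in;                                      /* feed the input in */
  if (r > 32767) r = 32767;                       /* saturate like adds_pi16 */
  if (r < -32768) r = -32768;
  return (short)r;
}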
void mlib_m_ImageMaximum_U16_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
  /* src address */
  __m64 *sp, *sl;

  /* src data */
  __m64 sd;

  /* max values */
  __m64 max1, max2, max3;

  __m64 _2s32_1, _2s32_2, _2s32_3, _2s32_4, _2s32_5, _2s32_6;
  mlib_s32 s1, s2, s3, s4, s5, s6;

  /* edge mask */
  mlib_s32 emask;

  /* loop variables */
  mlib_s32 n1;

  /* height of image */
  mlib_s32 height = mlib_ImageGetHeight(img);

  /* elements to next row */
  mlib_s32 slb = mlib_ImageGetStride(img);

  mlib_s32 width = mlib_ImageGetWidth(img) * 3;
  mlib_u16 *dend;

  if (slb == width) {
    width *= height;
    height = 1;
  }

  sp = sl = (__m64 *) mlib_ImageGetData(img);

  max1 = _mm_set1_pi16(MLIB_U16_MIN);
  max2 = _mm_set1_pi16(MLIB_U16_MIN);
  max3 = _mm_set1_pi16(MLIB_U16_MIN);

  for (; height > 0; height--) {
    n1 = width;
    dend = (mlib_u16 *)sp + width;

    for (; n1 > 11; n1 -= 12) {
      sd = (*sp++);
      MLIB_M_IMAGE_MAXIMUM_U16(max1, max1, sd);
      sd = (*sp++);
      MLIB_M_IMAGE_MAXIMUM_U16(max2, max2, sd);
      sd = (*sp++);
      MLIB_M_IMAGE_MAXIMUM_U16(max3, max3, sd);
    }

    if (n1 > 0) {
      emask = (n1 > 3) ? 0xF : (0xF << (4 - n1));
      sd = (*sp++);
      MLIB_M_IMAGE_MAXIMUM_U16_M32(max1, max1, sd, emask);

      n1 = ((mlib_s16 *)dend - (mlib_s16 *)sp);
      if (n1 > 0) {
        emask = (n1 > 3) ? 0xF : (0xF << (4 - n1));
        sd = (*sp++);
        MLIB_M_IMAGE_MAXIMUM_U16_M32(max2, max2, sd, emask);

        n1 = ((mlib_s16 *)dend - (mlib_s16 *)sp);
        if (n1 > 0) {
          emask = (0xF << (4 - n1));
          sd = *sp;
          MLIB_M_IMAGE_MAXIMUM_U16_M32(max3, max3, sd, emask);
        }
      }
    }

    sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
  }

  MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max1);
  MLIB_M_CONVERT_4U16_2S32(_2s32_3, _2s32_4, max2);
  MLIB_M_CONVERT_4U16_2S32(_2s32_5, _2s32_6, max3);

  MLIB_M_IMAGE_MAXIMUM_S32(max1, _2s32_1, _2s32_6);
  MLIB_M_IMAGE_MAXIMUM_S32(max2, _2s32_2, _2s32_3);
  MLIB_M_IMAGE_MAXIMUM_S32(max3, _2s32_4, _2s32_5);

  MLIB_M_CONVERT_2S32_S32(s1, s2, max1);
  MLIB_M_CONVERT_2S32_S32(s3, s4, max2);
  MLIB_M_CONVERT_2S32_S32(s5, s6, max3);

  MLIB_M_IMAGE_MAXIMUM(res32[0], s2, s3);
  MLIB_M_IMAGE_MAXIMUM(res32[1], s4, s5);
  MLIB_M_IMAGE_MAXIMUM(res32[2], s1, s6);

  _mm_empty();
}
void mlib_m_ImageMaximum_U16_124(
    mlib_s32 *res32,
    const mlib_image *img)
{
  /* src address */
  __m64 *sp, *sl;

  /* src data */
  __m64 sd;

  /* max values */
  __m64 max;
  __m64 _2s32_1, _2s32_2;

  /* edge mask */
  mlib_s32 emask;

  /* loop variables */
  mlib_s32 n1;

  /* height of image */
  mlib_s32 height = mlib_ImageGetHeight(img);

  /* elements to next row */
  mlib_s32 slb = mlib_ImageGetStride(img);

  /* number of image channels */
  mlib_s32 channels = mlib_ImageGetChannels(img);

  mlib_s32 width = mlib_ImageGetWidth(img) * channels;
  mlib_s32 s1, s2;

  if (slb == width) {
    width *= height;
    height = 1;
  }

  sp = sl = (__m64 *) mlib_ImageGetData(img);

  /* initialize the maxima to the smallest U16 value */
  max = _mm_set1_pi16(MLIB_U16_MIN);

  for (; height > 0; height--) {
    n1 = width;

    for (; n1 > 3; n1 -= 4) {
      sd = (*sp++);
      MLIB_M_IMAGE_MAXIMUM_U16(max, max, sd);
    }

    if (n1 > 0) {
      emask = (0xF << (4 - n1));
      sd = *sp;
      MLIB_M_IMAGE_MAXIMUM_U16_M32(max, max, sd, emask);
    }

    sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
  }

  switch (channels) {
  case 1: {
    MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max);
    MLIB_M_IMAGE_MAXIMUM_S32(_2s32_1, _2s32_1, _2s32_2);
    MLIB_M_CONVERT_2S32_S32(s1, s2, _2s32_1);
    MLIB_M_IMAGE_MAXIMUM(res32[0], s1, s2);
    break;
  }

  case 2: {
    MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max);
    MLIB_M_IMAGE_MAXIMUM_S32(_2s32_1, _2s32_1, _2s32_2);
    ((__m64 *) res32)[0] = _2s32_1;
    break;
  }

  case 4: {
    MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max);
    ((__m64 *) res32)[0] = _2s32_2;
    ((__m64 *) res32)[1] = _2s32_1;
    break;
  }
  }

  _mm_empty();
}
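/* Scalar sketch of what both U16 maximum routines above compute, added for
   reference (it assumes the mediaLib accessors and types used above, and a
   row stride given in bytes): per-channel maxima over an interleaved image.
   The vector versions fold four lanes at a time and then reduce the lanes
   per channel, which is why the channel count (1, 2, 3 or 4) dictates the
   final combination step. */
static void image_maximum_u16_ref(mlib_s32 *res, const mlib_image *img)
{
  mlib_s32 channels = mlib_ImageGetChannels(img);
  mlib_s32 width = mlib_ImageGetWidth(img);
  mlib_s32 height = mlib_ImageGetHeight(img);
  mlib_s32 slb = mlib_ImageGetStride(img);
  const mlib_u8 *base = (const mlib_u8 *) mlib_ImageGetData(img);

  for (mlib_s32 c = 0; c < channels; c++)
    res[c] = MLIB_U16_MIN;
  for (mlib_s32 y = 0; y < height; y++) {
    const mlib_u16 *row = (const mlib_u16 *) (base + y * slb);
    for (mlib_s32 x = 0; x < width * channels; x++) {
      if (row[x] > res[x % channels])
        res[x % channels] = row[x];
    }
  }
}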