// Subtract `right` from `image` (YUV422 data) in place, 8 bytes per MMX step.
// A 0x80 bias is first added to the chroma word-lanes so that subtracting a
// chroma value centred at 0x80 leaves the result centred at 0x80; luma is a
// plain saturating subtract.  Assumes chroma sits on the even bytes (UYVY
// order) — TODO confirm against the caller's format.
void pix_subtract :: processYUV_MMX (imageStruct &image, imageStruct &right){
  int datasize = image.xsize * image.ysize * image.csize;
  __m64*leftPix = (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;
  // round the byte count up to whole 8-byte __m64 chunks
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
  __m64 null64 = _mm_setzero_si64();
  // word-lane bias: 0x80 on lanes 0 and 2 (the chroma lanes after unpacking)
  __m64 offset = _mm_setr_pi16(0x80, 0x00, 0x80, 0x00);
  __m64 l0, l1, r0, r1;
  while (datasize--) {
    l1=leftPix[datasize];
    r1=rightPix[datasize];
    // widen unsigned bytes to 16-bit words (low / high halves)
    l0=_mm_unpacklo_pi8 (l1, null64);
    r0=_mm_unpacklo_pi8 (r1, null64);
    l1=_mm_unpackhi_pi8 (l1, null64);
    r1=_mm_unpackhi_pi8 (r1, null64);
    // bias chroma, then saturating unsigned subtract
    l0=_mm_adds_pu16(l0, offset);
    l1=_mm_adds_pu16(l1, offset);
    l0=_mm_subs_pu16(l0, r0);
    l1=_mm_subs_pu16(l1, r1);
    // repack to bytes with unsigned saturation
    leftPix[datasize]=_mm_packs_pu16(l0, l1);
  }
  _mm_empty();  // clear MMX state so x87 FP code can run afterwards
}
// Channel-wise multiply of two RGBA images, in place on `image`:
// each byte becomes (left * right) >> 8, processed 8 bytes per MMX iteration.
void pix_multiply :: processRGBA_MMX(imageStruct &image, imageStruct &right)
{
  int byteCount = image.xsize * image.ysize * image.csize;
  // number of 8-byte chunks, rounding a partial trailing chunk up
  const int chunks = byteCount / sizeof(__m64) + (byteCount % sizeof(__m64) != 0);
  __m64 *dst = (__m64*)image.data;
  __m64 *src = (__m64*)right.data;
  const __m64 zero = _mm_setzero_si64();

  for (int idx = chunks - 1; idx >= 0; --idx) {
    __m64 dstHi = dst[idx];
    __m64 srcHi = src[idx];
    // widen bytes to words, multiply, keep the high 8 bits of each product
    __m64 dstLo = _mm_unpacklo_pi8(dstHi, zero);
    __m64 srcLo = _mm_unpacklo_pi8(srcHi, zero);
    dstHi = _mm_unpackhi_pi8(dstHi, zero);
    srcHi = _mm_unpackhi_pi8(srcHi, zero);
    dstLo = _mm_srli_pi16(_mm_mullo_pi16(dstLo, srcLo), 8);
    dstHi = _mm_srli_pi16(_mm_mullo_pi16(dstHi, srcHi), 8);
    // repack to bytes with unsigned saturation
    dst[idx] = _mm_packs_pu16(dstLo, dstHi);
  }
  _mm_empty();  // clear MMX state for subsequent FP code
}
// Weighted blend of two byte planes into p1:
//   p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15
// weight/invweight are 15-bit fixed-point factors (caller is expected to make
// them sum to 32768).  Eight pixels per MMX iteration via pmaddwd, with a
// scalar tail for width % 8.
void weighted_merge_planar_mmx(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight)
{
  __m64 round_mask = _mm_set1_pi32(0x4000);  // 16384: round-to-nearest before >> 15
  __m64 zero = _mm_setzero_si64();
  // word pattern [invweight, weight] repeated, paired for _mm_madd_pi16
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);

  int wMod8 = (width/8) * 8;  // widest multiple of 8 covered by the SIMD loop

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *(reinterpret_cast<const __m64*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m64 px2 = *(reinterpret_cast<const __m64*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0
      // interleave sources so each 32-bit lane holds one (p1,p2) byte pair
      __m64 p0123 = _mm_unpacklo_pi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m64 p4567 = _mm_unpackhi_pi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4
      // widen to words: one (p1,p2) pair per 32-bit lane
      __m64 p01 = _mm_unpacklo_pi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m64 p23 = _mm_unpackhi_pi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m64 p45 = _mm_unpacklo_pi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m64 p67 = _mm_unpackhi_pi8(p4567, zero); //00Y7 00y7 00Y6 00y6
      // pmaddwd: p1*invweight + p2*weight per 32-bit lane
      p01 = _mm_madd_pi16(p01, mask);
      p23 = _mm_madd_pi16(p23, mask);
      p45 = _mm_madd_pi16(p45, mask);
      p67 = _mm_madd_pi16(p67, mask);
      // add rounding constant, then scale back down by 2^15
      p01 = _mm_add_pi32(p01, round_mask);
      p23 = _mm_add_pi32(p23, round_mask);
      p45 = _mm_add_pi32(p45, round_mask);
      p67 = _mm_add_pi32(p67, round_mask);
      p01 = _mm_srli_pi32(p01, 15);
      p23 = _mm_srli_pi32(p23, 15);
      p45 = _mm_srli_pi32(p45, 15);
      p67 = _mm_srli_pi32(p67, 15);
      // narrow dwords -> words -> bytes (unsigned saturation on the last step)
      p0123 = _mm_packs_pi32(p01, p23);
      p4567 = _mm_packs_pi32(p45, p67);
      __m64 result = _mm_packs_pu16(p0123, p4567);
      *reinterpret_cast<__m64*>(p1+x) = result;
    }
    // scalar tail: same formula as the SIMD path
    for (int x = wMod8; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }
    p1 += p1_pitch;
    p2 += p2_pitch;
  }
  _mm_empty();  // clear MMX state for subsequent FP code
}
/* do the processing for all colourspaces */
// Exponential motion blur: every byte becomes
//   (new*m_blur0 + old*m_blur1) >> 8
// with m_blur0/m_blur1 as 8.8 fixed-point gains (presumably summing to ~256 —
// TODO confirm where they are set).  The result is written both to the live
// image and to m_savedImage, which serves as "old" on the next frame.
void pix_motionblur :: processMMX(imageStruct &image)
{
  // keep the history buffer sized/formatted like the current frame
  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  int pixsize=image.ysize*image.xsize*image.csize;
  // round the byte count up to whole 8-byte __m64 chunks
  pixsize=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
  __m64*pixels=(__m64*)image.data;
  __m64*old=(__m64*)m_savedImage.data;
  __m64 newGain = _mm_set1_pi16(static_cast<short>(m_blur0));
  __m64 oldGain = _mm_set1_pi16(static_cast<short>(m_blur1));
  __m64 null64 = _mm_setzero_si64();
  __m64 newpix1, newpix2, oldpix1, oldpix2;
  while(pixsize--) {
    newpix1=pixels[pixsize];
    oldpix1=old[pixsize];
    // widen bytes to words (high/low halves)
    newpix2 = _mm_unpackhi_pi8(newpix1, null64);
    newpix1 = _mm_unpacklo_pi8(newpix1, null64);
    oldpix2 = _mm_unpackhi_pi8(oldpix1, null64);
    oldpix1 = _mm_unpacklo_pi8(oldpix1, null64);
    // weight both frames, add with unsigned saturation, scale back by 2^8
    newpix1 = _mm_mullo_pi16(newpix1, newGain);
    newpix2 = _mm_mullo_pi16(newpix2, newGain);
    oldpix1 = _mm_mullo_pi16(oldpix1, oldGain);
    oldpix2 = _mm_mullo_pi16(oldpix2, oldGain);
    newpix1 = _mm_adds_pu16 (newpix1, oldpix1);
    newpix2 = _mm_adds_pu16 (newpix2, oldpix2);
    newpix1 = _mm_srli_pi16(newpix1, 8);
    newpix2 = _mm_srli_pi16(newpix2, 8);
    newpix1 = _mm_packs_pu16(newpix1, newpix2);
    // store the blurred result and remember it for the next frame
    pixels[pixsize]=newpix1;
    old   [pixsize]=newpix1;
  }
  _mm_empty();  // clear MMX state for subsequent FP code
}
// Multiply two YUV422 images in place on `image`.  The right image's bytes on
// the even positions (chroma, assuming UYVY order — TODO confirm) are forced
// to 0xFF before the multiply, so left chroma passes through nearly unchanged
// (0xFF*c >> 8 ≈ c) while luma is multiplied as l*r >> 8.  Finally the odd
// (luma) bytes are clamped to the legal video range [0x10, 0xEB] via the
// saturating subtract/add/subtract trio below.
void pix_multiply :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64*leftPix = (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;
  // round the byte count up to whole 8-byte __m64 chunks
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
  __m64 l0, r0, l1, r1;
  // 0xFF on even bytes: neutralises the right image's chroma
  __m64 mask= _mm_setr_pi8((unsigned char)0xFF, (unsigned char)0x00, (unsigned char)0xFF, (unsigned char)0x00,
                           (unsigned char)0xFF, (unsigned char)0x00, (unsigned char)0xFF, (unsigned char)0x00);
  // clamp constants act on odd (luma) bytes: -0x10, +0x24, -0x14 with
  // saturation pins the result into [0x10, 0xEB]
  __m64 yuvclamp0 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x10, (unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10, (unsigned char)0x00, (unsigned char)0x10);
  __m64 yuvclamp1 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x24, (unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24, (unsigned char)0x00, (unsigned char)0x24);
  __m64 yuvclamp2 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x14, (unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14, (unsigned char)0x00, (unsigned char)0x14);
  __m64 null64 = _mm_setzero_si64();
  while(datasize--) {
    r1=rightPix[datasize];
    l1=leftPix [datasize];
    r1=_mm_or_si64(r1, mask);  // force right chroma to 0xFF
    // widen bytes to words, multiply, keep high 8 bits of each product
    l0=_mm_unpacklo_pi8(l1, null64);
    r0=_mm_unpacklo_pi8(r1, null64);
    l1=_mm_unpackhi_pi8(l1, null64);
    r1=_mm_unpackhi_pi8(r1, null64);
    l0=_mm_mullo_pi16 (l0, r0);
    l1=_mm_mullo_pi16 (l1, r1);
    l0=_mm_srli_pi16(l0, 8);
    l1=_mm_srli_pi16(l1, 8);
    l0=_mm_packs_pu16(l0, l1);
    // clamp luma into video-legal range
    l0=_mm_subs_pu8(l0, yuvclamp0);
    l0=_mm_adds_pu8(l0, yuvclamp1);
    l0=_mm_subs_pu8(l0, yuvclamp2);
    leftPix[datasize]=l0;
  }
  _mm_empty();  // clear MMX state for subsequent FP code
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB16.
// Converts planar YV12 (separate Y, U, V planes, chroma subsampled 2x2) into
// packed RGB565.  Processes a 2x8-pixel tile per SIMD iteration (two rows
// share one set of chroma coefficients), with a scalar table-driven tail for
// widths not divisible by 8.  Negative iHeight flips the destination
// vertically.  The g_m*/g_i* multiplier tables and SCALEBITS are defined
// elsewhere in the project; they are presumed to be the usual fixed-point
// BT.601 coefficients — confirm against their definitions.
VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv, r, g, b;
	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;
	// Negative height: bottom-up destination, start at the last row and walk backwards.
	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}
	// Per-row leftovers after the pixels written in the inner loops.
	iDstXDif = iDstXStride - (uWidth * 2);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);
	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)     // two rows per iteration (chroma is shared)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	   | u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	   | v3 v2 v1 v0]
			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);			// remove chroma bias
			v0 = _mm_subs_pi16(v0, g_mSub80);
			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);
			guv0 = _mm_adds_pi16(gu0, gv0);
			// duplicate each coefficient so one value covers two horizontal pixels
			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0
			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0
			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);			// remove luma offset
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);
			// blue = y + bu, scaled, clamped to 8 bits, reduced to 5 bits
			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0 b6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);
			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)
			// green = y - guv, reduced to 6 bits, positioned at bits 5-10
			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);
			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)
			// red = y + rv, reduced to 5 bits, positioned at bits 11-15
			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);
			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)
			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb
			// Write out row 0.
			((PM64) pbDstX)[0] = b0;
			((PM64) pbDstX)[1] = b1;
			// Process for row 1 (same recipe, next Y row, same chroma).
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);
			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0 b6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);
			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)
			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);
			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)
			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);
			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)
			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb
			// Write out row 1.
			((PM64) (pbDstX + iDstXStride))[0] = b0;
			((PM64) (pbDstX + iDstXStride))[1] = b1;
			pbDstX += 16;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}
		// Scalar tail: remaining (uWidth % 8) pixels, two per step, two rows at once,
		// via the g_i* lookup tables.
		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];
			yy = g_iYYTab[pbSrcY[0]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b);
			yy = g_iYYTab[pbSrcY[1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b);
			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b);
			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b);
			pbDstX += 4;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}
		// Advance both rows' worth of pointers.
		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}
	_mm_empty();
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
// Converts planar YV12 into packed 24-bit BGR (byte order blue, green, red as
// written by the scalar tail).  A 2x8-pixel tile is processed per SIMD
// iteration; the 24-bit packing is done with dword stores at overlapping
// 3-byte offsets.  Negative iHeight flips the destination vertically.  The
// g_m*/g_i* tables and SCALEBITS are defined elsewhere in the project.
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv;
	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;
	// Negative height: bottom-up destination.
	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}
	// Per-row leftovers after the pixels written in the inner loops.
	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);
	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)     // two rows per iteration (chroma shared)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	   | u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	   | v3 v2 v1 v0]
			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);			// remove chroma bias
			v0 = _mm_subs_pi16(v0, g_mSub80);
			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);
			guv0 = _mm_adds_pi16(gu0, gv0);
			// duplicate each coefficient to cover two horizontal pixels
			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0
			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0
			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);			// remove luma offset
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);
			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0
			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);
			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);
			// interleave the channels into b,g,r triplets
			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0
			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0
			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6
			// Write out row 0: overlapping dword stores pack 4-byte groups
			// into 3-byte RGB slots (the high pad byte is overwritten).
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0);
			rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1);
			rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2);
			rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3);
			rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);
			// Process for row 1 (same recipe, next Y row, same chroma).
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);
			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0
			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);
			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);
			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0
			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0
			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6
			// Write out row 1.
			*((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0);
			rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1);
			rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2);
			rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3);
			rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);
			pbDstX += 24;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}
		// Scalar tail: remaining (uWidth % 8) pixels, two per step, two rows at once.
		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];
			yy = g_iYYTab[pbSrcY[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);
			yy = g_iYYTab[pbSrcY[1]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);
			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);
			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);
			pbDstX += 6;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}
		// Advance both rows' worth of pointers.
		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}
	_mm_empty();
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Convert YV12 to YUY2. VOID Yv12ToYuy2_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight) { UINT x; INT y; INT iDstXDif; INT iSrcYDif; INT iSrcUvDif; M64 mY0, mY1, mY2, mY3, mU, mV, mUV0, mUV1; if (iHeight < 0) { iHeight = -iHeight; pbDstX += (iHeight - 1) * iDstXStride; iDstXStride = -iDstXStride; } iDstXDif = iDstXStride - (uWidth * 2); iSrcYDif = iSrcYStride - uWidth; iSrcUvDif = iSrcUvStride - (uWidth / 2); for (y = iHeight / 2; y; y--) { for (x = uWidth / 16; x; x--) { mU = ((PM64) pbSrcU)[0]; mV = ((PM64) pbSrcV)[0]; mUV0 = _mm_unpacklo_pi8(mU, mV); mUV1 = _mm_unpackhi_pi8(mU, mV); mY0 = ((PM64) pbSrcY)[0]; mY1 = ((PM64) pbSrcY)[1]; mY2 = ((PM64) (pbSrcY + iSrcYStride))[0]; mY3 = ((PM64) (pbSrcY + iSrcYStride))[1]; ((PM64) pbDstX)[0] = _mm_unpacklo_pi8(mY0, mUV0); ((PM64) pbDstX)[1] = _mm_unpackhi_pi8(mY0, mUV0); ((PM64) pbDstX)[2] = _mm_unpacklo_pi8(mY1, mUV1); ((PM64) pbDstX)[3] = _mm_unpackhi_pi8(mY1, mUV1); ((PM64) (pbDstX + iDstXStride))[0] = _mm_unpacklo_pi8(mY2, mUV0); ((PM64) (pbDstX + iDstXStride))[1] = _mm_unpackhi_pi8(mY2, mUV0); ((PM64) (pbDstX + iDstXStride))[2] = _mm_unpacklo_pi8(mY3, mUV1); ((PM64) (pbDstX + iDstXStride))[3] = _mm_unpackhi_pi8(mY3, mUV1); pbDstX += 32; pbSrcY += 16; pbSrcU += 8; pbSrcV += 8; } for (x = (uWidth & 15) / 2; x; x--) { pbDstX[0] = pbSrcY[0]; pbDstX[1] = pbSrcU[0]; pbDstX[2] = pbSrcY[1]; pbDstX[3] = pbSrcV[0]; pbDstX[iDstXStride + 0] = pbSrcY[iSrcYStride + 0]; pbDstX[iDstXStride + 1] = pbSrcU[0]; pbDstX[iDstXStride + 2] = pbSrcY[iSrcYStride + 1]; pbDstX[iDstXStride + 3] = pbSrcV[0]; pbDstX += 4; pbSrcY += 2; pbSrcU++; pbSrcV++; } pbDstX += iDstXDif + iDstXStride; pbSrcY += iSrcYDif + iSrcYStride; pbSrcU += iSrcUvDif; pbSrcV += iSrcUvDif; } _mm_empty(); }
// Alpha-blend four source column texels over four destination pixels:
//   dest[i] = (dest[i]*bga + mapped(source[i])*fga) >> 8  per RGB channel,
// where bga/fga are 0-255 blend weights.  Uses MMX to process two pixels per
// pack; reads dcol.colormap (file-scope drawing state) via rt_mapcolor.
// The alpha byte lane is masked off by upper8mask, so the output alpha comes
// from the saturated zero lane — NOTE(review): looks like alpha is not
// preserved from dest; confirm callers don't rely on it.
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX (64-bit) constants; alpha lane zeroed, weights broadcast per channel:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);
	// widen each pixel's bytes to words (unpack with itself puts the byte in
	// the high half; upper8mask clears the alpha), weight, add, and >> 8:
	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);
	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	// extract each blended pixel as a 32-bit value
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif

	// Required to reset FP:
	_mm_empty();
}
// Codegen test: _mm_unpackhi_pi8 on two MMX vectors must lower to punpckhbw.
__m64 test78(__m64 a, __m64 b) {
  // CHECK: punpckhbw
  return _mm_unpackhi_pi8(a, b);
}
/* Convert packed UYVY 4:2:2 video into planar 4:2:2 YUV: a full-size Y plane
 * followed by half-width U and V planes, all written to `output`.
 *
 * width  - frame width in pixels (the SIMD loops assume width % 8 == 0)
 * height - frame height in lines
 * shift_picture_down - when non-zero, one line of black is written at the top
 *     of all three planes and the last input line is dropped; used when
 *     preparing video for PAL DV50 encoding, where the picture must move down
 *     one line to become bottom-field-first.
 * input  - packed UYVY source, 2 bytes per pixel
 * output - planar destination, 2 bytes per pixel total
 *
 * Fix over the previous revision: the luma loop computed
 *     m3 = (m << 8) & chroma_mask  and OR-ed it in, then immediately erased
 * that contribution with a second "& luma_mask" (chroma_mask and luma_mask
 * select disjoint bytes), and also applied "& luma_mask" twice.  The dead
 * shift/mask/or work and the now-unused chroma_mask constant are removed;
 * the stored results are bit-identical.
 */
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
    __m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
    const uint8_t *orig_input = input;
    uint8_t *y_comp = output;
    uint8_t *u_comp = output + width * height;
    uint8_t *v_comp = u_comp + (int)((width * height)/2);   // 4:2:2
    int i, j;

    // When preparing video for PAL DV50 encoding, the video must be shifted
    // down by one line to change the field order to be bottom-field-first
    int start_line = 0;
    if (shift_picture_down) {
        memset(y_comp, 0x10, width);        // write one line of black Y
        y_comp += width;
        memset(u_comp, 0x80, width/2);      // write one line of black U,V
        u_comp += width/2;
        memset(v_comp, 0x80, width/2);      // write one line of black U,V
        v_comp += width/2;
        start_line = 1;
    }

    /* Do the y component */
    for (j = start_line; j < height; j++) {
        // Consume 16 bytes of UYVY data per iteration (8 pixels worth)
        for (i = 0; i < width*2; i += 16) {
            // Y bytes sit at the odd offsets of UYVY: shift them down into
            // the low byte of each 16-bit lane, mask, then pack two groups
            // of four words back to eight Y bytes.
            __m64 m0 = _mm_and_si64(_mm_srli_si64(*(__m64 *)input, 8), luma_mask);
            __m64 m1 = _mm_and_si64(_mm_srli_si64(*(__m64 *)(input+8), 8), luma_mask);
            *(__m64 *)y_comp = _mm_packs_pu16(m0, m1);
            y_comp += 8;
            input += 16;
        }
    }

    /* Do the chroma components */
    input = orig_input;
    for (j = start_line; j < height; j++) {
        /* Process every line for yuv 4:2:2 */
        for (i = 0; i < width*2; i += 16) {
            // byte shuffle that gathers the four U bytes (offsets 0,4,8,12)
            // and the four V bytes (offsets 2,6,10,14) into the low dwords
            __m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m3 = _mm_unpacklo_pi8 (m1, m2);   // U0..U3 in low dword
            __m64 m4 = _mm_unpackhi_pi8 (m1, m2);   // V0..V3 in low dword
            memcpy (u_comp, &m3, 4);
            memcpy (v_comp, &m4, 4);
            u_comp += 4;
            v_comp += 4;
            input += 16;
        }
    }
    _mm_empty();        // Clear aliased fp register state
}
// Repack planar 4:2:2 YUV (full-size Y plane followed by half-width U and V
// planes) into interleaved UYVY, 2 bytes per pixel.  When shift_picture_up is
// set, the source is consumed one line lower and the final output line is
// filled with black UYVY — this restores bottom-field-first order when
// decoding PAL DV50.  The SIMD loop emits 16 pixels (32 bytes) per step.
void yuv422_to_uyvy(int width, int height, int shift_picture_up, const uint8_t *input, uint8_t *output)
{
    const uint8_t *src_y = input;
    const uint8_t *src_u = input + width * height;
    const uint8_t *src_v = input + width * height * 3/2;
    int row, col, first_row = 0;

    // Skipping one input line shifts the whole picture up by one line.
    if (shift_picture_up) {
        src_y += width;
        src_u += width / 2;
        src_v += width / 2;
        first_row = 1;
    }

    // Convert to UYVY
    for (row = first_row; row < height; row++) {
        for (col = 0; col < width * 2; col += 32) {
            __m64 u8 = *(__m64 *)src_u;                 // UUUU UUUU
            __m64 v8 = *(__m64 *)src_v;                 // VVVV VVVV
            __m64 uv_lo = _mm_unpacklo_pi8(u8, v8);     // UVUV UVUV (0)
            __m64 uv_hi = _mm_unpackhi_pi8(u8, v8);     // UVUV UVUV (8)
            __m64 y_lo = *(__m64 *)src_y;               // YYYY YYYY (0)
            __m64 y_hi = *(__m64 *)(src_y + 8);         // YYYY YYYY (8)
            // interleave chroma pairs with luma -> UYVY quads
            *(__m64 *)(output + 0)  = _mm_unpacklo_pi8(uv_lo, y_lo);
            *(__m64 *)(output + 8)  = _mm_unpackhi_pi8(uv_lo, y_lo);
            *(__m64 *)(output + 16) = _mm_unpacklo_pi8(uv_hi, y_hi);
            *(__m64 *)(output + 24) = _mm_unpackhi_pi8(uv_hi, y_hi);
            src_u += 8;
            src_v += 8;
            src_y += 16;
            output += 32;
        }
    }
    _mm_empty();        // Clear aliased fp register state

    if (shift_picture_up) {
        // Fill bottom line with one line of black UYVY
        for (col = 0; col < width * 2; col += 4) {
            *output++ = 0x80;
            *output++ = 0x10;
            *output++ = 0x80;
            *output++ = 0x10;
        }
    }
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to RGB24.
// Converts packed YUY2 (Y0 U0 Y1 V0 ...) into packed 24-bit BGR.  Unlike the
// YV12 converters there is no vertical chroma sharing, so one row is handled
// per outer iteration; 8 pixels per SIMD step, scalar tail for uWidth % 8.
// Negative iHeight flips the source vertically.  The g_m*/g_i* fixed-point
// tables and SCALEBITS are defined elsewhere in the project.
VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcXDif;
	INT yy, bu, guv, rv;
	M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;
	// Negative height: walk the source bottom-up.
	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}
	// Per-row leftovers after the pixels handled in the inner loops.
	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcXDif = iSrcXStride - (uWidth * 2);
	mz = _mm_setzero_si64();
	for (y = iHeight; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Load 8 YUY2 pixels and separate Y from interleaved U/V.
			y0 = ((PM64) pbSrcX)[0];
			y1 = ((PM64) pbSrcX)[1];
			u0 = y0;
			v0 = y1;
			y0 = _mm_and_si64(y0, g_mWord00FF);		// keep the Y bytes
			y1 = _mm_and_si64(y1, g_mWord00FF);
			u0 = _mm_srli_pi16(u0, 8);				// keep the chroma bytes
			v0 = _mm_srli_pi16(v0, 8);
			// de-interleave U and V (they alternate within each dword)
			uv_temp1 = _mm_srli_pi32(u0, 16);
			u0 = _mm_slli_pi32(u0, 16);
			u0 = _mm_srli_pi32(u0, 16);
			uv_temp2 = _mm_srli_pi32(v0, 16);
			v0 = _mm_slli_pi32(v0, 16);
			v0 = _mm_srli_pi32(v0, 16);
			u0 = _mm_packs_pi32(u0, v0);			// u3 u2 u1 u0
			v0 = _mm_packs_pi32(uv_temp1, uv_temp2);	// v3 v2 v1 v0
			// Calculate coefficient.
			u0 = _mm_subs_pi16(u0, g_mSub80);		// remove chroma bias
			v0 = _mm_subs_pi16(v0, g_mSub80);
			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);
			guv0 = _mm_adds_pi16(gu0, gv0);
			// duplicate each coefficient to cover two horizontal pixels
			guv1 = _mm_unpackhi_pi16(guv0, guv0);	// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);	// guv1 guv1 guv0 guv0
			bu1 = _mm_unpackhi_pi16(bu0, bu0);		// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);		// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);		// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);		// rv1 rv1 rv0 rv0
			// Process the row: scale luma, add/subtract chroma contributions.
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);
			g1 = _mm_subs_pi16(y1, guv1);			// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);			// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);			// g7 g6 ...g1 g0
			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);
			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);
			// interleave the channels into b,g,r triplets
			r1 = _mm_unpackhi_pi8(b0, r0);			// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);			// r3 b3 r2 b2 r1 b1 r0 b0
			g1 = _mm_unpackhi_pi8(g0, mz);			// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);			// 0 g3 0 g2 0 g1 0 g0
			rgb0 = _mm_unpacklo_pi8(r0, g0);		// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);		// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);		// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);		// 0 r7 g7 b7 0 r6 g6 b6
			// Write out the row: overlapping dword stores pack 4-byte groups
			// into 3-byte RGB slots (the pad byte is overwritten next store).
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0);
			rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1);
			rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2);
			rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3);
			rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);
			pbDstX += 24;
			pbSrcX += 16;
		}
		// Scalar tail: two pixels (one YUY2 quad) per step via lookup tables.
		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcX[1]];
			guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]];
			rv = g_iRVTab[pbSrcX[3]];
			yy = g_iYYTab[pbSrcX[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);
			yy = g_iYYTab[pbSrcX[2]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);
			pbDstX += 6;
			pbSrcX += 4;
		}
		pbDstX += iDstXDif;
		pbSrcX += iSrcXDif;
	}
	_mm_empty();
}
/* Fill a surface with a gradient which is generated by bilinearly interpolating between four corner color values. Can take a source surface and multiply it into the gradient, but if 'src' is NULL, it will generate the gradient without multiplying */ static void fillBlend( SDL_Surface *dst, SDL_Surface *src, BROGUE_DRAW_COLOR *color) { int x, y; int lr, lg, lb, rr, rg, rb; int ldr, ldg, ldb, rdr, rdg, rdb; int w, h; BROGUE_DRAW_COLOR ul = color[0]; BROGUE_DRAW_COLOR ur = color[1]; BROGUE_DRAW_COLOR bl = color[2]; BROGUE_DRAW_COLOR br = color[3]; #if defined(__MMX__) int mmx = SDL_HasMMX(); #endif w = dst->w; h = dst->h; if (src != NULL) { assert(dst->w == src->w); assert(dst->h == src->h); } lr = clamp(ul.red * 0xFFFF, 0, 0xFFFF); lg = clamp(ul.green * 0xFFFF, 0, 0xFFFF); lb = clamp(ul.blue * 0xFFFF, 0, 0xFFFF); rr = clamp(ur.red * 0xFFFF, 0, 0xFFFF); rg = clamp(ur.green * 0xFFFF, 0, 0xFFFF); rb = clamp(ur.blue * 0xFFFF, 0, 0xFFFF); ldr = (clamp(bl.red * 0xFFFF, 0, 0xFFFF) - lr) / h; ldg = (clamp(bl.green * 0xFFFF, 0, 0xFFFF) - lg) / h; ldb = (clamp(bl.blue * 0xFFFF, 0, 0xFFFF) - lb) / h; rdr = (clamp(br.red * 0xFFFF, 0, 0xFFFF) - rr) / h; rdg = (clamp(br.green * 0xFFFF, 0, 0xFFFF) - rg) / h; rdb = (clamp(br.blue * 0xFFFF, 0, 0xFFFF) - rb) / h; for (y = 0; y < h; y++) { unsigned char *pix; int dr, dg, db; int rpp, gpp, bpp, raccum, gaccum, baccum; pix = (unsigned char *)dst->pixels + dst->pitch * y; dr = rr - lr; dg = rg - lg; db = rb - lb; rpp = dr / w; gpp = dg / w; bpp = db / w; raccum = lr; gaccum = lg; baccum = lb; lr += ldr; lg += ldg; lb += ldb; rr += rdr; rg += rdg; rb += rdb; if (src != NULL) { unsigned char *src_pix = (unsigned char *)src->pixels + src->pitch * y; x = w; #if defined(__MMX__) /* MMX is significantly faster. 
Use it if the CPU supports it */ if (mmx) { __m64 mmx_zero = _m_from_int(0); long long ll_color = ((long long)0xFFFF << 48) | ((long long)raccum << 32) | ((long long)gaccum << 16) | ((long long)baccum); __m64 mmx_color = *(__m64 *)&ll_color; long long ll_pp = ((long long)(rpp & 0xFFFF) << 32) | ((long long)(gpp & 0xFFFF) << 16) | ((long long)(bpp & 0xFFFF)); __m64 mmx_pp = *(__m64 *)&ll_pp; while (x >= 2) { __m64 src_pair = *(__m64 *)src_pix; /* Separate the left pixel and right pixel */ __m64 left_pix = _mm_unpacklo_pi8(src_pair, mmx_zero); __m64 right_pix = _mm_unpackhi_pi8(src_pair, mmx_zero); /* Multiply the left source by the gradient color */ left_pix = _mm_mullo_pi16(left_pix, _mm_srli_pi16(mmx_color, 8)); /* Advance the gradient color for the next pixel */ mmx_color = _mm_add_pi16(mmx_color, mmx_pp); /* Multiply the right source by the gradient color */ right_pix = _mm_mullo_pi16(right_pix, _mm_srli_pi16(mmx_color, 8)); /* Advance the gradient */ mmx_color = _mm_add_pi16(mmx_color, mmx_pp); /* Recombine the pixels */ __m64 result_pix = _mm_packs_pu16( _mm_srli_pi16(left_pix, 8), _mm_srli_pi16(right_pix, 8)); *(__m64 *)pix = result_pix; src_pix += 8; pix += 8; x -= 2; } /* Extract the accumulated gradient value for the potential odd remaining pixel */ short *s_color = (short *)&mmx_color; raccum = s_color[2]; gaccum = s_color[1]; baccum = s_color[0]; } #endif /* The equivalent slow loop for odd pixels or CPUs without MMX */ while (x > 0) { #if SDL_BYTEORDER == SDL_LIL_ENDIAN pix[3] = src_pix[3]; pix[2] = (src_pix[2] * raccum) >> 16; pix[1] = (src_pix[1] * gaccum) >> 16; pix[0] = (src_pix[0] * baccum) >> 16; #else pix[0] = src_pix[0]; pix[1] = (src_pix[1] * raccum) >> 16; pix[2] = (src_pix[2] * gaccum) >> 16; pix[3] = (src_pix[3] * baccum) >> 16; #endif raccum += rpp; gaccum += gpp; baccum += bpp; src_pix += 4; pix += 4; x--; } } else {
// Demo/diagnostic: widen 8 unsigned chars to shorts with MMX unpacks, convert
// each 4-short half to floats, and print both representations.
// FLOAT_ARRAYSIZE / SHORT_ARRAYSIZE are project macros; _mm_cvtpu16_ps1 is
// presumably a local wrapper around _mm_cvtpu16_ps (see the "bug" note
// below) — confirm its definition.
void test_char_to_float(void)
{
  __m64 narrow = _mm_set_pi8(0, 1, 2, 3, 252, 253, 254, 255);
  __m64 zero = _mm_setzero_si64();
  __m64 loshorts, hishorts;
  __m128 lofloats, hifloats;
  float lofloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
  float hifloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
  int16_t shortarray[SHORT_ARRAYSIZE] __attribute__ ((aligned (16)));
  int i;
  /* interleave zero with narrow and return halves: essentially widening */
  /* elements from unsigned chars to unsigned shorts */
  loshorts = _mm_unpacklo_pi8(narrow, zero);
  hishorts = _mm_unpackhi_pi8(narrow, zero);
  /* now turn the 4 shorts in loshorts into floats and store them in lofloats */
  /* likewise hishorts into hifloats. */
  /* bug in _mm_cvtpi16_ps ? */
  lofloats = _mm_cvtpu16_ps1(loshorts);
  hifloats = _mm_cvtpu16_ps1(hishorts);
  _mm_store_ps(lofloatarray, lofloats);
  _mm_store_ps(hifloatarray, hifloats);
  /* we used SSE1 instructions that used __m64: add an _mm_empty */
  _mm_empty();
  /* now store loshorts in shortarray and lofloats into lofloatarray */
  /* and print them. */
  memcpy(shortarray, &loshorts, sizeof(shortarray));
  fprintf(stderr, "loshorts ");
  for(i= 0; i < SHORT_ARRAYSIZE; i++)
  {
    fprintf(stderr, "%d ", shortarray[i]);
  }
  fprintf(stderr, "\n");
  fprintf(stderr, "lofloats ");
  for(i= 0; i < FLOAT_ARRAYSIZE; i++)
  {
    fprintf(stderr, "%f ", lofloatarray[i]);
  }
  fprintf(stderr, "\n");
  /* now store hishorts in shortarray and hifloats into lofloatarray */
  /* and print them. */
  memcpy(shortarray, &hishorts, sizeof(shortarray));
  fprintf(stderr, "hishorts ");
  for(i= 0; i < SHORT_ARRAYSIZE; i++)
  {
    fprintf(stderr, "%d ", shortarray[i]);
  }
  fprintf(stderr, "\n");
  fprintf(stderr, "hifloats ");
  for(i= 0; i < FLOAT_ARRAYSIZE; i++)
  {
    fprintf(stderr, "%f ", hifloatarray[i]);
  }
  fprintf(stderr, "\n");
}
// IR codegen test: _mm_unpackhi_pi8 must lower to the llvm.x86.mmx.punpckhbw
// intrinsic call.
__m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_unpackhi_pi8
  // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw
  return _mm_unpackhi_pi8(a, b);
}