/**
 * Alpha-compose two ARGB32 images with SSE2, four pixels per iteration.
 *
 * Per channel the computation performed below is:
 *     dst = src2 + (((src1 - src2) * (alpha1 + 1)) >> 8)
 * where alpha1 is the channel of the pSrc1 pixel selected by the 0xff
 * shuffles (the alpha channel, per the register diagrams), the subtraction
 * saturates (_mm_subs_epi16) and the shift is arithmetic (_mm_srai_epi16).
 *
 * pSrc1/src1Step : first (foreground) image and its stride in bytes
 * pSrc2/src2Step : second (background) image and its stride in bytes
 * pDst/dstStep   : destination image and its stride in bytes
 * width/height   : dimensions in pixels
 *
 * Narrow images (< 4 px), rows whose destination can never reach 16-byte
 * alignment, and per-row lead-in/remainder pixels are delegated to the
 * generic (scalar) implementation.
 */
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1, UINT32 src1Step,
    const BYTE* pSrc2, UINT32 src2Step,
    BYTE* pDst, UINT32 dstStep,
    UINT32 width, UINT32 height)
{
    const UINT32* sptr1 = (const UINT32*) pSrc1;
    const UINT32* sptr2 = (const UINT32*) pSrc2;
    UINT32* dptr;
    int linebytes, src1Jump, src2Jump, dstJump;
    UINT32 y;
    __m128i xmm0, xmm1;

    /* Nothing to do for an empty surface. */
    if ((width <= 0) || (height <= 0))
        return PRIMITIVES_SUCCESS;

    if (width < 4) /* pointless if too small */
    {
        return generic->alphaComp_argb(pSrc1, src1Step,
            pSrc2, src2Step, pDst, dstStep, width, height);
    }

    dptr = (UINT32*) pDst;
    linebytes = width * sizeof(UINT32);
    /* Per-row jumps, in UINT32 units, from the end of one row's pixel
     * data to the start of the next row. */
    src1Jump = (src1Step - linebytes) / sizeof(UINT32);
    src2Jump = (src2Step - linebytes) / sizeof(UINT32);
    dstJump = (dstStep - linebytes) / sizeof(UINT32);
    xmm0 = _mm_set1_epi32(0);   /* zero; used for byte->word unpacking */
    xmm1 = _mm_set1_epi16(1);   /* added to alpha before the multiply */

    for (y = 0; y < height; ++y)
    {
        int pixels = width;
        int count;
        /* Get to the 16-byte boundary now. */
        int leadIn = 0;

        switch ((ULONG_PTR) dptr & 0x0f)
        {
            case 0:
                leadIn = 0;
                break;

            case 4:
                leadIn = 3;
                break;

            case 8:
                leadIn = 2;
                break;

            case 12:
                leadIn = 1;
                break;

            default:
                /* We'll never hit a 16-byte boundary, so do the whole
                 * thing the slow way. */
                leadIn = width;
                break;
        }

        if (leadIn)
        {
            pstatus_t status;
            status = generic->alphaComp_argb((const BYTE*) sptr1,
                src1Step, (const BYTE*) sptr2, src2Step,
                (BYTE*) dptr, dstStep, leadIn, 1);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr1 += leadIn;
            sptr2 += leadIn;
            dptr += leadIn;
            pixels -= leadIn;
        }

        /* Use SSE registers to do 4 pixels at a time. */
        count = pixels >> 2;
        pixels -= count << 2;

        while (count--)
        {
            __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
            /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
            xmm2 = LOAD_SI128(sptr1);
            sptr1 += 4;
            /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
            xmm3 = LOAD_SI128(sptr2);
            sptr2 += 4;
            /* The two high pixels of each load are blended first
             * (xmm4/xmm5/xmm6), then the two low pixels (xmm5/xmm6/xmm7);
             * note xmm5 and xmm6 are recycled between the halves. */
            /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
            xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
            /* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
            xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
            /* subtract */
            xmm6 = _mm_subs_epi16(xmm4, xmm5);
            /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
            /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
            xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
            /* Add one to alphas */
            xmm4 = _mm_adds_epi16(xmm4, xmm1);
            /* Multiply and take low word */
            xmm4 = _mm_mullo_epi16(xmm4, xmm6);
            /* Shift 8 right */
            xmm4 = _mm_srai_epi16(xmm4, 8);
            /* Add xmm5 */
            xmm4 = _mm_adds_epi16(xmm4, xmm5);
            /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
            /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
            xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
            /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
            xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
            /* subtract */
            xmm7 = _mm_subs_epi16(xmm5, xmm6);
            /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
            /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
            xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
            /* Add one to alphas */
            xmm5 = _mm_adds_epi16(xmm5, xmm1);
            /* Multiply and take low word */
            xmm5 = _mm_mullo_epi16(xmm5, xmm7);
            /* Shift 8 right */
            xmm5 = _mm_srai_epi16(xmm5, 8);
            /* Add xmm6 */
            xmm5 = _mm_adds_epi16(xmm5, xmm6);
            /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
            /* Must mask off remainders or pack gets confused */
            xmm3 = _mm_set1_epi16(0x00ffU);
            xmm4 = _mm_and_si128(xmm4, xmm3);
            xmm5 = _mm_and_si128(xmm5, xmm3);
            /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
            xmm5 = _mm_packus_epi16(xmm5, xmm4);
            /* dptr is 16-byte aligned here (the lead-in above guarantees
             * it), so the aligned store is safe. */
            _mm_store_si128((__m128i*) dptr, xmm5);
            dptr += 4;
        }

        /* Finish off the remainder. */
        if (pixels)
        {
            pstatus_t status;
            status = generic->alphaComp_argb((const BYTE*) sptr1,
                src1Step, (const BYTE*) sptr2, src2Step,
                (BYTE*) dptr, dstStep, pixels, 1);

            if (status != PRIMITIVES_SUCCESS)
                return status;

            sptr1 += pixels;
            sptr2 += pixels;
            dptr += pixels;
        }

        /* Jump to next row. */
        sptr1 += src1Jump;
        sptr2 += src2Jump;
        dptr += dstJump;
    }

    return PRIMITIVES_SUCCESS;
}
/* ------------------------------------------------------------------------- */
/**
 * Compute the sign (-1, 0, or +1) of each INT16 element with SSSE3.
 *
 * pSrc : input array of INT16
 * pDst : output array of INT16 (one sign value per input element)
 * len  : number of elements
 *
 * Short runs and destinations that can never become 16-byte aligned are
 * delegated to general_sign_16s.  Otherwise: a scalar prologue aligns the
 * destination, a wide loop handles 32 shorts per pass, a medium loop
 * handles 8 shorts per pass, and a scalar epilogue finishes the tail.
 */
pstatus_t ssse3_sign_16s(
    const INT16 *pSrc, INT16 *pDst, INT32 len)
{
    const INT16 *src = (const INT16 *) pSrc;
    INT16 *dst = (INT16 *) pDst;
    size_t chunks;

    /* Vector setup is not worth it for short buffers. */
    if (len < 16)
    {
        return general_sign_16s(pSrc, pDst, len);
    }

    /* An odd destination address can never reach a 16-byte boundary by
     * stepping in INT16 units, so hand those off to the generic code. */
    if ((ULONG_PTR) pDst & 0x01)
    {
        return general_sign_16s(pSrc, pDst, len);
    }

    /* Scalar prologue: advance until the destination is 16-byte aligned. */
    while ((ULONG_PTR) dst & 0x0f)
    {
        INT16 v = *src++;
        *dst++ = (v < 0) ? (-1) : ((v > 0) ? 1 : 0);

        if (--len == 0)
            return PRIMITIVES_SUCCESS;
    }

    /* Wide loop: 32 shorts per pass across eight XMM registers.
     * _mm_sign_epi16(1, x) yields -1/0/+1 matching the sign of x. */
    chunks = len >> 5;   /* / 32 */
    len -= chunks << 5;  /* * 32 */

    if ((ULONG_PTR) src & 0x0f)
    {
        /* Source is unaligned: use lddqu loads. */
        for (; chunks > 0; chunks--)
        {
            __m128i one0 = _mm_set1_epi16(0x0001U);
            __m128i one1 = _mm_set1_epi16(0x0001U);
            __m128i one2 = _mm_set1_epi16(0x0001U);
            __m128i one3 = _mm_set1_epi16(0x0001U);
            __m128i in0, in1, in2, in3;
            in0 = _mm_lddqu_si128((__m128i *) src);
            src += 8;
            in1 = _mm_lddqu_si128((__m128i *) src);
            src += 8;
            in2 = _mm_lddqu_si128((__m128i *) src);
            src += 8;
            in3 = _mm_lddqu_si128((__m128i *) src);
            src += 8;
            one0 = _mm_sign_epi16(one0, in0);
            one1 = _mm_sign_epi16(one1, in1);
            one2 = _mm_sign_epi16(one2, in2);
            one3 = _mm_sign_epi16(one3, in3);
            _mm_store_si128((__m128i *) dst, one0);
            dst += 8;
            _mm_store_si128((__m128i *) dst, one1);
            dst += 8;
            _mm_store_si128((__m128i *) dst, one2);
            dst += 8;
            _mm_store_si128((__m128i *) dst, one3);
            dst += 8;
        }
    }
    else
    {
        /* Source is aligned: plain aligned loads. */
        for (; chunks > 0; chunks--)
        {
            __m128i one0 = _mm_set1_epi16(0x0001U);
            __m128i one1 = _mm_set1_epi16(0x0001U);
            __m128i one2 = _mm_set1_epi16(0x0001U);
            __m128i one3 = _mm_set1_epi16(0x0001U);
            __m128i in0, in1, in2, in3;
            in0 = _mm_load_si128((__m128i *) src);
            src += 8;
            in1 = _mm_load_si128((__m128i *) src);
            src += 8;
            in2 = _mm_load_si128((__m128i *) src);
            src += 8;
            in3 = _mm_load_si128((__m128i *) src);
            src += 8;
            one0 = _mm_sign_epi16(one0, in0);
            one1 = _mm_sign_epi16(one1, in1);
            one2 = _mm_sign_epi16(one2, in2);
            one3 = _mm_sign_epi16(one3, in3);
            _mm_store_si128((__m128i *) dst, one0);
            dst += 8;
            _mm_store_si128((__m128i *) dst, one1);
            dst += 8;
            _mm_store_si128((__m128i *) dst, one2);
            dst += 8;
            _mm_store_si128((__m128i *) dst, one3);
            dst += 8;
        }
    }

    /* Medium loop: 8 shorts per pass using two XMM registers. */
    chunks = len >> 3;
    len -= chunks << 3;

    for (; chunks > 0; chunks--)
    {
        __m128i ones = _mm_set1_epi16(0x0001U);
        __m128i in = LOAD_SI128(src);
        src += 8;
        ones = _mm_sign_epi16(ones, in);
        _mm_store_si128((__m128i *) dst, ones);
        dst += 8;
    }

    /* Scalar epilogue. */
    while (len--)
    {
        INT16 v = *src++;
        *dst++ = (v < 0) ? -1 : ((v > 0) ? 1 : 0);
    }

    return PRIMITIVES_SUCCESS;
}