Example #1
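Both examples rely on a LOAD_SI128 macro that neither listing defines. In FreeRDP it wraps a 128-bit vector load; a minimal stand-in, assuming unaligned-safe semantics (an assumption for illustration, not necessarily the library's actual definition), would be:

#include <emmintrin.h>

/* Assumed stand-in for FreeRDP's LOAD_SI128: an unaligned-safe 128-bit load. */
#define LOAD_SI128(_ptr_) _mm_loadu_si128((const __m128i*)(_ptr_))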
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1,  UINT32 src1Step,
    const BYTE* pSrc2,  UINT32 src2Step,
    BYTE* pDst,  UINT32 dstStep,
    UINT32 width,  UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*) pSrc1;
	const UINT32* sptr2 = (const UINT32*) pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 y;
	__m128i xmm0, xmm1;

	if ((width == 0) || (height == 0)) return PRIMITIVES_SUCCESS;

	if (width < 4)     /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
					       pDst, dstStep, width, height);
	}

	dptr = (UINT32*) pDst;
	linebytes = width * sizeof(UINT32);
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
	xmm0 = _mm_set1_epi32(0);
	xmm1 = _mm_set1_epi16(1);

	for (y = 0; y < height; ++y)
	{
		int pixels = width;
		int count;
		/* Get to the 16-byte boundary now. */
		int leadIn = 0;

		switch ((ULONG_PTR) dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1,
						src1Step, (const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr  += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
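		/* Per pixel, the loop below computes, in 16-bit lanes,
		 *   dst = src2 + (((src1 - src2) * (alpha(src1) + 1)) >> 8)
		 * i.e. an alpha blend driven by the first source's alpha channel,
		 * applied to all four channels of four pixels at once.
		 */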
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5 */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* Must mask off remainders or pack gets confused */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			_mm_store_si128((__m128i*) dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
						(const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr  += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr  += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}
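
For reference, the arithmetic the SSE2 loop vectorizes reduces to the scalar blend sketched below. This is inferred from the intrinsic sequence above, not FreeRDP's generic fallback; the helper name is hypothetical, and the vector code evaluates the product in 16-bit lanes, so extreme alpha/difference combinations can wrap where this 32-bit version saturates.

/* Hypothetical scalar equivalent of one blended pixel (a sketch, not the
 * FreeRDP generic implementation). Alpha is taken from the first source;
 * the pixel is assumed to be a 0xAARRGGBB-style UINT32.
 */
static UINT32 alphaComp_argb_pixel(UINT32 s1, UINT32 s2)
{
	const INT32 alpha = (INT32)((s1 >> 24) & 0xff);
	UINT32 out = 0;
	int shift;

	for (shift = 0; shift < 32; shift += 8)
	{
		const INT32 c1 = (INT32)((s1 >> shift) & 0xff);
		const INT32 c2 = (INT32)((s2 >> shift) & 0xff);
		/* dst = src2 + (((src1 - src2) * (alpha + 1)) >> 8) */
		INT32 c = c2 + (((c1 - c2) * (alpha + 1)) >> 8);

		if (c < 0) c = 0;
		if (c > 255) c = 255;

		out |= ((UINT32) c) << shift;
	}

	return out;
}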
Example #2
/* ------------------------------------------------------------------------- */
pstatus_t ssse3_sign_16s(
	const INT16 *pSrc,
	INT16 *pDst,
	INT32 len)
{
	const INT16 *sptr = (const INT16 *) pSrc;
	INT16 *dptr = (INT16 *) pDst;
	size_t count;

	if (len < 16)     /* pointless if too small */
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* An odd destination pointer can never reach 16-byte alignment by
	 * stepping in 16-bit units, so take the generic path instead. */
	if ((ULONG_PTR) pDst & 0x01)
	{
		return general_sign_16s(pSrc, pDst, len);
	}

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);
		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	/* Do 32-short chunks using 8 XMM registers. */
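	/* _mm_sign_epi16(a, b) yields -a, 0, or a in each 16-bit lane as the
	 * corresponding lane of b is negative, zero, or positive, so seeding
	 * a with 1 produces sign(b) directly.
	 */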
	count = len >> 5;	/* / 32  */
	len -= count << 5;	/* * 32 */
	if ((ULONG_PTR) sptr & 0x0f)
	{
		/* Unaligned */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
			xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
			xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
			xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
		}
	}
	else
	{
		/* Aligned */
		while (count--)
		{
			__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			xmm0 = _mm_set1_epi16(0x0001U);
			xmm1 = _mm_set1_epi16(0x0001U);
			xmm2 = _mm_set1_epi16(0x0001U);
			xmm3 = _mm_set1_epi16(0x0001U);
			xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8;
			xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8;
			xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8;
			xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8;
			xmm0 = _mm_sign_epi16(xmm0, xmm4);
			xmm1 = _mm_sign_epi16(xmm1, xmm5);
			xmm2 = _mm_sign_epi16(xmm2, xmm6);
			xmm3 = _mm_sign_epi16(xmm3, xmm7);
			_mm_store_si128((__m128i *) dptr, xmm0);         dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm1);         dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm2);         dptr += 8;
			_mm_store_si128((__m128i *) dptr, xmm3);         dptr += 8;
		}
	}

	/* Do 8-short chunks using two XMM registers. */
	count = len >> 3;
	len -= count << 3;
	while (count--)
	{
		__m128i xmm0 = _mm_set1_epi16(0x0001U);
		__m128i xmm1 = LOAD_SI128(sptr);					sptr += 8;
		xmm0 = _mm_sign_epi16(xmm0, xmm1);
		_mm_store_si128((__m128i *) dptr, xmm0);			dptr += 8;
	}

	/* Do leftovers. */
	while (len--)
	{
		INT16 src = *sptr++;
		*dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
	}

	return PRIMITIVES_SUCCESS;
}
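
A minimal caller sketch for the second example, assuming the WinPR typedefs (INT16, pstatus_t) and PRIMITIVES_SUCCESS are in scope; the GCC/Clang alignment attribute, buffer size, and values are illustrative:

#include <stdio.h>

int main(void)
{
	/* A 16-byte aligned destination lets the aligned store path run from the start. */
	__attribute__((aligned(16))) INT16 src[64];
	__attribute__((aligned(16))) INT16 dst[64];
	int i;

	for (i = 0; i < 64; i++)
		src[i] = (INT16)(i - 32);	/* negative, zero, and positive inputs */

	if (ssse3_sign_16s(src, dst, 64) != PRIMITIVES_SUCCESS)
		return 1;

	/* dst[i] is now -1, 0, or +1 according to the sign of src[i]. */
	printf("%d %d %d\n", dst[0], dst[32], dst[63]);	/* prints: -1 0 1 */
	return 0;
}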