static void
ax203_decode_block_yuv(char *src, int **dest, int dest_x, int dest_y)
{
	int x, y, r, g, b;
	uint8_t Y[4];
	int8_t U, V;

	/* The compressed data consists of blocks of 2x2 pixels, each encoded
	   in 4 bytes The highest 5 bits of each byte encode an Y value, the
	   lowest 3 bits of the first 2 bytes are combined to form U, and of
	   the last 2 bytes to form V */
	for (x = 0; x < 4; x++)
		Y[x] = src[x] & 0xF8;

	U = ((src[0] & 7) << 5) | ((src[1] & 7) << 2);
	V = ((src[2] & 7) << 5) | ((src[3] & 7) << 2);

	/* The Y components are for a 2x2 pixels block like this:
	   1 2
	   3 4  */
	for (y = 0; y < 2; y ++) {
		for (x = 0; x < 2; x ++) {
			r = 1.164 * (Y[y * 2 + x] - 16) + 1.596 * V;
			g = 1.164 * (Y[y * 2 + x] - 16) - 0.391 * U - 0.813 * V;
			b = 1.164 * (Y[y * 2 + x] - 16) + 2.018 * U;
			dest[dest_y + y][dest_x + x] =
				gdTrueColor (CLAMP_U8 (r),
					     CLAMP_U8 (g),
					     CLAMP_U8 (b));
		}
	}
}
mlib_status
__mlib_ImageThresh1(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *thresh,
    const mlib_s32 *ghigh,
    const mlib_s32 *glow)
{
    mlib_s32 nchan, width, height, sstride, dstride;
    void *sdata, *ddata;
    mlib_type type;
    mlib_d64 cnst[6];
    mlib_s32 i, t_sh, algn;

    MLIB_IMAGE_CHECK(src);
    MLIB_IMAGE_CHECK(dst);
    MLIB_IMAGE_SIZE_EQUAL(src, dst);
    MLIB_IMAGE_CHAN_EQUAL(src, dst);
    MLIB_IMAGE_TYPE_DSTBIT_OR_EQ(src, dst);

    if (thresh == NULL)
        return (MLIB_NULLPOINTER);
    if (ghigh == NULL)
        return (MLIB_NULLPOINTER);
    if (glow == NULL)
        return (MLIB_NULLPOINTER);

    MLIB_IMAGE_GET_ALL_PARAMS(dst, type, nchan, width, height, dstride,
                              ddata);
    sstride = mlib_ImageGetStride(src);
    sdata = mlib_ImageGetData(src);

    if (type == MLIB_BIT) {
        return (mlib_ImageThresh1B(dst, src, thresh, ghigh, glow));

    } else if (type == MLIB_BYTE) {
        mlib_u8 *pcol1 = (void *)cnst;
        mlib_u8 *pcol2 = (void *)(cnst + 2);
        mlib_u8 *pcol3 = (void *)(cnst + 4);

        t_sh = 0;

        for (i = 0; i < nchan; i++) {
            pcol1[i] = CLAMP_U8(thresh[i]);
            pcol2[i] = ghigh[i];
            pcol3[i] = glow[i];

            if (thresh[i] < MLIB_U8_MIN)
                pcol3[i] = pcol2[i];
        }

        for (i = nchan; i < 16; i++) {
            pcol1[i] = pcol1[i - nchan];
            pcol2[i] = pcol2[i - nchan];
            pcol3[i] = pcol3[i - nchan];
        }

    } else if (type == MLIB_SHORT) {
        mlib_s16 *pcol1 = (void *)cnst;
        mlib_s16 *pcol2 = (void *)(cnst + 2);
        mlib_s16 *pcol3 = (void *)(cnst + 4);

        t_sh = 1;

        for (i = 0; i < nchan; i++) {
            pcol1[i] = CLAMP_S16(thresh[i]);
            pcol2[i] = ghigh[i];
            pcol3[i] = glow[i];

            if (thresh[i] < MLIB_S16_MIN)
                pcol3[i] = pcol2[i];
        }

        for (i = nchan; i < 8; i++) {
            pcol1[i] = pcol1[i - nchan];
            pcol2[i] = pcol2[i - nchan];
            pcol3[i] = pcol3[i - nchan];
        }

    } else if (type == MLIB_USHORT) {
        mlib_s16 *pcol1 = (void *)cnst;
        mlib_s16 *pcol2 = (void *)(cnst + 2);
        mlib_s16 *pcol3 = (void *)(cnst + 4);

        t_sh = 1;

        for (i = 0; i < nchan; i++) {
            pcol1[i] = CLAMP_U16(thresh[i]);
            pcol2[i] = ghigh[i];
            pcol3[i] = glow[i];

            if (thresh[i] < MLIB_U16_MIN)
                pcol3[i] = pcol2[i];
        }

        for (i = nchan; i < 8; i++) {
            pcol1[i] = pcol1[i - nchan];
            pcol2[i] = pcol2[i - nchan];
            pcol3[i] = pcol3[i - nchan];
        }

    } else {
        mlib_s32 *pcol1 = (void *)cnst;
        mlib_s32 *pcol2 = (void *)(cnst + 2);
        mlib_s32 *pcol3 = (void *)(cnst + 4);

        t_sh = 2;

        for (i = 0; i < nchan; i++) {
            pcol1[i] = thresh[i];
            pcol2[i] = ghigh[i];
            pcol3[i] = glow[i];
        }

        for (i = nchan; i < 4; i++) {
            pcol1[i] = pcol1[i - nchan];
            pcol2[i] = pcol2[i - nchan];
            pcol3[i] = pcol3[i - nchan];
        }
    }

    if (sstride == dstride && sstride == ((nchan * width) << t_sh)) {
        width *= height;
        height = 1;
    }

    algn = (int)sdata | (int)ddata;

    if (height > 1)
        algn |= (sstride | dstride);
    algn &= 7;

    if (nchan == 3)
        algn = 1;

    sstride >>= t_sh;
    dstride >>= t_sh;

    switch (type) {
    case MLIB_BYTE:
        return mlib_v_ImageThresh1_U8_3(sdata, ddata, sstride, dstride,
                                        width, height, nchan, cnst);

    case MLIB_SHORT:

        if (algn == 0) {
            return mlib_v_ImageThresh1_S16_A(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        } else {
            return mlib_v_ImageThresh1_S16_3(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        }

    case MLIB_USHORT:

        if (algn == 0) {
            return mlib_v_ImageThresh1_U16_A(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        } else {
            return mlib_v_ImageThresh1_U16_3(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        }

    case MLIB_INT:

        if (nchan == 4) {
            return mlib_v_ImageThresh1_S32_2(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        }

        if (algn == 0) {
            return mlib_v_ImageThresh1_S32_A(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        } else {
            return mlib_v_ImageThresh1_S32_3(sdata, ddata, sstride,
                                             dstride, width, height, nchan, cnst);
        }

    default:
        return (MLIB_FAILURE);
    }
}
mlib_status
__mlib_ImageReplaceColor(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *color1,
    const mlib_s32 *color2)
{
	mlib_s32 nchan, width, height, sstride, dstride;
	void *sdata, *ddata;
	mlib_type type;

	__m128i cnst[6];
	mlib_s32 i, t_sh;

	MLIB_IMAGE_CHECK(dst);
	MLIB_IMAGE_CHECK(src);
	MLIB_IMAGE_FULL_EQUAL(dst, src);

	if (color1 == NULL)
		return (MLIB_NULLPOINTER);

	if (color2 == NULL)
		return (MLIB_NULLPOINTER);

	MLIB_IMAGE_GET_ALL_PARAMS(
		dst, type, nchan, width,
		height, dstride, ddata);

	sstride = mlib_ImageGetStride(src);
	sdata = mlib_ImageGetData(src);

	if (type == MLIB_BYTE) {
		mlib_u8 *pcol1 = (void *)cnst;
		mlib_u8 *pcol2 = (void *)(cnst + 3);

		t_sh = 0;

		for (i = 0; i < nchan; i++) {
			pcol1[i] = CLAMP_U8(color1[i]);
			pcol2[i] = color2[i];

			if (pcol1[i] != color1[i])
				pcol2[i] = pcol1[i];
		}

		for (i = nchan; i < 48; i++) {
			pcol1[i] = pcol1[i - nchan];
			pcol2[i] = pcol2[i - nchan];
		}
	} else if (type == MLIB_SHORT) {
		mlib_s16 *pcol1 = (void *)cnst;
		mlib_s16 *pcol2 = (void *)(cnst + 3);

		t_sh = 1;

		for (i = 0; i < nchan; i++) {
			pcol1[i] = CLAMP_S16(color1[i]);
			pcol2[i] = color2[i];

			if (pcol1[i] != color1[i])
				pcol2[i] = pcol1[i];
		}

		for (i = nchan; i < 24; i++) {
			pcol1[i] = pcol1[i - nchan];
			pcol2[i] = pcol2[i - nchan];
		}
	} else if (type == MLIB_USHORT) {
		mlib_u16 *pcol1 = (void *)cnst;
		mlib_u16 *pcol2 = (void *)(cnst + 3);

		t_sh = 1;

		for (i = 0; i < nchan; i++) {
			pcol1[i] = CLAMP_U16(color1[i]);
			pcol2[i] = color2[i];

			if (pcol1[i] != color1[i])
				pcol2[i] = pcol1[i];
		}

		for (i = nchan; i < 24; i++) {
			pcol1[i] = pcol1[i - nchan];
			pcol2[i] = pcol2[i - nchan];
		}
	} else {
		mlib_s32 *pcol1 = (void *)cnst;
		mlib_s32 *pcol2 = (void *)(cnst + 3);

		t_sh = 2;

		for (i = 0; i < nchan; i++) {
			pcol1[i] = color1[i];
			pcol2[i] = color2[i];
		}

		for (i = nchan; i < 12; i++) {
			pcol1[i] = pcol1[i - nchan];
			pcol2[i] = pcol2[i - nchan];
		}
	}

	if (sstride == dstride && sstride ==
		((nchan * width) << t_sh)) {
		width *= height;
		height = 1;
	}

	sstride >>= t_sh;
	dstride >>= t_sh;

	switch (type) {
	case MLIB_BYTE:
		if (nchan == 3) {
			return mlib_s_ImageReplaceColor_U8_3(
					sdata, ddata, sstride, dstride,
					width, height, nchan, cnst);
		} else {
			return mlib_s_ImageReplaceColor_U8_124(
					sdata, ddata, sstride, dstride,
					width, height, nchan, cnst);
		}

	case MLIB_SHORT:
	case MLIB_USHORT:

		if (nchan == 3) {
			return mlib_s_ImageReplaceColor_S16_3(
					sdata, ddata, sstride, dstride,
					width, height, nchan, cnst);
		} else {
			return mlib_s_ImageReplaceColor_S16_124(
					sdata, ddata, sstride, dstride,
					width, height, nchan, cnst);
		}

	case MLIB_INT:

		if (nchan == 3) {
			return mlib_s_ImageReplaceColor_S32_3(
					sdata, ddata, sstride, dstride,
					width, height, nchan, cnst);
		} else {
			return mlib_s_ImageReplaceColor_S32_124(
					sdata, ddata, sstride, dstride,
					width, height, nchan, cnst);
		}

	default:
		return (MLIB_FAILURE);
	}
}
mlib_status
mlib_VideoColorJFIFYCC2ABGR444_naligned(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 n)
{
	/* 1.402 * 8192 */
	const __m128i x_c13 = _mm_set1_epi16(0x2cdd);
	const mlib_s32 c13 = 0x2cdd;

	/* abs(-0.34414) * 8192 */
	const __m128i x_c22 = _mm_set1_epi16(0xb03);
	const mlib_s32 c22 = 0xb03;

	/* abs(-0.71414) * 8192 */
	const __m128i x_c23 = _mm_set1_epi16(0x16da);
	const mlib_s32 c23 = 0x16da;

	/* 1.772 * 8192 */
	const __m128i x_c32 = _mm_set1_epi16(0x38b4);
	const mlib_s32 c32 = 0x38b4;

	/* -179.456 * 32 */
	const __m128i x_coff0 = _mm_set1_epi16(0xe991);
	const mlib_s32 coff0 = (mlib_s32)0xffffe991;

	/* 135.45984 * 32 */
	const __m128i x_coff1 = _mm_set1_epi16(0x10ef);
	const mlib_s32 coff1 = 0x10ef;

	/* -226.816 * 32 */
	const __m128i x_coff2 = _mm_set1_epi16(0xe3a6);
	const mlib_s32 coff2 = (mlib_s32)0xffffe3a6;

	const __m128i x_a = _mm_set1_epi8(0xff);
	const __m128i x_zero = _mm_setzero_si128();

	/* __m128i variables */
	__m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_temp;
	__m128i x_y1, x_cb1, x_cr1, x_y2, x_cb2, x_cr2;
	__m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2;
	__m128i x_abgrl, x_abgrh, x_grl, x_grh, x_abl, x_abh;

	/* pointers */
	__m128i *px_y, *px_cb, *px_cr, *px_abgr;
	mlib_u8 *pabgr;

	/* other var */
	mlib_s32 i, iTemp, iy1, icb1, icr1, ir1, ig1, ib1;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	px_abgr = (__m128i *)abgr;
	i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; i <= n - 16; i += 16)	{
		x_y = _mm_loadu_si128(px_y);
		px_y++;
		x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
		x_y2 = _mm_unpackhi_epi8(x_y, x_zero);
		x_cb = _mm_loadu_si128(px_cb);
		px_cb++;
		x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
		x_cb2 = _mm_unpackhi_epi8(x_zero, x_cb);
		x_cr = _mm_loadu_si128(px_cr);
		px_cr++;
		x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);
		x_cr2 = _mm_unpackhi_epi8(x_zero, x_cr);

		/* lower half */
		x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
		x_r1 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r1, 5);
		x_r1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
		x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g1);
		x_g1 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g1, 5);
		x_g1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
		x_b1 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b1, 5);
		x_b1 = _mm_add_epi16(x_temp, x_y1);

		/* upper half */
		x_temp = _mm_mulhi_epu16(x_cr2, x_c13);
		x_r2 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r2, 5);
		x_r2 = _mm_add_epi16(x_temp, x_y2);

		x_temp = _mm_mulhi_epu16(x_cb2, x_c22);
		x_g2 = _mm_mulhi_epu16(x_cr2, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g2);
		x_g2 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g2, 5);
		x_g2 = _mm_add_epi16(x_temp, x_y2);

		x_temp = _mm_mulhi_epu16(x_cb2, x_c32);
		x_b2 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b2, 5);
		x_b2 = _mm_add_epi16(x_temp, x_y2);

		/* pack */
		x_b = _mm_packus_epi16(x_b1, x_b2);
		x_r = _mm_packus_epi16(x_r1, x_r2);
		x_g = _mm_packus_epi16(x_g1, x_g2);

		/* create rgb sequences */
		x_abl = _mm_unpacklo_epi8(x_a, x_b);
		x_abh = _mm_unpackhi_epi8(x_a, x_b);
		x_grl = _mm_unpacklo_epi8(x_g, x_r);
		x_grh = _mm_unpackhi_epi8(x_g, x_r);

		/* save */
		x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		x_abgrh = _mm_unpackhi_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrh);

		x_abgrl = _mm_unpacklo_epi16(x_abh, x_grh);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		x_abgrh = _mm_unpackhi_epi16(x_abh, x_grh);
		_mm_storeu_si128(px_abgr++, x_abgrh);
	}

	if (i <= n - 8) {
		x_y = _mm_loadl_epi64(px_y);
		px_y = (__m128i *) (((__m64 *)px_y) + 1);
		x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
		x_cb = _mm_loadl_epi64(px_cb);
		px_cb = (__m128i *) (((__m64 *)px_cb) + 1);
		x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
		x_cr = _mm_loadl_epi64(px_cr);
		px_cr = (__m128i *) (((__m64 *)px_cr) + 1);
		x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);

		/* lower half only */
		x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
		x_r1 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r1, 5);
		x_r1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
		x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g1);
		x_g1 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g1, 5);
		x_g1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
		x_b1 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b1, 5);
		x_b1 = _mm_add_epi16(x_temp, x_y1);

		/* pack */
		x_b = _mm_packus_epi16(x_b1, x_zero);
		x_r = _mm_packus_epi16(x_r1, x_zero);
		x_g = _mm_packus_epi16(x_g1, x_zero);

		/* create rgb sequences */
		x_abl = _mm_unpacklo_epi8(x_a, x_b);
		x_grl = _mm_unpacklo_epi8(x_g, x_r);

		/* save */
		x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		x_abgrh = _mm_unpackhi_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrh);

		i += 8;
	}

	if (i <= n - 4) {
		iTemp = *((mlib_s32 *)px_y);
		x_y = _mm_cvtsi32_si128(iTemp);
		px_y = (__m128i *) (((mlib_s32 *)px_y) + 1);
		x_y1 = _mm_unpacklo_epi8(x_y, x_zero);

		iTemp = *((mlib_s32 *)px_cb);
		x_cb = _mm_cvtsi32_si128(iTemp);
		px_cb = (__m128i *) (((mlib_s32 *)px_cb) + 1);
		x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);

		iTemp = *((mlib_s32 *)px_cr);
		x_cr = _mm_cvtsi32_si128(iTemp);
		px_cr = (__m128i *) (((mlib_s32 *)px_cr) + 1);
		x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);

		/* 64 0f lower half only */
		x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
		x_r1 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r1, 5);
		x_r1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
		x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g1);
		x_g1 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g1, 5);
		x_g1 = _mm_add_epi16(x_temp, x_y1);
		x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
		x_b1 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b1, 5);
		x_b1 = _mm_add_epi16(x_temp, x_y1);

		/* pack */
		x_b = _mm_packus_epi16(x_b1, x_zero);
		x_r = _mm_packus_epi16(x_r1, x_zero);
		x_g = _mm_packus_epi16(x_g1, x_zero);

		/* create rgb sequences */
		x_abl = _mm_unpacklo_epi8(x_a, x_b);
		x_grl = _mm_unpacklo_epi8(x_g, x_r);

		/* save */
		x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		i += 4;
	}

	/* pure C implementation */
	pabgr = (mlib_u8 *)px_abgr;
	for (; i < n; i++) {
		iy1 = y[i];
		icb1 = cb[i];
		icr1 = cr[i];

		iTemp = (icr1 * c13) >> 8;
		ir1 = (iTemp + coff0) >> 5;
		ir1 += iy1;

		iTemp = (icb1 * c22) >> 8;
		ig1 = (icr1 * c23) >> 8;
		iTemp += ig1;
		ig1 = coff1 - iTemp;
		iTemp = ig1 >> 5;
		ig1 = iTemp + iy1;

		iTemp = (icb1 * c32) >> 8;
		ib1 = iTemp + coff2;
		iTemp = ib1 >> 5;
		ib1 = iTemp + iy1;

		pabgr[0] = 0xff;
		CLAMP_U8(ib1, pabgr[1]);
		CLAMP_U8(ig1, pabgr[2]);
		CLAMP_U8(ir1, pabgr[3]);

		pabgr += 4;
	}

	return (MLIB_SUCCESS);
}