Example #1
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short eqzero, ltzero;
    vector signed short val, t0;
    vector signed short zero, one;
    vector unsigned int four;
    vector signed short min, max;
    int offset, offset2;
    int16_t dst0;
    union {
	vector unsigned short vu16;
	unsigned short mquant;
	vector signed int vs32;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
	mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)",
	    wsp->intra_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
	mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
	mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst);

    for (i = 0; i < 64; i++)
	if (src[i] < -256 || src[i] > 255)
	    mjpeg_error_exit1("iquant_intra_m1: src[%d] out of range [-256,255], (%d)",
		i, src[i]);
#endif /* }}} */

    AMBER_START;

    /* the intra DC coefficient is dequantized separately and written
     * into dst[0] after the loop */
    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    zero = vec_splat_s16(0);
    one = vec_splat_s16(1);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max);
    /* }}} */
    offset = 0;

#if 1
    /* This variant is software-pipelined: the loads for the next
     * iteration are issued mid-loop, and the final block of eight
     * coefficients is handled after the loop.  The #else branch below
     * is the plain, unpipelined version of the same computation. */
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	offset2 = offset;
	offset += 8*sizeof(int16_t);
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* val = val - (1 & ~(val | (val == 0))): make nonzero even values odd */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset2, dst);
    } while (--i);
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);
    eqzero = vec_cmpeq(vsrc, zero);

    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);

    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));

    /* val = val - (1 & ~(val | (val == 0))): make nonzero even values odd */
    t0 = vec_or(val, eqzero);
    t0 = vec_andc(one, t0);
    val = vec_sub(val, t0);

    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);

    /* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */
    val = vec_min(val, max);
    val = vec_max(val, min);

    vec_st(val, offset, dst);
#else
    /* {{{ */
    i = (64/8);
    do {
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	/* val = val - (1 & ~(val | (val == 0))): make nonzero even values odd */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? 2047 : ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset, dst);

	offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    dst[0] = dst0;

    AMBER_STOP;
}
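
For reference, the arithmetic the vectorized loop performs on each of the
remaining 63 coefficients can be written in scalar C roughly as follows (a
minimal sketch; iquant_intra_coeff is an illustrative name, and q * mquant
stands for the 16-bit product formed by vec_mulo above):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of one coefficient of the MPEG-1 intra dequantization
 * implemented by the AltiVec loop above (illustrative only). */
static int16_t iquant_intra_coeff(int16_t src, uint16_t q, uint16_t mquant)
{
    int val = (abs(src) * (q * mquant)) >> 4; /* (src * quant) >> 4 */
    if (val != 0 && (val & 1) == 0)
        val--;                      /* make nonzero even values odd */
    if (src < 0)
        val = -val;                 /* restore sign */
    if (val > 2047)  val = 2047;    /* saturate to [-2048, 2047] */
    if (val < -2048) val = -2048;
    return (int16_t)val;
}

The vector version computes eight of these at once, handling the sign with
vec_sel and the saturation with vec_min/vec_max.
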
Example #2
File: fdctdsp.c  Project: 63n/FFmpeg
/* two dimensional discrete cosine transform */
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
    const vector float *cp = fdctconsts;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    /* setup constants {{{ */
    /* mzero = -0.0: splat 0xFFFFFFFF, then shift each element left by
     * 31 (AltiVec shift counts are taken modulo the element width),
     * leaving 0x80000000 in every lane */
    mzero  = ((vector float) vec_splat_u32(-1));
    mzero  = ((vector float) vec_sl(vu32(mzero), vu32(mzero)));
    cnsts0 = vec_ld(0, cp);
    cp++;
    cnsts1 = vec_ld(0, cp);
    cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl, a, b) vec_merge ## hl(vs16(a), vs16(b))

    bp  = (vector signed short *) block;
    b00 = ((vector float) vec_ld(0,      bp));
    b40 = ((vector float) vec_ld(16 * 4, bp));
    b01 = ((vector float) MERGE_S16(h, b00, b40));
    b11 = ((vector float) MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float) vec_ld(0,      bp));
    b50 = ((vector float) vec_ld(16 * 4, bp));
    b21 = ((vector float) MERGE_S16(h, b10, b50));
    b31 = ((vector float) MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float) vec_ld(0,      bp));
    b60 = ((vector float) vec_ld(16 * 4, bp));
    b41 = ((vector float) MERGE_S16(h, b20, b60));
    b51 = ((vector float) MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float) vec_ld(0,      bp));
    b70 = ((vector float) vec_ld(16 * 4, bp));
    b61 = ((vector float) MERGE_S16(h, b30, b70));
    b71 = ((vector float) MERGE_S16(l, b30, b70));

    x0 = ((vector float) MERGE_S16(h, b01, b41));
    x1 = ((vector float) MERGE_S16(l, b01, b41));
    x2 = ((vector float) MERGE_S16(h, b11, b51));
    x3 = ((vector float) MERGE_S16(l, b11, b51));
    x4 = ((vector float) MERGE_S16(h, b21, b61));
    x5 = ((vector float) MERGE_S16(l, b21, b61));
    x6 = ((vector float) MERGE_S16(h, b31, b71));
    x7 = ((vector float) MERGE_S16(l, b31, b71));

    b00 = ((vector float) MERGE_S16(h, x0, x4));
    b10 = ((vector float) MERGE_S16(l, x0, x4));
    b20 = ((vector float) MERGE_S16(h, x1, x5));
    b30 = ((vector float) MERGE_S16(l, x1, x5));
    b40 = ((vector float) MERGE_S16(h, x2, x6));
    b50 = ((vector float) MERGE_S16(l, x2, x6));
    b60 = ((vector float) MERGE_S16(h, x3, x7));
    b70 = ((vector float) MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short
     * before conversion to vector float.  The following code section
     * takes advantage of this. */

    /* fdct rows {{{ */
    x0 = ((vector float) vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float) vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float) vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float) vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float) vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float) vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float) vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float) vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float) vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float) vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float) vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float) vec_sub(vs16(b70), vs16(b10)));

#define CTF0(n)                                                    \
    b ## n ## 1 = ((vector float) vec_unpackl(vs16(b ## n ## 0))); \
    b ## n ## 0 = ((vector float) vec_unpackh(vs16(b ## n ## 0))); \
    b ## n ## 1 = vec_ctf(vs32(b ## n ## 1), 0);                   \
    b ## n ## 0 = vec_ctf(vs32(b ## n ## 0), 0)

    CTF0(0);
    CTF0(4);

    b20 = ((vector float) vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float) vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0   = vec_madd(cnst, x0, mzero);
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20  = vec_madd(cnst, b20, x0);
    b21  = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60  = vec_madd(cnst, b60, x0);
    b61  = vec_madd(cnst, b61, x1);

#define CTFX(x, b)                                  \
    b ## 0 = ((vector float) vec_unpackh(vs16(x))); \
    b ## 1 = ((vector float) vec_unpackl(vs16(x))); \
    b ## 0 = vec_ctf(vs32(b ## 0), 0);              \
    b ## 1 = vec_ctf(vs32(b ## 1), 0)

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX

    x0   = vec_add(b70, b10);
    x1   = vec_add(b50, b30);
    x2   = vec_add(b70, b30);
    x3   = vec_add(b50, b10);
    x8   = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70  = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50  = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30  = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10  = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    x0   = vec_add(b71, b11);
    x1   = vec_add(b51, b31);
    x2   = vec_add(b71, b31);
    x3   = vec_add(b51, b11);
    x8   = vec_add(x2, x3);
    cnst = LD_W3;
    x8   = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0   = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1   = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2   = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3   = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71  = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51  = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31  = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11  = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = VEC_FMERGEL(b00, b20);
    x1 = VEC_FMERGEH(b00, b20);
    x2 = VEC_FMERGEL(b10, b30);
    x3 = VEC_FMERGEH(b10, b30);

    b00 = VEC_FMERGEH(x1, x3);
    b10 = VEC_FMERGEL(x1, x3);
    b20 = VEC_FMERGEH(x0, x2);
    b30 = VEC_FMERGEL(x0, x2);

    x4 = VEC_FMERGEL(b41, b61);
    x5 = VEC_FMERGEH(b41, b61);
    x6 = VEC_FMERGEL(b51, b71);
    x7 = VEC_FMERGEH(b51, b71);

    b41 = VEC_FMERGEH(x5, x7);
    b51 = VEC_FMERGEL(x5, x7);
    b61 = VEC_FMERGEH(x4, x6);
    b71 = VEC_FMERGEL(x4, x6);

    x0 = VEC_FMERGEL(b01, b21);
    x1 = VEC_FMERGEH(b01, b21);
    x2 = VEC_FMERGEL(b11, b31);
    x3 = VEC_FMERGEH(b11, b31);

    x4 = VEC_FMERGEL(b40, b60);
    x5 = VEC_FMERGEH(b40, b60);
    x6 = VEC_FMERGEL(b50, b70);
    x7 = VEC_FMERGEH(b50, b70);

    b40 = VEC_FMERGEH(x1, x3);
    b50 = VEC_FMERGEL(x1, x3);
    b60 = VEC_FMERGEH(x0, x2);
    b70 = VEC_FMERGEL(x0, x2);

    b01 = VEC_FMERGEH(x5, x7);
    b11 = VEC_FMERGEL(x5, x7);
    b21 = VEC_FMERGEH(x4, x6);
    b31 = VEC_FMERGEL(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
#define CTS(n)                                                  \
    b ## n ## 0 = vec_round(b ## n ## 0);                       \
    b ## n ## 1 = vec_round(b ## n ## 1);                       \
    b ## n ## 0 = ((vector float) vec_cts(b ## n ## 0, 0));     \
    b ## n ## 1 = ((vector float) vec_cts(b ## n ## 1, 0));     \
    b ## n ## 0 = ((vector float) vec_pack(vs32(b ## n ## 0),   \
                                           vs32(b ## n ## 1))); \
    vec_st(vs16(b ## n ## 0), 0, bp)

    bp = (vector signed short *) block;
    CTS(0);
    bp++;
    CTS(1);
    bp++;
    CTS(2);
    bp++;
    CTS(3);
    bp++;
    CTS(4);
    bp++;
    CTS(5);
    bp++;
    CTS(6);
    bp++;
    CTS(7);

#undef CTS
    /* }}} */
}
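
For orientation, ff_fdct_altivec evaluates the standard 8x8 forward DCT-II,
factored into a row pass, a float transpose, and a column pass (FDCTCOL).  A
naive scalar reference of the same transform (a sketch only; fdct_ref is an
illustrative name, not an FFmpeg function, and it leaves the result in float
rather than rounding back to int16_t in place as the routine above does):

#include <math.h>
#include <stdint.h>

/* Naive O(N^4) reference of the 8x8 forward DCT-II (sketch only). */
static void fdct_ref(const int16_t in[64], float out[64])
{
    for (int u = 0; u < 8; u++) {
        for (int v = 0; v < 8; v++) {
            double cu = u ? 1.0 : M_SQRT1_2;
            double cv = v ? 1.0 : M_SQRT1_2;
            double s = 0.0;
            for (int x = 0; x < 8; x++)
                for (int y = 0; y < 8; y++)
                    s += in[x * 8 + y]
                       * cos((2 * x + 1) * u * M_PI / 16.0)
                       * cos((2 * y + 1) * v * M_PI / 16.0);
            out[u * 8 + v] = (float)(0.25 * cu * cv * s);
        }
    }
}
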
Example #3
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;
    vector unsigned char l0, l1, l2, l3;
    vector unsigned short s0, s1, s2, s3;
    vector unsigned short s22_0, s22_1, s22_2, s22_3;
    vector unsigned short s44, s44_0, s44_1;
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "image", 16, image);
    if (NOT_VECTOR_ALIGNED(sub22_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub22_image", 16, sub22_image);
    if (NOT_VECTOR_ALIGNED(sub44_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub44_image", 16, sub44_image);

    if ((rowstride & 63) != 0)
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "rowstride", 64, rowstride);
#endif

    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;

    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2;
    halfstride = stride1 >> 1; /* /2 */

    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    two = vec_splat_u16(2);

    do {
	i = ii;
	do {
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;
#ifdef ALTIVEC_DST
	    vec_dst(pB + (16 * 3), dsc.control, 0);
#endif

	    /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
	    /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
	    /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
	    /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */

	    /* s0 = 0x[00,01,      02,03,      04,05,      06,07,     ] */
	    /*        [      10,11,      12,13,      14,15,      16,17] */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));

	    /* s1 = 0x[08,09,      0A,0B,      0C,0D,      0E,0F,     ] */
	    /*        [      18,19,      1A,1B,      1C,1D,      1E,1F] */
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));

	    /* s2 = 0x[20,21,      22,23,      24,25,      26,27,     ] */
	    /*        [      30,31,      32,33,      34,35,      36,37] */
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));

	    /* s3 = 0x[28,29,      2A,2B,      2C,2D,      2E,2F,     ] */
	    /*        [      38,39,      3A,3B,      3C,3D,      3E,3F] */
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next block */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;

	    /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
	    /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
	    /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
	    /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
	    s22_0 = vec_add(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
	    s22_1 = vec_add(s22_1, two);

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
	    s22_0 = vec_sra(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
	    s22_1 = vec_sra(s22_1, two);

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
	    s44_0 = vec_add(s22_0, s22_1);

	    /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);


	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* - - - - - - - - - - - - - - - - - - - */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    s22_0 = vec_add(s22_0, two);
	    s22_1 = vec_add(s22_1, two);

	    s22_0 = vec_sra(s22_0, two);
	    s22_1 = vec_sra(s22_1, two);


	    s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44 = vec_add(s44, two);
	    s44 = vec_sra(s44, two);

	    s44_0 = vec_add(s22_0, s22_1);
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);

	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* pack all four s44 chunks */
	    s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44_0 = vec_add(s44_0, two);
	    s44_0 = vec_sra(s44_0, two);
	    s44 = vu16(vec_packsu(s44, s44_0));

	    vec_st(vu8(s44), 0, pB4);
	    pB4 += 16;

	} while (--i);

	pB += stride3;
	pB2 += halfstride;

    } while (--j);

#ifdef ALTIVEC_DST
    vec_dss(0);
#endif

    AMBER_STOP;
}
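
In scalar terms, each output pixel of sub22_image is a rounded 2x2 box
average of the source image, and each pixel of sub44_image applies the same
rounding step once more to four neighboring 2x2 results.  A minimal sketch
(avg2x2 is an illustrative helper, not part of the source):

/* Rounded 2x2 box average, matching the vec_add(x, two) followed by
 * vec_sra(x, two) sequence in the loop above (sketch only). */
static unsigned char avg2x2(const unsigned char *p, int rowstride)
{
    return (unsigned char)
        ((p[0] + p[1] + p[rowstride] + p[rowstride + 1] + 2) >> 2);
}

The 4x4 output is then ((a + b + c + d + 2) >> 2) over four adjacent 2x2
averages, which is what the s44 accumulation and the final vec_packsu
compute sixteen pixels at a time.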