void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL) { int i; vector signed short vsrc; uint16_t *qmat; vector unsigned short vqmat; vector unsigned short vmquant; vector bool short eqzero, ltzero; vector signed short val, t0; vector signed short zero, one; vector unsigned int four; vector signed short min, max; int offset, offset2; int16_t dst0; union { vector unsigned short vu16; unsigned short mquant; vector signed int vs32; struct { signed int pad[3]; signed int sum; } s; } vu; #ifdef ALTIVEC_DST DataStreamControl dsc; #endif #ifdef ALTIVEC_VERIFY /* {{{ */ if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat)) mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)", wsp->intra_q_mat); if (NOT_VECTOR_ALIGNED(src)) mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src); if (NOT_VECTOR_ALIGNED(dst)) mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst); for (i = 0; i < 64; i++) if (src[i] < -256 || src[i] > 255) mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)", i, src[i]); #endif /* }}} */ AMBER_START; dst0 = src[0] << (3 - dc_prec); qmat = (uint16_t*)wsp->intra_q_mat; #ifdef ALTIVEC_DST dsc.control = DATA_STREAM_CONTROL(64/8,1,0); vec_dst(src, dsc.control, 0); vec_dst(qmat, dsc.control, 1); #endif /* vmquant = (vector unsigned short)(mquant); */ vu.mquant = (unsigned short)mquant; vmquant = vec_splat(vu.vu16, 0); zero = vec_splat_s16(0); one = vec_splat_s16(1); four = vec_splat_u32(4); /* max = (2047); min = (-2048); {{{ */ vu8(max) = vec_splat_u8(0x7); t0 = vec_splat_s16(-1); /* 0xffff */ vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */ min = vec_sub(t0, max); /* }}} */ offset = 0; #if 1 vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); i = (64/8) - 1; do { /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, 
vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); offset2 = offset; offset += 8*sizeof(int16_t); vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset2, dst); } while (--i); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? 
-2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); #else /* {{{ */ i = (64/8); do { vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); offset += 8*sizeof(int16_t); } while (--i); /* }}} */ #endif dst[0] = dst0; AMBER_STOP; }
/*
 * Build 2x2- and 4x4-decimated copies of an image (AltiVec).
 *
 * For each 2x2 pixel block, sub22_image receives (sum + 2) >> 2 (rounded
 * average); sub44_image receives the rounded average of each 2x2 block of
 * those averages, i.e. a 4x4 reduction of the original.  The inner loop is a
 * four-stage software pipeline: each stage consumes the previously loaded
 * 16-byte column while issuing the next column's loads.
 *
 * NOTE(review): the image height is derived from (sub22_image - image) /
 * rowstride, which assumes sub22_image is allocated immediately after the
 * full-size image buffer — confirm against the caller's allocation layout.
 *
 * Alignment/size requirements (checked under ALTIVEC_VERIFY): image,
 * sub22_image and sub44_image 16-byte aligned; rowstride a multiple of 64.
 */
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;
    vector unsigned char l0, l1, l2, l3;      /* four consecutive input rows */
    vector unsigned short s0, s1, s2, s3;     /* 2x2 block sums (as u32 lanes) */
    vector unsigned short s22_0, s22_1, s22_2, s22_3; /* 2x2 averages */
    vector unsigned short s44, s44_0, s44_1;  /* 4x4 sums/averages */
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "image", 16, image);

    if (NOT_VECTOR_ALIGNED(sub22_image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "sub22_image", 16, sub22_image);

    if (NOT_VECTOR_ALIGNED(sub44_image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "sub44_image", 16, sub44_image);

    if ((rowstride & 63) != 0)
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "rowstride", 64, rowstride);
#endif

    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    /* Prefetch the input rows; stride matches the image row stride. */
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;
    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    /* Outer-loop count: number of 4-row bands (see layout note above). */
    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2; /* computed but unused below */
    halfstride = stride1 >> 1; /* /2 */

    /* Inner-loop count: each iteration consumes 64 bytes of row width. */
    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    two = vec_splat_u16(2);

    do {
        i = ii;
        do {
            /* Stage 1: load column 0 of this 4-row band. */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;
#ifdef ALTIVEC_DST
            vec_dst(pB + (16 * 3), dsc.control, 0);
#endif
            /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
            /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
            /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
            /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */

            /* mergeh interleaves byte-pairs of two rows, then sum4s adds
             * each group of 4 bytes: s0 lane k = 2x2 block sum. */
            /* s0 = 0x[00,01, 02,03, 04,05, 06,07, ] */
            /*      [ 10,11, 12,13, 14,15, 16,17] */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));

            /* s1 = 0x[08,09, 0A,0B, 0C,0D, 0E,0F, ] */
            /*      [ 18,19, 1A,1B, 1C,1D, 1E,1F] */
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));

            /* s2 = 0x[20,21, 22,23, 24,25, 26,27, ] */
            /*      [ 30,31, 32,33, 34,35, 36,37] */
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));

            /* s3 = 0x[28,29, 2A,2B, 2C,2D, 2E,2F, ] */
            /*      [ 38,39, 3A,3B, 3C,3D, 3E,3F] */
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* start loading next block (column 1) */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
            /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
            /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
            /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */

            /* Narrow u32 block sums back to u16 lanes. */
            /* s22_0 = 0x[ 00, 02, 04, 06, 08, 0A, 0C, 0E] (rows 0/1) */
            s22_0 = vec_packsu(vu32(s0), vu32(s1));
            /* s22_1 = 0x[ 20, 22, 24, 26, 28, 2A, 2C, 2E] (rows 2/3) */
            s22_1 = vec_packsu(vu32(s2), vu32(s3));

            /* Rounded average: (sum + 2) >> 2. */
            /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
            s22_0 = vec_add(s22_0, two);
            /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
            s22_1 = vec_add(s22_1, two);
            /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
            s22_0 = vec_sra(s22_0, two);
            /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
            s22_1 = vec_sra(s22_1, two);

            /* 4x4 stage: add the two s22 rows, then sum adjacent pairs. */
            /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
            s44_0 = vec_add(s22_0, s22_1);
            /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
            s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

            /* - - - column 1: same reduction as above - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* start loading next l[0-3] (column 2) */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            s22_2 = vec_packsu(vu32(s0), vu32(s1));
            s22_3 = vec_packsu(vu32(s2), vu32(s3));
            s22_2 = vec_add(s22_2, two);
            s22_3 = vec_add(s22_3, two);
            s22_2 = vec_sra(s22_2, two);
            s22_3 = vec_sra(s22_3, two);

            s44_1 = vec_add(s22_2, s22_3);
            s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

            /* store s22 block: pack columns 0+1 to bytes; the second store
             * lands on the next sub22 row (halfstride). */
            s22_0 = vu16(vec_packsu(s22_0, s22_2));
            s22_1 = vu16(vec_packsu(s22_1, s22_3));
            vec_st(vu8(s22_0), 0, pB2);
            vec_st(vu8(s22_1), halfstride, pB2);
            pB2 += 16;

            /* - - - column 2: same reduction - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* starting loading next l[0-3] (column 3) */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            s22_0 = vec_packsu(vu32(s0), vu32(s1));
            s22_1 = vec_packsu(vu32(s2), vu32(s3));
            s22_0 = vec_add(s22_0, two);
            s22_1 = vec_add(s22_1, two);
            s22_0 = vec_sra(s22_0, two);
            s22_1 = vec_sra(s22_1, two);

            /* Finish 4x4 averages for columns 0+1: (sum + 2) >> 2. */
            s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
            s44 = vec_add(s44, two);
            s44 = vec_sra(s44, two);

            s44_0 = vec_add(s22_0, s22_1);
            s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

            /* - - - column 3: same reduction, no further loads - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            s22_2 = vec_packsu(vu32(s0), vu32(s1));
            s22_3 = vec_packsu(vu32(s2), vu32(s3));
            s22_2 = vec_add(s22_2, two);
            s22_3 = vec_add(s22_3, two);
            s22_2 = vec_sra(s22_2, two);
            s22_3 = vec_sra(s22_3, two);

            s44_1 = vec_add(s22_2, s22_3);
            s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

            /* store s22 block (columns 2+3) */
            s22_0 = vu16(vec_packsu(s22_0, s22_2));
            s22_1 = vu16(vec_packsu(s22_1, s22_3));
            vec_st(vu8(s22_0), 0, pB2);
            vec_st(vu8(s22_1), halfstride, pB2);
            pB2 += 16;

            /* pack all four s44 chunks into one 16-byte store */
            s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
            s44_0 = vec_add(s44_0, two);
            s44_0 = vec_sra(s44_0, two);

            s44 = vu16(vec_packsu(s44, s44_0));

            vec_st(vu8(s44), 0, pB4);
            pB4 += 16;
        } while (--i);

        /* pB already advanced one full row across the band; skip the other
         * three rows.  pB2 advanced one sub22 row; skip the second. */
        pB += stride3;
        pB2 += halfstride;
    } while (--j);

#ifdef ALTIVEC_DST
    vec_dss(0);
#endif

    AMBER_STOP;
}