void pix_invert :: processYUVAltivec(imageStruct &image) { int h,w,width; width = image.xsize/8; union{ unsigned char c[16]; vector unsigned char v; }charBuffer; vector unsigned char offset; vector unsigned char *inData = (vector unsigned char*) image.data; charBuffer.c[0] = 255; offset = charBuffer.v; offset = (vector unsigned char) vec_splat(offset,0); #ifndef PPC970 UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); vec_dst( inData, prefetchSize, 0 ); #endif for ( h=0; h<image.ysize; h++){ for (w=0; w<width; w++) { #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); #endif inData[0]=vec_subs(offset,inData[0]); inData++; } #ifndef PPC970 vec_dss( 0 ); #endif } /*end of working altivec function */ }
void pix_add :: processRGBA_Altivec(imageStruct &image, imageStruct &right) { int h,w,width; width = image.xsize/4; vector unsigned char *inData = (vector unsigned char*) image.data; vector unsigned char *rightData = (vector unsigned char*) right.data; #ifndef PPC970 UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 ); vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif for ( h=0; h<image.ysize; h++){ for (w=0; w<width; w++) { #ifndef PPC970 vec_dst( inData, prefetchSize, 0 ); vec_dst( rightData, prefetchSize, 1 ); #endif inData[0] = vec_adds(inData[0], rightData[0]); inData++; rightData++; } #ifndef PPC970 vec_dss( 0 ); vec_dss( 1 ); #endif } /*end of working altivec function */ }
/* Compile-time coverage test: vec_dst/vec_dstst must accept the "count"
 * operand as any integer type (unsigned/signed long, unsigned/signed int,
 * char literal, short) and the tag operand as int or char literal.
 * NOTE(review): `vi` is not declared in this view — presumably a file-scope
 * vector variable declared elsewhere in the file; confirm against the full
 * file. */
int main (void)
{
  unsigned long ul = 2;
  signed long sl = 2;
  unsigned int ui = 2;
  signed int si = 2;
  float fl = 2.0;
  vec_dst (&vi, ul, '\0');
  vec_dst (&vi, sl, 0);
  vec_dst (&vi, ui, '\0');
  vec_dst (&vi, si, 0);
  /* short via cast from float — exercises the narrowing-conversion path */
  vec_dstst (&vi, (short)fl, '\0');
  return 0;
}
/* Exercise vec_dst with (signed and unsigned) long* source pointers across
 * all four data-stream tags (0-3).  The tag argument must be a compile-time
 * literal, hence the explicit unrolling rather than a loop.
 * NOTE(review): `var_long_ptr`, `var_unsigned_long_ptr` and `var_int` are
 * file-scope test arrays declared outside this view. */
void f33() {
  vec_dst(var_long_ptr[0], var_int[1], 0);
  vec_dst(var_long_ptr[0], var_int[1], 1);
  vec_dst(var_long_ptr[0], var_int[1], 2);
  vec_dst(var_long_ptr[0], var_int[1], 3);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 0);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 1);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 2);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 3);
}
// Per-byte absolute difference |image - right| written back into image.
// Bytes are widened to signed shorts (via merge with a zero vector) so the
// subtraction cannot wrap, then abs + saturating pack brings them back to
// unsigned chars.  One 16-byte vector = 4 RGBA pixels per iteration.
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize / 4; // number of 16-byte vectors
  vector signed short hiImage, loImage, hiRight, loRight;
  vector unsigned char zero = vec_splat_u8(0);
  vector unsigned char *inData = (vector unsigned char *)image.data;
  vector unsigned char *rightData = (vector unsigned char *)right.data;
#ifndef PPC970
  // four prefetch streams: current and +4KB (256 vectors) ahead for each input
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+256, prefetchSize, 2 );
  vec_dst( rightData+256, prefetchSize, 3 );
#endif
  do {
#ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+256, prefetchSize, 2 );
    vec_dst( rightData+256, prefetchSize, 3 );
#endif
    // zero-extend the 16 bytes of each operand into two short vectors
    hiImage = (vector signed short)vec_mergeh(zero,inData[0]);
    loImage = (vector signed short)vec_mergel(zero,inData[0]);
    hiRight = (vector signed short)vec_mergeh(zero,rightData[0]);
    loRight = (vector signed short)vec_mergel(zero,rightData[0]);
    // signed difference, then absolute value
    hiImage = vec_subs(hiImage,hiRight);
    loImage = vec_subs(loImage,loRight);
    hiImage = vec_abs(hiImage);
    loImage = vec_abs(loImage);
    // saturating pack back to 16 unsigned bytes, stored in place
    inData[0] = vec_packsu(hiImage,loImage);
    inData++;
    rightData++;
  } while (--datasize); // assumes datasize >= 1 (non-empty image)
#ifndef PPC970
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
#endif
}
/* Compile-time coverage test: vec_dst / vec_ld / vec_lde / vec_lvsl must
 * accept pointers to const-qualified data.  Results are discarded; the point
 * is that these calls compile.
 * NOTE(review): `i` is passed to vec_dst uninitialized — presumably
 * intentional for a compile-only test; confirm this file is never executed. */
int main (int argc, const char * argv[])
{
  int i;
  const float cf = 1.0;
  vector float v;
  const vector float cv = (vector float){1.0, 2.0, 3.0, 4.0};

  vec_dst(&cv, i, 0);
  v = vec_ld(0, &cv);
  v = vec_lde(0, &cf);
  vec_lvsl(0, &cf);
  return 0;
}
/* Compile-time coverage of AltiVec predicates, VSCR access, data-stream
 * control, and element/whole-vector loads and stores.  All operands
 * (x, y, z, c, d, s, t, i, j, f, g, uc, int1, int2, pi) are file-scope
 * variables declared outside this view.  The repeated vec_any_ge lines
 * appear to be deliberate duplicate coverage, not a typo — confirm against
 * the originating testsuite before deduplicating. */
void b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types. */
  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  /* VSCR move to/from, and data-stream stop */
  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  /* all four data-stream-touch variants with a non-constant count */
  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  /* load-for-shift permute vectors with a runtime offset */
  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  /* element and whole-vector loads */
  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  /* element and whole-vector stores */
  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
/* Compile-time coverage of vec_dst/vec_dstst/vec_dststt with a const source
 * pointer, plus vec_sld/vec_splat over the `pixel` and `bool` vector types.
 * NOTE(review): buf is used uninitialized and some splat indices (12, 17, 31)
 * exceed the element count of their vector type — presumably this is a
 * compile-only testcase probing the accepted literal range; confirm before
 * "fixing". */
void foo(void)
{
  const unsigned char *buf;
  vector pixel vp = { 3, 4, 5, 6 };
  vector bool int vbi = { 1, 0, 1, 0 };
  vector bool short vbs = { 1, 0, 1, 0, 1, 0, 1, 0 };
  vector bool char vbc = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
  vector signed char vsc;
  int a = 3;

  vec_dst(buf, a, 1);
  vec_dstst(buf, a, 2);
  vec_dststt(buf, a, 3);
  vec_dststt(buf, a, 2);

  vp = vec_sld(vp, vp, 5);
  vbc = vec_splat(vbc, 7);
  vbs = vec_splat(vbs, 12);
  vp = vec_splat(vp, 17);
  vbi = vec_splat(vbi, 31);
}
/*
 * mem_searchrn - find the first "\r\n" pair in a buffer, scanning 16 bytes
 * per iteration with AltiVec compares.
 *
 * s:   buffer start (need not be 16-byte aligned)
 * len: buffer length in bytes
 * returns: pointer to the '\r' of the first "\r\n", or NULL if none.
 *
 * Strategy: align down to a vector boundary, mask off the bytes before s
 * (and after s+len in the tail) via lvsl/lvsr permutes against a zero
 * vector, then look for a '\r' whose immediately following byte is '\n'.
 * `last_rr` carries the "last byte of the previous vector was '\r'" state
 * across iterations so a pair straddling two vectors is still found.
 * SOVUC, ALIGN_DOWN(_DIFF), DIV_ROUNDUP, unlikely and vec_zpos are project
 * macros/helpers defined outside this view.
 */
void *mem_searchrn(void *s, size_t len)
{
  vector unsigned char v_cr;
  vector unsigned char v_nl;
  vector unsigned char v0;
  vector unsigned char v_perm;
  vector unsigned char c;
  vector bool char rr, rn;
  vector bool char last_rr;
  char *p;
  ssize_t k;
  size_t block_num;
  unsigned f;

  if(unlikely(!s || !len))
    return NULL;
  /* only do one prefetch, this covers nearly 128k */
  block_num = DIV_ROUNDUP(len, 512);
  /* vec_dst control word: block count in bits 16-23 (0 == 256 blocks),
   * stride 512 bytes in the low half */
  f  = block_num >= 256 ? 0 : block_num << 16;
  f |= 512;
  vec_dst((const unsigned char *)s, f, 2);

  v_cr = vec_splat_u8('\r');
  v_nl = vec_splat_u8('\n');
  v0 = vec_splat_u8(0);
  last_rr = (vector bool char)v0;

  /* k > 0 means the whole buffer fits inside the first (aligned) vector */
  k = SOVUC - ALIGN_DOWN_DIFF(s, SOVUC) - (ssize_t)len;
  p = (char *)ALIGN_DOWN(s, SOVUC);
  c = vec_ldl(0, (const vector unsigned char *)p);
  if(unlikely(k > 0))
    goto K_SHIFT;

  /* zero out the bytes that precede s in the first vector */
  v_perm = vec_lvsl(0, (unsigned char *)s);
  c = vec_perm(c, v0, v_perm);
  v_perm = vec_lvsr(0, (unsigned char *)s);
  c = vec_perm(v0, c, v_perm);
  rr = vec_cmpeq(c, v_cr);
  rn = vec_cmpeq(c, v_nl);
  k = -k;
  goto START_LOOP;

  do {
    p += SOVUC;
    c = vec_ldl(0, (const vector unsigned char *)p);
    k -= SOVUC;
    if(k > 0) {
      rr = vec_cmpeq(c, v_cr);
      rn = vec_cmpeq(c, v_nl);
      /* '\r' at the end of the previous vector, '\n' at the start of this one */
      if(vec_any_eq(last_rr, rn)) {
        vec_dss(2);
        return p - 1;
      }
START_LOOP:
      /* shift the '\r' mask left one byte and AND with the '\n' mask so a
       * set byte means "\r\n" starting at that position */
      last_rr = (vector bool char)vec_sld(v0, (vector unsigned char)rr, 1);
      rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
      rr = vec_and(rr, rn); /* get mask */
      if(vec_any_ne(rr, v0)) {
        vec_dss(2);
        return p + vec_zpos(rr);
      }
    }
  } while(k > 0);
  k = -k;

K_SHIFT:
  /* tail: zero out the bytes past s+len before the final compare
   * (the integer k is cast to a pointer only to feed lvsl/lvsr, which use
   * just the low 4 address bits as a shift amount) */
  vec_dss(2);
  v_perm = vec_lvsr(0, (unsigned char *)k);
  c = vec_perm(v0, c, v_perm);
  v_perm = vec_lvsl(0, (unsigned char *)k);
  c = vec_perm(c, v0, v_perm);
  rr = vec_cmpeq(c, v_cr);
  rn = vec_cmpeq(c, v_nl);
  if(vec_any_eq(last_rr, rn))
    return p - 1;
  rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
  rr = vec_and(rr, rn); /* get mask */
  if(vec_any_ne(rr, v0))
    return p + vec_zpos(rr);
  return NULL;
}
/* more optimized version - unrolled and load-hoisted */
// Adds the per-channel offsets (U, Y, V — member/global constants defined
// outside this view) to every UYVY pixel pair, with saturation.  The loop is
// software-pipelined: each iteration packs/stores the pair of vectors whose
// widened values were loaded on the PREVIOUS iteration, while loading two
// vectors ahead; the final add/pack happens after the loop.
void pix_offset :: processYUVAltivec(imageStruct &image)
{
  register int h,w,width,height;
  width = image.xsize/16; //for altivec
  height = image.ysize;
  //format is U Y V Y
  // start of working altivec function
  union {
    short elements[8];
    vector signed short v;
  } transferBuffer;
  register vector signed short c, hi, lo;
  register vector signed short hi1, lo1;
  register vector signed short loadhi, loadhi1, loadlo, loadlo1;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector unsigned char *inData = (vector unsigned char*) image.data;

  //Write the pixel (pair) to the transfer buffer
  //transferBuffer.i = (U << 24) | (Y << 16) | (V << 8 ) | Y;
  transferBuffer.elements[0] = U;
  transferBuffer.elements[1] = Y;
  transferBuffer.elements[2] = V;
  transferBuffer.elements[3] = Y;
  transferBuffer.elements[4] = U;
  transferBuffer.elements[5] = Y;
  transferBuffer.elements[6] = V;
  transferBuffer.elements[7] = Y;
  //Load it into the vector unit
  c = transferBuffer.v;
#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( inData+16, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( inData+64, prefetchSize, 3 );
#endif
  //expand the UInt8's to short's  (prologue: load iteration 0's data)
  loadhi = (vector signed short) vec_mergeh( zero, inData[0] );
  loadlo = (vector signed short) vec_mergel( zero, inData[0] );
  loadhi1 = (vector signed short) vec_mergeh( zero, inData[1] );
  loadlo1 = (vector signed short) vec_mergel( zero, inData[1] );

  // NOTE(review): the loop runs height*width iterations consuming 2 vectors
  // each, which already covers the whole image; the look-ahead loads of
  // inData[2]/inData[3] on the last iteration and the epilogue's two extra
  // stores therefore touch 32 bytes past the end of the buffer — presumably
  // the image allocation carries slack; confirm against imageStruct's
  // allocator before reusing this pattern.
  for ( h=0; h<height; h++) {
    for (w=0; w<width; w++) {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( inData+16, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( inData+64, prefetchSize, 3 );
#endif
      //add the constant to it
      hi = vec_add( loadhi, c );
      lo = vec_add( loadlo, c );
      hi1 = vec_add( loadhi1, c );
      lo1 = vec_add( loadlo1, c );
      //expand the UInt8's to short's  (load next iteration's data)
      loadhi = (vector signed short) vec_mergeh( zero, inData[2] );
      loadlo = (vector signed short) vec_mergel( zero, inData[2] );
      loadhi1 = (vector signed short) vec_mergeh( zero, inData[3] );
      loadlo1 = (vector signed short) vec_mergel( zero, inData[3] );
      //pack the result back down, with saturation
      inData[0] = vec_packsu( hi, lo );
      inData++;
      inData[0] = vec_packsu( hi1, lo1 );
      inData++;
    }
  }
  //
  // finish the last iteration after the loop
  //
  hi = vec_add( loadhi, c );
  lo = vec_add( loadlo, c );
  hi1 = vec_add( loadhi1, c );
  lo1 = vec_add( loadlo1, c );
  //pack the result back down, with saturation
  inData[0] = vec_packsu( hi, lo );
  inData++;
  inData[0] = vec_packsu( hi1, lo1 );
  inData++;
#ifndef PPC970
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
  //end of working altivec function
#endif
}
// Convert packed UYVY (YUV422) into planar YV12-style short planes: two Y
// rows (pY/pY2), plus chroma planes via pU/pV.  Processes two image rows per
// outer pass (row = ysize/2) and 16 pixels (two vectors) per inner pass.
// Y values are left-shifted by 7 and chroma by 8 after a -128 bias —
// presumably the PDP fixed-point layout expected downstream; confirm there.
// NOTE(review): pCr aliases pU and pCb aliases pV — the Cr/Cb names look
// swapped relative to the usual U=Cb, V=Cr convention; behavior matches the
// uPerm/vPerm selections either way, but verify against callers.
void YUV422_to_YV12_altivec(short*pY, short*pY2, short*pU, short*pV,
                            const unsigned char *gem_image, int xsize, int ysize)
{
  // UYVY UYVY UYVY UYVY
  const vector unsigned char *pixels1=reinterpret_cast<const vector unsigned char *>(gem_image);
  const vector unsigned char *pixels2=reinterpret_cast<const vector unsigned char *>(gem_image+(xsize*2));
  // PDP packet to be filled:
  // first Y plane
  vector signed short *py1 = reinterpret_cast<vector signed short *>(pY);
  // 2nd Y pixel
  vector signed short *py2 = reinterpret_cast<vector signed short *>(pY2);
  // U plane
  vector signed short *pCr = reinterpret_cast<vector signed short *>(pU);
  // V plane
  vector signed short *pCb = reinterpret_cast<vector signed short *>(pV);
  vector signed short uvSub = static_cast<vector signed short>( 128, 128, 128, 128,
                                                                128, 128, 128, 128 );
  vector unsigned short yShift = static_cast<vector unsigned short>( 7, 7, 7, 7, 7, 7, 7, 7 );
  vector unsigned short uvShift = static_cast<vector unsigned short>( 8, 8, 8, 8, 8, 8, 8, 8 );
  vector signed short tempY1, tempY2, tempY3, tempY4,
    tempUV1, tempUV2, tempUV3, tempUV4, tempUV5, tempUV6;
  // permute indices >= 16 select from the second operand (zeroVec here), so
  // they zero-extend each byte into the high half of a big-endian short
  vector unsigned char uvPerm = static_cast<vector unsigned char>( 16, 0, 17, 4, 18, 8, 19, 12, // u0..u3
                                                                   20, 2, 21, 6, 22, 10, 23, 14 ); // v0..v3
  vector unsigned char uPerm = static_cast<vector unsigned char>( 0, 1, 2, 3, 4, 5, 6, 7,
                                                                  16,17,18,19,20,21,22,23);
  vector unsigned char vPerm = static_cast<vector unsigned char>( 8, 9, 10,11,12,13,14,15,
                                                                  24,25,26,27,28,29,30,31);
  // NOTE(review): 23/25 in the second half break the 20,9,21,11,22,13,23,15
  // pattern, but any index >= 16 reads zeroVec, so the result is identical.
  vector unsigned char yPerm = static_cast<vector unsigned char>( 16, 1, 17, 3, 18, 5, 19, 7, // y0..y3
                                                                  20, 9, 21, 11, 23, 13, 25, 15);// y4..y7
  vector unsigned char zeroVec = static_cast<vector unsigned char>(0);
  int row=ysize>>1;
  int cols=xsize>>4;
#if 0
# ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( pu, prefetchSize, 0 );
  vec_dst( pv, prefetchSize, 0 );
  vec_dst( py1, prefetchSize, 0 );
  vec_dst( py2, prefetchSize, 0 );
# endif
#endif
  while(row--){
    int col=cols;
    while(col--){
#if 0
# ifndef PPC970
      vec_dst( );
# endif
#endif
      // split UYVY into zero-extended UV and Y shorts for two vectors
      tempUV1 = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, uvPerm));
      tempY1 = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, yPerm));
      tempY2 = static_cast<vector signed short>(vec_perm( *pixels2, zeroVec, yPerm));
      pixels1++;pixels2++;
      tempUV2 = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, uvPerm));
      tempY3 = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, yPerm));
      tempY4 = static_cast<vector signed short>(vec_perm( *pixels2, zeroVec, yPerm));
      pixels1++;pixels2++;
      // re-center chroma around 0 and scale to fixed point
      tempUV3 = vec_sub( tempUV1, uvSub );
      tempUV4 = vec_sub( tempUV2, uvSub );
      tempUV5 = vec_sl( tempUV3, uvShift );
      tempUV6 = vec_sl( tempUV4, uvShift );
      *pCb = vec_perm( tempUV5, tempUV6, uPerm );
      *pCr = vec_perm( tempUV5, tempUV6, vPerm );
      pCr++; pCb++;
      // scale luma to fixed point for both rows
      *py1++ = vec_sl( tempY1, yShift);
      *py2++ = vec_sl( tempY2, yShift);
      *py1++ = vec_sl( tempY3, yShift);
      *py2++ = vec_sl( tempY4, yShift);
    }
    // skip the row each pointer's partner already covered
    py1+=(xsize>>3);
    py2+=(xsize>>3);
    pixels1+=(xsize*2)>>4;
    pixels2+=(xsize*2)>>4;
  }
}
/*
 * MPEG-1 intra-block inverse quantization, AltiVec version.
 * Computes, for each of the 64 coefficients: val = (|src| * qmat * mquant)
 * >> 4, then applies the MPEG-1 "mismatch" oddification (subtract 1 from
 * even nonzero magnitudes via the or/andc/sub trick), restores the sign,
 * and clamps to [-2048, 2047].  The DC coefficient is recomputed separately
 * as src[0] << (3 - dc_prec) and stored last.  The "#if 1" variant is
 * software-pipelined: each loop pass loads the NEXT 8 coefficients while
 * finishing the current ones, with the final group handled after the loop.
 * vu8/vu16/vs16/vs32 are project casting macros; IQUANT_INTRA_PDECL,
 * AMBER_*, DATA_STREAM_CONTROL etc. come from surrounding headers.
 * NOTE(review): the range-check error string says "iquant_intra_m2" inside
 * iquant_intra_m1 — looks like a copy/paste slip in the message text.
 */
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short eqzero, ltzero;
    vector signed short val, t0;
    vector signed short zero, one;
    vector unsigned int four;
    vector signed short min, max;
    int offset, offset2;
    int16_t dst0;
    union {
        vector unsigned short vu16;
        unsigned short mquant;
        vector signed int vs32;
        struct {
            signed int pad[3];
            signed int sum;
        } s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
        mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)",
            wsp->intra_q_mat);
    if (NOT_VECTOR_ALIGNED(src))
        mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src);
    if (NOT_VECTOR_ALIGNED(dst))
        mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst);
    for (i = 0; i < 64; i++)
        if (src[i] < -256 || src[i] > 255)
            mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)",
                i, src[i]);
#endif /* }}} */

    AMBER_START;

    /* DC path: computed scalar, written after the vector stores so the
     * vectorized pass may freely overwrite dst[0] in the meantime */
    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;
#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    zero = vec_splat_s16(0);
    one = vec_splat_s16(1);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max); /* -1 - 2047 == -2048 */
    /* }}} */
    offset = 0;

#if 1
    /* prologue: load the first 8 coefficients */
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
        /* intra_q[i] * mquant */
        vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));
        /* save sign */
        ltzero = vec_cmplt(vsrc, zero);
        eqzero = vec_cmpeq(vsrc, zero);
        /* val = abs(src) */
        t0 = vec_sub(zero, vsrc);
        val = vec_max(t0, vsrc);
        /* val = (src * quant) >> 4 */
        vs32(t0) = vec_mule(val, vs16(vqmat));
        vs32(val) = vec_mulo(val, vs16(vqmat));
        vs32(t0) = vec_sra(vs32(t0), four);
        vs16(t0) = vec_pack(vs32(t0), vs32(t0));
        vs32(val) = vec_sra(vs32(val), four);
        vs16(val) = vec_pack(vs32(val), vs32(val));
        val = vec_mergeh(vs16(t0), vs16(val));
        /* hoist the loads for the next group before finishing this one */
        offset2 = offset;
        offset += 8*sizeof(int16_t);
        vsrc = vec_ld(offset, (signed short*)src);
        vqmat = vec_ld(offset, (unsigned short*)qmat);
        /* val = val - 1&~(val|val==0) */
        t0 = vec_or(val, eqzero);
        t0 = vec_andc(one, t0);
        val = vec_sub(val, t0);
        /* restore sign */
        t0 = vec_sub(zero, val);
        val = vec_sel(val, t0, ltzero);
        /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
        val = vec_min(val, max);
        val = vec_max(val, min);
        vec_st(val, offset2, dst);
    } while (--i);

    /* epilogue: same body for the final 8 coefficients, no look-ahead load */
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));
    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);
    eqzero = vec_cmpeq(vsrc, zero);
    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);
    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));
    /* val = val - 1&~(val|val==0) */
    t0 = vec_or(val, eqzero);
    t0 = vec_andc(one, t0);
    val = vec_sub(val, t0);
    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);
    /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
    val = vec_min(val, max);
    val = vec_max(val, min);
    vec_st(val, offset, dst);
#else /* {{{ */
    /* straightforward (non-pipelined) reference version, kept for comparison */
    i = (64/8);
    do {
        vsrc = vec_ld(offset, (signed short*)src);
        vqmat = vec_ld(offset, (unsigned short*)qmat);
        /* intra_q[i] * mquant */
        vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));
        /* save sign */
        ltzero = vec_cmplt(vsrc, zero);
        eqzero = vec_cmpeq(vsrc, zero);
        /* val = abs(src) */
        t0 = vec_sub(zero, vsrc);
        val = vec_max(t0, vsrc);
        /* val = (src * quant) >> 4 */
        vs32(t0) = vec_mule(val, vs16(vqmat));
        vs32(val) = vec_mulo(val, vs16(vqmat));
        vs32(t0) = vec_sra(vs32(t0), four);
        vs16(t0) = vec_pack(vs32(t0), vs32(t0));
        vs32(val) = vec_sra(vs32(val), four);
        vs16(val) = vec_pack(vs32(val), vs32(val));
        val = vec_mergeh(vs16(t0), vs16(val));
        /* val = val - 1&~(val|val==0) */
        t0 = vec_or(val, eqzero);
        t0 = vec_andc(one, t0);
        val = vec_sub(val, t0);
        /* restore sign */
        t0 = vec_sub(zero, val);
        val = vec_sel(val, t0, ltzero);
        /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
        val = vec_min(val, max);
        val = vec_max(val, min);
        vec_st(val, offset, dst);
        offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    dst[0] = dst0;

    AMBER_STOP;
}
// Issue a data-stream-touch prefetch hint for `buf` on channel 2.
// The control word carries `len` in its low bits with bit 9 (512) forced on;
// a len >= 256 collapses the low field to just 512.
void func(unsigned char *buf, unsigned len)
{
  const unsigned control = (len >= 256 ? 0 : len) | 512;
  vec_dst(buf, control, 2);
}
/* * add prediction and prediction error, saturate to 0...255 * pred % 8 == 0 * cur % 8 == 0 * lx % 16 == 0 * blk % 16 == 0 */ void add_pred_altivec(ADD_PRED_PDECL) { #ifdef ALTIVEC_DST unsigned int dst; #endif uint8_t *pCA, *pCB, *pPA, *pPB; int16_t *pBA, *pBB; vector unsigned char zero; vector unsigned char predA, predB, curA, curB; vector signed short blkA, blkB; #ifdef ALTIVEC_VERIFY if (NOT_VECTOR_ALIGNED(lx)) mjpeg_error_exit1("add_pred: lx %% 16 != 0, (%d)", lx); if (NOT_VECTOR_ALIGNED(blk)) mjpeg_error_exit1("add_pred: blk %% 16 != 0, (%d)", blk); #ifdef ALTIVEC_DST if (lx & (~0xffff) != 0) mjpeg_error_exit1("add_pred: lx=%d > vec_dst range", lx); #endif if (((unsigned long)pred & 0xf) != ((unsigned long)cur & 0xf)) mjpeg_error_exit1("add_pred: (pred(0x%X) %% 16) != (cur(0x%X) %% 16)", pred, cur); if ((((unsigned long)pred) & 0x7) != 0) mjpeg_error_exit1("add_pred: pred %% 8 != 0, (0x%X)", pred); if ((((unsigned long)cur) & 0x7) != 0) mjpeg_error_exit1("add_pred: cur %% 8 != 0, (0x%X)", cur); #endif /* MACROS expand differently depending on input */ #define ABBA(symbol,ab) _ABBA(ABBA_##ab,symbol) /* {{{ */ #define _ABBA(abba_ab,symbol) abba_ab(symbol) #define ABBA_A(symbol) symbol##B #define ABBA_B(symbol) symbol##A /* }}} */ #define HLLH(symbol,hl) _HLLH(HLLH_##hl,symbol) /* {{{ */ #define _HLLH(hllh_hl,symbol) hllh_hl(symbol) #define HLLH_h(symbol) symbol##l #define HLLH_l(symbol) symbol##h /* }}} */ #define PACKSU(hl,st,ld) _PACKSU(PACKSU_##hl,st,ld) /* {{{ */ #define _PACKSU(psu,st,ld) psu(st,ld) #define PACKSU_h(st,ld) vec_packsu(st,ld) #define PACKSU_l(st,ld) vec_packsu(ld,st) /* }}} */ #define PERFORM_ITERATION(hl,ab,iter) /* iter {{{ */ \ pred##ab = vec_merge##hl(zero, pred##ab); \ cur##ab = HLLH(vec_merge,hl)(zero, cur##ab); \ blk##ab = vec_add(blk##ab, vs16(pred##ab)); \ blk##ab = vec_max(blk##ab, vs16(zero)); \ cur##ab = PACKSU(hl, vu16(blk##ab), vu16(cur##ab)); \ vec_st(cur##ab, 0, pC##ab); \ /* }}} */ #define PREPARE_ITERATION(hl,ab,iter) /* 
iter {{{ */ \ pP##ab = ABBA(pP,ab) + lx; \ pC##ab = ABBA(pC,ab) + lx; \ pB##ab = ABBA(pB,ab) + 8; \ pred##ab = vec_ld(0, pP##ab); \ cur##ab = vec_ld(0, pC##ab); \ blk##ab = vec_ld(0, pB##ab); \ /* }}} */ #define NO_RESCHEDULE asm volatile ("") AMBER_START; pPA = pred; pCA = cur; pBA = blk; #ifdef ALTIVEC_DST dst = 0x01080000 | lx; vec_dst(pPA, dst, 0); vec_dst(pCA, dst, 1); dst = 0x01080010; vec_dst(pBA, dst, 2); #endif predA = vec_ld(0, pPA); curA = vec_ld(0, pCA); NO_RESCHEDULE; pPB = pPA + lx; NO_RESCHEDULE; blkA = vec_ld(0, pBA); NO_RESCHEDULE; pCB = pCA + lx; NO_RESCHEDULE; predB = vec_ld(0, pPB); NO_RESCHEDULE; pBB = pBA + 8; NO_RESCHEDULE; curB = vec_ld(0, pCB); NO_RESCHEDULE; zero = vec_splat_u8(0); NO_RESCHEDULE; blkB = vec_ld(0, pBB); if (VECTOR_ALIGNED(pPA)) { PERFORM_ITERATION(h,A,0); PREPARE_ITERATION(h,A,2); /* prepare next A iteration */ PERFORM_ITERATION(h,B,1); PREPARE_ITERATION(h,B,3); /* prepare next B iteration */ PERFORM_ITERATION(h,A,2); PREPARE_ITERATION(h,A,4); PERFORM_ITERATION(h,B,3); PREPARE_ITERATION(h,B,5); PERFORM_ITERATION(h,A,4); PREPARE_ITERATION(h,A,6); PERFORM_ITERATION(h,B,5); PREPARE_ITERATION(h,B,7); PERFORM_ITERATION(h,A,6); PERFORM_ITERATION(h,B,7); } else { PERFORM_ITERATION(l,A,0); PREPARE_ITERATION(l,A,2); /* prepare next A iteration */ PERFORM_ITERATION(l,B,1); PREPARE_ITERATION(l,B,3); /* prepare next B iteration */ PERFORM_ITERATION(l,A,2); PREPARE_ITERATION(l,A,4); PERFORM_ITERATION(l,B,3); PREPARE_ITERATION(l,B,5); PERFORM_ITERATION(l,A,4); PREPARE_ITERATION(l,A,6); PERFORM_ITERATION(l,B,5); PREPARE_ITERATION(l,B,7); PERFORM_ITERATION(l,A,6); PERFORM_ITERATION(l,B,7); } #ifdef ALTIVEC_DST vec_dssall(); #endif AMBER_STOP; }
// One Jacobi relaxation row of the pressure solve: for interior rows,
// pressure[x] = (p[x-1] + p[x+1] + p[y-1] + p[y+1] - divergence(vel)) / 4;
// boundary rows (y == 0 and y == h-1) just copy the adjacent row.
// Three implementations are selected by preprocessor: AltiVec, SSE3, scalar.
// NOTE(review): the vPressure/vVelX/etc. locals are declared under X_SIMD
// but used under __APPLE_ALTIVEC__/__SSE3__ — presumably X_SIMD is defined
// whenever either SIMD path is; confirm in the build config.  The `mask`
// vector in the AltiVec path is unused here (presumably consumed by the
// PRESSURE_VEC_* macros defined elsewhere).
void fluid_genPressure_black(fluid *in_f, int y, pvt_fluidMode *mode)
{
  struct pressure *p = &mode->pressure;
  int w = fieldWidth(p->velX);
  int h = fieldHeight(p->velX);
#ifdef __APPLE_ALTIVEC__
#elif defined __SSE3__
#else
  int sx = fieldStrideX(p->velX); // only the scalar path indexes by x stride
#endif
  int sy = fieldStrideY(p->velY);
  float *velX = fieldData(p->velX);
  float *velY = fieldData(p->velY);
  float *pressure = fieldData(p->pressure);
  if (y == 0) {
    // top boundary: copy row 1 into row 0
#ifdef X_SIMD
    x128f *vPressure = (x128f*)fluidFloatPointer(pressure, 0*sy);
    x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, 1*sy);
    int x;
    w/=4;
    for (x=0; x<w; x++) {
      vPressure[x] = vPressureP[x];
    }
#else
    int x;
    for (x=0; x<w; x++) {
      fluidFloatPointer(pressure,x*sx)[0] = fluidFloatPointer(pressure,x*sx + sy)[0];
    }
#endif
  }
  else if (y == h-1) {
    // bottom boundary: copy row h-2 into row h-1
#ifdef X_SIMD
    x128f *vPressure = (x128f*)fluidFloatPointer(pressure, y*sy);
    x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy);
    int x;
    w/=4;
    for (x=0; x<w; x++) {
      vPressure[x] = vPressureP[x];
    }
#else
    int x;
    for (x=0; x<w; x++) {
      fluidFloatPointer(pressure,x*sx + y*sy)[0] = fluidFloatPointer(pressure,x*sx + (y-1)*sy)[0];
    }
#endif
  }
  else {
#ifdef X_SIMD
    float *vPressureRow = fluidFloatPointer(pressure, y*sy);
    x128f *vPressure = (x128f*)vPressureRow;
    x128f *vVelX = (x128f*)fluidFloatPointer(velX, y*sy);
    x128f *vPressureN = (x128f*)fluidFloatPointer(pressure, (y+1)*sy);
    x128f *vVelYN = (x128f*)fluidFloatPointer(velY, (y+1)*sy);
    x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy);
    x128f *vVelYP = (x128f*)fluidFloatPointer(velY, (y-1)*sy);
    x128f div4 = {0.0f, 1.0f/4.0f, 0.0f, 1.0f/4.0f};
    x128f mask = {1.0f, 0.0f, 1.0f, 0.0f};
#endif
#ifdef __APPLE_ALTIVEC__
    //int myTempVariable = __mfspr( 1023 );
    vector float vZero = {0,0,0,0};
    // prefetch the four rows this pass touches (write-touch for pressure)
    vec_dstst(vPressure, 0x01000001, 0);
    vec_dst(vVelX, 0x01000001, 1);
    vec_dst(vVelYN, 0x01000001, 2);
    vec_dst(vVelYP, 0x01000001, 3);
    int x;
    {
      // first vector of the row: left neighbor comes in as zero, and the
      // x==0 boundary cell is fixed up afterwards from its right neighbor
      vector float tmp;
      //Compute shifts
      vector float sl_p = vec_sld(vPressure[0], vPressure[1],4);
      vector float sr_p = vec_sld(vZero, vPressure[0], 12);
      vector float sl_vx = vec_sld(vVelX[0], vVelX[1],4);
      vector float sr_vx = vec_sld(vZero, vVelX[0], 12);
      //Sum everything!!!
      tmp = vec_add(sl_p, sr_p);
      tmp = vec_add(tmp, vPressureN[0]);
      tmp = vec_add(tmp, vPressureP[0]);
      tmp = vec_sub(tmp, sl_vx);
      tmp = vec_add(tmp, sr_vx);
      tmp = vec_sub(tmp, vVelYN[0]);
      tmp = vec_add(tmp, vVelYP[0]);
      vPressure[0] = vec_madd(tmp, div4, vZero);
      vPressureRow[0] = vPressureRow[1];
    }
    x=1;
    // unrolled-by-4 interior (macros defined elsewhere in this file)
    while (x<w/4-5) {
      PRESSURE_VEC_PRE(0)
      PRESSURE_VEC_PRE(1)
      PRESSURE_VEC_PRE(2)
      PRESSURE_VEC_PRE(3)
      PRESSURE_VEC_SHIFT(0)
      PRESSURE_VEC_SHIFT(1)
      PRESSURE_VEC_SHIFT(2)
      PRESSURE_VEC_SHIFT(3)
      PRESSURE_VEC_END(0)
      PRESSURE_VEC_END(1)
      PRESSURE_VEC_END(2)
      PRESSURE_VEC_END(3)
      x+=4;
    }
    while (x<w/4-1) {
      PRESSURE_VEC_PRE(0)
      PRESSURE_VEC_SHIFT(0)
      PRESSURE_VEC_END(0)
      x++;
    }
    {
      // last vector of the row: right neighbor comes in as zero, and the
      // x==w-1 boundary cell is fixed up afterwards from its left neighbor
      vector float tmp;
      //Compute shifts
      vector float sl_p = vec_sld(vPressure[x], vZero,4);
      vector float sr_p = vec_sld(vPressure[x-1], vPressure[x], 12);
      vector float sl_vx = vec_sld(vVelX[x], vZero,4);
      vector float sr_vx = vec_sld(vVelX[x-1], vVelX[x], 12);
      //Sum everything!!!
      tmp = vec_add(sl_p, sr_p);
      tmp = vec_add(tmp, vPressureN[x]);
      tmp = vec_add(tmp, vPressureP[x]);
      tmp = vec_sub(tmp, sl_vx);
      tmp = vec_add(tmp, sr_vx);
      tmp = vec_sub(tmp, vVelYN[x]);
      tmp = vec_add(tmp, vVelYP[x]);
      vPressure[x] = vec_madd(tmp, div4, vZero);
      vPressureRow[w-1] = vPressureRow[w-2];
    }
#elif defined __SSE3__
    int x;
    {
      // first vector of the row (same boundary treatment as the AltiVec path)
      __m128 tmp;
      //Compute shifts
      __m128 sl_p = _mm_srli_sf128(vPressure[0],4);
      sl_p = _mm_add_ps(sl_p,_mm_slli_sf128(vPressure[1],12));
      __m128 sr_p = _mm_slli_sf128(vPressure[0],4);
      __m128 sl_vx = _mm_srli_sf128(vVelX[0],4);
      sl_vx = _mm_add_ps(sl_vx,_mm_slli_sf128(vVelX[1],12));
      __m128 sr_vx = _mm_slli_sf128(vVelX[0],4);
      //Sum everything!!!
      tmp = _mm_add_ps(sl_p, sr_p);
      tmp = _mm_add_ps(tmp, vPressureN[0]);
      tmp = _mm_add_ps(tmp, vPressureP[0]);
      tmp = _mm_sub_ps(tmp, sl_vx);
      tmp = _mm_add_ps(tmp, sr_vx);
      tmp = _mm_sub_ps(tmp, vVelYN[0]);
      tmp = _mm_add_ps(tmp, vVelYP[0]);
      vPressure[0] = _mm_mul_ps(tmp, div4);
      vPressureRow[0] = vPressureRow[1];
    }
    x=1;
    // unrolled-by-3 interior (macros defined elsewhere in this file)
    while (x<w/4-9) {
      //Compute shifts (1)
      PRESSURE_SSE_PRE(0);
      PRESSURE_SSE_PRE(1);
      PRESSURE_SSE_PRE(2);
      //Sum everything!!! (1)
      PRESSURE_SSE_POST(0);
      PRESSURE_SSE_POST(1);
      PRESSURE_SSE_POST(2);
      x+=3;
    }
    while (x<w/4-1) {
      //Compute shifts
      PRESSURE_SSE_PRE(0);
      //Sum everything!!!
      PRESSURE_SSE_POST(0);
      x++;
    }
    {
      // last vector of the row
      __m128 tmp;
      //Compute shifts
      __m128 sl_p = _mm_srli_sf128(vPressure[x],4);
      __m128 sr_p = _mm_slli_sf128(vPressure[x],4);
      sr_p = _mm_add_ps(sr_p,_mm_srli_sf128(vPressure[x-1],12));
      __m128 sl_vx = _mm_srli_sf128(vVelX[x],4);
      __m128 sr_vx = _mm_slli_sf128(vVelX[x],4);
      sr_vx = _mm_add_ps(sr_vx,_mm_srli_sf128(vVelX[x-1],12));
      //Sum everything!!!
      tmp = _mm_add_ps(sl_p, sr_p);
      tmp = _mm_add_ps(tmp, vPressureN[x]);
      tmp = _mm_add_ps(tmp, vPressureP[x]);
      tmp = _mm_sub_ps(tmp, sl_vx);
      tmp = _mm_add_ps(tmp, sr_vx);
      tmp = _mm_sub_ps(tmp, vVelYN[x]);
      tmp = _mm_add_ps(tmp, vVelYP[x]);
      vPressure[x] = _mm_mul_ps(tmp, div4);
      vPressureRow[w-1] = vPressureRow[w-2];
    }
#else
    // scalar fallback: rolling window of left/current pressure and velX
    float lastPressureX = fluidFloatPointer(pressure,sx + y*sy)[0];
    float lastVelX = fluidFloatPointer(velX, y*sy)[0];
    float curPressureX = lastPressureX;
    float curVelX = fluidFloatPointer(velX, sx + y*sy)[0];
    fluidFloatPointer(pressure,y*sy)[0] = lastPressureX;
    int x;
    int curxy = sx + y*sy;
    for (x=1; x<w-1; x++) {
      float nextPressureX = fluidFloatPointer(pressure,curxy + sx)[0];
      float nextVelX = fluidFloatPointer(velX,curxy + sx)[0];
      fluidFloatPointer(pressure,curxy)[0]
        = ( lastPressureX + nextPressureX
          + fluidFloatPointer(pressure,curxy - sy)[0]
          + fluidFloatPointer(pressure,curxy + sy)[0]
          - ( nextVelX - lastVelX
            + fluidFloatPointer(velY,curxy + sy)[0]
            - fluidFloatPointer(velY,curxy - sy)[0])) / 4.0f;
      lastPressureX = curPressureX;
      curPressureX = nextPressureX;
      lastVelX = curVelX;
      curVelX = nextVelX;
      curxy += sx;
    }
    // right boundary: copy the neighbor
    fluidFloatPointer(pressure,(w-1)*sx + y*sy)[0] = fluidFloatPointer(pressure,(w-2)*sx + y*sy)[0];
#endif
  }
}
// Convert planar YV12-style short planes back into packed UYVY (YUV422):
// luma is arithmetic-shifted right by 7 and chroma by 8 (undoing the
// fixed-point scaling of the forward conversion), chroma gets its +128 bias
// restored, then permutes interleave U/V with the two Y rows and the result
// is saturating-packed to bytes.  Two image rows per outer pass, 16 pixels
// per inner pass.
void YV12_to_YUV422_altivec(const short*Y, const short*U, const short*V,
                            unsigned char *data, int xsize, int ysize)
{
  // from [email protected], 3/15/2005
  // #1. Don't use the pointers. Use vec_ld with an index that you increment (by 16) instead.
  vector unsigned char *pixels1=reinterpret_cast<vector unsigned char *>(data);
  vector unsigned char *pixels2=reinterpret_cast<vector unsigned char *>(data+(xsize*2));
  const vector unsigned short *py1 = reinterpret_cast<const vector unsigned short *>(Y);
  const vector unsigned short *py2 = reinterpret_cast<const vector unsigned short *>(Y + xsize );
  const vector unsigned short *pu = reinterpret_cast<const vector unsigned short *>(U);
  const vector unsigned short *pv = reinterpret_cast<const vector unsigned short *>(V);
  vector unsigned short uvAdd = static_cast<vector unsigned short>( 128, 128, 128, 128,
                                                                    128, 128, 128, 128 );
  vector unsigned short yShift = static_cast<vector unsigned short>( 7, 7, 7, 7, 7, 7, 7, 7 );
  vector unsigned short uvShift = static_cast<vector unsigned short>( 8, 8, 8, 8, 8, 8, 8, 8 );
  vector unsigned short tempU, tempV, doneU, doneV,
    tempY1, tempY2, tempY3, tempY4,
    uv1, uv2,
    out1, out2, out3, out4, out5, out6, out7, out8;
  // interleaving permutes: Perm1 takes the even shorts of both operands,
  // Perm2 the odd shorts — used first for U/V pairing, then for UV/Y pairing
  vector unsigned char Perm1 = static_cast<vector unsigned char>( 0, 1, 16, 17, 2, 3, 18, 19,
                                                                  4, 5, 20, 21, 6, 7, 22, 23 );
  vector unsigned char Perm2 = static_cast<vector unsigned char>( 8, 9, 24, 25, 10, 11, 26, 27,
                                                                  12, 13, 28, 29, 14, 15, 30, 31 );
  int row=ysize>>1;
  int cols=xsize>>4;
#if 0
# ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( pu, prefetchSize, 0 );
  vec_dst( pv, prefetchSize, 0 );
  vec_dst( py1, prefetchSize, 0 );
  vec_dst( py2, prefetchSize, 0 );
# endif
#endif
  while(row--){
    int col=cols;
    while(col--){
#if 0
# ifndef PPC970
      vec_dst( );
# endif
#endif
      // undo the chroma fixed-point scaling and restore the +128 bias
      tempU = vec_sra( (*pu++), uvShift );
      tempV = vec_sra( (*pv++), uvShift );
      doneU = vec_add( tempU, uvAdd );
      doneV = vec_add( tempV, uvAdd );
      uv1 = vec_perm( doneU, doneV, Perm1 ); // uvuvuvuv uvuvuvuv
      uv2 = vec_perm( doneU, doneV, Perm2 );
      tempY1 = vec_sra( (*py1++), yShift );
      tempY2 = vec_sra( (*py2++), yShift );
      out1 = vec_perm( uv1, tempY1, Perm1 ); //fill Y's, U's & V's
      out2 = vec_perm( uv1, tempY1, Perm2 );
      out3 = vec_perm( uv1, tempY2, Perm1 ); //fill 2nd Y's, U's & V's
      out4 = vec_perm( uv1, tempY2, Perm2 );
      *pixels1 = vec_packsu( out1, out2 ); // saturate shorts down to UYVY bytes
      *pixels2 = vec_packsu( out3, out4 );
      pixels1++; pixels2++;
      tempY3 = vec_sra( (*py1++), yShift ); // load second set of Y's
      tempY4 = vec_sra( (*py2++), yShift );
      out5 = vec_perm( uv2, tempY3, Perm1 );
      out6 = vec_perm( uv2, tempY3, Perm2 );
      out7 = vec_perm( uv2, tempY4, Perm1 );
      out8 = vec_perm( uv2, tempY4, Perm2 );
      *pixels1 = vec_packsu( out5, out6 );
      *pixels2 = vec_packsu( out7, out8 );
      pixels1++; pixels2++;
    }
    // skip the row each pointer's partner already covered
    pixels1+=(xsize*2)>>4;
    pixels2+=(xsize*2)>>4;
    py1+=xsize>>3;
    py2+=xsize>>3;
  }
}
/* Background removal on packed UYVY images with AltiVec.
 * A reference frame is captured into m_savedImage on reset; afterwards each
 * pixel whose Y, U and V all fall within +/- m_Yrange / m_Urange / m_Vrange
 * of the reference is blanked (Y -> 0, UV -> 128).
 */
void pix_background :: processYUVAltivec(imageStruct &image)
{
  register int h,w,i,j,width;
  int pixsize = image.xsize * image.ysize * image.csize;
  h = image.ysize;
  w = image.xsize/8;      // 8 UYVY pixels (16 bytes) per vector
  width = image.xsize/8;

  //check to see if the buffer isn't 16byte aligned (highly unlikely)
  if (image.ysize*image.xsize % 16 != 0){
    error("image not properly aligned for Altivec - try something SD or HD maybe?");
    return;
  }

  // scalar/vector overlay used to build constant vectors element by element
  union{
    unsigned short s[8];
    vector unsigned short v;
  }shortBuffer;

  // (re)capture the reference frame whenever geometry or format changes
  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    m_reset = 0;
  }

  register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave;
  register vector unsigned short hiImage, loImage;
  register vector unsigned short Yrange, UVrange, Yblank,UVblank,blank;
  register vector bool short Ymasklo,Ymaskhi, UVmaskhi;
  register vector unsigned short Yhi,Ylo,UVhi,UVlo;
  register vector unsigned char one = vec_splat_u8(1);
  register vector unsigned short sone = vec_splat_u16(1);
  register vector unsigned int Uhi, Ulo, Vhi, Vlo,Ures,Vres;
  register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo;

  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data;

  // Y tolerance, splatted across all 8 lanes
  shortBuffer.s[0] = m_Yrange;
  Yrange = shortBuffer.v;
  Yrange = vec_splat(Yrange,0);

  // "blank" chroma pattern: U=128, V=128 in the even (UV) lanes
  shortBuffer.s[0] = 128;
  shortBuffer.s[1] = 0;
  shortBuffer.s[2] = 128;
  shortBuffer.s[3] = 0;
  shortBuffer.s[4] = 128;
  shortBuffer.s[5] = 0;
  shortBuffer.s[6] = 128;
  shortBuffer.s[7] = 0;
  blank = shortBuffer.v;

  shortBuffer.s[0] = 0;
  Yblank = shortBuffer.v;
  Yblank = vec_splat(Yblank,0);   // Y replacement value (0) in all lanes

  shortBuffer.s[0] = 128;
  UVblank = shortBuffer.v;
  UVblank = vec_splat(UVblank,0); // UV replacement value (128) in all lanes

  // alternating U/V tolerances matching the U V U V lane layout
  shortBuffer.s[0] = m_Urange;
  shortBuffer.s[1] = m_Vrange;
  shortBuffer.s[2] = m_Urange;
  shortBuffer.s[3] = m_Vrange;
  shortBuffer.s[4] = m_Urange;
  shortBuffer.s[5] = m_Vrange;
  shortBuffer.s[6] = m_Urange;
  shortBuffer.s[7] = m_Vrange;
  UVrange = shortBuffer.v;

  //setup the cache prefetch -- A MUST!!!
  // NOTE(review): prefetchSize is computed outside the PPC970 guard even
  // though it is only consumed inside it -- confirm GetPrefetchConstant is
  // available on PPC970 builds.
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
#ifndef PPC970
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
#endif //PPC970

  for ( i=0; i<h; i++){
    for (j=0; j<w; j++)
    {
#ifndef PPC970
      //this function is probably memory bound on most G4's -- what else is new?
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
#endif

      //separate the U and V from Y
      // vec_mule picks the even bytes (U,V); vec_mulo the odd bytes (Y);
      // multiplying by 1 just widens them to shorts
      UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
      UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);

      //vec_mulo Y * 1 to short vector Y Y Y Y shorts
      Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
      Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);

      // acceptance window around the saved (reference) frame values
      Yhi = vec_adds(Yres2,Yrange);
      Ylo = vec_subs(Yres2,Yrange);

      //go to ints for comparison
      UVhi = vec_adds(UVres2,UVrange);
      UVlo = vec_subs(UVres2,UVrange);

      // widen the interleaved U V shorts to separate U and V int vectors
      Uhi = vec_mule(sone,UVhi);
      Ulo = vec_mule(sone,UVlo);
      Vhi = vec_mulo(sone,UVhi);
      Vlo = vec_mulo(sone,UVlo);
      Ures = vec_mule(sone,UVres1);
      Vres = vec_mulo(sone,UVres1);

      // inside-window tests: lo < value < hi for U and for V
      Umasklo = vec_cmpgt(Ures,Ulo);
      Umaskhi = vec_cmplt(Ures,Uhi);
      Vmasklo = vec_cmpgt(Vres,Vlo);
      Vmaskhi = vec_cmplt(Vres,Vhi);

      Umaskhi = vec_and(Umaskhi,Umasklo);
      Vmaskhi = vec_and(Vmaskhi,Vmasklo);

      // NOTE(review): these two assignments compute the same value -- both
      // lanes end up holding (U-in-range AND V-in-range); verify intended.
      Umasklo = vec_and(Umaskhi,Vmaskhi);
      Vmasklo = vec_and(Umaskhi,Vmaskhi);

      // re-interleave the per-U / per-V int masks back into UV short order
      hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo);
      loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo);

      //pack it back down to bool short
      UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage);

      // same inside-window test for Y
      Ymasklo = vec_cmpgt(Yres1,Ylo);
      Ymaskhi = vec_cmplt(Yres1,Yhi);
      Ymaskhi = vec_and(Ymaskhi,Ymasklo);

      // pixel is background only when Y AND UV are all within range
      Ymaskhi = vec_and(Ymaskhi,UVmaskhi);
      UVmaskhi = vec_and(Ymaskhi,UVmaskhi);

      //bitwise comparison and move using the result of the comparison as a mask
      Yres1 = vec_sel(Yres1,Yblank,Ymaskhi);

      //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
      UVres1 = vec_sel(UVres1,UVblank,UVmaskhi);

      //merge the Y and UV back together
      hiImage = vec_mergeh(UVres1,Yres1);
      loImage = vec_mergel(UVres1,Yres1);

      //pack it back down to unsigned char to store
      inData[0] = vec_packsu(hiImage,loImage);

      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss(0);
    vec_dss(1);
    vec_dss(2);
    vec_dss(3);
#endif
  }
}
/* Per-pixel compare of two packed UYVY images with AltiVec.
 * For each pixel the whole UYVY pair from the image whose Y wins the
 * comparison is kept: m_direction selects max-by-luma (true) or
 * min-by-luma (false).  The result is written back into `image`.
 */
void pix_compare :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
  register int h,w,i,j,width;
  h = image.ysize;
  w = image.xsize/8;      // 8 UYVY pixels (16 bytes) per vector
  width = image.xsize/8;

  //check to see if the buffer isn't 16byte aligned (highly unlikely)
  if (image.ysize*image.xsize % 16 != 0){
    error("image not properly aligned for Altivec");
    return;
  }

  register vector unsigned short UVres1, Yres1, UVres2, Yres2;//interleave;
  register vector unsigned short hiImage, loImage;
  register vector bool short Ymask1;
  register vector unsigned char one = vec_splat_u8(1);

  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) right.data;

#ifndef PPC970
  //setup the cache prefetch -- A MUST!!!
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
#endif //PPC970

  if (m_direction) {
    // keep the brighter pixel (greater Y)
    for ( i=0; i<h; i++){
      for (j=0; j<w; j++)
      {
#ifndef PPC970
        //this function is probably memory bound on most G4's -- what else is new?
        vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
#endif
        //separate the U and V from Y
        // vec_mule widens the even bytes (U,V); vec_mulo the odd bytes (Y)
        UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
        UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);

        //vec_mulo Y * 1 to short vector Y Y Y Y shorts
        Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
        Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);

        //compare the Y values
        Ymask1 = vec_cmpgt(Yres1,Yres2);

        //bitwise comparison and move using the result of the comparison as a mask
        Yres1 = vec_sel(Yres2,Yres1,Ymask1);
        UVres1 = vec_sel(UVres2,UVres1,Ymask1);

        //merge the Y and UV back together
        hiImage = vec_mergeh(UVres1,Yres1);
        loImage = vec_mergel(UVres1,Yres1);

        //pack it back down to unsigned char to store
        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
      }
#ifndef PPC970
      vec_dss(1);
      vec_dss(0);
#endif
    }
  }else{
    // keep the darker pixel (lesser Y) -- identical pipeline, inverted test
    for ( i=0; i<h; i++){
      for (j=0; j<w; j++)
      {
#ifndef PPC970
        vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
#endif
        UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
        UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);

        //vec_mulo Y * 1 to short vector Y Y Y Y shorts
        Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
        Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);

        Ymask1 = vec_cmplt(Yres1,Yres2);

        Yres1 = vec_sel(Yres2,Yres1,Ymask1);
        UVres1 = vec_sel(UVres2,UVres1,Ymask1);

        hiImage = vec_mergeh(UVres1,Yres1);
        loImage = vec_mergel(UVres1,Yres1);

        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
      }
#ifndef PPC970
      vec_dss(1);
      vec_dss(0);
#endif
    }
  }
}
/* start of optimized motionblur */
/* Motion blur on packed UYVY images with AltiVec: blends the current frame
 * with the previous output frame stored in m_savedImage.
 * output = (m_blur0 * current + m_blur1 * saved) >> 8, computed on the
 * 128-debiased chroma and the luma; the blended frame is written both back
 * into the image and into the history buffer.
 */
void pix_motionblur :: processYUVAltivec(imageStruct &image)
{
  int h,w,width;
  signed short rightGain,imageGain;

  unsigned char *saved = m_savedImage.data;
  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  // reallocate() moved the history buffer -> old contents are gone, clear it
  if(saved!=m_savedImage.data) {
    m_savedImage.setBlack();
  }
  saved=m_savedImage.data;

  width = image.xsize/8;   // 8 UYVY pixels (16 bytes) per vector

  /*
  // hmm: why does it read 235 ?
  rightGain = (signed short)(235. * m_motionblur);
  imageGain = (signed short) (255. - (235. * m_motionblur));
  */
  // m_blur0/m_blur1 are the 8.8 fixed-point blend weights for the current
  // and the saved frame respectively
  rightGain = m_blur1;
  imageGain = m_blur0;

  // scalar/vector overlays used to build constant vectors
  union {
    signed short elements[8];
    vector signed short v;
  } shortBuffer;

  union {
    unsigned int elements[4];
    vector unsigned int v;
  } bitBuffer;

  register vector signed short gainAdd, hiImage, loImage,hiRight,loRight, YImage, UVImage;
  // register vector signed short loadhiImage, loadloImage,loadhiRight,loadloRight;
  register vector unsigned char loadImage, loadRight;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector signed int UVhi,UVlo,Yhi,Ylo;
  register vector signed int UVhiR,UVloR,YhiR,YloR;
  register vector signed short gainSub,gain,gainR;//,d;
  register vector unsigned int bitshift;
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) saved;

  // 128 in the even (UV) lanes, 0 in the odd (Y) lanes: the chroma de-bias
  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;
  gainSub = shortBuffer.v;

  shortBuffer.elements[0] = imageGain;
  gain = shortBuffer.v;
  gain = vec_splat(gain, 0 );

  shortBuffer.elements[0] = rightGain;
  gainR = shortBuffer.v;
  gainR = vec_splat(gainR, 0 );

  bitBuffer.elements[0] = 8;   // >>8 to drop the fixed-point scale

  //Load it into the vector unit
  bitshift = bitBuffer.v;
  bitshift = vec_splat(bitshift,0);

  shortBuffer.elements[0] = 128;

  //Load it into the vector unit
  gainAdd = shortBuffer.v;
  gainAdd = (vector signed short)vec_splat((vector signed short)gainAdd,0);

# ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
# endif

  // software pipelining: first loads are done before entering the loop
  loadImage = inData[0];
  loadRight = rightData[0];

  for ( h=0; h<image.ysize; h++) {
    for (w=0; w<width; w++)
    {
# ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
# endif
      //interleaved U Y V Y chars

      // zero-extend the bytes to shorts (hi/lo half of each vector)
      hiImage = (vector signed short) vec_mergeh( zero, loadImage );
      loImage = (vector signed short) vec_mergel( zero, loadImage );
      hiRight = (vector signed short) vec_mergeh( zero, loadRight );
      loRight = (vector signed short) vec_mergel( zero, loadRight );

      //hoist that load!!
      // NOTE(review): on the very last iteration this reads one vector past
      // the end of both buffers -- confirm the allocations leave 16 bytes
      // of slack.
      loadImage = inData[1];
      loadRight = rightData[1];

      //subtract 128 from UV
      hiImage = vec_subs(hiImage,gainSub);
      loImage = vec_subs(loImage,gainSub);
      hiRight = vec_subs(hiRight,gainSub);
      loRight = vec_subs(loRight,gainSub);

      //now vec_mule the UV into two vector ints
      //change sone to gain
      UVhi = vec_mule(gain,hiImage);
      UVlo = vec_mule(gain,loImage);
      UVhiR = vec_mule(gainR,hiRight);
      UVloR = vec_mule(gainR,loRight);

      //now vec_mulo the Y into two vector ints
      Yhi = vec_mulo(gain,hiImage);
      Ylo = vec_mulo(gain,loImage);
      YhiR = vec_mulo(gainR,hiRight);
      YloR = vec_mulo(gainR,loRight);

      //this is where to do the add and bitshift due to the resolution
      //add UV
      UVhi = vec_adds(UVhi,UVhiR);
      UVlo = vec_adds(UVlo,UVloR);
      Yhi = vec_adds(Yhi,YhiR);
      Ylo = vec_adds(Ylo,YloR);

      //bitshift UV
      UVhi = vec_sra(UVhi,bitshift);
      UVlo = vec_sra(UVlo,bitshift);
      Yhi = vec_sra(Yhi,bitshift);
      Ylo = vec_sra(Ylo,bitshift);

      //pack the UV into a single short vector
      UVImage = vec_packs(UVhi,UVlo);

      //pack the Y into a single short vector
      YImage = vec_packs(Yhi,Ylo);

      //vec_mergel + vec_mergeh Y and UV
      hiImage = vec_mergeh(UVImage,YImage);
      loImage = vec_mergel(UVImage,YImage);

      //add 128 offset back
      hiImage = vec_adds(hiImage,gainSub);
      loImage = vec_adds(loImage,gainSub);

      //vec_mergel + vec_mergeh Y and UV
      // store the blend both as the new frame and as next frame's history
      rightData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);
      inData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
  }

# ifndef PPC970
  //stop the cache streams
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
# endif
}/* end of working altivec function */
/* Produce the 2x2- and 4x4-subsampled versions of `image` in one pass with
 * AltiVec.  Each inner iteration consumes a 64-byte-wide, 4-row-tall tile:
 * four 16-byte column blocks are box-filtered ( (a+b+c+d+2)>>2 ) into two
 * sub22 rows, and the sub22 results are box-filtered again into one sub44
 * row.  The loads for the next column block are issued before the previous
 * block's arithmetic completes (hand-scheduled software pipelining), so the
 * statement order here is deliberate.
 */
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;
    vector unsigned char l0, l1, l2, l3;
    vector unsigned short s0, s1, s2, s3;
    vector unsigned short s22_0, s22_1, s22_2, s22_3;
    vector unsigned short s44, s44_0, s44_1;
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "image", 16, image);
    if (NOT_VECTOR_ALIGNED(sub22_image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "sub22_image", 16, sub22_image);
    if (NOT_VECTOR_ALIGNED(sub44_image))
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "sub44_image", 16, sub44_image);
    if ((rowstride & 63) != 0)
        mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
            "rowstride", 64, rowstride);
#endif

    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;
    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    // sub22 buffer starts right after the source image -> derive height
    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2;   // NOTE(review): computed but never used
    halfstride = stride1 >> 1; /* /2 */

    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    two = vec_splat_u16(2);

    do {
        i = ii;
        do {
            // ---- column block 1: load 4 rows x 16 bytes -----------------
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;
#ifdef ALTIVEC_DST
            vec_dst(pB + (16 * 3), dsc.control, 0);
#endif
            // mergeh/mergel pair bytes from two rows; vec_sum4s then adds
            // each group of 4 bytes, yielding the 2x2 box sums
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));

            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));

            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));

            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* start loading next block */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            // pack the 32-bit sums to shorts, round (+2) and divide (>>2)
            s22_0 = vec_packsu(vu32(s0), vu32(s1));
            s22_1 = vec_packsu(vu32(s2), vu32(s3));

            s22_0 = vec_add(s22_0, two);
            s22_1 = vec_add(s22_1, two);

            s22_0 = vec_sra(s22_0, two);
            s22_1 = vec_sra(s22_1, two);

            // combine the two sub22 rows into 4x4 box sums (first half)
            s44_0 = vec_add(s22_0, s22_1);
            s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

            /* - - - - - - column block 2 - - - - - - - - - - - - - - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* start loading next l[0-3] */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            s22_2 = vec_packsu(vu32(s0), vu32(s1));
            s22_3 = vec_packsu(vu32(s2), vu32(s3));

            s22_2 = vec_add(s22_2, two);
            s22_3 = vec_add(s22_3, two);

            s22_2 = vec_sra(s22_2, two);
            s22_3 = vec_sra(s22_3, two);

            s44_1 = vec_add(s22_2, s22_3);
            s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

            /* store s22 block */
            s22_0 = vu16(vec_packsu(s22_0, s22_2));
            s22_1 = vu16(vec_packsu(s22_1, s22_3));
            vec_st(vu8(s22_0), 0, pB2);
            vec_st(vu8(s22_1), halfstride, pB2);
            pB2 += 16;

            /* - - - - - - column block 3 - - - - - - - - - - - - - - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            /* starting loading next l[0-3] */
            l0 = vec_ld(0, pB);
            l1 = vec_ld(stride1, pB);
            l2 = vec_ld(stride2, pB);
            l3 = vec_ld(stride3, pB);
            pB += 16;

            s22_0 = vec_packsu(vu32(s0), vu32(s1));
            s22_1 = vec_packsu(vu32(s2), vu32(s3));

            s22_0 = vec_add(s22_0, two);
            s22_1 = vec_add(s22_1, two);

            s22_0 = vec_sra(s22_0, two);
            s22_1 = vec_sra(s22_1, two);

            // finish blocks 1+2 of the sub44 row: pack, round, shift
            s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
            s44 = vec_add(s44, two);
            s44 = vec_sra(s44, two);

            s44_0 = vec_add(s22_0, s22_1);
            s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

            /* - - - - - - column block 4 (no further loads) - - - - - - - */
            s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
            s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
            s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
            s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
            s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
            s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
            s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
            s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

            s22_2 = vec_packsu(vu32(s0), vu32(s1));
            s22_3 = vec_packsu(vu32(s2), vu32(s3));

            s22_2 = vec_add(s22_2, two);
            s22_3 = vec_add(s22_3, two);

            s22_2 = vec_sra(s22_2, two);
            s22_3 = vec_sra(s22_3, two);

            s44_1 = vec_add(s22_2, s22_3);
            s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

            /* store s22 block */
            s22_0 = vu16(vec_packsu(s22_0, s22_2));
            s22_1 = vu16(vec_packsu(s22_1, s22_3));
            vec_st(vu8(s22_0), 0, pB2);
            vec_st(vu8(s22_1), halfstride, pB2);
            pB2 += 16;

            /* pack all four s44 chunks */
            s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
            s44_0 = vec_add(s44_0, two);
            s44_0 = vec_sra(s44_0, two);

            s44 = vu16(vec_packsu(s44, s44_0));
            vec_st(vu8(s44), 0, pB4);
            pB4 += 16;
        } while (--i);

        // pB already crossed one row; skip the remaining three source rows,
        // and skip the second sub22 row that was written via `halfstride`
        pB += stride3;
        pB2 += halfstride;
    } while (--j);

#ifdef ALTIVEC_DST
    vec_dss(0);
#endif

    AMBER_STOP;
}
/* Add two packed UYVY images with AltiVec, result into `image`.
 * Luma is added with saturation directly.  Chroma is combined as
 * UV_image + 2*UV_right - 255, i.e. the right image's chroma is doubled and
 * its 128 bias (plus the result's own) folded out via the 255 subtraction,
 * so the packed result keeps a single 128 offset.  (Presumably intentional
 * asymmetry: the image's UV is widened with gain 1, the right's with gain 2
 * -- verify against the scalar reference implementation.)
 */
void pix_add :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
  int h,w,width;
  width = image.xsize/8;   // 8 UYVY pixels (16 bytes) per vector
  //format is U Y V Y

  // scalar/vector overlays used to assemble constant vectors
  union
  {
    //unsigned int i;
    short elements[8];
    //vector signed char v;
    vector signed short v;
  }shortBuffer;

  union
  {
    //unsigned int i;
    unsigned char elements[16];
    //vector signed char v;
    vector unsigned char v;
  }charBuffer;

  //vector unsigned char c;
  register vector signed short d, hiImage, loImage, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
  // vector unsigned char zero = vec_splat_u8(0);
  register vector unsigned char c,one;
  // vector signed short zshort = vec_splat_s16(0);
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) right.data;

  //Write the pixel (pair) to the transfer buffer
  // gain pattern 2,1,2,1,...: even bytes (U,V) get weight 2, odd bytes (Y)
  // get weight 1 when used with vec_mule/vec_mulo below
  charBuffer.elements[0] = 2;
  charBuffer.elements[1] = 1;
  charBuffer.elements[2] = 2;
  charBuffer.elements[3] = 1;
  charBuffer.elements[4] = 2;
  charBuffer.elements[5] = 1;
  charBuffer.elements[6] = 2;
  charBuffer.elements[7] = 1;
  charBuffer.elements[8] = 2;
  charBuffer.elements[9] = 1;
  charBuffer.elements[10] = 2;
  charBuffer.elements[11] = 1;
  charBuffer.elements[12] = 2;
  charBuffer.elements[13] = 1;
  charBuffer.elements[14] = 2;
  charBuffer.elements[15] = 1;

  //Load it into the vector unit
  c = charBuffer.v;
  one =  vec_splat_u8( 1 );

  shortBuffer.elements[0] = 255;

  //Load it into the vector unit
  d = shortBuffer.v;
  d = static_cast<vector signed short>(vec_splat(static_cast<vector signed short>(d),0));

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
#endif

  for ( h=0; h<image.ysize; h++){
    for (w=0; w<width; w++)
    {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
#endif
      //interleaved U Y V Y chars

      //vec_mule UV * 2 to short vector U V U V shorts
      // image chroma widened with gain 1, right chroma with gain 2
      UVImage = static_cast<vector signed short>(vec_mule(one,inData[0]));
      UVRight = static_cast<vector signed short>(vec_mule(c,rightData[0]));

      //vec_mulo Y * 1 to short vector Y Y Y Y shorts
      YImage = static_cast<vector signed short>(vec_mulo(c,inData[0]));
      YRight = static_cast<vector signed short>(vec_mulo(c,rightData[0]));

      //vel_subs UV - 255
      UVRight = static_cast<vector signed short>(vec_subs(UVRight, d));

      //vec_adds UV
      UVTemp = vec_adds(UVImage,UVRight);

      //vec_adds Y
      YTemp = vec_adds(YImage,YRight);

      // re-interleave chroma and luma, saturate-pack back to bytes
      hiImage = vec_mergeh(UVTemp,YTemp);
      loImage = vec_mergel(UVTemp,YTemp);

      //vec_mergel + vec_mergeh Y and UV
      inData[0] = vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
#endif
  }  /*end of working altivec function */
}
/* Motion detection on packed UYVY images with AltiVec.
 * Each pixel's luma is compared against the previous frame stored in
 * `buffer`; where |Y - Y_prev| > threshold the output luma becomes 0xFFFF
 * (comparison mask), otherwise 0, with chroma forced to a constant 127.
 * The incoming frame is saved into `buffer` as next frame's history.
 * The loop is unrolled two vectors (16 pixels) per iteration.
 */
void pix_movement :: processYUVAltivec(imageStruct &image)
{
  // resize the history buffer when the frame geometry changes
  if (image.xsize*image.ysize != buffer.xsize*buffer.ysize){
    buffer.xsize = image.xsize;
    buffer.ysize = image.ysize;
    buffer.reallocate(buffer.xsize*buffer.ysize*2);
  }

  int pixsize = image.ysize * image.xsize/8;   // vectors per frame

  union{
    signed short c[8];
    vector signed short v;
  }shortBuffer;

  union{
    unsigned short c[8];
    vector unsigned short v;
  }ushortBuffer;

  int i;
  vector signed short thresh;
  shortBuffer.c[0] = threshold;
  thresh = shortBuffer.v;
  thresh = (vector signed short)vec_splat(thresh,0);

  vector unsigned char *rp = (vector unsigned char *) image.data;  // read pointer
  vector unsigned char *wp = (vector unsigned char *) buffer.data; // write pointer to the copy

  vector unsigned char grey0,grey1;
  vector unsigned char one = vec_splat_u8(1);
  vector unsigned short Y0,Ywp0,hiImage0,loImage0;
  vector unsigned short Y1,Ywp1,hiImage1,loImage1;
  vector unsigned short UVwp0,UVwp1;
  vector signed short temp0,temp1;

  // constant chroma (127) for the output mask image
  ushortBuffer.c[0]=127;
  vector unsigned short UV0= (vector unsigned short)vec_splat(ushortBuffer.v, 0);
  vector unsigned short UV1= (vector unsigned short)vec_splat(ushortBuffer.v, 0);

#ifndef PPC970
  //setup the cache prefetch -- A MUST!!!
  UInt32 prefetchSize = GetPrefetchConstant( 16, 0, 256 );
  vec_dst( rp, prefetchSize, 0 );
  vec_dst( wp, prefetchSize, 1 );
#endif

  int j = 16;
  pixsize/=2;   // two vectors consumed per iteration

  for (i=0; i < pixsize; i++)
  {
# ifndef PPC970
    //setup the cache prefetch -- A MUST!!!
    // NOTE(review): this shadows the function-scope prefetchSize with a
    // differently-tuned constant each iteration -- presumably intentional
    // stream retuning; verify.
    UInt32 prefetchSize = GetPrefetchConstant( j, 0, j * 16 );
    vec_dst( rp, prefetchSize, 0 );
    vec_dst( wp, prefetchSize, 1 );
    vec_dst( rp+16, prefetchSize, 2 );
    vec_dst( wp+16, prefetchSize, 3 );
# endif

    grey0 = rp[0];
    grey1 = rp[1];

    // rp[Y0]=255*(abs(grey0-*wp)>thresh);
    // extract luma (odd bytes) widened to shorts; chroma is discarded
    // UV0= (vector unsigned short)vec_mule(grey0,one);
    Y0 = (vector unsigned short)vec_mulo(grey0,one);

    // UV1= (vector unsigned short)vec_mule(grey1,one);
    Y1 = (vector unsigned short)vec_mulo(grey1,one);

    //wp is actually 1/2 the size of the image because it is only Y??
    //here the full U Y V Y is stored
    // UVwp0= (vector unsigned short)vec_mule(wp[0],one);
    Ywp0 = (vector unsigned short)vec_mulo(wp[0],one);

    // UVwp1= (vector unsigned short)vec_mule(wp[1],one);
    Ywp1 = (vector unsigned short)vec_mulo(wp[1],one);

    //store the current pixels as the history for next time
    wp[0]=grey0;
    wp++;
    wp[0]=grey1;
    wp++;

    // |Y - Y_prev| > threshold -> all-ones lanes, else zero
    temp0 = vec_abs(vec_sub((vector signed short)Y0,(vector signed short)Ywp0));
    Y0 = (vector unsigned short)vec_cmpgt(temp0,thresh);

    temp1 = vec_abs(vec_sub((vector signed short)Y1,(vector signed short)Ywp1));
    Y1 = (vector unsigned short)vec_cmpgt(temp1,thresh);

    // re-interleave constant chroma with the mask luma and pack to bytes
    hiImage0 = vec_mergeh(UV0,Y0);
    loImage0 = vec_mergel(UV0,Y0);
    hiImage1 = vec_mergeh(UV1,Y1);
    loImage1 = vec_mergel(UV1,Y1);

    grey0 = vec_packsu(hiImage0,loImage0);
    grey1 = vec_packsu(hiImage1,loImage1);

    rp[0]=grey0;
    rp++;
    rp[0]=grey1;
    rp++;

    // grey = rp[0];
    // rp[Y1]=255*(abs(grey-*wp)>thresh);
    // *wp++=grey;
    // rp+=4;
    // rp++;
  }
# ifndef PPC970
  vec_dss(0);
  vec_dss(1);
  vec_dss(2);
  vec_dss(3);
# endif
}
// Issue two AltiVec data-stream-touch (prefetch) hints on the address of
// the by-value parameter, using stream tags 3 and 1 with control words 3
// and 1 respectively.
void g (int b)
{
  int *address = &b;
  vec_dst(address, 3, 3);
  vec_dst(address, 1, 1);
}
/* Absolute difference of two packed UYVY images with AltiVec, result into
 * `image`.  Chroma is de-biased by 128 before the subtraction and re-biased
 * after, so |diff| is taken around the chroma midpoint; luma is differenced
 * directly.  All arithmetic uses saturating vector ops.
 */
void pix_diff :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
  long h,w,width;
  width = image.xsize/8;   // 8 UYVY pixels (16 bytes) per vector
  //format is U Y V Y

  // scalar/vector overlay used to assemble the bias constant
  union
  {
    //unsigned int i;
    short elements[8];
    //vector signed char v;
    vector short v;
  }shortBuffer;

  vector signed short d, hiImage, loImage,hiRight, loRight;//, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
  vector unsigned char zero = vec_splat_u8(0);
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) right.data;

  // 128 in the even (UV) lanes, 0 in the odd (Y) lanes
  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;

  //Load it into the vector unit
  d = shortBuffer.v;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
#endif

  for ( h=0; h<image.ysize; h++){
    for (w=0; w<width; w++)
    {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
#endif
      //interleaved U Y V Y chars

      //break out to unsigned shorts
      // zero-extend bytes to shorts (hi/lo half of each vector)
      hiImage = (vector signed short) vec_mergeh( zero, inData[0] );
      loImage = (vector signed short) vec_mergel( zero, inData[0] );
      hiRight = (vector signed short) vec_mergeh( zero, rightData[0] );
      loRight = (vector signed short) vec_mergel( zero, rightData[0] );

      //subtract the 128 offset for UV
      hiImage = vec_subs(hiImage,d);
      loImage = vec_subs(loImage,d);
      hiRight = vec_subs(hiRight,d);
      loRight = vec_subs(loRight,d);

      hiImage = vec_subs(hiImage,hiRight);
      loImage = vec_subs(loImage,loRight);

      // restore the chroma bias before taking the magnitude
      hiImage = vec_adds(hiImage,d);
      loImage = vec_adds(loImage,d);

      hiImage = vec_abs(hiImage);
      loImage = vec_abs(loImage);

      // saturate-pack back to unsigned bytes and store
      inData[0] = vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
#endif
  }  /*end of working altivec function */
}
/* * subtract prediction from block data * pred % 8 == 0 * cur % 8 == 0 * lx % 16 == 0 * blk % 16 == 0 */ void sub_pred_altivec(SUB_PRED_PDECL) { unsigned int dst; uint8_t *pCA, *pCB, *pPA, *pPB; int16_t *pBA, *pBB; vector unsigned char zero; vector unsigned char predA, predB, curA, curB; vector signed short blkA, blkB; #ifdef ALTIVEC_VERIFY #ifdef ALTIVEC_DST if (lx & (~0xffff) != 0) mjpeg_error_exit1("sub_pred: lx > vec_dst range", lx); #endif if (NOT_VECTOR_ALIGNED(lx)) mjpeg_error_exit1("sub_pred: lx %% 16 != 0, (%d)", lx); if (NOT_VECTOR_ALIGNED(blk)) mjpeg_error_exit1("sub_pred: blk %% 16 != 0, (%d)", blk); if (((unsigned long)pred & 0xf) != ((unsigned long)cur & 0xf)) mjpeg_error_exit1("sub_pred: (pred(0x%X) %% 16) != (cur(0x%X) %% 16)", pred, cur); if ((((unsigned long)pred) & 0x7) != 0) mjpeg_error_exit1("sub_pred: pred %% 8 != 0, (0x%X)", pred); if ((((unsigned long)cur) & 0x7) != 0) mjpeg_error_exit1("sub_pred: cur %% 8 != 0, (0x%X)", cur); #endif /* A->B, B->A expand differently depending on input */ #define ABBA(symbol,ab) _ABBA(ABBA_##ab,symbol) /* {{{ */ #define _ABBA(abba_ab,symbol) abba_ab(symbol) #define ABBA_A(symbol) symbol##B #define ABBA_B(symbol) symbol##A /* }}} */ #define PERFORM_ITERATION(hl,ab,iter) /* iter {{{ */ \ pred##ab = vec_merge##hl(zero, pred##ab); \ cur##ab = vec_merge##hl(zero, cur##ab); \ blk##ab = vec_sub(vs16(cur##ab), vs16(pred##ab)); \ vec_st(blk##ab, 0, (signed short*)pB##ab); \ /* }}} */ #define PREPARE_ITERATION(hl,ab,iter) /* iter {{{ */ \ pP##ab = ABBA(pP,ab) + lx; \ pC##ab = ABBA(pC,ab) + lx; \ pB##ab = ABBA(pB,ab) + 8; \ pred##ab = vec_ld(0, pP##ab); \ cur##ab = vec_ld(0, pC##ab); \ /* }}} */ #define NO_RESCHEDULE asm volatile ("") AMBER_START; pPA = pred; pCA = cur; pBA = blk; #ifdef ALTIVEC_DST dst = 0x01080000 | lx; vec_dst(pPA, dst, 0); vec_dst(pCA, dst, 1); dst = 0x01080010; vec_dstst(pBA, dst, 2); #endif pPB = pPA + lx; NO_RESCHEDULE; predA = vec_ld(0, pPA); NO_RESCHEDULE; pCB = pCA + lx; NO_RESCHEDULE; curA = 
vec_ld(0, pCA); NO_RESCHEDULE; pBB = pBA + 8; NO_RESCHEDULE; predB = vec_ld(0, pPB); NO_RESCHEDULE; zero = vec_splat_u8(0); NO_RESCHEDULE; curB = vec_ld(0, pCB); if (VECTOR_ALIGNED(pPA)) { PERFORM_ITERATION(h,A,0); PREPARE_ITERATION(h,A,2); /* prepare next A iteration */ PERFORM_ITERATION(h,B,1); PREPARE_ITERATION(h,B,3); /* prepare next B iteration */ PERFORM_ITERATION(h,A,2); PREPARE_ITERATION(h,A,4); PERFORM_ITERATION(h,B,3); PREPARE_ITERATION(h,B,5); PERFORM_ITERATION(h,A,4); PREPARE_ITERATION(h,A,6); PERFORM_ITERATION(h,B,5); PREPARE_ITERATION(h,B,7); PERFORM_ITERATION(h,A,6); PERFORM_ITERATION(h,B,7); } else { PERFORM_ITERATION(l,A,0); PREPARE_ITERATION(l,A,2); /* prepare next A iteration */ PERFORM_ITERATION(l,B,1); PREPARE_ITERATION(l,B,3); /* prepare next B iteration */ PERFORM_ITERATION(l,A,2); PREPARE_ITERATION(l,A,4); PERFORM_ITERATION(l,B,3); PREPARE_ITERATION(l,B,5); PERFORM_ITERATION(l,A,4); PREPARE_ITERATION(l,A,6); PERFORM_ITERATION(l,B,5); PREPARE_ITERATION(l,B,7); PERFORM_ITERATION(l,A,6); PERFORM_ITERATION(l,B,7); } #ifdef ALTIVEC_DST vec_dssall(); #endif AMBER_STOP; }