// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
                                       register vec_u8_t p1,
                                       register vec_u8_t p2,
                                       register vec_u8_t q0,
                                       register vec_u8_t tc0)
{
    register vec_u8_t average = vec_avg(p0, q0);
    register vec_u8_t temp;
    register vec_u8_t uncliped;
    register vec_u8_t ones;
    register vec_u8_t max;
    register vec_u8_t min;
    register vec_u8_t newp1;

    temp     = vec_xor(average, p2);
    average  = vec_avg(average, p2);     /* avg(p2, avg(p0, q0)) */
    ones     = vec_splat_u8(1);
    temp     = vec_and(temp, ones);      /* (p2^avg(p0, q0)) & 1 */
    uncliped = vec_subs(average, temp);  /* (p2+((p0+q0+1)>>1))>>1 */
    max      = vec_adds(p1, tc0);
    min      = vec_subs(p1, tc0);
    newp1    = vec_max(min, uncliped);
    newp1    = vec_min(max, newp1);
    return newp1;
}
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
                                                   register vector unsigned char p1,
                                                   register vector unsigned char p2,
                                                   register vector unsigned char q0,
                                                   register vector unsigned char tc0)
{
    register vector unsigned char average = vec_avg(p0, q0);
    register vector unsigned char temp;
    register vector unsigned char uncliped;
    register vector unsigned char ones;
    register vector unsigned char max;
    register vector unsigned char min;
    register vector unsigned char newp1;

    temp     = vec_xor(average, p2);
    average  = vec_avg(average, p2);     /* avg(p2, avg(p0, q0)) */
    ones     = vec_splat_u8(1);
    temp     = vec_and(temp, ones);      /* (p2^avg(p0, q0)) & 1 */
    uncliped = vec_subs(average, temp);  /* (p2+((p0+q0+1)>>1))>>1 */
    max      = vec_adds(p1, tc0);
    min      = vec_subs(p1, tc0);
    newp1    = vec_max(min, uncliped);
    newp1    = vec_min(max, newp1);
    return newp1;
}
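Both deblocking variants above rely on the same rounding correction: vec_avg() computes (a + b + 1) >> 1, i.e. it rounds up, while the formula in the header comment needs the floor of (p2 + avg) / 2. The code therefore subtracts ((p2 ^ avg) & 1), which undoes the extra rounding exactly when p2 + avg is odd. The scalar reference below is a minimal sketch of that arithmetic, written purely as an illustration (the function name is ours, not from the original sources):

/* Scalar sketch (illustration only): the value the vector code reconstructs,
 * with the same saturating clip bounds that vec_adds/vec_subs give it. */
static inline unsigned char deblock_q1_scalar(unsigned char p0, unsigned char p1,
                                              unsigned char p2, unsigned char q0,
                                              unsigned char tc0)
{
    int avg      = (p0 + q0 + 1) >> 1;        /* what vec_avg(p0, q0) produces   */
    int uncliped = (p2 + avg) >> 1;           /* floor, i.e. vec_avg minus (xor&1) */
    int lo       = p1 - tc0 < 0   ? 0   : p1 - tc0;   /* vec_subs saturates at 0   */
    int hi       = p1 + tc0 > 255 ? 255 : p1 + tc0;   /* vec_adds saturates at 255 */
    if (uncliped < lo) return (unsigned char) lo;
    if (uncliped > hi) return (unsigned char) hi;
    return (unsigned char) uncliped;
}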
// out: o = |x-y| < a
static inline vec_u8_t diff_lt_altivec(register vec_u8_t x,
                                       register vec_u8_t y,
                                       register vec_u8_t a)
{
    register vec_u8_t diff    = vec_subs(x, y);
    register vec_u8_t diffneg = vec_subs(y, x);
    register vec_u8_t o       = vec_or(diff, diffneg); /* |x-y| */
    o = (vec_u8_t)vec_cmplt(o, a);
    return o;
}
static inline vector signed short convert16_altivec(vector signed int v1,
                                                    vector signed int v2)
{
    register vector signed short result;
    v1 = vec_subs(v1, magic);
    v2 = vec_subs(v2, magic);
    result = vec_packs(v1, v2);
    return result;
}
// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec(register vector unsigned char x,
                                                   register vector unsigned char y,
                                                   register vector unsigned char a)
{
    register vector unsigned char diff    = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o       = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}
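Both diff_lt_altivec variants compute an unsigned absolute difference without widening: a saturating subtract clamps negative results to zero, so at most one of vec_subs(x, y) and vec_subs(y, x) is non-zero and their OR equals |x - y|. A scalar sketch of the same idea (ours, for illustration only):

/* Scalar sketch: unsigned saturating subtract in both directions, OR the results. */
static inline unsigned char absdiff_u8(unsigned char x, unsigned char y)
{
    unsigned char d0 = x > y ? (unsigned char)(x - y) : 0; /* subs(x, y) */
    unsigned char d1 = y > x ? (unsigned char)(y - x) : 0; /* subs(y, x) */
    return (unsigned char)(d0 | d1);                       /* at most one is non-zero */
}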
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize / 4;

  vector signed short hiImage, loImage, hiRight, loRight;
  vector unsigned char zero = vec_splat_u8(0);
  vector unsigned char *inData = (vector unsigned char *)image.data;
  vector unsigned char *rightData = (vector unsigned char *)right.data;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+256, prefetchSize, 2 );
  vec_dst( rightData+256, prefetchSize, 3 );
#endif

  do {
#ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+256, prefetchSize, 2 );
    vec_dst( rightData+256, prefetchSize, 3 );
#endif

    hiImage = (vector signed short)vec_mergeh(zero, inData[0]);
    loImage = (vector signed short)vec_mergel(zero, inData[0]);
    hiRight = (vector signed short)vec_mergeh(zero, rightData[0]);
    loRight = (vector signed short)vec_mergel(zero, rightData[0]);

    hiImage = vec_subs(hiImage, hiRight);
    loImage = vec_subs(loImage, loRight);

    hiImage = vec_abs(hiImage);
    loImage = vec_abs(loImage);

    inData[0] = vec_packsu(hiImage, loImage);

    inData++;
    rightData++;
  } while (--datasize);

#ifndef PPC970
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
#endif
}
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    vec_s32 v_dc32;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    v_dc32 = vec_lde(0, &dc);
    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);

    if (size == 4)
        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
#if !HAVE_BIGENDIAN
    aligner = vec_perm(aligner, zero_u8v, vcswapc());
#endif
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
void
gimp_composite_difference_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar       *D = ctx->D;
  guint         length = ctx->n_pixels;
  vector unsigned char a, b, d, e, alpha_a, alpha_b;

  while (length >= 4)
    {
      a = LoadUnaligned(A);
      b = LoadUnaligned(B);

      alpha_a = vec_and(a, alphamask);
      alpha_b = vec_and(b, alphamask);
      d = vec_min(alpha_a, alpha_b);

      a = vec_andc(a, alphamask);
      a = vec_adds(a, d);
      b = vec_andc(b, alphamask);

      d = vec_subs(a, b);
      e = vec_subs(b, a);
      d = vec_add(d, e);

      StoreUnaligned(d, D);

      A += 16;
      B += 16;
      D += 16;
      length -= 4;
    }

  /* process last pixels */
  length = length * 4;
  a = LoadUnalignedLess(A, length);
  b = LoadUnalignedLess(B, length);

  alpha_a = vec_and(a, alphamask);
  alpha_b = vec_and(b, alphamask);
  d = vec_min(alpha_a, alpha_b);

  a = vec_andc(a, alphamask);
  a = vec_adds(a, d);
  b = vec_andc(b, alphamask);

  d = vec_subs(a, b);
  e = vec_subs(b, a);
  d = vec_add(d, e);

  StoreUnalignedLess(d, D, length);
}
void pix_invert :: processYUVAltivec(imageStruct &image)
{
  int h, w, width;
  width = image.xsize/8;

  union {
    unsigned char c[16];
    vector unsigned char v;
  } charBuffer;

  vector unsigned char offset;
  vector unsigned char *inData = (vector unsigned char*) image.data;

  charBuffer.c[0] = 255;
  offset = charBuffer.v;
  offset = (vector unsigned char) vec_splat(offset, 0);

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
#endif

  for ( h=0; h<image.ysize; h++){
    for (w=0; w<width; w++) {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
#endif
      inData[0] = vec_subs(offset, inData[0]);
      inData++;
    }
#ifndef PPC970
    vec_dss( 0 );
#endif
  } /*end of working altivec function */
}
void pix_subtract :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
  int h, w, width;
  width = image.xsize/4;

  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) right.data;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
#endif

  for ( h=0; h<image.ysize; h++){
    for (w=0; w<width; w++) {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
#endif
      inData[0] = vec_subs(inData[0], rightData[0]);
      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
#endif
  } /*end of working altivec function */
}
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block,
                                                       int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
void imageFilterSubFrom_Altivec(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        SUBFROM_PIXEL();
        --n; ++dst; ++src;
    }

    // Do bulk of processing using Altivec (sub 16 8-bit unsigned integers, with saturation)
    while(n >= 16) {
        vector unsigned char s = vec_ld(0, src);
        vector unsigned char d = vec_ld(0, dst);
        vector unsigned char r = vec_subs(d, s);
        vec_st(r, 0, dst);

        n -= 16; src += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_SUBFROM();
}
/* Function:  p7_SSVFilter_longtarget()
 * Synopsis:  Finds windows with SSV scores above some threshold (vewy vewy fast, in limited precision)
 *
 * Purpose:   Calculates an approximation of the SSV (single ungapped diagonal)
 *            score for regions of sequence <dsq> of length <L> residues, using
 *            optimized profile <om>, and a preallocated one-row DP matrix <ox>,
 *            and captures the positions at which such regions exceed the score
 *            required to be significant in the eyes of the calling function,
 *            which depends on the <bg> and <p> (usually p=0.02 for nhmmer).
 *            Note that this variant performs only SSV computations, never
 *            passing through the J state - the score required to pass SSV at
 *            the default threshold (or less restrictive) is sufficient to
 *            pass MSV in essentially all DNA models we've tested.
 *
 *            Above-threshold diagonals are captured into a preallocated list
 *            <windowlist>. Rather than simply capturing positions at which a
 *            score threshold is reached, this function establishes windows
 *            around those high-scoring positions, using scores in <msvdata>.
 *            These windows can be merged by the calling function.
 *
 * Args:      dsq        - digital target sequence, 1..L
 *            L          - length of dsq in residues
 *            om         - optimized profile
 *            ox         - DP matrix
 *            msvdata    - compact representation of substitution scores, for backtracking diagonals
 *            bg         - the background model, required for translating a P-value threshold into a score threshold
 *            P          - p-value below which a region is captured as being above threshold
 *            windowlist - preallocated container for all hits (resized if necessary)
 *
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_SSVFilter_longtarget(const ESL_DSQ *dsq, int L, P7_OPROFILE *om, P7_OMX *ox,
                        const P7_SCOREDATA *ssvdata, P7_BG *bg, double P,
                        P7_HMM_WINDOWLIST *windowlist)
{
  vector unsigned char mpv;            /* previous row values */
  vector unsigned char xEv;            /* E state: keeps max for Mk->E as we go */
  vector unsigned char xBv;            /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector unsigned char sv;             /* temp storage of 1 curr row value in progress */
  vector unsigned char biasv;          /* emission bias in a vector */
  uint8_t xJ;                          /* special states' scores */
  int i;                               /* counter over sequence positions 1..L */
  int q;                               /* counter over vectors 0..nq-1 */
  int Q = p7O_NQB(om->M);              /* segment length: # of vectors */
  vector unsigned char *dp = ox->dpb[0]; /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  vector unsigned char *rsc;           /* will point at om->rbv[x] for residue x[i] */
  vector unsigned char zerov;          /* vector of zeros */
  vector unsigned char tecv;           /* vector for E->C cost */
  vector unsigned char tjbmv;          /* vector for [JN]->B->M move cost */
  vector unsigned char basev;          /* offset for scores */
  int status;
  int k;
  int n;
  int end;
  int rem_sc;
  int start;
  int target_end;
  int target_start;
  int max_end;
  int max_sc;
  int sc;
  int pos_since_max;
  float ret_sc;

  union { vector unsigned char v; uint8_t b[16]; } u;

  /*
   * Computing the score required to let P meet the F1 prob threshold
   * In original code, converting from a scaled int MSV
   * score S (the score getting to state E) to a probability goes like this:
   *   usc  = S - om->tec_b - om->tjb_b - om->base_b;
   *   usc /= om->scale_b;
   *   usc -= 3.0;
   *   P = f( (usc - nullsc) / eslCONST_LOG2, mu, lambda)
   * and we're computing the threshold usc, so reverse it:
   *   (usc - nullsc) / eslCONST_LOG2 = inv_f( P, mu, lambda)
   *   usc  = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda)
   *   usc += 3
   *   usc *= om->scale_b
   *   S    = usc + om->tec_b + om->tjb_b + om->base_b
   *
   * Here, I compute threshold with length model based on max_length. Doesn't
   * matter much - in any case, both the bg and om models will change with roughly
   * 1 bit for each doubling of the length model, so they offset.
   */
  float nullsc;
  float invP = esl_gumbel_invsurv(P, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]);
  vector unsigned char sc_threshv;     /* pushes value to saturation if it's above pthresh */
  int sc_thresh;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M = om->M;

  p7_bg_SetLength(bg, om->max_length);
  p7_oprofile_ReconfigMSVLength(om, om->max_length);
  p7_bg_NullOne (bg, dsq, om->max_length, &nullsc);

  sc_thresh = (int) ceil( ( ( nullsc + (invP * eslCONST_LOG2) + 3.0 ) * om->scale_b )
                          + om->base_b + om->tec_b + om->tjb_b );
  sc_threshv = esl_vmx_set_u8( (int8_t)sc_thresh - 1);

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. */
  biasv = esl_vmx_set_u8(om->bias_b);
  for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0);
  xJ = 0;
  zerov = vec_splat_u8(0);
  basev = esl_vmx_set_u8((int8_t) om->base_b);
  tecv  = esl_vmx_set_u8((int8_t) om->tec_b);
  tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xBv = vec_subs(basev, tjbmv);

  for (i = 1; i <= L; i++) {
    rsc = om->rbv[dsq[i]];
    xEv = vec_splat_u8(0);

    /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12.
     * Because ia32 is little-endian, this means a left bit shift.
     * Zeros shift on automatically, which is our -infinity.
     */
    mpv = vec_sld(zerov, dp[Q-1], 15);
    for (q = 0; q < Q; q++) {
      /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
      sv   = vec_max(mpv, xBv);
      sv   = vec_adds(sv, biasv);
      sv   = vec_subs(sv, *rsc); rsc++;
      xEv  = vec_max(xEv, sv);

      mpv   = dp[q];   /* Load {MDI}(i-1,q) into mpv */
      dp[q] = sv;      /* Do delayed store of M(i,q) now that memory is usable */
    }

    if (vec_any_gt(xEv, sc_threshv) ) { //hit pthresh, so add position to list and reset values

      //figure out which model state hit threshold
      end = -1;
      rem_sc = -1;
      for (q = 0; q < Q; q++) {
        /// Unpack and unstripe, so we can find the state that exceeded pthresh
        u.v = dp[q];
        for (k = 0; k < 16; k++) { // unstripe
          //(q+Q*k+1) is the model position k at which the xE score is found
          if (u.b[k] >= sc_thresh && u.b[k] > rem_sc && (q+Q*k+1) <= om->M) {
            end = (q+Q*k+1);
            rem_sc = u.b[k];
          }
        }
        dp[q] = vec_splat_u8(0); // while we're here ... this will cause values to get reset to xB in next dp iteration
      }

      //recover the diagonal that hit threshold
      start = end;
      target_end = target_start = i;
      sc = rem_sc;
      while (rem_sc > om->base_b - om->tjb_b - om->tbm_b) {
        rem_sc -= om->bias_b - ssvdata->ssv_scores[start*om->abc->Kp + dsq[target_start]];
        --start;
        --target_start;
        //if ( start == 0 || target_start==0) break;
      }
      start++;
      target_start++;

      //extend diagonal further with single diagonal extension
      k = end+1;
      n = target_end+1;
      max_end = target_end;
      max_sc = sc;
      pos_since_max = 0;
      while (k<om->M && n<=L) {
        sc += om->bias_b - ssvdata->ssv_scores[k*om->abc->Kp + dsq[n]];
        if (sc >= max_sc) {
          max_sc = sc;
          max_end = n;
          pos_since_max = 0;
        } else {
          pos_since_max++;
          if (pos_since_max == 5)
            break;
        }
        k++;
        n++;
      }

      end       += (max_end - target_end);
      target_end = max_end;

      ret_sc = ((float) (max_sc - om->tjb_b) - (float) om->base_b);
      ret_sc /= om->scale_b;
      ret_sc -= 3.0; // that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ

      p7_hmmwindow_new( windowlist,
                        0,               // sequence_id; used in the FM-based filter, but not here
                        target_start,    // position in the target at which the diagonal starts
                        0,               // position in the target fm_index at which diagonal starts; not used here, just in FM-based filter
                        end,             // position in the model at which the diagonal ends
                        end-start+1 ,    // length of diagonal
                        ret_sc,          // score of diagonal
                        p7_NOCOMPLEMENT, // always p7_NOCOMPLEMENT here; varies in FM-based filter
                        L );

      i = target_end; // skip forward
    }
  } /* end loop over sequence residues 1..L */

  return eslOK;

ERROR:
  ESL_EXCEPTION(eslEMEM, "Error allocating memory for hit list\n");
}
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Wed Dec 26 15:12:25 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the
 *            estimated MSV score (in nats) in <ret_sc>.
 *
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq    - digital target sequence, 1..L
 *            L      - length of dsq in residues
 *            om     - optimized profile
 *            ox     - DP matrix
 *            ret_sc - RETURN: MSV score (in nats)
 *
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector unsigned char mpv;        /* previous row values */
  vector unsigned char xEv;        /* E state: keeps max for Mk->E as we go */
  vector unsigned char xBv;        /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector unsigned char sv;         /* temp storage of 1 curr row value in progress */
  vector unsigned char biasv;      /* emission bias in a vector */
  uint8_t xJ;                      /* special states' scores */
  int i;                           /* counter over sequence positions 1..L */
  int q;                           /* counter over vectors 0..nq-1 */
  int Q = p7O_NQB(om->M);          /* segment length: # of vectors */
  vector unsigned char *dp;        /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  vector unsigned char *rsc;       /* will point at om->rbv[x] for residue x[i] */
  vector unsigned char zerov;      /* vector of zeros */
  vector unsigned char xJv;        /* vector for states score */
  vector unsigned char tjbmv;      /* vector for B->Mk cost */
  vector unsigned char tecv;       /* vector for E->C cost */
  vector unsigned char basev;      /* offset for scores */
  vector unsigned char ceilingv;   /* saturated simd value used to test for overflow */
  vector unsigned char tempv;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M = om->M;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base. */
  dp = ox->dpb[0];
  for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0);
  xJ = 0;

  biasv = esl_vmx_set_u8(om->bias_b);
  zerov = vec_splat_u8(0);

  /* saturate simd register for overflow test */
  tempv = vec_splat_u8(1);
  ceilingv = (vector unsigned char)vec_cmpeq(biasv, biasv);
  ceilingv = vec_subs(ceilingv, biasv);
  ceilingv = vec_subs(ceilingv, tempv);

  basev = esl_vmx_set_u8((int8_t) om->base_b);
  tecv  = esl_vmx_set_u8((int8_t) om->tec_b);
  tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xJv = vec_subs(biasv, biasv);
  xBv = vec_subs(basev, tjbmv);

#if p7_DEBUGGING
  if (ox->debugging) {
    unsigned char xB;
    vec_ste(xBv, 0, &xB);
    vec_ste(xJv, 0, &xJ);
    p7_omx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
  }
#endif

  for (i = 1; i <= L; i++) {
    rsc = om->rbv[dsq[i]];
    xEv = vec_splat_u8(0);
    //  xBv = vec_sub(xBv, tbmv);

    /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12.
     * Because ia32 is little-endian, this means a left bit shift.
     * Zeros shift on automatically, which is our -infinity.
     */
    mpv = vec_sld(zerov, dp[Q-1], 15);
    for (q = 0; q < Q; q++) {
      /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
      sv   = vec_max(mpv, xBv);
      sv   = vec_adds(sv, biasv);
      sv   = vec_subs(sv, *rsc); rsc++;
      xEv  = vec_max(xEv, sv);

      mpv   = dp[q];   /* Load {MDI}(i-1,q) into mpv */
      dp[q] = sv;      /* Do delayed store of M(i,q) now that memory is usable */
    }

    /* Now the "special" states, which start from Mk->E (->C, ->J->B)
     * Use rotates instead of shifts so when the last max has completed,
     * all elements of the simd register will contain the max value.
     */
    tempv = vec_sld(xEv, xEv, 1);
    xEv = vec_max(xEv, tempv);
    tempv = vec_sld(xEv, xEv, 2);
    xEv = vec_max(xEv, tempv);
    tempv = vec_sld(xEv, xEv, 4);
    xEv = vec_max(xEv, tempv);
    tempv = vec_sld(xEv, xEv, 8);
    xEv = vec_max(xEv, tempv);

    /* immediately detect overflow */
    if (vec_any_gt(xEv, ceilingv)) {
      *ret_sc = eslINFINITY;
      return eslERANGE;
    }

    xEv = vec_subs(xEv, tecv);
    xJv = vec_max(xJv, xEv);

    xBv = vec_max(basev, xJv);
    xBv = vec_subs(xBv, tjbmv);

#if p7_DEBUGGING
    if (ox->debugging) {
      unsigned char xB, xE;
      vec_ste(xBv, 0, &xB);
      vec_ste(xEv, 0, &xE);
      vec_ste(xJv, 0, &xJ);
      p7_omx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
    }
#endif
  } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  vec_ste(xJv, 0, &xJ);
  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */

  return eslOK;
}
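The four vec_sld/vec_max pairs in p7_MSVFilter() implement a horizontal maximum: each step rotates the register and takes an element-wise max, so after log2(16) = 4 steps every byte lane holds the overall maximum. Below is a standalone sketch of that reduction, assuming only standard AltiVec intrinsics (the helper name is ours, not from the HMMER sources):

/* Sketch: reduce a 16 x u8 vector to the maximum of all its elements. */
static inline unsigned char vmax_u8_all(vector unsigned char v)
{
    vector unsigned char t;
    unsigned char out[16] __attribute__((aligned(16)));

    t = vec_sld(v, v, 1); v = vec_max(v, t);   /* rotate by 1 byte  */
    t = vec_sld(v, v, 2); v = vec_max(v, t);   /* rotate by 2 bytes */
    t = vec_sld(v, v, 4); v = vec_max(v, t);   /* rotate by 4 bytes */
    t = vec_sld(v, v, 8); v = vec_max(v, t);   /* rotate by 8 bytes */

    vec_st(v, 0, out);                         /* every lane now equals the max */
    return out[0];
}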
void pix_add :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
  int h, w, width;
  width = image.xsize/8;
  //format is U Y V Y

  union {
    //unsigned int i;
    short elements[8];
    //vector signed char v;
    vector signed short v;
  } shortBuffer;

  union {
    //unsigned int i;
    unsigned char elements[16];
    //vector signed char v;
    vector unsigned char v;
  } charBuffer;

  //vector unsigned char c;
  register vector signed short d, hiImage, loImage, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
  // vector unsigned char zero = vec_splat_u8(0);
  register vector unsigned char c, one;
  // vector signed short zshort = vec_splat_s16(0);

  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) right.data;

  //Write the pixel (pair) to the transfer buffer
  charBuffer.elements[0]  = 2;
  charBuffer.elements[1]  = 1;
  charBuffer.elements[2]  = 2;
  charBuffer.elements[3]  = 1;
  charBuffer.elements[4]  = 2;
  charBuffer.elements[5]  = 1;
  charBuffer.elements[6]  = 2;
  charBuffer.elements[7]  = 1;
  charBuffer.elements[8]  = 2;
  charBuffer.elements[9]  = 1;
  charBuffer.elements[10] = 2;
  charBuffer.elements[11] = 1;
  charBuffer.elements[12] = 2;
  charBuffer.elements[13] = 1;
  charBuffer.elements[14] = 2;
  charBuffer.elements[15] = 1;

  //Load it into the vector unit
  c = charBuffer.v;
  one = vec_splat_u8( 1 );

  shortBuffer.elements[0] = 255;

  //Load it into the vector unit
  d = shortBuffer.v;
  d = static_cast<vector signed short>(vec_splat(static_cast<vector signed short>(d), 0));

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
#endif

  for ( h=0; h<image.ysize; h++){
    for (w=0; w<width; w++) {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
#endif

      //interleaved U Y V Y chars

      //vec_mule UV * 2 to short vector U V U V shorts
      UVImage = static_cast<vector signed short>(vec_mule(one, inData[0]));
      UVRight = static_cast<vector signed short>(vec_mule(c, rightData[0]));

      //vec_mulo Y * 1 to short vector Y Y Y Y shorts
      YImage = static_cast<vector signed short>(vec_mulo(c, inData[0]));
      YRight = static_cast<vector signed short>(vec_mulo(c, rightData[0]));

      //vec_subs UV - 255
      UVRight = static_cast<vector signed short>(vec_subs(UVRight, d));

      //vec_adds UV
      UVTemp = vec_adds(UVImage, UVRight);

      //vec_adds Y
      YTemp = vec_adds(YImage, YRight);

      hiImage = vec_mergeh(UVTemp, YTemp);
      loImage = vec_mergel(UVTemp, YTemp);

      //vec_mergel + vec_mergeh Y and UV
      inData[0] = vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
#endif
  } /*end of working altivec function */
}
void dct_vmx (vector signed short *input, vector signed short *output,
              vector signed short *postscale)
{
  vector signed short mul0, mul1, mul2, mul3, mul4, mul5, mul6, mul;
  vector signed short v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
  vector signed short v20, v21, v22, v23, v24, v25, v26, v27, v31;
  int i;
  vector signed short in[8], out[8];

  /* Load first eight rows of input data */
  /* Load multiplication constants */

  /* Splat multiplication constants */
  mul0 = vec_splat(input[8], 0);
  mul1 = vec_splat(input[8], 1);
  mul2 = vec_splat(input[8], 2);
  mul3 = vec_splat(input[8], 3);
  mul4 = vec_splat(input[8], 4);
  mul5 = vec_splat(input[8], 5);
  mul6 = vec_splat(input[8], 6);

  /* Perform DCT on the eight columns */

  /*********** Stage 1 ***********/
  v8 = vec_adds (input[0], input[7]);
  v9 = vec_subs (input[0], input[7]);
  v0 = vec_adds (input[1], input[6]);
  v7 = vec_subs (input[1], input[6]);
  v1 = vec_adds (input[2], input[5]);
  v6 = vec_subs (input[2], input[5]);
  v2 = vec_adds (input[3], input[4]);
  v5 = vec_subs (input[3], input[4]);

  /*********** Stage 2 ***********/
  /* Top */
  v3 = vec_adds (v8, v2);      /* (V0+V7) + (V3+V4) */
  v4 = vec_subs (v8, v2);      /* (V0+V7) - (V3+V4) */
  v2 = vec_adds (v0, v1);      /* (V1+V6) + (V2+V5) */
  v8 = vec_subs (v0, v1);      /* (V1+V6) - (V2+V5) */

  /* Bottom */
  v0 = vec_subs (v7, v6);      /* (V1-V6) - (V2-V5) */
  v1 = vec_adds (v7, v6);      /* (V1-V6) + (V2-V5) */

  /*********** Stage 3 ***********/
  /* Top */
  in[0] = vec_adds (v3, v2);              /* y0 = v3 + v2 */
  in[4] = vec_subs (v3, v2);              /* y4 = v3 - v2 */
  in[2] = vec_mradds (v8, mul2, v4);      /* y2 = v8 * a0 + v4 */
  v6 = vec_mradds (v4, mul2, mul6);
  in[6] = vec_subs (v6, v8);              /* y6 = v4 * a0 - v8 */

  /* Bottom */
  v6 = vec_mradds (v0, mul0, v5);         /* v6 = v0 * (c4) + v5 */
  v7 = vec_mradds (v0, mul4, v5);         /* v7 = v0 * (-c4) + v5 */
  v2 = vec_mradds (v1, mul4, v9);         /* v2 = v1 * (-c4) + v9 */
  v3 = vec_mradds (v1, mul0, v9);         /* v3 = v1 * (c4) + v9 */

  /*********** Stage 4 ***********/
  /* Bottom */
  in[1] = vec_mradds (v6, mul3, v3);      /* y1 = v6 * (a1) + v3 */
  v23 = vec_mradds (v3, mul3, mul6);
  in[7] = vec_subs (v23, v6);             /* y7 = v3 * (a1) - v6 */
  in[5] = vec_mradds (v2, mul1, v7);      /* y5 = v2 * (a2) + v7 */
  in[3] = vec_mradds (v7, mul5, v2);      /* y3 = v7 * (-a2) + v2 */

  transpose_vmx (in, out);

  /* Perform DCT on the eight rows */

  /*********** Stage 1 ***********/
  v8 = vec_adds (out[0], out[7]);
  v9 = vec_subs (out[0], out[7]);
  v0 = vec_adds (out[1], out[6]);
  v7 = vec_subs (out[1], out[6]);
  v1 = vec_adds (out[2], out[5]);
  v6 = vec_subs (out[2], out[5]);
  v2 = vec_adds (out[3], out[4]);
  v5 = vec_subs (out[3], out[4]);

  /*********** Stage 2 ***********/
  /* Top */
  v3 = vec_adds (v8, v2);      /* (V0+V7) + (V3+V4) */
  v4 = vec_subs (v8, v2);      /* (V0+V7) - (V3+V4) */
  v2 = vec_adds (v0, v1);      /* (V1+V6) + (V2+V5) */
  v8 = vec_subs (v0, v1);      /* (V1+V6) - (V2+V5) */

  /* Bottom */
  v0 = vec_subs (v7, v6);      /* (V1-V6) - (V2-V5) */
  v1 = vec_adds (v7, v6);      /* (V1-V6) + (V2-V5) */

  /*********** Stage 3 ***********/
  /* Top */
  v25 = vec_subs (v25, v25);              /* reinit v25 = 0 */
  v20 = vec_adds (v3, v2);                /* y0 = v3 + v2 */
  v24 = vec_subs (v3, v2);                /* y4 = v3 - v2 */
  v22 = vec_mradds (v8, mul2, v4);        /* y2 = v8 * a0 + v4 */
  v6 = vec_mradds (v4, mul2, v25);
  v26 = vec_subs (v6, v8);                /* y6 = v4 * a0 - v8 */

  /* Bottom */
  v6 = vec_mradds (v0, mul0, v5);         /* v6 = v0 * (c4) + v5 */
  v7 = vec_mradds (v0, mul4, v5);         /* v7 = v0 * (-c4) + v5 */
  v2 = vec_mradds (v1, mul4, v9);         /* v2 = v1 * (-c4) + v9 */
  v3 = vec_mradds (v1, mul0, v9);         /* v3 = v1 * (c4) + v9 */

  /*********** Stage 4 ***********/
  /* Bottom */
  v21 = vec_mradds (v6, mul3, v3);        /* y1 = v6 * (a1) + v3 */
  v23 = vec_mradds (v3, mul3, v25);
  v27 = vec_subs (v23, v6);               /* y7 = v3 * (a1) - v6 */
  v25 = vec_mradds (v2, mul1, v7);        /* y5 = v2 * (a2) + v7 */
  v23 = vec_mradds (v7, mul5, v2);        /* y3 = v7 * (-a2) + v2 */

  /* Post-scale and store results */
  v31 = vec_subs (v31, v31);              /* reinit v31 = 0 */
  output[0] = vec_mradds (postscale[0], v20, v31);
  output[2] = vec_mradds (postscale[2], v22, v31);
  output[4] = vec_mradds (postscale[4], v24, v31);
  output[6] = vec_mradds (postscale[6], v26, v31);
  output[1] = vec_mradds (postscale[1], v21, v31);
  output[3] = vec_mradds (postscale[3], v23, v31);
  output[5] = vec_mradds (postscale[5], v25, v31);
  output[7] = vec_mradds (postscale[7], v27, v31);
}
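dct_vmx() leans heavily on vec_mradds(), which, as we understand the underlying vmhraddshs instruction, computes a rounded Q15 high product followed by a saturating add in each 16-bit lane. The scalar sketch below is ours, written only to illustrate one lane of that operation:

/* Scalar sketch (assumption: vec_mradds lane = saturate(((a*b + 0x4000) >> 15) + c)). */
static inline short mradds_scalar(short a, short b, short c)
{
    int prod = ((int)a * (int)b + 0x4000) >> 15;  /* rounded high product (Q15) */
    int sum  = prod + (int)c;
    if (sum >  32767) sum =  32767;               /* saturate to int16 range */
    if (sum < -32768) sum = -32768;
    return (short)sum;
}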
/* start of optimized motionblur */
void pix_motionblur :: processYUVAltivec(imageStruct &image)
{
  int h, w, width;
  signed short rightGain, imageGain;
  unsigned char *saved = m_savedImage.data;
  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  if(saved != m_savedImage.data) {
    m_savedImage.setBlack();
  }
  saved = m_savedImage.data;

  width = image.xsize/8;

  /*
  // hmm: why does it read 235 ?
  rightGain = (signed short)(235. * m_motionblur);
  imageGain = (signed short) (255. - (235. * m_motionblur));
  */
  rightGain = m_blur1;
  imageGain = m_blur0;

  union {
    signed short elements[8];
    vector signed short v;
  } shortBuffer;

  union {
    unsigned int elements[4];
    vector unsigned int v;
  } bitBuffer;

  register vector signed short gainAdd, hiImage, loImage, hiRight, loRight, YImage, UVImage;
  // register vector signed short loadhiImage, loadloImage, loadhiRight, loadloRight;
  register vector unsigned char loadImage, loadRight;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector signed int UVhi, UVlo, Yhi, Ylo;
  register vector signed int UVhiR, UVloR, YhiR, YloR;
  register vector signed short gainSub, gain, gainR; //,d;
  register vector unsigned int bitshift;
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) saved;

  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;
  gainSub = shortBuffer.v;

  shortBuffer.elements[0] = imageGain;
  gain = shortBuffer.v;
  gain = vec_splat(gain, 0 );

  shortBuffer.elements[0] = rightGain;
  gainR = shortBuffer.v;
  gainR = vec_splat(gainR, 0 );

  bitBuffer.elements[0] = 8;

  //Load it into the vector unit
  bitshift = bitBuffer.v;
  bitshift = vec_splat(bitshift, 0);

  shortBuffer.elements[0] = 128;

  //Load it into the vector unit
  gainAdd = shortBuffer.v;
  gainAdd = (vector signed short)vec_splat((vector signed short)gainAdd, 0);

# ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
# endif

  loadImage = inData[0];
  loadRight = rightData[0];

  for ( h=0; h<image.ysize; h++) {
    for (w=0; w<width; w++) {
# ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
# endif

      //interleaved U Y V Y chars
      hiImage = (vector signed short) vec_mergeh( zero, loadImage );
      loImage = (vector signed short) vec_mergel( zero, loadImage );
      hiRight = (vector signed short) vec_mergeh( zero, loadRight );
      loRight = (vector signed short) vec_mergel( zero, loadRight );

      //hoist that load!!
      loadImage = inData[1];
      loadRight = rightData[1];

      //subtract 128 from UV
      hiImage = vec_subs(hiImage, gainSub);
      loImage = vec_subs(loImage, gainSub);
      hiRight = vec_subs(hiRight, gainSub);
      loRight = vec_subs(loRight, gainSub);

      //now vec_mule the UV into two vector ints
      //change sone to gain
      UVhi  = vec_mule(gain, hiImage);
      UVlo  = vec_mule(gain, loImage);
      UVhiR = vec_mule(gainR, hiRight);
      UVloR = vec_mule(gainR, loRight);

      //now vec_mulo the Y into two vector ints
      Yhi  = vec_mulo(gain, hiImage);
      Ylo  = vec_mulo(gain, loImage);
      YhiR = vec_mulo(gainR, hiRight);
      YloR = vec_mulo(gainR, loRight);

      //this is where to do the add and bitshift due to the resolution
      //add UV
      UVhi = vec_adds(UVhi, UVhiR);
      UVlo = vec_adds(UVlo, UVloR);
      Yhi  = vec_adds(Yhi, YhiR);
      Ylo  = vec_adds(Ylo, YloR);

      //bitshift UV
      UVhi = vec_sra(UVhi, bitshift);
      UVlo = vec_sra(UVlo, bitshift);
      Yhi  = vec_sra(Yhi, bitshift);
      Ylo  = vec_sra(Ylo, bitshift);

      //pack the UV into a single short vector
      UVImage = vec_packs(UVhi, UVlo);

      //pack the Y into a single short vector
      YImage = vec_packs(Yhi, Ylo);

      //vec_mergel + vec_mergeh Y and UV
      hiImage = vec_mergeh(UVImage, YImage);
      loImage = vec_mergel(UVImage, YImage);

      //add 128 offset back
      hiImage = vec_adds(hiImage, gainSub);
      loImage = vec_adds(loImage, gainSub);

      //vec_mergel + vec_mergeh Y and UV
      rightData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);
      inData[0]    = (vector unsigned char)vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
  }
# ifndef PPC970
  //stop the cache streams
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
# endif
}/* end of working altivec function */
vector unsigned int f(vector unsigned int a, vector unsigned int b)
{
  return vec_subs(a, b);
}
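Since this wrapper only forwards to vec_subs(), it is a convenient place to illustrate the saturation semantics the examples in this section depend on: unsigned results clamp at zero instead of wrapping. A minimal, self-contained sketch (ours), assuming an AltiVec-capable compiler (e.g. gcc -maltivec) and <altivec.h>:

#include <altivec.h>
#include <stdio.h>

int main(void)
{
    vector unsigned int a = {10, 200, 7, 0};
    vector unsigned int b = {20, 100, 7, 1};
    vector unsigned int r = vec_subs(a, b);   /* {0, 100, 0, 0}: negatives saturate to 0 */

    union { vector unsigned int v; unsigned int e[4]; } u = { r };
    printf("%u %u %u %u\n", u.e[0], u.e[1], u.e[2], u.e[3]);
    return 0;
}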
void pix_background :: processYUVAltivec(imageStruct &image)
{
  register int h, w, i, j, width;
  int pixsize = image.xsize * image.ysize * image.csize;
  h = image.ysize;
  w = image.xsize/8;
  width = image.xsize/8;

  //check to see if the buffer isn't 16byte aligned (highly unlikely)
  if (image.ysize*image.xsize % 16 != 0){
    error("image not properly aligned for Altivec - try something SD or HD maybe?");
    return;
  }

  union{
    unsigned short s[8];
    vector unsigned short v;
  } shortBuffer;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format) m_reset=1;

  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data, image.data, pixsize);
    m_reset = 0;
  }

  register vector unsigned short UVres1, Yres1, UVres2, Yres2; //interleave;
  register vector unsigned short hiImage, loImage;
  register vector unsigned short Yrange, UVrange, Yblank, UVblank, blank;
  register vector bool short Ymasklo, Ymaskhi, UVmaskhi;
  register vector unsigned short Yhi, Ylo, UVhi, UVlo;
  register vector unsigned char one = vec_splat_u8(1);
  register vector unsigned short sone = vec_splat_u16(1);
  register vector unsigned int Uhi, Ulo, Vhi, Vlo, Ures, Vres;
  register vector bool int Umasklo, Umaskhi, Vmaskhi, Vmasklo;

  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) m_savedImage.data;

  shortBuffer.s[0] = m_Yrange;
  Yrange = shortBuffer.v;
  Yrange = vec_splat(Yrange, 0);

  shortBuffer.s[0] = 128;
  shortBuffer.s[1] = 0;
  shortBuffer.s[2] = 128;
  shortBuffer.s[3] = 0;
  shortBuffer.s[4] = 128;
  shortBuffer.s[5] = 0;
  shortBuffer.s[6] = 128;
  shortBuffer.s[7] = 0;
  blank = shortBuffer.v;

  shortBuffer.s[0] = 0;
  Yblank = shortBuffer.v;
  Yblank = vec_splat(Yblank, 0);

  shortBuffer.s[0] = 128;
  UVblank = shortBuffer.v;
  UVblank = vec_splat(UVblank, 0);

  shortBuffer.s[0] = m_Urange;
  shortBuffer.s[1] = m_Vrange;
  shortBuffer.s[2] = m_Urange;
  shortBuffer.s[3] = m_Vrange;
  shortBuffer.s[4] = m_Urange;
  shortBuffer.s[5] = m_Vrange;
  shortBuffer.s[6] = m_Urange;
  shortBuffer.s[7] = m_Vrange;
  UVrange = shortBuffer.v;

  //setup the cache prefetch -- A MUST!!!
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
#ifndef PPC970
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
#endif //PPC970

  for ( i=0; i<h; i++){
    for (j=0; j<w; j++) {
#ifndef PPC970
      //this function is probably memory bound on most G4's -- what else is new?
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
#endif

      //separate the U and V from Y
      UVres1 = (vector unsigned short)vec_mule(one, inData[0]);
      UVres2 = (vector unsigned short)vec_mule(one, rightData[0]);

      //vec_mulo Y * 1 to short vector Y Y Y Y shorts
      Yres1 = (vector unsigned short)vec_mulo(one, inData[0]);
      Yres2 = (vector unsigned short)vec_mulo(one, rightData[0]);

      Yhi = vec_adds(Yres2, Yrange);
      Ylo = vec_subs(Yres2, Yrange);

      //go to ints for comparison
      UVhi = vec_adds(UVres2, UVrange);
      UVlo = vec_subs(UVres2, UVrange);

      Uhi = vec_mule(sone, UVhi);
      Ulo = vec_mule(sone, UVlo);

      Vhi = vec_mulo(sone, UVhi);
      Vlo = vec_mulo(sone, UVlo);

      Ures = vec_mule(sone, UVres1);
      Vres = vec_mulo(sone, UVres1);

      Umasklo = vec_cmpgt(Ures, Ulo);
      Umaskhi = vec_cmplt(Ures, Uhi);

      Vmasklo = vec_cmpgt(Vres, Vlo);
      Vmaskhi = vec_cmplt(Vres, Vhi);

      Umaskhi = vec_and(Umaskhi, Umasklo);
      Vmaskhi = vec_and(Vmaskhi, Vmasklo);

      Umasklo = vec_and(Umaskhi, Vmaskhi);
      Vmasklo = vec_and(Umaskhi, Vmaskhi);

      hiImage = (vector unsigned short)vec_mergeh(Umasklo, Vmasklo);
      loImage = (vector unsigned short)vec_mergel(Umasklo, Vmasklo);

      //pack it back down to bool short
      UVmaskhi = (vector bool short)vec_packsu(hiImage, loImage);

      Ymasklo = vec_cmpgt(Yres1, Ylo);
      Ymaskhi = vec_cmplt(Yres1, Yhi);

      Ymaskhi = vec_and(Ymaskhi, Ymasklo);

      Ymaskhi = vec_and(Ymaskhi, UVmaskhi);
      UVmaskhi = vec_and(Ymaskhi, UVmaskhi);

      //bitwise comparison and move using the result of the comparison as a mask
      Yres1 = vec_sel(Yres1, Yblank, Ymaskhi);

      //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
      UVres1 = vec_sel(UVres1, UVblank, UVmaskhi);

      //merge the Y and UV back together
      hiImage = vec_mergeh(UVres1, Yres1);
      loImage = vec_mergel(UVres1, Yres1);

      //pack it back down to unsigned char to store
      inData[0] = vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss(0);
    vec_dss(1);
    vec_dss(2);
    vec_dss(3);
#endif
  }
}
void pix_diff :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
  long h, w, width;
  width = image.xsize/8;
  //format is U Y V Y

  union {
    //unsigned int i;
    short elements[8];
    //vector signed char v;
    vector short v;
  } shortBuffer;

  vector signed short d, hiImage, loImage, hiRight, loRight; //, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
  vector unsigned char zero = vec_splat_u8(0);
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) right.data;

  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;

  //Load it into the vector unit
  d = shortBuffer.v;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
#endif

  for ( h=0; h<image.ysize; h++){
    for (w=0; w<width; w++) {
#ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
#endif

      //interleaved U Y V Y chars
      //break out to unsigned shorts
      hiImage = (vector signed short) vec_mergeh( zero, inData[0] );
      loImage = (vector signed short) vec_mergel( zero, inData[0] );
      hiRight = (vector signed short) vec_mergeh( zero, rightData[0] );
      loRight = (vector signed short) vec_mergel( zero, rightData[0] );

      //subtract the 128 offset for UV
      hiImage = vec_subs(hiImage, d);
      loImage = vec_subs(loImage, d);
      hiRight = vec_subs(hiRight, d);
      loRight = vec_subs(loRight, d);

      hiImage = vec_subs(hiImage, hiRight);
      loImage = vec_subs(loImage, loRight);

      hiImage = vec_adds(hiImage, d);
      loImage = vec_adds(loImage, d);

      hiImage = vec_abs(hiImage);
      loImage = vec_abs(loImage);

      inData[0] = vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
#endif
  } /*end of working altivec function */
}