Example #1
0
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
                                       register vec_u8_t p1,
                                       register vec_u8_t p2,
                                       register vec_u8_t q0,
                                       register vec_u8_t tc0) {

    register vec_u8_t average = vec_avg(p0, q0);
    register vec_u8_t temp;
    register vec_u8_t uncliped;
    register vec_u8_t ones;
    register vec_u8_t max;
    register vec_u8_t min;
    register vec_u8_t newp1;

    temp = vec_xor(average, p2);
    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
    ones = vec_splat_u8(1);
    temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
    max = vec_adds(p1, tc0);
    min = vec_subs(p1, tc0);
    newp1 = vec_max(min, uncliped);
    newp1 = vec_min(max, newp1);
    return newp1;
}
Example #2
0
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
                                                   register vector unsigned char p1,
                                                   register vector unsigned char p2,
                                                   register vector unsigned char q0,
                                                   register vector unsigned char tc0) {

    register vector unsigned char average = vec_avg(p0, q0);
    register vector unsigned char temp;
    register vector unsigned char uncliped;
    register vector unsigned char ones;
    register vector unsigned char max;
    register vector unsigned char min;
    register vector unsigned char newp1;

    temp = vec_xor(average, p2);
    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
    ones = vec_splat_u8(1);
    temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
    max = vec_adds(p1, tc0);
    min = vec_subs(p1, tc0);
    newp1 = vec_max(min, uncliped);
    newp1 = vec_min(max, newp1);
    return newp1;
}
// out: o = |x-y| < a
static inline vec_u8_t diff_lt_altivec( register vec_u8_t x, register vec_u8_t y, register vec_u8_t a )
{
    register vec_u8_t diff = vec_subs(x, y);
    register vec_u8_t diffneg = vec_subs(y, x);
    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
    o = (vec_u8_t)vec_cmplt(o, a);
    return o;
}
static inline vector signed short convert16_altivec(vector signed int v1, vector signed int v2)
{
  register vector signed short result;
  v1 = vec_subs(v1, magic);
  v2 = vec_subs(v2, magic);
  result = vec_packs(v1, v2);

  return result;
}
Example #5
0
// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
                                                     register vector unsigned char y,
                                                     register vector unsigned char a) {

    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}
Example #6
0
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{

    int datasize = image.xsize * image.ysize / 4;
    vector signed short  hiImage, loImage, hiRight, loRight;
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *inData = (vector unsigned char *)image.data;
    vector unsigned char *rightData = (vector unsigned char *)right.data;

    #ifndef PPC970
   	UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
	vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        vec_dst( inData+256, prefetchSize, 2 );
        vec_dst( rightData+256, prefetchSize, 3 );
    #endif

    do {

        #ifndef PPC970
	vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        vec_dst( inData+256, prefetchSize, 2 );
        vec_dst( rightData+256, prefetchSize, 3 );
        #endif

        hiImage = (vector signed short)vec_mergeh(zero,inData[0]);
        loImage = (vector signed short)vec_mergel(zero,inData[0]);
        hiRight = (vector signed short)vec_mergeh(zero,rightData[0]);
        loRight = (vector signed short)vec_mergel(zero,rightData[0]);

        hiImage = vec_subs(hiImage,hiRight);
        loImage = vec_subs(loImage,loRight);

        hiImage = vec_abs(hiImage);
        loImage = vec_abs(loImage);

        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
    }
    while (--datasize);
    #ifndef PPC970
        vec_dss( 0 );
        vec_dss( 1 );
        vec_dss( 2 );
        vec_dss( 3 );
    #endif
}
Example #7
0
File: h264dsp.c Project: 63n/FFmpeg
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    vec_s32 v_dc32;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    v_dc32 = vec_lde(0, &dc);
    dc16 = VEC_SPLAT16((vec_s16)v_dc32, 1);

    if (size == 4)
        dc16 = VEC_SLD16(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
#if !HAVE_BIGENDIAN
    aligner = vec_perm(aligner, zero_u8v, vcswapc());
#endif
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
Example #8
0
void
gimp_composite_difference_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d,e,alpha_a,alpha_b;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      d=vec_min(alpha_a, alpha_b);

      a=vec_andc(a, alphamask);
      a=vec_adds(a, d);
      b=vec_andc(b, alphamask);
      d=vec_subs(a, b);
      e=vec_subs(b, a);
      d=vec_add(d,e);

      StoreUnaligned(d, D);

      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  /* process last pixels */
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  alpha_a=vec_and(a,alphamask);
  alpha_b=vec_and(b,alphamask);
  d=vec_min(alpha_a,alpha_b);

  a=vec_andc(a,alphamask);
  a=vec_adds(a,d);
  b=vec_andc(b,alphamask);
  d=vec_subs(a,b);
  e=vec_subs(b, a);
  d=vec_add(d,e);

  StoreUnalignedLess(d, D, length);
}
Example #9
0
void pix_invert :: processYUVAltivec(imageStruct &image)
{
int h,w,width;
   width = image.xsize/8;

    union{
        unsigned char c[16];
        vector unsigned char v;
    }charBuffer;

    vector unsigned char offset;
    vector unsigned char *inData = (vector unsigned char*) image.data;

    charBuffer.c[0] = 255;
    offset = charBuffer.v;
    offset = (vector unsigned char) vec_splat(offset,0);
    #ifndef PPC970
    UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
	vec_dst( inData, prefetchSize, 0 );
       #endif
    for ( h=0; h<image.ysize; h++){
        for (w=0; w<width; w++)
        {
        #ifndef PPC970
	vec_dst( inData, prefetchSize, 0 );
        #endif
        inData[0]=vec_subs(offset,inData[0]);
        inData++;

         }
         #ifndef PPC970
        vec_dss( 0 );
        #endif
    }  /*end of working altivec function */
}
Example #10
0
void pix_subtract :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
 int h,w,width;
   width = image.xsize/4;


    vector unsigned char *inData = (vector unsigned char*) image.data;
    vector unsigned char *rightData = (vector unsigned char*) right.data;

        #ifndef PPC970
   	UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
	vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        #endif
    for ( h=0; h<image.ysize; h++){
        for (w=0; w<width; w++)
        {
        #ifndef PPC970
	vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        #endif

            inData[0] = vec_subs(inData[0], rightData[0]);

            inData++;
            rightData++;
        }
        #ifndef PPC970
        vec_dss( 0 );
        vec_dss( 1 );
        #endif
    }  /*end of working altivec function */
}
Example #11
0
static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *block, int stride, int size)
{
    vec_s16 dc16;
    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
    LOAD_ZERO;
    DECLARE_ALIGNED(16, int, dc);
    int i;

    dc = (block[0] + 32) >> 6;
    block[0] = 0;
    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);

    if (size == 4)
        dc16 = vec_sld(dc16, zero_s16v, 8);
    dcplus = vec_packsu(dc16, zero_s16v);
    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v);

    aligner = vec_lvsr(0, dst);
    dcplus = vec_perm(dcplus, dcplus, aligner);
    dcminus = vec_perm(dcminus, dcminus, aligner);

    for (i = 0; i < size; i += 4) {
        v0 = vec_ld(0, dst+0*stride);
        v1 = vec_ld(0, dst+1*stride);
        v2 = vec_ld(0, dst+2*stride);
        v3 = vec_ld(0, dst+3*stride);

        v0 = vec_adds(v0, dcplus);
        v1 = vec_adds(v1, dcplus);
        v2 = vec_adds(v2, dcplus);
        v3 = vec_adds(v3, dcplus);

        v0 = vec_subs(v0, dcminus);
        v1 = vec_subs(v1, dcminus);
        v2 = vec_subs(v2, dcminus);
        v3 = vec_subs(v3, dcminus);

        vec_st(v0, 0, dst+0*stride);
        vec_st(v1, 0, dst+1*stride);
        vec_st(v2, 0, dst+2*stride);
        vec_st(v3, 0, dst+3*stride);

        dst += 4*stride;
    }
}
void imageFilterSubFrom_Altivec(unsigned char *dst, unsigned char *src, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        SUBFROM_PIXEL();
        --n; ++dst; ++src;
    }

    // Do bulk of processing using Altivec (sub 16 8-bit unsigned integers, with saturation)
    while(n >= 16) {
        vector unsigned char s = vec_ld(0,src);
        vector unsigned char d = vec_ld(0,dst);
        vector unsigned char r = vec_subs(d, s);
        vec_st(r,0,dst);

        n -= 16; src += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_SUBFROM();
}
Example #13
0
/* Function:  p7_SSVFilter_longtarget()
 * Synopsis:  Finds windows with SSV scores above some threshold (vewy vewy fast, in limited precision)
 *
 * Purpose:   Calculates an approximation of the SSV (single ungapped diagonal)
 *            score for regions of sequence <dsq> of length <L> residues, using
 *            optimized profile <om>, and a preallocated one-row DP matrix <ox>,
 *            and captures the positions at which such regions exceed the score
 *            required to be significant in the eyes of the calling function,
 *            which depends on the <bg> and <p> (usually p=0.02 for nhmmer).
 *            Note that this variant performs only SSV computations, never
 *            passing through the J state - the score required to pass SSV at
 *            the default threshold (or less restrictive) is sufficient to
 *            pass MSV in essentially all DNA models we've tested.
 *
 *            Above-threshold diagonals are captured into a preallocated list
 *            <windowlist>. Rather than simply capturing positions at which a
 *            score threshold is reached, this function establishes windows
 *            around those high-scoring positions, using scores in <msvdata>.
 *            These windows can be merged by the calling function.
 *
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues
 *            om      - optimized profile
 *            ox      - DP matrix
 *            msvdata    - compact representation of substitution scores, for backtracking diagonals
 *            bg         - the background model, required for translating a P-value threshold into a score threshold
 *            P          - p-value below which a region is captured as being above threshold
 *            windowlist - preallocated container for all hits (resized if necessary)
 *
 *
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_SSVFilter_longtarget(const ESL_DSQ *dsq, int L, P7_OPROFILE *om, P7_OMX *ox, const P7_SCOREDATA *ssvdata,
                        P7_BG *bg, double P, P7_HMM_WINDOWLIST *windowlist)
{

  vector unsigned char mpv;        /* previous row values                                       */
  vector unsigned char xEv;		   /* E state: keeps max for Mk->E as we go                     */
  vector unsigned char xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector unsigned char sv;		   /* temp storage of 1 curr row value in progress              */
  vector unsigned char biasv;	   /* emission bias in a vector                                 */
  uint8_t  xJ;                 /* special states' scores                                    */
  int i;			           /* counter over sequence positions 1..L                      */
  int q;			           /* counter over vectors 0..nq-1                              */
  int Q        = p7O_NQB(om->M);   /* segment length: # of vectors                              */
  vector unsigned char *dp  = ox->dpb[0];	   /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  vector unsigned char *rsc;			        /* will point at om->rbv[x] for residue x[i]                 */

  vector unsigned char zerov;	   /* vector of zeros                                           */
  vector unsigned char tecv;                    /* vector for E->C  cost                                     */
  vector unsigned char tjbmv;                    /* vector for [JN]->B->M move cost                                  */
  vector unsigned char basev;                   /* offset for scores                                         */

  int status;

  int k;
  int n;
  int end;
  int rem_sc;
  int start;
  int target_end;
  int target_start;
  int max_end;
  int max_sc;
  int sc;
  int pos_since_max;
  float ret_sc;

  union { vector unsigned char v; uint8_t b[16]; } u;


  /*
   * Computing the score required to let P meet the F1 prob threshold
   * In original code, converting from a scaled int MSV
   * score S (the score getting to state E) to a probability goes like this:
   *  usc =  S - om->tec_b - om->tjb_b - om->base_b;
   *  usc /= om->scale_b;
   *  usc -= 3.0;
   *  P = f ( (usc - nullsc) / eslCONST_LOG2 , mu, lambda)
   * and we're computing the threshold usc, so reverse it:
   *  (usc - nullsc) /  eslCONST_LOG2 = inv_f( P, mu, lambda)
   *  usc = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda)
   *  usc += 3
   *  usc *= om->scale_b
   *  S = usc + om->tec_b + om->tjb_b + om->base_b
   *
   *  Here, I compute threshold with length model based on max_length.  Doesn't
   *  matter much - in any case, both the bg and om models will change with roughly
   *  1 bit for each doubling of the length model, so they offset.
   */
  float nullsc;
  float invP = esl_gumbel_invsurv(P, om->evparam[p7_MMU],  om->evparam[p7_MLAMBDA]);
  vector unsigned char sc_threshv;               /* pushes value to saturation if it's above pthresh  */
  int sc_thresh;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16)  ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M   = om->M;


  p7_bg_SetLength(bg, om->max_length);
  p7_oprofile_ReconfigMSVLength(om, om->max_length);
  p7_bg_NullOne  (bg, dsq, om->max_length, &nullsc);

  sc_thresh = (int) ceil( ( ( nullsc  + (invP * eslCONST_LOG2) + 3.0 )  * om->scale_b ) + om->base_b +  om->tec_b  + om->tjb_b  );
  sc_threshv = esl_vmx_set_u8( (int8_t)sc_thresh - 1);


  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.
   */
  biasv = esl_vmx_set_u8(om->bias_b);
  for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0);
  xJ   = 0;
  zerov = vec_splat_u8(0);


  basev = esl_vmx_set_u8((int8_t) om->base_b);
  tecv = esl_vmx_set_u8((int8_t) om->tec_b);
  tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xBv = vec_subs(basev, tjbmv);

  for (i = 1; i <= L; i++) {
	  rsc = om->rbv[dsq[i]];
    xEv = vec_splat_u8(0);

	  /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12.
	   * Because ia32 is littlendian, this means a left bit shift.
	   * Zeros shift on automatically, which is our -infinity.
	   */
    mpv = vec_sld(zerov, dp[Q-1], 15);
	  for (q = 0; q < Q; q++)
	  {
		  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
		  sv   = vec_max(mpv, xBv);
		  sv   = vec_adds(sv, biasv);
		  sv   = vec_subs(sv, *rsc);   rsc++;
		  xEv  = vec_max(xEv, sv);

		  mpv   = dp[q];   	  /* Load {MDI}(i-1,q) into mpv */
		  dp[q] = sv;       	  /* Do delayed store of M(i,q) now that memory is usable */
	  }


	  if (vec_any_gt(xEv, sc_threshv) ) { //hit pthresh, so add position to list and reset values
      //figure out which model state hit threshold
      end = -1;
      rem_sc = -1;
      for (q = 0; q < Q; q++) {  /// Unpack and unstripe, so we can find the state that exceeded pthresh
          u.v = dp[q];
          for (k = 0; k < 16; k++) { // unstripe
            //(q+Q*k+1) is the model position k at which the xE score is found
            if (u.b[k] >= sc_thresh && u.b[k] > rem_sc && (q+Q*k+1) <= om->M) {
              end = (q+Q*k+1);
              rem_sc = u.b[k];
            }
          }
          dp[q] = vec_splat_u8(0); // while we're here ... this will cause values to get reset to xB in next dp iteration
      }

      //recover the diagonal that hit threshold
      start = end;
      target_end = target_start = i;
      sc = rem_sc;
      while (rem_sc > om->base_b - om->tjb_b - om->tbm_b) {
        rem_sc -= om->bias_b -  ssvdata->ssv_scores[start*om->abc->Kp + dsq[target_start]];
        --start;
        --target_start;
        //if ( start == 0 || target_start==0)    break;
      }
      start++;
      target_start++;



      //extend diagonal further with single diagonal extension
      k = end+1;
      n = target_end+1;
      max_end = target_end;
      max_sc = sc;
      pos_since_max = 0;
      while (k<om->M && n<=L) {
        sc += om->bias_b -  ssvdata->ssv_scores[k*om->abc->Kp + dsq[n]];
        if (sc >= max_sc) {
          max_sc = sc;
          max_end = n;
          pos_since_max=0;
        } else {
          pos_since_max++;
          if (pos_since_max == 5)
            break;
        }
        k++;
        n++;
      }

      end  +=  (max_end - target_end);
      target_end = max_end;

      ret_sc = ((float) (max_sc - om->tjb_b) - (float) om->base_b);
      ret_sc /= om->scale_b;
      ret_sc -= 3.0; // that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ

      p7_hmmwindow_new(  windowlist,
                         0,                  // sequence_id; used in the FM-based filter, but not here
                         target_start,       // position in the target at which the diagonal starts
                         0,                  // position in the target fm_index at which diagonal starts;  not used here, just in FM-based filter
                         end,                // position in the model at which the diagonal ends
                         end-start+1 ,       // length of diagonal
                         ret_sc,             // score of diagonal
                         p7_NOCOMPLEMENT,    // always p7_NOCOMPLEMENT here;  varies in FM-based filter
                         L
                       );



      i = target_end; // skip forward


	  }

  } /* end loop over sequence residues 1..L */

  return eslOK;


  ERROR:
  ESL_EXCEPTION(eslEMEM, "Error allocating memory for hit list\n");

}
Example #14
0
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Wed Dec 26 15:12:25 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the 
 *            estimated MSV score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *            
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ret_sc  - RETURN: MSV score (in nats)          
 *                      
 * Note:      We misuse the matrix <ox> here, using only a third of the
 *            first dp row, accessing it as <dp[0..Q-1]> rather than
 *            in triplets via <{MDI}MX(q)> macros, since we only need
 *            to store M state values. We know that if <ox> was big
 *            enough for normal DP calculations, it must be big enough
 *            to hold the MSVFilter calculation.
 *
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small.
 */
int
p7_MSVFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector unsigned char mpv;        /* previous row values                                       */
  vector unsigned char xEv;	   /* E state: keeps max for Mk->E as we go                     */
  vector unsigned char xBv;	   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector unsigned char sv;	   /* temp storage of 1 curr row value in progress              */
  vector unsigned char biasv;	   /* emission bias in a vector                                 */
  uint8_t xJ;                      /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over vectors 0..nq-1                              */
  int Q        = p7O_NQB(om->M);   /* segment length: # of vectors                              */
  vector unsigned char *dp;	   /* we're going to use dp[0][0..q..Q-1], not {MDI}MX(q) macros*/
  vector unsigned char *rsc;	   /* will point at om->rbv[x] for residue x[i]                 */

  vector unsigned char zerov;	   /* vector of zeros                                           */
  vector unsigned char xJv;        /* vector for states score                                   */
  vector unsigned char tjbmv;       /* vector for B->Mk cost                                     */
  vector unsigned char tecv;       /* vector for E->C  cost                                     */
  vector unsigned char basev;      /* offset for scores                                         */
  vector unsigned char ceilingv;   /* saturateed simd value used to test for overflow           */
  vector unsigned char tempv;

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ16)  ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  ox->M   = om->M;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.
   */
  dp  = ox->dpb[0];
  for (q = 0; q < Q; q++) dp[q] = vec_splat_u8(0);
  xJ   = 0;

  biasv = esl_vmx_set_u8(om->bias_b);
  zerov = vec_splat_u8(0);

  /* saturate simd register for overflow test */
  tempv = vec_splat_u8(1);
  ceilingv = (vector unsigned char)vec_cmpeq(biasv, biasv);
  ceilingv = vec_subs(ceilingv, biasv);
  ceilingv = vec_subs(ceilingv, tempv);

  basev = esl_vmx_set_u8((int8_t) om->base_b);

  tecv = esl_vmx_set_u8((int8_t) om->tec_b);
  tjbmv = esl_vmx_set_u8((int8_t) om->tjb_b + (int8_t) om->tbm_b);

  xJv = vec_subs(biasv, biasv);
  xBv = vec_subs(basev, tjbmv);

#if p7_DEBUGGING
  if (ox->debugging)
	{
	  unsigned char xB;
	  vec_ste(xBv, 0, &xB);
	  vec_ste(xJv, 0, &xJ);
	  p7_omx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
	}
#endif

  for (i = 1; i <= L; i++)
  {
      rsc = om->rbv[dsq[i]];
      xEv = vec_splat_u8(0);
//      xBv = vec_sub(xBv, tbmv);

      /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. 
       * Because ia32 is littlendian, this means a left bit shift.
       * Zeros shift on automatically, which is our -infinity.
       */
      mpv = vec_sld(zerov, dp[Q-1], 15);   
      for (q = 0; q < Q; q++)
      {
        /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
        sv   = vec_max(mpv, xBv);
        sv   = vec_adds(sv, biasv);
        sv   = vec_subs(sv, *rsc);   rsc++;
        xEv  = vec_max(xEv, sv);

        mpv   = dp[q];   	  /* Load {MDI}(i-1,q) into mpv */
        dp[q] = sv;       	  /* Do delayed store of M(i,q) now that memory is usable */
      }

      /* Now the "special" states, which start from Mk->E (->C, ->J->B)
       * Use rotates instead of shifts so when the last max has completed,
       * all elements of the simd register will contain the max value.
       */
      tempv = vec_sld(xEv, xEv, 1);
      xEv = vec_max(xEv, tempv);
      tempv = vec_sld(xEv, xEv, 2);
      xEv = vec_max(xEv, tempv);
      tempv = vec_sld(xEv, xEv, 4);
      xEv = vec_max(xEv, tempv);
      tempv = vec_sld(xEv, xEv, 8);
      xEv = vec_max(xEv, tempv);

      /* immediately detect overflow */
      if (vec_any_gt(xEv, ceilingv))
      {
        *ret_sc = eslINFINITY;
        return eslERANGE;
      }

      xEv = vec_subs(xEv, tecv);
      xJv = vec_max(xJv,xEv);

      xBv = vec_max(basev, xJv);
      xBv = vec_subs(xBv, tjbmv);
	  
#if p7_DEBUGGING
      if (ox->debugging)
      {
        unsigned char xB, xE;
        vec_ste(xBv, 0, &xB);
        vec_ste(xEv, 0, &xE);
        vec_ste(xJv, 0, &xJ);
        p7_omx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
      }
#endif
  } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  vec_ste(xJv, 0, &xJ);
  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */

  return eslOK;
}
Example #15
0
void pix_add :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
 int h,w,width;
   width = image.xsize/8;
   //format is U Y V Y
    union
    {
        //unsigned int	i;
        short	elements[8];
        //vector signed char v;
        vector	signed short v;
    }shortBuffer;

        union
    {
        //unsigned int	i;
        unsigned char	elements[16];
        //vector signed char v;
        vector	unsigned char v;
    }charBuffer;

    //vector unsigned char c;
    register vector signed short d, hiImage, loImage, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
   // vector unsigned char zero = vec_splat_u8(0);
    register vector unsigned char c,one;
  //  vector signed short zshort = vec_splat_s16(0);
    vector unsigned char *inData = (vector unsigned char*) image.data;
    vector unsigned char *rightData = (vector unsigned char*) right.data;

    //Write the pixel (pair) to the transfer buffer
    charBuffer.elements[0] = 2;
    charBuffer.elements[1] = 1;
    charBuffer.elements[2] = 2;
    charBuffer.elements[3] = 1;
    charBuffer.elements[4] = 2;
    charBuffer.elements[5] = 1;
    charBuffer.elements[6] = 2;
    charBuffer.elements[7] = 1;
    charBuffer.elements[8] = 2;
    charBuffer.elements[9] = 1;
    charBuffer.elements[10] = 2;
    charBuffer.elements[11] = 1;
    charBuffer.elements[12] = 2;
    charBuffer.elements[13] = 1;
    charBuffer.elements[14] = 2;
    charBuffer.elements[15] = 1;


    //Load it into the vector unit
    c = charBuffer.v;

    one =  vec_splat_u8( 1 );

    shortBuffer.elements[0] = 255;

    //Load it into the vector unit
    d = shortBuffer.v;
    d = static_cast<vector signed short>(vec_splat(static_cast<vector signed short>(d),0));
#ifndef PPC970
    UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
#endif
    for ( h=0; h<image.ysize; h++){
      for (w=0; w<width; w++)
        {
#ifndef PPC970
	  vec_dst( inData, prefetchSize, 0 );
	  vec_dst( rightData, prefetchSize, 1 );
#endif
	  //interleaved U Y V Y chars

	  //vec_mule UV * 2 to short vector U V U V shorts
	  UVImage = static_cast<vector signed short>(vec_mule(one,inData[0]));
	  UVRight = static_cast<vector signed short>(vec_mule(c,rightData[0]));

	  //vec_mulo Y * 1 to short vector Y Y Y Y shorts
	  YImage = static_cast<vector signed short>(vec_mulo(c,inData[0]));
	  YRight = static_cast<vector signed short>(vec_mulo(c,rightData[0]));

	  //vel_subs UV - 255
	  UVRight = static_cast<vector signed short>(vec_subs(UVRight, d));

	  //vec_adds UV
	  UVTemp = vec_adds(UVImage,UVRight);

	  //vec_adds Y
	  YTemp = vec_adds(YImage,YRight);

	  hiImage = vec_mergeh(UVTemp,YTemp);
	  loImage = vec_mergel(UVTemp,YTemp);

	  //vec_mergel + vec_mergeh Y and UV
	  inData[0] = vec_packsu(hiImage, loImage);

	  inData++;
	  rightData++;
        }
#ifndef PPC970
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }  /*end of working altivec function */
}
Example #16
0
File: dct.c Project: 0day-ci/gcc
void
dct_vmx (vector signed short *input, vector signed short *output,
	 vector signed short *postscale)
{
  vector signed short mul0, mul1, mul2, mul3, mul4, mul5, mul6, mul;
  vector signed short v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
  vector signed short v20, v21, v22, v23, v24, v25, v26, v27, v31;
  int i;
  vector signed short in[8], out[8];

  /* Load first eight rows of input data */

  /* Load multiplication constants */

  /* Splat multiplication constants */
  mul0 = vec_splat(input[8],0);
  mul1 = vec_splat(input[8],1);
  mul2 = vec_splat(input[8],2);
  mul3 = vec_splat(input[8],3);
  mul4 = vec_splat(input[8],4);
  mul5 = vec_splat(input[8],5);
  mul6 = vec_splat(input[8],6);

  /* Perform DCT on the eight columns */

  /*********** Stage 1 ***********/

  v8 = vec_adds (input[0], input[7]);
  v9 = vec_subs (input[0], input[7]);
  v0 = vec_adds (input[1], input[6]);
  v7 = vec_subs (input[1], input[6]);
  v1 = vec_adds (input[2], input[5]);
  v6 = vec_subs (input[2], input[5]);
  v2 = vec_adds (input[3], input[4]);
  v5 = vec_subs (input[3], input[4]);

  /*********** Stage 2 ***********/

  /* Top */
  v3 = vec_adds (v8, v2);		/* (V0+V7) + (V3+V4) */
  v4 = vec_subs (v8, v2);		/* (V0+V7) - (V3+V4) */
  v2 = vec_adds (v0, v1);		/* (V1+V6) + (V2+V5) */
  v8 = vec_subs (v0, v1);		/* (V1+V6) - (V2+V5) */

  /* Bottom */
  v0 = vec_subs (v7, v6);		/* (V1-V6) - (V2-V5) */
  v1 = vec_adds (v7, v6);		/* (V1-V6) + (V2-V5) */

  /*********** Stage 3 ***********/

  /* Top */
  in[0] = vec_adds (v3, v2);		/* y0 = v3 + v2 */
  in[4] = vec_subs (v3, v2);		/* y4 = v3 - v2 */
  in[2] = vec_mradds (v8, mul2, v4);	/* y2 = v8 * a0 + v4 */
  v6 = vec_mradds (v4, mul2, mul6);	
  in[6] = vec_subs (v6, v8);		/* y6 = v4 * a0 - v8 */

  /* Bottom */
  v6 = vec_mradds (v0, mul0, v5);	/* v6 = v0 * (c4) + v5 */
  v7 = vec_mradds (v0, mul4, v5);	/* v7 = v0 * (-c4) + v5 */
  v2 = vec_mradds (v1, mul4, v9);	/* v2 = v1 * (-c4) + v9 */
  v3 = vec_mradds (v1, mul0, v9);	/* v3 = v1 * (c4) + v9 */

  /*********** Stage 4 ***********/

  /* Bottom */
  in[1] = vec_mradds (v6, mul3, v3);	/* y1 = v6 * (a1) + v3 */
  v23 = vec_mradds (v3, mul3, mul6);
  in[7] = vec_subs (v23, v6);		/* y7 = v3 * (a1) - v6 */
  in[5] = vec_mradds (v2, mul1, v7);	/* y5 = v2 * (a2) + v7 */
  in[3] = vec_mradds (v7, mul5, v2);	/* y3 = v7 * (-a2) + v2 */

  transpose_vmx (in, out);

  /* Perform DCT on the eight rows */

  /*********** Stage 1 ***********/

  v8 = vec_adds (out[0], out[7]);
  v9 = vec_subs (out[0], out[7]);
  v0 = vec_adds (out[1], out[6]);
  v7 = vec_subs (out[1], out[6]);
  v1 = vec_adds (out[2], out[5]);
  v6 = vec_subs (out[2], out[5]);
  v2 = vec_adds (out[3], out[4]);
  v5 = vec_subs (out[3], out[4]);

  /*********** Stage 2 ***********/

  /* Top */
  v3 = vec_adds (v8, v2);		/* (V0+V7) + (V3+V4) */
  v4 = vec_subs (v8, v2);		/* (V0+V7) - (V3+V4) */
  v2 = vec_adds (v0, v1);		/* (V1+V6) + (V2+V5) */
  v8 = vec_subs (v0, v1);		/* (V1+V6) - (V2+V5) */

  /* Bottom */
  v0 = vec_subs (v7, v6);		/* (V1-V6) - (V2-V5) */
  v1 = vec_adds (v7, v6);		/* (V1-V6) + (V2-V5) */

  /*********** Stage 3 ***********/

  /* Top */
  v25 = vec_subs (v25, v25);          /* reinit v25 = 0 */

  v20 = vec_adds (v3, v2);		/* y0 = v3 + v2 */
  v24 = vec_subs (v3, v2);		/* y4 = v3 - v2 */
  v22 = vec_mradds (v8, mul2, v4);	/* y2 = v8 * a0 + v4 */
  v6 = vec_mradds (v4, mul2, v25);	
  v26 = vec_subs (v6, v8);		/* y6 = v4 * a0 - v8 */

  /* Bottom */
  v6 = vec_mradds (v0, mul0, v5);	/* v6 = v0 * (c4) + v5 */
  v7 = vec_mradds (v0, mul4, v5);	/* v7 = v0 * (-c4) + v5 */
  v2 = vec_mradds (v1, mul4, v9);	/* v2 = v1 * (-c4) + v9 */
  v3 = vec_mradds (v1, mul0, v9);	/* v3 = v1 * (c4) + v9 */

  /*********** Stage 4 ***********/

  /* Bottom */
  v21 = vec_mradds (v6, mul3, v3);	/* y1 = v6 * (a1) + v3 */
  v23 = vec_mradds (v3, mul3, v25);
  v27 = vec_subs (v23, v6);		/* y7 = v3 * (a1) - v6 */
  v25 = vec_mradds (v2, mul1, v7);	/* y5 = v2 * (a2) + v7 */
  v23 = vec_mradds (v7, mul5, v2);	/* y3 = v7 * (-a2) + v2 */

  /* Post-scale and store reults */

  v31 = vec_subs (v31, v31);          /* reinit v25 = 0 */

  output[0] = vec_mradds (postscale[0], v20, v31);
  output[2] = vec_mradds (postscale[2], v22, v31);
  output[4] = vec_mradds (postscale[4], v24, v31);
  output[6] = vec_mradds (postscale[6], v26, v31);
  output[1] = vec_mradds (postscale[1], v21, v31);
  output[3] = vec_mradds (postscale[3], v23, v31);
  output[5] = vec_mradds (postscale[5], v25, v31);
  output[7] = vec_mradds (postscale[7], v27, v31);
}
Example #17
0
/* start of optimized motionblur */
void pix_motionblur :: processYUVAltivec(imageStruct &image)
{
  int h,w,width;
  signed short rightGain,imageGain;
  unsigned char *saved = m_savedImage.data;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  if(saved!=m_savedImage.data) {
    m_savedImage.setBlack();
  }
  saved=m_savedImage.data;

  width = image.xsize/8;
  /*
  // hmm: why does it read 235 ?
  rightGain = (signed short)(235. * m_motionblur);
  imageGain = (signed short) (255. - (235. * m_motionblur));
  */
  rightGain = m_blur1;
  imageGain = m_blur0;

  union {
    signed short        elements[8];
    vector      signed short v;
  } shortBuffer;

  union {
    unsigned int        elements[4];
    vector      unsigned int v;
  } bitBuffer;

  register vector signed short gainAdd, hiImage, loImage,hiRight,loRight,
           YImage, UVImage;
  // register vector signed short loadhiImage, loadloImage,loadhiRight,loadloRight;
  register vector unsigned char loadImage, loadRight;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector signed int UVhi,UVlo,Yhi,Ylo;
  register vector signed int UVhiR,UVloR,YhiR,YloR;
  register vector signed short gainSub,gain,gainR;//,d;
  register vector unsigned int bitshift;
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) saved;


  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;

  gainSub = shortBuffer.v;

  shortBuffer.elements[0] = imageGain;
  gain = shortBuffer.v;
  gain =  vec_splat(gain, 0 );

  shortBuffer.elements[0] = rightGain;
  gainR = shortBuffer.v;
  gainR =  vec_splat(gainR, 0 );

  bitBuffer.elements[0] = 8;

  //Load it into the vector unit
  bitshift = bitBuffer.v;
  bitshift = vec_splat(bitshift,0);

  shortBuffer.elements[0] = 128;

  //Load it into the vector unit
  gainAdd = shortBuffer.v;
  gainAdd = (vector signed short)vec_splat((vector signed short)gainAdd,0);

# ifndef PPC970
  UInt32                        prefetchSize = GetPrefetchConstant( 16, 1,
      256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
# endif

  loadImage = inData[0];
  loadRight = rightData[0];

  for ( h=0; h<image.ysize; h++) {
    for (w=0; w<width; w++) {
# ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
# endif
      //interleaved U Y V Y chars

      hiImage = (vector signed short) vec_mergeh( zero, loadImage );
      loImage = (vector signed short) vec_mergel( zero, loadImage );

      hiRight = (vector signed short) vec_mergeh( zero, loadRight );
      loRight = (vector signed short) vec_mergel( zero, loadRight );

      //hoist that load!!
      loadImage = inData[1];
      loadRight = rightData[1];

      //subtract 128 from UV

      hiImage = vec_subs(hiImage,gainSub);
      loImage = vec_subs(loImage,gainSub);

      hiRight = vec_subs(hiRight,gainSub);
      loRight = vec_subs(loRight,gainSub);

      //now vec_mule the UV into two vector ints
      //change sone to gain
      UVhi = vec_mule(gain,hiImage);
      UVlo = vec_mule(gain,loImage);

      UVhiR = vec_mule(gainR,hiRight);
      UVloR = vec_mule(gainR,loRight);

      //now vec_mulo the Y into two vector ints
      Yhi = vec_mulo(gain,hiImage);
      Ylo = vec_mulo(gain,loImage);

      YhiR = vec_mulo(gainR,hiRight);
      YloR = vec_mulo(gainR,loRight);


      //this is where to do the add and bitshift due to the resolution
      //add UV
      UVhi = vec_adds(UVhi,UVhiR);
      UVlo = vec_adds(UVlo,UVloR);

      Yhi = vec_adds(Yhi,YhiR);
      Ylo = vec_adds(Ylo,YloR);

      //bitshift UV
      UVhi = vec_sra(UVhi,bitshift);
      UVlo = vec_sra(UVlo,bitshift);

      Yhi = vec_sra(Yhi,bitshift);
      Ylo = vec_sra(Ylo,bitshift);

      //pack the UV into a single short vector
      UVImage =  vec_packs(UVhi,UVlo);

      //pack the Y into a single short vector
      YImage =  vec_packs(Yhi,Ylo);

      //vec_mergel + vec_mergeh Y and UV
      hiImage =  vec_mergeh(UVImage,YImage);
      loImage =  vec_mergel(UVImage,YImage);

      //add 128 offset back
      hiImage = vec_adds(hiImage,gainSub);
      loImage = vec_adds(loImage,gainSub);

      //vec_mergel + vec_mergeh Y and UV
      rightData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);
      inData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
  }
# ifndef PPC970
  //stop the cache streams
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
# endif


}/* end of working altivec function */
Example #18
0
File: 3b-01.c Project: 0day-ci/gcc
vector unsigned int
f(vector unsigned int a, vector unsigned int b) 
{
  return vec_subs(a,b);
}
Example #19
0
void pix_background :: processYUVAltivec(imageStruct &image)
{
register int h,w,i,j,width;
int pixsize = image.xsize * image.ysize * image.csize;
    h = image.ysize;
    w = image.xsize/8;
    width = image.xsize/8;
    
    //check to see if the buffer isn't 16byte aligned (highly unlikely)
    if (image.ysize*image.xsize % 16 != 0){
        error("image not properly aligned for Altivec - try something SD or HD maybe?");
        return;
        }
    
    union{
        unsigned short		s[8];
        vector unsigned short	v;
    }shortBuffer;

    if(m_savedImage.xsize!=image.xsize ||
       m_savedImage.ysize!=image.ysize ||
       m_savedImage.format!=image.format)m_reset=1;

    m_savedImage.xsize=image.xsize;
    m_savedImage.ysize=image.ysize;
    m_savedImage.setCsizeByFormat(image.format);
    m_savedImage.reallocate();
    
    if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    m_reset = 0; 
    }
    
    register vector unsigned short	UVres1, Yres1, UVres2, Yres2;//interleave;
    register vector unsigned short	hiImage, loImage;
    register vector unsigned short	Yrange, UVrange, Yblank,UVblank,blank;
    register vector bool short		Ymasklo,Ymaskhi,  UVmaskhi;
    register vector unsigned short	Yhi,Ylo,UVhi,UVlo; 
    register vector unsigned char	one = vec_splat_u8(1);
    register vector unsigned short	sone = vec_splat_u16(1);
    register vector unsigned int			Uhi, Ulo, Vhi, Vlo,Ures,Vres;
    register vector bool int 			Umasklo, Umaskhi, Vmaskhi, Vmasklo;

    vector unsigned char	*inData = (vector unsigned char*) image.data;
    vector unsigned char	*rightData = (vector unsigned char*) m_savedImage.data;
    
    shortBuffer.s[0] =  m_Yrange;
    Yrange = shortBuffer.v;
    Yrange = vec_splat(Yrange,0);
    
    shortBuffer.s[0] = 128;
    shortBuffer.s[1] = 0;
    shortBuffer.s[2] = 128;
    shortBuffer.s[3] = 0;
    shortBuffer.s[4] = 128;
    shortBuffer.s[5] = 0;
    shortBuffer.s[6] = 128;
    shortBuffer.s[7] = 0;
    blank = shortBuffer.v;
    
    shortBuffer.s[0] =  0;
    Yblank = shortBuffer.v;
    Yblank = vec_splat(Yblank,0);
    
    shortBuffer.s[0] =  128;
    UVblank = shortBuffer.v;
    UVblank = vec_splat(UVblank,0);
    
    shortBuffer.s[0] = m_Urange;
    shortBuffer.s[1] = m_Vrange;
    shortBuffer.s[2] = m_Urange;
    shortBuffer.s[3] = m_Vrange;
    shortBuffer.s[4] = m_Urange;
    shortBuffer.s[5] = m_Vrange;
    shortBuffer.s[6] = m_Urange;
    shortBuffer.s[7] = m_Vrange;
    UVrange = shortBuffer.v;
    
    
    //setup the cache prefetch -- A MUST!!!
    UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    #ifndef PPC970 
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( rightData+32, prefetchSize, 3 );
    #endif //PPC970
    
    for ( i=0; i<h; i++){
        for (j=0; j<w; j++)
        {
        #ifndef PPC970
        //this function is probably memory bound on most G4's -- what else is new?
            vec_dst( inData, prefetchSize, 0 );
            vec_dst( rightData, prefetchSize, 1 );
            vec_dst( inData+32, prefetchSize, 2 );
            vec_dst( rightData+32, prefetchSize, 3 );
        #endif
        //separate the U and V from Y
        UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
        UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);
            
        //vec_mulo Y * 1 to short vector Y Y Y Y shorts
        Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
        Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);
        
        Yhi = vec_adds(Yres2,Yrange);
        Ylo = vec_subs(Yres2,Yrange);
        
        //go to ints for comparison
        UVhi = vec_adds(UVres2,UVrange);
        UVlo = vec_subs(UVres2,UVrange);
        
        Uhi = vec_mule(sone,UVhi);
        Ulo = vec_mule(sone,UVlo);
        
        Vhi = vec_mulo(sone,UVhi);
        Vlo = vec_mulo(sone,UVlo);
        
        Ures = vec_mule(sone,UVres1);
         Vres = vec_mulo(sone,UVres1);
         
         Umasklo = vec_cmpgt(Ures,Ulo);
         Umaskhi = vec_cmplt(Ures,Uhi);
         
         Vmasklo = vec_cmpgt(Vres,Vlo);
         Vmaskhi = vec_cmplt(Vres,Vhi);
         
         Umaskhi = vec_and(Umaskhi,Umasklo);
         
         Vmaskhi = vec_and(Vmaskhi,Vmasklo);
         
         Umasklo = vec_and(Umaskhi,Vmaskhi);
         Vmasklo = vec_and(Umaskhi,Vmaskhi);
         
         hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo);
         loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo);
         
         //pack it back down to bool short
         UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage);
         
         Ymasklo = vec_cmpgt(Yres1,Ylo);
         Ymaskhi = vec_cmplt(Yres1,Yhi);
         
         Ymaskhi = vec_and(Ymaskhi,Ymasklo);
         
         Ymaskhi = vec_and(Ymaskhi,UVmaskhi);
         UVmaskhi = vec_and(Ymaskhi,UVmaskhi);
         
         //bitwise comparison and move using the result of the comparison as a mask
         Yres1 = vec_sel(Yres1,Yblank,Ymaskhi);
         
         //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
         UVres1 = vec_sel(UVres1,UVblank,UVmaskhi);
         
         //merge the Y and UV back together
         hiImage = vec_mergeh(UVres1,Yres1);
         loImage = vec_mergel(UVres1,Yres1);
         
         //pack it back down to unsigned char to store
         inData[0] = vec_packsu(hiImage,loImage);
         
         inData++;
         rightData++;
        
        }
        #ifndef PPC970
        vec_dss(0);
        vec_dss(1);
        vec_dss(2);
        vec_dss(3);
        #endif
    }
}
Example #20
0
void pix_diff :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
  long h,w,width;

   width = image.xsize/8;
   //format is U Y V Y
    union
    {
        //unsigned int	i;
        short	elements[8];
        //vector signed char v;
        vector	short v;
    }shortBuffer;


    vector signed short d, hiImage, loImage,hiRight, loRight;//, YRight, UVRight, YImage, UVImage, UVTemp, YTemp;
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *inData = (vector unsigned char*) image.data;
    vector unsigned char *rightData = (vector unsigned char*) right.data;


    shortBuffer.elements[0] = 128;
    shortBuffer.elements[1] = 0;
    shortBuffer.elements[2] = 128;
    shortBuffer.elements[3] = 0;
    shortBuffer.elements[4] = 128;
    shortBuffer.elements[5] = 0;
    shortBuffer.elements[6] = 128;
    shortBuffer.elements[7] = 0;

    //Load it into the vector unit
    d = shortBuffer.v;



#ifndef PPC970
   	UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
	vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
    #endif
    for ( h=0; h<image.ysize; h++){
        for (w=0; w<width; w++)
        {
        #ifndef PPC970
	vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
           #endif
            //interleaved U Y V Y chars

            //break out to unsigned shorts
            hiImage = (vector signed short) vec_mergeh( zero, inData[0] );
            loImage = (vector signed short) vec_mergel( zero, inData[0] );
            hiRight = (vector signed short) vec_mergeh( zero, rightData[0] );
            loRight = (vector signed short) vec_mergel( zero, rightData[0] );

            //subtract the 128 offset for UV
            hiImage = vec_subs(hiImage,d);
            loImage = vec_subs(loImage,d);
            hiRight = vec_subs(hiRight,d);
            loRight = vec_subs(loRight,d);

            hiImage = vec_subs(hiImage,hiRight);
            loImage = vec_subs(loImage,loRight);

            hiImage = vec_adds(hiImage,d);
            loImage = vec_adds(loImage,d);

            hiImage = vec_abs(hiImage);
            loImage = vec_abs(loImage);

            inData[0] = vec_packsu(hiImage, loImage);

            inData++;
            rightData++;

        }
        #ifndef PPC970
        vec_dss( 0 );
        vec_dss( 1 );
        #endif
    }  /*end of working altivec function */
}