示例#1
0
/**
 * Inverts the image buffer in place with AltiVec.
 *
 * Every 16-byte vector of image.data is replaced by (255 - value), computed
 * with a saturating unsigned byte subtraction. xsize/8 vectors are processed
 * per row for ysize rows.
 */
void pix_invert :: processYUVAltivec(imageStruct &image)
{
    const int vectorsPerRow = image.xsize / 8;

    // Build an all-255 vector: seed byte 0 through a union, then splat it
    // across all sixteen lanes.
    union {
        unsigned char c[16];
        vector unsigned char v;
    } seed;
    seed.c[0] = 255;
    vector unsigned char allOnes = seed.v;
    allOnes = (vector unsigned char) vec_splat(allOnes, 0);

    vector unsigned char *cursor = (vector unsigned char*) image.data;

#ifndef PPC970
    // Data-stream prefetch hint on channel 0 (not issued on PPC970 builds).
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( cursor, prefetchSize, 0 );
#endif

    for (int row = 0; row < image.ysize; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col) {
#ifndef PPC970
            vec_dst( cursor, prefetchSize, 0 );
#endif
            *cursor = vec_subs(allOnes, *cursor);
            ++cursor;
        }
#ifndef PPC970
        // Stop stream 0 at the end of every row.
        vec_dss( 0 );
#endif
    }
}
示例#2
0
文件: pix_add.cpp 项目: avilleret/Gem
/**
 * Adds `right` to `image` in place with AltiVec.
 *
 * Each 16-byte vector of image.data becomes the saturating unsigned byte sum
 * of itself and the corresponding vector of right.data. xsize/4 vectors are
 * processed per row for ysize rows of `image`.
 */
void pix_add :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
    const int vectorsPerRow = image.xsize / 4;

    vector unsigned char *dstVec = (vector unsigned char*) image.data;
    vector unsigned char *srcVec = (vector unsigned char*) right.data;

#ifndef PPC970
    // Prefetch hints: channel 0 follows the destination, channel 1 the source.
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dstVec, prefetchSize, 0 );
    vec_dst( srcVec, prefetchSize, 1 );
#endif

    for (int row = 0; row < image.ysize; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col) {
#ifndef PPC970
            vec_dst( dstVec, prefetchSize, 0 );
            vec_dst( srcVec, prefetchSize, 1 );
#endif
            *dstVec = vec_adds(*dstVec, *srcVec);

            ++dstVec;
            ++srcVec;
        }
#ifndef PPC970
        // Stop both streams at the end of every row.
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }
}
示例#3
0
文件: altivec-15.c 项目: 0day-ci/gcc
/* Issues vec_dst / vec_dstst with the control-word argument supplied as
   several different scalar types (unsigned long, signed long, unsigned int,
   signed int, and a float cast to short) and with the stream selector
   written either as the character literal '\0' or as the integer literal 0.
   `vi` is not declared in this block; it comes from elsewhere in the file.
   Presumably a compile-only check that these argument types are accepted --
   confirm against the test's directives.  Always returns 0.  */
int
main (void)
{
    /* One control-word value per scalar type under test.  */
    unsigned long ul = 2;
    signed long sl = 2;
    unsigned int ui = 2;
    signed int si = 2;
    float fl = 2.0;

    /* Selector alternates between '\0' and 0; both denote stream 0.  */
    vec_dst (&vi, ul, '\0');
    vec_dst (&vi, sl, 0);
    vec_dst (&vi, ui, '\0');
    vec_dst (&vi, si, 0);
    /* The float is narrowed to short explicitly before being passed.  */
    vec_dstst (&vi, (short)fl, '\0');

    return 0;
}
示例#4
0
文件: ops-long-1.c 项目: 0day-ci/gcc
/* Calls vec_dst once for each stream selector 0..3, first with
   var_long_ptr[0] and then with var_unsigned_long_ptr[0] as the address
   operand; the control word is always var_int[1].  All var_* objects are
   declared outside this block.  Presumably a compiler test for `long`
   pointer operands -- confirm against the test's directives.  */
void f33() {
  /* signed long pointer operand, streams 0-3 */
  vec_dst(var_long_ptr[0], var_int[1], 0);
  vec_dst(var_long_ptr[0], var_int[1], 1);
  vec_dst(var_long_ptr[0], var_int[1], 2);
  vec_dst(var_long_ptr[0], var_int[1], 3);
  /* unsigned long pointer operand, streams 0-3 */
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 0);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 1);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 2);
  vec_dst(var_unsigned_long_ptr[0], var_int[1], 3);
}
示例#5
0
/**
 * Replaces every byte of `image` with the absolute difference between it and
 * the corresponding byte of `right`, using AltiVec.
 *
 * xsize * ysize / 4 vectors of 16 bytes are processed. Bytes are widened to
 * signed 16-bit lanes so the subtraction cannot wrap, the absolute value is
 * taken, and the result is packed back to unsigned bytes with saturation.
 *
 * An image that yields no complete vector is left untouched.
 */
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
    int datasize = image.xsize * image.ysize / 4;

    // The original do/while executed its body before testing --datasize, so
    // a count of zero processed one vector and then kept running while the
    // counter wrapped around. Bail out before touching any memory.
    if (datasize <= 0) {
        return;
    }

    vector signed short  hiImage, loImage, hiRight, loRight;
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *inData = (vector unsigned char *)image.data;
    vector unsigned char *rightData = (vector unsigned char *)right.data;

#ifndef PPC970
    // Prefetch hints: streams 0/1 at the current position of each buffer,
    // streams 2/3 256 vectors further ahead.
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+256, prefetchSize, 2 );
    vec_dst( rightData+256, prefetchSize, 3 );
#endif

    for (; datasize > 0; --datasize) {

#ifndef PPC970
        vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        vec_dst( inData+256, prefetchSize, 2 );
        vec_dst( rightData+256, prefetchSize, 3 );
#endif

        // Interleave with zero bytes to widen each byte into a 16-bit lane.
        hiImage = (vector signed short)vec_mergeh(zero,inData[0]);
        loImage = (vector signed short)vec_mergel(zero,inData[0]);
        hiRight = (vector signed short)vec_mergeh(zero,rightData[0]);
        loRight = (vector signed short)vec_mergel(zero,rightData[0]);

        hiImage = vec_subs(hiImage,hiRight);
        loImage = vec_subs(loImage,loRight);

        hiImage = vec_abs(hiImage);
        loImage = vec_abs(loImage);

        // Narrow back to bytes with unsigned saturation.
        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
    }

#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
    vec_dss( 2 );
    vec_dss( 3 );
#endif
}
/* Passes pointers to const-qualified objects (a const vector float and a
   const float) to vec_dst, vec_ld, vec_lde and vec_lvsl.  Presumably a
   compile-only check that these intrinsics accept pointers to const --
   confirm against the test's directives.  Always returns 0.
   NOTE(review): `i` is passed to vec_dst without ever being assigned.  */
int main (int argc, const char * argv[])
{
  int i;
  const float cf = 1.0;
  vector float v;
  const vector float cv = (vector float){1.0, 2.0, 3.0, 4.0};

  /* Address operand is a pointer to a const vector.  */
  vec_dst(&cv, i, 0);
  v = vec_ld(0, &cv);	
  /* Address operand is a pointer to a const scalar.  */
  v = vec_lde(0, &cf);
  /* Result of vec_lvsl is discarded.  */
  vec_lvsl(0, &cf);
  
  return 0;
}
/* Calls a broad selection of AltiVec intrinsics: an add, the all/any
   comparison predicates, VSCR access, data-stream control, alignment
   permute generation, and the element / vector load and store forms.
   Every operand (x, y, z, f, g, c, d, s, t, i, j, uc, pi, int1, int2) is
   declared outside this block.  Presumably a compiler test -- confirm
   against the test's directives.  */
void
b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */

  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  /* VSCR write/read and data-stream stop requests.  */
  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  /* All four data-stream touch variants on stream 3.  */
  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  /* Alignment permute vectors from a non-constant offset.  */
  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  /* Element loads at three element widths, then whole-vector loads.  */
  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  /* Whole-vector and element stores mirroring the loads above.  */
  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
示例#8
0
/* Calls the data-stream touch intrinsics and vec_sld / vec_splat with
   vector pixel and vector bool operands and a range of literal selectors.
   NOTE(review): `buf` is passed to the vec_dst* calls without being
   assigned, and the vec_splat selectors 12, 17 and 31 are larger than the
   number of initializers given for vbs, vp and vbi respectively; presumably
   this is a compiler diagnostics test -- confirm against its directives.  */
void foo(void) {
  const unsigned char *buf;
  vector pixel vp = { 3, 4, 5, 6 };
  vector bool int vbi = { 1, 0, 1, 0 };
  vector bool short vbs = { 1, 0, 1, 0, 1, 0, 1, 0 };
  vector bool char vbc = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
  vector signed char vsc;
  int a = 3;
  
  /* Stream selectors 1, 2, 3 and 2 with a variable control word.  */
  vec_dst(buf, a, 1);
  vec_dstst(buf, a, 2);
  vec_dststt(buf, a, 3);
  vec_dststt(buf, a, 2);

  /* Shift / splat with literal byte and element selectors.  */
  vp = vec_sld(vp, vp, 5);
  vbc = vec_splat(vbc, 7);
  vbs = vec_splat(vbs, 12);
  vp = vec_splat(vp, 17);
  vbi = vec_splat(vbi, 31);  
}
示例#9
0
/*
 * Scans the len bytes starting at s one AltiVec vector (SOVUC bytes) at a
 * time, comparing each vector against '\r' and '\n'.  Presumably the goal
 * is to locate a "\r\n" pair (inferred from the function name and the
 * v_cr / v_nl constants -- confirm against callers).
 *
 * Returns NULL when s is NULL, when len is 0, or when none of the match
 * tests fires; otherwise a char pointer computed from the current aligned
 * block pointer p (p - 1, or p + vec_zpos(rr)).
 *
 * SOVUC, ALIGN_DOWN, ALIGN_DOWN_DIFF, DIV_ROUNDUP, unlikely and vec_zpos
 * are defined outside this block.
 */
void *mem_searchrn(void *s, size_t len)
{
	vector unsigned char v_cr;
	vector unsigned char v_nl;
	vector unsigned char v0;
	vector unsigned char v_perm;
	vector unsigned char c;
	vector bool char rr, rn;
	vector bool char last_rr;
	char *p;
	ssize_t k;
	size_t block_num;
	unsigned f;

	if(unlikely(!s || !len))
		return NULL;

	/* only do one prefetch, this covers nearly 128k */
	/* Number of 512-byte blocks goes into bits 16 and up of the control
	 * word (0 when there are 256 or more); 512 is OR-ed into the low bits. */
	block_num = DIV_ROUNDUP(len, 512);
	f  = block_num >= 256 ? 0 : block_num << 16;
	f |= 512;
	vec_dst((const unsigned char *)s, f, 2);

	/* Comparison constants and an all-zero vector. */
	v_cr = vec_splat_u8('\r');
	v_nl = vec_splat_u8('\n');
	v0   = vec_splat_u8(0);
	last_rr = (vector bool char)v0;

	/* Assuming ALIGN_DOWN_DIFF yields the offset of s inside its vector:
	 * k > 0 means the buffer ends inside the first aligned vector. */
	k = SOVUC - ALIGN_DOWN_DIFF(s, SOVUC) - (ssize_t)len;

	p = (char *)ALIGN_DOWN(s, SOVUC);
	c = vec_ldl(0, (const vector unsigned char *)p);
	if(unlikely(k > 0))
		goto K_SHIFT;
	/* Shift left and then right again by the misalignment of s, filling
	 * with zeros: this clears the bytes of the vector that precede s. */
	v_perm = vec_lvsl(0, (unsigned char *)s);
	c = vec_perm(c, v0, v_perm);
	v_perm = vec_lvsr(0, (unsigned char *)s);
	c = vec_perm(v0, c, v_perm);
	rr = vec_cmpeq(c, v_cr);
	rn = vec_cmpeq(c, v_nl);

	/* The first vector is already compared, so enter the loop body at
	 * START_LOOP, skipping the load and compare at its top. */
	k = -k;
	goto START_LOOP;

	do
	{
		p += SOVUC;
		c = vec_ldl(0, (const vector unsigned char *)p);
		k -= SOVUC;
		/* Only vectors that lie entirely inside the buffer are handled
		 * here; the final (partial) one falls through to K_SHIFT. */
		if(k > 0)
		{
			rr = vec_cmpeq(c, v_cr);
			rn = vec_cmpeq(c, v_nl);

			/* NOTE(review): vec_any_eq is true when any lane pair is
			 * equal, which includes lanes that are zero in both
			 * operands -- confirm this is the intended test. */
			if(vec_any_eq(last_rr, rn)) {
				vec_dss(2);
				return p - 1;
			}
START_LOOP:
			/* Byte-shifted copies of the two masks are combined so that
			 * rr ends up set only where both shifted masks agree. */
			last_rr = (vector bool char)vec_sld(v0, (vector unsigned char)rr, 1);
			rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
			rr = vec_and(rr, rn); /* get mask */
			if(vec_any_ne(rr, v0)) {
				vec_dss(2);
				return p + vec_zpos(rr);
			}
		}
	} while(k > 0);
	k = -k;
K_SHIFT:
	/* Tail handling: stop the prefetch stream, then shift right and left
	 * by k (k is used as a fake address, so only its low four bits
	 * matter) with zero fill, clearing the trailing k bytes. */
	vec_dss(2);
	v_perm = vec_lvsr(0, (unsigned char *)k);
	c = vec_perm(v0, c, v_perm);
	v_perm = vec_lvsl(0, (unsigned char *)k);
	c = vec_perm(c, v0, v_perm);
	rr = vec_cmpeq(c, v_cr);
	rn = vec_cmpeq(c, v_nl);
	if(vec_any_eq(last_rr, rn))
		return p - 1;

	rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
	rr = vec_and(rr, rn); /* get mask */
	if(vec_any_ne(rr, v0))
		return p + vec_zpos(rr);

	return NULL;
}
示例#10
0
/* more optimized version - unrolled and load-hoisted */
/**
 * Adds the offsets U, Y and V (declared outside this function) to a packed
 * U Y V Y image in place, using AltiVec.
 *
 * Bytes are widened to signed 16-bit lanes, the offset vector is added, and
 * the sums are packed back to unsigned bytes with saturation. Each step
 * handles two 16-byte vectors; the loads for step N+1 are issued while step
 * N is being computed (software pipelining).
 *
 * The pipeline is primed before the loop and drained after it, so the loop
 * itself runs one step fewer than the total. The previous version ran the
 * loop for every step AND drained afterwards, which read and then wrote two
 * vectors (32 bytes) past the end of the processed region.
 */
void pix_offset :: processYUVAltivec(imageStruct &image)
{
  // Two vectors per step => xsize/16 steps per row.
  const int width  = image.xsize/16; //for altivec
  const int height = image.ysize;

  // Nothing to process; the priming loads below would already be out of
  // bounds for an empty image.
  if (width <= 0 || height <= 0) {
    return;
  }
  const int steps = width * height;

  //format is U Y V Y
  union {
    short       elements[8];
    vector      signed short v;
  } transferBuffer;

  vector signed short c, hi, lo;
  vector signed short hi1, lo1;
  vector signed short loadhi, loadhi1, loadlo, loadlo1;
  vector unsigned char zero = vec_splat_u8(0);
  vector unsigned char *inData = (vector unsigned char*) image.data;

  //Write the offset pattern (two U Y V Y groups) to the transfer buffer
  transferBuffer.elements[0] = U;
  transferBuffer.elements[1] = Y;
  transferBuffer.elements[2] = V;
  transferBuffer.elements[3] = Y;
  transferBuffer.elements[4] = U;
  transferBuffer.elements[5] = Y;
  transferBuffer.elements[6] = V;
  transferBuffer.elements[7] = Y;

  //Load it into the vector unit
  c = transferBuffer.v;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( inData+16, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( inData+64, prefetchSize, 3 );
#endif

  //Prime the pipeline: expand the first two vectors' UInt8's to short's
  loadhi = (vector signed short) vec_mergeh( zero, inData[0] );
  loadlo = (vector signed short) vec_mergel( zero, inData[0] );

  loadhi1 = (vector signed short) vec_mergeh( zero, inData[1] );
  loadlo1 = (vector signed short) vec_mergel( zero, inData[1] );

  // steps - 1 iterations: each one finishes the current pair and loads the
  // next pair, which is guaranteed to exist.
  for (int step = 1; step < steps; step++) {

#ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( inData+16, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( inData+64, prefetchSize, 3 );
#endif

    //add the constant to the pair loaded previously
    hi = vec_add( loadhi, c );
    lo = vec_add( loadlo, c );

    hi1 = vec_add( loadhi1, c );
    lo1 = vec_add( loadlo1, c );

    //load and expand the next pair
    loadhi = (vector signed short) vec_mergeh( zero, inData[2] );
    loadlo = (vector signed short) vec_mergel( zero, inData[2] );

    loadhi1 = (vector signed short) vec_mergeh( zero, inData[3] );
    loadlo1 = (vector signed short) vec_mergel( zero, inData[3] );

    //pack the result back down, with saturation
    inData[0] = vec_packsu( hi, lo );
    inData++;

    inData[0] = vec_packsu( hi1, lo1 );
    inData++;
  }

  //
  // drain the pipeline: finish the last pair, with no further loads
  //
  hi = vec_add( loadhi, c );
  lo = vec_add( loadlo, c );

  hi1 = vec_add( loadhi1, c );
  lo1 = vec_add( loadlo1, c );

  //pack the result back down, with saturation
  inData[0] = vec_packsu( hi, lo );
  inData[1] = vec_packsu( hi1, lo1 );

#ifndef PPC970
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );  //end of working altivec function
#endif
}
示例#11
0
void YUV422_to_YV12_altivec(short*pY, short*pY2, short*pU, short*pV,
			    const unsigned char *gem_image, int xsize, int ysize)
{
  // UYVY UYVY UYVY UYVY
  const vector unsigned char *pixels1=reinterpret_cast<const vector unsigned char *>(gem_image);
  const vector unsigned char *pixels2=reinterpret_cast<const vector unsigned char *>(gem_image+(xsize*2));
  // PDP packet to be filled:
  // first Y plane
  vector signed short *py1 = reinterpret_cast<vector signed short *>(pY);
  // 2nd Y pixel
  vector signed short *py2 = reinterpret_cast<vector signed short *>(pY2);
  // U plane
  vector signed short *pCr = reinterpret_cast<vector signed short *>(pU);
  // V plane
  vector signed short *pCb = reinterpret_cast<vector signed short *>(pV);
  vector signed short uvSub = static_cast<vector signed short>( 128, 128, 128, 128,
													 128, 128, 128, 128 );
  vector unsigned short yShift = static_cast<vector unsigned short>( 7, 7, 7, 7, 7, 7, 7, 7 );
  vector unsigned short uvShift = static_cast<vector unsigned short>( 8, 8, 8, 8, 8, 8, 8, 8 );

  vector signed short tempY1, tempY2, tempY3, tempY4,
		tempUV1, tempUV2, tempUV3, tempUV4, tempUV5, tempUV6;

  vector unsigned char uvPerm = static_cast<vector unsigned char>( 16, 0, 17, 4, 18,  8, 19, 12,   // u0..u3
  														20, 2, 21, 6, 22, 10, 23, 14 ); // v0..v3

  vector unsigned char uPerm = static_cast<vector unsigned char>( 0, 1, 2, 3, 4, 5, 6, 7,
													   16,17,18,19,20,21,22,23);
  vector unsigned char vPerm = static_cast<vector unsigned char>( 8, 9, 10,11,12,13,14,15,
													   24,25,26,27,28,29,30,31);

  vector unsigned char yPerm = static_cast<vector unsigned char>( 16, 1, 17,  3, 18,  5, 19,  7, // y0..y3
													   20, 9, 21, 11, 23, 13, 25, 15);// y4..y7
  vector unsigned char zeroVec = static_cast<vector unsigned char>(0);

  int row=ysize>>1;
  int cols=xsize>>4;
#if 0
# ifndef PPC970
  UInt32	prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( pu, prefetchSize, 0 );
  vec_dst( pv, prefetchSize, 0 );
  vec_dst( py1, prefetchSize, 0 );
  vec_dst( py2, prefetchSize, 0 );
# endif
#endif
  while(row--){
    int col=cols;
    while(col--){
#if 0
# ifndef PPC970
      vec_dst( );
# endif
#endif
      tempUV1 = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, uvPerm));
      tempY1  = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, yPerm));
      tempY2  = static_cast<vector signed short>(vec_perm( *pixels2, zeroVec, yPerm));
	  pixels1++;pixels2++;

      tempUV2 = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, uvPerm));
      tempY3  = static_cast<vector signed short>(vec_perm( *pixels1, zeroVec, yPerm));
      tempY4  = static_cast<vector signed short>(vec_perm( *pixels2, zeroVec, yPerm));
	  pixels1++;pixels2++;

	  tempUV3 = vec_sub( tempUV1, uvSub );
	  tempUV4 = vec_sub( tempUV2, uvSub );
	  tempUV5 = vec_sl( tempUV3, uvShift );
	  tempUV6 = vec_sl( tempUV4, uvShift );

	  *pCb = vec_perm( tempUV5, tempUV6, uPerm );
	  *pCr = vec_perm( tempUV5, tempUV6, vPerm );
	  pCr++; pCb++;

	  *py1++ = vec_sl( tempY1, yShift);
      *py2++ = vec_sl( tempY2, yShift);
      *py1++ = vec_sl( tempY3, yShift);
      *py2++ = vec_sl( tempY4, yShift);
	}

	py1+=(xsize>>3); py2+=(xsize>>3);
	pixels1+=(xsize*2)>>4; pixels2+=(xsize*2)>>4;
  }
}
示例#12
0
/*
 * AltiVec inverse quantisation of one 64-coefficient intra block.
 *
 * The parameters come from the IQUANT_INTRA_PDECL macro (defined elsewhere);
 * this body uses wsp, src, dst, dc_prec and mquant.  For each coefficient:
 *
 *     q   = intra_q_mat[i] * mquant
 *     val = (|src[i]| * q) >> 4
 *     val = val - (1 if val is even and non-zero, else 0)
 *     val = sign(src[i]) * val, clamped to [-2048, 2047]
 *
 * dst[0] is finally overwritten with src[0] << (3 - dc_prec).
 *
 * vu8/vu16/vs16/vs32, AMBER_START/AMBER_STOP, NOT_VECTOR_ALIGNED,
 * DATA_STREAM_CONTROL and mjpeg_error_exit1 are defined outside this block.
 */
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short eqzero, ltzero;
    vector signed short val, t0;
    vector signed short zero, one;
    vector unsigned int four;
    vector signed short min, max;
    int offset, offset2;
    int16_t dst0;
    /* Scratch union used to move the scalar mquant into a vector lane. */
    union {
	vector unsigned short vu16;
	unsigned short mquant;
	vector signed int vs32;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
	mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)",
	    wsp->intra_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
	mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
	mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst);

    /* Fixed: this diagnostic used to name iquant_intra_m2. */
    for (i = 0; i < 64; i++)
	if (src[i] < -256 || src[i] > 255)
	    mjpeg_error_exit1("iquant_intra_m1: -256 > src[%i] > 255, (%d)",
		i, src[i]);
#endif /* }}} */

    AMBER_START;

    /* The DC term is computed in scalar code and stored last, because the
     * vector loop below also writes a value to dst[0]. */
    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    zero = vec_splat_s16(0);
    one = vec_splat_s16(1);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max);
    /* }}} */
    offset = 0;

#if 1
    /* Software-pipelined form: the loads for vector n+1 are issued in the
     * middle of the work on vector n.  The loop covers the first seven
     * vectors; the eighth is finished after the loop without further loads. */
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	/* remember where this result goes, then load the next vector */
	offset2 = offset;
	offset += 8*sizeof(int16_t);
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* val = val - 1&~(val|val==0) */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* clamp val to [-2048, 2047] */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset2, dst);
    } while (--i);
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);
    eqzero = vec_cmpeq(vsrc, zero);

    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);

    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));

    /* val = val - 1&~(val|val==0) */
    t0 = vec_or(val, eqzero);
    t0 = vec_andc(one, t0);
    val = vec_sub(val, t0);

    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);

    /* clamp val to [-2048, 2047] */
    val = vec_min(val, max);
    val = vec_max(val, min);

    vec_st(val, offset, dst);
#else
    /* {{{ */
    /* Straightforward form: load, compute and store one vector per pass. */
    i = (64/8);
    do {
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	/* val = val - 1&~(val|val==0) */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* clamp val to [-2048, 2047] */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset, dst);

	offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    /* Overwrite the vector-computed first coefficient with the DC term. */
    dst[0] = dst0;

    AMBER_STOP;
}
示例#13
0
文件: pr47197.c 项目: 0day-ci/gcc
/* Issues a single vec_dst on stream 2 for buf.  The control word is
   (len, or 0 when len >= 256) OR-ed with 512, i.e. a non-constant
   expression built from a conditional.  Presumably a compiler regression
   test for this argument form -- confirm against the test's directives.  */
void func(unsigned char *buf, unsigned len)
{
        vec_dst(buf, (len >= 256 ? 0 : len) | 512, 2);
}
示例#14
0
/*
 * add prediction and prediction error, saturate to 0...255
 * pred % 8 == 0
 * cur % 8 == 0
 * lx % 16 == 0
 * blk % 16 == 0
 *
 * The parameters come from the ADD_PRED_PDECL macro (defined elsewhere);
 * this body uses pred, cur, lx and blk.  Eight rows of eight pixels are
 * processed: cur[row] = saturate(pred[row] + blk[row]), rows lx bytes apart
 * in pred/cur and 8 shorts apart in blk.
 *
 * vec_ld always fetches the enclosing aligned 16 bytes, so the 8 pixels of a
 * row sit either in the high half (pointer 16-byte aligned) or in the low
 * half of the loaded vector.  The two branches below handle those two cases;
 * the other half of each cur vector is written back unchanged.
 *
 * Rows are alternated between an "A" and a "B" register set so that the
 * loads for the next row of one set overlap the arithmetic of the other.
 *
 * vs16/vu16, AMBER_START/AMBER_STOP, VECTOR_ALIGNED, NOT_VECTOR_ALIGNED and
 * mjpeg_error_exit1 are defined outside this block.
 */
void add_pred_altivec(ADD_PRED_PDECL)
{
#ifdef ALTIVEC_DST
    unsigned int dst;
#endif
    uint8_t *pCA, *pCB, *pPA, *pPB;
    int16_t *pBA, *pBB;
    vector unsigned char zero;
    vector unsigned char predA, predB, curA, curB;
    vector signed short blkA, blkB;


#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(lx))
	mjpeg_error_exit1("add_pred: lx %% 16 != 0, (%d)", lx);

    if (NOT_VECTOR_ALIGNED(blk))
	mjpeg_error_exit1("add_pred: blk %% 16 != 0, (%d)", blk);

#ifdef ALTIVEC_DST
    /* lx is OR-ed into the low 16 bits of the data-stream control word
     * below, so it must fit there.  Fixed: without the inner parentheses
     * `!=` bound tighter than `&`, reducing the test to `lx & 1`. */
    if ((lx & (~0xffff)) != 0)
	mjpeg_error_exit1("add_pred: lx=%d > vec_dst range", lx);
#endif

    if (((unsigned long)pred & 0xf) != ((unsigned long)cur & 0xf))
	mjpeg_error_exit1("add_pred: (pred(0x%X) %% 16) != (cur(0x%X) %% 16)",
		pred, cur);
    if ((((unsigned long)pred) & 0x7) != 0)
	mjpeg_error_exit1("add_pred: pred %% 8 != 0, (0x%X)", pred);
    if ((((unsigned long)cur) & 0x7) != 0)
	mjpeg_error_exit1("add_pred: cur %% 8 != 0, (0x%X)", cur);
#endif

/* MACROS expand differently depending on input */
/* ABBA(sym,A) -> symB and ABBA(sym,B) -> symA: names the other register set */
#define ABBA(symbol,ab)		_ABBA(ABBA_##ab,symbol) /* {{{ */
#define _ABBA(abba_ab,symbol)	abba_ab(symbol)
#define ABBA_A(symbol)		symbol##B
#define ABBA_B(symbol)		symbol##A
/* }}} */
/* HLLH(sym,h) -> syml and HLLH(sym,l) -> symh: names the other vector half */
#define HLLH(symbol,hl)		_HLLH(HLLH_##hl,symbol) /* {{{ */
#define _HLLH(hllh_hl,symbol)	hllh_hl(symbol)
#define HLLH_h(symbol)		symbol##l
#define HLLH_l(symbol)		symbol##h
/* }}} */
/* PACKSU puts the new pixels (st) into half `hl` and the untouched ones (ld)
 * into the other half */
#define PACKSU(hl,st,ld)	_PACKSU(PACKSU_##hl,st,ld) /* {{{ */
#define _PACKSU(psu,st,ld)	psu(st,ld)
#define PACKSU_h(st,ld)		vec_packsu(st,ld)
#define PACKSU_l(st,ld)		vec_packsu(ld,st)
/* }}} */


/* Widen the active half of pred and the inactive half of cur to 16 bits,
 * add blk to pred, clamp at zero, then pack both halves back and store. */
#define	PERFORM_ITERATION(hl,ab,iter) /* iter {{{ */                         \
	pred##ab = vec_merge##hl(zero, pred##ab);                            \
	cur##ab = HLLH(vec_merge,hl)(zero, cur##ab);                         \
	blk##ab = vec_add(blk##ab, vs16(pred##ab));                          \
	blk##ab = vec_max(blk##ab, vs16(zero));                              \
	cur##ab = PACKSU(hl, vu16(blk##ab), vu16(cur##ab));                  \
	vec_st(cur##ab, 0, pC##ab);                                          \
	/* }}} */

/* Advance this register set's pointers one row past the other set's and
 * load that row. */
#define PREPARE_ITERATION(hl,ab,iter) /* iter {{{ */                         \
	pP##ab = ABBA(pP,ab) + lx;                                           \
	pC##ab = ABBA(pC,ab) + lx;                                           \
	pB##ab = ABBA(pB,ab) + 8;                                            \
	pred##ab = vec_ld(0, pP##ab);                                        \
	cur##ab = vec_ld(0, pC##ab);                                         \
	blk##ab = vec_ld(0, pB##ab);                                         \
	/* }}} */

/* Empty volatile asm placed between the initial loads and pointer updates */
#define NO_RESCHEDULE	asm volatile ("") 

    AMBER_START;

    pPA = pred;
    pCA = cur;
    pBA = blk;

#ifdef ALTIVEC_DST
    /* Streams 0/1 use lx as their low 16 bits, stream 2 uses 0x10. */
    dst = 0x01080000 | lx;
    vec_dst(pPA, dst, 0);
    vec_dst(pCA, dst, 1);
    dst = 0x01080010;
    vec_dst(pBA, dst, 2);
#endif

    /* Load rows 0 (set A) and 1 (set B). */
    predA = vec_ld(0, pPA);
    curA = vec_ld(0, pCA);  NO_RESCHEDULE;
    pPB = pPA + lx;         NO_RESCHEDULE;
    blkA = vec_ld(0, pBA);  NO_RESCHEDULE;
    pCB = pCA + lx;         NO_RESCHEDULE;
    predB = vec_ld(0, pPB); NO_RESCHEDULE;
    pBB = pBA + 8;          NO_RESCHEDULE;
    curB = vec_ld(0, pCB);  NO_RESCHEDULE;
    zero = vec_splat_u8(0); NO_RESCHEDULE;
    blkB = vec_ld(0, pBB);


    if (VECTOR_ALIGNED(pPA)) {
	/* pixels are in the high half of each loaded vector */
	PERFORM_ITERATION(h,A,0);
	PREPARE_ITERATION(h,A,2);   /* prepare next A iteration */
	PERFORM_ITERATION(h,B,1);
	PREPARE_ITERATION(h,B,3);   /* prepare next B iteration */
	PERFORM_ITERATION(h,A,2);
	PREPARE_ITERATION(h,A,4);
	PERFORM_ITERATION(h,B,3);
	PREPARE_ITERATION(h,B,5);
	PERFORM_ITERATION(h,A,4);
	PREPARE_ITERATION(h,A,6);
	PERFORM_ITERATION(h,B,5);
	PREPARE_ITERATION(h,B,7);
	PERFORM_ITERATION(h,A,6);
	PERFORM_ITERATION(h,B,7);
    } else {
	/* pixels are in the low half of each loaded vector */
	PERFORM_ITERATION(l,A,0);
	PREPARE_ITERATION(l,A,2);   /* prepare next A iteration */
	PERFORM_ITERATION(l,B,1);
	PREPARE_ITERATION(l,B,3);   /* prepare next B iteration */
	PERFORM_ITERATION(l,A,2);
	PREPARE_ITERATION(l,A,4);
	PERFORM_ITERATION(l,B,3);
	PREPARE_ITERATION(l,B,5);
	PERFORM_ITERATION(l,A,4);
	PREPARE_ITERATION(l,A,6);
	PERFORM_ITERATION(l,B,5);
	PREPARE_ITERATION(l,B,7);
	PERFORM_ITERATION(l,A,6);
	PERFORM_ITERATION(l,B,7);
    }

#ifdef ALTIVEC_DST
    vec_dssall();
#endif

    AMBER_STOP;
}
示例#15
0
/*
 * Updates row y of the pressure field held in mode->pressure.
 *
 *  - y == 0:      the row is overwritten with a copy of row 1.
 *  - y == h - 1:  the row is overwritten with a copy of row y - 1.
 *  - otherwise:   cells are recomputed from the left/right/upper/lower
 *                 pressure neighbours and the neighbouring velX / velY
 *                 samples, scaled by 1/4; afterwards the first and last
 *                 cells of the row are set to copies of their inner
 *                 neighbours.
 *
 * Three implementations of the interior case are selected by the
 * preprocessor: AltiVec (__APPLE_ALTIVEC__), SSE (__SSE3__) and scalar.
 * The copy branches have a vector form under X_SIMD and a scalar form
 * otherwise.
 *
 * in_f is not referenced directly in this body.  The "black" in the name
 * presumably refers to one colour of a red/black relaxation sweep (the SIMD
 * paths multiply alternate lanes by 0 via div4) -- confirm.
 *
 * fieldWidth/fieldHeight/fieldStrideX/fieldStrideY/fieldData,
 * fluidFloatPointer, x128f and the PRESSURE_VEC_* / PRESSURE_SSE_* macros
 * are defined outside this block.
 */
void fluid_genPressure_black(fluid *in_f, int y, pvt_fluidMode *mode)
{
	struct pressure *p = &mode->pressure;
	
	int w = fieldWidth(p->velX);
	int h = fieldHeight(p->velX);

	/* NOTE(review): sx is declared only when neither SIMD macro is defined,
	 * yet the non-X_SIMD copy branches below use it; presumably X_SIMD is
	 * defined whenever either SIMD macro is -- confirm. */
#ifdef __APPLE_ALTIVEC__
#elif defined __SSE3__
#else
	int sx = fieldStrideX(p->velX);
#endif
	int sy = fieldStrideY(p->velY);
	
	float *velX = fieldData(p->velX);
	float *velY = fieldData(p->velY);
	
	float *pressure = fieldData(p->pressure);
	
	/* First row: copy row 1 into row 0. */
	if (y == 0)
	{
#ifdef X_SIMD
		x128f *vPressure = (x128f*)fluidFloatPointer(pressure, 0*sy);
		x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, 1*sy);
		
		int x;
		/* four floats per x128f */
		w/=4;
		for (x=0; x<w; x++)
		{
			vPressure[x] = vPressureP[x];
		}
#else
		int x;
		for (x=0; x<w; x++)
		{
			fluidFloatPointer(pressure,x*sx)[0] = fluidFloatPointer(pressure,x*sx + sy)[0];
		}
#endif
	}
	/* Last row: copy row y-1 into row y. */
	else if (y == h-1)
	{
#ifdef X_SIMD
		x128f *vPressure = (x128f*)fluidFloatPointer(pressure, y*sy);
		x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy);
		
		int x;
		w/=4;
		for (x=0; x<w; x++)
		{
			vPressure[x] = vPressureP[x];
		}
#else
		int x;
		for (x=0; x<w; x++)
		{
			fluidFloatPointer(pressure,x*sx + y*sy)[0] =
					fluidFloatPointer(pressure,x*sx + (y-1)*sy)[0];
		}
#endif
	}
	/* Interior row. */
	else
	{
#ifdef X_SIMD
		/* Row pointers: current row, next row (N) and previous row (P). */
		float *vPressureRow = fluidFloatPointer(pressure, y*sy);
		
		x128f *vPressure = (x128f*)vPressureRow;
		x128f *vVelX = (x128f*)fluidFloatPointer(velX, y*sy);
		
		x128f *vPressureN = (x128f*)fluidFloatPointer(pressure, (y+1)*sy);
		x128f *vVelYN = (x128f*)fluidFloatPointer(velY, (y+1)*sy);
		
		x128f *vPressureP = (x128f*)fluidFloatPointer(pressure, (y-1)*sy);
		x128f *vVelYP = (x128f*)fluidFloatPointer(velY, (y-1)*sy);
		
		/* div4 scales lanes 1 and 3 by 1/4 and lanes 0 and 2 by 0.
		 * mask is not used in the code visible here; presumably the
		 * PRESSURE_* macros consume it -- confirm. */
		x128f div4 = {0.0f, 1.0f/4.0f, 0.0f, 1.0f/4.0f};
		x128f mask = {1.0f, 0.0f, 1.0f, 0.0f};
#endif
	
#ifdef __APPLE_ALTIVEC__
		//int myTempVariable = __mfspr( 1023 );
		
		vector float vZero = {0,0,0,0};
		
		/* Data-stream hints: stream 0 is a store-touch on the output row,
		 * streams 1-3 are load-touches on the velocity rows. */
		vec_dstst(vPressure, 0x01000001, 0);
		vec_dst(vVelX, 0x01000001, 1);
		vec_dst(vVelYN, 0x01000001, 2);
		vec_dst(vVelYP, 0x01000001, 3);
		
		int x;
		/* First vector of the row: there is no vector to the left, so the
		 * right-shifted operands are filled from vZero. */
		{
			vector float tmp;
			
			//Compute shifts
			vector float sl_p = vec_sld(vPressure[0], vPressure[1],4);
			vector float sr_p = vec_sld(vZero, vPressure[0], 12);
			
			vector float sl_vx = vec_sld(vVelX[0], vVelX[1],4);
			vector float sr_vx = vec_sld(vZero, vVelX[0], 12);
			
			//Sum everything!!!
			tmp = vec_add(sl_p, sr_p);
			tmp = vec_add(tmp, vPressureN[0]);
			tmp = vec_add(tmp, vPressureP[0]);
			tmp = vec_sub(tmp, sl_vx);
			tmp = vec_add(tmp, sr_vx);
			tmp = vec_sub(tmp, vVelYN[0]);
			tmp = vec_add(tmp, vVelYP[0]);
			
			vPressure[0] = vec_madd(tmp, div4, vZero);
			/* left edge cell copies its right neighbour */
			vPressureRow[0] = vPressureRow[1];
		}
		x=1;
		
		/* Main loop, four vectors per pass (macros defined elsewhere). */
		while (x<w/4-5)
		{
			PRESSURE_VEC_PRE(0)
			PRESSURE_VEC_PRE(1)
			PRESSURE_VEC_PRE(2)
			PRESSURE_VEC_PRE(3)
			
			PRESSURE_VEC_SHIFT(0)
			PRESSURE_VEC_SHIFT(1)
			PRESSURE_VEC_SHIFT(2)
			PRESSURE_VEC_SHIFT(3)
			
			PRESSURE_VEC_END(0)
			PRESSURE_VEC_END(1)
			PRESSURE_VEC_END(2)
			PRESSURE_VEC_END(3)
			
			x+=4;
		}

		/* Remaining vectors one at a time, up to but excluding the last. */
		while (x<w/4-1)
		{			
			PRESSURE_VEC_PRE(0)
			PRESSURE_VEC_SHIFT(0)
			PRESSURE_VEC_END(0)
			x++;
		}
		/* Last vector of the row: there is no vector to the right, so the
		 * left-shifted operands are filled from vZero. */
		{
			vector float tmp;
			
			//Compute shifts
			vector float sl_p = vec_sld(vPressure[x], vZero,4);
			vector float sr_p = vec_sld(vPressure[x-1], vPressure[x], 12);
			
			vector float sl_vx = vec_sld(vVelX[x], vZero,4);
			vector float sr_vx = vec_sld(vVelX[x-1], vVelX[x], 12);
			
			//Sum everything!!!
			tmp = vec_add(sl_p, sr_p);
			tmp = vec_add(tmp, vPressureN[x]);
			tmp = vec_add(tmp, vPressureP[x]);
			tmp = vec_sub(tmp, sl_vx);
			tmp = vec_add(tmp, sr_vx);
			tmp = vec_sub(tmp, vVelYN[x]);
			tmp = vec_add(tmp, vVelYP[x]);
			
			vPressure[x] = vec_madd(tmp, div4, vZero);
			
			/* right edge cell copies its left neighbour */
			vPressureRow[w-1] = vPressureRow[w-2];
		}
		
#elif defined __SSE3__
		
		int x;
		/* First vector of the row (same structure as the AltiVec path,
		 * with the shifts built from _mm_srli_sf128 / _mm_slli_sf128). */
		{
			__m128 tmp;
			
			//Compute shifts
			__m128 sl_p = _mm_srli_sf128(vPressure[0],4);
			sl_p = _mm_add_ps(sl_p,_mm_slli_sf128(vPressure[1],12));
			
			__m128 sr_p = _mm_slli_sf128(vPressure[0],4);
			
			__m128 sl_vx = _mm_srli_sf128(vVelX[0],4);
			sl_vx = _mm_add_ps(sl_vx,_mm_slli_sf128(vVelX[1],12));
			
			__m128 sr_vx = _mm_slli_sf128(vVelX[0],4);
			
			//Sum everything!!!
			tmp = _mm_add_ps(sl_p, sr_p);
			tmp = _mm_add_ps(tmp, vPressureN[0]);
			tmp = _mm_add_ps(tmp, vPressureP[0]);
			tmp = _mm_sub_ps(tmp, sl_vx);
			tmp = _mm_add_ps(tmp, sr_vx);
			tmp = _mm_sub_ps(tmp, vVelYN[0]);
			tmp = _mm_add_ps(tmp, vVelYP[0]);
			
			vPressure[0] = _mm_mul_ps(tmp, div4);
			vPressureRow[0] = vPressureRow[1];
		}
		x=1;
		/* Main loop, three vectors per pass (macros defined elsewhere). */
		while (x<w/4-9)
		{
			//Compute shifts (1)
			PRESSURE_SSE_PRE(0);
			PRESSURE_SSE_PRE(1);
			PRESSURE_SSE_PRE(2);
			
			//Sum everything!!! (1)
			PRESSURE_SSE_POST(0);
			PRESSURE_SSE_POST(1);
			PRESSURE_SSE_POST(2);
			
			x+=3;
		}
		/* Remaining vectors one at a time, up to but excluding the last. */
		while (x<w/4-1)
		{
			//Compute shifts
			PRESSURE_SSE_PRE(0);
			
			//Sum everything!!!
			PRESSURE_SSE_POST(0);
			
			x++;
		}
		/* Last vector of the row. */
		{
			__m128 tmp;
			
			//Compute shifts
			__m128 sl_p = _mm_srli_sf128(vPressure[x],4);
			
			__m128 sr_p = _mm_slli_sf128(vPressure[x],4);
			sr_p = _mm_add_ps(sr_p,_mm_srli_sf128(vPressure[x-1],12));
			
			__m128 sl_vx = _mm_srli_sf128(vVelX[x],4);
			
			__m128 sr_vx = _mm_slli_sf128(vVelX[x],4);
			sr_vx = _mm_add_ps(sr_vx,_mm_srli_sf128(vVelX[x-1],12));
			
			//Sum everything!!!
			tmp = _mm_add_ps(sl_p, sr_p);
			tmp = _mm_add_ps(tmp, vPressureN[x]);
			tmp = _mm_add_ps(tmp, vPressureP[x]);
			tmp = _mm_sub_ps(tmp, sl_vx);
			tmp = _mm_add_ps(tmp, sr_vx);
			tmp = _mm_sub_ps(tmp, vVelYN[x]);
			tmp = _mm_add_ps(tmp, vVelYP[x]);
			
			vPressure[x] = _mm_mul_ps(tmp, div4);
			
			vPressureRow[w-1] = vPressureRow[w-2];
		}
		
#else
		/* Scalar path.  last* / cur* carry the pressure and velX values of
		 * the two previous columns across iterations so each column is
		 * read only once. */
		float lastPressureX = fluidFloatPointer(pressure,sx + y*sy)[0];
		float lastVelX = fluidFloatPointer(velX, y*sy)[0];
		
		float curPressureX = lastPressureX;
		float curVelX = fluidFloatPointer(velX, sx + y*sy)[0];
		
		/* left edge cell takes the value of column 1 */
		fluidFloatPointer(pressure,y*sy)[0] = lastPressureX;
		
		int x;
		/* byte offset of the current cell (column 1 of row y to start) */
		int curxy = sx + y*sy;
		for (x=1; x<w-1; x++)
		{
			float nextPressureX = fluidFloatPointer(pressure,curxy + sx)[0];
			float nextVelX = fluidFloatPointer(velX,curxy + sx)[0];
			
			/* (sum of four pressure neighbours - velocity differences) / 4 */
			fluidFloatPointer(pressure,curxy)[0] =
				(	  lastPressureX
				 	+ nextPressureX
				 	+ fluidFloatPointer(pressure,curxy - sy)[0]
					+ fluidFloatPointer(pressure,curxy + sy)[0]
				 - 		(  nextVelX
						 - lastVelX
						 + fluidFloatPointer(velY,curxy + sy)[0]
						 - fluidFloatPointer(velY,curxy - sy)[0])) / 4.0f;
			
			lastPressureX = curPressureX;
			curPressureX = nextPressureX;
			
			lastVelX = curVelX;
			curVelX = nextVelX;
			
			curxy += sx;
		}
		
		/* right edge cell copies its left neighbour */
		fluidFloatPointer(pressure,(w-1)*sx + y*sy)[0]
			= fluidFloatPointer(pressure,(w-2)*sx + y*sy)[0];
#endif
	}
}
示例#16
0
/*
 * Convert planar YV12 (4:2:0) samples stored as 16-bit values into packed
 * 8-bit U Y V Y (4:2:2) pixels using AltiVec.
 *
 * Y, U, V : source planes; every sample is a short. Luma is scaled with an
 *           arithmetic shift right by 7, chroma with a shift right by 8
 *           followed by a +128 offset; everything is then saturated to 8 bit.
 * data    : destination, xsize*2 bytes per row.
 * xsize   : width in pixels. 16 pixels are converted per inner pass, so the
 *           xsize%16 trailing columns are not written.
 * ysize   : height in pixels. Rows are converted in pairs that share one
 *           chroma row, so an odd final row is not written.
 *
 * All pointers are dereferenced as AltiVec vectors.
 * NOTE(review): that presumably requires 16-byte aligned buffers - confirm
 * against the callers.
 */
void YV12_to_YUV422_altivec(const short*Y, const short*U, const short*V,
			    unsigned char *data, int xsize, int ysize)
{
  // Optimisation advice received 3/15/2005 (not applied yet):
  // #1. Don't use the pointers. Use vec_ld with an index that you increment (by 16) instead.

  // two consecutive output rows are produced per pass
  vector unsigned char *pixels1=reinterpret_cast<vector unsigned char *>(data);
  vector unsigned char *pixels2=reinterpret_cast<vector unsigned char *>(data+(xsize*2));
  // the two matching luma rows, and the chroma row they share
  const vector unsigned short *py1 = reinterpret_cast<const vector unsigned short *>(Y);
  const vector unsigned short *py2 = reinterpret_cast<const vector unsigned short *>(Y + xsize );
  const vector unsigned short *pu = reinterpret_cast<const vector unsigned short *>(U);
  const vector unsigned short *pv = reinterpret_cast<const vector unsigned short *>(V);

  // Vector constants are spelled as brace literals: a parenthesised list
  // inside static_cast<>() is a comma expression, not a vector literal.
  const vector unsigned short uvAdd = { 128, 128, 128, 128,
                                        128, 128, 128, 128 };
  const vector unsigned short yShift = { 7, 7, 7, 7, 7, 7, 7, 7 };
  const vector unsigned short uvShift = { 8, 8, 8, 8, 8, 8, 8, 8 };
  vector unsigned short tempU, tempV, doneU, doneV, tempY1, tempY2, tempY3, tempY4,
    uv1, uv2, out1, out2, out3, out4, out5, out6, out7, out8;
  // Perm1 interleaves the first four 16-bit elements of both operands
  // (a0 b0 a1 b1 a2 b2 a3 b3), Perm2 does the same with the last four.
  const vector unsigned char Perm1 =
    { 0, 1, 16, 17, 2, 3, 18, 19,
      4, 5, 20, 21, 6, 7, 22, 23 };
  const vector unsigned char Perm2 =
    {  8,  9, 24, 25, 10, 11, 26, 27,
      12, 13, 28, 29, 14, 15, 30, 31 };
  int row=ysize>>1;   // number of row pairs
  int cols=xsize>>4;  // number of 16-pixel groups per row

  while(row--){
    int col=cols;
    while(col--){
      // 8 chroma samples of each kind cover 16 pixels on both rows
      tempU = vec_sra( (*pu++), uvShift );
      tempV = vec_sra( (*pv++), uvShift );
      doneU = vec_add( tempU, uvAdd );
      doneV = vec_add( tempV, uvAdd );

      uv1 = vec_perm( doneU, doneV, Perm1 ); // uvuvuvuv uvuvuvuv
      uv2 = vec_perm( doneU, doneV, Perm2 );

      // first 8 luma samples of both rows
      tempY1 = vec_sra( (*py1++), yShift );
      tempY2 = vec_sra( (*py2++), yShift );

      out1 = vec_perm( uv1, tempY1, Perm1 ); //fill Y's, U's & V's
      out2 = vec_perm( uv1, tempY1, Perm2 );
      out3 = vec_perm( uv1, tempY2, Perm1 ); //fill 2nd Y's, U's & V's
      out4 = vec_perm( uv1, tempY2, Perm2 );

      // saturate the 16-bit lanes to bytes: 8 pixels per stored vector
      *pixels1 = vec_packsu( out1, out2 );
      *pixels2 = vec_packsu( out3, out4 );
      pixels1++; pixels2++;

      tempY3 = vec_sra( (*py1++), yShift ); // load second set of Y's
      tempY4 = vec_sra( (*py2++), yShift );

      out5 = vec_perm( uv2, tempY3, Perm1 );
      out6 = vec_perm( uv2, tempY3, Perm2 );
      out7 = vec_perm( uv2, tempY4, Perm1 );
      out8 = vec_perm( uv2, tempY4, Perm2 );

      *pixels1 = vec_packsu( out5, out6 );
      *pixels2 = vec_packsu( out7, out8 );
      pixels1++; pixels2++;
    }
    // each pointer just walked one row; skip the row its partner handled
    pixels1+=(xsize*2)>>4; pixels2+=(xsize*2)>>4;
    py1+=xsize>>3; py2+=xsize>>3;
  }
}
示例#17
0
/*
 * AltiVec background removal for packed YUV frames.
 *
 * Every pixel of `image` is compared with the stored reference frame
 * (m_savedImage). Samples lying strictly inside the reference value
 * +/- m_Yrange / m_Urange / m_Vrange are replaced by Y=0 and U/V=128;
 * all other samples are left untouched. The result is written back into
 * `image` in place.
 *
 * The reference frame is (re)captured from `image` whenever m_reset is set,
 * which is also forced when the size or format of the incoming frame changes.
 */
void pix_background :: processYUVAltivec(imageStruct &image)
{
// NOTE(review): `width` is assigned below but never read.
register int h,w,i,j,width;
int pixsize = image.xsize * image.ysize * image.csize;
    h = image.ysize;
    // 16 bytes per vector = 8 pixels at 2 bytes per pixel
    w = image.xsize/8;
    width = image.xsize/8;
    
    //check to see if the buffer isn't 16byte aligned (highly unlikely)
    // (what is actually tested is that the pixel count is a multiple of 16)
    if (image.ysize*image.xsize % 16 != 0){
        error("image not properly aligned for Altivec - try something SD or HD maybe?");
        return;
        }
    
    // scratch union used to move scalar values into a vector register
    union{
        unsigned short		s[8];
        vector unsigned short	v;
    }shortBuffer;

    // a change of geometry or format invalidates the stored reference frame
    if(m_savedImage.xsize!=image.xsize ||
       m_savedImage.ysize!=image.ysize ||
       m_savedImage.format!=image.format)m_reset=1;

    m_savedImage.xsize=image.xsize;
    m_savedImage.ysize=image.ysize;
    m_savedImage.setCsizeByFormat(image.format);
    m_savedImage.reallocate();
    
    // capture the current frame as the new reference
    if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    m_reset = 0; 
    }
    
    register vector unsigned short	UVres1, Yres1, UVres2, Yres2;//interleave;
    register vector unsigned short	hiImage, loImage;
    register vector unsigned short	Yrange, UVrange, Yblank,UVblank,blank;
    register vector bool short		Ymasklo,Ymaskhi,  UVmaskhi;
    register vector unsigned short	Yhi,Ylo,UVhi,UVlo; 
    register vector unsigned char	one = vec_splat_u8(1);
    register vector unsigned short	sone = vec_splat_u16(1);
    register vector unsigned int			Uhi, Ulo, Vhi, Vlo,Ures,Vres;
    register vector bool int 			Umasklo, Umaskhi, Vmaskhi, Vmasklo;

    // inData: current frame (modified in place); rightData: reference frame
    vector unsigned char	*inData = (vector unsigned char*) image.data;
    vector unsigned char	*rightData = (vector unsigned char*) m_savedImage.data;
    
    // luma tolerance replicated into all eight lanes
    shortBuffer.s[0] =  m_Yrange;
    Yrange = shortBuffer.v;
    Yrange = vec_splat(Yrange,0);
    
    // NOTE(review): `blank` is built here but never used afterwards.
    shortBuffer.s[0] = 128;
    shortBuffer.s[1] = 0;
    shortBuffer.s[2] = 128;
    shortBuffer.s[3] = 0;
    shortBuffer.s[4] = 128;
    shortBuffer.s[5] = 0;
    shortBuffer.s[6] = 128;
    shortBuffer.s[7] = 0;
    blank = shortBuffer.v;
    
    // replacement luma for background pixels: 0 in every lane
    shortBuffer.s[0] =  0;
    Yblank = shortBuffer.v;
    Yblank = vec_splat(Yblank,0);
    
    // replacement chroma for background pixels: 128 in every lane
    shortBuffer.s[0] =  128;
    UVblank = shortBuffer.v;
    UVblank = vec_splat(UVblank,0);
    
    // chroma tolerances interleaved U V U V ... to match the chroma lanes
    shortBuffer.s[0] = m_Urange;
    shortBuffer.s[1] = m_Vrange;
    shortBuffer.s[2] = m_Urange;
    shortBuffer.s[3] = m_Vrange;
    shortBuffer.s[4] = m_Urange;
    shortBuffer.s[5] = m_Vrange;
    shortBuffer.s[6] = m_Urange;
    shortBuffer.s[7] = m_Vrange;
    UVrange = shortBuffer.v;
    
    
    //setup the cache prefetch -- A MUST!!!
    UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    #ifndef PPC970 
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( rightData+32, prefetchSize, 3 );
    #endif //PPC970
    
    for ( i=0; i<h; i++){
        for (j=0; j<w; j++)
        {
        #ifndef PPC970
        //this function is probably memory bound on most G4's -- what else is new?
            vec_dst( inData, prefetchSize, 0 );
            vec_dst( rightData, prefetchSize, 1 );
            vec_dst( inData+32, prefetchSize, 2 );
            vec_dst( rightData+32, prefetchSize, 3 );
        #endif
        //separate the U and V from Y
        // (multiplying the even bytes by 1 widens them to 16-bit lanes)
        UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
        UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);
            
        //vec_mulo Y * 1 to short vector Y Y Y Y shorts
        Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
        Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);
        
        // accepted luma window around the reference value (saturating)
        Yhi = vec_adds(Yres2,Yrange);
        Ylo = vec_subs(Yres2,Yrange);
        
        //go to ints for comparison
        UVhi = vec_adds(UVres2,UVrange);
        UVlo = vec_subs(UVres2,UVrange);
        
        // even 16-bit lanes (U) widened to 32 bit
        Uhi = vec_mule(sone,UVhi);
        Ulo = vec_mule(sone,UVlo);
        
        // odd 16-bit lanes (V) widened to 32 bit
        Vhi = vec_mulo(sone,UVhi);
        Vlo = vec_mulo(sone,UVlo);
        
        Ures = vec_mule(sone,UVres1);
         Vres = vec_mulo(sone,UVres1);
         
         // strictly inside the window: lo < value < hi
         Umasklo = vec_cmpgt(Ures,Ulo);
         Umaskhi = vec_cmplt(Ures,Uhi);
         
         Vmasklo = vec_cmpgt(Vres,Vlo);
         Vmaskhi = vec_cmplt(Vres,Vhi);
         
         Umaskhi = vec_and(Umaskhi,Umasklo);
         
         Vmaskhi = vec_and(Vmaskhi,Vmasklo);
         
         // chroma matches only when both U and V are inside their windows;
         // both masks end up holding the same combined value
         Umasklo = vec_and(Umaskhi,Vmaskhi);
         Vmasklo = vec_and(Umaskhi,Vmaskhi);
         
         // duplicate each 32-bit mask so it can cover two 16-bit lanes
         hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo);
         loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo);
         
         //pack it back down to bool short
         UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage);
         
         Ymasklo = vec_cmpgt(Yres1,Ylo);
         Ymaskhi = vec_cmplt(Yres1,Yhi);
         
         Ymaskhi = vec_and(Ymaskhi,Ymasklo);
         
         // a lane is background only when luma and chroma both match
         Ymaskhi = vec_and(Ymaskhi,UVmaskhi);
         UVmaskhi = vec_and(Ymaskhi,UVmaskhi);
         
         //bitwise comparison and move using the result of the comparison as a mask
         Yres1 = vec_sel(Yres1,Yblank,Ymaskhi);
         
         //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
         UVres1 = vec_sel(UVres1,UVblank,UVmaskhi);
         
         //merge the Y and UV back together
         hiImage = vec_mergeh(UVres1,Yres1);
         loImage = vec_mergel(UVres1,Yres1);
         
         //pack it back down to unsigned char to store
         inData[0] = vec_packsu(hiImage,loImage);
         
         inData++;
         rightData++;
        
        }
        // stop the four prefetch streams at the end of every row
        #ifndef PPC970
        vec_dss(0);
        vec_dss(1);
        vec_dss(2);
        vec_dss(3);
        #endif
    }
}
示例#18
0
/*
 * AltiVec per-pixel selection between two packed YUV frames.
 *
 * The luma of `image` and `right` is compared lane by lane. When
 * m_direction is set the sample with the larger luma wins, otherwise the
 * one with the smaller luma; ties take the sample from `right`. The
 * chroma lane sharing the index of the winning luma lane is taken from the
 * same frame. The result overwrites `image`.
 */
void pix_compare :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    const int rows = image.ysize;
    // one 16-byte vector carries eight 2-byte pixels
    const int vectorsPerRow = image.xsize/8;

    //refuse frames whose pixel count is not a multiple of 16
    if ((image.ysize*image.xsize) % 16 != 0){
        error("image not properly aligned for Altivec");
        return;
    }

    // the direction is sampled once, before any pixel is touched
    const bool keepBrighter = m_direction ? true : false;

    const vector unsigned char ones = vec_splat_u8(1);

    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

    #ifndef PPC970
    //start the cache prefetch streams
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, prefetchSize, 0 );
    vec_dst( src, prefetchSize, 1 );
    #endif

    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
            #ifndef PPC970
            //keep the prefetch streams running ahead of the loads
            vec_dst( dst, prefetchSize, 0 );
            vec_dst( src, prefetchSize, 1 );
            #endif

            const vector unsigned char left  = dst[0];
            const vector unsigned char other = src[0];

            //even bytes (chroma) widened to shorts by multiplying with 1
            const vector unsigned short chromaLeft  = (vector unsigned short)vec_mule(ones, left);
            const vector unsigned short chromaOther = (vector unsigned short)vec_mule(ones, other);

            //odd bytes (luma) widened to shorts by multiplying with 1
            const vector unsigned short lumaLeft  = (vector unsigned short)vec_mulo(ones, left);
            const vector unsigned short lumaOther = (vector unsigned short)vec_mulo(ones, other);

            //lanes where the sample of `image` has to be kept
            vector bool short keepLeft;
            if (keepBrighter) {
                keepLeft = vec_cmpgt(lumaLeft, lumaOther);
            } else {
                keepLeft = vec_cmplt(lumaLeft, lumaOther);
            }

            //take `image` where the mask is set, `right` elsewhere
            const vector unsigned short lumaOut   = vec_sel(lumaOther, lumaLeft, keepLeft);
            const vector unsigned short chromaOut = vec_sel(chromaOther, chromaLeft, keepLeft);

            //re-interleave chroma and luma, then narrow back to bytes
            dst[0] = vec_packsu(vec_mergeh(chromaOut, lumaOut),
                                vec_mergel(chromaOut, lumaOut));
        }
        #ifndef PPC970
        //stop the prefetch streams at the end of every row
        vec_dss(1);
        vec_dss(0);
        #endif
    }
}
示例#19
0
/* start of optimized motionblur */
/*
 * AltiVec motion blur for packed YUV frames (U Y V Y byte order).
 *
 * Every sample of the incoming frame is blended with the matching sample of
 * the accumulated frame kept in m_savedImage:
 *
 *     out = (m_blur0 * in + m_blur1 * saved) >> 8
 *
 * Chroma is re-centred around zero (128 is subtracted before and added back
 * after the blend). The result is stored both in `image` and in
 * m_savedImage, so it becomes the history for the next frame.
 *
 * m_savedImage is resized to match `image`; when that yields a new buffer it
 * is cleared to black first.
 * NOTE(review): m_blur0/m_blur1 look like 8-bit fixed-point weights (the sum
 * is shifted right by 8) - confirm where they are set.
 */
void pix_motionblur :: processYUVAltivec(imageStruct &image)
{
  int h,w,width;
  signed short rightGain,imageGain;
  unsigned char *saved = m_savedImage.data;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  // a different data pointer means a fresh buffer without valid history
  if(saved!=m_savedImage.data) {
    m_savedImage.setBlack();
  }
  saved=m_savedImage.data;

  // one 16-byte vector carries eight 2-byte pixels
  width = image.xsize/8;

  // Number of vectors still to be processed. It is used to keep the
  // look-ahead load inside the buffers.
  int remaining = image.ysize * width;
  if (remaining <= 0) {
    return; // nothing to blend, and nothing that may safely be loaded
  }

  /*
  // hmm: why does it read 235 ?
  rightGain = (signed short)(235. * m_motionblur);
  imageGain = (signed short) (255. - (235. * m_motionblur));
  */
  rightGain = m_blur1;
  imageGain = m_blur0;

  // scratch unions used to move scalar values into vector registers
  union {
    signed short        elements[8];
    vector      signed short v;
  } shortBuffer;

  union {
    unsigned int        elements[4];
    vector      unsigned int v;
  } bitBuffer;

  register vector signed short hiImage, loImage,hiRight,loRight,
           YImage, UVImage;
  register vector unsigned char loadImage, loadRight;
  register vector unsigned char zero = vec_splat_u8(0);
  register vector signed int UVhi,UVlo,Yhi,Ylo;
  register vector signed int UVhiR,UVloR,YhiR,YloR;
  register vector signed short gainSub,gain,gainR;
  register vector unsigned int bitshift;
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) saved;

  // 128 on the chroma lanes (even), 0 on the luma lanes (odd)
  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;

  gainSub = shortBuffer.v;

  // weight of the incoming frame in every lane
  shortBuffer.elements[0] = imageGain;
  gain = shortBuffer.v;
  gain =  vec_splat(gain, 0 );

  // weight of the accumulated frame in every lane
  shortBuffer.elements[0] = rightGain;
  gainR = shortBuffer.v;
  gainR =  vec_splat(gainR, 0 );

  // shift count that scales the weighted sum back down
  bitBuffer.elements[0] = 8;

  //Load it into the vector unit
  bitshift = bitBuffer.v;
  bitshift = vec_splat(bitshift,0);

# ifndef PPC970
  UInt32                        prefetchSize = GetPrefetchConstant( 16, 1,
      256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
# endif

  // prime the software pipeline with the first vector of both frames
  loadImage = inData[0];
  loadRight = rightData[0];

  for ( h=0; h<image.ysize; h++) {
    for (w=0; w<width; w++) {
# ifndef PPC970
      vec_dst( inData, prefetchSize, 0 );
      vec_dst( rightData, prefetchSize, 1 );
      vec_dst( inData+32, prefetchSize, 2 );
      vec_dst( rightData+32, prefetchSize, 3 );
# endif
      //interleaved U Y V Y chars

      // zero-extend the bytes to 16-bit lanes
      hiImage = (vector signed short) vec_mergeh( zero, loadImage );
      loImage = (vector signed short) vec_mergel( zero, loadImage );

      hiRight = (vector signed short) vec_mergeh( zero, loadRight );
      loRight = (vector signed short) vec_mergel( zero, loadRight );

      //hoist that load!!
      // ...but only while another vector exists: on the final pass
      // inData[1]/rightData[1] lie beyond the end of the buffers
      if (--remaining > 0) {
        loadImage = inData[1];
        loadRight = rightData[1];
      }

      //subtract 128 from UV

      hiImage = vec_subs(hiImage,gainSub);
      loImage = vec_subs(loImage,gainSub);

      hiRight = vec_subs(hiRight,gainSub);
      loRight = vec_subs(loRight,gainSub);

      //now vec_mule the UV into two vector ints
      //change sone to gain
      UVhi = vec_mule(gain,hiImage);
      UVlo = vec_mule(gain,loImage);

      UVhiR = vec_mule(gainR,hiRight);
      UVloR = vec_mule(gainR,loRight);

      //now vec_mulo the Y into two vector ints
      Yhi = vec_mulo(gain,hiImage);
      Ylo = vec_mulo(gain,loImage);

      YhiR = vec_mulo(gainR,hiRight);
      YloR = vec_mulo(gainR,loRight);


      //this is where to do the add and bitshift due to the resolution
      //add UV
      UVhi = vec_adds(UVhi,UVhiR);
      UVlo = vec_adds(UVlo,UVloR);

      Yhi = vec_adds(Yhi,YhiR);
      Ylo = vec_adds(Ylo,YloR);

      //bitshift UV
      UVhi = vec_sra(UVhi,bitshift);
      UVlo = vec_sra(UVlo,bitshift);

      Yhi = vec_sra(Yhi,bitshift);
      Ylo = vec_sra(Ylo,bitshift);

      //pack the UV into a single short vector
      UVImage =  vec_packs(UVhi,UVlo);

      //pack the Y into a single short vector
      YImage =  vec_packs(Yhi,Ylo);

      //vec_mergel + vec_mergeh Y and UV
      hiImage =  vec_mergeh(UVImage,YImage);
      loImage =  vec_mergel(UVImage,YImage);

      //add 128 offset back
      hiImage = vec_adds(hiImage,gainSub);
      loImage = vec_adds(loImage,gainSub);

      //narrow to bytes; the blend is both the output and the new history
      rightData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);
      inData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);

      inData++;
      rightData++;
    }
  }
# ifndef PPC970
  //stop the cache streams
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
# endif


}/* end of working altivec function */
示例#20
0
/*
 * AltiVec image subsampler: produces a 2x2-averaged image (sub22) and a
 * 4x4-averaged image (sub44) from the source image in a single pass.
 *
 * The parameter list comes from the SUBSAMPLE_IMAGE_PDECL macro; the body
 * uses `image`, `sub22_image`, `sub44_image` and `rowstride`.
 *
 * Each inner pass reads 64 bytes from each of four consecutive source rows
 * and stores 32 bytes to each of two sub22 rows and 16 bytes to one sub44
 * row. The loads for the next 16-byte column are issued before the
 * arithmetic on the current one is finished.
 *
 * NOTE(review): both loops are do/while with a pre-decrement test, so they
 * assume rowstride >= 64 and at least four rows - confirm callers guarantee
 * this (ALTIVEC_VERIFY only checks that rowstride is a multiple of 64).
 */
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;
    vector unsigned char l0, l1, l2, l3;
    vector unsigned short s0, s1, s2, s3;
    vector unsigned short s22_0, s22_1, s22_2, s22_3;
    vector unsigned short s44, s44_0, s44_1;
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    /* optional sanity checks on buffer alignment and row stride */
    if (NOT_VECTOR_ALIGNED(image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "image", 16, image);
    if (NOT_VECTOR_ALIGNED(sub22_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub22_image", 16, sub22_image);
    if (NOT_VECTOR_ALIGNED(sub44_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub44_image", 16, sub44_image);

    if ((rowstride & 63) != 0)
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "rowstride", 64, rowstride);
#endif

    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;

    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    /* the row count is derived from the distance between the two buffers;
     * NOTE(review): this relies on sub22_image starting right after the
     * last row of image - confirm against the allocation code */
    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    /* byte offsets of the 1st..4th following source row (stride4 is unused) */
    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2;
    halfstride = stride1 >> 1; /* /2 */

    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    /* used both as the rounding term and as the shift count */
    two = vec_splat_u16(2);

    do {
	i = ii;
	do {
	    /* first 16-byte column of the four source rows */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;
#ifdef ALTIVEC_DST
	    vec_dst(pB + (16 * 3), dsc.control, 0);
#endif

	    /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
	    /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
	    /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
	    /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */

	    /* s0 = 0x[00,01,      02,03,      04,05,      06,07,     ] */
	    /*        [      10,11,      12,13,      14,15,      16,17] */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));

	    /* s1 = 0x[08,09,      0A,0B,      0C,0D,      0E,0F,     ] */
	    /*        [      18,19,      1A,1B,      1C,1D,      1E,1F] */
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));

	    /* s2 = 0x[20,21,      22,23,      24,25,      26,27,     ] */
	    /*        [      30,31,      32,33,      34,35,      36,37] */
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));

	    /* s3 = 0x[28,29,      2A,2B,      2C,2D,      2E,2F,     ] */
	    /*        [      38,39,      3A,3B,      3C,3D,      3E,3F] */
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next block */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;

	    /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
	    /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
	    /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
	    /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
	    s22_0 = vec_add(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
	    s22_1 = vec_add(s22_1, two);

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
	    s22_0 = vec_sra(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
	    s22_1 = vec_sra(s22_1, two);

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
	    s44_0 = vec_add(s22_0, s22_1);

	    /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    /* second 16-byte column: same 2x2 sums as above */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);


	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    /* (columns one and two narrowed to bytes; the second vector goes
	     * to the next sub22 row, halfstride bytes further on) */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* - - - - - - - - - - - - - - - - - - - */
	    /* third 16-byte column */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* starting loading next l[0-3] */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    s22_0 = vec_add(s22_0, two);
	    s22_1 = vec_add(s22_1, two);

	    s22_0 = vec_sra(s22_0, two);
	    s22_1 = vec_sra(s22_1, two);


	    /* finish the 4x4 averages of columns one and two: (sum + 2) >> 2 */
	    s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44 = vec_add(s44, two);
	    s44 = vec_sra(s44, two);

	    s44_0 = vec_add(s22_0, s22_1);
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    /* fourth 16-byte column (no further look-ahead load in this pass) */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);

	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* pack all four s44 chunks */
	    s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44_0 = vec_add(s44_0, two);
	    s44_0 = vec_sra(s44_0, two);
	    s44 = vu16(vec_packsu(s44, s44_0));

	    vec_st(vu8(s44), 0, pB4);
	    pB4 += 16;

	} while (--i);

	/* pB walked one full row; skip the three rows consumed with it.
	 * pB2 walked one sub22 row; skip the second one written with it. */
	pB += stride3;
	pB2 += halfstride;

    } while (--j);

#ifdef ALTIVEC_DST
    vec_dss(0);
#endif

    AMBER_STOP;
}
示例#21
0
文件: pix_add.cpp 项目: avilleret/Gem
/*
 * AltiVec addition of two packed YUV frames (U Y V Y byte order).
 *
 * Per sample, with saturation at every step:
 *     luma   : Y_out  = Y_image + Y_right
 *     chroma : UV_out = UV_image + (2 * UV_right - 255)
 * The result is narrowed to 8 bit with saturation and overwrites `image`.
 */
void pix_add :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    // one 16-byte vector carries eight 2-byte pixels
    const int vectorsPerRow = image.xsize/8;

    // scratch unions used to move scalar values into vector registers
    union
    {
        short	elements[8];
        vector	signed short v;
    } offsetSeed;

    union
    {
        unsigned char	elements[16];
        vector	unsigned char v;
    } weightSeed;

    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

    // per-byte multipliers: 2 for the chroma bytes (even), 1 for the luma bytes (odd)
    for (int k = 0; k < 16; k += 2) {
        weightSeed.elements[k]     = 2;
        weightSeed.elements[k + 1] = 1;
    }
    const vector unsigned char weights = weightSeed.v;
    const vector unsigned char ones = vec_splat_u8( 1 );

    // 255 replicated into all eight lanes
    offsetSeed.elements[0] = 255;
    const vector signed short chromaOffset =
        static_cast<vector signed short>(vec_splat(static_cast<vector signed short>(offsetSeed.v),0));

#ifndef PPC970
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, prefetchSize, 0 );
    vec_dst( src, prefetchSize, 1 );
#endif
    for (int row = 0; row < image.ysize; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
            vec_dst( dst, prefetchSize, 0 );
            vec_dst( src, prefetchSize, 1 );
#endif
            // chroma: image * 1 and right * 2, widened to shorts
            const vector signed short chromaImage =
                static_cast<vector signed short>(vec_mule(ones, dst[0]));
            vector signed short chromaRight =
                static_cast<vector signed short>(vec_mule(weights, src[0]));

            // luma: both frames * 1, widened to shorts
            const vector signed short lumaImage =
                static_cast<vector signed short>(vec_mulo(weights, dst[0]));
            const vector signed short lumaRight =
                static_cast<vector signed short>(vec_mulo(weights, src[0]));

            // 2 * UV_right - 255
            chromaRight = static_cast<vector signed short>(vec_subs(chromaRight, chromaOffset));

            // saturating sums
            const vector signed short chromaSum = vec_adds(chromaImage, chromaRight);
            const vector signed short lumaSum   = vec_adds(lumaImage, lumaRight);

            // re-interleave chroma and luma, then narrow to bytes
            dst[0] = vec_packsu(vec_mergeh(chromaSum, lumaSum),
                                vec_mergel(chromaSum, lumaSum));
        }
#ifndef PPC970
        // stop the prefetch streams at the end of every row
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }  /*end of working altivec function */
}
示例#22
0
/*
 * AltiVec movement detector for packed YUV frames (U Y V Y byte order).
 *
 * The luma of every pixel is compared with the luma of the previous frame,
 * which is kept in `buffer`. Where the absolute difference exceeds
 * `threshold` the output luma becomes 255, elsewhere 0; every chroma byte of
 * the output is set to 127. The unmodified incoming frame replaces the
 * history in `buffer`, and the result overwrites `image`.
 *
 * Two vectors (16 pixels) are handled per pass, so an odd trailing vector
 * is left untouched.
 */
void pix_movement :: processYUVAltivec(imageStruct &image)
{
    // (re)allocate the history buffer when the pixel count changes
    if (image.xsize*image.ysize != buffer.xsize*buffer.ysize){
        buffer.xsize = image.xsize;
        buffer.ysize = image.ysize;
        buffer.reallocate(buffer.xsize*buffer.ysize*2);
    }
    // number of 16-byte vectors in the frame (8 pixels each)
    int pixsize = image.ysize * image.xsize/8;

    // scratch unions used to move scalar values into vector registers
    union{
        signed short  c[8];
        vector signed short  v;
    }shortBuffer;

    union{
        unsigned short  c[8];
        vector unsigned short  v;
    }ushortBuffer;

    int i;

    // threshold replicated into all eight lanes
    vector signed short thresh;
    shortBuffer.c[0] = threshold;
    thresh = shortBuffer.v;
    thresh = (vector signed short)vec_splat(thresh,0);

    vector unsigned char *rp = (vector unsigned char *) image.data; // read pointer
    vector unsigned char *wp = (vector unsigned char *) buffer.data; // write pointer to the copy
    vector unsigned char grey0,grey1;
    vector unsigned char one = vec_splat_u8(1);
    vector unsigned short Y0,Ywp0,hiImage0,loImage0;
    vector unsigned short Y1,Ywp1,hiImage1,loImage1;
    vector signed short temp0,temp1;

    // constant chroma written to every output pixel
    ushortBuffer.c[0]=127;
    vector unsigned short UV0= (vector unsigned short)vec_splat(ushortBuffer.v, 0);
    vector unsigned short UV1= (vector unsigned short)vec_splat(ushortBuffer.v, 0);

#ifndef PPC970
    //setup the cache prefetch -- A MUST!!!
    // The control word is computed once: the per-pass recomputation used
    // exactly the same arguments (16, 0, 256).
    const UInt32 prefetchSize = GetPrefetchConstant( 16, 0, 256 );
    vec_dst( rp, prefetchSize, 0 );
    vec_dst( wp, prefetchSize, 1 );
#endif

    // two vectors are consumed per pass
    pixsize/=2;
    for (i=0; i < pixsize; i++) {
# ifndef PPC970
        // keep the prefetch streams running ahead of the loads
        vec_dst( rp, prefetchSize, 0 );
        vec_dst( wp, prefetchSize, 1 );
        vec_dst( rp+16, prefetchSize, 2 );
        vec_dst( wp+16, prefetchSize, 3 );
# endif

        grey0 = rp[0];
        grey1 = rp[1];

        // current luma: odd bytes widened to shorts by multiplying with 1
        Y0 = (vector unsigned short)vec_mulo(grey0,one);
        Y1 = (vector unsigned short)vec_mulo(grey1,one);

        // previous luma; the history holds the full U Y V Y vectors
        Ywp0 = (vector unsigned short)vec_mulo(wp[0],one);
        Ywp1 = (vector unsigned short)vec_mulo(wp[1],one);

        //store the current pixels as the history for next time
        wp[0]=grey0;
        wp++;
        wp[0]=grey1;
        wp++;

        // |Y - Yprev| > threshold gives an all-ones lane, otherwise zero
        temp0 = vec_abs(vec_sub((vector signed short)Y0,(vector signed short)Ywp0));
        Y0 = (vector unsigned short)vec_cmpgt(temp0,thresh);

        temp1 = vec_abs(vec_sub((vector signed short)Y1,(vector signed short)Ywp1));
        Y1 = (vector unsigned short)vec_cmpgt(temp1,thresh);

        // interleave the constant chroma with the luma mask
        hiImage0 = vec_mergeh(UV0,Y0);
        loImage0 = vec_mergel(UV0,Y0);

        hiImage1 = vec_mergeh(UV1,Y1);
        loImage1 = vec_mergel(UV1,Y1);

        // saturating narrow: an all-ones lane becomes 255
        grey0 = vec_packsu(hiImage0,loImage0);
        grey1 = vec_packsu(hiImage1,loImage1);

        rp[0]=grey0;
        rp++;
        rp[0]=grey1;
        rp++;
    }

# ifndef PPC970
    // stop all four prefetch streams
    vec_dss(0);
    vec_dss(1);
    vec_dss(2);
    vec_dss(3);
# endif
}
示例#23
0
文件: 3b-10.c 项目: 0day-ci/gcc
/* Issues two vec_dst data-stream-touch hints on the address of the
 * parameter, each with literal control-word and stream-tag arguments.
 * NOTE(review): presumably a compile-only compiler test for constant
 * operands of vec_dst - confirm against the directives of the full file. */
void g (int b) 
{
  vec_dst(&b, 3, 3); 
  vec_dst(&b, 1, 1);
}
示例#24
0
/*
 * AltiVec difference of two packed YUV frames (U Y V Y byte order).
 *
 * Per sample, using saturating 16-bit arithmetic:
 *     luma   : Y_out  = | Y_image - Y_right |
 *     chroma : UV_out = | (UV_image - 128) - (UV_right - 128) + 128 |
 * The result is narrowed to 8 bit with saturation and overwrites `image`.
 */
void pix_diff :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    // one 16-byte vector carries eight 2-byte pixels
    const long vectorsPerRow = image.xsize/8;

    // scratch union used to move scalar values into a vector register
    union
    {
        short	elements[8];
        vector	short v;
    } biasSeed;

    // 128 on the chroma lanes (even), 0 on the luma lanes (odd)
    for (int lane = 0; lane < 8; ++lane) {
        biasSeed.elements[lane] = (lane % 2 == 0) ? 128 : 0;
    }
    const vector signed short chromaBias = biasSeed.v;

    const vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

#ifndef PPC970
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, prefetchSize, 0 );
    vec_dst( src, prefetchSize, 1 );
#endif
    for (long row = 0; row < image.ysize; ++row) {
        for (long col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
            vec_dst( dst, prefetchSize, 0 );
            vec_dst( src, prefetchSize, 1 );
#endif
            const vector unsigned char left  = dst[0];
            const vector unsigned char other = src[0];

            // zero-extend the bytes to shorts and remove the chroma offset
            const vector signed short hiLeft  =
                vec_subs((vector signed short) vec_mergeh( zero, left ), chromaBias);
            const vector signed short loLeft  =
                vec_subs((vector signed short) vec_mergel( zero, left ), chromaBias);
            const vector signed short hiOther =
                vec_subs((vector signed short) vec_mergeh( zero, other ), chromaBias);
            const vector signed short loOther =
                vec_subs((vector signed short) vec_mergel( zero, other ), chromaBias);

            // difference, chroma offset restored, then magnitude
            const vector signed short hiOut =
                vec_abs(vec_adds(vec_subs(hiLeft, hiOther), chromaBias));
            const vector signed short loOut =
                vec_abs(vec_adds(vec_subs(loLeft, loOther), chromaBias));

            // narrow back to bytes with saturation
            dst[0] = vec_packsu(hiOut, loOut);
        }
#ifndef PPC970
        // stop the prefetch streams at the end of every row
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }  /*end of working altivec function */
}
示例#25
0
/*
 * subtract prediction from block data
 *
 * For each of 8 rows, loads the 16-byte vector containing the row of `cur`
 * and of `pred`, zero-extends 8 of those bytes to 16 bits and stores
 * cur - pred as 8 signed shorts into `blk`.  Consecutive source rows are
 * `lx` bytes apart; consecutive `blk` rows are 8 shorts (16 bytes) apart.
 *
 * The parameter list comes from SUB_PRED_PDECL (declared elsewhere); the
 * body uses the names pred, cur, lx and blk.
 *
 * Requirements (only checked when ALTIVEC_VERIFY is defined):
 * pred % 8 == 0
 * cur % 8 == 0
 * pred % 16 == cur % 16
 * lx % 16 == 0
 * blk % 16 == 0
 */
void sub_pred_altivec(SUB_PRED_PDECL)
{
    unsigned int dst;
    uint8_t *pCA, *pCB, *pPA, *pPB;
    int16_t *pBA, *pBB;
    vector unsigned char zero;
    vector unsigned char predA, predB, curA, curB;
    vector signed short blkA, blkB;


#ifdef ALTIVEC_VERIFY
#ifdef ALTIVEC_DST
    /* lx is OR-ed into the low 16 bits of the vec_dst control word below,
     * so it must not have any higher bits set.  The parentheses matter:
     * `!=` binds tighter than `&`.
     */
    if ((lx & (~0xffff)) != 0)
	mjpeg_error_exit1("sub_pred: lx > vec_dst range, (%d)", lx);
#endif

    if (NOT_VECTOR_ALIGNED(lx))
	mjpeg_error_exit1("sub_pred: lx %% 16 != 0, (%d)", lx);
    if (NOT_VECTOR_ALIGNED(blk))
	mjpeg_error_exit1("sub_pred: blk %% 16 != 0, (%d)", blk);

    if (((unsigned long)pred & 0xf) != ((unsigned long)cur & 0xf))
	mjpeg_error_exit1("sub_pred: (pred(0x%X) %% 16) != (cur(0x%X) %% 16)",
	    pred, cur);
    if ((((unsigned long)pred) & 0x7) != 0)
	mjpeg_error_exit1("sub_pred: pred %% 8 != 0, (0x%X)", pred);
    if ((((unsigned long)cur) & 0x7) != 0)
	mjpeg_error_exit1("sub_pred: cur %% 8 != 0, (0x%X)", cur);
#endif

/* A->B, B->A expand differently depending on input */
/* ABBA(sym,A) yields symB and ABBA(sym,B) yields symA, i.e. "the other
 * pointer set".  The two sets (A and B) leapfrog each other row by row.
 */
#define ABBA(symbol,ab)		_ABBA(ABBA_##ab,symbol) /* {{{ */
#define _ABBA(abba_ab,symbol)	abba_ab(symbol)
#define ABBA_A(symbol)		symbol##B
#define ABBA_B(symbol)		symbol##A
/* }}} */


/* Process the row already loaded into pred<ab>/cur<ab>: widen the bytes of
 * the high (hl = h) or low (hl = l) half to shorts by merging with zero,
 * subtract, and store 8 shorts at pB<ab>.  `iter` is only a row label.
 */
#define	PERFORM_ITERATION(hl,ab,iter) /* iter {{{ */                         \
	pred##ab = vec_merge##hl(zero, pred##ab);                            \
	cur##ab = vec_merge##hl(zero, cur##ab);                              \
	blk##ab = vec_sub(vs16(cur##ab), vs16(pred##ab));                    \
	vec_st(blk##ab, 0, (signed short*)pB##ab);                           \
	/* }}} */

/* Advance pointer set <ab> to the row after the other set's current row
 * (other + lx for the sources, other + 8 shorts for blk) and load the next
 * pred/cur vectors.  `iter` is only a row label.
 */
#define PREPARE_ITERATION(hl,ab,iter) /* iter {{{ */                         \
	pP##ab = ABBA(pP,ab) + lx;                                           \
	pC##ab = ABBA(pC,ab) + lx;                                           \
	pB##ab = ABBA(pB,ab) + 8;                                            \
	pred##ab = vec_ld(0, pP##ab);                                        \
	cur##ab = vec_ld(0, pC##ab);                                         \
	/* }}} */

/* Empty volatile asm statement; presumably keeps the compiler from
 * reordering the hand-interleaved setup statements below -- confirm.
 */
#define NO_RESCHEDULE	asm volatile ("")

    /* NOTE(review): AMBER_START / AMBER_STOP are defined elsewhere. */
    AMBER_START;

    /* Set A starts on row 0. */
    pPA = pred;
    pCA = cur;
    pBA = blk;

#ifdef ALTIVEC_DST
    /* Data-stream hints.  Control word layout per the AltiVec PIM:
     * block size (in 16-byte units) << 24 | block count << 16 | stride.
     * pred/cur: 8 blocks of 16 bytes, lx bytes apart (streams 0 and 1).
     */
    dst = 0x01080000 | lx;
    vec_dst(pPA, dst, 0);
    vec_dst(pCA, dst, 1);
    /* blk: 8 blocks of 16 bytes, 16 bytes apart, touched for store. */
    dst = 0x01080010;
    vec_dstst(pBA, dst, 2);
#endif

    /* Set B starts on row 1; loads for rows 0 and 1 are interleaved with
     * the pointer arithmetic.
     */
    pPB = pPA + lx;         NO_RESCHEDULE;
    predA = vec_ld(0, pPA); NO_RESCHEDULE;
    pCB = pCA + lx;         NO_RESCHEDULE;
    curA = vec_ld(0, pCA);  NO_RESCHEDULE;
    pBB = pBA + 8;          NO_RESCHEDULE;
    predB = vec_ld(0, pPB); NO_RESCHEDULE;
    zero = vec_splat_u8(0); NO_RESCHEDULE;
    curB = vec_ld(0, pCB);

    /* vec_ld loads the aligned 16-byte vector containing the address.  With
     * pred % 8 == 0, the row's 8 bytes are in the high half when pred is
     * 16-byte aligned and in the low half otherwise; cur is required to
     * share pred's offset.  (Assumes VECTOR_ALIGNED tests 16-byte
     * alignment -- defined elsewhere.)
     */
    if (VECTOR_ALIGNED(pPA)) {
	PERFORM_ITERATION(h,A,0);
	PREPARE_ITERATION(h,A,2);   /* prepare next A iteration */
	PERFORM_ITERATION(h,B,1);
	PREPARE_ITERATION(h,B,3);   /* prepare next B iteration */
	PERFORM_ITERATION(h,A,2);
	PREPARE_ITERATION(h,A,4);
	PERFORM_ITERATION(h,B,3);
	PREPARE_ITERATION(h,B,5);
	PERFORM_ITERATION(h,A,4);
	PREPARE_ITERATION(h,A,6);
	PERFORM_ITERATION(h,B,5);
	PREPARE_ITERATION(h,B,7);
	PERFORM_ITERATION(h,A,6);
	PERFORM_ITERATION(h,B,7);
    } else {
	PERFORM_ITERATION(l,A,0);
	PREPARE_ITERATION(l,A,2);   /* prepare next A iteration */
	PERFORM_ITERATION(l,B,1);
	PREPARE_ITERATION(l,B,3);   /* prepare next B iteration */
	PERFORM_ITERATION(l,A,2);
	PREPARE_ITERATION(l,A,4);
	PERFORM_ITERATION(l,B,3);
	PREPARE_ITERATION(l,B,5);
	PERFORM_ITERATION(l,A,4);
	PREPARE_ITERATION(l,A,6);
	PERFORM_ITERATION(l,B,5);
	PREPARE_ITERATION(l,B,7);
	PERFORM_ITERATION(l,A,6);
	PERFORM_ITERATION(l,B,7);
    }

#ifdef ALTIVEC_DST
    /* stop all data streams started above */
    vec_dssall();
#endif

    AMBER_STOP;
}