Пример #1
0
/// Adds `right` into `image` in place, one 16-byte vector at a time, using a
/// saturating unsigned byte add (vec_adds).
///
/// Both data pointers are reinterpreted as vector pointers, so they are
/// presumably 16-byte aligned - TODO confirm against the caller.
/// Each row is walked as image.xsize/4 vectors (presumably 4 bytes per pixel).
void pix_add :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
    const int vectorsPerRow = image.xsize / 4;

    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

#ifndef PPC970
    // Start the data-stream prefetches for both operands (streams 0 and 1).
    UInt32 streamCtl = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, streamCtl, 0 );
    vec_dst( src, streamCtl, 1 );
#endif

    for (int row = 0; row < image.ysize; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
            // Re-issue the stream touches at the current position.
            vec_dst( dst, streamCtl, 0 );
            vec_dst( src, streamCtl, 1 );
#endif
            *dst = vec_adds(*dst, *src);
        }
#ifndef PPC970
        // Stop both streams at the end of every row.
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }
}
Пример #2
0
/// Replaces `image` with the per-byte absolute difference |image - right|,
/// processing one 16-byte vector per pass.
///
/// The bytes are widened to signed shorts (merge with zero), subtracted,
/// made absolute and packed back to unsigned bytes with saturation.
/// Both data pointers are reinterpreted as vector pointers, so they are
/// presumably 16-byte aligned - TODO confirm against the caller.
///
/// An empty image (xsize * ysize / 4 == 0) is a no-op. The previous
/// do/while form executed the body once and then kept running with a
/// negative counter, overrunning both buffers.
void pix_diff :: processRGBA_Altivec(imageStruct &image, imageStruct &right)
{
    // Number of 16-byte vectors to process.
    int datasize = image.xsize * image.ysize / 4;
    if (datasize <= 0) {
        return;   // nothing to do; bail out before any stream is started
    }

    vector signed short  hiImage, loImage, hiRight, loRight;
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *inData = (vector unsigned char *)image.data;
    vector unsigned char *rightData = (vector unsigned char *)right.data;

#ifndef PPC970
    UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+256, prefetchSize, 2 );
    vec_dst( rightData+256, prefetchSize, 3 );
#endif

    for (; datasize > 0; --datasize) {
#ifndef PPC970
        vec_dst( inData, prefetchSize, 0 );
        vec_dst( rightData, prefetchSize, 1 );
        vec_dst( inData+256, prefetchSize, 2 );
        vec_dst( rightData+256, prefetchSize, 3 );
#endif

        // Widen the 16 bytes of each operand into two vectors of 8 shorts.
        hiImage = (vector signed short)vec_mergeh(zero,inData[0]);
        loImage = (vector signed short)vec_mergel(zero,inData[0]);
        hiRight = (vector signed short)vec_mergeh(zero,rightData[0]);
        loRight = (vector signed short)vec_mergel(zero,rightData[0]);

        hiImage = vec_subs(hiImage,hiRight);
        loImage = vec_subs(loImage,loRight);

        hiImage = vec_abs(hiImage);
        loImage = vec_abs(loImage);

        // Pack back down to unsigned bytes with saturation.
        inData[0] = vec_packsu(hiImage,loImage);

        inData++;
        rightData++;
    }

#ifndef PPC970
    vec_dss( 0 );
    vec_dss( 1 );
    vec_dss( 2 );
    vec_dss( 3 );
#endif
}
Пример #3
0
/// Inverts every byte of `image` in place: each byte b becomes 255 - b,
/// computed one 16-byte vector at a time with a saturating subtract.
///
/// Each row is walked as image.xsize/8 vectors (presumably 2 bytes per
/// pixel). image.data is reinterpreted as a vector pointer, so it is
/// presumably 16-byte aligned - TODO confirm against the caller.
void pix_invert :: processYUVAltivec(imageStruct &image)
{
    const int vectorsPerRow = image.xsize/8;

    union {
        unsigned char bytes[16];
        vector unsigned char vec;
    } seed;

    vector unsigned char *px = (vector unsigned char*) image.data;

    // Only byte 0 needs a value: it is splatted across all 16 lanes.
    seed.bytes[0] = 255;
    vector unsigned char allOnes = (vector unsigned char) vec_splat(seed.vec, 0);

#ifndef PPC970
    UInt32 streamCtl = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( px, streamCtl, 0 );
#endif

    for (int row = 0; row < image.ysize; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col, ++px) {
#ifndef PPC970
            vec_dst( px, streamCtl, 0 );
#endif
            *px = vec_subs(allOnes, *px);
        }
#ifndef PPC970
        // Stop the prefetch stream at the end of every row.
        vec_dss( 0 );
#endif
    }
}
/* Exercises a range of AltiVec intrinsics with differently typed operands.
   Every operand used here (x, y, z, f, g, c, d, s, t, i, j, uc, pi, int1,
   int2) is declared outside this block.
   NOTE(review): this looks like a compile-only type-checking test rather
   than code meant to compute anything useful - confirm before relying on
   any result.  */
void
b()
{
  z = vec_add (x, y);

  /* Make sure the predicates accept correct argument types.  */

  int1 = vec_all_in (f, g);
  int1 = vec_all_ge (f, g);
  int1 = vec_all_eq (c, d);
  int1 = vec_all_ne (s, t);
  int1 = vec_any_eq (i, j);
  int1 = vec_any_ge (f, g);
  int1 = vec_all_ngt (f, g);
  /* NOTE(review): the next three calls are repeated verbatim right after;
     possibly a different predicate was intended for one group.  */
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);
  int1 = vec_any_ge (c, d);
  int1 = vec_any_ge (s, t);
  int1 = vec_any_ge (i, j);

  /* VSCR write/read and data-stream stop calls.  */
  vec_mtvscr (i);
  vec_dssall ();
  s = (vector signed short) vec_mfvscr ();
  vec_dss (3);

  /* All four data-stream touch variants, each on stream tag 3.  */
  vec_dst (pi, int1 + int2, 3);
  vec_dstst (pi, int1 + int2, 3);
  vec_dststt (pi, int1 + int2, 3);
  vec_dstt (pi, int1 + int2, 3);

  /* Permute-control vectors for left/right alignment shifts.  */
  uc = (vector unsigned char) vec_lvsl (int1 + 69, (signed int *) pi);
  uc = (vector unsigned char) vec_lvsr (int1 + 69, (signed int *) pi);

  /* Element loads for each element width, then whole-vector loads.  */
  c = vec_lde (int1, (signed char *) pi);
  s = vec_lde (int1, (signed short *) pi);
  i = vec_lde (int1, (signed int *) pi);
  i = vec_ldl (int1, pi);
  i = vec_ld (int1, pi);

  /* Matching whole-vector and element stores.  */
  vec_st (i, int2, pi);
  vec_ste (c, int2, (signed char *) pi);
  vec_ste (s, int2, (signed short *) pi);
  vec_ste (i, int2, (signed int *) pi);
  vec_stl (i, int2, pi);
}
Пример #5
0
// Compares each incoming pixel with the stored reference frame
// (m_savedImage) and blanks the pixels that match it.
//
// For every 16-byte vector, the even bytes (U/V) and odd bytes (Y) are
// widened to shorts. A lane is "matched" when its Y value lies strictly
// between saved-m_Yrange and saved+m_Yrange AND the U and V values of the
// same pair lie strictly within their own ranges (m_Urange / m_Vrange).
// Matched lanes get Y = 0 and U/V = 128; all other lanes keep the incoming
// values. The result is written back into image.data.
//
// The reference frame is (re)captured from the incoming image whenever
// m_reset is set, which this function forces when the size or format
// changed.
//
// NOTE(review): image.data and m_savedImage.data are reinterpreted as
// vector pointers, so both are presumably 16-byte aligned - confirm.
void pix_background :: processYUVAltivec(imageStruct &image)
{
register int h,w,i,j,width;
// Size in bytes of the whole image; used for the reference-frame memcpy.
int pixsize = image.xsize * image.ysize * image.csize;
    h = image.ysize;
    // Number of 16-byte vectors per row.
    w = image.xsize/8;
    // NOTE: width is assigned here but not read anywhere below.
    width = image.xsize/8;
    
    //check to see if the buffer isn't 16byte aligned (highly unlikely)
    // NOTE(review): this tests the pixel count, not the address of the data.
    if (image.ysize*image.xsize % 16 != 0){
        error("image not properly aligned for Altivec - try something SD or HD maybe?");
        return;
        }
    
    // Scratch union used to move scalar constants into a vector register.
    union{
        unsigned short		s[8];
        vector unsigned short	v;
    }shortBuffer;

    // A change of geometry or format invalidates the stored reference frame.
    if(m_savedImage.xsize!=image.xsize ||
       m_savedImage.ysize!=image.ysize ||
       m_savedImage.format!=image.format)m_reset=1;

    m_savedImage.xsize=image.xsize;
    m_savedImage.ysize=image.ysize;
    m_savedImage.setCsizeByFormat(image.format);
    m_savedImage.reallocate();
    
    // Capture the current frame as the new reference.
    if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    m_reset = 0; 
    }
    
    register vector unsigned short	UVres1, Yres1, UVres2, Yres2;//interleave;
    register vector unsigned short	hiImage, loImage;
    register vector unsigned short	Yrange, UVrange, Yblank,UVblank,blank;
    register vector bool short		Ymasklo,Ymaskhi,  UVmaskhi;
    register vector unsigned short	Yhi,Ylo,UVhi,UVlo; 
    register vector unsigned char	one = vec_splat_u8(1);
    register vector unsigned short	sone = vec_splat_u16(1);
    register vector unsigned int			Uhi, Ulo, Vhi, Vlo,Ures,Vres;
    register vector bool int 			Umasklo, Umaskhi, Vmaskhi, Vmasklo;

    // inData is the incoming frame, rightData the stored reference frame.
    vector unsigned char	*inData = (vector unsigned char*) image.data;
    vector unsigned char	*rightData = (vector unsigned char*) m_savedImage.data;
    
    // Yrange: m_Yrange replicated into all 8 lanes.
    shortBuffer.s[0] =  m_Yrange;
    Yrange = shortBuffer.v;
    Yrange = vec_splat(Yrange,0);
    
    // NOTE: blank (128,0,128,0,...) is built here but not read below.
    shortBuffer.s[0] = 128;
    shortBuffer.s[1] = 0;
    shortBuffer.s[2] = 128;
    shortBuffer.s[3] = 0;
    shortBuffer.s[4] = 128;
    shortBuffer.s[5] = 0;
    shortBuffer.s[6] = 128;
    shortBuffer.s[7] = 0;
    blank = shortBuffer.v;
    
    // Replacement value for matched Y lanes: 0 in every lane.
    shortBuffer.s[0] =  0;
    Yblank = shortBuffer.v;
    Yblank = vec_splat(Yblank,0);
    
    // Replacement value for matched U/V lanes: 128 in every lane.
    shortBuffer.s[0] =  128;
    UVblank = shortBuffer.v;
    UVblank = vec_splat(UVblank,0);
    
    // UVrange alternates m_Urange and m_Vrange to line up with the
    // U,V,U,V lane order of the widened chroma vector.
    shortBuffer.s[0] = m_Urange;
    shortBuffer.s[1] = m_Vrange;
    shortBuffer.s[2] = m_Urange;
    shortBuffer.s[3] = m_Vrange;
    shortBuffer.s[4] = m_Urange;
    shortBuffer.s[5] = m_Vrange;
    shortBuffer.s[6] = m_Urange;
    shortBuffer.s[7] = m_Vrange;
    UVrange = shortBuffer.v;
    
    
    //setup the cache prefetch -- A MUST!!!
    UInt32			prefetchSize = GetPrefetchConstant( 16, 1, 256 );
    #ifndef PPC970 
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( rightData+32, prefetchSize, 3 );
    #endif //PPC970
    
    for ( i=0; i<h; i++){
        for (j=0; j<w; j++)
        {
        #ifndef PPC970
        //this function is probably memory bound on most G4's -- what else is new?
            vec_dst( inData, prefetchSize, 0 );
            vec_dst( rightData, prefetchSize, 1 );
            vec_dst( inData+32, prefetchSize, 2 );
            vec_dst( rightData+32, prefetchSize, 3 );
        #endif
        //separate the U and V from Y
        // (multiplying the even bytes by 1 widens them to shorts)
        UVres1 = (vector unsigned short)vec_mule(one,inData[0]);
        UVres2 = (vector unsigned short)vec_mule(one,rightData[0]);
            
        //vec_mulo Y * 1 to short vector Y Y Y Y shorts
        Yres1 = (vector unsigned short)vec_mulo(one,inData[0]);
        Yres2 = (vector unsigned short)vec_mulo(one,rightData[0]);
        
        // Saturating upper/lower Y bounds around the reference value.
        Yhi = vec_adds(Yres2,Yrange);
        Ylo = vec_subs(Yres2,Yrange);
        
        //go to ints for comparison
        UVhi = vec_adds(UVres2,UVrange);
        UVlo = vec_subs(UVres2,UVrange);
        
        // Even short lanes hold U, odd short lanes hold V; widen each to ints.
        Uhi = vec_mule(sone,UVhi);
        Ulo = vec_mule(sone,UVlo);
        
        Vhi = vec_mulo(sone,UVhi);
        Vlo = vec_mulo(sone,UVlo);
        
        Ures = vec_mule(sone,UVres1);
         Vres = vec_mulo(sone,UVres1);
         
         // Strictly-inside-the-band tests for U and for V.
         Umasklo = vec_cmpgt(Ures,Ulo);
         Umaskhi = vec_cmplt(Ures,Uhi);
         
         Vmasklo = vec_cmpgt(Vres,Vlo);
         Vmaskhi = vec_cmplt(Vres,Vhi);
         
         Umaskhi = vec_and(Umaskhi,Umasklo);
         
         Vmaskhi = vec_and(Vmaskhi,Vmasklo);
         
         // Both masks end up identical: U-in-range AND V-in-range.
         Umasklo = vec_and(Umaskhi,Vmaskhi);
         Vmasklo = vec_and(Umaskhi,Vmaskhi);
         
         // Interleave the int masks back into U,V,U,V lane order.
         hiImage = (vector unsigned short)vec_mergeh(Umasklo,Vmasklo);
         loImage = (vector unsigned short)vec_mergel(Umasklo,Vmasklo);
         
         //pack it back down to bool short
         UVmaskhi = (vector bool short)vec_packsu(hiImage,loImage);
         
         // Strictly-inside-the-band test for Y.
         Ymasklo = vec_cmpgt(Yres1,Ylo);
         Ymaskhi = vec_cmplt(Yres1,Yhi);
         
         Ymaskhi = vec_and(Ymaskhi,Ymasklo);
         
         // Final mask: Y, U and V must all be in range; both masks are
         // set to that same combined value.
         Ymaskhi = vec_and(Ymaskhi,UVmaskhi);
         UVmaskhi = vec_and(Ymaskhi,UVmaskhi);
         
         //bitwise comparison and move using the result of the comparison as a mask
         Yres1 = vec_sel(Yres1,Yblank,Ymaskhi);
         
         //UVres1 = vec_sel(UVres1,UVres2,UVmaskhi);
         UVres1 = vec_sel(UVres1,UVblank,UVmaskhi);
         
         //merge the Y and UV back together
         hiImage = vec_mergeh(UVres1,Yres1);
         loImage = vec_mergel(UVres1,Yres1);
         
         //pack it back down to unsigned char to store
         inData[0] = vec_packsu(hiImage,loImage);
         
         inData++;
         rightData++;
        
        }
        // Stop all four prefetch streams at the end of every row.
        #ifndef PPC970
        vec_dss(0);
        vec_dss(1);
        vec_dss(2);
        vec_dss(3);
        #endif
    }
}
Пример #6
0
/// Adds `right` into `image` in place for interleaved U Y V Y data, one
/// 16-byte vector at a time.
///
/// Per vector:
///   - even bytes (U/V): image + (2 * right - 255), saturating on shorts
///   - odd bytes  (Y):   image + right, saturating on shorts
/// and the result is packed back to unsigned bytes with saturation.
///
/// Each row is walked as image.xsize/8 vectors. Both data pointers are
/// reinterpreted as vector pointers, so they are presumably 16-byte
/// aligned - TODO confirm against the caller.
void pix_add :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    const int vectorsPerRow = image.xsize/8;

    // Scratch unions used to move scalar constants into vector registers.
    union {
        short lanes[8];
        vector signed short vec;
    } biasSeed;

    union {
        unsigned char bytes[16];
        vector unsigned char vec;
    } weightSeed;

    vector signed short bias, mergedHi, mergedLo;
    vector signed short rightY, rightUV, imageY, imageUV, sumUV, sumY;
    vector unsigned char weights, ones;

    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

    // Multiplier pattern 2,1,2,1,...: even (U/V) bytes get 2, odd (Y) bytes 1.
    for (int k = 0; k < 16; k += 2) {
        weightSeed.bytes[k]     = 2;
        weightSeed.bytes[k + 1] = 1;
    }
    weights = weightSeed.vec;

    ones = vec_splat_u8( 1 );

    // 255 replicated into all 8 short lanes (only lane 0 needs seeding).
    biasSeed.lanes[0] = 255;
    bias = vec_splat(biasSeed.vec, 0);

#ifndef PPC970
    UInt32 streamCtl = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, streamCtl, 0 );
    vec_dst( src, streamCtl, 1 );
#endif

    for (int row = 0; row < image.ysize; ++row) {
        for (int col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
            vec_dst( dst, streamCtl, 0 );
            vec_dst( src, streamCtl, 1 );
#endif
            // Even bytes widened to shorts: image U/V * 1, right U/V * 2.
            imageUV = static_cast<vector signed short>(vec_mule(ones, *dst));
            rightUV = static_cast<vector signed short>(vec_mule(weights, *src));

            // Odd bytes widened to shorts: Y * 1 for both operands.
            imageY = static_cast<vector signed short>(vec_mulo(weights, *dst));
            rightY = static_cast<vector signed short>(vec_mulo(weights, *src));

            // Remove the 255 offset from the doubled right-hand chroma.
            rightUV = vec_subs(rightUV, bias);

            sumUV = vec_adds(imageUV, rightUV);
            sumY  = vec_adds(imageY, rightY);

            // Re-interleave U/V with Y and pack to bytes with saturation.
            mergedHi = vec_mergeh(sumUV, sumY);
            mergedLo = vec_mergel(sumUV, sumY);
            *dst = vec_packsu(mergedHi, mergedLo);
        }
#ifndef PPC970
        // Stop both prefetch streams at the end of every row.
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }
}
Пример #7
0
/*
 * mem_searchrn - scan len bytes starting at s, 16 bytes at a time, using
 * AltiVec byte compares against '\r' and '\n'.
 *
 * Returns NULL when s is NULL, len is 0, or no match is reported;
 * otherwise a pointer derived from the aligned block pointer and the
 * position of the first set byte in the match mask (via vec_zpos, which is
 * defined outside this block).
 *
 * unlikely, DIV_ROUNDUP, SOVUC, ALIGN_DOWN and ALIGN_DOWN_DIFF are also
 * defined elsewhere; SOVUC is presumably sizeof(vector unsigned char) and
 * ALIGN_DOWN_DIFF presumably the offset of s inside its aligned vector -
 * TODO confirm.
 *
 * NOTE(review): the name suggests a search for the pair "\r\n". As
 * written, the in-vector mask pairs a '\r' at byte i with the '\n' compare
 * result of byte i-1, and the cross-vector test uses vec_any_eq on two
 * masks that are mostly zero. Confirm the shift directions and that test
 * against the intended behaviour before relying on the result.
 */
void *mem_searchrn(void *s, size_t len)
{
	vector unsigned char v_cr;
	vector unsigned char v_nl;
	vector unsigned char v0;
	vector unsigned char v_perm;
	vector unsigned char c;
	vector bool char rr, rn;
	vector bool char last_rr;
	char *p;
	ssize_t k;
	size_t block_num;
	unsigned f;

	if(unlikely(!s || !len))
		return NULL;

	/* only do one prefetch, this covers nearly 128k */
	/* f is the vec_dst control word: the number of 512-byte blocks shifted
	 * into bits 16 and up (0 when there are 256 or more), or-ed with 512. */
	block_num = DIV_ROUNDUP(len, 512);
	f  = block_num >= 256 ? 0 : block_num << 16;
	f |= 512;
	vec_dst((const unsigned char *)s, f, 2);

	/* '\r' (13) and '\n' (10) replicated into every byte lane */
	v_cr = vec_splat_u8('\r');
	v_nl = vec_splat_u8('\n');
	v0   = vec_splat_u8(0);
	last_rr = (vector bool char)v0;

	/* k > 0 here presumably means the data ends inside the first aligned
	 * vector - depends on ALIGN_DOWN_DIFF, see header. */
	k = SOVUC - ALIGN_DOWN_DIFF(s, SOVUC) - (ssize_t)len;

	/* first load comes from the aligned address at or below s */
	p = (char *)ALIGN_DOWN(s, SOVUC);
	c = vec_ldl(0, (const vector unsigned char *)p);
	if(unlikely(k > 0))
		goto K_SHIFT;
	/* shift the bytes before s out and back in again, so they read as 0 */
	v_perm = vec_lvsl(0, (unsigned char *)s);
	c = vec_perm(c, v0, v_perm);
	v_perm = vec_lvsr(0, (unsigned char *)s);
	c = vec_perm(v0, c, v_perm);
	rr = vec_cmpeq(c, v_cr);
	rn = vec_cmpeq(c, v_nl);

	/* from here on k counts the bytes remaining after the first vector */
	k = -k;
	/* enter the loop in the middle: the first vector is already compared */
	goto START_LOOP;

	do
	{
		p += SOVUC;
		c = vec_ldl(0, (const vector unsigned char *)p);
		k -= SOVUC;
		/* only vectors that lie completely inside the data are handled
		 * here; the final (partial or exact) one falls through to K_SHIFT */
		if(k > 0)
		{
			rr = vec_cmpeq(c, v_cr);
			rn = vec_cmpeq(c, v_nl);

			/* match straddling the previous and the current vector */
			if(vec_any_eq(last_rr, rn)) {
				vec_dss(2);
				return p - 1;
			}
START_LOOP:
			/* remember a byte of the '\r' mask for the next round, and
			 * shift the '\n' mask by one byte so it lines up with rr */
			last_rr = (vector bool char)vec_sld(v0, (vector unsigned char)rr, 1);
			rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
			rr = vec_and(rr, rn); /* get mask */
			if(vec_any_ne(rr, v0)) {
				vec_dss(2);
				return p + vec_zpos(rr);
			}
		}
	} while(k > 0);
	/* k is now the non-negative count of bytes in the last vector that lie
	 * past the end of the data */
	k = -k;
K_SHIFT:
	vec_dss(2);
	/* k is cast to a pointer only so that vec_lvsr/vec_lvsl build a permute
	 * vector from its value; nothing is dereferenced. The two permutes
	 * shift the excess tail bytes out so they read as 0. */
	v_perm = vec_lvsr(0, (unsigned char *)k);
	c = vec_perm(v0, c, v_perm);
	v_perm = vec_lvsl(0, (unsigned char *)k);
	c = vec_perm(c, v0, v_perm);
	rr = vec_cmpeq(c, v_cr);
	rn = vec_cmpeq(c, v_nl);
	if(vec_any_eq(last_rr, rn))
		return p - 1;

	rn = (vector bool char)vec_sld(v0, (vector unsigned char)rn, 15);
	rr = vec_and(rr, rn); /* get mask */
	if(vec_any_ne(rr, v0))
		return p + vec_zpos(rr);

	return NULL;
}
Пример #8
0
/* more optimized version - unrolled and load-hoisted */
/// Adds a constant offset to every byte of an interleaved U Y V Y image,
/// in place, with saturation on the way back to bytes.
///
/// The offsets come from U, Y and V, which are not declared in this block
/// (presumably pix_offset members - TODO confirm). They are laid out as
/// U,Y,V,Y,U,Y,V,Y to line up with the widened pixel data.
///
/// The loop is software-pipelined over pairs of 16-byte vectors: the pair
/// for the next pass is loaded while the current pair is being stored.
/// It therefore runs one pass fewer than there are pairs, and the code
/// after the loop finishes the last, already loaded pair. Previously the
/// loop covered every pair itself, so the final preload read two vectors
/// past the end of the buffer and the code after the loop wrote 32 bytes
/// past it.
///
/// An image with no complete vector pair (xsize/16 * ysize == 0) is a no-op.
/// image.data is reinterpreted as a vector pointer, so it is presumably
/// 16-byte aligned - TODO confirm against the caller.
void pix_offset :: processYUVAltivec(imageStruct &image)
{
  // Each pass handles two vectors (32 bytes).
  const int pairsPerRow = image.xsize/16; //for altivec
  const int totalPairs = pairsPerRow * image.ysize;
  if (totalPairs <= 0) {
    return;   // nothing to process; no prefetch stream has been started yet
  }

  //format is U Y V Y
  union {
    short       elements[8];
    vector      signed short v;
  } transferBuffer;

  vector signed short c, hi, lo;
  vector signed short hi1, lo1;
  vector signed short loadhi, loadhi1, loadlo, loadlo1;
  vector unsigned char zero = vec_splat_u8(0);
  vector unsigned char *inData = (vector unsigned char*) image.data;

  //Write the pixel (pair) to the transfer buffer
  transferBuffer.elements[0] = U;
  transferBuffer.elements[1] = Y;
  transferBuffer.elements[2] = V;
  transferBuffer.elements[3] = Y;
  transferBuffer.elements[4] = U;
  transferBuffer.elements[5] = Y;
  transferBuffer.elements[6] = V;
  transferBuffer.elements[7] = Y;

  //Load it into the vector unit
  c = transferBuffer.v;

#ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( inData+16, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( inData+64, prefetchSize, 3 );
#endif

  // Prime the pipeline: expand the first pair of vectors to shorts.
  loadhi = (vector signed short) vec_mergeh( zero, inData[0] );
  loadlo = (vector signed short) vec_mergel( zero, inData[0] );

  loadhi1 = (vector signed short) vec_mergeh( zero, inData[1] );
  loadlo1 = (vector signed short) vec_mergel( zero, inData[1] );

  // totalPairs - 1 passes: each consumes the pair loaded earlier and
  // preloads the next one, which is guaranteed to be inside the buffer.
  for (int pass = 1; pass < totalPairs; pass++) {
#ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( inData+16, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( inData+64, prefetchSize, 3 );
#endif

    //add the constant to it
    hi = vec_add( loadhi, c );
    lo = vec_add( loadlo, c );

    hi1 = vec_add( loadhi1, c );
    lo1 = vec_add( loadlo1, c );

    //expand the next pair of UInt8's to short's
    loadhi = (vector signed short) vec_mergeh( zero, inData[2] );
    loadlo = (vector signed short) vec_mergel( zero, inData[2] );

    loadhi1 = (vector signed short) vec_mergeh( zero, inData[3] );
    loadlo1 = (vector signed short) vec_mergel( zero, inData[3] );

    //pack the result back down, with saturation
    inData[0] = vec_packsu( hi, lo );
    inData++;

    inData[0] = vec_packsu( hi1, lo1 );
    inData++;
  }

  //
  // finish the last pair after the loop (already loaded, nothing to preload)
  //
  hi = vec_add( loadhi, c );
  lo = vec_add( loadlo, c );

  hi1 = vec_add( loadhi1, c );
  lo1 = vec_add( loadlo1, c );

  //pack the result back down, with saturation
  inData[0] = vec_packsu( hi, lo );
  inData[1] = vec_packsu( hi1, lo1 );

#ifndef PPC970
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
#endif
}
Пример #9
0
/// Frame-difference detector for interleaved YUV data, processed two
/// 16-byte vectors per pass.
///
/// For every Y sample the absolute difference to the same sample of the
/// previous frame (kept in `buffer`) is compared with `threshold`: the
/// output Y becomes 255 where the difference is greater, 0 otherwise. All
/// output chroma bytes are set to 127. The incoming frame is copied into
/// `buffer` as the history for the next call.
///
/// `buffer` and `threshold` are not declared in this block (presumably
/// pix_movement members - TODO confirm). Both data pointers are
/// reinterpreted as vector pointers, so they are presumably 16-byte
/// aligned - TODO confirm.
void pix_movement :: processYUVAltivec(imageStruct &image)
{
    // Resize the history buffer when the pixel count changed.
    if (image.xsize*image.ysize != buffer.xsize*buffer.ysize) {
        buffer.xsize = image.xsize;
        buffer.ysize = image.ysize;
        buffer.reallocate(buffer.xsize*buffer.ysize*2);
    }

    // Number of vector pairs to process.
    const int pairCount = (image.ysize * image.xsize/8) / 2;

    // Scratch unions used to move scalar constants into vector registers.
    union {
        signed short lanes[8];
        vector signed short vec;
    } threshSeed;

    union {
        unsigned short lanes[8];
        vector unsigned short vec;
    } chromaSeed;

    // Threshold replicated into all 8 short lanes.
    threshSeed.lanes[0] = threshold;
    vector signed short limit = (vector signed short)vec_splat(threshSeed.vec, 0);

    // Constant chroma value written to every output U/V byte.
    chromaSeed.lanes[0] = 127;
    vector unsigned short chroma = (vector unsigned short)vec_splat(chromaSeed.vec, 0);

    vector unsigned char *cur  = (vector unsigned char *) image.data;   // current frame, overwritten with the result
    vector unsigned char *hist = (vector unsigned char *) buffer.data;  // previous frame, overwritten with the current one
    vector unsigned char one = vec_splat_u8(1);
    vector unsigned char pix0, pix1;
    vector unsigned short luma0, luma1, prevLuma0, prevLuma1;
    vector signed short delta0, delta1;

#ifndef PPC970
    //setup the cache prefetch -- A MUST!!!
    UInt32 startCtl = GetPrefetchConstant( 16, 0, 256 );
    vec_dst( cur, startCtl, 0 );
    vec_dst( hist, startCtl, 1 );
#endif

    const int blockSize = 16;

    for (int n = 0; n < pairCount; ++n) {
#ifndef PPC970
        UInt32 loopCtl = GetPrefetchConstant( blockSize, 0, blockSize * 16 );
        vec_dst( cur, loopCtl, 0 );
        vec_dst( hist, loopCtl, 1 );
        vec_dst( cur+16, loopCtl, 2 );
        vec_dst( hist+16, loopCtl, 3 );
#endif

        pix0 = cur[0];
        pix1 = cur[1];

        // Odd bytes (Y) of the current frame widened to shorts.
        luma0 = (vector unsigned short)vec_mulo(pix0, one);
        luma1 = (vector unsigned short)vec_mulo(pix1, one);

        // Same for the stored previous frame.
        prevLuma0 = (vector unsigned short)vec_mulo(hist[0], one);
        prevLuma1 = (vector unsigned short)vec_mulo(hist[1], one);

        //store the current pixels as the history for next time
        hist[0] = pix0;
        hist[1] = pix1;
        hist += 2;

        // |current - previous| > threshold gives an all-ones short lane.
        delta0 = vec_abs(vec_sub((vector signed short)luma0, (vector signed short)prevLuma0));
        luma0 = (vector unsigned short)vec_cmpgt(delta0, limit);

        delta1 = vec_abs(vec_sub((vector signed short)luma1, (vector signed short)prevLuma1));
        luma1 = (vector unsigned short)vec_cmpgt(delta1, limit);

        // Interleave chroma with the Y mask and pack to bytes; the all-ones
        // lanes saturate to 255.
        cur[0] = vec_packsu(vec_mergeh(chroma, luma0), vec_mergel(chroma, luma0));
        cur[1] = vec_packsu(vec_mergeh(chroma, luma1), vec_mergel(chroma, luma1));
        cur += 2;
    }

#ifndef PPC970
    vec_dss(0);
    vec_dss(1);
    vec_dss(2);
    vec_dss(3);
#endif
}
Пример #10
0
/// Difference of two interleaved U Y V Y images, written into `image`,
/// one 16-byte vector at a time.
///
/// Bytes are widened to signed shorts. The constant 128 is subtracted from
/// the even (U/V) lanes of both operands, the operands are subtracted,
/// 128 is added back to the even lanes, the absolute value is taken and
/// the result is packed to unsigned bytes with saturation.
///
/// Each row is walked as image.xsize/8 vectors. Both data pointers are
/// reinterpreted as vector pointers, so they are presumably 16-byte
/// aligned - TODO confirm against the caller.
void pix_diff :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    const long vectorsPerRow = image.xsize/8;

    //format is U Y V Y
    union {
        short lanes[8];
        vector short vec;
    } biasSeed;

    vector signed short chromaBias, imgHi, imgLo, rhsHi, rhsLo;
    vector unsigned char zero = vec_splat_u8(0);
    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

    // 128 in the even (U/V) lanes, 0 in the odd (Y) lanes.
    for (int k = 0; k < 8; k += 2) {
        biasSeed.lanes[k]     = 128;
        biasSeed.lanes[k + 1] = 0;
    }
    chromaBias = biasSeed.vec;

#ifndef PPC970
    UInt32 streamCtl = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, streamCtl, 0 );
    vec_dst( src, streamCtl, 1 );
#endif

    for (long row = 0; row < image.ysize; ++row) {
        for (long col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
            vec_dst( dst, streamCtl, 0 );
            vec_dst( src, streamCtl, 1 );
#endif
            // Widen the 16 bytes of each operand into two vectors of shorts.
            imgHi = (vector signed short) vec_mergeh( zero, *dst );
            imgLo = (vector signed short) vec_mergel( zero, *dst );
            rhsHi = (vector signed short) vec_mergeh( zero, *src );
            rhsLo = (vector signed short) vec_mergel( zero, *src );

            // Remove the 128 offset from the U/V lanes of both operands.
            imgHi = vec_subs(imgHi, chromaBias);
            imgLo = vec_subs(imgLo, chromaBias);
            rhsHi = vec_subs(rhsHi, chromaBias);
            rhsLo = vec_subs(rhsLo, chromaBias);

            // Difference, then restore the 128 offset on the U/V lanes.
            imgHi = vec_adds(vec_subs(imgHi, rhsHi), chromaBias);
            imgLo = vec_adds(vec_subs(imgLo, rhsLo), chromaBias);

            imgHi = vec_abs(imgHi);
            imgLo = vec_abs(imgLo);

            *dst = vec_packsu(imgHi, imgLo);
        }
#ifndef PPC970
        // Stop both prefetch streams at the end of every row.
        vec_dss( 0 );
        vec_dss( 1 );
#endif
    }
}
Пример #11
0
/// Per-pixel selection between two interleaved YUV images based on their
/// Y values; the result is written into `image`.
///
/// When m_direction is non-zero, a lane keeps the `image` pixel where its
/// Y is greater than the `right` Y and takes the `right` pixel otherwise.
/// When m_direction is zero the test is "less than" instead. The U/V lane
/// in the same short position follows the same mask as the Y lane.
///
/// Reports an error and returns without touching the data when
/// xsize * ysize is not a multiple of 16. Both data pointers are
/// reinterpreted as vector pointers, so they are presumably 16-byte
/// aligned - TODO confirm against the caller.
void pix_compare :: processYUV_Altivec(imageStruct &image, imageStruct &right)
{
    const int rows = image.ysize;
    const int vectorsPerRow = image.xsize/8;

    //check to see if the buffer isn't 16byte aligned (highly unlikely)
    if ((image.ysize*image.xsize % 16) != 0) {
        error("image not properly aligned for Altivec");
        return;
    }

    vector unsigned short lhsUV, lhsY, rhsUV, rhsY;
    vector unsigned short mergedHi, mergedLo;
    vector bool short keepLhs;
    vector unsigned char one = vec_splat_u8(1);

    vector unsigned char *dst = (vector unsigned char*) image.data;
    vector unsigned char *src = (vector unsigned char*) right.data;

#ifndef PPC970
    //setup the cache prefetch -- A MUST!!!
    UInt32 streamCtl = GetPrefetchConstant( 16, 1, 256 );
    vec_dst( dst, streamCtl, 0 );
    vec_dst( src, streamCtl, 1 );
#endif

    if (m_direction) {
        // Keep the image pixel where its Y is greater than the right Y.
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
                vec_dst( dst, streamCtl, 0 );
                vec_dst( src, streamCtl, 1 );
#endif
                // Even bytes (U/V) and odd bytes (Y) widened to shorts.
                lhsUV = (vector unsigned short)vec_mule(one, *dst);
                rhsUV = (vector unsigned short)vec_mule(one, *src);
                lhsY  = (vector unsigned short)vec_mulo(one, *dst);
                rhsY  = (vector unsigned short)vec_mulo(one, *src);

                keepLhs = vec_cmpgt(lhsY, rhsY);

                // Mask set selects the image value, clear selects right.
                lhsY  = vec_sel(rhsY, lhsY, keepLhs);
                lhsUV = vec_sel(rhsUV, lhsUV, keepLhs);

                // Re-interleave and pack back down to bytes.
                mergedHi = vec_mergeh(lhsUV, lhsY);
                mergedLo = vec_mergel(lhsUV, lhsY);
                *dst = vec_packsu(mergedHi, mergedLo);
            }
#ifndef PPC970
            vec_dss(1);
            vec_dss(0);
#endif
        }
    } else {
        // Keep the image pixel where its Y is less than the right Y.
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < vectorsPerRow; ++col, ++dst, ++src) {
#ifndef PPC970
                vec_dst( dst, streamCtl, 0 );
                vec_dst( src, streamCtl, 1 );
#endif
                lhsUV = (vector unsigned short)vec_mule(one, *dst);
                rhsUV = (vector unsigned short)vec_mule(one, *src);
                lhsY  = (vector unsigned short)vec_mulo(one, *dst);
                rhsY  = (vector unsigned short)vec_mulo(one, *src);

                keepLhs = vec_cmplt(lhsY, rhsY);

                lhsY  = vec_sel(rhsY, lhsY, keepLhs);
                lhsUV = vec_sel(rhsUV, lhsUV, keepLhs);

                mergedHi = vec_mergeh(lhsUV, lhsY);
                mergedLo = vec_mergel(lhsUV, lhsY);
                *dst = vec_packsu(mergedHi, mergedLo);
            }
#ifndef PPC970
            vec_dss(1);
            vec_dss(0);
#endif
        }
    }
}
Пример #12
0
/* start of optimized motionblur */
/// Blends the incoming interleaved U Y V Y frame with the stored previous
/// result (m_savedImage) and writes the blend to both `image` and
/// m_savedImage, one 16-byte vector at a time.
///
/// Per sample: out = (m_blur0 * in + m_blur1 * saved) >> 8, where the U/V
/// samples have 128 subtracted before the multiply and added back after.
/// The result is packed to unsigned bytes with saturation.
///
/// m_savedImage is resized to match `image`; when that reallocation
/// returns a different data pointer, the history is reset to black.
///
/// The load for the next vector is hoisted above the arithmetic of the
/// current one. That preload is now skipped on the final vector, and an
/// image with no complete vector returns early; previously both cases read
/// 16 bytes past the end of each buffer.
///
/// Both data pointers are reinterpreted as vector pointers, so they are
/// presumably 16-byte aligned - TODO confirm.
void pix_motionblur :: processYUVAltivec(imageStruct &image)
{
  signed short rightGain,imageGain;
  unsigned char *saved = m_savedImage.data;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();
  if(saved!=m_savedImage.data) {
    m_savedImage.setBlack();
  }
  saved=m_savedImage.data;

  // Number of 16-byte vectors per row and in total.
  const int width = image.xsize/8;
  const int totalVectors = width * image.ysize;
  if (totalVectors <= 0) {
    return;   // nothing to blend; no prefetch stream has been started yet
  }

  /*
  // hmm: why does it read 235 ?
  rightGain = (signed short)(235. * m_motionblur);
  imageGain = (signed short) (255. - (235. * m_motionblur));
  */
  rightGain = m_blur1;
  imageGain = m_blur0;

  // Scratch unions used to move scalar constants into vector registers.
  union {
    signed short        elements[8];
    vector      signed short v;
  } shortBuffer;

  union {
    unsigned int        elements[4];
    vector      unsigned int v;
  } bitBuffer;

  vector signed short hiImage, loImage, hiRight, loRight, YImage, UVImage;
  vector unsigned char loadImage, loadRight;
  vector unsigned char zero = vec_splat_u8(0);
  vector signed int UVhi,UVlo,Yhi,Ylo;
  vector signed int UVhiR,UVloR,YhiR,YloR;
  vector signed short gainSub,gain,gainR;
  vector unsigned int bitshift;
  vector unsigned char *inData = (vector unsigned char*) image.data;
  vector unsigned char *rightData = (vector unsigned char*) saved;

  // 128 in the even (U/V) lanes, 0 in the odd (Y) lanes.
  shortBuffer.elements[0] = 128;
  shortBuffer.elements[1] = 0;
  shortBuffer.elements[2] = 128;
  shortBuffer.elements[3] = 0;
  shortBuffer.elements[4] = 128;
  shortBuffer.elements[5] = 0;
  shortBuffer.elements[6] = 128;
  shortBuffer.elements[7] = 0;

  gainSub = shortBuffer.v;

  // Gain for the incoming frame, replicated into all lanes.
  shortBuffer.elements[0] = imageGain;
  gain = shortBuffer.v;
  gain =  vec_splat(gain, 0 );

  // Gain for the stored frame, replicated into all lanes.
  shortBuffer.elements[0] = rightGain;
  gainR = shortBuffer.v;
  gainR =  vec_splat(gainR, 0 );

  // Shift count 8 in every lane: the products are scaled down by 256.
  bitBuffer.elements[0] = 8;
  bitshift = bitBuffer.v;
  bitshift = vec_splat(bitshift,0);

# ifndef PPC970
  UInt32 prefetchSize = GetPrefetchConstant( 16, 1, 256 );
  vec_dst( inData, prefetchSize, 0 );
  vec_dst( rightData, prefetchSize, 1 );
  vec_dst( inData+32, prefetchSize, 2 );
  vec_dst( rightData+32, prefetchSize, 3 );
# endif

  // Prime the pipeline with the first vector of each buffer.
  loadImage = inData[0];
  loadRight = rightData[0];

  for (int n = 0; n < totalVectors; n++) {
# ifndef PPC970
    vec_dst( inData, prefetchSize, 0 );
    vec_dst( rightData, prefetchSize, 1 );
    vec_dst( inData+32, prefetchSize, 2 );
    vec_dst( rightData+32, prefetchSize, 3 );
# endif
    //interleaved U Y V Y chars

    hiImage = (vector signed short) vec_mergeh( zero, loadImage );
    loImage = (vector signed short) vec_mergel( zero, loadImage );

    hiRight = (vector signed short) vec_mergeh( zero, loadRight );
    loRight = (vector signed short) vec_mergel( zero, loadRight );

    //hoist that load!! (but never past the last vector of the buffers)
    if (n + 1 < totalVectors) {
      loadImage = inData[1];
      loadRight = rightData[1];
    }

    //subtract 128 from UV
    hiImage = vec_subs(hiImage,gainSub);
    loImage = vec_subs(loImage,gainSub);

    hiRight = vec_subs(hiRight,gainSub);
    loRight = vec_subs(loRight,gainSub);

    //now vec_mule the UV into two vector ints
    UVhi = vec_mule(gain,hiImage);
    UVlo = vec_mule(gain,loImage);

    UVhiR = vec_mule(gainR,hiRight);
    UVloR = vec_mule(gainR,loRight);

    //now vec_mulo the Y into two vector ints
    Yhi = vec_mulo(gain,hiImage);
    Ylo = vec_mulo(gain,loImage);

    YhiR = vec_mulo(gainR,hiRight);
    YloR = vec_mulo(gainR,loRight);

    //this is where to do the add and bitshift due to the resolution
    //add UV
    UVhi = vec_adds(UVhi,UVhiR);
    UVlo = vec_adds(UVlo,UVloR);

    Yhi = vec_adds(Yhi,YhiR);
    Ylo = vec_adds(Ylo,YloR);

    //bitshift UV
    UVhi = vec_sra(UVhi,bitshift);
    UVlo = vec_sra(UVlo,bitshift);

    Yhi = vec_sra(Yhi,bitshift);
    Ylo = vec_sra(Ylo,bitshift);

    //pack the UV into a single short vector
    UVImage =  vec_packs(UVhi,UVlo);

    //pack the Y into a single short vector
    YImage =  vec_packs(Yhi,Ylo);

    //vec_mergel + vec_mergeh Y and UV
    hiImage =  vec_mergeh(UVImage,YImage);
    loImage =  vec_mergel(UVImage,YImage);

    //add 128 offset back
    hiImage = vec_adds(hiImage,gainSub);
    loImage = vec_adds(loImage,gainSub);

    // Store the blend as the new history and as the output frame.
    rightData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);
    inData[0] = (vector unsigned char)vec_packsu(hiImage, loImage);

    inData++;
    rightData++;
  }
# ifndef PPC970
  //stop the cache streams
  vec_dss( 0 );
  vec_dss( 1 );
  vec_dss( 2 );
  vec_dss( 3 );
# endif


}/* end of working altivec function */
Пример #13
0
/*
 * subsample_image_altivec
 *
 * Builds two reduced copies of `image` with AltiVec:
 *   - sub22_image: every 2x2 block of source bytes is replaced by its
 *     rounded average, (a + b + c + d + 2) >> 2;
 *   - sub44_image: every 2x2 block of those sub22 values is averaged again
 *     with the same rounding (so it is an average of already-rounded
 *     averages, not of the 16 raw bytes).
 *
 * The parameter list comes from SUBSAMPLE_IMAGE_PDECL (defined elsewhere);
 * the body uses `image`, `rowstride`, `sub22_image` and `sub44_image`.
 *
 * Work is done in tiles of 64 bytes x 4 rows.  One pass of the inner loop
 * reads four 16-byte columns from each of four source rows and writes
 * 32 bytes to each of two sub22 rows (row pitch = rowstride / 2) and
 * 16 bytes to one sub44 row.  The loop is software-pipelined: the loads for
 * the next column are issued before the arithmetic on the current column
 * has finished, so statement order inside the loop is deliberate.
 *
 * The row count is not passed in: it is derived from the distance between
 * `image` and `sub22_image`, which presumably means sub22_image is laid out
 * directly after the full-size image in the same buffer -- TODO confirm
 * against the caller.
 *
 * NOTE(review): both loops are do/while, so they assume at least one trip.
 * If rowstride < 64 (ii == 0) or fewer than 4 rows are derived (j == 0) the
 * counters are decremented past zero and the loops keep running.  A
 * rowstride of 0 also divides by zero when j is computed.
 */
void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
{
    /* NOTE(review): stride4 is assigned below but never read. */
    int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
    unsigned char *pB, *pB2, *pB4;      /* cursors: source, sub22, sub44 */
    vector unsigned char l0, l1, l2, l3; /* four consecutive source rows */
    vector unsigned short s0, s1, s2, s3;
    vector unsigned short s22_0, s22_1, s22_2, s22_3;
    vector unsigned short s44, s44_0, s44_1;
    vector unsigned short zero, two;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY
    /*
     * Optional sanity checks: the three buffers must pass
     * NOT_VECTOR_ALIGNED (16-byte alignment, per the messages) and
     * rowstride must be a multiple of 64.
     * NOTE(review): the "%d" conversions below are handed the buffer
     * arguments themselves, which are used as pointers further down;
     * that looks like a format/argument mismatch -- confirm.
     */
    if (NOT_VECTOR_ALIGNED(image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "image", 16, image);
    if (NOT_VECTOR_ALIGNED(sub22_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub22_image", 16, sub22_image);
    if (NOT_VECTOR_ALIGNED(sub44_image))
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "sub44_image", 16, sub44_image);

    if ((rowstride & 63) != 0)
	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
	    "rowstride", 64, rowstride);
#endif

    /* AMBER_START / AMBER_STOP are project macros, presumably
     * profiling/trace markers -- see their definition. */
    AMBER_START;

    pB = image;

#ifdef ALTIVEC_DST
    /* Start data-stream prefetch on channel 0 from the top of the image.
     * The meaning of DATA_STREAM_CONTROL's arguments is defined by that
     * macro (not visible here); the stride field is set to one row. */
    dsc.control = DATA_STREAM_CONTROL(6,4,0);
    dsc.block.stride = rowstride;

    vec_dst(pB, dsc.control, 0);
#endif

    pB2 = sub22_image;
    pB4 = sub44_image;

    /* Number of 4-row bands: rows between image and sub22_image, / 4. */
    j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */

    /* Byte offsets of rows 1..3 relative to the current row. */
    stride1 = rowstride;
    stride2 = stride1 + stride1;
    stride3 = stride2 + stride1;
    stride4 = stride2 + stride2;
    halfstride = stride1 >> 1; /* /2 */

    /* Inner-loop trip count: one trip per 64 source bytes of a row. */
    ii = rowstride >> 6; /* rowstride/16/4 */

    zero = vec_splat_u16(0);
    two = vec_splat_u16(2); /* used both as rounding term and shift count */

    do {
	i = ii;
	do {
	    /* Load column 0 of this 64-byte tile (16 bytes from 4 rows). */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;
#ifdef ALTIVEC_DST
	    /* Re-issue the prefetch hint ahead of the read cursor. */
	    vec_dst(pB + (16 * 3), dsc.control, 0);
#endif

	    /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
	    /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
	    /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
	    /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */

	    /*
	     * Interleaving two rows as 16-bit units puts each 2x2 pixel
	     * block into one 32-bit word; vec_sum4s then adds the four
	     * bytes of every word, giving the 2x2 sums.
	     */

	    /* s0 = 0x[00,01,      02,03,      04,05,      06,07,     ] */
	    /*        [      10,11,      12,13,      14,15,      16,17] */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));

	    /* s1 = 0x[08,09,      0A,0B,      0C,0D,      0E,0F,     ] */
	    /*        [      18,19,      1A,1B,      1C,1D,      1E,1F] */
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));

	    /* s2 = 0x[20,21,      22,23,      24,25,      26,27,     ] */
	    /*        [      30,31,      32,33,      34,35,      36,37] */
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));

	    /* s3 = 0x[28,29,      2A,2B,      2C,2D,      2E,2F,     ] */
	    /*        [      38,39,      3A,3B,      3C,3D,      3E,3F] */
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next block */
	    /* (column 1; l0-l3 are free now that s0-s3 hold column 0) */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;

	    /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
	    /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
	    /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
	    /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */

	    /* Narrow the 32-bit sums to 16 bits: 8 sums per vector. */
	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
	    s22_0 = vec_add(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
	    s22_1 = vec_add(s22_1, two);

	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
	    s22_0 = vec_sra(s22_0, two);
	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
	    s22_1 = vec_sra(s22_1, two);

	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
	    /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
	    /* Vertical half of the 4x4 sum: upper sub22 row + lower one. */
	    s44_0 = vec_add(s22_0, s22_1);

	    /* Horizontal half: add adjacent 16-bit pairs into 32-bit words. */
	    /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    /* Column 1: same 2x2 sums as above, results go to s22_2/s22_3. */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    /* (column 2) */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);


	    /* 4x4 partial sums for column 1. */
	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    /* Columns 0 and 1 narrowed to bytes: 16 sub22 pixels for the
	     * upper output row (offset 0) and 16 for the lower one
	     * (offset halfstride). */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* - - - - - - - - - - - - - - - - - - - */
	    /* Column 2: 2x2 sums; s22_0/s22_1 are reused after the store. */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    /* start loading next l[0-3] */
	    /* (column 3, the last of this tile) */
	    l0 = vec_ld(0, pB);
	    l1 = vec_ld(stride1, pB);
	    l2 = vec_ld(stride2, pB);
	    l3 = vec_ld(stride3, pB);
	    pB += 16;


	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
	    s22_1 = vec_packsu(vu32(s2), vu32(s3));

	    s22_0 = vec_add(s22_0, two);
	    s22_1 = vec_add(s22_1, two);

	    s22_0 = vec_sra(s22_0, two);
	    s22_1 = vec_sra(s22_1, two);


	    /* Finish sub44 for columns 0-1 before s44_0 is overwritten:
	     * narrow to 16 bits, add the rounding term, divide by 4. */
	    s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44 = vec_add(s44, two);
	    s44 = vec_sra(s44, two);

	    /* 4x4 partial sums for column 2. */
	    s44_0 = vec_add(s22_0, s22_1);
	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));

	    /* - - - - - - - - - - - - - - - - - - - */
	    /* Column 3: no further load follows inside this iteration. */
	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));

	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
	    s22_3 = vec_packsu(vu32(s2), vu32(s3));

	    s22_2 = vec_add(s22_2, two);
	    s22_3 = vec_add(s22_3, two);

	    s22_2 = vec_sra(s22_2, two);
	    s22_3 = vec_sra(s22_3, two);

	    /* 4x4 partial sums for column 3. */
	    s44_1 = vec_add(s22_2, s22_3);
	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));

	    /* store s22 block */
	    /* Columns 2 and 3: second 16 sub22 bytes of both output rows. */
	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
	    vec_st(vu8(s22_0), 0, pB2);
	    vec_st(vu8(s22_1), halfstride, pB2);
	    pB2 += 16;

	    /* pack all four s44 chunks */
	    /* Round columns 2-3, then narrow both halves to bytes: 16 sub44
	     * pixels covering the whole 64x4 source tile. */
	    s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
	    s44_0 = vec_add(s44_0, two);
	    s44_0 = vec_sra(s44_0, two);
	    s44 = vu16(vec_packsu(s44, s44_0));

	    vec_st(vu8(s44), 0, pB4);
	    pB4 += 16;

	} while (--i);

	/*
	 * The inner loop advanced pB by ii * 64 bytes and pB2 by ii * 32
	 * bytes (one full source row / one sub22 row when rowstride is a
	 * multiple of 64).  Skip the three source rows already consumed and
	 * the second sub22 row already written via the halfstride offset.
	 */
	pB += stride3;
	pB2 += halfstride;

    } while (--j);

#ifdef ALTIVEC_DST
    /* Stop the prefetch stream started on channel 0. */
    vec_dss(0);
#endif

    AMBER_STOP;
}