예제 #1
0
/* Compares the result of test () on two fixed 8-byte vectors with a scalar
   reference: each expected byte is the difference of the matching input
   bytes, and any difference outside 0..255 is replaced by 0.  Calls abort ()
   when the two 64-bit values differ.  */
static void
TEST (void)
{
  __m64_union lhs, rhs;
  __m64_union actual, expected;
  int lane;

  lhs.as_m64 = _mm_set_pi8 (30, 2, 3, 4, 10, 20, 30, 90);
  rhs.as_m64 = _mm_set_pi8 (88, 44, 3, 22, 11, 98, 76, 100);
  actual.as_m64 = test (lhs.as_m64, rhs.as_m64);

  for (lane = 0; lane < 8; lane++)
    {
      int diff = lhs.as_char[lane] - rhs.as_char[lane];

      /* Negative and over-range differences both end up as 0.  */
      expected.as_char[lane] = (diff < 0 || diff > 255) ? 0 : diff;
    }

  if (actual.as_m64 != expected.as_m64)
    abort ();
}
예제 #2
0
/**
 * MMX background subtraction for images handled in 4-byte groups.
 *
 * A reference copy of the frame lives in m_savedImage; it is (re)captured
 * whenever m_reset is set or the incoming size/format differs from the saved
 * one.  The image is then processed in place, 8 bytes per step: a byte counts
 * as "unchanged" when min(|new - old| + 1, 255) does not exceed its range
 * value, and every 32-bit group whose four bytes are all unchanged is
 * overwritten with zero.  All other groups keep their incoming value.
 *
 * Range bytes are laid out, from the lowest address upwards, as
 * m_Arange, m_Vrange, m_Urange, m_Yrange (twice per 8-byte step).
 *
 * The original author noted that this path measured no faster than the
 * generic code.
 *
 * NOTE(review): the step count is rounded up, so when the byte count is not
 * a multiple of 8 the last step touches up to 7 bytes past it in both
 * buffers -- confirm the buffers are padded accordingly.
 */
void pix_background :: processRGBAMMX(imageStruct &image)
{
  const long byteCount = image.xsize * image.ysize * image.csize;

  const bool geometryChanged = (m_savedImage.xsize  != image.xsize ||
                                m_savedImage.ysize  != image.ysize ||
                                m_savedImage.format != image.format);
  if (geometryChanged) {
    m_reset = 1;
  }

  // make the reference image match the incoming one
  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset) {
    memcpy(m_savedImage.data, image.data, byteCount);
  }
  m_reset = 0;

  // number of 8-byte steps, rounded up
  long steps = byteCount / sizeof(__m64) + (byteCount % sizeof(__m64) != 0);

  __m64 *cur = (__m64*)image.data;
  __m64 *ref = (__m64*)m_savedImage.data;

  const __m64 range = _mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
                                  m_Yrange, m_Urange, m_Vrange, m_Arange);
  const __m64 one   = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 zero  = _mm_setzero_si64();

  for (; steps; --steps, ++cur, ++ref) {
    const __m64 fresh = *cur;
    const __m64 stale = *ref;

    // |fresh - stale| per byte: one of the two saturated differences is 0
    __m64 dist = _mm_or_si64(_mm_subs_pu8(fresh, stale),
                             _mm_subs_pu8(stale, fresh));

    // (dist + 1) - range, saturating at both ends: 0 means "within range"
    dist = _mm_subs_pu8(_mm_adds_pu8(dist, one), range);

    // all-ones for each 32-bit group whose four bytes are all within range
    const __m64 unchanged = _mm_cmpeq_pi32(dist, zero);

    // blank the unchanged groups, keep the rest
    *cur = _mm_andnot_si64(unchanged, fresh);
  }
  _mm_empty();
}
예제 #3
0
/* Compares the result of test () on two fixed 8-byte vectors with a scalar
   reference that picks, for every byte, the larger of the two inputs when
   both are compared as unsigned char (a tie takes the second input's byte).
   Calls abort () when the two 64-bit values differ.  */
static void TEST (void)
{
  __m64_union lhs, rhs, actual, expected;
  int lane;

  lhs.as_m64 = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
  rhs.as_m64 = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  actual.as_m64 = test (lhs.as_m64, rhs.as_m64);

  for (lane = 0; lane < 8; lane++)
    {
      unsigned char a = (unsigned char) lhs.as_char[lane];
      unsigned char b = (unsigned char) rhs.as_char[lane];

      if (a > b)
        expected.as_char[lane] = lhs.as_char[lane];
      else
        expected.as_char[lane] = rhs.as_char[lane];
    }

  if (actual.as_m64 != expected.as_m64)
    abort ();
}
예제 #4
0
/**
 * MMX movement detection on single-byte-per-pixel data.
 *
 * `buffer` holds the previous frame and `buffer2` receives the result: each
 * output byte is 0xFF where |current - previous| exceeds `threshold` and 0x00
 * otherwise.  `buffer` is updated with the current frame, and on return
 * image.data points at buffer2.data.
 *
 * When the pixel count differs from the one stored in `buffer`, the previous
 * frame is re-initialised through buffer.setWhite().
 *
 * Pixels are processed 8 at a time; the count is truncated, so up to 7
 * trailing pixels are neither compared nor written.
 */
void pix_movement :: processGrayMMX(imageStruct &image)
{
  // assume that the pix_size does not change !
  bool doclear=(image.xsize*image.ysize != buffer.xsize*buffer.ysize);
  buffer.xsize = image.xsize;
  buffer.ysize = image.ysize;
  buffer.reallocate();
  if(doclear) {
    buffer.setWhite();
  }
  buffer2.xsize = image.xsize;
  buffer2.ysize = image.ysize;
  buffer2.reallocate();

  int pixsize = image.ysize * image.xsize / sizeof(__m64);

  unsigned char thresh=threshold;

  __m64*rp = (__m64*)image.data; // read pointer
  __m64*wp = (__m64*)buffer.data; // write pointer to the copy
  __m64*wp2= (__m64*)buffer2.data;      // write pointer to the diff-image

  __m64 m1, m2, grey;
  const __m64 zero   =_mm_setzero_si64();
  const __m64 allset =_mm_cmpeq_pi8(zero, zero); // 0xFF in every byte
  const __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                                  thresh,thresh,thresh,thresh);

  while(pixsize--) {
    grey = rp[pixsize]; // image.data
    m2   = wp[pixsize]; // buffer.data

    // saturated arithmetic: one of the two differences is always 0,
    // so OR-ing them yields |grey-m2| per byte
    m1 =_mm_subs_pu8 (grey, m2); // (grey-m2)
    m2 =_mm_subs_pu8 (m2, grey); // (m2-grey)
    wp[pixsize]=grey; // buffer.data

    m2 = _mm_or_si64 (m2, m1); // |grey-m2|

    // non-zero exactly where the difference exceeds the threshold
    m2 =_mm_subs_pu8 (m2, thresh8);

    // The byte compare-greater-than is signed, so an excess of 128..255
    // would read as negative and be reported as "no movement".  Test for
    // equality with zero instead and invert the mask.
    m1 =_mm_cmpeq_pi8(m2, zero);          // 0xFF where |diff| <= thresh

    wp2[pixsize]=_mm_andnot_si64(m1, allset);  // output.data: 0xFF where |diff| > thresh
  }
  _mm_empty();
  image.data = buffer2.data;
}
예제 #5
0
// Codegen test: forwards eight char arguments to _mm_set_pi8 and returns the
// resulting __m64.
// The directive comments inside the body describe the expected compiler
// output (eight insertelement operations on an <8 x i8> vector); they are
// presumably consumed by LLVM FileCheck -- confirm against the file's RUN
// line -- so they must stay exactly as written.
__m64 test_mm_set_pi8(char a, char b, char c, char d, char e, char f, char g, char h) {
  // CHECK-LABEL: test_mm_set_pi8
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  return _mm_set_pi8(a, b, c, d, e, f, g, h);
}
예제 #6
0
/* One lane of the vertical 6-tap filter: A + F - 5*(B+E) + 20*(C+D) + 512.
 * The (B+E) and (C+D) sums are narrowed to short before being scaled, and the
 * int total is converted to unsigned int on return. */
static unsigned int interpolvline64_2_tap6(short A, short B, short C, short D, short E, short F)
{
	return A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
}

/*
 * Vertical 6-tap interpolation over four 16-bit lanes.
 *
 * temp[0]..temp[5] supply the six filter taps; lane k of every tap is read as
 * a signed 16-bit value.  For each lane the filter sum is shifted right by 10
 * and passed through CLIP255_16 (defined elsewhere; presumably a clamp to
 * 0..255 -- confirm).  The four lane results are then narrowed to bytes and
 * returned packed into one 32-bit value, lane 0 in the least significant
 * byte.  empty() is called before returning.
 */
unsigned int interpolvline64_2_mmx(__m64* temp){

	__m64 lanes = _mm_setzero_si64();
	__m64 packed;
	unsigned int acc;

	/* lane 0 */
	acc = interpolvline64_2_tap6(_mm_extract_pi16(temp[0],0), _mm_extract_pi16(temp[1],0),
	                             _mm_extract_pi16(temp[2],0), _mm_extract_pi16(temp[3],0),
	                             _mm_extract_pi16(temp[4],0), _mm_extract_pi16(temp[5],0));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),0);

	/* lane 1 */
	acc = interpolvline64_2_tap6(_mm_extract_pi16(temp[0],1), _mm_extract_pi16(temp[1],1),
	                             _mm_extract_pi16(temp[2],1), _mm_extract_pi16(temp[3],1),
	                             _mm_extract_pi16(temp[4],1), _mm_extract_pi16(temp[5],1));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),1);

	/* lane 2 */
	acc = interpolvline64_2_tap6(_mm_extract_pi16(temp[0],2), _mm_extract_pi16(temp[1],2),
	                             _mm_extract_pi16(temp[2],2), _mm_extract_pi16(temp[3],2),
	                             _mm_extract_pi16(temp[4],2), _mm_extract_pi16(temp[5],2));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),2);

	/* lane 3 */
	acc = interpolvline64_2_tap6(_mm_extract_pi16(temp[0],3), _mm_extract_pi16(temp[1],3),
	                             _mm_extract_pi16(temp[2],3), _mm_extract_pi16(temp[3],3),
	                             _mm_extract_pi16(temp[4],3), _mm_extract_pi16(temp[5],3));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),3);

	/* narrow the four lanes to bytes 0..3; bytes 4..7 stay zero */
	packed = _mm_set_pi8(0,0,0,0,
	                     _mm_extract_pi16(lanes,3),_mm_extract_pi16(lanes,2),
	                     _mm_extract_pi16(lanes,1),_mm_extract_pi16(lanes,0));
	acc = _mm_cvtsi64_si32(packed);
	empty();
	return acc;

}
예제 #7
0
/**
 * MMX background subtraction for single-byte-per-pixel images.
 *
 * A reference copy of the frame lives in m_savedImage; it is (re)captured
 * whenever m_reset is set or the incoming size/format differs from the saved
 * one.  The image is then processed in place, 8 bytes per step: a pixel whose
 * absolute difference to the reference is below m_Yrange is set to 0, every
 * other pixel keeps its incoming value.  With m_Yrange == 0 nothing can be
 * below the range, so the image is returned untouched.
 *
 * NOTE(review): the step count is rounded up, so when the byte count is not
 * a multiple of 8 the last step touches up to 7 bytes past it in both
 * buffers -- confirm the buffers are padded accordingly.
 */
void pix_background :: processGrayMMX(imageStruct &image){
  long i;
  long pixsize;

  pixsize = image.xsize * image.ysize * image.csize;
  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;
  if(m_Yrange==0)return;

  __m64*npixes=(__m64*)image.data;
  __m64*opixes=(__m64*)m_savedImage.data;
  __m64 newpix, oldpix, m1;

  // subtracting (range-1) leaves a non-zero byte exactly where |diff| >= range
  unsigned char thresh=m_Yrange-1;
  const __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                                  thresh,thresh,thresh,thresh);
  const __m64 zero=_mm_setzero_si64();

  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
  while(i--){
    newpix=npixes[i];
    oldpix=opixes[i];

    m1    = _mm_subs_pu8 (newpix, oldpix);
    oldpix= _mm_subs_pu8 (oldpix, newpix);
    m1    = _mm_or_si64  (m1, oldpix); // |oldpix-newpix|
    m1    = _mm_subs_pu8 (m1, thresh8);

    // The byte compare-greater-than is signed, so an excess of 128..255
    // would read as negative and the pixel would be blanked as background.
    // Build the "unchanged" mask with an equality test against zero instead.
    m1    = _mm_cmpeq_pi8(m1, zero);   // 0xFF where |oldpix-newpix|<=thresh8
    npixes[i] = _mm_andnot_si64(m1, newpix); // keep only the changed pixels
  }
  _mm_empty();
}
예제 #8
0
/**
 * MMX background subtraction for images handled in 4-byte groups, replacing
 * the background with a fixed "black" pattern.
 *
 * A reference copy of the frame lives in m_savedImage; it is (re)captured
 * whenever m_reset is set or the incoming size/format differs from the saved
 * one.  The image is then processed in place, 8 bytes per step: a byte counts
 * as "unchanged" when min(|new - old| + 1, 255) does not exceed its range
 * value, and every 32-bit group whose four bytes are all unchanged is
 * replaced by the matching half of `black`.  All other groups keep their
 * incoming value.
 *
 * Byte layout from the lowest address upwards (repeated twice per step):
 *   range: m_Yrange, m_Vrange, m_Yrange, m_Urange
 *   black: 0x80,     0x00,     0x80,     0x00
 *
 * NOTE(review): `black` puts the 0x80 (neutral-looking) value on even bytes
 * while `range` puts m_Yrange there -- the two constants seem to disagree on
 * which bytes are luma; confirm against the image's channel order.
 *
 * NOTE(review): the step count is rounded up, so when the byte count is not
 * a multiple of 8 the last step touches up to 7 bytes past it in both
 * buffers -- confirm the buffers are padded accordingly.
 */
void pix_background :: processYUVMMX(imageStruct &image)
{
  const long byteCount = image.xsize * image.ysize * image.csize;

  const bool geometryChanged = (m_savedImage.xsize  != image.xsize ||
                                m_savedImage.ysize  != image.ysize ||
                                m_savedImage.format != image.format);
  if (geometryChanged) {
    m_reset = 1;
  }

  // make the reference image match the incoming one
  m_savedImage.xsize = image.xsize;
  m_savedImage.ysize = image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset) {
    memcpy(m_savedImage.data, image.data, byteCount);
  }
  m_reset = 0;

  // number of 8-byte steps, rounded up
  int steps = byteCount / sizeof(__m64) + (byteCount % sizeof(__m64) != 0);

  __m64 *cur = (__m64*)image.data;
  __m64 *ref = (__m64*)m_savedImage.data;

  const __m64 range = _mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
                                  m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 one   = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 black = _mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
                                  (unsigned char)0x00, (unsigned char)0x80,
                                  (unsigned char)0x00, (unsigned char)0x80,
                                  (unsigned char)0x00, (unsigned char)0x80);
  const __m64 zero  = _mm_setzero_si64();

  for (; steps; --steps, ++cur, ++ref) {
    const __m64 fresh = *cur;
    const __m64 stale = *ref;

    // |fresh - stale| per byte: one of the two saturated differences is 0
    __m64 dist = _mm_or_si64(_mm_subs_pu8(fresh, stale),
                             _mm_subs_pu8(stale, fresh));

    // adding 1 first makes a range of 0 mark nothing as unchanged;
    // after the saturating subtraction 0 means "within range"
    dist = _mm_subs_pu8(_mm_adds_pu8(dist, one), range);

    // all-ones for each 32-bit group whose four bytes are all within range
    const __m64 unchanged = _mm_cmpeq_pi32(dist, zero);

    // select: black where unchanged, incoming data elsewhere
    *cur = _mm_or_si64(_mm_andnot_si64(unchanged, fresh),
                       _mm_and_si64(black, unchanged));
  }
  _mm_empty();
}
예제 #9
0
/**
 * Convert packed UYVY (U0 Y0 V0 Y1 ...) video to planar YUV 4:2:2 using MMX.
 *
 * The output buffer receives a width*height luma plane followed by a U plane
 * and a V plane of width*height/2 bytes each.
 *
 * @param width               picture width in pixels; every inner iteration
 *                            consumes 16 input bytes (8 pixels) whatever the
 *                            remainder, so this must be a multiple of 8
 * @param height              picture height in lines
 * @param shift_picture_down  when non-zero, one black line (Y=0x10, U=V=0x80)
 *                            is written first and only the first height-1
 *                            input lines are converted after it
 * @param input               packed UYVY data, width*2 bytes per line
 * @param output              destination of at least width*height*2 bytes
 *
 * Loads and stores go through memcpy, so neither buffer needs 8-byte
 * alignment.
 */
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
	/* 0x00FF in every 16-bit word: keeps the low byte of each word */
	const __m64 luma_mask = _mm_set1_pi16(0x00FF);
	const uint8_t *orig_input = input;
	uint8_t *y_comp = output;
	uint8_t *u_comp = output + width * height;
	uint8_t *v_comp = u_comp + (int)((width * height)/2);	// 4:2:2
	int i, j;

	// When preparing video for PAL DV50 encoding, the video must be shifted
	// down by one line to change the field order to be bottom-field-first
	int start_line = 0;
	if (shift_picture_down) {
		memset(y_comp, 0x10, width);		// write one line of black Y
		y_comp += width;
		memset(u_comp, 0x80, width/2);		// write one line of black U
		u_comp += width/2;
		memset(v_comp, 0x80, width/2);		// write one line of black V
		v_comp += width/2;
		start_line = 1;
	}

	/* Do the y component */
	for (j = start_line; j < height; j++)
	{
		// Consume 16 bytes of UYVY data per iteration (8 pixels worth)
		for (i = 0; i < width*2; i += 16)
		{
			__m64 lo, hi, luma;

			memcpy(&lo, input, sizeof lo);
			memcpy(&hi, input + 8, sizeof hi);

			// Shift each Y byte down into the low half of its 16-bit
			// word and clear the chroma byte that lands above it.
			lo = _mm_and_si64 (_mm_srli_si64(lo, 8), luma_mask);
			hi = _mm_and_si64 (_mm_srli_si64(hi, 8), luma_mask);

			// Narrow the eight 16-bit words (all <= 255) to eight bytes
			luma = _mm_packs_pu16 (lo, hi);
			memcpy(y_comp, &luma, sizeof luma);

			y_comp += 8;
			input += 16;
		}
	}
	/* Do the chroma components */
	input = orig_input;
	for (j = start_line; j < height; j++)
	{
		/* Process every line for yuv 4:2:2 */
		for (i = 0; i < width*2; i += 16)
		{
			__m64 lo, hi, m1, m2, m3, m4;

			memcpy(&lo, input, sizeof lo);
			memcpy(&hi, input + 8, sizeof hi);

			// Two rounds of byte interleaving gather U0..U3 in the
			// low half of m3 and V0..V3 in the low half of m4.
			m1 = _mm_unpacklo_pi8 (lo, hi);
			m2 = _mm_unpackhi_pi8 (lo, hi);
			m3 = _mm_unpacklo_pi8 (m1, m2);
			m4 = _mm_unpackhi_pi8 (m1, m2);

			memcpy (u_comp, &m3, 4);
			memcpy (v_comp, &m4, 4);
			u_comp += 4;
			v_comp += 4;
			input += 16;
		}
	}
	_mm_empty();        // Clear aliased fp register state
}
/* Writes SHORT_ARRAYSIZE values to stderr, each as "%d ", then a newline. */
static void print_short_lanes(const int16_t *values)
{
  int k;

  for (k = 0; k < SHORT_ARRAYSIZE; k++)
    {
      fprintf(stderr, "%d ", values[k]);
    }
  fprintf(stderr, "\n");
}

/* Writes FLOAT_ARRAYSIZE values to stderr, each as "%f ", then a newline. */
static void print_float_lanes(const float *values)
{
  int k;

  for (k = 0; k < FLOAT_ARRAYSIZE; k++)
    {
      fprintf(stderr, "%f ", values[k]);
    }
  fprintf(stderr, "\n");
}

/*
 * Demo: widen eight unsigned bytes to 16-bit values by interleaving them
 * with zero, convert each half to floats with _mm_cvtpu16_ps1 (defined
 * elsewhere; the original author used it after suspecting a bug in
 * _mm_cvtpi16_ps), and print the low and high halves to stderr, first as
 * shorts and then as floats.
 *
 * NOTE(review): sizeof(shortarray) bytes are copied out of an 8-byte __m64,
 * which assumes SHORT_ARRAYSIZE is 4 -- confirm.
 */
void test_char_to_float(void)
{
  float lofloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
  float hifloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
  int16_t shortarray[SHORT_ARRAYSIZE] __attribute__ ((aligned (16)));
  __m128 lofloats, hifloats;
  __m64 narrow = _mm_set_pi8(0, 1, 2, 3, 252, 253, 254, 255);
  __m64 zero = _mm_setzero_si64();

  /* a zero byte above every data byte turns each unsigned char into an */
  /* unsigned short; the low and high four end up in separate vectors   */
  __m64 loshorts = _mm_unpacklo_pi8(narrow, zero);
  __m64 hishorts = _mm_unpackhi_pi8(narrow, zero);

  /* four shorts -> four floats, for both halves */
  lofloats = _mm_cvtpu16_ps1(loshorts);
  hifloats = _mm_cvtpu16_ps1(hishorts);

  _mm_store_ps(lofloatarray, lofloats);
  _mm_store_ps(hifloatarray, hifloats);

  /* __m64 values were used above, so leave MMX state before going on */
  _mm_empty();

  /* low half: shorts, then floats */
  memcpy(shortarray, &loshorts, sizeof(shortarray));

  fprintf(stderr, "loshorts ");
  print_short_lanes(shortarray);

  fprintf(stderr, "lofloats ");
  print_float_lanes(lofloatarray);

  /* high half: shorts, then floats (from hifloatarray) */
  memcpy(shortarray, &hishorts, sizeof(shortarray));

  fprintf(stderr, "hishorts ");
  print_short_lanes(shortarray);

  fprintf(stderr, "hifloats ");
  print_float_lanes(hifloatarray);
}
예제 #11
0
/* One lane of the vertical 6-tap filter: A + F - 5*(B+E) + 20*(C+D) + 512.
 * The (B+E) and (C+D) sums are narrowed to short before being scaled, and the
 * int total is converted to unsigned int on return. */
static unsigned int interpolvline64_3_tap6(short A, short B, short C, short D, short E, short F)
{
	return A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
}

/*
 * Vertical 6-tap interpolation over four 16-bit lanes, averaged with a
 * rounded copy of temp[3].
 *
 * temp[0]..temp[5] supply the six filter taps; lane k of every tap is read as
 * a signed 16-bit value.  For each lane the filter sum is shifted right by 10
 * and passed through CLIP255_16 (defined elsewhere; presumably a clamp to
 * 0..255 -- confirm).  Separately, (temp[3] + 16) >> 5 is computed per lane
 * and clamped to 0..255.  Both sets of four lanes are narrowed to bytes and
 * combined with a rounding byte average, (a + b + 1) >> 1.  The four result
 * bytes are returned in one 32-bit value, lane 0 in the least significant
 * byte.  empty() is called before returning.
 */
unsigned int interpolvline64_3_mmx(__m64* temp){

	__m64 lanes = _mm_setzero_si64();
	__m64 filtered, nearest;
	unsigned int acc;

	/* lane 0 */
	acc = interpolvline64_3_tap6(_mm_extract_pi16(temp[0],0), _mm_extract_pi16(temp[1],0),
	                             _mm_extract_pi16(temp[2],0), _mm_extract_pi16(temp[3],0),
	                             _mm_extract_pi16(temp[4],0), _mm_extract_pi16(temp[5],0));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),0);

	/* lane 1 */
	acc = interpolvline64_3_tap6(_mm_extract_pi16(temp[0],1), _mm_extract_pi16(temp[1],1),
	                             _mm_extract_pi16(temp[2],1), _mm_extract_pi16(temp[3],1),
	                             _mm_extract_pi16(temp[4],1), _mm_extract_pi16(temp[5],1));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),1);

	/* lane 2 */
	acc = interpolvline64_3_tap6(_mm_extract_pi16(temp[0],2), _mm_extract_pi16(temp[1],2),
	                             _mm_extract_pi16(temp[2],2), _mm_extract_pi16(temp[3],2),
	                             _mm_extract_pi16(temp[4],2), _mm_extract_pi16(temp[5],2));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),2);

	/* lane 3 */
	acc = interpolvline64_3_tap6(_mm_extract_pi16(temp[0],3), _mm_extract_pi16(temp[1],3),
	                             _mm_extract_pi16(temp[2],3), _mm_extract_pi16(temp[3],3),
	                             _mm_extract_pi16(temp[4],3), _mm_extract_pi16(temp[5],3));
	lanes = _mm_insert_pi16(lanes,CLIP255_16(acc >> 10),3);

	/* (temp[3] + 16) >> 5 per lane, clamped to 0..255 */
	nearest = _mm_srai_pi16(_mm_add_pi16(temp[3],_mm_set_pi16(16,16,16,16)),5);
	nearest = _mm_max_pi16(nearest,_mm_set_pi16(0,0,0,0));
	nearest = _mm_min_pi16(nearest,_mm_set_pi16(255,255,255,255));

	/* narrow both lane sets to bytes 0..3; bytes 4..7 stay zero */
	filtered = _mm_set_pi8(0,0,0,0,
	                       _mm_extract_pi16(lanes,3),_mm_extract_pi16(lanes,2),
	                       _mm_extract_pi16(lanes,1),_mm_extract_pi16(lanes,0));
	nearest = _mm_set_pi8(0,0,0,0,
	                      _mm_extract_pi16(nearest,3),_mm_extract_pi16(nearest,2),
	                      _mm_extract_pi16(nearest,1),_mm_extract_pi16(nearest,0));

	/* rounding average of the two byte vectors */
	filtered = _mm_avg_pu8(filtered,nearest);

	acc = _mm_cvtsi64_si32(filtered);
	empty();
	return acc;

}