/**
 * Adds the configured per-channel offsets to an RGBA image in place (MMX).
 *
 * The four offsets are read from m_offset (indices chRed, chGreen, chBlue,
 * chAlpha) and replicated twice into one 64-bit constant in the byte order
 * R, G, B, A, R, G, B, A, so every loop iteration updates eight bytes, i.e.
 * two 4-byte pixels. When m_saturate is set the bytes are added with
 * unsigned saturation; otherwise the addition wraps around.
 *
 * @param image image whose data buffer is updated in place
 */
void pix_offset :: processRGBAMMX(imageStruct &image)
{
  const char R = m_offset[chRed];
  const char G = m_offset[chGreen];
  const char B = m_offset[chBlue];
  const char A = m_offset[chAlpha];

  // Number of 64-bit words to process: two pixels per word.
  // NOTE(review): the shift truncates, so with an odd pixel count the final
  // pixel is left untouched -- confirm whether callers can produce that.
  // The former `register` specifiers were removed: the keyword is no longer
  // a valid storage class in C++17 and never affected the generated code.
  int pixsize = (image.ysize * image.xsize)>>1;

  const __m64 offset_64 = _mm_setr_pi8(R, G, B, A, R, G, B, A);
  // NOTE(review): assumes image.data is suitably aligned for 64-bit access.
  __m64*data_p = reinterpret_cast<__m64*>(image.data);

  _mm_empty();

  if(m_saturate) {
    // Clamp each byte at 255 instead of wrapping.
    while(pixsize--) {
      *data_p = _mm_adds_pu8(*data_p, offset_64);
      ++data_p;
    }
  } else {
    // Plain modulo-256 addition per byte.
    while(pixsize--) {
      *data_p = _mm_add_pi8(*data_p, offset_64);
      ++data_p;
    }
  }

  // Leave the MMX state empty so later floating-point code is unaffected.
  _mm_empty();
}
/**
 * MMX path: replaces every byte of `image` with the absolute difference
 * between the (biased) bytes of `image` and `right`.
 *
 * Before the difference is taken, 0x40 is added (saturating) to the
 * even-numbered bytes of the left word and subtracted (saturating) from the
 * even-numbered bytes of the right word; odd-numbered bytes are compared
 * as-is.
 * NOTE(review): the even bytes are presumably the chroma samples of a
 * U Y V Y layout -- confirm against the scalar implementation.
 *
 * @param image left operand; receives the result in place
 * @param right right operand; only read
 */
void pix_diff :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  const int byteCount = image.xsize * image.ysize * image.csize;
  // Number of 64-bit words, rounded up so a partial trailing word is covered.
  // NOTE(review): rounding up touches bytes past byteCount when it is not a
  // multiple of 8 -- presumably the buffers are padded; verify.
  const int wordCount = byteCount/sizeof(__m64) + (byteCount%sizeof(__m64)!=0);

  __m64* const dst = reinterpret_cast<__m64*>(image.data);
  __m64* const src = reinterpret_cast<__m64*>(right.data);

  const __m64 evenByteBias =
    _mm_setr_pi8(0x40, 0x00, 0x40, 0x00, 0x40, 0x00, 0x40, 0x00);

  // Walk from the last word down to the first.
  for (int idx = wordCount; idx-- != 0; ) {
    const __m64 lhs = _mm_adds_pu8(dst[idx], evenByteBias);
    const __m64 rhs = _mm_subs_pu8(src[idx], evenByteBias);

    // With saturating subtraction one of the two differences is zero per
    // byte, so OR-ing them yields |lhs - rhs|.
    const __m64 lhsOverRhs = _mm_subs_pu8(lhs, rhs);
    const __m64 rhsOverLhs = _mm_subs_pu8(rhs, lhs);

    dst[idx] = _mm_or_si64(lhsOverRhs, rhsOverLhs);
  }

  _mm_empty();
}
// Codegen test for _mm_setr_pi8: forwards its eight byte arguments to the
// intrinsic in the order given. The directive comments inside the body are
// consumed by the test tooling and must stay exactly as written.
__m64 test_mm_setr_pi8(char a, char b, char c, char d, char e, char f, char g, char h) {
  // CHECK-LABEL: test_mm_setr_pi8
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  const __m64 packed = _mm_setr_pi8(a, b, c, d, e, f, g, h);
  return packed;
}
/**
 * Adds the U, Y and V offsets to a packed YUV image in place (MMX).
 *
 * The offsets are replicated into one 64-bit constant in the byte order
 * U, Y, V, Y, U, Y, V, Y and added to each 64-bit word of the image with a
 * wrapping (non-saturating) byte addition.
 * NOTE(review): U, Y and V are not declared in this function -- presumably
 * class members; confirm. Unlike processRGBAMMX, m_saturate is not consulted
 * here, so the result always wraps.
 *
 * @param image image whose data buffer is updated in place
 */
void pix_offset :: processYUVMMX(imageStruct &image)
{
  // Number of 64-bit words: the shift by two treats each word as four
  // pixels, i.e. two bytes per pixel.
  // NOTE(review): the shift truncates, so up to three trailing pixels are
  // left untouched when the pixel count is not a multiple of four.
  // The former `register` specifiers were removed: the keyword is no longer
  // a valid storage class in C++17 and never affected the generated code.
  int pixsize = (image.ysize * image.xsize)>>2;

  const __m64 offset_64 = _mm_setr_pi8(U, Y, V, Y, U, Y, V, Y);
  // NOTE(review): assumes image.data is suitably aligned for 64-bit access.
  __m64*data_p = reinterpret_cast<__m64*>(image.data);

  _mm_empty();

  while(pixsize--) {
    *data_p = _mm_add_pi8(*data_p, offset_64);
    ++data_p;
  }

  // Leave the MMX state empty so later floating-point code is unaffected.
  _mm_empty();
}
/**
 * MMX path: selectively replaces 32-bit groups of `image` with the matching
 * group of `right`, based on a comparison of the odd-numbered bytes.
 *
 * Pixel layout is U Y V Y, so the odd-numbered bytes are the luma samples
 * and one 32-bit group holds two pixels. For every group a saturating byte
 * difference is computed and reduced to the luma bytes:
 *  - m_direction set:   left - right; the group is taken from `right` when
 *                       neither left luma byte exceeds the right one.
 *  - m_direction clear: right - left; the group is taken from `right` when
 *                       neither right luma byte exceeds the left one.
 * Otherwise the group of `image` is kept.
 *
 * @param image left operand; receives the result in place
 * @param right right operand; only read
 */
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  const long byteCount = image.xsize * image.ysize * image.csize;
  // Number of 64-bit words, rounded up so a partial trailing word is covered.
  // NOTE(review): rounding up touches bytes past byteCount when it is not a
  // multiple of 8 -- presumably the buffers are padded; verify.
  const long wordCount = byteCount/sizeof(__m64) + (byteCount%sizeof(__m64)!=0);

  __m64* const dst = reinterpret_cast<__m64*>(image.data);
  __m64* const src = reinterpret_cast<__m64*>(right.data);

  const unsigned char none = 0x00;
  const unsigned char full = 0xFF;
  // Keeps only the odd-numbered (luma) bytes of a word.
  const __m64 lumaOnly = _mm_setr_pi8(none, full, none, full,
                                      none, full, none, full);
  const __m64 zeros = _mm_set1_pi8(none);

  // The direction is evaluated once, before the loop.
  const bool leftMinusRight = m_direction ? true : false;

  // Walk from the last word down to the first.
  for (long idx = wordCount; idx-- != 0; ) {
    const __m64 lhs = dst[idx];
    const __m64 rhs = src[idx];

    const __m64 excess = leftMinusRight ? _mm_subs_pu8(lhs, rhs)
                                        : _mm_subs_pu8(rhs, lhs);

    // All-ones for each 32-bit group whose luma excess is zero.
    const __m64 takeRight = _mm_cmpeq_pi32(_mm_and_si64(excess, lumaOnly), zeros);

    const __m64 fromRight = _mm_and_si64(rhs, takeRight);
    const __m64 fromLeft  = _mm_andnot_si64(takeRight, lhs);
    dst[idx] = _mm_or_si64(fromLeft, fromRight);
  }

  _mm_empty();
}
/**
 * MMX path: adds `right` onto `image` byte by byte with unsigned saturation.
 *
 * Before the addition, 0x40 is subtracted (saturating at zero) from the
 * even-numbered bytes of both operands; odd-numbered bytes are added as-is.
 * NOTE(review): the even bytes are presumably the chroma samples of a
 * U Y V Y layout -- confirm against the scalar implementation.
 *
 * @param image left operand; receives the sum in place
 * @param right right operand; only read
 */
void pix_add :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  const int byteCount = image.xsize * image.ysize * image.csize;
  // Number of 64-bit words, rounded up so a partial trailing word is covered.
  // NOTE(review): rounding up touches bytes past byteCount when it is not a
  // multiple of 8 -- presumably the buffers are padded; verify.
  const int wordCount = byteCount/sizeof(__m64) + (byteCount%sizeof(__m64)!=0);

  __m64* const dst = reinterpret_cast<__m64*>(image.data);
  __m64* const src = reinterpret_cast<__m64*>(right.data);

  const __m64 evenByteBias =
    _mm_setr_pi8(0x40, 0x00, 0x40, 0x00, 0x40, 0x00, 0x40, 0x00);

  // Walk from the last word down to the first.
  for (int idx = wordCount; idx-- != 0; ) {
    const __m64 lhs = _mm_subs_pu8(dst[idx], evenByteBias);
    const __m64 rhs = _mm_subs_pu8(src[idx], evenByteBias);
    dst[idx] = _mm_adds_pu8(lhs, rhs);
  }

  _mm_empty();
}
/**
 * MMX path: multiplies `image` by `right` byte by byte and stores
 * (left * right) >> 8 back into `image`.
 *
 * The even-numbered bytes of the right operand are forced to 0xFF first, so
 * the even-numbered bytes of the left operand are only scaled by 255/256
 * rather than by the right image. After the multiplication the odd-numbered
 * bytes are clamped to the range 0x10..0xEB by a subtract/add/subtract
 * sequence of saturating operations; even-numbered bytes are not clamped.
 * NOTE(review): the even bytes are presumably chroma and the odd bytes luma
 * (U Y V Y layout) -- confirm against the scalar implementation.
 *
 * @param image left operand; receives the product in place
 * @param right right operand; only read
 */
void pix_multiply :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  const int byteCount = image.xsize * image.ysize * image.csize;
  // Number of 64-bit words, rounded up so a partial trailing word is covered.
  // NOTE(review): rounding up touches bytes past byteCount when it is not a
  // multiple of 8 -- presumably the buffers are padded; verify.
  const int wordCount = byteCount/sizeof(__m64) + (byteCount%sizeof(__m64)!=0);

  __m64* const dst = reinterpret_cast<__m64*>(image.data);
  __m64* const src = reinterpret_cast<__m64*>(right.data);

  const unsigned char none = 0x00;
  const unsigned char full = 0xFF;
  // 0xFF in every even-numbered byte.
  const __m64 evenBytesFull = _mm_setr_pi8(full, none, full, none,
                                           full, none, full, none);

  // Constants for clamping the odd-numbered bytes:
  // subtract 0x10, add 0x24, subtract 0x14 (all saturating).
  const __m64 clampSubLow  = _mm_setr_pi8(0x00, 0x10, 0x00, 0x10,
                                          0x00, 0x10, 0x00, 0x10);
  const __m64 clampAdd     = _mm_setr_pi8(0x00, 0x24, 0x00, 0x24,
                                          0x00, 0x24, 0x00, 0x24);
  const __m64 clampSubHigh = _mm_setr_pi8(0x00, 0x14, 0x00, 0x14,
                                          0x00, 0x14, 0x00, 0x14);

  const __m64 zero = _mm_setzero_si64();

  // Walk from the last word down to the first.
  for (int idx = wordCount; idx-- != 0; ) {
    const __m64 rhs = _mm_or_si64(src[idx], evenBytesFull);
    const __m64 lhs = dst[idx];

    // Widen the 8-bit values to 16 bits so the products do not overflow.
    const __m64 lhsLow   = _mm_unpacklo_pi8(lhs, zero);
    const __m64 rhsLow   = _mm_unpacklo_pi8(rhs, zero);
    const __m64 lhsHigh  = _mm_unpackhi_pi8(lhs, zero);
    const __m64 rhsHigh  = _mm_unpackhi_pi8(rhs, zero);

    // Multiply and drop the low eight bits of each product.
    const __m64 prodLow  = _mm_srli_pi16(_mm_mullo_pi16(lhsLow,  rhsLow),  8);
    const __m64 prodHigh = _mm_srli_pi16(_mm_mullo_pi16(lhsHigh, rhsHigh), 8);

    // Narrow back to eight bytes, then clamp the odd-numbered bytes.
    __m64 result = _mm_packs_pu16(prodLow, prodHigh);
    result = _mm_subs_pu8(result, clampSubLow);
    result = _mm_adds_pu8(result, clampAdd);
    result = _mm_subs_pu8(result, clampSubHigh);

    dst[idx] = result;
  }

  _mm_empty();
}