void pix_diff :: processYUV_MMX (imageStruct &image, imageStruct &right){
  int datasize = image.xsize * image.ysize * image.csize;
  __m64*leftPix  = (__m64*)image.data;
  __m64*rightPix = (__m64*)right.data;
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);

  // UYVY layout: the 0x40 sits on the chroma bytes; biasing the two operands
  // in opposite directions makes equal chroma come out as neutral 0x80
  __m64 mask = _mm_setr_pi8(0x40, 0x00, 0x40, 0x00,
                            0x40, 0x00, 0x40, 0x00);
  __m64 l, r, b;
  while (datasize--) {
    l=leftPix[datasize];
    r=rightPix[datasize];

    l=_mm_adds_pu8(l, mask);
    r=_mm_subs_pu8(r, mask);

    b = l;
    b = _mm_subs_pu8 (b, r);
    r = _mm_subs_pu8 (r, l);
    b = _mm_or_si64  (b, r);   // |l - r| via two saturated subtractions

    leftPix[datasize]=b;
  }
  _mm_empty();
}
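/* A minimal scalar sketch (not from the GEM sources, helper name made up) of
 * the trick used in the loops above and below: two saturated subtractions
 * OR-ed together are a branch-free |a - b| for unsigned bytes, since one of
 * the two saturated differences is always zero. */
static unsigned char abs_diff_u8(unsigned char a, unsigned char b)
{
  unsigned char d0 = (a > b) ? (unsigned char)(a - b) : 0; /* like _mm_subs_pu8(a, b) */
  unsigned char d1 = (b > a) ? (unsigned char)(b - a) : 0; /* like _mm_subs_pu8(b, a) */
  return d0 | d1;                                          /* like _mm_or_si64(d0, d1) */
}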
void pix_movement :: processGrayMMX(imageStruct &image)
{
  // assume that the pix_size does not change !
  bool doclear=(image.xsize*image.ysize != buffer.xsize*buffer.ysize);
  buffer.xsize = image.xsize;
  buffer.ysize = image.ysize;
  buffer.reallocate();
  if(doclear) {
    buffer.setWhite();
  }
  buffer2.xsize = image.xsize;
  buffer2.ysize = image.ysize;
  buffer2.reallocate();

  int pixsize = image.ysize * image.xsize / sizeof(__m64);

  unsigned char thresh=threshold;

  __m64*rp = (__m64*)image.data;   // read pointer
  __m64*wp = (__m64*)buffer.data;  // write pointer to the copy
  __m64*wp2= (__m64*)buffer2.data; // write pointer to the diff-image

  __m64 m1, m2, grey;
  __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                            thresh,thresh,thresh,thresh);

  // there is still one problem with the threshold: is the cmpgt only for signed ?
  while(pixsize--) {
    grey = rp[pixsize]; // image.data
    m2   = wp[pixsize]; // buffer.data

    //m0 =_mm_cmpgt_pi8 (grey, m2);  // (grey>m2)
    //m1 =_mm_subs_pu8  (grey, m2);  // (grey-m2)
    //m2 =_mm_subs_pu8  (m2, grey);  // (m2-grey)
    //m1 =_mm_and_si64  (m1, m0);    // (m2-grey)&(grey>m2)    ((??))
    //m0 =_mm_andnot_si64(m0, m2);   // !(grey>m2)&(grey-m2)   ((??))
    //m2 =_mm_or_si64   (m2, m0);    // [(a-b)&(a>b)]|[(b-a)&!(a>b)]=abs(a-b)

    // this is better: use saturated arithmetic!
    m1 =_mm_subs_pu8 (grey, m2); // (grey-m2)
    m2 =_mm_subs_pu8 (m2, grey); // (m2-grey)
    wp[pixsize]=grey;            // buffer.data

    m2 =_mm_or_si64  (m2, m1);   // |grey-m2|

    m2 =_mm_subs_pu8 (m2, thresh8);
    m2 =_mm_cmpgt_pi8(m2, _mm_setzero_si64());

    wp2[pixsize]=m2; // output.data
  }
  _mm_empty();

  image.data = buffer2.data;
}
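/* A hedged note on the question in the comment above: _mm_cmpgt_pi8 does
 * compare *signed* bytes, so once |grey - m2| - thresh reaches 0x80 the
 * comparison against zero no longer reports movement.  One unsigned-safe
 * variant (sketch only, helper name made up, assuming the same <mmintrin.h>
 * intrinsics as above) builds the "<= thresh" mask with a saturated
 * subtraction plus an equality test and then inverts it: */
static __m64 over_thresh_u8(__m64 absdiff, __m64 thresh8)
{
  __m64 le = _mm_cmpeq_pi8(_mm_subs_pu8(absdiff, thresh8),
                           _mm_setzero_si64());          /* 0xFF where absdiff <= thresh */
  return _mm_andnot_si64(le, _mm_set1_pi8((char)0xFF));  /* 0xFF where absdiff >  thresh */
}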
void pix_background :: processRGBAMMX(imageStruct &image)
{
  long i,pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;

  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
                                 m_Yrange, m_Urange, m_Vrange, m_Arange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  __m64 newpix, oldpix, m1;

  while(i--){
    /* 7ops, 3memops */
    /* i have the feeling that this is not faster at all!
     * even if i have the 3memops + ONLY 1 _mm_subs_pu8()
     * i am equally slow as the generic code;
     * adding the other instruction does not change much
     */
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8   (m1, oldpix);
    oldpix= _mm_subs_pu8   (oldpix, newpix);
    m1    = _mm_or_si64    (m1, oldpix);               // |oldpix-newpix|
    m1    = _mm_adds_pu8   (m1, offset);
    m1    = _mm_subs_pu8   (m1, thresh);
    m1    = _mm_cmpeq_pi32 (m1, _mm_setzero_si64());   // all-ones where |oldpix-newpix|<thresh for all 4 channels (background)
    m1    = _mm_andnot_si64(m1, newpix);               // keep the pixel only where it is not background
    *data++ = m1;
  }
  _mm_empty();
}
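/* A minimal scalar sketch (not from the GEM sources, helper name made up) of
 * the range test used in pix_background: adding the offset of 1 before the
 * saturated subtraction turns the condition into |new - old| < range, so a
 * range of 0 keeps every pixel, and the 32-bit compare then accepts a pixel
 * as background only when all four channels pass. */
static int channel_is_background(unsigned char newv, unsigned char oldv,
                                 unsigned char range)
{
  unsigned char d  = (newv > oldv) ? (newv - oldv) : (oldv - newv); /* |new - old| */
  unsigned char d1 = (d == 255) ? 255 : d + 1;                      /* _mm_adds_pu8(d, offset) */
  return d1 <= range;                 /* _mm_subs_pu8(d1, range) saturates to 0 */
}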
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  long datasize = image.xsize * image.ysize * image.csize;
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);

  __m64*leftPix = (__m64*)image.data;
  __m64*rightPix= (__m64*)right.data;

  __m64 l, r, b;
  __m64 mask = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF,
                            (unsigned char)0x00, (unsigned char)0xFF);
  __m64 zeros = _mm_set1_pi8((unsigned char)0x00);

  //format is U Y V Y
  if (m_direction) {
    // keep whichever side has the higher luminance
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];

      b=_mm_subs_pu8(l, r);        // saturated l-r
      b=_mm_and_si64(b, mask);     // look at the Y bytes only
      b=_mm_cmpeq_pi32(b, zeros);  // 32bit-mask set where the right side is at least as bright

      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);
      leftPix[datasize]=_mm_or_si64(l, r);
    }
  } else {
    // keep whichever side has the lower luminance
    while(datasize--){
      l=leftPix[datasize];
      r=rightPix[datasize];

      b=_mm_subs_pu8(r, l);
      b=_mm_and_si64(b, mask);
      b=_mm_cmpeq_pi32(b, zeros);

      r=_mm_and_si64(r, b);
      l=_mm_andnot_si64(b, l);
      leftPix[datasize]=_mm_or_si64(l, r);
    }
  }
  _mm_empty();
}
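/* A minimal sketch (not from the GEM sources, helper name made up) of the
 * branch-free select that both loops above end with: once a lane-wide
 * all-ones/all-zeros mask exists, (a & mask) | (b & ~mask) picks one of the
 * two inputs per lane without a branch. */
static __m64 select_by_mask(__m64 mask, __m64 a, __m64 b)
{
  /* returns a where mask bits are set, b elsewhere */
  return _mm_or_si64(_mm_and_si64(a, mask), _mm_andnot_si64(mask, b));
}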
void pix_background :: processGrayMMX(imageStruct &image)
{
  int i;
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;
  if(m_Yrange==0)return;

  __m64*npixes=(__m64*)image.data;
  __m64*opixes=(__m64*)m_savedImage.data;
  __m64 newpix, oldpix, m1;

  unsigned char thresh=m_Yrange-1;
  __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                            thresh,thresh,thresh,thresh);

  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
  while(i--){
    newpix=npixes[i];
    oldpix=opixes[i];

    m1    = _mm_subs_pu8 (newpix, oldpix);
    oldpix= _mm_subs_pu8 (oldpix, newpix);
    m1    = _mm_or_si64  (m1, oldpix);   // |oldpix-newpix|
    m1    = _mm_subs_pu8 (m1, thresh8);
    m1    = _mm_cmpgt_pi8(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh8

    npixes[i] = _mm_and_si64(m1, newpix);
  }
  _mm_empty();
}
void pix_add :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64*leftPix  = reinterpret_cast<__m64*>(image.data);
  __m64*rightPix = reinterpret_cast<__m64*>(right.data);
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);

  // UYVY layout: subtracting 0x40 from the chroma of both operands before the
  // saturated add keeps the summed U/V channels centred around 0x80
  __m64 mask = _mm_setr_pi8(0x40, 0x00, 0x40, 0x00,
                            0x40, 0x00, 0x40, 0x00);
  __m64 l, r;
  while (datasize--) {
    l=leftPix[datasize];
    r=rightPix[datasize];

    l=_mm_subs_pu8(l, mask);
    r=_mm_subs_pu8(r, mask);
    l=_mm_adds_pu8(l, r);

    leftPix[datasize]=l;
  }
  _mm_empty();
}
void pix_subtract :: processRGBA_MMX(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64*leftPix = (__m64*)image.data;
  __m64*rightPix= (__m64*)right.data;
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);

  __m64 l, r;
  while (datasize--) {
    l=leftPix[datasize];
    r=rightPix[datasize];
    leftPix[datasize]=_mm_subs_pu8(l, r);
  }
  _mm_empty();
}
void pix_background :: processYUVMMX(imageStruct &image)
{
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    // return;
  }
  m_reset=0;

  int i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;

  const __m64 thresh=_mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
                                 m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  const __m64 black =_mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80);

  __m64 newpix, oldpix, m1;

  while(i--){
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8    (m1, oldpix);
    oldpix= _mm_subs_pu8    (oldpix, newpix);
    m1    = _mm_or_si64     (m1, oldpix);               // |oldpix-newpix|
    m1    = _mm_adds_pu8    (m1, offset);               // to make thresh=0 work correctly
    m1    = _mm_subs_pu8    (m1, thresh);               // m1<=thresh -> saturation -> 0
    m1    = _mm_cmpeq_pi32  (m1, _mm_setzero_si64());   // all-ones where |oldpix-newpix|<thresh for all channels (background)
    oldpix= black;
    oldpix= _mm_and_si64    (oldpix, m1);               // black where background ...
    m1    = _mm_andnot_si64 (m1, newpix);               // ... the new pixel elsewhere
    m1    = _mm_or_si64     (m1, oldpix);
    *data++ = m1;
  }
  _mm_empty();
}
void pix_multiply :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64*leftPix = (__m64*)image.data;
  __m64*rightPix= (__m64*)right.data;
  datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);

  __m64 l0, r0, l1, r1;
  // force the chroma bytes of the right operand to 0xFF, so that effectively
  // only the luminance gets multiplied
  __m64 mask= _mm_setr_pi8((unsigned char)0xFF, (unsigned char)0x00,
                           (unsigned char)0xFF, (unsigned char)0x00,
                           (unsigned char)0xFF, (unsigned char)0x00,
                           (unsigned char)0xFF, (unsigned char)0x00);
  // the three saturated sub/add/sub constants clamp Y to the nominal [16..235] range
  __m64 yuvclamp0 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10,
                                 (unsigned char)0x00, (unsigned char)0x10);
  __m64 yuvclamp1 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24,
                                 (unsigned char)0x00, (unsigned char)0x24);
  __m64 yuvclamp2 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14,
                                 (unsigned char)0x00, (unsigned char)0x14);
  __m64 null64 = _mm_setzero_si64();

  while(datasize--) {
    r1=rightPix[datasize];
    l1=leftPix [datasize];

    r1=_mm_or_si64(r1, mask);

    l0=_mm_unpacklo_pi8(l1, null64);   // zero-extend the bytes to 16bit for the multiply
    r0=_mm_unpacklo_pi8(r1, null64);
    l1=_mm_unpackhi_pi8(l1, null64);
    r1=_mm_unpackhi_pi8(r1, null64);

    l0=_mm_mullo_pi16 (l0, r0);
    l1=_mm_mullo_pi16 (l1, r1);

    l0=_mm_srli_pi16(l0, 8);           // (l*r)>>8: 8.8 fixed-point product
    l1=_mm_srli_pi16(l1, 8);

    l0=_mm_packs_pu16(l0, l1);

    l0=_mm_subs_pu8(l0, yuvclamp0);
    l0=_mm_adds_pu8(l0, yuvclamp1);
    l0=_mm_subs_pu8(l0, yuvclamp2);

    leftPix[datasize]=l0;
  }
  _mm_empty();
}
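/* A minimal scalar sketch (not from the GEM sources, helper name made up) of
 * the per-Y arithmetic above: an 8.8 fixed-point multiply followed by the
 * clamp that the three saturated constants (0x10, 0x24, 0x14) implement. */
static unsigned char mul_and_clamp_y(unsigned char y1, unsigned char y2)
{
  int y = (y1 * y2) >> 8;    /* _mm_mullo_pi16 + _mm_srli_pi16(.., 8) */
  if (y < 16)  y = 16;       /* subs 0x10 / adds 0x24 / subs 0x14 == clamp to [16, 235] */
  if (y > 235) y = 235;
  return (unsigned char)y;
}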
__m64 test47(__m64 a, __m64 b) {
  // CHECK: psubusb
  return _mm_subs_pu8(a, b);
}
__m64
test (__m64 s1, __m64 s2)
{
  return _mm_subs_pu8 (s1, s2);
}
__m64 test_mm_subs_pu8(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_subs_pu8
  // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b
  return _mm_subs_pu8(a, b);
}