/* Check a byte-wise unsigned saturating subtraction (cf. _mm_subs_pu8).  */
static void
TEST (void)
{
  __m64_union u, e, s1, s2;
  int i, tmp;

  s1.as_m64 = _mm_set_pi8 (30, 2, 3, 4, 10, 20, 30, 90);
  s2.as_m64 = _mm_set_pi8 (88, 44, 3, 22, 11, 98, 76, 100);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  for (i = 0; i < 8; i++)
    {
      /* The reference result must be computed on unsigned bytes,
         saturating at 0.  An unsigned byte difference can never
         exceed 255, so no upper clamp is needed.  */
      tmp = (unsigned char) s1.as_char[i] - (unsigned char) s2.as_char[i];
      if (tmp < 0)
        tmp = 0;
      e.as_char[i] = tmp;
    }

  if (u.as_m64 != e.as_m64)
    abort ();
}
void pix_background :: processRGBAMMX(imageStruct &image)
{
  long i, pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)
    m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data, image.data, pixsize);
  }
  m_reset=0;

  // number of __m64 chunks, rounded up to cover a partial trailing chunk
  i = pixsize/sizeof(__m64) + (pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;
  const __m64 thresh=_mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
                                 m_Yrange, m_Urange, m_Vrange, m_Arange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  __m64 newpix, oldpix, m1;

  while(i--){
    /* 7 ops, 3 memops */
    /* i have the feeling that this is not faster at all!
     * even if i have the 3 memops + ONLY 1 _mm_subs_pu8()
     * i am equally slow as the generic code;
     * adding the other instructions does not change much */
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8 (m1, oldpix);
    oldpix= _mm_subs_pu8 (oldpix, newpix);
    m1    = _mm_or_si64  (m1, oldpix);   // |oldpix-newpix|
    m1    = _mm_adds_pu8 (m1, offset);   // diff+1: makes thresh==0 work correctly
    m1    = _mm_subs_pu8 (m1, thresh);   // saturates to 0 exactly where diff<thresh
    m1    = _mm_cmpeq_pi32(m1, _mm_setzero_si64()); // all-ones per 32-bit pixel whose channels are all below thresh
    m1    = _mm_andnot_si64(m1, newpix); // keep newpix only where it differs from the background
    *data++ = m1;
  }
  _mm_empty();
}
/* Check a byte-wise unsigned maximum (cf. _mm_max_pu8).  */
static void
TEST (void)
{
  __m64_union u, e, s1, s2;
  int i;

  s1.as_m64 = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
  s2.as_m64 = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  for (i = 0; i < 8; i++)
    e.as_char[i] = ((unsigned char) s1.as_char[i] > (unsigned char) s2.as_char[i])
                   ? s1.as_char[i] : s2.as_char[i];

  if (u.as_m64 != e.as_m64)
    abort ();
}
void pix_movement :: processGrayMMX(imageStruct &image)
{
  // assume that the pix_size does not change !
  bool doclear=(image.xsize*image.ysize != buffer.xsize*buffer.ysize);
  buffer.xsize = image.xsize;
  buffer.ysize = image.ysize;
  buffer.reallocate();
  if(doclear) {
    buffer.setWhite();
  }
  buffer2.xsize = image.xsize;
  buffer2.ysize = image.ysize;
  buffer2.reallocate();

  int pixsize = image.ysize * image.xsize / sizeof(__m64);
  unsigned char thresh=threshold;

  __m64*rp = (__m64*)image.data;   // read pointer
  __m64*wp = (__m64*)buffer.data;  // write pointer to the copy
  __m64*wp2= (__m64*)buffer2.data; // write pointer to the diff-image
  __m64 m1, m2, grey;
  __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                            thresh,thresh,thresh,thresh);

  // there is still one problem with the threshold: the cmpgt is indeed only
  // for signed, so diffs that are still >= 0x80 after the threshold
  // subtraction read as negative and compare false
  while(pixsize--) {
    grey = rp[pixsize]; // image.data
    m2   = wp[pixsize]; // buffer.data

    //m0 =_mm_cmpgt_pi8(grey, m2);  // (grey>m2)
    //m1 =_mm_subs_pu8 (grey, m2);  // (grey-m2)
    //m2 =_mm_subs_pu8 (m2, grey);  // (m2-grey)
    //m1 =_mm_and_si64 (m1, m0);    // (grey-m2)&(grey>m2)
    //m0 =_mm_andnot_si64(m0, m2);  // (m2-grey)&!(grey>m2)
    //m2 =_mm_or_si64  (m2, m0);    // [(a-b)&(a>b)]|[(b-a)&!(a>b)]=abs(a-b)

    // this is better: use saturated arithmetic!
    m1 =_mm_subs_pu8 (grey, m2); // (grey-m2)
    m2 =_mm_subs_pu8 (m2, grey); // (m2-grey)
    wp[pixsize]=grey;            // buffer.data
    m2 = _mm_or_si64 (m2, m1);   // |grey-m2|

    m2 =_mm_subs_pu8 (m2, thresh8);
    m2 =_mm_cmpgt_pi8(m2, _mm_setzero_si64());
    wp2[pixsize]=m2;             // output.data
  }
  _mm_empty();

  image.data = buffer2.data;
}
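/* The loop above (like the background-subtraction loops in the pix_background
 * functions) relies on the identity that, for unsigned bytes, at least one of
 * the two saturated differences is always zero, so OR-ing them yields |a-b|.
 * The following self-contained scalar check of that identity is not part of
 * the original source: */
#include <stdio.h>

/* scalar model of _mm_subs_pu8 on one unsigned byte */
static unsigned char sat_sub(unsigned int a, unsigned int b)
{
    return (a > b) ? (unsigned char)(a - b) : 0;
}

int main(void)
{
    unsigned int a, b;
    /* exhaustively verify sat_sub(a,b) | sat_sub(b,a) == |a-b| */
    for (a = 0; a < 256; a++)
        for (b = 0; b < 256; b++) {
            unsigned char want = (a > b) ? (a - b) : (b - a);
            unsigned char got  = sat_sub(a, b) | sat_sub(b, a);
            if (got != want) {
                printf("mismatch at a=%u b=%u\n", a, b);
                return 1;
            }
        }
    printf("ok: subs(a,b)|subs(b,a) == |a-b| for all byte pairs\n");
    return 0;
}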
__m64 test_mm_set_pi8(char a, char b, char c, char d, char e, char f, char g, char h) {
  // CHECK-LABEL: test_mm_set_pi8
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  // CHECK: insertelement <8 x i8>
  return _mm_set_pi8(a, b, c, d, e, f, g, h);
}
/* Apply the 6-tap filter (1,-5,20,20,-5,1) with rounding (+512, >>10) to each
 * of the four 16-bit lanes of the column temp[0..5], clip each result to
 * [0,255] (CLIP255_16 is assumed to clamp to that range) and pack the four
 * bytes into the low 32 bits of the return value. */
unsigned int interpolvline64_2_mmx(__m64* temp)
{
    __m64 res;
    __m64 ptr = _mm_setzero_si64();

    short A = _mm_extract_pi16(temp[0], 0);
    short B = _mm_extract_pi16(temp[1], 0);
    short C = _mm_extract_pi16(temp[2], 0);
    short D = _mm_extract_pi16(temp[3], 0);
    short E = _mm_extract_pi16(temp[4], 0);
    short F = _mm_extract_pi16(temp[5], 0);
    unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 0);

    A = _mm_extract_pi16(temp[0], 1);
    B = _mm_extract_pi16(temp[1], 1);
    C = _mm_extract_pi16(temp[2], 1);
    D = _mm_extract_pi16(temp[3], 1);
    E = _mm_extract_pi16(temp[4], 1);
    F = _mm_extract_pi16(temp[5], 1);
    result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 1);

    A = _mm_extract_pi16(temp[0], 2);
    B = _mm_extract_pi16(temp[1], 2);
    C = _mm_extract_pi16(temp[2], 2);
    D = _mm_extract_pi16(temp[3], 2);
    E = _mm_extract_pi16(temp[4], 2);
    F = _mm_extract_pi16(temp[5], 2);
    result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 2);

    A = _mm_extract_pi16(temp[0], 3);
    B = _mm_extract_pi16(temp[1], 3);
    C = _mm_extract_pi16(temp[2], 3);
    D = _mm_extract_pi16(temp[3], 3);
    E = _mm_extract_pi16(temp[4], 3);
    F = _mm_extract_pi16(temp[5], 3);
    result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 3);

    /* pack the four clipped 16-bit lanes into the low four bytes */
    res = _mm_set_pi8(0, 0, 0, 0,
                      _mm_extract_pi16(ptr, 3), _mm_extract_pi16(ptr, 2),
                      _mm_extract_pi16(ptr, 1), _mm_extract_pi16(ptr, 0));
    result = _mm_cvtsi64_si32(res);
    empty();
    return result;
}
void pix_background :: processGrayMMX(imageStruct &image)
{
  int i;
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)
    m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
  }
  m_reset=0;

  if(m_Yrange==0)return;

  __m64*npixes=(__m64*)image.data;
  __m64*opixes=(__m64*)m_savedImage.data;
  __m64 newpix, oldpix, m1;

  unsigned char thresh=m_Yrange-1;
  __m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
                            thresh,thresh,thresh,thresh);

  // number of __m64 chunks, rounded up to cover a partial trailing chunk
  i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
  while(i--){
    newpix=npixes[i];
    oldpix=opixes[i];
    m1    = _mm_subs_pu8 (newpix, oldpix);
    oldpix= _mm_subs_pu8 (oldpix, newpix);
    m1    = _mm_or_si64  (m1, oldpix);  // |oldpix-newpix|
    m1    = _mm_subs_pu8 (m1, thresh8); // 0 where |oldpix-newpix|<=thresh
    m1    = _mm_cmpgt_pi8(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh (signed compare!)
    npixes[i] = _mm_and_si64(m1, newpix); // keep newpix only where diff exceeds thresh
  }
  _mm_empty();
}
void pix_background :: processYUVMMX(imageStruct &image)
{
  long pixsize;
  pixsize = image.xsize * image.ysize * image.csize;

  if(m_savedImage.xsize!=image.xsize ||
     m_savedImage.ysize!=image.ysize ||
     m_savedImage.format!=image.format)
    m_reset=1;

  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  if (m_reset){
    memcpy(m_savedImage.data,image.data,pixsize);
    // return;
  }
  m_reset=0;

  // number of __m64 chunks, rounded up to cover a partial trailing chunk
  int i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

  __m64*data =(__m64*)image.data;
  __m64*saved=(__m64*)m_savedImage.data;
  const __m64 thresh=_mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
                                 m_Urange, m_Yrange, m_Vrange, m_Yrange);
  const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
  // YUV black: chroma bytes 0x80, luma bytes 0x00
  const __m64 black =_mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80,
                                 (unsigned char)0x00, (unsigned char)0x80);
  __m64 newpix, oldpix, m1;

  while(i--){
    newpix=*data;
    oldpix=*saved++;
    m1    = newpix;
    m1    = _mm_subs_pu8 (m1, oldpix);
    oldpix= _mm_subs_pu8 (oldpix, newpix);
    m1    = _mm_or_si64  (m1, oldpix);   // |oldpix-newpix|
    m1    = _mm_adds_pu8 (m1, offset);   // to make thresh=0 work correctly
    m1    = _mm_subs_pu8 (m1, thresh);   // saturates to 0 exactly where diff<thresh
    m1    = _mm_cmpeq_pi32(m1, _mm_setzero_si64()); // all-ones per 32-bit pixel whose channels are all below thresh
    oldpix= black;
    oldpix= _mm_and_si64 (oldpix, m1);   // black where below threshold
    m1    = _mm_andnot_si64 (m1, newpix);// newpix where above threshold
    m1    = _mm_or_si64  (m1, oldpix);
    *data++ = m1;
  }
  _mm_empty();
}
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
    __m64 chroma_mask = _mm_set_pi8(255, 0, 255, 0, 255, 0, 255, 0);
    __m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
    const uint8_t *orig_input = input;
    uint8_t *y_comp = output;
    uint8_t *u_comp = output + width * height;
    uint8_t *v_comp = u_comp + (int)((width * height)/2); // 4:2:2
    int i, j;

    // When preparing video for PAL DV50 encoding, the video must be shifted
    // down by one line to change the field order to be bottom-field-first
    int start_line = 0;
    if (shift_picture_down) {
        memset(y_comp, 0x10, width);   // write one line of black Y
        y_comp += width;
        memset(u_comp, 0x80, width/2); // write one line of black U,V
        u_comp += width/2;
        memset(v_comp, 0x80, width/2); // write one line of black U,V
        v_comp += width/2;
        start_line = 1;
    }

    /* Do the Y component */
    for (j = start_line; j < height; j++) {
        // Consume 16 bytes of UYVY data per iteration (8 pixels worth)
        for (i = 0; i < width*2; i += 16) {
            //__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
            //__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
            //__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
            //*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);

            // shift each Y byte into the even byte position and isolate it,
            // leaving four 16-bit luma values per register
            __m64 m0 = *(__m64 *)input;
            __m64 m2 = _mm_srli_si64(m0, 8);
            __m64 m3 = _mm_slli_si64(m0, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            // the second and with luma_mask discards the chroma bits merged above
            m2 = _mm_and_si64 (m2, luma_mask);
            m0 = m2;
            __m64 m1 = *(__m64 *)(input+8);
            m2 = _mm_srli_si64(m1, 8);
            m3 = _mm_slli_si64(m1, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);
            m1 = m2;
            *(__m64 *)y_comp = _mm_packs_pu16 (m0, m1); // pack 8 luma words to 8 bytes
            y_comp += 8;
            input += 16;
        }
    }

    /* Do the chroma components */
    input = orig_input;
    for (j = start_line; j < height; j++) {
        /* Process every line for yuv 4:2:2 */
        for (i = 0; i < width*2; i += 16) {
            // the two interleave steps act as a byte transpose, gathering the
            // four U bytes into the low half of m3 and the four V bytes into
            // the low half of m4
            __m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m3 = _mm_unpacklo_pi8 (m1, m2);
            __m64 m4 = _mm_unpackhi_pi8 (m1, m2);
            //*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
            //*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
            memcpy (u_comp, &m3, 4);
            memcpy (v_comp, &m4, 4);
            u_comp += 4;
            v_comp += 4;
            input += 16;
        }
    }
    _mm_empty(); // Clear aliased fp register state
}
void test_char_to_float(void)
{
    __m64 narrow = _mm_set_pi8(0, 1, 2, 3, 252, 253, 254, 255);
    __m64 zero = _mm_setzero_si64();
    __m64 loshorts, hishorts;
    __m128 lofloats, hifloats;
    float lofloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
    float hifloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
    int16_t shortarray[SHORT_ARRAYSIZE] __attribute__ ((aligned (16)));
    int i;

    /* interleave zero with narrow and return halves: essentially widening */
    /* elements from unsigned chars to unsigned shorts */
    loshorts = _mm_unpacklo_pi8(narrow, zero);
    hishorts = _mm_unpackhi_pi8(narrow, zero);

    /* now turn the 4 shorts in loshorts into floats and store them in lofloats; */
    /* likewise hishorts into hifloats. */
    /* bug in _mm_cvtpi16_ps ? */
    lofloats = _mm_cvtpu16_ps1(loshorts);
    hifloats = _mm_cvtpu16_ps1(hishorts);
    _mm_store_ps(lofloatarray, lofloats);
    _mm_store_ps(hifloatarray, hifloats);

    /* we used SSE1 instructions that used __m64: add an _mm_empty */
    _mm_empty();

    /* now store loshorts into shortarray and print it together with */
    /* lofloatarray. */
    memcpy(shortarray, &loshorts, sizeof(shortarray));
    fprintf(stderr, "loshorts ");
    for (i = 0; i < SHORT_ARRAYSIZE; i++) {
        fprintf(stderr, "%d ", shortarray[i]);
    }
    fprintf(stderr, "\n");

    fprintf(stderr, "lofloats ");
    for (i = 0; i < FLOAT_ARRAYSIZE; i++) {
        fprintf(stderr, "%f ", lofloatarray[i]);
    }
    fprintf(stderr, "\n");

    /* now store hishorts into shortarray and print it together with */
    /* hifloatarray. */
    memcpy(shortarray, &hishorts, sizeof(shortarray));
    fprintf(stderr, "hishorts ");
    for (i = 0; i < SHORT_ARRAYSIZE; i++) {
        fprintf(stderr, "%d ", shortarray[i]);
    }
    fprintf(stderr, "\n");

    fprintf(stderr, "hifloats ");
    for (i = 0; i < FLOAT_ARRAYSIZE; i++) {
        fprintf(stderr, "%f ", hifloatarray[i]);
    }
    fprintf(stderr, "\n");
}
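/* _mm_cvtpu16_ps1() above is not a standard intrinsic; together with the
 * "bug in _mm_cvtpi16_ps ?" remark it looks like a local replacement for
 * _mm_cvtpu16_ps(). Its real definition is not shown in this listing; what
 * follows is only a sketch of what such a helper could look like, assuming
 * it merely zero-extends four unsigned shorts and converts them to floats: */
static __m128 _mm_cvtpu16_ps1(__m64 a)
{
    __m64 zero = _mm_setzero_si64();
    __m64 lo = _mm_unpacklo_pi16(a, zero); /* lanes 0,1 zero-extended to 32 bits */
    __m64 hi = _mm_unpackhi_pi16(a, zero); /* lanes 2,3 zero-extended to 32 bits */
    return _mm_cvtpi32x2_ps(lo, hi);       /* convert the four 32-bit ints to floats */
}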
/* Same 6-tap filter as interpolvline64_2_mmx, then average the packed result
 * with the rounded, clipped value of temp[3] ((x+16)>>5). */
unsigned int interpolvline64_3_mmx(__m64* temp)
{
    __m64 res, res1;
    __m64 ptr = _mm_setzero_si64();
    __m64 mm_16 = _mm_set_pi16(16, 16, 16, 16);

    short A = _mm_extract_pi16(temp[0], 0);
    short B = _mm_extract_pi16(temp[1], 0);
    short C = _mm_extract_pi16(temp[2], 0);
    short D = _mm_extract_pi16(temp[3], 0);
    short E = _mm_extract_pi16(temp[4], 0);
    short F = _mm_extract_pi16(temp[5], 0);
    unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 0);

    A = _mm_extract_pi16(temp[0], 1);
    B = _mm_extract_pi16(temp[1], 1);
    C = _mm_extract_pi16(temp[2], 1);
    D = _mm_extract_pi16(temp[3], 1);
    E = _mm_extract_pi16(temp[4], 1);
    F = _mm_extract_pi16(temp[5], 1);
    result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 1);

    A = _mm_extract_pi16(temp[0], 2);
    B = _mm_extract_pi16(temp[1], 2);
    C = _mm_extract_pi16(temp[2], 2);
    D = _mm_extract_pi16(temp[3], 2);
    E = _mm_extract_pi16(temp[4], 2);
    F = _mm_extract_pi16(temp[5], 2);
    result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 2);

    A = _mm_extract_pi16(temp[0], 3);
    B = _mm_extract_pi16(temp[1], 3);
    C = _mm_extract_pi16(temp[2], 3);
    D = _mm_extract_pi16(temp[3], 3);
    E = _mm_extract_pi16(temp[4], 3);
    F = _mm_extract_pi16(temp[5], 3);
    result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
    ptr = _mm_insert_pi16(ptr, CLIP255_16(result >> 10), 3);

    res  = _mm_add_pi16(temp[3], mm_16);
    res1 = _mm_srai_pi16(res, 5); // (temp[3]+16)>>5
    res1 = _mm_max_pi16(res1, _mm_set_pi16(0, 0, 0, 0));
    res1 = _mm_min_pi16(res1, _mm_set_pi16(255, 255, 255, 255)); // clip to [0,255]

    /* pack both four-lane results into bytes and average them */
    res  = _mm_set_pi8(0, 0, 0, 0,
                       _mm_extract_pi16(ptr, 3), _mm_extract_pi16(ptr, 2),
                       _mm_extract_pi16(ptr, 1), _mm_extract_pi16(ptr, 0));
    res1 = _mm_set_pi8(0, 0, 0, 0,
                       _mm_extract_pi16(res1, 3), _mm_extract_pi16(res1, 2),
                       _mm_extract_pi16(res1, 1), _mm_extract_pi16(res1, 0));
    res = _mm_avg_pu8(res, res1); // (ptr_img[0] + ptr_rf[0] + 1) >> 1

    result = _mm_cvtsi64_si32(res);
    empty();
    return result;
}