/////////////////////////////////////////////////////////
// processYUV_MMX
//
// Subtracts `right` from `image` in place, eight bytes per step, using MMX.
//
// For every 8-byte group the bytes are widened to 16-bit lanes, 0x80 is added
// to the left-hand bytes at even positions (0, 2, 4, 6) of the group, the
// corresponding right-hand value is subtracted with unsigned saturation
// (clamping at 0), and the result is packed back to bytes with saturation
// (clamping at 255).
//
// image : left operand; its data buffer is overwritten with the result.
// right : right operand; only its data buffer is read.
//
// NOTE(review): the even byte positions are presumably the chroma samples of
// a 2-bytes-per-pixel YUV layout - confirm against the image format in use.
// NOTE(review): the group count is rounded up, so the last group may extend up
// to 7 bytes past xsize*ysize*csize in both buffers; this assumes the buffers
// tolerate that - confirm against the allocator. Only `image`'s dimensions are
// consulted; `right` is assumed to be at least as large.
/////////////////////////////////////////////////////////
void pix_subtract :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  const int byteCount = image.xsize * image.ysize * image.csize;

  // number of 8-byte groups, rounded up when byteCount is not a multiple of 8
  const int groupCount = byteCount / sizeof(__m64) + (byteCount % sizeof(__m64) != 0);

  __m64 *dst       = (__m64*)image.data;
  const __m64 *src = (__m64*)right.data;

  const __m64 zero = _mm_setzero_si64();
  // 0x80 in 16-bit lanes 0 and 2, nothing in lanes 1 and 3
  const __m64 bias = _mm_setr_pi16(0x80, 0x00, 0x80, 0x00);

  // walk the groups from the last one down to the first
  for (int idx = groupCount; idx-- != 0; ) {
    const __m64 lhs = dst[idx];
    const __m64 rhs = src[idx];

    // widen bytes to 16-bit lanes by interleaving with zero
    const __m64 lhsLo = _mm_unpacklo_pi8(lhs, zero);
    const __m64 rhsLo = _mm_unpacklo_pi8(rhs, zero);
    const __m64 lhsHi = _mm_unpackhi_pi8(lhs, zero);
    const __m64 rhsHi = _mm_unpackhi_pi8(rhs, zero);

    // (left + bias) - right, with the subtraction clamped at zero
    const __m64 diffLo = _mm_subs_pu16(_mm_adds_pu16(lhsLo, bias), rhsLo);
    const __m64 diffHi = _mm_subs_pu16(_mm_adds_pu16(lhsHi, bias), rhsHi);

    // narrow back to bytes, clamping at 255
    dst[idx] = _mm_packs_pu16(diffLo, diffHi);
  }

  // leave MMX state so that later floating-point code works
  _mm_empty();
}
// Test wrapper around _mm_setr_pi16: forwards its four 16-bit arguments, in
// the order given, to the intrinsic and returns the resulting __m64.
//
// The directive comments inside the body look like FileCheck patterns
// (presumably matched against the emitted IR - confirm with the test's RUN
// line). Each directive has to sit on a line of its own: a `//` comment runs
// to the end of the line, so anything sharing a line with one - including the
// return statement and the closing brace - is commented out.
__m64 test_mm_setr_pi16(short a, short b, short c, short d) {
  // CHECK-LABEL: test_mm_setr_pi16
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  return _mm_setr_pi16(a, b, c, d);
}
15-Dec-09 initial coding gpk 16-Jan-11 compute rgba1 directly, not using delta_y; fix gpk reference URL 20-Feb-11 sum u_vec & v_vec before shifting to match order gpk of operations in scalar.c code ************************************************************************* */ void yuv422rgb_mmx1(const unsigned char * __restrict__ sourcep, int source_byte_count, unsigned char * __restrict__ destp) { const unsigned char *source_endp; const unsigned char *vector_endp; int remainder; const __m64 u_coeff = _mm_setr_pi16(0, -22, 113, 0); const __m64 v_coeff = _mm_setr_pi16(90, -46, 0, 0); __m64 y0_vec, y1_vec, u_vec, v_vec, uv_vec, rgba0, rgba1; short y0, u, y1, v; const unsigned char alpha = 255; /* we're working with things in 4-byte macropixels */ remainder = source_byte_count % 4; source_endp = sourcep + source_byte_count; vector_endp = source_endp - remainder; while (sourcep < vector_endp) { /* pull YUYV from the four byte macropixel starting at sourcep. */