/* Test the 128-bit form */
static void
ssse3_test_phsubd128 (int *i1, int *i2, int *r)
{
  /* Assumes incoming pointers are 16-byte aligned */
  __m128i t1 = *(__m128i *) i1;
  __m128i t2 = *(__m128i *) i2;
  *(__m128i *) r = _mm_hsub_epi32 (t1, t2);
}
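For reference, `_mm_hsub_epi32(a, b)` yields `{ a0-a1, a2-a3, b0-b1, b2-b3 }`. Below is a minimal sketch of a scalar model checked against the intrinsic; the helper name `scalar_phsubd128`, the driver, and the sample inputs are illustrative assumptions (not part of the GCC test), and it needs SSSE3 enabled (e.g. `-mssse3`).

#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3: _mm_hsub_epi32 */

/* Scalar model of the 128-bit phsubd: pairwise differences within each source. */
static void scalar_phsubd128 (const int *i1, const int *i2, int *r)
{
  r[0] = i1[0] - i1[1];
  r[1] = i1[2] - i1[3];
  r[2] = i2[0] - i2[1];
  r[3] = i2[2] - i2[3];
}

int main (void)
{
  int i1[4] = { 1, 2, 3, 4 };
  int i2[4] = { 10, 20, 30, 40 };
  int want[4], got[4];

  scalar_phsubd128 (i1, i2, want);

  /* Unaligned loads/stores here, unlike the aligned test above. */
  __m128i v = _mm_hsub_epi32 (_mm_loadu_si128 ((__m128i *) i1),
                              _mm_loadu_si128 ((__m128i *) i2));
  _mm_storeu_si128 ((__m128i *) got, v);

  for (int k = 0; k < 4; k++)
    printf ("%d %d\n", want[k], got[k]);   /* each pair should match: -1 -1 -10 -10 */
  return 0;
}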
void ahd_interpolate_tile(int top, char * buffer)
{
  int row, col, tr, tc, c, val;
  const int dir[4] = { -1, 1, -width, width };
  __m128i ldiff[2], abdiff[2];
  union hvrgbpix (*rgb)[width] = (union hvrgbpix (*)[width])buffer;
  union hvrgbpix *rix;
  union rgbpix * pix;
  union hvrgbpix (*lab)[width];
  short (*lix)[8];
  char (*homo)[width][2];
  lab  = (union hvrgbpix (*)[width])(buffer + 16*width*TS);
  homo = (char (*)[width][2])(buffer + 32*width*TS);

  const int left=2;

  if ((uintptr_t)(image+top*width)&0xf || (uintptr_t)buffer&0xf) {
    fprintf(stderr, "unaligned buffers defeat speed!\n");
    abort();
  }

  /* Interpolate green horizontally & vertically, interpolate red and blue, and convert to CIELab: */

  //do the first two rows of green first.
  //then one green, and rgb through the tile.. this is because R/B needs the down-right green value
  for (row=top; row < top+2 && row < height-2; row++) {
    col = left + (FC(row,left) & 1);
    for (c = FC(row,col); col < width-2; col+=2) {
      pix = (union rgbpix*)image + row*width+col;
      val = ((pix[-1].g + pix[0].c[c] + pix[1].g) * 2
             - pix[-2].c[c] - pix[2].c[c]) >> 2;
      rgb[row-top][col-left].h.g = ULIM(val,pix[-1].g,pix[1].g);
      val = ((pix[-width].g + pix[0].c[c] + pix[width].g) * 2
             - pix[-2*width].c[c] - pix[2*width].c[c]) >> 2;
      rgb[row-top][col-left].v.g = ULIM(val,pix[-width].g,pix[width].g);
    }
  }

  for (; row < top+TS && row < height-2; row++) {
    int rowx = row-1;

    if (FC(rowx,left+1)==1) {
      int c1 = FC(rowx+1,left+1),
          c2 = FC(rowx,left+2);

      pix = (union rgbpix*)image + row*width+left+1;
      rix = &rgb[row-top][1];

      val = ((pix[-1].g + pix[0].c[c1] + pix[1].g) * 2
             - pix[-2].c[c1] - pix[2].c[c1]) >> 2;
      rix[0].h.g = ULIM(val,pix[-1].g,pix[1].g);
      val = ((pix[-width].g + pix[0].c[c1] + pix[width].g) * 2
             - pix[-2*width].c[c1] - pix[2*width].c[c1]) >> 2;
      rix[0].v.g = ULIM(val,pix[-width].g,pix[width].g);

      for (col=left+1; col < width-3; col+=2) {
        pix = (union rgbpix*)image + rowx*width+col+1;

        union hvrgbpix rixr, rix0;

        rix = &rgb[rowx-top][col-left]+1;

        signed pix_diag = pix[-width-1].c[c1] + pix[-width+1].c[c1];
        signed pix_ul = pix[-width-1].c[c1];
        rixr.vec = _mm_set1_epi16(pix[-1].g);
        signed pix_lr = pix[-2].c[c2] + pix[0].c[c2];
        rix0.h.c[c2] = rix0.v.c[c2] = pix[0].c[c2];
        pix_diag += pix[width-1].c[c1] + pix[width+1].c[c1] + 1;
        signed pix_dl = pix[width-1].c[c1];

        //fully loaded
        __m128i rix_dr =               _mm_setr_epi32(pix[width].g,       pix[width-1].c[c1], pix[1].g,         pix[-width+1].c[c1]);
        rix_dr = _mm_add_epi32(rix_dr, _mm_setr_epi32(pix[width+1].c[c1], pix[width+3].c[c1], pix[width+1].c[c1], 0));
        rix_dr = _mm_add_epi32(rix_dr, _mm_setr_epi32(pix[width+2].g,     0,                  pix[2*width+1].g, pix[3*width+1].c[c1]));
        rix_dr = _mm_mullo_epi32(rix_dr,_mm_setr_epi32(2,1,2,1));
        //half loaded
        rix_dr = _mm_hsub_epi32(rix_dr,_mm_setzero_si128());
        rix_dr = _mm_srai_epi32(rix_dr,2);
        __m128i a = _mm_setr_epi32(pix[width].g,pix[1].g,0,0);
        __m128i b = _mm_setr_epi32(pix[width+2].g,pix[2*width+1].g,0,0);
        __m128i m = _mm_min_epi32(a,b);
        __m128i M = _mm_max_epi32(a,b);
        rix_dr = _mm_min_epi32(rix_dr,M);
        rix_dr = _mm_max_epi32(rix_dr,m);

        signed pix_udr = pix_ul + pix_dl;

        signed rix0_ul = rix[-width-1].h.g;
        signed rix1_ul = rix[-width-1].v.g;
        __m128i rix_ur = _mm_setr_epi32(rix[-width+1].h.g, rix[-width+1].v.g, 0, 0);
        signed rix0_rr = rix[-2].h.g;
        signed rix1_rr = rix[-2].v.g;

        rix0.h.g = rix[0].h.g;
        rix0.v.g = rix[0].v.g;
        signed rix0_dl = rix[width-1].h.g;
        signed rix1_dl = rix[width-1].v.g;

        // fully loaded
        __m128i rix_udr = _mm_setr_epi32(rix0_ul, rix1_ul, rix0_rr, rix1_rr);
        rix_udr = _mm_add_epi32(rix_udr, _mm_setr_epi32(rix0_dl, rix1_dl, rix0.h.g, rix0.v.g));

        __m128i v2 = _mm_set_epi32(pix_lr, pix_lr, pix_udr, pix_udr);
        v2 = _mm_sub_epi32(v2, rix_udr);
        v2 = _mm_srai_epi32(v2,1);
        v2 = _mm_add_epi32(v2,_mm_cvtepu16_epi32(rixr.vec));
        v2 = _mm_max_epi32(v2, _mm_setzero_si128());
        v2 = _mm_min_epi32(v2, _mm_set1_epi32(0xffff));
        rixr.h.c[c2] = _mm_extract_epi32(v2,2);
        rixr.v.c[c2] = _mm_extract_epi32(v2,3);
        rixr.h.c[c1] = _mm_extract_epi32(v2,0);
        rixr.v.c[c1] = _mm_extract_epi32(v2,1);

        // following only uses 64 bit
        __m128i v1 = _mm_set1_epi32(pix_diag);
        v1 = _mm_sub_epi32(v1, rix_ur);
        v1 = _mm_sub_epi32(v1, rix_dr);
        v1 = _mm_sub_epi32(v1, rix_udr);
        v1 = _mm_srai_epi32(v1,2);
        v1 = _mm_add_epi32(v1, _mm_setr_epi32(rix0.h.g, rix0.v.g, 0, 0));
        v1 = _mm_max_epi32(v1, _mm_setzero_si128());
        v1 = _mm_min_epi32(v1, _mm_set1_epi32(0xffff));
        rix0.h.c[c1] = _mm_extract_epi32(v1,0);
        rix0.v.c[c1] = _mm_extract_epi32(v1,1);

        lab[rowx-top][col-left].vec   = cielabv(rixr);
        lab[rowx-top][col-left+1].vec = cielabv(rix0);

        _mm_store_si128(&rix[-1].vec,rixr.vec);
        _mm_store_si128(&rix[0].vec,rix0.vec);

        rix[width+1].h.g = _mm_extract_epi32(rix_dr,0);
        rix[width+1].v.g = _mm_extract_epi32(rix_dr,1);
      }
    } else {
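The tile loop above (shown truncated at its `else` branch) builds `rix_dr` as (minuend, subtrahend) pairs and then calls `_mm_hsub_epi32` against zero, so only the pairwise differences survive in the low lanes; that is what the "half loaded" comment refers to. A minimal standalone sketch of that trick, with made-up lane values standing in for the pixel sums:

#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3: _mm_hsub_epi32 */

int main(void)
{
  /* Hypothetical {minuend0, subtrahend0, minuend1, subtrahend1} layout. */
  __m128i pairs = _mm_setr_epi32(200, 50, 120, 20);
  /* hsub against zero: low lanes get the two pairwise differences, high lanes 0. */
  __m128i diff  = _mm_hsub_epi32(pairs, _mm_setzero_si128());

  int out[4];
  _mm_storeu_si128((__m128i *)out, diff);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 150 100 0 0 */
  return 0;
}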
__m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_hsub_epi32
  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  return _mm_hsub_epi32(a, b);
}
/// CURRENTLY SAME CODE AS SCALAR !!
/// REPLACE HERE WITH SSE intrinsics
static void partialButterflyInverse16_simd(short *src, short *dst, int shift)
{
  int add = 1<<(shift-1);

  //we cast the original 16x16 matrix to a SIMD vector type
  __m128i *g_aiT16_vec = (__m128i *)g_aiT16;

  //we cast the input source (which is basically random numbers (see the main function for details)) to a SIMD vector type
  //we also cast the output to a SIMD vector type
  __m128i *in_vec  = (__m128i *) src;
  __m128i *out_vec = (__m128i *) dst;

  //we declare an 8x8 array and cast it to a SIMD vector type
  short gt[8][8] __attribute__ ((aligned (16)));
  __m128i *gt_vec = (__m128i *)gt;

  //we declare a 16x16 array and cast it to a SIMD vector type
  short random[16][16] __attribute__ ((aligned (16)));
  __m128i *random_vec = (__m128i *)random;

  trans_g_aiT16(g_aiT16_vec,gt_vec);

  tranpose8x8(in_vec,2, random_vec,0);
  tranpose8x8(in_vec,3, random_vec,8);
  tranpose8x8(in_vec,0, random_vec,16);
  tranpose8x8(in_vec,1, random_vec,24);

  for (int j=0; j<16; j++)
  {
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    __m128i I0  = _mm_load_si128 (&random_vec[j]);
    __m128i II0 = _mm_load_si128 (&random_vec[j+16]);

    //  for (int k=0; k<8; k++)
    //here we are loading up the transposed values in the initial matrix
    //multiplying it with the input numbers to produce intermediate 32-bit integers
    //we then sum up adjacent pairs of 32-bit integers and store them in the destination register
    __m128i I1  = _mm_load_si128 (&gt_vec[0]);
    __m128i I2  = _mm_madd_epi16 (I1, I0);
    __m128i I3  = _mm_load_si128 (&gt_vec[1]);
    __m128i I4  = _mm_madd_epi16 (I3, I0);
    __m128i I5  = _mm_load_si128 (&gt_vec[2]);
    __m128i I6  = _mm_madd_epi16 (I5, I0);
    __m128i I7  = _mm_load_si128 (&gt_vec[3]);
    __m128i I8  = _mm_madd_epi16 (I7, I0);
    __m128i I9  = _mm_load_si128 (&gt_vec[4]);
    __m128i I10 = _mm_madd_epi16 (I9, I0);
    __m128i I11 = _mm_load_si128 (&gt_vec[5]);
    __m128i I12 = _mm_madd_epi16 (I11, I0);
    __m128i I13 = _mm_load_si128 (&gt_vec[6]);
    __m128i I14 = _mm_madd_epi16 (I13, I0);
    __m128i I15 = _mm_load_si128 (&gt_vec[7]);
    __m128i I16 = _mm_madd_epi16 (I15, I0);

    //horizontally add the partial results obtained from the previous step
    __m128i A1 = _mm_hadd_epi32 (I2, I4);
    __m128i A2 = _mm_hadd_epi32 (I6, I8);
    __m128i R1 = _mm_hadd_epi32 (A1, A2);

    __m128i A3 = _mm_hadd_epi32 (I10, I12);
    __m128i A4 = _mm_hadd_epi32 (I14, I16);
    __m128i R2 = _mm_hadd_epi32 (A3, A4);

    //  O[k] = T[0]+T[1]+T[2]+T[3];
    //  for (int k=0; k<4; k++)
    //  {
    //load the original matrix values, multiply it with the random values
    //store the low bits to I2 and the hi bits to I3
    I1 = _mm_load_si128 (&gt_vec[8]);
    I2 = _mm_mullo_epi16 (I1, II0);
    I3 = _mm_mulhi_epi16 (I1, II0);
    __m128i lowI23 = _mm_unpacklo_epi16(I2,I3);
    __m128i hiI23  = _mm_unpackhi_epi16(I2,I3);
    __m128i temp1 = _mm_add_epi32(lowI23,hiI23);
    __m128i temp5 = _mm_hsub_epi32 (lowI23, hiI23);

    I4 = _mm_load_si128 (&gt_vec[9]);
    I5 = _mm_mullo_epi16 (I4, II0);
    I6 = _mm_mulhi_epi16 (I4, II0);
    __m128i lowI56 = _mm_unpacklo_epi16(I5,I6);
    __m128i hiI56  = _mm_unpackhi_epi16(I5,I6);
    __m128i temp2 = _mm_add_epi32(lowI56,hiI56);
    __m128i temp6 = _mm_hsub_epi32 (lowI56, hiI56);

    I7 = _mm_load_si128 (&gt_vec[10]);
    I8 = _mm_mullo_epi16 (I7, II0);
    I9 = _mm_mulhi_epi16 (I7, II0);
    __m128i lowI89 = _mm_unpacklo_epi16(I8,I9);
    __m128i hiI89  = _mm_unpackhi_epi16(I8,I9);
    __m128i temp3 = _mm_add_epi32(lowI89,hiI89);
    __m128i temp7 = _mm_hsub_epi32 (lowI89, hiI89);

    I10 = _mm_load_si128 (&gt_vec[11]);
    I11 = _mm_mullo_epi16 (I10, II0);
    I12 = _mm_mulhi_epi16 (I10, II0);
    __m128i lowI1112 = _mm_unpacklo_epi16(I11,I12);
    __m128i hiI1112  = _mm_unpackhi_epi16(I11,I12);
    __m128i temp4 = _mm_add_epi32(lowI1112,hiI1112);
    __m128i temp8 = _mm_hsub_epi32 (lowI1112, hiI1112);

    __m128i A5 = _mm_hadd_epi32 (temp1, temp2);
    __m128i A6 = _mm_hadd_epi32 (temp3, temp4);
    __m128i R3 = _mm_hadd_epi32 (A5, A6);

    __m128i A7 = _mm_hadd_epi32 (temp8, temp7);
    __m128i A8 = _mm_hadd_epi32 (temp6, temp5);
    __m128i R4 = _mm_hadd_epi32 (A7, A8);

    ///////////////////////////
    __m128i add_reg = _mm_set1_epi32(add);

    __m128i sum_vec0 = _mm_add_epi32(R3,R1);
    sum_vec0 = _mm_add_epi32(sum_vec0,add_reg);
    sum_vec0 = _mm_srai_epi32(sum_vec0, shift);  // shift right

    __m128i sum_vec1 = _mm_add_epi32(R4,R2);
    sum_vec1 = _mm_add_epi32(sum_vec1,add_reg);
    sum_vec1 = _mm_srai_epi32(sum_vec1, shift);  // shift right

    __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1);  // shrink packed 32-bit to packed 16-bit and saturate
    _mm_store_si128 (&out_vec[2*j], finalres0);

    __m128i sum_vec2 = _mm_sub_epi32(R4, R2);
    sum_vec2 = _mm_add_epi32(sum_vec2,add_reg);
    sum_vec2 = _mm_srai_epi32(sum_vec2, shift);  // shift right

    __m128i sum_vec3 = _mm_sub_epi32(R3, R1);
    sum_vec3 = _mm_add_epi32(sum_vec3,add_reg);
    sum_vec3 = _mm_srai_epi32(sum_vec3, shift);  // shift right

    I5  = _mm_unpackhi_epi32(sum_vec2, sum_vec3);
    I6  = _mm_unpacklo_epi32(sum_vec2, sum_vec3);
    I7  = _mm_unpackhi_epi32(I5, I6);
    I8  = _mm_unpacklo_epi32(I5, I6);
    I9  = _mm_unpacklo_epi32(I7, I8);
    I10 = _mm_unpackhi_epi32(I7, I8);

    sum_vec3 = _mm_packs_epi32(I9, I10);  // shrink packed 32-bit to packed 16-bit and saturate
    _mm_store_si128 (&out_vec[2*j+1], sum_vec3);
  }
}
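The inverse butterfly above widens 16-bit coefficient products by pairing `_mm_mullo_epi16`/`_mm_mulhi_epi16` and interleaving them with the unpack intrinsics, then folds the resulting 32-bit products with `_mm_add_epi32` and `_mm_hsub_epi32` (the `temp1`/`temp5` pattern). Below is a self-contained sketch of just that pattern with arbitrary sample inputs, not data from the transform; it assumes SSSE3 is enabled.

#include <stdio.h>
#include <tmmintrin.h>   /* SSSE3: _mm_hsub_epi32 */

int main(void)
{
  short a[8] = { 3, -7, 11, 5, -2, 9, 4, -6 };
  short b[8] = { 8,  2, -3, 7, 10, 1, -5, 12 };

  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);

  __m128i lo16 = _mm_mullo_epi16(va, vb);          /* low 16 bits of a[i]*b[i]      */
  __m128i hi16 = _mm_mulhi_epi16(va, vb);          /* signed high 16 bits           */
  __m128i p03  = _mm_unpacklo_epi16(lo16, hi16);   /* full 32-bit products 0..3     */
  __m128i p47  = _mm_unpackhi_epi16(lo16, hi16);   /* full 32-bit products 4..7     */

  __m128i sums  = _mm_add_epi32(p03, p47);         /* p[i] + p[i+4]                 */
  __m128i diffs = _mm_hsub_epi32(p03, p47);        /* {p0-p1, p2-p3, p4-p5, p6-p7}  */

  int s[4], d[4];
  _mm_storeu_si128((__m128i *)s, sums);
  _mm_storeu_si128((__m128i *)d, diffs);

  for (int i = 0; i < 4; i++)
    printf("sum[%d]=%d diff[%d]=%d\n", i, s[i], i, d[i]);
  return 0;
}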