int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, // must be a multiple of 8
                    uint8_t output_shift)
{
  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128, *y128, mmtmp1, mmtmp2, mmtmp3, mmcumul, mmcumul_re, mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n = 0; n < (N >> 2); n++) {
    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    // print_shorts("x",&x128[0]);
    // print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0], y128[0]);
    // print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1, output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re, mmtmp1);
    // print_ints("re",&mmcumul_re);

    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0], _MM_SHUFFLE(2, 3, 0, 1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2, _MM_SHUFFLE(2, 3, 0, 1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2, minus_i);
    // print_shorts("y",&mmtmp2);

    mmtmp3 = _mm_madd_epi16(x128[0], mmtmp2);
    // print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3, output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im, mmtmp3);
    // print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re, mmcumul_im);
  // print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul, mmcumul);
  // print_ints("cumul2",&mmcumul);

  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  // print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7, mmtmp7);
  // print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return (result);

#elif defined(__arm__)
  int16x4_t *x_128 = (int16x4_t*)x;
  int16x4_t *y_128 = (int16x4_t*)y;
  int32x4_t tmp_re, tmp_im;
  int32x4_t tmp_re1, tmp_im1;
  int32x4_t re_cumul, im_cumul;
  int32x2_t re_cumul2, im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift);
  int32x2x2_t result2;
  int16_t conjug[4] __attribute__((aligned(16))) = {-1, 1, -1, 1};

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0);

  for (n = 0; n < (N >> 2); n++) {
    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re  = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),  vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1), vget_high_s32(tmp_re1)));
    //tmp_re  = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++, *(int16x4_t*)conjug)), *y_128++);
    //tmp_im  = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++, *(int16x4_t*)conjug)), *y_128++);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),  vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1), vget_high_s32(tmp_im1)));
    //tmp_im  = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    re_cumul = vqaddq_s32(re_cumul, vqshlq_s32(tmp_re, shift));
    im_cumul = vqaddq_s32(im_cumul, vqshlq_s32(tmp_im, shift));
  }

  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul), vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul), vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2, re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2, im_cumul2);
  result2   = vzip_s32(re_cumul2, im_cumul2);
  return (vget_lane_s32(result2.val[0], 0));
#endif
}
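For cross-checking the SIMD paths above, a plain scalar version of the same accumulation is handy. This is only a sketch under the assumption that N counts complex samples stored as interleaved 16-bit re/im pairs (as the SSE loop, which consumes four complex samples per iteration, suggests); the function name is illustrative and the 16-bit saturation performed by _mm_packs_pi32 is ignored.

#include <stdint.h>

/* Illustrative scalar cross-check for dot_product(); not part of the original code. */
static int32_t dot_product_scalar(const int16_t *x, const int16_t *y,
                                  uint32_t N, uint8_t output_shift)
{
  int32_t re = 0, im = 0;
  for (uint32_t n = 0; n < N; n++) {
    int32_t xr = x[2 * n], xi = x[2 * n + 1];
    int32_t yr = y[2 * n], yi = y[2 * n + 1];
    /* arithmetic right shift assumed, matching _mm_srai_epi32 */
    re += (xr * yr + xi * yi) >> output_shift;  /* Re(x)Re(y) + Im(x)Im(y) */
    im += (xr * yi - xi * yr) >> output_shift;  /* Re(x)Im(y) - Im(x)Re(y) */
  }
  /* Pack like the SIMD code: Re in the low 16 bits, Im in the high 16 bits. */
  return (int32_t)(((uint32_t)(uint16_t)im << 16) | (uint16_t)re);
}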
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS],
                                     const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();

  // Load data into four SSE registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data;  // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10);  // Clear dataFlat[27..31]

  for (int i = 0; i < FitMatrix::ROWS; i++) {
    // Compute scalar product between ((short*)x)[0..31] and localFitMatrix
    __m128i sum = _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
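The two routines above blend two high-bit-depth predictors with a 0..64 mask, round, and then accumulate the absolute differences against the source. A scalar model of that computation, assuming libaom's usual constant AOM_BLEND_A64_ROUND_BITS == 6 (so the mask maximum is 64); the function is purely illustrative, not the libaom reference:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar model of the masked SAD above. */
static unsigned int masked_sad_scalar(const uint16_t *src, int src_stride,
                                      const uint16_t *a, int a_stride,
                                      const uint16_t *b, int b_stride,
                                      const uint8_t *m, int m_stride,
                                      int width, int height)
{
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      const int mask = m[y * m_stride + x];  /* 0..64 */
      const int pred = (a[y * a_stride + x] * mask +
                        b[y * b_stride + x] * (64 - mask) + 32) >> 6;
      sad += abs(pred - src[y * src_stride + x]);
    }
  }
  return (sad + 31) >> 6;  /* same final rounding as the intrinsics versions */
}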
template <bool align> SIMD_INLINE __m128i LoadAndConvertY16(const __m128i * bgra, __m128i & b16_r16, __m128i & g16_1)
{
    __m128i _b16_r16[2], _g16_1[2];
    LoadPreparedBgra16<align>(bgra + 0, _b16_r16[0], _g16_1[0]);
    LoadPreparedBgra16<align>(bgra + 1, _b16_r16[1], _g16_1[1]);
    b16_r16 = _mm_hadd_epi32(_b16_r16[0], _b16_r16[1]);
    g16_1 = _mm_hadd_epi32(_g16_1[0], _g16_1[1]);
    return SaturateI16ToU8(_mm_add_epi16(K16_Y_ADJUST,
        _mm_packs_epi32(BgrToY32(_b16_r16[0], _g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1]))));
}
int vector_ps_short(const short* pa, const short* pb, size_t n)
{
    size_t k;
    size_t q = n / 16;
    size_t r = n % 16;
    int w;

    if (q > 0) {
        __m128i acc1 = _mm_setzero_si128();
        __m128i acc2 = _mm_setzero_si128();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (k = 0; k < q; k++) {
                /* Load 16 words from each array */
                __m128i a1 = _mm_load_si128((__m128i*)pa);
                __m128i b1 = _mm_load_si128((__m128i*)pb);
                __m128i a2 = _mm_load_si128((__m128i*)(pa + 8));
                __m128i b2 = _mm_load_si128((__m128i*)(pb + 8));
                /* Multiply, sum and convert to double words */
                __m128i s1 = _mm_madd_epi16(a1, b1);
                __m128i s2 = _mm_madd_epi16(a2, b2);
                pa += 16;
                pb += 16;
                /* Accumulate */
                acc1 = _mm_add_epi32(acc1, s1);
                acc2 = _mm_add_epi32(acc2, s2);
            }
        }
        else {
            for (k = 0; k < q; k++) {
                /* Load 16 words from each array */
                __m128i a1 = _mm_loadu_si128((__m128i*)pa);
                __m128i b1 = _mm_loadu_si128((__m128i*)pb);
                __m128i a2 = _mm_loadu_si128((__m128i*)(pa + 8));
                __m128i b2 = _mm_loadu_si128((__m128i*)(pb + 8));
                /* Multiply, sum and convert to double words */
                __m128i s1 = _mm_madd_epi16(a1, b1);
                __m128i s2 = _mm_madd_epi16(a2, b2);
                pa += 16;
                pb += 16;
                /* Accumulate */
                acc1 = _mm_add_epi32(acc1, s1);
                acc2 = _mm_add_epi32(acc2, s2);
            }
        }
        /* Final sum */
        acc1 = _mm_add_epi32(acc1, acc2);
        acc1 = _mm_hadd_epi32(acc1, acc1);
        acc1 = _mm_hadd_epi32(acc1, acc1);
        w = _mm_extract_epi32(acc1, 0);
    }
    else {
        w = 0;
    }
    /* Remainder */
    for (k = 0; k < r; k++)
        w += (*pa++) * (*pb++);
    return w;
}
__m128i kvz_eight_tap_filter_x4_and_flip_16bit(__m128i *data0, __m128i *data1, __m128i *data2, __m128i *data3, __m128i *filter)
{
  __m128i a, b, c, d;
  __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter)));

  a = _mm_madd_epi16(*data0, fir);
  b = _mm_madd_epi16(*data1, fir);
  a = _mm_hadd_epi32(a, b);

  c = _mm_madd_epi16(*data2, fir);
  d = _mm_madd_epi16(*data3, fir);
  c = _mm_hadd_epi32(c, d);

  a = _mm_hadd_epi32(a, c);

  return a;
}
/* Test the 128-bit form */
static void
ssse3_test_phaddd128 (int *i1, int *i2, int *r)
{
  /* Assumes incoming pointers are 16-byte aligned */
  __m128i t1 = *(__m128i *) i1;
  __m128i t2 = *(__m128i *) i2;
  *(__m128i *) r = _mm_hadd_epi32 (t1, t2);
}
// credit: Harold Aptroot
uint32_t maskedvectorsum(uint32_t * z, uint32_t N, uint32_t * accesses,
                         uint32_t nmbr) {
  __m256i Nvec = _mm256_set1_epi32(N - 1);
  __m256i sum = _mm256_setzero_si256();
  for(uint32_t j = 0; j < nmbr; j += 8) {
    __m256i indexes = _mm256_loadu_si256((__m256i*)(accesses + j));
    indexes = _mm256_and_si256(indexes, Nvec);
    __m256i fi = _mm256_i32gather_epi32((int*)z, indexes, 4);
    sum = _mm256_add_epi32(sum, fi);
  }
  __m128i sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 0),
                                 _mm256_extracti128_si256(sum, 1));
  sum128 = _mm_hadd_epi32(sum128, sum128);
  return _mm_extract_epi32(sum128, 0) + _mm_extract_epi32(sum128, 1);
}
// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
  int max_offset = n - 8;
  int offset = 0;
  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
  // values, extending to 16 bit, multiplying to make 32 bit results.
  int32_t result = 0;
  if (offset <= max_offset) {
    offset = 8;
    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
    __m128i sum = _mm_cvtepi8_epi16(packed1);
    packed2 = _mm_cvtepi8_epi16(packed2);
    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
    // ints to make 32 bit results, which are then horizontally added in pairs
    // to make 4 32 bit results that still fit in a 128 bit register.
    sum = _mm_madd_epi16(sum, packed2);
    while (offset <= max_offset) {
      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
      offset += 8;
      packed1 = _mm_cvtepi8_epi16(packed1);
      packed2 = _mm_cvtepi8_epi16(packed2);
      packed1 = _mm_madd_epi16(packed1, packed2);
      sum = _mm_add_epi32(sum, packed1);
    }
    // Sum the 4 packed 32 bit sums and extract the low result.
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    result = _mm_cvtsi128_si32(sum);
  }
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
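A quick way to validate the SSE routine is to compare it against the scalar loop it already falls back to for the tail. The harness below is hypothetical (it assumes IntDotProductSSE is visible in the same translation unit) and exercises both the SIMD body and the scalar remainder:

#include <stdint.h>
#include <stdio.h>

static int32_t IntDotProductScalar(const int8_t* u, const int8_t* v, int n) {
  int32_t sum = 0;
  for (int i = 0; i < n; ++i) sum += u[i] * v[i];
  return sum;
}

int main(void) {
  int8_t u[19], v[19];
  for (int i = 0; i < 19; ++i) {
    u[i] = (int8_t)(i - 9);
    v[i] = (int8_t)(3 * i - 20);
  }
  // n = 19 runs two 8-element SIMD blocks plus a 3-element scalar tail.
  int32_t simd = IntDotProductSSE(u, v, 19);
  int32_t ref = IntDotProductScalar(u, v, 19);
  printf("%d %d %s\n", simd, ref, simd == ref ? "OK" : "MISMATCH");
  return 0;
}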
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride;  // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // convert RGBA RGBA RGBA RGBA to RRRR GGGG BBBB AAAA (AoS to SoA)
  __m128i p1234 = _mm_unpacklo_epi8(p12, p34);
  __m128i p34xx = _mm_unpackhi_epi64(p1234, _mm_setzero_si128());
  __m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

  // extend to 16 bit
  __m128i pRG = _mm_unpacklo_epi8(p1234_8bit, _mm_setzero_si128());
  __m128i pBA = _mm_unpackhi_epi8(p1234_8bit, _mm_setzero_si128());

  // convert weights to integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);     // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, weighti);   // 32 -> 2x16 bit

  // outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
  __m128i outRG = _mm_madd_epi16(pRG, weighti);
  // outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
  __m128i outBA = _mm_madd_epi16(pBA, weighti);

  // horizontal add that will produce the output values (in 32 bit)
  __m128i out = _mm_hadd_epi32(outRG, outBA);
  out = _mm_srli_epi32(out, 8);  // divide by 256

  // convert 32 bit -> 8 bit
  out = _mm_packus_epi32(out, _mm_setzero_si128());
  out = _mm_packus_epi16(out, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(out);
}
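Assuming CalcWeights() returns the standard bilinear weights (1-fx)(1-fy), fx(1-fy), (1-fx)fy and fx*fy (that helper is not shown here, so this is an assumption), each output channel reduces to a weighted sum of four 8-bit samples scaled by 256, which is exactly what the madd/hadd pair above computes:

/* Scalar model of one output channel of GetPixelSSE3, assuming integer
 * weights w1..w4 that sum to 256 (hence the final >> 8). Illustrative only. */
static unsigned char bilinear_channel(unsigned char c1, unsigned char c2,
                                      unsigned char c3, unsigned char c4,
                                      int w1, int w2, int w3, int w4)
{
  return (unsigned char)((c1 * w1 + c2 * w2 + c3 * w3 + c4 * w4) >> 8);
}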
int oneThread(int threadId)
{
  int *aa;
  int *bb;
  int k;
  int itr;

  aa = (int *)_mm_malloc(sizeof(int) * ARRAY_SIZE, 16);
  bb = (int *)_mm_malloc(sizeof(int) * ARRAY_SIZE, 16);
  // memset fills bytes, so every int in aa becomes 0x01010101 and in bb 0x02020202.
  memset(&aa[0], 1, ARRAY_SIZE * 4);
  memset(&bb[0], 2, ARRAY_SIZE * 4);

  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
  __m128i a4, a5, a6, a7, b4, b5, b6, b7;
  __m128i c0, c1, c2, c3;
  __m128i c4, c5, c6, c7;
  __m128i cc;

  cc = _mm_set_epi32(0, 0, 0, 0);

  for (k = 0; k < REPS; k++) {
    for (itr = 0; itr < ARRAY_SIZE; itr += 32) {
      a0 = _mm_load_si128((__m128i*)&aa[itr]);
      a1 = _mm_load_si128((__m128i*)&aa[itr + 4]);
      a2 = _mm_load_si128((__m128i*)&aa[itr + 8]);
      a3 = _mm_load_si128((__m128i*)&aa[itr + 12]);
      a4 = _mm_load_si128((__m128i*)&aa[itr + 16]);
      a5 = _mm_load_si128((__m128i*)&aa[itr + 20]);
      a6 = _mm_load_si128((__m128i*)&aa[itr + 24]);
      a7 = _mm_load_si128((__m128i*)&aa[itr + 28]);

      b0 = _mm_load_si128((__m128i*)&bb[itr]);
      b1 = _mm_load_si128((__m128i*)&bb[itr + 4]);
      b2 = _mm_load_si128((__m128i*)&bb[itr + 8]);
      b3 = _mm_load_si128((__m128i*)&bb[itr + 12]);
      b4 = _mm_load_si128((__m128i*)&bb[itr + 16]);
      b5 = _mm_load_si128((__m128i*)&bb[itr + 20]);
      b6 = _mm_load_si128((__m128i*)&bb[itr + 24]);
      b7 = _mm_load_si128((__m128i*)&bb[itr + 28]);

      // _mm_mul_epi32 multiplies only the even (0 and 2) 32-bit lanes,
      // producing two 64-bit products per register.
      c0 = _mm_mul_epi32(a0, b0);
      c1 = _mm_mul_epi32(a1, b1);
      c2 = _mm_mul_epi32(a2, b2);
      c3 = _mm_mul_epi32(a3, b3);
      c4 = _mm_mul_epi32(a4, b4);
      c5 = _mm_mul_epi32(a5, b5);
      c6 = _mm_mul_epi32(a6, b6);
      c7 = _mm_mul_epi32(a7, b7);

      c0 = _mm_add_epi32(c0, c1);
      c1 = _mm_add_epi32(c2, c3);
      c2 = _mm_add_epi32(c4, c5);
      c3 = _mm_add_epi32(c6, c7);
      c0 = _mm_add_epi32(c0, c1);
      c1 = _mm_add_epi32(c2, c3);
      c0 = _mm_add_epi32(c0, c1);
      cc = _mm_add_epi32(cc, c0);
    }
  }

  cc = _mm_hadd_epi32(cc, cc);
  cc = _mm_hadd_epi32(cc, cc);

  int count = 0;
  count = _mm_cvtsi128_si32(cc);

  // Memory from _mm_malloc must be released with _mm_free, not free().
  _mm_free(aa);
  _mm_free(bb);
  return count;
}
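As noted in the comment above, _mm_mul_epi32 only multiplies the even lanes into 64-bit products, so the kernel above is really a throughput benchmark rather than a full lane-wise product. If an element-wise 32-bit dot product is actually wanted, a sketch using _mm_mullo_epi32 (SSE4.1; my substitution, not part of the original benchmark) could look like this:

#include <smmintrin.h>  /* SSE4.1, also pulls in the SSSE3 hadd intrinsics */
#include <stdint.h>

/* Sketch: lane-wise 32-bit dot product, n assumed to be a multiple of 4. */
static int32_t dot_i32_sse41(const int32_t *a, const int32_t *b, int n)
{
  __m128i acc = _mm_setzero_si128();
  for (int i = 0; i < n; i += 4) {
    __m128i va = _mm_loadu_si128((const __m128i *)(a + i));
    __m128i vb = _mm_loadu_si128((const __m128i *)(b + i));
    acc = _mm_add_epi32(acc, _mm_mullo_epi32(va, vb));  /* low 32 bits of each product */
  }
  acc = _mm_hadd_epi32(acc, acc);  /* same horizontal reduction as above */
  acc = _mm_hadd_epi32(acc, acc);
  return _mm_cvtsi128_si32(acc);
}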
inline __m128i
foo5 (__m128i x, __m128i y)
{
  return _mm_hadd_epi32 (x, y);
}
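As a reminder of the semantics this wrapper exposes, here is a scalar model of _mm_hadd_epi32, with lanes listed from low to high (an illustrative helper, not a real intrinsic):

#include <stdint.h>

/* r = _mm_hadd_epi32(x, y): adjacent pairs of x fill the low two lanes,
 * adjacent pairs of y fill the high two lanes. */
static void hadd_epi32_model(const int32_t x[4], const int32_t y[4], int32_t r[4])
{
  r[0] = x[0] + x[1];
  r[1] = x[2] + x[3];
  r[2] = y[0] + y[1];
  r[3] = y[2] + y[3];
}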
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) { const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; unsigned partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { const unsigned threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order); if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); unsigned e1, e3; end += default_partition_samples; e1 = (residual_sample + 3) & ~3; e3 = end & ~3; if(e1 > end) e1 = end; /* try flac -l 1 -b 16 and you'll be here */ /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */ for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=4) { __m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample))); mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); mm_sum = _mm_add_epi32(mm_sum, mm_res); } mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); } } else { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); unsigned e1, e3; end += default_partition_samples; e1 = (residual_sample + 1) & ~1; e3 = end & ~1; FLAC__ASSERT(e1 <= end); for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); /* 0 0 0 |r0| == 00 |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=2) { __m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /* 0 0 |r1| |r0| */ mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); mm_sum = _mm_add_epi64(mm_sum, mm_res); } mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8)); _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum); } } } /* now merge partitions for lower orders */ { unsigned from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { unsigned i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } }
void precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) { const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; unsigned partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order); unsigned e1, e3; __m128i mm_res, mm_sum; if(bps <= 16) { FLAC__uint32 abs_residual_partition_sum; for(partition = residual_sample = 0; partition < partitions; partition++) { end += default_partition_samples; abs_residual_partition_sum = 0; mm_sum = _mm_setzero_si128(); e1 = (residual_sample + 3) & ~3; e3 = end & ~3; if(e1 > end) e1 = end; /* try flac -l 1 -b 16 and you'll be here */ /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast*/ for( ; residual_sample < e1; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */ for( ; residual_sample < e3; residual_sample+=4) { mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample)); mm_res = _mm_abs_epi32(mm_res); mm_sum = _mm_add_epi32(mm_sum, mm_res); } mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); abs_residual_partition_sum += _mm_cvtsi128_si32(mm_sum); for( ; residual_sample < end; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); abs_residual_partition_sums[partition] = abs_residual_partition_sum; } } else { /* have to pessimistically use 64 bits for accumulator */ FLAC__uint64 abs_residual_partition_sum; for(partition = residual_sample = 0; partition < partitions; partition++) { end += default_partition_samples; abs_residual_partition_sum = 0; mm_sum = _mm_setzero_si128(); e1 = (residual_sample + 1) & ~1; e3 = end & ~1; FLAC__ASSERT(e1 <= end); for( ; residual_sample < e1; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); for( ; residual_sample < e3; residual_sample+=2) { mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /* 0 0 r1 r0 */ mm_res = _mm_abs_epi32(mm_res); /* 0 0 |r1| |r0| */ mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8)); #ifdef FLAC__CPU_IA32 #ifdef _MSC_VER abs_residual_partition_sum += mm_sum.m128i_u64[0]; #else { FLAC__uint64 tmp[2]; _mm_storel_epi64((__m128i *)tmp, mm_sum); abs_residual_partition_sum += tmp[0]; } #endif #else abs_residual_partition_sum += _mm_cvtsi128_si64(mm_sum); #endif for( ; residual_sample < end; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); abs_residual_partition_sums[partition] = abs_residual_partition_sum; } } } /* now merge partitions for lower orders */ { unsigned from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { unsigned i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + 
abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } }
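The pattern that closes most of these kernels (two back-to-back _mm_hadd_epi32 calls followed by _mm_cvtsi128_si32) is simply a horizontal sum of the four 32-bit lanes. Factored into a helper with an illustrative name, not taken from any of the sources above:

#include <tmmintrin.h>  /* SSSE3 */
#include <stdint.h>

/* Illustrative helper: sum the four 32-bit lanes of v.
 * After the first hadd the lanes are [v0+v1, v2+v3, v0+v1, v2+v3];
 * after the second, every lane holds v0+v1+v2+v3. */
static inline int32_t hsum_epi32_ssse3(__m128i v)
{
  v = _mm_hadd_epi32(v, v);
  v = _mm_hadd_epi32(v, v);
  return _mm_cvtsi128_si32(v);
}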
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load, combine and transpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_cvtepu8_epi16(transpose1_0); tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8)); tmp_2 = _mm_cvtepu8_epi16(transpose1_1); tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8)); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); A_b0 = _mm_abs_epi16(A_b0); A_b2 = _mm_abs_epi16(A_b2); B_b0 = _mm_abs_epi16(B_b0); B_b2 = _mm_abs_epi16(B_b2); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); // cascading summation of the differences B_b0 = _mm_hadd_epi32(A_b2, A_b2); B_b2 = _mm_hadd_epi32(B_b0, B_b0); return _mm_cvtsi128_si32(B_b2); } }
/// CURRENTLY SAME CODE AS SCALAR !!
/// REPLACE HERE WITH SSE intrinsics
static void partialButterflyInverse16_simd(short *src, short *dst, int shift)
{
  int add = 1 << (shift - 1);

  // we cast the original 16x16 matrix to an SIMD vector type
  __m128i *g_aiT16_vec = (__m128i *)g_aiT16;

  // We cast the input source (which is basically random numbers, see the main
  // function for details) to an SIMD vector type.
  // We also cast the output to an SIMD vector type.
  __m128i *in_vec = (__m128i *)src;
  __m128i *out_vec = (__m128i *)dst;

  // we declare an 8x8 array and cast it to an SIMD vector type
  short gt[8][8] __attribute__((aligned(16)));
  __m128i *gt_vec = (__m128i *)gt;

  // we declare a 16x16 array and cast it to an SIMD vector type
  short random[16][16] __attribute__((aligned(16)));
  __m128i *random_vec = (__m128i *)random;

  trans_g_aiT16(g_aiT16_vec, gt_vec);
  tranpose8x8(in_vec, 2, random_vec, 0);
  tranpose8x8(in_vec, 3, random_vec, 8);
  tranpose8x8(in_vec, 0, random_vec, 16);
  tranpose8x8(in_vec, 1, random_vec, 24);

  for (int j = 0; j < 16; j++) {
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    __m128i I0  = _mm_load_si128(&random_vec[j]);
    __m128i II0 = _mm_load_si128(&random_vec[j + 16]);

    // for (int k=0; k<8; k++)
    // here we load the transposed values of the initial matrix,
    // multiply them with the input numbers to produce intermediate 32-bit integers,
    // then sum up adjacent pairs of 32-bit integers and store them in the destination register
    __m128i I1  = _mm_load_si128(&gt_vec[0]);
    __m128i I2  = _mm_madd_epi16(I1, I0);
    __m128i I3  = _mm_load_si128(&gt_vec[1]);
    __m128i I4  = _mm_madd_epi16(I3, I0);
    __m128i I5  = _mm_load_si128(&gt_vec[2]);
    __m128i I6  = _mm_madd_epi16(I5, I0);
    __m128i I7  = _mm_load_si128(&gt_vec[3]);
    __m128i I8  = _mm_madd_epi16(I7, I0);
    __m128i I9  = _mm_load_si128(&gt_vec[4]);
    __m128i I10 = _mm_madd_epi16(I9, I0);
    __m128i I11 = _mm_load_si128(&gt_vec[5]);
    __m128i I12 = _mm_madd_epi16(I11, I0);
    __m128i I13 = _mm_load_si128(&gt_vec[6]);
    __m128i I14 = _mm_madd_epi16(I13, I0);
    __m128i I15 = _mm_load_si128(&gt_vec[7]);
    __m128i I16 = _mm_madd_epi16(I15, I0);

    // horizontally add the partial results obtained from the previous step
    __m128i A1 = _mm_hadd_epi32(I2, I4);
    __m128i A2 = _mm_hadd_epi32(I6, I8);
    __m128i R1 = _mm_hadd_epi32(A1, A2);
    __m128i A3 = _mm_hadd_epi32(I10, I12);
    __m128i A4 = _mm_hadd_epi32(I14, I16);
    __m128i R2 = _mm_hadd_epi32(A3, A4);

    // O[k] = T[0]+T[1]+T[2]+T[3];
    // for (int k=0; k<4; k++)
    // {
    // load the original matrix values, multiply them with the random values,
    // store the low bits to I2 and the high bits to I3
    I1 = _mm_load_si128(&gt_vec[8]);
    I2 = _mm_mullo_epi16(I1, II0);
    I3 = _mm_mulhi_epi16(I1, II0);
    __m128i lowI23 = _mm_unpacklo_epi16(I2, I3);
    __m128i hiI23  = _mm_unpackhi_epi16(I2, I3);
    __m128i temp1  = _mm_add_epi32(lowI23, hiI23);
    __m128i temp5  = _mm_hsub_epi32(lowI23, hiI23);

    I4 = _mm_load_si128(&gt_vec[9]);
    I5 = _mm_mullo_epi16(I4, II0);
    I6 = _mm_mulhi_epi16(I4, II0);
    __m128i lowI56 = _mm_unpacklo_epi16(I5, I6);
    __m128i hiI56  = _mm_unpackhi_epi16(I5, I6);
    __m128i temp2  = _mm_add_epi32(lowI56, hiI56);
    __m128i temp6  = _mm_hsub_epi32(lowI56, hiI56);

    I7 = _mm_load_si128(&gt_vec[10]);
    I8 = _mm_mullo_epi16(I7, II0);
    I9 = _mm_mulhi_epi16(I7, II0);
    __m128i lowI89 = _mm_unpacklo_epi16(I8, I9);
    __m128i hiI89  = _mm_unpackhi_epi16(I8, I9);
    __m128i temp3  = _mm_add_epi32(lowI89, hiI89);
    __m128i temp7  = _mm_hsub_epi32(lowI89, hiI89);

    I10 = _mm_load_si128(&gt_vec[11]);
    I11 = _mm_mullo_epi16(I10, II0);
    I12 = _mm_mulhi_epi16(I10, II0);
    __m128i lowI1112 = _mm_unpacklo_epi16(I11, I12);
    __m128i hiI1112  = _mm_unpackhi_epi16(I11, I12);
    __m128i temp4    = _mm_add_epi32(lowI1112, hiI1112);
    __m128i temp8    = _mm_hsub_epi32(lowI1112, hiI1112);

    __m128i A5 = _mm_hadd_epi32(temp1, temp2);
    __m128i A6 = _mm_hadd_epi32(temp3, temp4);
    __m128i R3 = _mm_hadd_epi32(A5, A6);
    __m128i A7 = _mm_hadd_epi32(temp8, temp7);
    __m128i A8 = _mm_hadd_epi32(temp6, temp5);
    __m128i R4 = _mm_hadd_epi32(A7, A8);

    ///////////////////////////
    __m128i add_reg = _mm_set1_epi32(add);

    __m128i sum_vec0 = _mm_add_epi32(R3, R1);
    sum_vec0 = _mm_add_epi32(sum_vec0, add_reg);
    sum_vec0 = _mm_srai_epi32(sum_vec0, shift);  // shift right

    __m128i sum_vec1 = _mm_add_epi32(R4, R2);
    sum_vec1 = _mm_add_epi32(sum_vec1, add_reg);
    sum_vec1 = _mm_srai_epi32(sum_vec1, shift);  // shift right

    __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1);  // shrink packed 32 bit to packed 16 bit and saturate
    _mm_store_si128(&out_vec[2 * j], finalres0);

    __m128i sum_vec2 = _mm_sub_epi32(R4, R2);
    sum_vec2 = _mm_add_epi32(sum_vec2, add_reg);
    sum_vec2 = _mm_srai_epi32(sum_vec2, shift);  // shift right

    __m128i sum_vec3 = _mm_sub_epi32(R3, R1);
    sum_vec3 = _mm_add_epi32(sum_vec3, add_reg);
    sum_vec3 = _mm_srai_epi32(sum_vec3, shift);  // shift right

    I5 = _mm_unpackhi_epi32(sum_vec2, sum_vec3);
    I6 = _mm_unpacklo_epi32(sum_vec2, sum_vec3);
    I7 = _mm_unpackhi_epi32(I5, I6);
    I8 = _mm_unpacklo_epi32(I5, I6);
    I9  = _mm_unpacklo_epi32(I7, I8);
    I10 = _mm_unpackhi_epi32(I7, I8);

    sum_vec3 = _mm_packs_epi32(I9, I10);  // shrink packed 32 bit to packed 16 bit and saturate
    _mm_store_si128(&out_vec[2 * j + 1], sum_vec3);
  }
}
__m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_hadd_epi32
  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
  return _mm_hadd_epi32(a, b);
}
float vector_cos_short(const short* pa, const short* pb, size_t n)
{
    size_t k;
    double norm;
    size_t q = n / 16;
    size_t r = n % 16;
    int ps, na, nb;

    if (q > 0) {
        __m128i acc;
        __m128i acc_ps1 = _mm_setzero_si128();
        __m128i acc_ps2 = _mm_setzero_si128();
        __m128i acc_na1 = _mm_setzero_si128();
        __m128i acc_na2 = _mm_setzero_si128();
        __m128i acc_nb1 = _mm_setzero_si128();
        __m128i acc_nb2 = _mm_setzero_si128();
        if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
            for (k = 0; k < q; k++) {
                /* Load 16 words from each array */
                __m128i a1 = _mm_load_si128((__m128i*)pa);
                __m128i b1 = _mm_load_si128((__m128i*)pb);
                __m128i a2 = _mm_load_si128((__m128i*)(pa + 8));
                __m128i b2 = _mm_load_si128((__m128i*)(pb + 8));
                /* Multiply, sum and convert to double words */
                __m128i ps1 = _mm_madd_epi16(a1, b1);
                __m128i ps2 = _mm_madd_epi16(a2, b2);
                __m128i na1 = _mm_madd_epi16(a1, a1);
                __m128i na2 = _mm_madd_epi16(a2, a2);
                __m128i nb1 = _mm_madd_epi16(b1, b1);
                __m128i nb2 = _mm_madd_epi16(b2, b2);
                pa += 16;
                pb += 16;
                /* Accumulate */
                acc_ps1 = _mm_add_epi32(acc_ps1, ps1);
                acc_ps2 = _mm_add_epi32(acc_ps2, ps2);
                acc_na1 = _mm_add_epi32(acc_na1, na1);
                acc_na2 = _mm_add_epi32(acc_na2, na2);
                acc_nb1 = _mm_add_epi32(acc_nb1, nb1);
                acc_nb2 = _mm_add_epi32(acc_nb2, nb2);
            }
        }
        else {
            /* Unaligned variant: same computation with unaligned loads */
            for (k = 0; k < q; k++) {
                __m128i a1 = _mm_loadu_si128((__m128i*)pa);
                __m128i b1 = _mm_loadu_si128((__m128i*)pb);
                __m128i a2 = _mm_loadu_si128((__m128i*)(pa + 8));
                __m128i b2 = _mm_loadu_si128((__m128i*)(pb + 8));
                __m128i ps1 = _mm_madd_epi16(a1, b1);
                __m128i ps2 = _mm_madd_epi16(a2, b2);
                __m128i na1 = _mm_madd_epi16(a1, a1);
                __m128i na2 = _mm_madd_epi16(a2, a2);
                __m128i nb1 = _mm_madd_epi16(b1, b1);
                __m128i nb2 = _mm_madd_epi16(b2, b2);
                pa += 16;
                pb += 16;
                acc_ps1 = _mm_add_epi32(acc_ps1, ps1);
                acc_ps2 = _mm_add_epi32(acc_ps2, ps2);
                acc_na1 = _mm_add_epi32(acc_na1, na1);
                acc_na2 = _mm_add_epi32(acc_na2, na2);
                acc_nb1 = _mm_add_epi32(acc_nb1, nb1);
                acc_nb2 = _mm_add_epi32(acc_nb2, nb2);
            }
        }
        /* Final sums */
        acc = _mm_add_epi32(acc_ps1, acc_ps2);
        acc = _mm_hadd_epi32(acc, acc);
        acc = _mm_hadd_epi32(acc, acc);
        ps = _mm_extract_epi32(acc, 0);

        acc = _mm_add_epi32(acc_na1, acc_na2);
        acc = _mm_hadd_epi32(acc, acc);
        acc = _mm_hadd_epi32(acc, acc);
        na = _mm_extract_epi32(acc, 0);

        acc = _mm_add_epi32(acc_nb1, acc_nb2);
        acc = _mm_hadd_epi32(acc, acc);
        acc = _mm_hadd_epi32(acc, acc);
        nb = _mm_extract_epi32(acc, 0);
    }
    else {
        ps = 0;
        na = 0;
        nb = 0;
    }
    /* Remainder */
    for (k = 0; k < r; k++) {
        int a = *pa++;
        int b = *pb++;
        ps += a * b;
        na += a * a;
        nb += b * b;
    }
    norm = sqrt(((double)na) * ((double)nb));
    if (norm < 1E-5f)
        return 0;
    return ps / norm;
}
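For reference, the quantity computed above is the cosine similarity <a,b> / (||a|| * ||b||). A scalar sketch follows; it deliberately uses 64-bit accumulators (unlike the int accumulators above) to sidestep overflow for long vectors, and the name is illustrative:

#include <math.h>
#include <stddef.h>

/* Illustrative scalar reference for vector_cos_short(). */
static float vector_cos_short_scalar(const short *pa, const short *pb, size_t n)
{
  long long ps = 0, na = 0, nb = 0;
  for (size_t k = 0; k < n; k++) {
    const long long a = pa[k];
    const long long b = pb[k];
    ps += a * b;
    na += a * a;
    nb += b * b;
  }
  const double norm = sqrt((double)na * (double)nb);
  return (norm < 1E-5) ? 0.0f : (float)(ps / norm);
}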
void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order); if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; end += default_partition_samples; for( ; (int)residual_sample < (int)end-7; residual_sample+=8) { __m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample))); sum256 = _mm256_add_epi32(sum256, res256); } sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256)); for( ; (int)residual_sample < (int)end-3; residual_sample+=4) { __m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample))); sum128 = _mm_add_epi32(sum128, res128); } for( ; residual_sample < end; residual_sample++) { __m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); sum128 = _mm_add_epi32(sum128, res128); } sum128 = _mm_hadd_epi32(sum128, sum128); sum128 = _mm_hadd_epi32(sum128, sum128); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128); /* workaround for a bug in MSVC2015U2 - see https://connect.microsoft.com/VisualStudio/feedback/details/2659191/incorrect-code-generation-for-x86-64 */ #if (defined _MSC_VER) && (_MSC_FULL_VER == 190023918) && (defined FLAC__CPU_X86_64) abs_residual_partition_sums[partition] &= 0xFFFFFFFF; /**/ #endif } } else { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; end += default_partition_samples; for( ; (int)residual_sample < (int)end-3; residual_sample+=4) { __m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample))); __m256i res256 = _mm256_cvtepu32_epi64(res128); sum256 = _mm256_add_epi64(sum256, res256); } sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256)); for( ; (int)residual_sample < (int)end-1; residual_sample+=2) { __m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); res128 = _mm_cvtepu32_epi64(res128); sum128 = _mm_add_epi64(sum128, res128); } for( ; residual_sample < end; residual_sample++) { __m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); sum128 = _mm_add_epi64(sum128, res128); } sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8)); _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128); } } } /* now merge partitions for lower orders */ { uint32_t from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { uint32_t i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + 
abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } _mm256_zeroupper(); }