int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, // must be a multiple of 8
                    uint8_t output_shift)
{
  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128, *y128, mmtmp1, mmtmp2, mmtmp3, mmcumul, mmcumul_re, mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n = 0; n < (N >> 2); n++) {
    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    // print_shorts("x",&x128[0]);
    // print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0], y128[0]);
    // print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1, output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re, mmtmp1);
    // print_ints("re",&mmcumul_re);

    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0], _MM_SHUFFLE(2, 3, 0, 1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2, _MM_SHUFFLE(2, 3, 0, 1));
    // print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2, minus_i);
    // print_shorts("y",&mmtmp2);
    mmtmp3 = _mm_madd_epi16(x128[0], mmtmp2);
    // print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3, output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im, mmtmp3);
    // print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re, mmcumul_im);
  // print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul, mmcumul);
  // print_ints("cumul2",&mmcumul);
  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);

  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  // print_ints("mmtmp7",&mmtmp7);

  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7, mmtmp7);
  // print_shorts("mmtmp7",&mmtmp7);

  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128 = (int16x4_t*)x;
  int16x4_t *y_128 = (int16x4_t*)y;
  int32x4_t tmp_re, tmp_im;
  int32x4_t tmp_re1, tmp_im1;
  int32x4_t re_cumul, im_cumul;
  int32x2_t re_cumul2, im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift);
  int32x2x2_t result2;
  int16_t conjug[4] __attribute__((aligned(16))) = {-1, 1, -1, 1};

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0);

  for (n = 0; n < (N >> 2); n++) {
    tmp_re = vmull_s16(*x_128++, *y_128++);
    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])]
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])]
    tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re), vget_high_s32(tmp_re)),
                          vpadd_s32(vget_low_s32(tmp_re1), vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])]

    tmp_im = vmull_s16(vrev32_s16(vmul_s16(*x_128++, *(int16x4_t*)conjug)), *y_128++);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++, *(int16x4_t*)conjug)), *y_128++);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im), vget_high_s32(tmp_im)),
                          vpadd_s32(vget_low_s32(tmp_im1), vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    re_cumul = vqaddq_s32(re_cumul, vqshlq_s32(tmp_re, shift));
    im_cumul = vqaddq_s32(im_cumul, vqshlq_s32(tmp_im, shift));
  }

  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul), vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul), vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2, re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2, im_cumul2);
  result2 = vzip_s32(re_cumul2, im_cumul2);

  return(vget_lane_s32(result2.val[0], 0));
#endif
}
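/* Illustrative scalar reference for the routine above (my sketch, not part of the original
 * source). It assumes x and y hold interleaved 16-bit complex samples (re, im) and that N
 * counts complex samples; the packed result carries the saturated real sum in the low 16 bits
 * and the saturated imaginary sum in the high 16 bits, matching the SSE path's final pack. */
static int32_t dot_product_scalar_ref(const int16_t *x, const int16_t *y, uint32_t N, uint8_t output_shift)
{
  int32_t re = 0, im = 0;
  uint32_t k;
  for (k = 0; k < N; k++) {
    int32_t xr = x[2 * k], xi = x[2 * k + 1];
    int32_t yr = y[2 * k], yi = y[2 * k + 1];
    re += (xr * yr + xi * yi) >> output_shift; /* Re(conj(x[k]) * y[k]) */
    im += (xr * yi - xi * yr) >> output_shift; /* Im(conj(x[k]) * y[k]) */
  }
  if (re > 32767) re = 32767; else if (re < -32768) re = -32768; /* mimic 16-bit saturating pack */
  if (im > 32767) im = 32767; else if (im < -32768) im = -32768;
  return (int32_t)(((uint32_t)(uint16_t)im << 16) | (uint16_t)re);
}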
OD_SIMD_INLINE od_m256i od_mm256_add_epi32(od_m256i a, od_m256i b) {
  od_m256i r;
  r.lo = _mm_add_epi32(a.lo, b.lo);
  r.hi = _mm_add_epi32(a.hi, b.hi);
  return r;
}
inline static void comp_func_helper_sse2(uint32_t *dest, int length, uint32_t color, uint32_t alpha)
{
    const __m128i v_color = _mm_set1_epi32(color);
    const __m128i v_a = _mm_set1_epi16(alpha);

    LOOP_ALIGNED_U1_A4(dest, length,
        { /* UOP */
            *dest = color + DRAW_BYTE_MUL(*dest, alpha);
            dest++;
            length--;
        },
        { /* A4OP */
            __m128i v_dest = _mm_load_si128((__m128i *)dest);
            v_dest = v4_byte_mul_sse2(v_dest, v_a);
            v_dest = _mm_add_epi32(v_dest, v_color);
            _mm_store_si128((__m128i *)dest, v_dest);
            dest += 4;
            length -= 4;
        })
}

void comp_func_solid_source_sse2(uint32_t *dest, int length, uint32_t color, uint32_t const_alpha)
{
    if (const_alpha == 255) {
        draw_memset32(dest, color, length);
    } else {
        /* assumed completion (the source is cut off here): premultiply the color by
         * const_alpha and blend via the helper above with the inverse alpha */
        const uint32_t ialpha = 255 - const_alpha;
        color = DRAW_BYTE_MUL(color, const_alpha);
        comp_func_helper_sse2(dest, length, color, ialpha);
    }
}
static inline void xor_salsa8_sse2(__m128i B[4], const __m128i Bx[4]) { __m128i X0, X1, X2, X3; __m128i T; int i; X0 = B[0] = _mm_xor_si128(B[0], Bx[0]); X1 = B[1] = _mm_xor_si128(B[1], Bx[1]); X2 = B[2] = _mm_xor_si128(B[2], Bx[2]); X3 = B[3] = _mm_xor_si128(B[3], Bx[3]); for (i = 0; i < 8; i += 2) { /* Operate on "columns". */ T = _mm_add_epi32(X0, X3); X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7)); X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25)); T = _mm_add_epi32(X1, X0); X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); T = _mm_add_epi32(X2, X1); X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13)); X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19)); T = _mm_add_epi32(X3, X2); X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); /* Rearrange data. */ X1 = _mm_shuffle_epi32(X1, 0x93); X2 = _mm_shuffle_epi32(X2, 0x4E); X3 = _mm_shuffle_epi32(X3, 0x39); /* Operate on "rows". */ T = _mm_add_epi32(X0, X1); X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7)); X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25)); T = _mm_add_epi32(X3, X0); X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); T = _mm_add_epi32(X2, X3); X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13)); X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19)); T = _mm_add_epi32(X1, X2); X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); /* Rearrange data. */ X1 = _mm_shuffle_epi32(X1, 0x39); X2 = _mm_shuffle_epi32(X2, 0x4E); X3 = _mm_shuffle_epi32(X3, 0x93); } B[0] = _mm_add_epi32(B[0], X0); B[1] = _mm_add_epi32(B[1], X1); B[2] = _mm_add_epi32(B[2], X2); B[3] = _mm_add_epi32(B[3], X3); }
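/* Note (not from the original source): each pair of _mm_slli_epi32 / _mm_srli_epi32 results
 * XORed into the same register implements a 32-bit rotate left, e.g. the shifts by 7 and 25
 * compute rotl32(T, 7) per lane. SSE2 has no vector rotate instruction, hence the two-shift
 * idiom used throughout this Salsa20/8 round function. */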
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output, const int8_t *cos_bit, const int8_t *stage_range) { const int txfm_size = 4; const int num_per_128 = 4; const int32_t *cospi; __m128i buf0[4]; __m128i buf1[4]; int col_num = txfm_size / num_per_128; int bit; int col; (void)stage_range; for (col = 0; col < col_num; col++) { // stage 0; int32_t stage_idx = 0; buf0[0] = input[0 * col_num + col]; buf0[1] = input[1 * col_num + col]; buf0[2] = input[2 * col_num + col]; buf0[3] = input[3 * col_num + col]; // stage 1 stage_idx++; buf1[0] = buf0[3]; buf1[1] = buf0[0]; buf1[2] = buf0[1]; buf1[3] = buf0[2]; // stage 2 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], bit); btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3], bit); // stage 3 stage_idx++; buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); // stage 4 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3], bit); // stage 5 stage_idx++; buf1[0] = buf0[0]; buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); buf1[2] = buf0[3]; buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); output[0 * col_num + col] = buf1[0]; output[1 * col_num + col] = buf1[1]; output[2 * col_num + col] = buf1[2]; output[3 * col_num + col] = buf1[3]; } }
void ENC_MSG_x8(const unsigned char *PT, unsigned char *CT, const unsigned char *TAG, const unsigned char *KS, int length) { __m128i or_mask, TWO,ctr_block, tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ONE; int i,j,remainder_loc; if (length%16) length = length/16 + 1; else length/=16; ONE = _mm_setr_epi32(1,0,0,0); TWO = _mm_setr_epi32(2,0,0,0); ctr_block = _mm_setzero_si128(); ctr_block = _mm_loadu_si128(((__m128i*)TAG)); or_mask = _mm_setr_epi32(0,0,0,0x80000000); ctr_block = _mm_or_si128(ctr_block, or_mask); for (i=0; i< (length-length%8); i=i+8) { tmp = ctr_block; tmp1 = _mm_add_epi32(ctr_block, ONE); tmp2 = _mm_add_epi32(ctr_block, TWO); tmp3 = _mm_add_epi32(tmp2, ONE); tmp4 = _mm_add_epi32(tmp2, TWO); tmp5 = _mm_add_epi32(tmp4, ONE); tmp6 = _mm_add_epi32(tmp4, TWO); tmp7 = _mm_add_epi32(tmp6, ONE); ctr_block = _mm_add_epi32(tmp6, TWO); tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]); tmp1 = _mm_xor_si128(tmp1, ((__m128i*)KS)[0]); tmp2 = _mm_xor_si128(tmp2, ((__m128i*)KS)[0]); tmp3 = _mm_xor_si128(tmp3, ((__m128i*)KS)[0]); tmp4 = _mm_xor_si128(tmp4, ((__m128i*)KS)[0]); tmp5 = _mm_xor_si128(tmp5, ((__m128i*)KS)[0]); tmp6 = _mm_xor_si128(tmp6, ((__m128i*)KS)[0]); tmp7 = _mm_xor_si128(tmp7, ((__m128i*)KS)[0]); for(j=1; j <10; j++) { tmp = _mm_aesenc_si128 (tmp, ((__m128i*)KS)[j]); tmp1 = _mm_aesenc_si128 (tmp1, ((__m128i*)KS)[j]); tmp2 = _mm_aesenc_si128 (tmp2, ((__m128i*)KS)[j]); tmp3 = _mm_aesenc_si128 (tmp3, ((__m128i*)KS)[j]); tmp4 = _mm_aesenc_si128 (tmp4, ((__m128i*)KS)[j]); tmp5 = _mm_aesenc_si128 (tmp5, ((__m128i*)KS)[j]); tmp6 = _mm_aesenc_si128 (tmp6, ((__m128i*)KS)[j]); tmp7 = _mm_aesenc_si128 (tmp7, ((__m128i*)KS)[j]); }; tmp = _mm_aesenclast_si128 (tmp, ((__m128i*)KS)[j]); tmp1 = _mm_aesenclast_si128 (tmp1, ((__m128i*)KS)[j]); tmp2 = _mm_aesenclast_si128 (tmp2, ((__m128i*)KS)[j]); tmp3 = _mm_aesenclast_si128 (tmp3, ((__m128i*)KS)[j]); tmp4 = _mm_aesenclast_si128 (tmp4, ((__m128i*)KS)[j]); tmp5 = _mm_aesenclast_si128 (tmp5, ((__m128i*)KS)[j]); tmp6 = _mm_aesenclast_si128 (tmp6, ((__m128i*)KS)[j]); tmp7 = _mm_aesenclast_si128 (tmp7, ((__m128i*)KS)[j]); tmp = _mm_xor_si128(tmp,_mm_loadu_si128(&((__m128i*)PT)[i])); tmp1 = _mm_xor_si128(tmp1,_mm_loadu_si128(&((__m128i*)PT)[i+1])); tmp2 = _mm_xor_si128(tmp2,_mm_loadu_si128(&((__m128i*)PT)[i+2])); tmp3 = _mm_xor_si128(tmp3,_mm_loadu_si128(&((__m128i*)PT)[i+3])); tmp4 = _mm_xor_si128(tmp4,_mm_loadu_si128(&((__m128i*)PT)[i+4])); tmp5 = _mm_xor_si128(tmp5,_mm_loadu_si128(&((__m128i*)PT)[i+5])); tmp6 = _mm_xor_si128(tmp6,_mm_loadu_si128(&((__m128i*)PT)[i+6])); tmp7 = _mm_xor_si128(tmp7,_mm_loadu_si128(&((__m128i*)PT)[i+7])); _mm_storeu_si128(&((__m128i*)CT)[i],tmp); _mm_storeu_si128(&((__m128i*)CT)[i+1],tmp1); _mm_storeu_si128(&((__m128i*)CT)[i+2],tmp2); _mm_storeu_si128(&((__m128i*)CT)[i+3],tmp3); _mm_storeu_si128(&((__m128i*)CT)[i+4],tmp4); _mm_storeu_si128(&((__m128i*)CT)[i+5],tmp5); _mm_storeu_si128(&((__m128i*)CT)[i+6],tmp6); _mm_storeu_si128(&((__m128i*)CT)[i+7],tmp7); } // handling remainder and less than 8 blocks if (length%8==0) return; // The remainder_loc is used to remember the location of our block handled remainder_loc = length-length%8; for(i=0; i < (length%8); i++) { tmp = ctr_block; ctr_block = _mm_add_epi32(ctr_block, ONE); tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]); for(j=1; j <10; j++) { tmp = _mm_aesenc_si128 (tmp, ((__m128i*)KS)[j]); }; tmp = _mm_aesenclast_si128 (tmp, ((__m128i*)KS)[j]); tmp = _mm_xor_si128(tmp,_mm_loadu_si128(&((__m128i*)PT)[remainder_loc+i])); _mm_storeu_si128 
(&((__m128i*)CT)[remainder_loc+i],tmp); } }
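/* Context note (my reading, not stated in the source): this is CTR-mode encryption pipelined
 * eight blocks deep. The initial counter is the TAG with the top bit of its last 32-bit word
 * forced to 1 by or_mask, the counter is incremented in its low 32-bit lane per block, and KS
 * is assumed to hold the 11 expanded AES-128 round keys, so effectively
 *     CT[i] = AES(KS, counter + i) XOR PT[i];
 * with the tail loop handling any final group of fewer than eight blocks. */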
static void transform(hashState *state,int r) { __m128i x0; __m128i x1; __m128i x2; __m128i x3; __m128i x4; __m128i x5; __m128i x6; __m128i x7; __m128i y0; __m128i y1; __m128i y2; __m128i y3; x0 = state->x[0]; x1 = state->x[1]; x2 = state->x[2]; x3 = state->x[3]; x4 = state->x[4]; x5 = state->x[5]; x6 = state->x[6]; x7 = state->x[7]; for (;r > 0;--r) { x4 = _mm_add_epi32(x0,x4); x5 = _mm_add_epi32(x1,x5); x6 = _mm_add_epi32(x2,x6); x7 = _mm_add_epi32(x3,x7); y0 = x2; y1 = x3; y2 = x0; y3 = x1; x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25)); x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25)); x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25)); x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25)); x0 = _mm_xor_si128(x0,x4); x1 = _mm_xor_si128(x1,x5); x2 = _mm_xor_si128(x2,x6); x3 = _mm_xor_si128(x3,x7); x4 = _mm_shuffle_epi32(x4,0x4e); x5 = _mm_shuffle_epi32(x5,0x4e); x6 = _mm_shuffle_epi32(x6,0x4e); x7 = _mm_shuffle_epi32(x7,0x4e); x4 = _mm_add_epi32(x0,x4); x5 = _mm_add_epi32(x1,x5); x6 = _mm_add_epi32(x2,x6); x7 = _mm_add_epi32(x3,x7); y0 = x1; y1 = x0; y2 = x3; y3 = x2; x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21)); x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21)); x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21)); x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21)); x0 = _mm_xor_si128(x0,x4); x1 = _mm_xor_si128(x1,x5); x2 = _mm_xor_si128(x2,x6); x3 = _mm_xor_si128(x3,x7); x4 = _mm_shuffle_epi32(x4,0xb1); x5 = _mm_shuffle_epi32(x5,0xb1); x6 = _mm_shuffle_epi32(x6,0xb1); x7 = _mm_shuffle_epi32(x7,0xb1); } state->x[0] = x0; state->x[1] = x1; state->x[2] = x2; state->x[3] = x3; state->x[4] = x4; state->x[5] = x5; state->x[6] = x6; state->x[7] = x7; }
void spu_interpreter::BG(SPUThread& CPU, spu_opcode_t op)
{
  CPU.GPR[op.rt].vi = _mm_add_epi32(sse_cmpgt_epu32(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi), _mm_set1_epi32(1));
}
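/* Hedged scalar model (not part of the original source): sse_cmpgt_epu32 is assumed to return
 * all-ones in each 32-bit lane where ra > rb as unsigned values, so adding 1 yields the SPU
 * Borrow Generate result: 0 where computing rb - ra would borrow, 1 otherwise. Per lane: */
static inline uint32_t bg_lane_sketch(uint32_t a, uint32_t b) { return (a > b) ? 0u : 1u; }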
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments ) { typedef uchar T; typedef int WT; typedef int MT; Size size = img.size(); int y; MT mom[10] = {0,0,0,0,0,0,0,0,0,0}; bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); for( y = 0; y < size.height; y++ ) { const T* ptr = img.ptr<T>(y); int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0; if( useSIMD ) { __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); __m128i dx = _mm_set1_epi16(8); __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init; for( ; x <= size.width - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); __m128i px = _mm_mullo_epi16(p, qx); __m128i sx = _mm_mullo_epi16(qx, qx); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx)); qx = _mm_add_epi16(qx, dx); } int CV_DECL_ALIGNED(16) buf[4]; _mm_store_si128((__m128i*)buf, qx0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx1); x1 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx2); x2 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx3); x3 = buf[0] + buf[1] + buf[2] + buf[3]; } for( ; x < size.width; x++ ) { WT p = ptr[x]; WT xp = x * p, xxp; x0 += p; x1 += xp; xxp = xp * x; x2 += xxp; x3 += xxp * x; } WT py = y * x0, sy = y*y; mom[9] += ((MT)py) * sy; // m03 mom[8] += ((MT)x1) * sy; // m12 mom[7] += ((MT)x2) * y; // m21 mom[6] += x3; // m30 mom[5] += x0 * sy; // m02 mom[4] += x1 * y; // m11 mom[3] += x2; // m20 mom[2] += py; // m01 mom[1] += x1; // m10 mom[0] += x0; // m00 } for(int x = 0; x < 10; x++ ) moments[x] = (double)mom[x]; }
void hashcat_md4_64 (__m128i digests[4], __m128i W[16]) { __m128i a = digests[0]; __m128i b = digests[1]; __m128i c = digests[2]; __m128i d = digests[3]; __m128i tmp2; MD4_STEP (MD4_F , a, b, c, d, W[ 0], MD4C00, MD4S00); MD4_STEP (MD4_F , d, a, b, c, W[ 1], MD4C00, MD4S01); MD4_STEP (MD4_F , c, d, a, b, W[ 2], MD4C00, MD4S02); MD4_STEP (MD4_F , b, c, d, a, W[ 3], MD4C00, MD4S03); MD4_STEP (MD4_F , a, b, c, d, W[ 4], MD4C00, MD4S00); MD4_STEP (MD4_F , d, a, b, c, W[ 5], MD4C00, MD4S01); MD4_STEP (MD4_F , c, d, a, b, W[ 6], MD4C00, MD4S02); MD4_STEP (MD4_F , b, c, d, a, W[ 7], MD4C00, MD4S03); MD4_STEP (MD4_F , a, b, c, d, W[ 8], MD4C00, MD4S00); MD4_STEP (MD4_F , d, a, b, c, W[ 9], MD4C00, MD4S01); MD4_STEP (MD4_F , c, d, a, b, W[10], MD4C00, MD4S02); MD4_STEP (MD4_F , b, c, d, a, W[11], MD4C00, MD4S03); MD4_STEP (MD4_F , a, b, c, d, W[12], MD4C00, MD4S00); MD4_STEP (MD4_F , d, a, b, c, W[13], MD4C00, MD4S01); MD4_STEP (MD4_F , c, d, a, b, W[14], MD4C00, MD4S02); MD4_STEP (MD4_F , b, c, d, a, W[15], MD4C00, MD4S03); MD4_STEP (MD4_G , a, b, c, d, W[ 0], MD4C01, MD4S10); MD4_STEP (MD4_G , d, a, b, c, W[ 4], MD4C01, MD4S11); MD4_STEP (MD4_G , c, d, a, b, W[ 8], MD4C01, MD4S12); MD4_STEP (MD4_G , b, c, d, a, W[12], MD4C01, MD4S13); MD4_STEP (MD4_G , a, b, c, d, W[ 1], MD4C01, MD4S10); MD4_STEP (MD4_G , d, a, b, c, W[ 5], MD4C01, MD4S11); MD4_STEP (MD4_G , c, d, a, b, W[ 9], MD4C01, MD4S12); MD4_STEP (MD4_G , b, c, d, a, W[13], MD4C01, MD4S13); MD4_STEP (MD4_G , a, b, c, d, W[ 2], MD4C01, MD4S10); MD4_STEP (MD4_G , d, a, b, c, W[ 6], MD4C01, MD4S11); MD4_STEP (MD4_G , c, d, a, b, W[10], MD4C01, MD4S12); MD4_STEP (MD4_G , b, c, d, a, W[14], MD4C01, MD4S13); MD4_STEP (MD4_G , a, b, c, d, W[ 3], MD4C01, MD4S10); MD4_STEP (MD4_G , d, a, b, c, W[ 7], MD4C01, MD4S11); MD4_STEP (MD4_G , c, d, a, b, W[11], MD4C01, MD4S12); MD4_STEP (MD4_G , b, c, d, a, W[15], MD4C01, MD4S13); MD4_STEP (MD4_H1, a, b, c, d, W[ 0], MD4C02, MD4S20); MD4_STEP (MD4_H2, d, a, b, c, W[ 8], MD4C02, MD4S21); MD4_STEP (MD4_H1, c, d, a, b, W[ 4], MD4C02, MD4S22); MD4_STEP (MD4_H2, b, c, d, a, W[12], MD4C02, MD4S23); MD4_STEP (MD4_H1, a, b, c, d, W[ 2], MD4C02, MD4S20); MD4_STEP (MD4_H2, d, a, b, c, W[10], MD4C02, MD4S21); MD4_STEP (MD4_H1, c, d, a, b, W[ 6], MD4C02, MD4S22); MD4_STEP (MD4_H2, b, c, d, a, W[14], MD4C02, MD4S23); MD4_STEP (MD4_H1, a, b, c, d, W[ 1], MD4C02, MD4S20); MD4_STEP (MD4_H2, d, a, b, c, W[ 9], MD4C02, MD4S21); MD4_STEP (MD4_H1, c, d, a, b, W[ 5], MD4C02, MD4S22); MD4_STEP (MD4_H2, b, c, d, a, W[13], MD4C02, MD4S23); MD4_STEP (MD4_H1, a, b, c, d, W[ 3], MD4C02, MD4S20); MD4_STEP (MD4_H2, d, a, b, c, W[11], MD4C02, MD4S21); MD4_STEP (MD4_H1, c, d, a, b, W[ 7], MD4C02, MD4S22); MD4_STEP (MD4_H2, b, c, d, a, W[15], MD4C02, MD4S23); digests[0] = _mm_add_epi32 (a, digests[0]); digests[1] = _mm_add_epi32 (b, digests[1]); digests[2] = _mm_add_epi32 (c, digests[2]); digests[3] = _mm_add_epi32 (d, digests[3]); }
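/* Context note (assumption, not from the source): each 32-bit lane of the __m128i state words
 * carries an independent MD4 digest, so this routine transforms four message blocks in
 * parallel. MD4_STEP is assumed to expand to the usual per-lane MD4 update, roughly
 *     a = rotl32(a + f(b, c, d) + W + C, S);
 * implemented with _mm_add_epi32 plus the shift/or rotate idiom. */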
static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
  return rgba2;
}
static WEBP_INLINE __m128i LoadUVPart(int u, int v) {
  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
  return uv_part;
}
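/* Sketch of the fixed-point model behind the two helpers above (assuming the VP8kYtoRGBA,
 * VP8kUtoRGBA and VP8kVtoRGBA tables hold per-channel contributions scaled by 1 << YUV_FIX2):
 *     rgba_channel = (y_contrib[y] + u_contrib[u] + v_contrib[v]) >> YUV_FIX2;
 * LoadUVPart sums the two chroma contributions once so GetRGBA32bWithUV can reuse them for
 * every luma sample that shares the same U/V pair. */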
int global_sse2_word(int queryLength, unsigned short *profile, const unsigned char *dbSeq, int dbLength, unsigned short gapOpen, unsigned short gapExtend, unsigned short ceiling, struct f_struct *f_str) { int i, j; int score; int scale; int temp; int distance; int offset; int position; int cmp; int iter; __m128i *pvH; __m128i *pvE; __m128i vE, vF, vH; __m128i vHNext; __m128i vFPrev; __m128i vGapOpen; __m128i vGapExtend; __m128i vCeiling; __m128i vScale; __m128i vScaleAmt; __m128i vScaleTmp; __m128i vTemp; __m128i vNull; __m128i *pvScore; scale = 0; iter = (queryLength + 7) / 8; offset = (queryLength - 1) % iter; position = 7 - (queryLength - 1) / iter; pvH = (__m128i *)f_str->workspace; pvE = pvH + iter; /* Load gap opening penalty to all elements of a constant */ vGapOpen = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0); vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0); vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0); /* Load gap extension penalty to all elements of a constant */ vGapExtend = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0); vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0); vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0); /* Generate the ceiling before scaling */ vTemp = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vTemp = _mm_insert_epi16 (vTemp, ceiling, 0); vTemp = _mm_shufflelo_epi16 (vTemp, 0); vTemp = _mm_shuffle_epi32 (vTemp, 0); vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp); vCeiling = _mm_srli_epi16 (vCeiling, 1); vCeiling = _mm_subs_epi16 (vCeiling, vTemp); vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen); vNull = _mm_cmpeq_epi16 (vTemp, vTemp); vNull = _mm_slli_epi16 (vNull, 15); vScaleAmt = _mm_xor_si128 (vNull, vNull); /* Zero out the storage vector */ vTemp = _mm_adds_epi16 (vNull, vGapOpen); for (i = 0; i < iter; i++) { _mm_store_si128 (pvH + i, vTemp); _mm_store_si128 (pvE + i, vNull); } /* initialize F */ vF = vNull; vFPrev = vNull; /* load and scale H for the next round */ vTemp = _mm_srli_si128 (vGapOpen, 14); vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_adds_epi16 (vH, vTemp); for (i = 0; i < dbLength; ++i) { /* fetch first data asap. 
*/ pvScore = (__m128i *) profile + dbSeq[i] * iter; vF = vNull; vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < iter; j++) { /* correct H from the previous columns F */ vHNext = _mm_load_si128 (pvH + j); vHNext = _mm_max_epi16 (vHNext, vFPrev); /* load and correct E value */ vE = _mm_load_si128 (pvE + j); vTemp = _mm_subs_epi16 (vHNext, vGapOpen); vE = _mm_max_epi16 (vE, vTemp); _mm_store_si128 (pvE + j, vE); /* add score to vH */ vH = _mm_adds_epi16 (vH, *pvScore++); /* get max from vH, vE and vF */ vH = _mm_max_epi16 (vH, vE); vH = _mm_max_epi16 (vH, vF); _mm_store_si128 (pvH + j, vH); /* update vF value */ vH = _mm_subs_epi16 (vH, vGapOpen); vF = _mm_max_epi16 (vF, vH); /* load the next h values */ vH = vHNext; } /* check if we need to scale before the next round */ vTemp = _mm_cmpgt_epi16 (vF, vCeiling); cmp = _mm_movemask_epi8 (vTemp); /* broadcast F values */ vF = _mm_xor_si128 (vF, vNull); vTemp = _mm_slli_si128 (vF, 2); vTemp = _mm_subs_epu16 (vTemp, vScaleAmt); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vF, 4); vScaleTmp = _mm_slli_si128 (vScaleAmt, 2); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vScaleTmp, 4); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp); vTemp = _mm_slli_si128 (vF, 8); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); /* scale if necessary */ if (cmp != 0x0000) { __m128i vScale1; __m128i vScale2; vScale = _mm_slli_si128 (vF, 2); vScale = _mm_subs_epu16 (vScale, vGapOpen); vScale = _mm_subs_epu16 (vScale, vScaleAmt); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vScale, vTemp); vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vTemp, vScale); vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp); /* rescale the previous F */ vF = _mm_subs_epu16 (vF, vScale); /* check if we can continue in signed 16-bits */ vTemp = _mm_xor_si128 (vF, vNull); vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling); cmp = _mm_movemask_epi8 (vTemp); if (cmp != 0x0000) { return OVERFLOW_SCORE; } vTemp = _mm_adds_epi16 (vCeiling, vCeiling); vScale1 = _mm_subs_epu16 (vScale, vTemp); vScale2 = _mm_subs_epu16 (vScale, vScale1); /* scale all the vectors */ for (j = 0; j < iter; j++) { /* load H and E */ vH = _mm_load_si128 (pvH + j); vE = _mm_load_si128 (pvE + j); /* get max from vH, vE and vF */ vH = _mm_subs_epi16 (vH, vScale1); vH = _mm_subs_epi16 (vH, vScale2); vE = _mm_subs_epi16 (vE, vScale1); vE = _mm_subs_epi16 (vE, vScale2); /* save the H and E */ _mm_store_si128 (pvH + j, vH); _mm_store_si128 (pvE + j, vE); } vScale = vScaleAmt; for (j = 0; j < position; ++j) { vScale = _mm_slli_si128 (vScale, 2); } /* calculate the final scaling amount */ vTemp = _mm_xor_si128 (vTemp, vTemp); vScale1 = _mm_unpacklo_epi16 (vScale, vTemp); vScale2 = _mm_unpackhi_epi16 (vScale, vTemp); vScale = _mm_add_epi32 (vScale1, vScale2); vTemp = _mm_srli_si128 (vScale, 8); vScale = _mm_add_epi32 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 4); vScale = _mm_add_epi32 (vScale, vTemp); scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0); temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1); scale = scale + (temp << 16); } /* scale the F value for the next round */ vFPrev = _mm_slli_si128 (vF, 2); vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt); vFPrev = _mm_xor_si128 (vFPrev, vNull); /* load and scale H for the next round */ vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_xor_si128 (vH, vNull); vH = _mm_slli_si128 
(vH, 2); vH = _mm_subs_epu16 (vH, vScaleAmt); vH = _mm_insert_epi16 (vH, gapOpen, 0); vH = _mm_xor_si128 (vH, vNull); } vH = _mm_load_si128 (pvH + offset); vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < position; ++j) { vH = _mm_slli_si128 (vH, 2); } score = (int) (signed short) _mm_extract_epi16 (vH, 7); score = score + SHORT_BIAS; /* return largest score */ distance = (queryLength + dbLength) * gapExtend; score = score - (gapOpen * 2) - distance + scale; return score; }
OD_SIMD_INLINE __m128i od_dct_mul_epi32(__m128i val, int32_t scale, int32_t offset, int32_t shift) { return _mm_srai_epi32(_mm_add_epi32(OD_MULLO_EPI32(val, scale), _mm_set1_epi32(offset)), shift); }
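/* Illustrative scalar equivalent of od_dct_mul_epi32 per 32-bit lane (OD_MULLO_EPI32 is
 * assumed to be a low-32-bit multiply wrapper, and offset is typically 1 << (shift - 1) so
 * the arithmetic shift rounds to nearest):
 *     out = (int32_t)(val * scale + offset) >> shift;
 */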
OD_SIMD_INLINE __m128i od_unbiased_rshift_epi32(__m128i a, int b) { return _mm_srai_epi32(_mm_add_epi32(_mm_srli_epi32(a, 32 - b), a), b); }
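/* Illustrative scalar reading of od_unbiased_rshift_epi32 (my sketch): adding the top b bits
 * of a (logical shift) before the arithmetic shift cancels the bias of ">>" toward minus
 * infinity, so negative lanes divide toward zero like ordinary integer division:
 *     out = (a + (int32_t)((uint32_t)a >> (32 - b))) >> b;   // ~ a / (1 << b)
 */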
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; __m128i packed_out; // Load all inputs. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // coeff = abs(in) __m128i coeff0 = _mm_abs_epi16(in0); __m128i coeff8 = _mm_abs_epi16(in8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // put sign back out0 = _mm_sign_epi16(out0, in0); out8 = _mm_sign_epi16(out8, in8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. The re-ordering is: // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15 // There's only two misplaced entries ([8] and [7]) that are crossing the // reg's boundaries. // We use pshufb instead of pshuflo/pshufhi. 
{ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6); const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1); const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo); const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7); const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1); const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi); const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8); const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7); _mm_storeu_si128((__m128i*)&out[0], out_z0); _mm_storeu_si128((__m128i*)&out[8], out_z8); packed_out = _mm_packs_epi16(out_z0, out_z8); } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
// Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. // w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load and combine inputs. { const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); // In SSE4.1, with gcc 4.8 at least (maybe other versions), // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump // of inA and inB, _mm_loadl_epi64 is still used not to have an out of // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); tmp_0 = _mm_cvtepu8_epi16(inAB_0); tmp_1 = _mm_cvtepu8_epi16(inAB_1); tmp_2 = _mm_cvtepu8_epi16(inAB_2); tmp_3 = _mm_cvtepu8_epi16(inAB_3); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 } // Vertical pass first to avoid a transpose (vertical and horizontal passes // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } // Horizontal pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. 
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); A_b0 = _mm_abs_epi16(A_b0); A_b2 = _mm_abs_epi16(A_b2); B_b0 = _mm_abs_epi16(B_b0); B_b2 = _mm_abs_epi16(B_b2); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b2); } return sum[0] + sum[1] + sum[2] + sum[3]; }
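/* Note (my summary of the code above): the returned value is the difference of weighted sums
 * of absolute Hadamard-transformed coefficients for the two 4x4 inputs, i.e.
 *     sum_i w[i] * |T(inA)_i|  -  sum_i w[i] * |T(inB)_i|
 * with both transforms carried in the low/high 64-bit halves of the same 128-bit registers. */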
/* Compute reflection coefficients from input signal */ void silk_burg_modified_sse4_1( opus_int32 *res_nrg, /* O Residual energy */ opus_int *res_nrg_Q, /* O Residual energy Q value */ opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ const opus_int16 x[], /* I Input signal, length: nb_subfr * (D + subfr_length) */ const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ const opus_int nb_subfr, /* I Number of subframes stacked in x */ const opus_int D, /* I Order */ int arch /* I Run-time architecture */ ) { opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; const opus_int16 *x_ptr; opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; __m128i CONST1 = _mm_set1_epi32(1); silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE); /* Compute autocorrelations, added over subframes */ silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length); if(rshifts > MAX_RSHIFTS) { C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS); silk_assert(C0 > 0); rshifts = MAX_RSHIFTS; } else { lz = silk_CLZ32(C0) - 1; rshifts_extra = N_BITS_HEAD_ROOM - lz; if(rshifts_extra > 0) { rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts); C0 = silk_RSHIFT32(C0, rshifts_extra); } else { rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts); C0 = silk_LSHIFT32(C0, -rshifts_extra); } rshifts += rshifts_extra; } CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1; /* Q(-rshifts) */ silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32)); if(rshifts > 0) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; for(n = 1; n < D + 1; n++) { C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts); } } } else { for(s = 0; s < nb_subfr; s++) { int i; opus_int32 d; x_ptr = x + s * subfr_length; celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch); for(n = 1; n < D + 1; n++) { for (i = n + subfr_length - D, d = 0; i < subfr_length; i++) d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]); xcorr[ n - 1 ] += d; } for(n = 1; n < D + 1; n++) { C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts); } } } silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32)); /* Initialize */ CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1; /* Q(-rshifts) */ invGain_Q30 = (opus_int32)1 << 30; reached_max_gain = 0; for(n = 0; n < D; n++) { /* Update first row of correlation matrix (without first element) */ /* Update last row of correlation matrix (without last element, stored in reversed order) */ /* Update C * Af */ /* Update C * flipud(Af) (stored in reversed order) */ if(rshifts > -2) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], 16 - rshifts); /* Q(16-rshifts) */ x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts); /* Q(16-rshifts) */ tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], QA - 16); /* Q(QA-16) */ tmp2 = 
silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16); /* Q(QA-16) */ for(k = 0; k < n; k++) { C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q(-rshifts) */ C_last_row[ k ] = silk_SMLAWB(C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */ Atmp_QA = Af_QA[ k ]; tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */ tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]); /* Q(QA-16) */ } tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts); /* Q(16-rshifts) */ tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts); /* Q(16-rshifts) */ for(k = 0; k <= n; k++) { CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q(-rshift) */ CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]); /* Q(-rshift) */ } } } else { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; x1 = -silk_LSHIFT32((opus_int32)x_ptr[ n ], -rshifts); /* Q(-rshifts) */ x2 = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts); /* Q(-rshifts) */ tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ], 17); /* Q17 */ tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17); /* Q17 */ X1_3210 = _mm_set1_epi32(x1); X2_3210 = _mm_set1_epi32(x2); TMP1_3210 = _mm_setzero_si128(); TMP2_3210 = _mm_setzero_si128(); for(k = 0; k < n - 3; k += 4) { PTR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]); SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]); FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]); PTR_3210 = _mm_shuffle_epi32(PTR_3210, _MM_SHUFFLE(0, 1, 2, 3)); LAST_3210 = _mm_loadu_si128((__m128i *)&C_last_row[ k ]); ATMP_3210 = _mm_loadu_si128((__m128i *)&Af_QA[ k ]); T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210); T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210); ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7); ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1); ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1); FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210); LAST_3210 = _mm_add_epi32(LAST_3210, T2_3210); PTR_3210 = _mm_mullo_epi32(ATMP_3210, PTR_3210); SUBFR_3210 = _mm_mullo_epi32(ATMP_3210, SUBFR_3210); _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210); _mm_storeu_si128((__m128i *)&C_last_row[ k ], LAST_3210); TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210); TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210); } TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210)); TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210)); TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E)); TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E)); tmp1 += _mm_cvtsi128_si32(TMP1_3210); tmp2 += _mm_cvtsi128_si32(TMP2_3210); for(; k < n; k++) { C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q(-rshifts) */ C_last_row[ k ] = silk_MLA(C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */ Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17); /* Q17 */ tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ], Atmp1); /* Q17 */ tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1); /* Q17 */ } tmp1 = -tmp1; /* Q17 */ tmp2 = -tmp2; /* Q17 */ { __m128i xmm_tmp1, xmm_tmp2; __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1; __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1; xmm_tmp1 = _mm_set1_epi32(tmp1); xmm_tmp2 = _mm_set1_epi32(tmp2); for(k = 0; k <= n - 3; k += 4) { xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]); xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]); 
xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3)); xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1); xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1); /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/ xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1)); xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1)); xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1); xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1); xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2); xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2); xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16); xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16); xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16); xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16); xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC); xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC); X1_3210 = _mm_loadu_si128((__m128i *)&CAf[ k ]); PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]); X1_3210 = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0); PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0); _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210); _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210); } for(; k <= n; k++) { CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1, silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1)); /* Q(-rshift) */ CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2, silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */ } } } } /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ tmp1 = C_first_row[ n ]; /* Q(-rshifts) */ tmp2 = C_last_row[ n ]; /* Q(-rshifts) */ num = 0; /* Q(-rshifts) */ nrg = silk_ADD32(CAb[ 0 ], CAf[ 0 ]); /* Q(1-rshifts) */ for(k = 0; k < n; k++) { Atmp_QA = Af_QA[ k ]; lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1; lz = silk_min(32 - QA, lz); Atmp1 = silk_LSHIFT32(Atmp_QA, lz); /* Q(QA + lz) */ tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ num = silk_ADD_LSHIFT32(num, silk_SMMUL(CAb[ n - k ], Atmp1), 32 - QA - lz); /* Q(-rshifts) */ nrg = silk_ADD_LSHIFT32(nrg, silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]), Atmp1), 32 - QA - lz); /* Q(1-rshifts) */ } CAf[ n + 1 ] = tmp1; /* Q(-rshifts) */ CAb[ n + 1 ] = tmp2; /* Q(-rshifts) */ num = silk_ADD32(num, tmp2); /* Q(-rshifts) */ num = silk_LSHIFT32(-num, 1); /* Q(1-rshifts) */ /* Calculate the next order reflection (parcor) coefficient */ if(silk_abs(num) < nrg) { rc_Q31 = silk_DIV32_varQ(num, nrg, 31); } else { rc_Q31 = (num > 0) ? 
silk_int32_MAX : silk_int32_MIN; } /* Update inverse prediction gain */ tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31); tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2); if(tmp1 <= minInvGain_Q30) { /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */ tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30); /* Q30 */ rc_Q31 = silk_SQRT_APPROX(tmp2); /* Q15 */ /* Newton-Raphson iteration */ rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1); /* Q15 */ rc_Q31 = silk_LSHIFT32(rc_Q31, 16); /* Q31 */ if(num < 0) { /* Ensure adjusted reflection coefficients has the original sign */ rc_Q31 = -rc_Q31; } invGain_Q30 = minInvGain_Q30; reached_max_gain = 1; } else { invGain_Q30 = tmp1; } /* Update the AR coefficients */ for(k = 0; k < (n + 1) >> 1; k++) { tmp1 = Af_QA[ k ]; /* QA */ tmp2 = Af_QA[ n - k - 1 ]; /* QA */ Af_QA[ k ] = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1); /* QA */ Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1); /* QA */ } Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA); /* QA */ if(reached_max_gain) { /* Reached max prediction gain; set remaining coefficients to zero and exit loop */ for(k = n + 1; k < D; k++) { Af_QA[ k ] = 0; } break; } /* Update C * Af and C * Ab */ for(k = 0; k <= n + 1; k++) { tmp1 = CAf[ k ]; /* Q(-rshifts) */ tmp2 = CAb[ n - k + 1 ]; /* Q(-rshifts) */ CAf[ k ] = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1); /* Q(-rshifts) */ CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1); /* Q(-rshifts) */ } } if(reached_max_gain) { for(k = 0; k < D; k++) { /* Scale coefficients */ A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16); } /* Subtract energy of preceding samples from C0 */ if(rshifts > 0) { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts); } } else { for(s = 0; s < nb_subfr; s++) { x_ptr = x + s * subfr_length; C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts); } } /* Approximate residual energy */ *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2); *res_nrg_Q = -rshifts; } else { /* Return residual energy */ nrg = CAf[ 0 ]; /* Q(-rshifts) */ tmp1 = (opus_int32)1 << 16; /* Q16 */ for(k = 0; k < D; k++) { Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16); /* Q16 */ nrg = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1); /* Q(-rshifts) */ tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1); /* Q16 */ A_Q16[ k ] = -Atmp1; } *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */ *res_nrg_Q = -rshifts; } }
// arithmetic operators RETi ADD( const __m128i x, const __m128i y ) { return _mm_add_epi32(x,y); }
void minmax_vec2(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_) { // We suppose that pointers are aligned on an 16-byte boundary // Initialise SSE registers __m128i sse_idx_min = _mm_setzero_si128(); __m128i sse_idx_max = _mm_setzero_si128(); __m128 sse_min = _mm_set1_ps(FLT_MAX); __m128 sse_max = _mm_set1_ps(FLT_MIN); // We will unroll the for-loop by for, thus doing // (n/4) iterations. const uint32_t n_sse = n & ~3ULL; __m128i sse_idx = _mm_set_epi32(3, 2, 1, 0); const __m128i sse_4 = _mm_set1_epi32(4); for (uint32_t i = 0; i < n_sse; i += 4) { const __m128 sse_v = _mm_load_ps(&buf[i]); const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min); const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max); sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min); sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max); sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx, (__m128) sse_cmp_min); sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx, (__m128) sse_cmp_max); sse_idx = _mm_add_epi32(sse_idx, sse_4); } // SSE reduction __m128 sse_min_permute = _mm_shuffle_epi32(sse_min, 2 | (3<<2)); __m128 sse_max_permute = _mm_shuffle_epi32(sse_max, 2 | (3<<2)); __m128i sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 2 | (3<<2)); __m128i sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 2 | (3<<2)); __m128 sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min); __m128 sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max); sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min); sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max); sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx_min_permute, (__m128) sse_cmp_min); sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx_max_permute, (__m128) sse_cmp_max); sse_min_permute = _mm_shuffle_epi32(sse_min, 1); sse_max_permute = _mm_shuffle_epi32(sse_max, 1); sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 1); sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 1); sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min); sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max); sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min); sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max); sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx_min_permute, (__m128) sse_cmp_min); sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx_max_permute, (__m128) sse_cmp_max); // Epilogue float min, max; uint32_t idx_min, idx_max; _mm_store_ss(&min, sse_min); _mm_store_ss(&max, sse_max); idx_min = _mm_extract_epi32(sse_idx_min, 0); idx_max = _mm_extract_epi32(sse_idx_max, 0); for (uint32_t i = n_sse; i < n; i++) { const float v = buf[i]; if (v < min) { min = v; idx_min = i; } if (v > max) { max = v; idx_max = i; } } *idx_min_ = idx_min; *min_ = min; *idx_max_ = idx_max; *max_ = max; }
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) { int r; __m128i x0; __m128i x1; __m128i x2; __m128i x3; __m128i x4; __m128i x5; __m128i x6; __m128i x7; __m128i y0; __m128i y1; __m128i y2; __m128i y3; while (databitlen >= 8 && state->pos != 0) { ((unsigned char *) state->x)[state->pos / 8] ^= *data; data += 1; databitlen -= 8; state->pos += 8; if (state->pos == 8 * CUBEHASH_BLOCKBYTES) { transform(state,CUBEHASH_ROUNDS); state->pos = 0; } } x0 = state->x[0]; x1 = state->x[1]; x2 = state->x[2]; x3 = state->x[3]; x4 = state->x[4]; x5 = state->x[5]; x6 = state->x[6]; x7 = state->x[7]; while (databitlen >= 8 * CUBEHASH_BLOCKBYTES) { x0 = _mm_xor_si128(x0,_mm_set_epi32(0,0,0,(crypto_uint32) *(crypto_uint16 *) data)); data += CUBEHASH_BLOCKBYTES; databitlen -= 8 * CUBEHASH_BLOCKBYTES; for (r = 0;r < CUBEHASH_ROUNDS;++r) { x4 = _mm_add_epi32(x0,x4); x5 = _mm_add_epi32(x1,x5); x6 = _mm_add_epi32(x2,x6); x7 = _mm_add_epi32(x3,x7); y0 = x2; y1 = x3; y2 = x0; y3 = x1; x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25)); x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25)); x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25)); x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25)); x0 = _mm_xor_si128(x0,x4); x1 = _mm_xor_si128(x1,x5); x2 = _mm_xor_si128(x2,x6); x3 = _mm_xor_si128(x3,x7); x4 = _mm_shuffle_epi32(x4,0x4e); x5 = _mm_shuffle_epi32(x5,0x4e); x6 = _mm_shuffle_epi32(x6,0x4e); x7 = _mm_shuffle_epi32(x7,0x4e); x4 = _mm_add_epi32(x0,x4); x5 = _mm_add_epi32(x1,x5); x6 = _mm_add_epi32(x2,x6); x7 = _mm_add_epi32(x3,x7); y0 = x1; y1 = x0; y2 = x3; y3 = x2; x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21)); x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21)); x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21)); x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21)); x0 = _mm_xor_si128(x0,x4); x1 = _mm_xor_si128(x1,x5); x2 = _mm_xor_si128(x2,x6); x3 = _mm_xor_si128(x3,x7); x4 = _mm_shuffle_epi32(x4,0xb1); x5 = _mm_shuffle_epi32(x5,0xb1); x6 = _mm_shuffle_epi32(x6,0xb1); x7 = _mm_shuffle_epi32(x7,0xb1); } } state->x[0] = x0; state->x[1] = x1; state->x[2] = x2; state->x[3] = x3; state->x[4] = x4; state->x[5] = x5; state->x[6] = x6; state->x[7] = x7; while (databitlen >= 8) { ((unsigned char *) state->x)[state->pos / 8] ^= *data; data += 1; databitlen -= 8; state->pos += 8; if (state->pos == 8 * CUBEHASH_BLOCKBYTES) { transform(state,CUBEHASH_ROUNDS); state->pos = 0; } } if (databitlen > 0) { ((unsigned char *) state->x)[state->pos / 8] ^= *data; state->pos += databitlen; } return SUCCESS; }
void minmax_vec(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_) { // We suppose that pointers are aligned on an 16-byte boundary // Initialise SSE registers __m128i sse_idx_min = _mm_setzero_si128(); __m128i sse_idx_max = _mm_setzero_si128(); __m128 sse_min = _mm_set1_ps(FLT_MAX); __m128 sse_max = _mm_set1_ps(FLT_MIN); // We will unroll the for-loop by for, thus doing // (n/4) iterations. const uint32_t n_sse = n & ~3ULL; __m128i sse_idx = _mm_set_epi32(3, 2, 1, 0); const __m128i sse_4 = _mm_set1_epi32(4); for (uint32_t i = 0; i < n_sse; i += 4) { const __m128 sse_v = _mm_load_ps(&buf[i]); const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min); const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max); sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min); sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max); sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx, (__m128) sse_cmp_min); sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx, (__m128) sse_cmp_max); sse_idx = _mm_add_epi32(sse_idx, sse_4); } // SSE reduction float __attribute__((aligned(16))) mins[4]; float __attribute__((aligned(16))) maxs[4]; _mm_store_ps(mins, sse_min); _mm_store_ps(maxs, sse_max); float min = mins[0]; float max = maxs[0]; uint32_t idx_min = _mm_extract_epi32(sse_idx_min, 0); uint32_t idx_max = _mm_extract_epi32(sse_idx_max, 0); // Unrolled by GCC for (int i = 1; i < 4; i++) { float v = mins[i]; if (v < min) { min = v; idx_min = _mm_extract_epi32(sse_idx_min, i); } v = maxs[i]; if (v > max) { max = v; idx_max = _mm_extract_epi32(sse_idx_max, i); } } // Epilogue for (uint32_t i = n_sse; i < n; i++) { const float v = buf[i]; if (v < min) { min = v; idx_min = i; } if (v > max) { max = v; idx_max = i; } } *idx_min_ = idx_min; *min_ = min; *idx_max_ = idx_max; *max_ = max; }
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order); if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); uint32_t e1, e3; end += default_partition_samples; e1 = (residual_sample + 3) & ~3; e3 = end & ~3; if(e1 > end) e1 = end; /* try flac -l 1 -b 16 and you'll be here */ /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */ for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=4) { __m128i mm_res = local_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample))); mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); mm_sum = _mm_add_epi32(mm_sum, mm_res); } mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2))); mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2))); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); /* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */ #if (defined _MSC_VER) && (defined FLAC__CPU_X86_64) abs_residual_partition_sums[partition] &= 0xFFFFFFFF; #endif } } else { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); uint32_t e1, e3; end += default_partition_samples; e1 = (residual_sample + 1) & ~1; e3 = end & ~1; FLAC__ASSERT(e1 <= end); for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); /* 0 0 0 |r0| == 00 |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=2) { __m128i mm_res = local_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /* 0 0 |r1| |r0| */ mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); mm_sum = _mm_add_epi64(mm_sum, mm_res); } mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8)); _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum); } } } /* now merge partitions for lower orders */ { uint32_t from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { uint32_t i; partitions >>= 1; for(i = 0; i < partitions; i++) { 
abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } }
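/* Hedged scalar reference for the SSE2 routine above: it sums |residual[i]|
   per partition at max_partition_order (the first partition being shorter by
   predictor_order samples) and then merges adjacent pairs for every lower
   order down to min_partition_order. The helper name and plain uint64_t sums
   are illustrative, not part of libFLAC. */
#include <stdint.h>

static void precompute_partition_sums_ref(const int32_t *residual,
                                          uint64_t *sums,
                                          uint32_t residual_samples,
                                          uint32_t predictor_order,
                                          uint32_t min_partition_order,
                                          uint32_t max_partition_order)
{
    uint32_t partitions = 1u << max_partition_order;
    const uint32_t default_partition_samples =
        (residual_samples + predictor_order) >> max_partition_order;
    uint32_t sample = 0;

    /* finest partitioning */
    for (uint32_t p = 0; p < partitions; p++) {
        uint32_t len = default_partition_samples - (p == 0 ? predictor_order : 0);
        uint64_t sum = 0;
        for (uint32_t i = 0; i < len; i++, sample++) {
            int64_t r = residual[sample];
            sum += (uint64_t)(r < 0 ? -r : r);
        }
        sums[p] = sum;
    }

    /* merge adjacent partitions for each coarser order */
    uint32_t from = 0, to = partitions;
    for (int order = (int)max_partition_order - 1;
         order >= (int)min_partition_order; order--) {
        partitions >>= 1;
        for (uint32_t i = 0; i < partitions; i++) {
            sums[to++] = sums[from] + sums[from + 1];
            from += 2;
        }
    }
}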
/** * salsa20_8(B): * Apply the salsa20/8 core to the provided block. */ static void salsa20_8(__m128i B[4]) { __m128i X0, X1, X2, X3; __m128i T; size_t i; X0 = B[0]; X1 = B[1]; X2 = B[2]; X3 = B[3]; for (i = 0; i < 8; i += 2) { /* Operate on "columns". */ T = _mm_add_epi32(X0, X3); X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7)); X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25)); T = _mm_add_epi32(X1, X0); X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); T = _mm_add_epi32(X2, X1); X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13)); X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19)); T = _mm_add_epi32(X3, X2); X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); /* Rearrange data. */ X1 = _mm_shuffle_epi32(X1, 0x93); X2 = _mm_shuffle_epi32(X2, 0x4E); X3 = _mm_shuffle_epi32(X3, 0x39); /* Operate on "rows". */ T = _mm_add_epi32(X0, X1); X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7)); X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25)); T = _mm_add_epi32(X3, X0); X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9)); X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23)); T = _mm_add_epi32(X2, X3); X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13)); X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19)); T = _mm_add_epi32(X1, X2); X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18)); X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14)); /* Rearrange data. */ X1 = _mm_shuffle_epi32(X1, 0x39); X2 = _mm_shuffle_epi32(X2, 0x4E); X3 = _mm_shuffle_epi32(X3, 0x93); } B[0] = _mm_add_epi32(B[0], X0); B[1] = _mm_add_epi32(B[1], X1); B[2] = _mm_add_epi32(B[2], X2); B[3] = _mm_add_epi32(B[3], X3); }
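/* For reference: each xor-with-two-shifts pair above is a 32-bit rotation,
   so the loop body is the usual Salsa20 quarter-round x ^= ROTL32(a + b, k).
   A hedged scalar sketch of one column/row quarter-round, with illustrative
   helper names: */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int k)
{
    return (v << k) | (v >> (32 - k));
}

static void salsa20_quarterround_ref(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *b ^= rotl32(*a + *d, 7);
    *c ^= rotl32(*b + *a, 9);
    *d ^= rotl32(*c + *b, 13);
    *a ^= rotl32(*d + *c, 18);
}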
void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output, const int8_t *cos_bit, const int8_t *stage_range) { const int txfm_size = 32; const int num_per_128 = 4; const int32_t *cospi; __m128i buf0[32]; __m128i buf1[32]; int col_num = txfm_size / num_per_128; int bit; int col; (void)stage_range; for (col = 0; col < col_num; col++) { // stage 0; int32_t stage_idx = 0; buf0[0] = input[0 * col_num + col]; buf0[1] = input[1 * col_num + col]; buf0[2] = input[2 * col_num + col]; buf0[3] = input[3 * col_num + col]; buf0[4] = input[4 * col_num + col]; buf0[5] = input[5 * col_num + col]; buf0[6] = input[6 * col_num + col]; buf0[7] = input[7 * col_num + col]; buf0[8] = input[8 * col_num + col]; buf0[9] = input[9 * col_num + col]; buf0[10] = input[10 * col_num + col]; buf0[11] = input[11 * col_num + col]; buf0[12] = input[12 * col_num + col]; buf0[13] = input[13 * col_num + col]; buf0[14] = input[14 * col_num + col]; buf0[15] = input[15 * col_num + col]; buf0[16] = input[16 * col_num + col]; buf0[17] = input[17 * col_num + col]; buf0[18] = input[18 * col_num + col]; buf0[19] = input[19 * col_num + col]; buf0[20] = input[20 * col_num + col]; buf0[21] = input[21 * col_num + col]; buf0[22] = input[22 * col_num + col]; buf0[23] = input[23 * col_num + col]; buf0[24] = input[24 * col_num + col]; buf0[25] = input[25 * col_num + col]; buf0[26] = input[26 * col_num + col]; buf0[27] = input[27 * col_num + col]; buf0[28] = input[28 * col_num + col]; buf0[29] = input[29 * col_num + col]; buf0[30] = input[30 * col_num + col]; buf0[31] = input[31 * col_num + col]; // stage 1 stage_idx++; buf1[0] = buf0[31]; buf1[1] = buf0[0]; buf1[2] = buf0[29]; buf1[3] = buf0[2]; buf1[4] = buf0[27]; buf1[5] = buf0[4]; buf1[6] = buf0[25]; buf1[7] = buf0[6]; buf1[8] = buf0[23]; buf1[9] = buf0[8]; buf1[10] = buf0[21]; buf1[11] = buf0[10]; buf1[12] = buf0[19]; buf1[13] = buf0[12]; buf1[14] = buf0[17]; buf1[15] = buf0[14]; buf1[16] = buf0[15]; buf1[17] = buf0[16]; buf1[18] = buf0[13]; buf1[19] = buf0[18]; buf1[20] = buf0[11]; buf1[21] = buf0[20]; buf1[22] = buf0[9]; buf1[23] = buf0[22]; buf1[24] = buf0[7]; buf1[25] = buf0[24]; buf1[26] = buf0[5]; buf1[27] = buf0[26]; buf1[28] = buf0[3]; buf1[29] = buf0[28]; buf1[30] = buf0[1]; buf1[31] = buf0[30]; // stage 2 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], bit); btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], bit); btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5], bit); btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], buf0[7], bit); btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], buf0[9], bit); btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10], buf0[11], bit); btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12], buf0[13], bit); btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14], buf0[15], bit); btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16], buf0[17], bit); btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18], buf0[19], bit); btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20], buf0[21], bit); btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22], buf0[23], bit); btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24], buf0[25], bit); btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26], buf0[27], bit); 
btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28], buf0[29], bit); btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30], buf0[31], bit); // stage 3 stage_idx++; buf1[0] = _mm_add_epi32(buf0[0], buf0[16]); buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]); buf1[1] = _mm_add_epi32(buf0[1], buf0[17]); buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]); buf1[2] = _mm_add_epi32(buf0[2], buf0[18]); buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]); buf1[3] = _mm_add_epi32(buf0[3], buf0[19]); buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]); buf1[4] = _mm_add_epi32(buf0[4], buf0[20]); buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]); buf1[5] = _mm_add_epi32(buf0[5], buf0[21]); buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]); buf1[6] = _mm_add_epi32(buf0[6], buf0[22]); buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]); buf1[7] = _mm_add_epi32(buf0[7], buf0[23]); buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]); buf1[8] = _mm_add_epi32(buf0[8], buf0[24]); buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]); buf1[9] = _mm_add_epi32(buf0[9], buf0[25]); buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]); buf1[10] = _mm_add_epi32(buf0[10], buf0[26]); buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]); buf1[11] = _mm_add_epi32(buf0[11], buf0[27]); buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]); buf1[12] = _mm_add_epi32(buf0[12], buf0[28]); buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]); buf1[13] = _mm_add_epi32(buf0[13], buf0[29]); buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]); buf1[14] = _mm_add_epi32(buf0[14], buf0[30]); buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]); buf1[15] = _mm_add_epi32(buf0[15], buf0[31]); buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]); // stage 4 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; buf0[4] = buf1[4]; buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[13] = buf1[13]; buf0[14] = buf1[14]; buf0[15] = buf1[15]; btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16], buf0[17], bit); btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18], buf0[19], bit); btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20], buf0[21], bit); btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22], buf0[23], bit); btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24], buf0[25], bit); btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26], buf0[27], bit); btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28], buf0[29], bit); btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30], buf0[31], bit); // stage 5 stage_idx++; buf1[0] = _mm_add_epi32(buf0[0], buf0[8]); buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]); buf1[1] = _mm_add_epi32(buf0[1], buf0[9]); buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]); buf1[2] = _mm_add_epi32(buf0[2], buf0[10]); buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]); buf1[3] = _mm_add_epi32(buf0[3], buf0[11]); buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]); buf1[4] = _mm_add_epi32(buf0[4], buf0[12]); buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]); buf1[5] = _mm_add_epi32(buf0[5], buf0[13]); buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]); buf1[6] = _mm_add_epi32(buf0[6], buf0[14]); buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]); buf1[7] = _mm_add_epi32(buf0[7], buf0[15]); buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]); buf1[16] = _mm_add_epi32(buf0[16], buf0[24]); buf1[24] = 
_mm_sub_epi32(buf0[16], buf0[24]); buf1[17] = _mm_add_epi32(buf0[17], buf0[25]); buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]); buf1[18] = _mm_add_epi32(buf0[18], buf0[26]); buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]); buf1[19] = _mm_add_epi32(buf0[19], buf0[27]); buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]); buf1[20] = _mm_add_epi32(buf0[20], buf0[28]); buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]); buf1[21] = _mm_add_epi32(buf0[21], buf0[29]); buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]); buf1[22] = _mm_add_epi32(buf0[22], buf0[30]); buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]); buf1[23] = _mm_add_epi32(buf0[23], buf0[31]); buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]); // stage 6 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; buf0[4] = buf1[4]; buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], bit); btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], buf0[11], bit); btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], buf0[13], bit); btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], buf0[15], bit); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; buf0[20] = buf1[20]; buf0[21] = buf1[21]; buf0[22] = buf1[22]; buf0[23] = buf1[23]; btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24], buf0[25], bit); btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26], buf0[27], bit); btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28], buf0[29], bit); btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30], buf0[31], bit); // stage 7 stage_idx++; buf1[0] = _mm_add_epi32(buf0[0], buf0[4]); buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]); buf1[1] = _mm_add_epi32(buf0[1], buf0[5]); buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]); buf1[2] = _mm_add_epi32(buf0[2], buf0[6]); buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]); buf1[3] = _mm_add_epi32(buf0[3], buf0[7]); buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]); buf1[8] = _mm_add_epi32(buf0[8], buf0[12]); buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]); buf1[9] = _mm_add_epi32(buf0[9], buf0[13]); buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]); buf1[10] = _mm_add_epi32(buf0[10], buf0[14]); buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]); buf1[11] = _mm_add_epi32(buf0[11], buf0[15]); buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]); buf1[16] = _mm_add_epi32(buf0[16], buf0[20]); buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]); buf1[17] = _mm_add_epi32(buf0[17], buf0[21]); buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]); buf1[18] = _mm_add_epi32(buf0[18], buf0[22]); buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]); buf1[19] = _mm_add_epi32(buf0[19], buf0[23]); buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]); buf1[24] = _mm_add_epi32(buf0[24], buf0[28]); buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]); buf1[25] = _mm_add_epi32(buf0[25], buf0[29]); buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]); buf1[26] = _mm_add_epi32(buf0[26], buf0[30]); buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]); buf1[27] = _mm_add_epi32(buf0[27], buf0[31]); buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]); // stage 8 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5], bit); btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], 
buf1[7], buf0[6], buf0[7], bit); buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], buf0[13], bit); btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], buf0[15], bit); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20], buf0[21], bit); btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22], buf0[23], bit); buf0[24] = buf1[24]; buf0[25] = buf1[25]; buf0[26] = buf1[26]; buf0[27] = buf1[27]; btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28], buf0[29], bit); btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30], buf0[31], bit); // stage 9 stage_idx++; buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); buf1[4] = _mm_add_epi32(buf0[4], buf0[6]); buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]); buf1[5] = _mm_add_epi32(buf0[5], buf0[7]); buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]); buf1[8] = _mm_add_epi32(buf0[8], buf0[10]); buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]); buf1[9] = _mm_add_epi32(buf0[9], buf0[11]); buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]); buf1[12] = _mm_add_epi32(buf0[12], buf0[14]); buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]); buf1[13] = _mm_add_epi32(buf0[13], buf0[15]); buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]); buf1[16] = _mm_add_epi32(buf0[16], buf0[18]); buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]); buf1[17] = _mm_add_epi32(buf0[17], buf0[19]); buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]); buf1[20] = _mm_add_epi32(buf0[20], buf0[22]); buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]); buf1[21] = _mm_add_epi32(buf0[21], buf0[23]); buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]); buf1[24] = _mm_add_epi32(buf0[24], buf0[26]); buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]); buf1[25] = _mm_add_epi32(buf0[25], buf0[27]); buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]); buf1[28] = _mm_add_epi32(buf0[28], buf0[30]); buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]); buf1[29] = _mm_add_epi32(buf0[29], buf0[31]); buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]); // stage 10 stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3], bit); buf0[4] = buf1[4]; buf0[5] = buf1[5]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7], bit); buf0[8] = buf1[8]; buf0[9] = buf1[9]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], buf0[11], bit); buf0[12] = buf1[12]; buf0[13] = buf1[13]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], buf0[15], bit); buf0[16] = buf1[16]; buf0[17] = buf1[17]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18], buf0[19], bit); buf0[20] = buf1[20]; buf0[21] = buf1[21]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22], buf0[23], bit); buf0[24] = buf1[24]; buf0[25] = buf1[25]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26], buf0[27], bit); buf0[28] = buf1[28]; buf0[29] = buf1[29]; btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30], buf0[31], bit); // stage 11 stage_idx++; buf1[0] = buf0[0]; buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]); buf1[2] = buf0[24]; buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), 
buf0[8]); buf1[4] = buf0[12]; buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]); buf1[6] = buf0[20]; buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]); buf1[8] = buf0[6]; buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]); buf1[10] = buf0[30]; buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]); buf1[12] = buf0[10]; buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]); buf1[14] = buf0[18]; buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]); buf1[16] = buf0[3]; buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]); buf1[18] = buf0[27]; buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]); buf1[20] = buf0[15]; buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]); buf1[22] = buf0[23]; buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]); buf1[24] = buf0[5]; buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]); buf1[26] = buf0[29]; buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]); buf1[28] = buf0[9]; buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]); buf1[30] = buf0[17]; buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]); output[0 * col_num + col] = buf1[0]; output[1 * col_num + col] = buf1[1]; output[2 * col_num + col] = buf1[2]; output[3 * col_num + col] = buf1[3]; output[4 * col_num + col] = buf1[4]; output[5 * col_num + col] = buf1[5]; output[6 * col_num + col] = buf1[6]; output[7 * col_num + col] = buf1[7]; output[8 * col_num + col] = buf1[8]; output[9 * col_num + col] = buf1[9]; output[10 * col_num + col] = buf1[10]; output[11 * col_num + col] = buf1[11]; output[12 * col_num + col] = buf1[12]; output[13 * col_num + col] = buf1[13]; output[14 * col_num + col] = buf1[14]; output[15 * col_num + col] = buf1[15]; output[16 * col_num + col] = buf1[16]; output[17 * col_num + col] = buf1[17]; output[18 * col_num + col] = buf1[18]; output[19 * col_num + col] = buf1[19]; output[20 * col_num + col] = buf1[20]; output[21 * col_num + col] = buf1[21]; output[22 * col_num + col] = buf1[22]; output[23 * col_num + col] = buf1[23]; output[24 * col_num + col] = buf1[24]; output[25 * col_num + col] = buf1[25]; output[26 * col_num + col] = buf1[26]; output[27 * col_num + col] = buf1[27]; output[28 * col_num + col] = buf1[28]; output[29 * col_num + col] = buf1[29]; output[30 * col_num + col] = buf1[30]; output[31 * col_num + col] = buf1[31]; } }
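/* Hedged scalar model of the butterfly used throughout the transform above.
   btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) is assumed to pair
   two 32-bit lanes with two cospi weights and round-shift both results; the
   exact macro lives in the AV1 sources, so treat this reference and its names
   as an approximation, not the library's definition. */
#include <stdint.h>

static inline int32_t round_shift_s32(int64_t v, int bit)
{
    return (int32_t)((v + ((int64_t)1 << (bit - 1))) >> bit);
}

static void butterfly_type0_ref(int32_t w0, int32_t w1,
                                int32_t in0, int32_t in1,
                                int32_t *out0, int32_t *out1, int bit)
{
    *out0 = round_shift_s32((int64_t)in0 * w0 + (int64_t)in1 * w1, bit);
    *out1 = round_shift_s32((int64_t)in0 * w1 - (int64_t)in1 * w0, bit);
}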
void nb_kernel430_sse2_double(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset,tj,tj2,nti; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0,-1.0}; const __m128d zero = {0.0,0.0}; const __m128d half = {0.5,0.5}; const __m128d two = {2.0,2.0}; const __m128d three = {3.0,3.0}; const __m128d six = {6.0,6.0}; const __m128d twelwe = {12.0,12.0}; const __m128i four = _mm_set_epi32(4,4,4,4); gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ Vvdwtmp = _mm_setzero_pd(); Vvdwtot = _mm_setzero_pd(); dvdatmp = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); vgb = _mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); xmm1 = _mm_setzero_pd(); xmm2 = _mm_setzero_pd(); xmm3 = _mm_setzero_pd(); xmm4 = _mm_setzero_pd(); jnr1 = jnr2 = 0; j13 = j23 = 0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); nti = 2*ntype*type[ii]; fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); Vvdwtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = 
_mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q = _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load VdW parameters */ tj = nti+2*type[jnr1]; tj2 = nti+2*type[jnr2]; xmm1 = _mm_loadu_pd(vdwparam+tj); xmm2 = _mm_loadu_pd(vdwparam+tj2); c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); /* Calculate GB table index */ r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); /* Calculate VDW table index */ rt = _mm_mul_pd(r,tabscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - dispersion */ xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); Vvdw6 = _mm_mul_pd(c6,VV); fijD = _mm_mul_pd(c6,FF); /* Tabulated VdW interaction - repulsion */ nnn = _mm_add_epi32(nnn,four); xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = 
_mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); Vvdw12 = _mm_mul_pd(c12,VV); fijR = _mm_mul_pd(c12,FF); Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6); Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp); xmm1 = _mm_add_pd(fijD,fijR); xmm1 = _mm_mul_pd(xmm1,tabscale); xmm1 = _mm_add_pd(xmm1,fijC); xmm1 = _mm_sub_pd(xmm1,fscal); fscal = _mm_mul_pd(xmm1,neg); fscal = _mm_mul_pd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_pd(fscal,dx); t2 = _mm_mul_pd(fscal,dy); t3 = _mm_mul_pd(fscal,dz); /* update the i force */ fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); /* accumulate forces from memory */ xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */ xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */ xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */ xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */ /* transpose */ xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */ xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */ xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* subtract partial forces */ xmm5 = _mm_sub_pd(xmm5,t1); xmm6 = _mm_sub_pd(xmm6,t2); xmm7 = _mm_sub_pd(xmm7,t3); xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */ xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* store fx and fy */ _mm_storeu_pd(faction+j13,xmm1); _mm_storeu_pd(faction+j23,xmm2); /* .. then fz */ _mm_storel_pd(faction+j13+2,xmm7); _mm_storel_pd(faction+j23+2,xmm7); } /* In double precision, offset can only be either 0 or 1 */ if(offset!=0) { jnr1 = jjnr[k]; j13 = jnr1*3; jx = _mm_load_sd(pos+j13); jy = _mm_load_sd(pos+j13+1); jz = _mm_load_sd(pos+j13+2); isaj = _mm_load_sd(invsqrta+jnr1); isaprod = _mm_mul_sd(isai,isaj); dvdaj = _mm_load_sd(dvda+jnr1); q = _mm_load_sd(charge+jnr1); qq = _mm_mul_sd(iq,q); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); qq = _mm_mul_sd(isaprod,qq); qq = _mm_mul_sd(qq,neg); gbscale = _mm_mul_sd(isaprod,gbtabscale); /* Load VdW parameters */ tj = nti+2*type[jnr1]; c6 = _mm_load_sd(vdwparam+tj); c12 = _mm_load_sd(vdwparam+tj+1); /* Calculate GB table index */ r = _mm_mul_sd(rsq11,rinv); rt = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); H = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,H); vgb = _mm_mul_sd(qq,VV); fijC = _mm_mul_sd(qq,FF); fijC = _mm_mul_sd(fijC,gbscale); dvdatmp = 
_mm_mul_sd(fijC,r); dvdatmp = _mm_add_sd(vgb,dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp,neg); dvdatmp = _mm_mul_sd(dvdatmp,half); dvdasum = _mm_add_sd(dvdasum,dvdatmp); xmm1 = _mm_mul_sd(dvdatmp,isaj); xmm1 = _mm_mul_sd(xmm1,isaj); dvdaj = _mm_add_sd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); vctot = _mm_add_sd(vctot,vcoul); vgbtot = _mm_add_sd(vgbtot,vgb); /* Calculate VDW table index */ rt = _mm_mul_sd(r,tabscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - dispersion */ xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); Vvdw6 = _mm_mul_sd(c6,VV); fijD = _mm_mul_sd(c6,FF); /* Tabulated VdW interaction - repulsion */ nnn = _mm_add_epi32(nnn,four); xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); Vvdw12 = _mm_mul_sd(c12,VV); fijR = _mm_mul_sd(c12,FF); Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6); Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp); xmm1 = _mm_add_sd(fijD,fijR); xmm1 = _mm_mul_sd(xmm1,tabscale); xmm1 = _mm_add_sd(xmm1,fijC); xmm1 = _mm_sub_sd(xmm1,fscal); fscal = _mm_mul_sd(xmm1,neg); fscal = _mm_mul_sd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_sd(fscal,dx); t2 = _mm_mul_sd(fscal,dy); t3 = _mm_mul_sd(fscal,dz); /* update the i force */ fix = _mm_add_sd(fix,t1); fiy = _mm_add_sd(fiy,t2); fiz = _mm_add_sd(fiz,t3); /* accumulate forces from memory */ xmm5 = _mm_load_sd(faction+j13); /* fx */ xmm6 = _mm_load_sd(faction+j13+1); /* fy */ xmm7 = _mm_load_sd(faction+j13+2); /* fz */ /* subtract partial forces */ xmm5 = _mm_sub_sd(xmm5,t1); xmm6 = _mm_sub_sd(xmm6,t2); xmm7 = _mm_sub_sd(xmm7,t3); /* store forces */ _mm_store_sd(faction+j13,xmm5); _mm_store_sd(faction+j13+1,xmm6); _mm_store_sd(faction+j13+2,xmm7); } /* fix/fiy/fiz now contain four partial terms, that all should be * added to the i particle forces */ t1 = _mm_unpacklo_pd(t1,fix); t2 = _mm_unpacklo_pd(t2,fiy); t3 = _mm_unpacklo_pd(t3,fiz); fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1)); fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1)); fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1)); 
/* Load i forces from memory */ xmm1 = _mm_load_sd(faction+ii3); xmm2 = _mm_load_sd(faction+ii3+1); xmm3 = _mm_load_sd(faction+ii3+2); /* Add to i force */ fix = _mm_add_sd(fix,xmm1); fiy = _mm_add_sd(fiy,xmm2); fiz = _mm_add_sd(fiz,xmm3); /* store i forces to memory */ _mm_store_sd(faction+ii3,fix); _mm_store_sd(faction+ii3+1,fiy); _mm_store_sd(faction+ii3+2,fiz); /* now do dvda */ dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum); dvdasum = _mm_add_pd(dvdasum,dvdatmp); _mm_storeh_pd(&dva,dvdasum); dvda[ii] = dvda[ii] + dva*isai_d*isai_d; ggid = gid[n]; /* Coulomb potential */ vcoul = _mm_unpacklo_pd(vcoul,vctot); vctot = _mm_add_pd(vctot,vcoul); _mm_storeh_pd(&vct,vctot); Vc[ggid] = Vc[ggid] + vct; /* VdW potential */ Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot); Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp); _mm_storeh_pd(&vdwt,Vvdwtot); Vvdw[ggid] = Vvdw[ggid] + vdwt; /* GB potential */ vgb = _mm_unpacklo_pd(vgb,vgbtot); vgbtot = _mm_add_pd(vgbtot,vgb); _mm_storeh_pd(&vgbt,vgbtot); gpol[ggid] = gpol[ggid] + vgbt; } *outeriter = nri; *inneriter = nj1; }
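/* Hedged scalar sketch of the table-lookup pattern that the kernel above
   repeats for the GB and VdW tables: each table entry stores four doubles
   Y, F, G, H, and for a fractional offset eps the potential VV and the force
   factor FF come from a cubic form. The helper name is illustrative. */
static void table_interp_ref(const double *tab, int idx, double eps,
                             double *VV, double *FF)
{
    double Y  = tab[idx + 0];
    double F  = tab[idx + 1];
    double G  = tab[idx + 2] * eps;
    double H  = tab[idx + 3] * eps * eps;
    double Fp = F + G + H;

    *VV = Y + Fp * eps;       /* interpolated potential */
    *FF = Fp + G + 2.0 * H;   /* derivative term used for the force */
}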
/* Input image must be large enough to have valid pixels for the offset (dx,dy). I.e., with (dx,dy)=(-10,8), x-value up to -10 and y-values up to (h-1)+8 will be accessed. The integral image will be access with (x,y) in [-1,w)x[-1,h). Note also that we use 32bit for the integral image even though the values may overflow that range. However, the modulo-arithmetic used when computing the block sums later will be still correct when the block size is not too large. */ static void buildIntegralImage_SSE(uint32_t* integral_image, int integral_stride, const uint8_t* current_image, int current_image_stride, const uint8_t* compare_image, int compare_image_stride, int w,int hStart, int hEnd, int dx,int dy) { const __m128i zero = _mm_set1_epi8(0); memset(integral_image -1 -integral_stride, 0, (w+1)*sizeof(uint32_t)); for (int y=hStart;y<hEnd;y++) { const uint8_t* p1 = current_image + y *current_image_stride; const uint8_t* p2 = compare_image + (y+dy)*compare_image_stride + dx; uint32_t* out = integral_image + y*integral_stride-1; __m128i prevadd = _mm_set1_epi32(0); const int pixels_step = 16; *out++ = 0; for (int x=0 ; x<w ; x+=pixels_step) { __m128i pa, pb; __m128i pla, plb; __m128i ldiff, lldiff, lhdiff; __m128i ltmp,htmp; __m128i ladd,hadd; __m128i pha,phb; __m128i hdiff,hldiff,hhdiff; __m128i l2tmp,h2tmp; pa = _mm_loadu_si128((__m128i*)p1); pb = _mm_loadu_si128((__m128i*)p2); pla = _mm_unpacklo_epi8(pa,zero); plb = _mm_unpacklo_epi8(pb,zero); ldiff = _mm_sub_epi16(pla,plb); ldiff = _mm_mullo_epi16(ldiff,ldiff); lldiff = _mm_unpacklo_epi16(ldiff,zero); lhdiff = _mm_unpackhi_epi16(ldiff,zero); ltmp = _mm_slli_si128(lldiff, 4); lldiff = _mm_add_epi32(lldiff, ltmp); ltmp = _mm_slli_si128(lldiff, 8); lldiff = _mm_add_epi32(lldiff, ltmp); lldiff = _mm_add_epi32(lldiff, prevadd); ladd = _mm_shuffle_epi32(lldiff, 0xff); htmp = _mm_slli_si128(lhdiff, 4); lhdiff = _mm_add_epi32(lhdiff, htmp); htmp = _mm_slli_si128(lhdiff, 8); lhdiff = _mm_add_epi32(lhdiff, htmp); lhdiff = _mm_add_epi32(lhdiff, ladd); prevadd = _mm_shuffle_epi32(lhdiff, 0xff); _mm_store_si128((__m128i*)(out), lldiff); _mm_store_si128((__m128i*)(out+4),lhdiff); pha = _mm_unpackhi_epi8(pa,zero); phb = _mm_unpackhi_epi8(pb,zero); hdiff = _mm_sub_epi16(pha,phb); hdiff = _mm_mullo_epi16(hdiff,hdiff); hldiff = _mm_unpacklo_epi16(hdiff,zero); hhdiff = _mm_unpackhi_epi16(hdiff,zero); l2tmp = _mm_slli_si128(hldiff, 4); hldiff = _mm_add_epi32(hldiff, l2tmp); l2tmp = _mm_slli_si128(hldiff, 8); hldiff = _mm_add_epi32(hldiff, l2tmp); hldiff = _mm_add_epi32(hldiff, prevadd); hadd = _mm_shuffle_epi32(hldiff, 0xff); h2tmp = _mm_slli_si128(hhdiff, 4); hhdiff = _mm_add_epi32(hhdiff, h2tmp); h2tmp = _mm_slli_si128(hhdiff, 8); hhdiff = _mm_add_epi32(hhdiff, h2tmp); hhdiff = _mm_add_epi32(hhdiff, hadd); prevadd = _mm_shuffle_epi32(hhdiff, 0xff); _mm_store_si128((__m128i*)(out+8), hldiff); _mm_store_si128((__m128i*)(out+12),hhdiff); out+=pixels_step; p1 +=pixels_step; p2 +=pixels_step; } if (y>0) { out = integral_image + y*integral_stride; for (int x=0 ; x<w ; x+=pixels_step) { *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride), *(__m128i*)(out)); *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride), *(__m128i*)(out+4)); *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride), *(__m128i*)(out+8)); *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride), *(__m128i*)(out+12)); out += 4*4; } } } }
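/* Hedged scalar reference for buildIntegralImage_SSE() above: row-wise prefix
   sums of the squared pixel differences, then (for y > 0) the previous
   integral row is added, mirroring the two passes of the SSE version. The
   helper name is illustrative. */
#include <stdint.h>
#include <string.h>

static void buildIntegralImage_ref(uint32_t* integral_image, int integral_stride,
                                   const uint8_t* current_image, int current_image_stride,
                                   const uint8_t* compare_image, int compare_image_stride,
                                   int w, int hStart, int hEnd, int dx, int dy)
{
    memset(integral_image - 1 - integral_stride, 0, (w + 1) * sizeof(uint32_t));

    for (int y = hStart; y < hEnd; y++) {
        const uint8_t* p1 = current_image + y * current_image_stride;
        const uint8_t* p2 = compare_image + (y + dy) * compare_image_stride + dx;
        uint32_t* out = integral_image + y * integral_stride - 1;

        *out++ = 0;                      /* column -1 */
        uint32_t rowsum = 0;
        for (int x = 0; x < w; x++) {
            int d = (int)p1[x] - (int)p2[x];
            rowsum += (uint32_t)(d * d);
            out[x] = rowsum;             /* horizontal prefix sum */
        }
        if (y > 0) {
            for (int x = 0; x < w; x++)
                out[x] += out[x - integral_stride];  /* add the row above */
        }
    }
}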
FourColorVals VCALL operator+(FourColorVals b) { return FourColorVals(_mm_add_epi32(vec,b.vec)); }
void sincos_ps(__m128 x, __m128 *s, __m128 *c) { __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; __m128i emm0, emm2, emm4; sign_bit_sin = x; x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask)); sign_bit_sin = _mm_and_ps(sign_bit_sin, *reinterpret_cast<const __m128*>(_pi_sign_mask)); y = _mm_mul_ps(x, *_ps_cephes_FOPI); emm2 = _mm_cvttps_epi32(y); emm2 = _mm_add_epi32(emm2, *_pi_1); emm2 = _mm_and_si128(emm2, *_pi_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; emm0 = _mm_and_si128(emm2, *_pi_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); emm2 = _mm_and_si128(emm2, *_pi_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); xmm1 = *_ps_minus_cephes_DP1; xmm2 = *_ps_minus_cephes_DP2; xmm3 = *_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); emm4 = _mm_sub_epi32(emm4, *_pi_2); emm4 = _mm_andnot_si128(emm4, *_pi_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); __m128 z = _mm_mul_ps(x, x); y = *_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *_ps_1); __m128 y2 = *_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1, ysin2); xmm2 = _mm_add_ps(y, y2); *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
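/* Hedged usage sketch for sincos_ps(): evaluate sine and cosine for four
   packed floats and compare against libm. It assumes the _ps_... and _pi_...
   constant tables referenced above are defined elsewhere in the translation
   unit; the demo function name and sample inputs are illustrative. */
#include <math.h>
#include <stdio.h>
#include <emmintrin.h>

static void sincos_ps_demo(void)
{
    float in[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
    float sv[4], cv[4];
    __m128 s, c;

    sincos_ps(_mm_loadu_ps(in), &s, &c);
    _mm_storeu_ps(sv, s);
    _mm_storeu_ps(cv, c);

    for (int i = 0; i < 4; i++)
        printf("x=%.2f  sin %.6f (libm %.6f)  cos %.6f (libm %.6f)\n",
               in[i], sv[i], sinf(in[i]), cv[i], cosf(in[i]));
}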
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *p3 = p2 + bstride; uint8_t *p4 = p3 + bstride; uint8_t *orig = p0, *end = p4; line_copy8(p0, srcp + 2 * stride , width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 rdiv = _mm_set1_ps((float)ch->rdiv); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix[25]; for (int i = 0; i < 25; i++) { matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t *array[] = { p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 }; for (int x = 0; x < width; x += 16) { __m128i sum[4] = { zero, zero, zero, zero }; for (int i = 0; i < 25; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x)); xmm2 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i])); sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i])); xmm1 = _mm_unpackhi_epi16(xmm2, zero); xmm0 = _mm_unpacklo_epi16(xmm2, zero); sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i])); sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i])); } for (int i = 0; i < 4; i++) { __m128 sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); sumfp = _mm_add_ps(sumfp, bias); if (!ch->saturate) { sumfp = mm_abs_ps(sumfp); } sum[i] = _mm_cvttps_epi32(sumfp); } sum[0] = _mm_packs_epi32(sum[0], sum[1]); sum[1] = _mm_packs_epi32(sum[2], sum[3]); sum[0] = _mm_packus_epi16(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
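/* Hedged scalar model of the per-pixel work in proc_8bit_sse2() above: a 5x5
   convolution over the buffered lines, then the same rdiv/bias/optional-abs
   steps, truncation, and clamp to 8 bits (the packs/packus pair clamps to
   0..255). The helper name and the rows[] layout are illustrative. */
#include <math.h>
#include <stdint.h>

static uint8_t convolve5x5_ref(const uint8_t *rows[5], int x,
                               const int16_t m[25], float rdiv, float bias,
                               int saturate)
{
    int32_t sum = 0;
    for (int i = 0; i < 25; i++)
        sum += (int32_t)rows[i / 5][x + (i % 5) - 2] * m[i];

    float v = (float)sum * rdiv + bias;
    if (!saturate)
        v = fabsf(v);

    int32_t t = (int32_t)v;        /* truncation, like _mm_cvttps_epi32 */
    if (t < 0)   t = 0;
    if (t > 255) t = 255;
    return (uint8_t)t;
}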