// Shift v0 left by one element, pulling in the low element of v1.
// Example: v0 = {1.0, 2.0}, v1 = {3.0, 4.0}  ->  v0 = {2.0, 3.0}
inline void rotate_left_wm1(F64vec2 *v0, const F64vec2 v1)
{
    *v0 = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(v1),
                                           _mm_castpd_si128(*v0), 8));
}
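
// Minimal standalone sketch (not from the original source) of what the
// _mm_alignr_epi8 call above does, using raw __m128d instead of the F64vec2
// wrapper: shifting the v1:v0 byte pair right by 8 bytes yields
// { high lane of v0, low lane of v1 }.
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */
#include <stdio.h>

int main(void)
{
    __m128d v0 = _mm_set_pd(2.0, 1.0);   /* lanes: {1.0, 2.0} */
    __m128d v1 = _mm_set_pd(4.0, 3.0);   /* lanes: {3.0, 4.0} */

    __m128d r = _mm_castsi128_pd(
        _mm_alignr_epi8(_mm_castpd_si128(v1), _mm_castpd_si128(v0), 8));

    double out[2];
    _mm_storeu_pd(out, r);
    printf("%g %g\n", out[0], out[1]);   /* prints: 2 3 */
    return 0;
}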
void *drawman(void *x)
{
    int c = col++;
    unsigned _m = mx, mxx = 16777216 / _m;
    double _x = xx, _y = yy, _w = wh;
    do {
        __m128d cr = _mm_set1_pd(_x + _w * c);
        for (int j = 0; j < 512; j += 2) {
            // two pixels (rows j and j+1) per 128-bit vector, z seeded with c
            __m128d zr = cr,
                    zi = _mm_set_pd(_y + _w * j, _y + _w * (j + 1)), ci = zi,
                    zr2 = _mm_mul_pd(zr, zr), zi2 = _mm_mul_pd(zi, zi);
            unsigned mk = mx - 1;
            uint64_t kk[2] __attribute__((aligned(16))) = { mk, mk };
            __m128i k = _mm_load_si128((__m128i *)kk);
            do {
                // z = z*z + c for both lanes
                zi = _mm_mul_pd(zi, zr);
                zi = _mm_add_pd(_mm_add_pd(zi, zi), ci);
                zr = _mm_add_pd(_mm_sub_pd(zr2, zi2), cr);
                zr2 = _mm_mul_pd(zr, zr);
                zi2 = _mm_mul_pd(zi, zi);
                // all-ones mask in lanes still satisfying |z|^2 < 4
                __m128d n = _mm_cmplt_pd(_mm_add_pd(zr2, zi2), _mm_set1_pd(4));
                if (!_mm_movemask_pd(n))
                    break;                       // both lanes escaped
                // add -1 (the all-ones mask) to the counters of lanes still iterating
                k = _mm_add_epi64(k, _mm_castpd_si128(n));
            } while (--mk);
            _mm_store_si128((__m128i *)kk, k);
            // scale the iteration counts for the output buffer
            manor[c][j]     = kk[1] * mxx >> 16;
            manor[c][j + 1] = kk[0] * mxx >> 16;
        }
        done[c >> 6] |= 1ULL << (c & 63);        // mark this column as finished
        c = col++;
    } while (c < 512 && !pull);
    return NULL;                                 // thread result is unused
}
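
// Illustrative scalar equivalent (not part of the original program) of what a
// single SIMD lane above computes: the Mandelbrot escape-time iteration with
// cr = xx + wh*column and ci = yy + wh*row, matching the _mm_set_pd arguments.
#include <stdint.h>

// Returns the number of iterations until |z|^2 >= 4, capped at max_iter.
static unsigned escape_time(double cr, double ci, unsigned max_iter)
{
    double zr = cr, zi = ci;            // the kernel above seeds z with c
    double zr2 = zr * zr, zi2 = zi * zi;
    unsigned i = 0;
    while (i < max_iter && zr2 + zi2 < 4.0) {
        zi = 2.0 * zr * zi + ci;        // imaginary part of z*z + c
        zr = zr2 - zi2 + cr;            // real part of z*z + c
        zr2 = zr * zr;
        zi2 = zi * zi;
        ++i;
    }
    return i;
}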
static SIMD_INLINE __m128d sin_vml_pd(__m128d x)
{
    SIMD_CONST_SD(1_PI , 0.318309886183790671538);
    SIMD_CONST_SD(PI4_A, 0.78539816290140151978    * -4.0);
    SIMD_CONST_SD(PI4_B, 4.9604678871439933374e-10 * -4.0);
    SIMD_CONST_SD(PI4_C, 1.1258708853173288931e-18 * -4.0);
    SIMD_CONST_SD(PI4_D, 1.7607799325916000908e-27 * -4.0);

    SIMD_CONST_SD(sin_0, -7.97255955009037868891952e-18);
    SIMD_CONST_SD(sin_1,  2.81009972710863200091251e-15);
    SIMD_CONST_SD(sin_2, -7.64712219118158833288484e-13);
    SIMD_CONST_SD(sin_3,  1.60590430605664501629054e-10);
    SIMD_CONST_SD(sin_4, -2.50521083763502045810755e-08);
    SIMD_CONST_SD(sin_5,  2.75573192239198747630416e-06);
    SIMD_CONST_SD(sin_6, -1.98412698412696162806809e-04);
    SIMD_CONST_SD(sin_7,  8.33333333333332974823815e-03);
    SIMD_CONST_SD(sin_8, -1.66666666666666657414808e-01);
    SIMD_CONST_SD(magic,  6755399441055744.0);

    // Round x / pi to the nearest integer with the magic-number trick; the low
    // bit of the rounded quotient, shifted into the sign position, records
    // whether the result's sign must be flipped (sin(x - q*pi) = (-1)^q sin x).
    __m128d y = _mm_mul_pd(x, SIMD_GET_PD(1_PI));
    __m128d q = _mm_add_pd(y, SIMD_GET_PD(magic));
    __m128d i = _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(q), 63));
    q = _mm_sub_pd(q, SIMD_GET_PD(magic));

    // Cody-Waite style argument reduction: x -= q * pi, with pi split into four parts.
    x = Simd128::mad(q, SIMD_GET_PD(PI4_A), x);
    x = Simd128::mad(q, SIMD_GET_PD(PI4_B), x);
    x = Simd128::mad(q, SIMD_GET_PD(PI4_C), x);
    x = Simd128::mad(q, SIMD_GET_PD(PI4_D), x);

    __m128d xx = _mm_mul_pd(x, x);
    x = _mm_xor_pd(x, i);

    // Polynomial approximation of sin on the reduced range (Horner form).
    __m128d u = SIMD_GET_PD(sin_0);
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_1));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_2));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_3));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_4));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_5));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_6));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_7));
    u = Simd128::mad(u, xx, SIMD_GET_PD(sin_8));
    u = Simd128::mad(xx, _mm_mul_pd(u, x), x);
    return u;
}
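
// Standalone scalar sketch (not from the original) of the "magic number"
// rounding trick used above: adding 1.5 * 2^52 to a double of moderate
// magnitude forces the fraction bits to hold the nearest integer (assuming the
// default round-to-nearest mode), so subtracting it back yields round(x).
#include <stdio.h>

int main(void)
{
    const double magic = 6755399441055744.0;   /* 1.5 * 2^52 */
    double x = 2.7;
    volatile double q = x + magic;             /* volatile: keep the rounding step */
    double rounded = q - magic;                /* rounded == 3.0 */
    printf("round(%g) = %g\n", x, rounded);
    return 0;
}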
/** Return the significand and the exponent, both in double precision **/
__m128d frexp_sse(__m128d x, __m128d *e)
{
    /* Integer exponent */
    __m128i ei;

    /* Save the exponent */
    ei = _mm_and_si128(_mm_castpd_si128(x), *(__m128i*)pi64_mantissa_mask);
    ei = _mm_srli_epi64(ei, 52);
    ei = _mm_shuffle_epi32(ei, 216);
    ei = _mm_sub_epi32(ei, *(__m128i*)pi32_bias4i);
    *e = _mm_cvtepi32_pd(ei);

    /* Save the significand */
    x = _mm_and_pd(x, *(__m128d*)pi64_inv_mantissa_mask);
    x = _mm_or_pd(x, *(__m128d*)pd_half_mask);
    return x;
}
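
// Scalar sketch (not from the original) of the same bit manipulation. It also
// spells out what the constant tables above presumably hold: a mask for the 11
// exponent bits, its inverse for the significand, the exponent bias used by
// frexp, and the bit pattern of 0.5.
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static double frexp_scalar(double x, double *e)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);

    /* exponent field mask 0x7FF0000000000000; bias 1022 gives the frexp
       convention, presumably what pi64_mantissa_mask and pi32_bias4i hold */
    int exponent = (int)((bits & 0x7FF0000000000000ULL) >> 52) - 1022;
    *e = (double)exponent;

    /* keep sign + fraction, then OR in 0.5's exponent (0x3FE0000000000000) */
    bits = (bits & ~0x7FF0000000000000ULL) | 0x3FE0000000000000ULL;
    memcpy(&x, &bits, sizeof x);
    return x;   /* significand in [0.5, 1) for normal, nonzero inputs */
}

int main(void)
{
    double e, m = frexp_scalar(6.0, &e);
    printf("6.0 = %g * 2^%g\n", m, e);   /* prints: 6.0 = 0.75 * 2^3 */
    return 0;
}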
double bst_compute_123_m128_unaligned8_maskstore( void* _bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128d v20, v21, v22, v23;
    __m128d v30, v31, v32, v33;
    __m128i v_cur_roots;
    __m128  v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // subtractions with n potentially negative. say hello to all the bugs
    int idx1, idx2, idx3;

    idx1 = IDX(n,n);
    e[idx1] = q[n];
    idx1++;
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j, ++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx3 = idx1;
        for (r = i; r < n; ++r) {
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp = e[idx1++];
            // scalar prologue until a multiple of 8 doubles is left
            // (8 = 4 x 2-wide 128-bit vectors)
            l_end_pre = idx2 + ((n-r) & 7);
            for ( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
                idx1++;
            }

            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // vectorized main loop: 4 vectors of 2 doubles (8 doubles) per iteration
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for ( ; idx2 < l_end; idx2 += 8 ) {
                v01 = _mm_loadu_pd( &w[idx1  ] );
                v11 = _mm_loadu_pd( &w[idx1+2] );
                v21 = _mm_loadu_pd( &w[idx1+4] );
                v31 = _mm_loadu_pd( &w[idx1+6] );

                v00 = _mm_loadu_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp );
                v10 = _mm_loadu_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );
                v20 = _mm_loadu_pd( &e[idx2+4] );
                v21 = _mm_add_pd( v21, v_tmp );
                v30 = _mm_loadu_pd( &e[idx2+6] );
                v31 = _mm_add_pd( v31, v_tmp );

                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_loadu_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_loadu_pd( &e[idx1+2] );
                v21 = _mm_add_pd( v21, v20 );
                v23 = _mm_loadu_pd( &e[idx1+4] );
                v31 = _mm_add_pd( v31, v30 );
                v33 = _mm_loadu_pd( &e[idx1+6] );

                // keep the smaller cost; the compare masks drive the masked stores
                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );
                v22 = _mm_cmplt_pd( v21, v23 );
                v32 = _mm_cmplt_pd( v31, v33 );

                _mm_maskstore_pd( &e[idx1  ], _mm_castpd_si128( v02 ), v01 );
                _mm_maskstore_pd( &e[idx1+2], _mm_castpd_si128( v12 ), v11 );
                _mm_maskstore_pd( &e[idx1+4], _mm_castpd_si128( v22 ), v21 );
                _mm_maskstore_pd( &e[idx1+6], _mm_castpd_si128( v32 ), v31 );

                // compress the 64-bit compare masks into 32-bit masks for the roots
                v_rootmask0 = _mm_shuffle_ps( _mm_castpd_ps( v02 ),
                                              _mm_castpd_ps( v12 ),
                                              _MM_SHUFFLE(2,0,2,0) );
                v_rootmask1 = _mm_shuffle_ps( _mm_castpd_ps( v22 ),
                                              _mm_castpd_ps( v32 ),
                                              _MM_SHUFFLE(2,0,2,0) );

                _mm_maskstore_ps( (float*) &root[idx1  ],
                                  _mm_castps_si128( v_rootmask0 ),
                                  _mm_castsi128_ps( v_cur_roots ) );
                _mm_maskstore_ps( (float*) &root[idx1+4],
                                  _mm_castps_si128( v_rootmask1 ),
                                  _mm_castsi128_ps( v_cur_roots ) );

                idx1 += 8;
            }

            idx3++;
        }
    }

    return e[IDX(0,n)];
}
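
// Self-contained sketch (not from the original) of the masked-store idiom used
// above: compare candidate costs against the current best and let the compare
// mask decide which lanes of memory get overwritten. Requires AVX for
// _mm_maskstore_pd (compile with e.g. -mavx).
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double best[2]      = { 5.0, 1.0 };   // current best costs
    double candidate[2] = { 3.0, 2.0 };   // new candidate costs

    __m128d v_best = _mm_loadu_pd(best);
    __m128d v_cand = _mm_loadu_pd(candidate);

    // all-ones in lanes where the candidate is strictly smaller
    __m128d v_lt = _mm_cmplt_pd(v_cand, v_best);

    // overwrite only the winning lanes; the other lanes are left untouched
    _mm_maskstore_pd(best, _mm_castpd_si128(v_lt), v_cand);

    printf("%g %g\n", best[0], best[1]);  // prints: 3 1
    return 0;
}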
    c = _mm_madd_epi16(*data2, fir);
    d = _mm_madd_epi16(*data3, fir);
    c = _mm_hadd_epi32(c, d);
    a = _mm_hadd_epi32(a, c);
    return a;
}

void kvz_eight_tap_filter_and_flip_avx2(int8_t filter[4][8], kvz_pixel *src, int16_t src_stride, int16_t* __restrict dst)
{
    //Load 2 rows per xmm register
    __m128i rows01 = _mm_loadl_epi64((__m128i*)(src + 0 * src_stride));
    rows01 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows01), (double*)(src + 1 * src_stride)));

    __m128i rows23 = _mm_loadl_epi64((__m128i*)(src + 2 * src_stride));
    rows23 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows23), (double*)(src + 3 * src_stride)));

    __m128i rows45 = _mm_loadl_epi64((__m128i*)(src + 4 * src_stride));
    rows45 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows45), (double*)(src + 5 * src_stride)));

    __m128i rows67 = _mm_loadl_epi64((__m128i*)(src + 6 * src_stride));
    rows67 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(rows67), (double*)(src + 7 * src_stride)));

    //Filter rows
    const int dst_stride = MAX_WIDTH;
    kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[0]), (__m128i*)(dst + 0));
    kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[1]), (__m128i*)(dst + 1 * dst_stride));
    kvz_eight_tap_filter_x8_and_flip(&rows01, &rows23, &rows45, &rows67, (__m128i*)(&filter[2]), (__m128i*)(dst + 2 * dst_stride));
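
// Standalone sketch (not from Kvazaar) of the row-packing trick above:
// _mm_loadl_epi64 fills the low 8 bytes of an xmm register and _mm_loadh_pd,
// via a cast to __m128d, fills the high 8 bytes, so two 8-byte rows end up in
// a single 128-bit register.
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint8_t row0[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    uint8_t row1[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };

    __m128i rows01 = _mm_loadl_epi64((const __m128i*)row0);           // low half  = row0
    rows01 = _mm_castpd_si128(
        _mm_loadh_pd(_mm_castsi128_pd(rows01), (const double*)row1)); // high half = row1

    uint8_t out[16];
    _mm_storeu_si128((__m128i*)out, rows01);
    for (int i = 0; i < 16; ++i)
        printf("%d ", out[i]);   // prints 0..15
    printf("\n");
    return 0;
}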