mlib_status
F_NAME(
    mlib_d64 *dst,
    const mlib_d64 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
    mlib_u8 *buff, *buff1;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
    __m128d *dp0, *dp1;
    __m128d aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1, t0, t1;
    __m128d e_mask;
    mlib_s32 i, j, wid16, tail;

    wid = (wid - 2) * SSIZE;
    wid16 = (wid + 15) & ~15;
    buff = __mlib_malloc(2 * wid16);
    buff1 = buff + wid16;

    sl = (mlib_u8 *)src;
    /* dst ptrs skip top j and left col */
    dl = (mlib_u8 *)dst + dlb + SSIZE;

    tail = wid & 15;
    ((mlib_d64 *)&e_mask)[0] =
        ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[0];
    ((mlib_d64 *)&e_mask)[1] =
        ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[1];

    sp0 = buff;
    sp1 = buff1;
    sp2 = sl;
    sp3 = sp2 + slb;
    sl += 2 * slb;

    for (i = 0; i < wid; i += 16) {
        c0 = _mm_loadu_pd((mlib_d64 *)sp2);
        c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
        c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));
        d0 = _mm_loadu_pd((mlib_d64 *)sp3);
        d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE));
        d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE));

        cc = C_COMP(c0, c1);
        dd = C_COMP(d0, d1);
        cc = C_COMP(cc, c2);
        dd = C_COMP(dd, d2);

        _mm_storeu_pd((mlib_d64 *)sp0, cc);
        _mm_storeu_pd((mlib_d64 *)sp1, dd);

        sp0 += 16;
        sp1 += 16;
        sp2 += 16;
        sp3 += 16;
    }

    for (j = 0; j <= (hgt - 2 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;
        sp3 = sp2 + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: c0 c1 c2
         * line3: d0 d1 d2
         */
        for (i = 0; i <= wid - 16; i += 16) {
            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            c0 = _mm_loadu_pd((mlib_d64 *)sp2);
            c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
            c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));
            d0 = _mm_loadu_pd((mlib_d64 *)sp3);
            d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE));
            d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE));

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            _mm_storeu_pd((mlib_d64 *)sp0, cc);
            _mm_storeu_pd((mlib_d64 *)sp1, dd);
            _mm_storeu_pd((mlib_d64 *)dp0, r0);
            dp0++;
            _mm_storeu_pd((mlib_d64 *)dp1, r1);
            dp1++;

            sp0 += 16;
            sp1 += 16;
            sp2 += 16;
            sp3 += 16;
        }

        if (tail) {
            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            c0 = _mm_loadu_pd((mlib_d64 *)sp2);
            c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
            c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));
            d0 = _mm_loadu_pd((mlib_d64 *)sp3);
            d1 = _mm_loadu_pd((mlib_d64 *)(sp3 + SSIZE));
            d2 = _mm_loadu_pd((mlib_d64 *)(sp3 + 2 * SSIZE));

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            _mm_storeu_pd((mlib_d64 *)sp0, cc);
            _mm_storeu_pd((mlib_d64 *)sp1, dd);

            t0 = _mm_loadu_pd((mlib_d64 *)dp0);
            t1 = _mm_loadu_pd((mlib_d64 *)dp1);
            t0 = _mm_or_pd(_mm_and_pd(e_mask, r0),
                _mm_andnot_pd(e_mask, t0));
            t1 = _mm_or_pd(_mm_and_pd(e_mask, r1),
                _mm_andnot_pd(e_mask, t1));
            _mm_storeu_pd((mlib_d64 *)dp0, t0);
            _mm_storeu_pd((mlib_d64 *)dp1, t1);
        }

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - 3)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;

        for (i = 0; i <= wid - 16; i += 16) {
            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            c0 = _mm_loadu_pd((mlib_d64 *)sp2);
            c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
            c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));

            cc = C_COMP(c0, c1);
            cc = C_COMP(cc, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            _mm_storeu_pd((mlib_d64 *)dp0, r0);
            dp0++;

            sp0 += 16;
            sp1 += 16;
            sp2 += 16;
        }

        if (tail) {
            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            c0 = _mm_loadu_pd((mlib_d64 *)sp2);
            c1 = _mm_loadu_pd((mlib_d64 *)(sp2 + SSIZE));
            c2 = _mm_loadu_pd((mlib_d64 *)(sp2 + 2 * SSIZE));

            c1 = C_COMP(c0, c1);
            cc = C_COMP(c1, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            t0 = _mm_loadu_pd((mlib_d64 *)dp0);
            t0 = _mm_or_pd(_mm_and_pd(e_mask, r0),
                _mm_andnot_pd(e_mask, t0));
            _mm_storeu_pd((mlib_d64 *)dp0, t0);
        }
    }

    __mlib_free(buff);

    return (MLIB_SUCCESS);
}
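/*
 * Standalone sketch (not from mediaLib) of the tail-store idiom used in the
 * `if (tail)` branches above: the precomputed e_mask keeps only the leading
 * `tail` bytes of the new result and preserves whatever already sits in the
 * destination beyond the row end, i.e. a pre-SSE4.1 emulation of
 * _mm_blendv_pd built from _mm_and_pd/_mm_andnot_pd/_mm_or_pd. The helper
 * name and the hard-coded mask below are illustrative only.
 */
#include <emmintrin.h>
#include <stdio.h>

static void blend_store_pd(double *dst, __m128d val, __m128d mask)
{
    __m128d old = _mm_loadu_pd(dst);
    __m128d res = _mm_or_pd(_mm_and_pd(mask, val),     /* take new lanes */
                            _mm_andnot_pd(mask, old)); /* keep old lanes */
    _mm_storeu_pd(dst, res);
}

int main(void)
{
    double dst[2] = { 1.0, 2.0 };
    __m128d val = _mm_set_pd(20.0, 10.0);                   /* lanes {10, 20} */
    __m128d mask = _mm_castsi128_pd(_mm_set_epi64x(0, -1)); /* select lane 0  */

    blend_store_pd(dst, val, mask);
    printf("%g %g\n", dst[0], dst[1]);                      /* prints "10 2"  */
    return 0;
}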
inline F64vec2 mask_or(const F64vec2 &l, const F64vec2 &r) { return _mm_or_pd(l, r); }
mlib_status
F_NAME(
    mlib_d64 *dst,
    const mlib_d64 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
    mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
    __m128d *dp0, *dp1;
    __m128d aa, bb, cc, dd, ee, ff, r0, r1, t0, t1;
    __m128d g0, g1, g2, g3, g4, g5, g6, gg;
    __m128d h0, h1, h2, h3, h4, h5, h6, hh;
    __m128d e_mask;
    mlib_s32 i, j, wid16, tail;

    wid = (wid - KSIZE1) * SSIZE;
    wid16 = (wid + 15) & ~15;
    pbuff = __mlib_malloc(KSIZE1 * wid16);
    buff0 = pbuff;
    buff1 = buff0 + wid16;
    buff2 = buff1 + wid16;
    buff3 = buff2 + wid16;
    buff4 = buff3 + wid16;
    buff5 = buff4 + wid16;

    sl = (mlib_u8 *)src;
    dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

    tail = wid & 15;
    ((mlib_d64 *)&e_mask)[0] =
        ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[0];
    ((mlib_d64 *)&e_mask)[1] =
        ((mlib_d64 *)((__m128d *) mlib_mask128i_arr + tail))[1];

    for (j = 0; j < 3; j++) {
        sp0 = buff4;
        sp1 = buff5;
        sp6 = sl;
        sp7 = sl + slb;
        sl += 2 * slb;

        for (i = 0; i < wid; i += 16) {
            g0 = _mm_loadu_pd((mlib_d64 *)sp6);
            g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
            g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
            g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
            g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
            g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
            g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));
            h0 = _mm_loadu_pd((mlib_d64 *)sp7);
            h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE));
            h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE));
            h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE));
            h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE));
            h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE));
            h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE));

            gg = C_COMP(g0, g1);
            hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3);
            h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5);
            h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2);
            hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4);
            hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6);
            hh = C_COMP(hh, h6);

            _mm_storeu_pd((mlib_d64 *)sp0, gg);
            _mm_storeu_pd((mlib_d64 *)sp1, hh);

            sp0 += 16;
            sp1 += 16;
            sp6 += 16;
            sp7 += 16;
        }

        if (j < 2) {
            buffT = buff0;
            buff0 = buff2;
            buff2 = buff4;
            buff4 = buffT;
            buffT = buff1;
            buff1 = buff3;
            buff3 = buff5;
            buff5 = buffT;
        }
    }

    for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = buff4;
        sp5 = buff5;
        sp6 = sl;
        sp7 = sl + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: cc
         * line3: dd
         * line4: ee
         * line5: ff
         * line6: g0 g1 g2 g3 g4 g5 g6
         * line7: h0 h1 h2 h3 h4 h5 h6
         */
        for (i = 0; i <= wid - 16; i += 16) {
            g0 = _mm_loadu_pd((mlib_d64 *)sp6);
            g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
            g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
            g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
            g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
            g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
            g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));
            h0 = _mm_loadu_pd((mlib_d64 *)sp7);
            h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE));
            h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE));
            h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE));
            h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE));
            h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE));
            h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE));

            gg = C_COMP(g0, g1);
            hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3);
            h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5);
            h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2);
            hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4);
            hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6);
            hh = C_COMP(hh, h6);

            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            cc = _mm_loadu_pd((mlib_d64 *)sp2);
            dd = _mm_loadu_pd((mlib_d64 *)sp3);
            ee = _mm_loadu_pd((mlib_d64 *)sp4);
            ff = _mm_loadu_pd((mlib_d64 *)sp5);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, hh);

            _mm_storeu_pd((mlib_d64 *)sp0, gg);
            _mm_storeu_pd((mlib_d64 *)sp1, hh);
            _mm_storeu_pd((mlib_d64 *)dp0, r0);
            dp0++;
            _mm_storeu_pd((mlib_d64 *)dp1, r1);
            dp1++;

            sp0 += 16;
            sp1 += 16;
            sp2 += 16;
            sp3 += 16;
            sp4 += 16;
            sp5 += 16;
            sp6 += 16;
            sp7 += 16;
        }

        if (tail) {
            g0 = _mm_loadu_pd((mlib_d64 *)sp6);
            g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
            g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
            g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
            g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
            g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
            g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));
            h0 = _mm_loadu_pd((mlib_d64 *)sp7);
            h1 = _mm_loadu_pd((mlib_d64 *)(sp7 + SSIZE));
            h2 = _mm_loadu_pd((mlib_d64 *)(sp7 + 2 * SSIZE));
            h3 = _mm_loadu_pd((mlib_d64 *)(sp7 + 3 * SSIZE));
            h4 = _mm_loadu_pd((mlib_d64 *)(sp7 + 4 * SSIZE));
            h5 = _mm_loadu_pd((mlib_d64 *)(sp7 + 5 * SSIZE));
            h6 = _mm_loadu_pd((mlib_d64 *)(sp7 + 6 * SSIZE));

            gg = C_COMP(g0, g1);
            hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3);
            h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5);
            h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2);
            hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4);
            hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6);
            hh = C_COMP(hh, h6);

            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            cc = _mm_loadu_pd((mlib_d64 *)sp2);
            dd = _mm_loadu_pd((mlib_d64 *)sp3);
            ee = _mm_loadu_pd((mlib_d64 *)sp4);
            ff = _mm_loadu_pd((mlib_d64 *)sp5);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, hh);

            _mm_storeu_pd((mlib_d64 *)sp0, gg);
            _mm_storeu_pd((mlib_d64 *)sp1, hh);

            t0 = _mm_loadu_pd((mlib_d64 *)dp0);
            t1 = _mm_loadu_pd((mlib_d64 *)dp1);
            t0 = _mm_or_pd(_mm_and_pd(e_mask, r0),
                _mm_andnot_pd(e_mask, t0));
            t1 = _mm_or_pd(_mm_and_pd(e_mask, r1),
                _mm_andnot_pd(e_mask, t1));
            _mm_storeu_pd((mlib_d64 *)dp0, t0);
            _mm_storeu_pd((mlib_d64 *)dp1, t1);
        }

        buffT = buff0;
        buff0 = buff2;
        buff2 = buff4;
        buff4 = buffT;
        buffT = buff1;
        buff1 = buff3;
        buff3 = buff5;
        buff5 = buffT;

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - KSIZE1 - 1)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = buff4;
        sp5 = buff5;
        sp6 = sl;

        for (i = 0; i <= wid - 16; i += 16) {
            g0 = _mm_loadu_pd((mlib_d64 *)sp6);
            g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
            g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
            g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
            g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
            g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
            g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));

            gg = C_COMP(g0, g1);
            g2 = C_COMP(g2, g3);
            g4 = C_COMP(g4, g5);
            gg = C_COMP(gg, g2);
            gg = C_COMP(gg, g4);
            gg = C_COMP(gg, g6);

            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            cc = _mm_loadu_pd((mlib_d64 *)sp2);
            dd = _mm_loadu_pd((mlib_d64 *)sp3);
            ee = _mm_loadu_pd((mlib_d64 *)sp4);
            ff = _mm_loadu_pd((mlib_d64 *)sp5);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);

            _mm_storeu_pd((mlib_d64 *)dp0, r0);
            dp0++;

            sp0 += 16;
            sp1 += 16;
            sp2 += 16;
            sp3 += 16;
            sp4 += 16;
            sp5 += 16;
            sp6 += 16;
        }

        if (tail) {
            g0 = _mm_loadu_pd((mlib_d64 *)sp6);
            g1 = _mm_loadu_pd((mlib_d64 *)(sp6 + SSIZE));
            g2 = _mm_loadu_pd((mlib_d64 *)(sp6 + 2 * SSIZE));
            g3 = _mm_loadu_pd((mlib_d64 *)(sp6 + 3 * SSIZE));
            g4 = _mm_loadu_pd((mlib_d64 *)(sp6 + 4 * SSIZE));
            g5 = _mm_loadu_pd((mlib_d64 *)(sp6 + 5 * SSIZE));
            g6 = _mm_loadu_pd((mlib_d64 *)(sp6 + 6 * SSIZE));

            gg = C_COMP(g0, g1);
            g2 = C_COMP(g2, g3);
            g4 = C_COMP(g4, g5);
            gg = C_COMP(gg, g2);
            gg = C_COMP(gg, g4);
            gg = C_COMP(gg, g6);

            aa = _mm_loadu_pd((mlib_d64 *)sp0);
            bb = _mm_loadu_pd((mlib_d64 *)sp1);
            cc = _mm_loadu_pd((mlib_d64 *)sp2);
            dd = _mm_loadu_pd((mlib_d64 *)sp3);
            ee = _mm_loadu_pd((mlib_d64 *)sp4);
            ff = _mm_loadu_pd((mlib_d64 *)sp5);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);

            t0 = _mm_loadu_pd((mlib_d64 *)dp0);
            t0 = _mm_or_pd(_mm_and_pd(e_mask, r0),
                _mm_andnot_pd(e_mask, t0));
            _mm_storeu_pd((mlib_d64 *)dp0, t0);
        }
    }

    __mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
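/*
 * Illustrative sketch (not part of the function above) of its row-buffer
 * rotation: buff0..buff5 cache the 1x7 horizontal results of the six rows
 * already processed, and after two output rows are written the window slides
 * down by two source rows, which amounts to rotating the six pointers. The
 * helper below expresses the same buffT swap sequence as an array rotation;
 * the function name is hypothetical.
 */
static void rotate_row_buffers_by_two(mlib_u8 *buf[6])
{
    mlib_u8 *t0 = buf[0];
    mlib_u8 *t1 = buf[1];
    int k;

    for (k = 0; k < 4; k++)
        buf[k] = buf[k + 2];  /* rows 2..5 become rows 0..3 */
    buf[4] = t0;              /* the two oldest rows are recycled as */
    buf[5] = t1;              /* scratch space for the two new rows  */
}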
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, l_end_pre, j;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128i v_cur_roots, v_old_roots, v_new_roots;
    __m128  v_rootmask;
    __m128i v_rootmask_i;
    // initialization
    // mem->n = nn;
    n = nn; // subtractions with n potentially negative. say hello to all the bugs

    int idx1, idx2, idx3, pad, pad_r;

    idx1 = (n+1)*(n+2)/2 + n/2;
    e[idx1] = q[n];
    idx1++;
    pad = 1;
    // pad contains the padding for row i+1
    // for row n it's always 1
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1 + pad;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        // idx2 now points to the beginning of the next line.
        idx2 += pad; // padding of line i+1

        idx3 = idx1;
        pad_r = pad; // padding of line r
        for (r = i; r < n; ++r) {
            pad_r = !pad_r; // padding of line r+1
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            e_tmp = e[idx1++];
            // scalar prologue: peel iterations until a multiple of 4 doubles
            // (two 128-bit vectors per vector iteration) remains
            l_end_pre = idx2 + ((n-r)&3);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
                idx1++;
            }

            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // execute the shit for 4 vectors of size 2
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 4 ) {
                v01 = _mm_load_pd( &w[idx1  ] );
                v11 = _mm_load_pd( &w[idx1+2] );
                v00 = _mm_load_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp ); // suboptimal: RAW dependency
                v10 = _mm_load_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );
                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_load_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_load_pd( &e[idx1+2] );
                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );

                // e[idx1..] = min(candidate, old) via bitwise select
                v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), _mm_andnot_pd( v02, v03 ));
                v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 ));
                _mm_store_pd( &e[idx1  ], v00 );
                _mm_store_pd( &e[idx1+2], v10 );

                // widen the two 64-bit compare masks to four 32-bit lanes,
                // keeping lane order (elements 0 and 2 of each mask)
                v_rootmask = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(2,0,2,0) );
                v_rootmask_i = _mm_castps_si128( v_rootmask );

                v_old_roots = _mm_lddqu_si128( (__m128i const*)&root[idx1] );
                // root[idx1..] = mask ? r : old root (same select idiom on integers)
                v_new_roots = _mm_or_si128(
                        _mm_and_si128( v_rootmask_i, v_cur_roots ),
                        _mm_andnot_si128( v_rootmask_i, v_old_roots ) );
                _mm_storeu_si128( (__m128i*)&root[idx1], v_new_roots );

                idx1 += 4;
            }

            idx2 += pad_r;
            idx3++;
        }
        pad = !pad;
        // every other line has padding 0, or 1, respectively
    }

    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[n + !(n&1)];
}
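/*
 * Scalar reference (illustration only, not part of the routine above) for the
 * vectorized inner update: relax e[idx1] with the candidate cost routed
 * through root r and record r whenever the candidate wins. The SSE2 loop
 * above does the same four entries at a time, replacing the branch with
 * _mm_cmplt_pd masks combined via _mm_and_pd/_mm_andnot_pd/_mm_or_pd (and
 * their _si128 equivalents for the root indices).
 */
static void relax_range_scalar(double *e, const double *w, int *root,
                               double e_tmp, int r, int idx1, int idx2, int l_end)
{
    for (; idx2 < l_end; ++idx2, ++idx1) {
        double t = e_tmp + e[idx2] + w[idx1];
        if (t < e[idx1]) {
            e[idx1] = t;       /* keep the smaller cost          */
            root[idx1] = r;    /* and remember which root won it */
        }
    }
}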
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double to int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer greater domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
  SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
  SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
  SIMD_CONST_SI(int32_one, 1);
  SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
  SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
  SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
  SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

  DEFINE_DATA(sincos_coeff,
    1.58962301576546568060e-10,-2.50507477628578072866e-8,
    2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
    8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

   -1.13585365213876817300e-11, 2.08757008419747316778e-9,
   -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
   -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

  __m128d y;
  __m128d sign = x;                              // Sign bit.

  x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));      // Take the absolute value.
  y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));      // Integer part of `x * 4 / PI`.

  __m128i ival = _mm_cvttpd_epi32(y);            // Extract the integer part of y.
  __m128i ione = SIMD_GET_PI(int32_one);

  ival = _mm_add_epi32(ival, ione);              // j += 1.
  ival = _mm_andnot_si128(ione, ival);           // j &=~1.

  y = _mm_cvtepi32_pd(ival);
  ival = _mm_unpacklo_epi32(ival, ival);

  sign = _mm_xor_pd(sign,                        // Swap the sign bit if `j & 4`.
    _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
  sign = _mm_and_pd(sign, SIMD_GET_PD(sign));    // Keep only the sign bit.

  // Get the polynom selection mask (j & 2):
  //   1. `0x0000000000000000` => `0 <= x <= PI/4`
  //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 < x <= PI/2`
  ival = _mm_slli_epi32(ival, 30);
  ival = _mm_srai_epi32(ival, 31);

  // Extended precision modular arithmetic:
  //   x = ((x - y * DP1) - y * DP2) - y * DP3
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

  // Get the polynom coefficients for each lane (sin/cos).
  __m128d poly_mask = _mm_castsi128_pd(ival);
  const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
    static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

  __m128d xx = _mm_mul_pd(x, x);
  y = coeff[0];
  y = Simd128::mad(y, xx, coeff[1]);
  y = Simd128::mad(y, xx, coeff[2]);
  y = Simd128::mad(y, xx, coeff[3]);
  y = Simd128::mad(y, xx, coeff[4]);
  y = Simd128::mad(y, xx, coeff[5]);
  y = _mm_mul_pd(y, xx);

  __m128d x_or_xx = _mm_or_pd(
    _mm_and_pd(xx, poly_mask),
    _mm_andnot_pd(poly_mask, x));

  y = _mm_mul_pd(y, x_or_xx);
  y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
  y = _mm_add_pd(y, coeff[7]);

  return _mm_xor_pd(y, sign);
}
__m128d t3(void) { return _mm_or_pd (magic_a,magic_b); }
int
calc_gb_rad_hct_obc_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
                                const t_atomtypes *atype, double *x, t_nblist *nl,
                                gmx_genborn_t *born, t_mdatoms *md, int gb_algorithm)
{
    int i,ai,k,n,ii,ii3,is3,nj0,nj1,at0,at1,offset;
    int jnrA,jnrB;
    int j3A,j3B;
    double shX,shY,shZ;
    double rr,rr_inv,rr_inv2,sum_tmp,sum,sum2,sum3,gbr;
    double sum_ai2, sum_ai3,tsum,tchain,doffset;
    double *obc_param;
    double *gb_radius;
    double *work;
    int *jjnr;
    double *dadx;
    double *shiftvec;
    double min_rad,rad;

    __m128d ix,iy,iz,jx,jy,jz;
    __m128d dx,dy,dz,t1,t2,t3,t4;
    __m128d rsq,rinv,r;
    __m128d rai,rai_inv,raj, raj_inv,rai_inv2,sk,sk2,lij,dlij,duij;
    __m128d uij,lij2,uij2,lij3,uij3,diff2;
    __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
    __m128d sum_ai, tmp_ai,sk_ai,sk_aj,sk2_ai,sk2_aj,sk2_rinv;
    __m128d dadx1,dadx2;
    __m128d logterm;
    __m128d mask;
    __m128d obc_mask1,obc_mask2,obc_mask3;

    __m128d oneeighth = _mm_set1_pd(0.125);
    __m128d onefourth = _mm_set1_pd(0.25);

    const __m128d half  = _mm_set1_pd(0.5);
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d one   = _mm_set1_pd(1.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d zero  = _mm_set1_pd(0.0);
    const __m128d neg   = _mm_set1_pd(-1.0);

    /* Set the dielectric offset */
    doffset   = born->gb_doffset;
    gb_radius = born->gb_radius;
    obc_param = born->param;
    work      = born->gpol_hct_work;
    jjnr      = nl->jjnr;
    dadx      = fr->dadx;
    shiftvec  = fr->shift_vec[0];

    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();

    jnrA = jnrB = 0;

    for(i=0;i<born->nr;i++)
    {
        work[i] = 0;
    }

    for(i=0;i<nl->nri;i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix = _mm_set1_pd(shX+x[ii3+0]);
        iy = _mm_set1_pd(shY+x[ii3+1]);
        iz = _mm_set1_pd(shZ+x[ii3+2]);

        rai     = _mm_load1_pd(gb_radius+ii);
        rai_inv = gmx_mm_inv_pd(rai);

        sum_ai = _mm_setzero_pd();

        sk_ai  = _mm_load1_pd(born->param+ii);
        sk2_ai = _mm_mul_pd(sk_ai,sk_ai);

        for(k=nj0;k<nj1-1;k+=2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A = 3*jnrA;
            j3B = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
            GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj);
            GMX_MM_LOAD_2VALUES_PD(obc_param+jnrA,obc_param+jnrB,sk_aj);

            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_pd(rsq,rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_pd(r,sk_aj);
            t2        = _mm_sub_pd(r,sk_aj);
            t3        = _mm_sub_pd(sk_aj,r);
            obc_mask1 = _mm_cmplt_pd(rai, t1);
            obc_mask2 = _mm_cmplt_pd(rai, t2);
            obc_mask3 = _mm_cmplt_pd(rai, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2,rai_inv));
            dlij     = _mm_and_pd(one,obc_mask2);
            uij2     = _mm_mul_pd(uij, uij);
            uij3     = _mm_mul_pd(uij2,uij);
            lij2     = _mm_mul_pd(lij, lij);
            lij3     = _mm_mul_pd(lij2,lij);

            diff2    = _mm_sub_pd(uij2,lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_pd(sk_aj,sk_aj);
            sk2_rinv = _mm_mul_pd(sk2_aj,rinv);
            prod     = _mm_mul_pd(onefourth,sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));

            t1 = _mm_sub_pd(lij,uij);
            t2 = _mm_mul_pd(diff2, _mm_sub_pd(_mm_mul_pd(onefourth,r), prod));
            t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4 = _mm_mul_pd(two,_mm_sub_pd(rai_inv,lij));
            t4 = _mm_and_pd(t4,obc_mask3);
            t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4));

            sum_ai = _mm_add_pd(sum_ai, _mm_and_pd(t1,obc_mask1) );

            t1 = _mm_add_pd(_mm_mul_pd(half,lij2), _mm_mul_pd(prod,lij3));
            t1 = _mm_sub_pd(t1,
                            _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(lij,rinv), _mm_mul_pd(lij3,r))));
            t2 = _mm_mul_pd(onefourth,
                            _mm_add_pd(_mm_mul_pd(uij,rinv), _mm_mul_pd(uij3,r)));
            t2 = _mm_sub_pd(t2,
                            _mm_add_pd(_mm_mul_pd(half,uij2), _mm_mul_pd(prod,uij3)));
            t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm), _mm_mul_pd(rinv,rinv));
            t3 = _mm_sub_pd(t3,
                            _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
                                       _mm_add_pd(one, _mm_mul_pd(sk2_rinv,rinv))));
            t1 = _mm_mul_pd(rinv,
                            _mm_add_pd(_mm_mul_pd(dlij,t1), _mm_add_pd(t2,t3)));

            dadx1 = _mm_and_pd(t1,obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_pd(r,sk_ai);
            t2        = _mm_sub_pd(r,sk_ai);
            t3        = _mm_sub_pd(sk_ai,r);
            obc_mask1 = _mm_cmplt_pd(raj, t1);
            obc_mask2 = _mm_cmplt_pd(raj, t2);
            obc_mask3 = _mm_cmplt_pd(raj, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2,raj_inv));
            dlij     = _mm_and_pd(one,obc_mask2);
            uij2     = _mm_mul_pd(uij, uij);
            uij3     = _mm_mul_pd(uij2,uij);
            lij2     = _mm_mul_pd(lij, lij);
            lij3     = _mm_mul_pd(lij2,lij);

            diff2    = _mm_sub_pd(uij2,lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_pd(sk2_ai,rinv);
            prod     = _mm_mul_pd(onefourth,sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_pd(uij,lij_inv));

            t1 = _mm_sub_pd(lij,uij);
            t2 = _mm_mul_pd(diff2, _mm_sub_pd(_mm_mul_pd(onefourth,r), prod));
            t3 = _mm_mul_pd(half,_mm_mul_pd(rinv,logterm));
            t1 = _mm_add_pd(t1,_mm_add_pd(t2,t3));
            t4 = _mm_mul_pd(two,_mm_sub_pd(raj_inv,lij));
            t4 = _mm_and_pd(t4,obc_mask3);
            t1 = _mm_mul_pd(half,_mm_add_pd(t1,t4));

            GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_and_pd(t1,obc_mask1));

            t1 = _mm_add_pd(_mm_mul_pd(half,lij2), _mm_mul_pd(prod,lij3));
            t1 = _mm_sub_pd(t1,
                            _mm_mul_pd(onefourth,
                                       _mm_add_pd(_mm_mul_pd(lij,rinv), _mm_mul_pd(lij3,r))));
            t2 = _mm_mul_pd(onefourth,
                            _mm_add_pd(_mm_mul_pd(uij,rinv), _mm_mul_pd(uij3,r)));
            t2 = _mm_sub_pd(t2,
                            _mm_add_pd(_mm_mul_pd(half,uij2), _mm_mul_pd(prod,uij3)));
            t3 = _mm_mul_pd(_mm_mul_pd(onefourth,logterm), _mm_mul_pd(rinv,rinv));
            t3 = _mm_sub_pd(t3,
                            _mm_mul_pd(_mm_mul_pd(diff2,oneeighth),
                                       _mm_add_pd(one, _mm_mul_pd(sk2_rinv,rinv))));
            t1 = _mm_mul_pd(rinv,
                            _mm_add_pd(_mm_mul_pd(dlij,t1), _mm_add_pd(t2,t3)));

            dadx2 = _mm_and_pd(t1,obc_mask1);

            _mm_store_pd(dadx,dadx1);
            dadx += 2;
            _mm_store_pd(dadx,dadx2);
            dadx += 2;
        } /* end normal inner loop */

        if(k<nj1)
        {
            jnrA = jjnr[k];

            j3A = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
            GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj);
            GMX_MM_LOAD_1VALUE_PD(obc_param+jnrA,sk_aj);

            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            rsq  = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv = gmx_mm_invsqrt_pd(rsq);
            r    = _mm_mul_sd(rsq,rinv);

            /* Compute raj_inv aj1-4 */
            raj_inv = gmx_mm_inv_pd(raj);

            /* Evaluate influence of atom aj -> ai */
            t1        = _mm_add_sd(r,sk_aj);
            t2        = _mm_sub_sd(r,sk_aj);
            t3        = _mm_sub_sd(sk_aj,r);
            obc_mask1 = _mm_cmplt_sd(rai, t1);
            obc_mask2 = _mm_cmplt_sd(rai, t2);
            obc_mask3 = _mm_cmplt_sd(rai, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd(_mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                 _mm_andnot_pd(obc_mask2,rai_inv));
            dlij     = _mm_and_pd(one,obc_mask2);
            uij2     = _mm_mul_sd(uij, uij);
            uij3     = _mm_mul_sd(uij2,uij);
            lij2     = _mm_mul_sd(lij, lij);
            lij3     = _mm_mul_sd(lij2,lij);

            diff2    = _mm_sub_sd(uij2,lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_aj   = _mm_mul_sd(sk_aj,sk_aj);
            sk2_rinv = _mm_mul_sd(sk2_aj,rinv);
            prod     = _mm_mul_sd(onefourth,sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));

            t1 = _mm_sub_sd(lij,uij);
            t2 = _mm_mul_sd(diff2, _mm_sub_sd(_mm_mul_pd(onefourth,r), prod));
            t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4 = _mm_mul_sd(two,_mm_sub_sd(rai_inv,lij));
            t4 = _mm_and_pd(t4,obc_mask3);
            t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4));

            sum_ai = _mm_add_sd(sum_ai, _mm_and_pd(t1,obc_mask1) );

            t1 = _mm_add_sd(_mm_mul_sd(half,lij2), _mm_mul_sd(prod,lij3));
            t1 = _mm_sub_sd(t1,
                            _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(lij,rinv), _mm_mul_sd(lij3,r))));
            t2 = _mm_mul_sd(onefourth,
                            _mm_add_sd(_mm_mul_sd(uij,rinv), _mm_mul_sd(uij3,r)));
            t2 = _mm_sub_sd(t2,
                            _mm_add_sd(_mm_mul_sd(half,uij2), _mm_mul_sd(prod,uij3)));
            t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm), _mm_mul_sd(rinv,rinv));
            t3 = _mm_sub_sd(t3,
                            _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
                                       _mm_add_sd(one, _mm_mul_sd(sk2_rinv,rinv))));
            t1 = _mm_mul_sd(rinv,
                            _mm_add_sd(_mm_mul_sd(dlij,t1), _mm_add_pd(t2,t3)));

            dadx1 = _mm_and_pd(t1,obc_mask1);

            /* Evaluate influence of atom ai -> aj */
            t1        = _mm_add_sd(r,sk_ai);
            t2        = _mm_sub_sd(r,sk_ai);
            t3        = _mm_sub_sd(sk_ai,r);
            obc_mask1 = _mm_cmplt_sd(raj, t1);
            obc_mask2 = _mm_cmplt_sd(raj, t2);
            obc_mask3 = _mm_cmplt_sd(raj, t3);

            uij      = gmx_mm_inv_pd(t1);
            lij      = _mm_or_pd( _mm_and_pd(obc_mask2,gmx_mm_inv_pd(t2)),
                                  _mm_andnot_pd(obc_mask2,raj_inv));
            dlij     = _mm_and_pd(one,obc_mask2);
            uij2     = _mm_mul_sd(uij, uij);
            uij3     = _mm_mul_sd(uij2,uij);
            lij2     = _mm_mul_sd(lij, lij);
            lij3     = _mm_mul_sd(lij2,lij);

            diff2    = _mm_sub_sd(uij2,lij2);
            lij_inv  = gmx_mm_invsqrt_pd(lij2);
            sk2_rinv = _mm_mul_sd(sk2_ai,rinv);
            prod     = _mm_mul_sd(onefourth,sk2_rinv);

            logterm  = gmx_mm_log_pd(_mm_mul_sd(uij,lij_inv));

            t1 = _mm_sub_sd(lij,uij);
            t2 = _mm_mul_sd(diff2, _mm_sub_sd(_mm_mul_sd(onefourth,r), prod));
            t3 = _mm_mul_sd(half,_mm_mul_sd(rinv,logterm));
            t1 = _mm_add_sd(t1,_mm_add_sd(t2,t3));
            t4 = _mm_mul_sd(two,_mm_sub_sd(raj_inv,lij));
            t4 = _mm_and_pd(t4,obc_mask3);
            t1 = _mm_mul_sd(half,_mm_add_sd(t1,t4));

            GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_and_pd(t1,obc_mask1));

            t1 = _mm_add_sd(_mm_mul_sd(half,lij2), _mm_mul_sd(prod,lij3));
            t1 = _mm_sub_sd(t1,
                            _mm_mul_sd(onefourth,
                                       _mm_add_sd(_mm_mul_sd(lij,rinv), _mm_mul_sd(lij3,r))));
            t2 = _mm_mul_sd(onefourth,
                            _mm_add_sd(_mm_mul_sd(uij,rinv), _mm_mul_sd(uij3,r)));
            t2 = _mm_sub_sd(t2,
                            _mm_add_sd(_mm_mul_sd(half,uij2), _mm_mul_sd(prod,uij3)));
            t3 = _mm_mul_sd(_mm_mul_sd(onefourth,logterm), _mm_mul_sd(rinv,rinv));
            t3 = _mm_sub_sd(t3,
                            _mm_mul_sd(_mm_mul_sd(diff2,oneeighth),
                                       _mm_add_sd(one, _mm_mul_sd(sk2_rinv,rinv))));
            t1 = _mm_mul_sd(rinv,
                            _mm_add_sd(_mm_mul_sd(dlij,t1), _mm_add_sd(t2,t3)));

            dadx2 = _mm_and_pd(t1,obc_mask1);

            _mm_store_pd(dadx,dadx1);
            dadx += 2;
            _mm_store_pd(dadx,dadx2);
            dadx += 2;
        }

        gmx_mm_update_1pot_pd(sum_ai,work+ii);
    }

    /* Parallel summations */
    if(PARTDECOMP(cr))
    {
        gmx_sum(natoms, work, cr);
    }
    else if(DOMAINDECOMP(cr))
    {
        dd_atom_sum_real(cr->dd, work);
    }

    if(gb_algorithm==egbHCT)
    {
        /* HCT */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
        {
            if(born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]]-doffset;
                sum     = 1.0/rr - work[i];
                min_rad = rr + doffset;
                rad     = 1.0/sum;

                born->bRad[i]   = rad > min_rad ? rad : min_rad;
                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
            }
        }

        /* Extra communication required for DD */
        if(DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
        }
    }
    else
    {
        /* OBC */
        for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
        {
            if(born->use[i] != 0)
            {
                rr      = top->atomtypes.gb_radius[md->typeA[i]];
                rr_inv2 = 1.0/rr;
                rr      = rr-doffset;
                rr_inv  = 1.0/rr;
                sum     = rr * work[i];
                sum2    = sum  * sum;
                sum3    = sum2 * sum;

                tsum    = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
                born->bRad[i] = rr_inv - tsum*rr_inv2;
                born->bRad[i] = 1.0 / born->bRad[i];

                fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);

                tchain  = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
                born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
            }
        }

        /* Extra (local) communication required for DD */
        if(DOMAINDECOMP(cr))
        {
            dd_atom_spread_real(cr->dd, born->bRad);
            dd_atom_spread_real(cr->dd, fr->invsqrta);
            dd_atom_spread_real(cr->dd, born->drobc);
        }
    }

    return 0;
}
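/*
 * Standalone sketch (not a GROMACS API) of the masked select used for lij in
 * both loops above: per lane, lij = (rai < r - sk) ? 1/(r - sk) : 1/rai.
 * gmx_mm_inv_pd() is replaced by a plain division so the snippet is
 * self-contained; the helper name is illustrative.
 */
#include <emmintrin.h>

static __m128d select_lij_pd(__m128d rai, __m128d r, __m128d sk)
{
    const __m128d one = _mm_set1_pd(1.0);
    __m128d t2      = _mm_sub_pd(r, sk);      /* r - sk         */
    __m128d mask    = _mm_cmplt_pd(rai, t2);  /* rai < r - sk ? */
    __m128d inv_t2  = _mm_div_pd(one, t2);    /* 1/(r - sk)     */
    __m128d inv_rai = _mm_div_pd(one, rai);   /* 1/rai          */

    /* lanes where the mask is set take 1/(r - sk), the rest keep 1/rai */
    return _mm_or_pd(_mm_and_pd(mask, inv_t2), _mm_andnot_pd(mask, inv_rai));
}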
BOOST_FORCEINLINE __m128d __vectorcall operator | ( __m128d const left, __m128d const right ) { return _mm_or_pd ( left, right ); }