static void inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, the results it gives are rather
     * different from the 1.0 / src1 reference implementation, so do that. */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
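The comment above reflects a common finding: `_mm_rcp_ps` is only accurate to roughly 12 bits. A middle ground the snippet does not use, shown here as a minimal sketch (not part of the original source), is to refine the estimate with one Newton-Raphson step, which brings it close to full single precision while staying cheaper than a true divide on many cores:

/* One Newton-Raphson refinement of _mm_rcp_ps: r' = r * (2 - x * r).
 * Illustrative sketch only; accuracy/latency trade-offs are core-specific. */
static inline __m128 rcp_nr_ps(__m128 x)
{
  __m128 r = _mm_rcp_ps(x);  /* ~12-bit reciprocal estimate */
  r = _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(x, r)));
  return r;
}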
static void divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
static inline __m128 lanczos_sse(__m128 width, __m128 t)
{
  /* Compute a value for sinf(pi.t) in [-pi pi] for which the value will be
   * correct */
  __m128i a = _mm_cvtps_epi32(t);
  __m128 r = _mm_sub_ps(t, _mm_cvtepi32_ps(a));

  // Compute the correct sign for sinf(pi.r)
  static const uint32_t fone[] __attribute__((aligned(SSE_ALIGNMENT)))
      = { 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 };
  static const uint32_t ione[] __attribute__((aligned(SSE_ALIGNMENT))) = { 1, 1, 1, 1 };
  static const __m128 eps
      = { DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON, DT_LANCZOS_EPSILON };
  static const __m128 pi = { M_PI, M_PI, M_PI, M_PI };
  static const __m128 pi2 = { M_PI * M_PI, M_PI * M_PI, M_PI * M_PI, M_PI * M_PI };

  __m128i isign = _mm_and_si128(*(__m128i *)ione, a);
  isign = _mm_slli_epi64(isign, 31);
  isign = _mm_or_si128(*(__m128i *)fone, isign);
  __m128 fsign = _mm_castsi128_ps(isign);

  __m128 num = _mm_mul_ps(width, fsign);
  num = _mm_mul_ps(num, sinf_fast_sse(_mm_mul_ps(pi, r)));
  num = _mm_mul_ps(num, sinf_fast_sse(_mm_div_ps(_mm_mul_ps(pi, t), width)));
  num = _mm_add_ps(eps, num);

  __m128 den = _mm_mul_ps(pi2, _mm_mul_ps(t, t));
  den = _mm_add_ps(eps, den);

  return _mm_div_ps(num, den);
}
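The sign juggling deserves a scalar gloss. With a = round(t) and r = t - a, sin(pi*t) = (-1)^a * sin(pi*r); the code builds (-1)^a by masking the low bit of a, shifting it into the float sign-bit position, and OR-ing it onto the bit pattern of 1.0f. A scalar sketch of the same trick (illustrative, not from the source):

/* lanczos_sign: returns 1.0f when a is even, -1.0f when a is odd. */
static inline float lanczos_sign(int a)
{
  union { uint32_t u; float f; } s;
  s.u = 0x3f800000u | ((uint32_t)(a & 1) << 31); /* bits of +/-1.0f */
  return s.f;
}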
static inline Simd div(const Simd& lhs, float rhs)
{
    Simd res;
    __m128 tmp = _mm_set1_ps(rhs);
    res.reg[0] = _mm_div_ps(lhs.reg[0], tmp);
    res.reg[1] = _mm_div_ps(lhs.reg[1], tmp);
    return res;
}
int main()
{
    float m = 1.0; /* initial magnification */

    /* Timing variables */
    struct timeval start_time, stop_time;
    long long compute_time;

    /* Create a screen to render to */
    Screen *screen = new Screen(HXRES, HYRES);

    gettimeofday(&start_time, NULL);

    // Set up the parallel region; each thread gets its own scratch buffer.
    int depth = 0;
    while (depth < MAX_DEPTH) {
#pragma omp parallel
        {
            float *answers = (float *)malloc(sizeof(float) * 4);
#pragma omp for schedule(dynamic)
            for (int hy = 0; hy < HYRES; hy++) {
                float cy = ((((float)hy / (float)HYRES) - 0.5 + (PY / (4.0 / m))) * (4.0f / m));
                __m128 cy_m = _mm_set1_ps(cy);
                __m128 four_m = _mm_set1_ps(4.0 / m);
                for (int hx = 0; hx < HXRES; hx += 4) {
                    __m128 cx_m = _mm_setr_ps(hx, hx + 1, hx + 2, hx + 3);
                    cx_m = _mm_div_ps(cx_m, _mm_set1_ps(HXRES));
                    cx_m = _mm_sub_ps(cx_m, _mm_set1_ps(0.5));
                    cx_m = _mm_add_ps(cx_m, _mm_div_ps(_mm_set1_ps(PX), four_m));
                    cx_m = _mm_mul_ps(cx_m, four_m);
                    // Store the four iteration counts and update the screen accordingly.
                    _mm_storeu_ps(answers, member_speed(cx_m, cy_m));
                    for (int k = 0; k < 4; k++) {
                        if (answers[k] != MAX_ITS) {
                            int l = ((int)(answers[k]) % 40) - 1;
                            l = l * 3;
                            screen->putpixel(hx + k, hy, pal[l], pal[l + 1], pal[l + 2]);
                        } else {
                            screen->putpixel(hx + k, hy, 0, 0, 0);
                        }
                    }
                }
            }
            free(answers); /* was leaked once per thread per zoom level */
        }
        screen->flip(); /* Show the rendered image on the screen */
        std::cerr << "Render done " << depth++ << " " << m << std::endl;
        /* Zoom in */
        m *= ZOOM_FACTOR;
    }
    gettimeofday(&stop_time, NULL);
    compute_time = (stop_time.tv_sec - start_time.tv_sec) * 1000000L
                   + (stop_time.tv_usec - start_time.tv_usec);
    fprintf(stderr, "Total render time: %lld microseconds\n", compute_time);
    sleep(5);
    std::cout << "Clean Exit" << std::endl;
}
void lfModifier::ModifyCoord_Dist_PTLens_SSE (void *data, float *iocoord, int count)
{
    // See "Note about PT-based distortion models" at the top of mod-coord.cpp.

    /*
     * If buffer is not aligned, fall back to plain code
     */
    if ((uintptr_t)(iocoord) & 0xf)
    {
        return ModifyCoord_Dist_PTLens(data, iocoord, count);
    }

    lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data;

    // Rd = Ru * (a_ * Ru^3 + b_ * Ru^2 + c_ * Ru + 1)
    __m128 a_ = _mm_set_ps1 (cddata->Terms [0]);
    __m128 b_ = _mm_set_ps1 (cddata->Terms [1]);
    __m128 c_ = _mm_set_ps1 (cddata->Terms [2]);
    __m128 cx = _mm_set_ps1 (cddata->centerX);
    __m128 cy = _mm_set_ps1 (cddata->centerY);
    __m128 cc = _mm_set_ps1 (cddata->coordinate_correction);
    __m128 one = _mm_set_ps1 (1.0f);

    // SSE loop processes 4 pixels/iteration
    int loop_count = count / 4;
    for (int i = 0; i < loop_count; i++)
    {
        __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
        __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
        __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
        __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
        x = _mm_sub_ps (_mm_mul_ps (x, cc), cx);
        y = _mm_sub_ps (_mm_mul_ps (y, cc), cy);

        __m128 ru2 = _mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y));
        __m128 ru = _mm_rcp_ps (_mm_rsqrt_ps (ru2));

        // Calculate poly3 = a_ * ru2 * ru + b_ * ru2 + c_ * ru + 1;
        __m128 t = _mm_mul_ps (ru2, b_);
        __m128 poly3 = _mm_mul_ps (_mm_mul_ps (a_, ru2), ru);
        t = _mm_add_ps (t, _mm_mul_ps (ru, c_));
        poly3 = _mm_add_ps (t, _mm_add_ps (poly3, one));
        x = _mm_add_ps (_mm_mul_ps (x, poly3), cx);
        y = _mm_add_ps (_mm_mul_ps (y, poly3), cy);
        x = _mm_div_ps (x, cc);
        y = _mm_div_ps (y, cc);

        c0 = _mm_unpacklo_ps (x, y);
        c1 = _mm_unpackhi_ps (x, y);
        _mm_store_ps (&iocoord [8 * i], c0);
        _mm_store_ps (&iocoord [8 * i + 4], c1);
    }

    loop_count *= 4;
    int remain = count - loop_count;
    if (remain)
        ModifyCoord_Dist_PTLens (data, &iocoord [loop_count * 2], remain);
}
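Note the line `ru = _mm_rcp_ps(_mm_rsqrt_ps(ru2))`: it approximates sqrt(ru2) by chaining two ~12-bit estimate instructions, trading accuracy for throughput against the much slower `sqrtps` on older cores. Purely as an illustration (not from the source), the options for this step, ordered roughly by accuracy, would be:

/* Alternatives for ru = sqrt(ru2); a sketch, not the library's code. */
__m128 ru_fast  = _mm_rcp_ps(_mm_rsqrt_ps(ru2));      /* two ~12-bit estimates */
__m128 ru_mul   = _mm_mul_ps(ru2, _mm_rsqrt_ps(ru2)); /* x*rsqrt(x); NaN at x == 0 */
__m128 ru_exact = _mm_sqrt_ps(ru2);                   /* correctly rounded, slower */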
kw_mat4 kw_div(kw_mat4 m, f32 scale)
{
    kw_mat4 result;
    __m128 divisor = _mm_set_ps1(scale); /* broadcast once for all four rows */
    result.simd[0] = _mm_div_ps(m.simd[0], divisor);
    result.simd[1] = _mm_div_ps(m.simd[1], divisor);
    result.simd[2] = _mm_div_ps(m.simd[2], divisor);
    result.simd[3] = _mm_div_ps(m.simd[3], divisor);
    return result;
}
static void SubbandCoherenceSSE2(AecCore* aec,
                                 float efw[2][PART_LEN1],
                                 float dfw[2][PART_LEN1],
                                 float xfw[2][PART_LEN1],
                                 float* fft,
                                 float* cohde,
                                 float* cohxd,
                                 int* extreme_filter_divergence) {
  int i;

  SmoothedPSD(aec, efw, dfw, xfw, extreme_filter_divergence);

  {
    const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f);

    // Subband coherence
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
      const __m128 vec_sd = _mm_loadu_ps(&aec->sd[i]);
      const __m128 vec_se = _mm_loadu_ps(&aec->se[i]);
      const __m128 vec_sx = _mm_loadu_ps(&aec->sx[i]);
      const __m128 vec_sdse =
          _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se));
      const __m128 vec_sdsx =
          _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx));
      const __m128 vec_sde_3210 = _mm_loadu_ps(&aec->sde[i][0]);
      const __m128 vec_sde_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
      const __m128 vec_sxd_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
      const __m128 vec_sxd_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
      const __m128 vec_sde_0 =
          _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vec_sde_1 =
          _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1));
      const __m128 vec_sxd_0 =
          _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vec_sxd_1 =
          _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
      __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
      vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
      vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
      vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
      vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
      _mm_storeu_ps(&cohde[i], vec_cohde);
      _mm_storeu_ps(&cohxd[i], vec_cohxd);
    }

    // scalar code for the remaining items.
    for (; i < PART_LEN1; i++) {
      cohde[i] =
          (aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
          (aec->sd[i] * aec->se[i] + 1e-10f);
      cohxd[i] =
          (aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
          (aec->sx[i] * aec->sd[i] + 1e-10f);
    }
  }
}
void experienceNet::normalPDF_sse(float* result, const float* _partitions, float _mean, float _stdDev)
{
    /* CODE ADAPTED FROM boost/math/normal.hpp

    RealType exponent = x - mean;
    exponent *= -exponent;
    exponent /= 2 * sd * sd;

    result = exp(exponent);
    result /= sd * sqrt(2 * constants::pi<RealType>());

    return result;
    */

    const __m128& partitions = *(__m128*)_partitions;
    __m128 exponent, tmp, mean, sd;

    /* CODE ADAPTED FROM
     * http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html */
    static const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
    static const __m128 twos = _mm_set_ps1(2.0f);
    static const __m128 sqrt_pi_2_s = _mm_set_ps1(sqrt(2.0 * M_PI));

    // store mean and sd:
    mean = _mm_load_ps1(&_mean);
    sd = _mm_load_ps1(&_stdDev);

    // exponent = x - mean
    exponent = _mm_sub_ps(partitions, mean);
    // exponent *= -exponent;
    tmp = _mm_xor_ps(exponent, signmask);
    exponent = _mm_mul_ps(exponent, tmp);
    // exponent /= 2 * sd * sd;
    tmp = _mm_mul_ps(sd, sd);
    tmp = _mm_mul_ps(tmp, twos);
    exponent = _mm_div_ps(exponent, tmp);
    // exponent = exp(exponent); note that _mm_exp_ps is an SVML/vendor
    // intrinsic, not part of baseline SSE (see the note below).
    exponent = _mm_exp_ps(exponent);
    // exponent /= sd * sqrt(2 * pi)
    tmp = _mm_mul_ps(sd, sqrt_pi_2_s);
    tmp = _mm_div_ps(exponent, tmp);

#ifndef NDEBUG
    const float* _result = (float*)&tmp;
    boost::math::normal_distribution<float> cNormal(_mean, _stdDev);
    assert(fastabs(_result[0] - boost::math::pdf(cNormal, _partitions[0])) < 0.001f);
    assert(fastabs(_result[1] - boost::math::pdf(cNormal, _partitions[1])) < 0.001f);
    assert(fastabs(_result[2] - boost::math::pdf(cNormal, _partitions[2])) < 0.001f);
    assert(fastabs(_result[3] - boost::math::pdf(cNormal, _partitions[3])) < 0.001f);
#endif

    // return result:
    _mm_store_ps(result, tmp);
}
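A portability note on `_mm_exp_ps`: it is provided by Intel's SVML (ICC) or a vector-math helper library, not by the SSE instruction set itself, so this snippet will not build with plain GCC/Clang intrinsics headers. Assuming no such library is available, a minimal correct (if unvectorized) stand-in looks like this:

#include <math.h>
#include <xmmintrin.h>

/* Fallback for _mm_exp_ps: scalar expf per lane. Correct but slow;
 * a real replacement would be a polynomial vector exp (e.g. sse_mathfun). */
static inline __m128 exp_ps_fallback(__m128 x)
{
    float v[4];
    _mm_storeu_ps(v, x);
    for (int k = 0; k < 4; k++)
        v[k] = expf(v[k]);
    return _mm_loadu_ps(v);
}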
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1]) {
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kThresh = _mm_set1_ps(aec->errThresh);
  const __m128 kMu = _mm_set1_ps(aec->mu);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);

    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  for (; i < (PART_LEN1); i++) {
    float absEf;
    ef[0][i] /= (aec->xPow[i] + 1e-10f);
    ef[1][i] /= (aec->xPow[i] + 1e-10f);
    absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

    if (absEf > aec->errThresh) {
      absEf = aec->errThresh / (absEf + 1e-10f);
      ef[0][i] *= absEf;
      ef[1][i] *= absEf;
    }

    // Stepsize factor
    ef[0][i] *= aec->mu;
    ef[1][i] *= aec->mu;
  }
}
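The and/andnot/or triple in the middle is the classic SSE2 branchless select, dest = (mask & a) | (~mask & b), i.e. a per-lane `cond ? a : b`. As a hypothetical variant (this function deliberately targets SSE2, so it is not what the source does), SSE4.1 collapses the pattern to a single blend:

#include <smmintrin.h> /* SSE4.1 */

/* Sketch: _mm_blendv_ps(a, b, mask) picks b where the mask sign bit is set.
 * Equivalent to the and/andnot/or select above, assuming -msse4.1. */
__m128 re_clamped = _mm_mul_ps(ef_re, absEfInv);
__m128 im_clamped = _mm_mul_ps(ef_im, absEfInv);
ef_re = _mm_blendv_ps(ef_re, re_clamped, bigger); /* bigger ? clamped : as-is */
ef_im = _mm_blendv_ps(ef_im, im_clamped, bigger);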
inline float16 operator / (const float16 & a, const float16 & b)
{
    float16 res;
    res.x.m128 = _mm_div_ps(a.x.m128, b.x.m128);
    res.y.m128 = _mm_div_ps(a.y.m128, b.y.m128);
    res.z.m128 = _mm_div_ps(a.z.m128, b.z.m128);
    res.w.m128 = _mm_div_ps(a.w.m128, b.w.m128);
    return res;
}
static inline __m128 curve_vec4(const __m128 x, const __m128 g, const __m128 sigma,
                                const __m128 shadows, const __m128 highlights, const __m128 clarity)
{
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f);     // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f / 3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
                                              _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
                 _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // c > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float *)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}
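The "dt_fast_expf in sse" block is easier to read in scalar form: `const0` and `const1` are the *integer bit patterns* of 1.0f (e^0) and 2.7182817f (e^1); the code interpolates linearly between them in bit space, converts the result to an integer, and `_mm_load_ps((float*)&ki)` reinterprets those integer bits as floats. A scalar sketch of the same trick (illustrative; the exact helper lives elsewhere in the project):

/* fast_expf_sketch: bit-space linear interpolation approximating expf(x). */
static inline float fast_expf_sketch(const float x)
{
  const int i1 = 0x3f800000;                /* bits of 1.0f  = e^0 */
  const int i2 = 0x402DF854;                /* bits of ~e    = e^1 */
  int k = (int)(i1 + x * (float)(i2 - i1)); /* blend in bit space  */
  if (k < 0) k = 0;                         /* matches the _mm_max_ps clamp */
  union { int i; float f; } u;
  u.i = k;
  return u.f;
}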
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece,
             const void *const ivoid, void *ovoid,
             const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const float divider = (float)UINT16_MAX;
  const __m128 dividers = _mm_set_ps1(divider);

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(ovoid)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width;

    int i = 0;
    int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1));

    // process unaligned pixels
    for(; i < alignment; i++, out++, in++)
      *out = ((float)(*in)) / divider;

    // process aligned pixels with SSE
    for(; i < roi_out->width - (8 - 1); i += 8, in += 8)
    {
      const __m128i input = _mm_load_si128((__m128i *)in);

      __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
      __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

      __m128 flo = _mm_cvtepi32_ps(ilo);
      __m128 fhi = _mm_cvtepi32_ps(ihi);

      flo = _mm_div_ps(flo, dividers);
      fhi = _mm_div_ps(fhi, dividers);

      _mm_stream_ps(out, flo);
      out += 4;
      _mm_stream_ps(out, fhi);
      out += 4;
    }

    // process the rest
    for(; i < roi_out->width; i++, out++, in++)
      *out = ((float)(*in)) / divider;
  }
  _mm_sfence();
}
static void spline_n_4(int i, float t, float *knot, float *splineVal) {
    knot += i + 1;

#ifdef _M_SSE
    const __m128 knot012 = _mm_loadu_ps(&knot[0]);
    const __m128 knot345 = _mm_loadu_ps(&knot[3]);
    const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
    const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));

    const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
    const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
    const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
    const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));

    // It's still faster to use SSE, even with this.
    float MEMORY_ALIGNED16(ff30_41_52[4]);
    float MEMORY_ALIGNED16(ff31_42_32[4]);
    _mm_store_ps(ff30_41_52, f30_41_52);
    _mm_store_ps(ff31_42_32, f31_42_32);

    const float &f30 = ff30_41_52[0];
    const float &f41 = ff30_41_52[1];
    const float &f52 = ff30_41_52[2];
    const float &f31 = ff31_42_32[0];
    const float &f42 = ff31_42_32[1];
    const float &f32 = ff31_42_32[2];
#else
    // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
    float t0 = (t - knot[0]);
    float t1 = (t - knot[1]);
    float t2 = (t - knot[2]);

    // TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
    float f30 = t0 / (knot[3] - knot[0]);
    float f41 = t1 / (knot[4] - knot[1]);
    float f52 = t2 / (knot[5] - knot[2]);
    float f31 = t1 / (knot[3] - knot[1]);
    float f42 = t2 / (knot[4] - knot[2]);
    float f32 = t2 / (knot[3] - knot[2]);
#endif

    float a = (1 - f30) * (1 - f31);
    float b = (f31 * f41);
    float c = (1 - f41) * (1 - f42);
    float d = (f42 * f52);

    splineVal[0] = a - (a * f32);
    splineVal[1] = 1 - a - b + ((a + b + c - 1) * f32);
    splineVal[2] = b + ((1 - b - c - d) * f32);
    splineVal[3] = d * f32;
}
/* use compiler intrinsics for 4x parallel processing */
static inline float chi2_intrinsic_aligned_float(int n, const float* x, const float* y) {
    float result = 0;
    const __m128 eps = _mm_set1_ps(FLT_MIN);
    const __m128 zero = _mm_setzero_ps();
    __m128 chi2 = _mm_setzero_ps();

    for (; n > 3; n -= 4) {
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps = _mm_add_ps(a, eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps, b);
        const __m128 a_minus_b = _mm_sub_ps(a, b);
        const __m128 a_minus_b_sq = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
        x += 4;
        y += 4;
    }

    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128 sum1 = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 sum2 = _mm_add_ps(sum1, shuffle2);
    // with SSE3, we could use hadd_ps, but the difference is negligible
    _mm_store_ss(&result, sum2);
    _mm_empty();

    if (n)
        result += chi2_baseline_float(n, x, y); // remaining 1-3 entries
    return result;
}
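The two shuffle/add pairs at the end are a horizontal sum: fold the upper half onto the lower, then fold the remaining pair. As the comment says, SSE3's horizontal add expresses the same reduction more compactly; a sketch of that variant, assuming -msse3:

#include <pmmintrin.h> /* SSE3 */

/* hsum_ps_sse3: sum of all four lanes of v. */
static inline float hsum_ps_sse3(__m128 v)
{
    v = _mm_hadd_ps(v, v); /* (v0+v1, v2+v3, v0+v1, v2+v3) */
    v = _mm_hadd_ps(v, v); /* full sum in every lane */
    return _mm_cvtss_f32(v);
}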
RETf DIV(const __m128 x, const __m128 y) {
#ifdef __ARM_NEON__
  // 32-bit NEON has no vector divide instruction (only reciprocal
  // estimates), so let the compiler lower the element-wise operator.
  return x / y;
#else
  return _mm_div_ps(x, y);
#endif
}
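The NEON caveat holds for 32-bit ARM, which only offers the vrecpe/vrecps estimate-and-refine pair; AArch64 added a true vector divide. A hedged sketch of what the intrinsic path could look like there (not part of this snippet's source):

#if defined(__aarch64__)
#include <arm_neon.h>

/* AArch64 NEON has a real vector FDIV, exposed as vdivq_f32. */
static inline float32x4_t div_f32x4(float32x4_t x, float32x4_t y)
{
  return vdivq_f32(x, y);
}
#endif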
void warmup_vector(float* x, float* y, int size, float* alpha) {
    int i;
    __m128 RX_2, RX_2i, RY, R_t1, R_t2, R_alpha;

    /* alpha appears to be a single scalar coefficient, so broadcast it to
     * all four lanes rather than loading four floats through the pointer */
    R_alpha = _mm_set1_ps(alpha[0]);

    /* assumes y is 16-byte aligned and size is a multiple of 4 */
    for (i = 0; i < size; i += 4) {
        /* x is interleaved: the even elements x[2i] and the odd elements
         * x[2i+1] must be deinterleaved with shuffles.  The original
         * _mm_load_ps(&x[2*i+1]) was the cause of the segfault noted in
         * the comments: that address is off by one float, so it can never
         * be 16-byte aligned, and the aligned load faults. */
        __m128 lo = _mm_loadu_ps(&x[2 * i]);     /* x[2i]   .. x[2i+3] */
        __m128 hi = _mm_loadu_ps(&x[2 * i + 4]); /* x[2i+4] .. x[2i+7] */
        RX_2  = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0)); /* even: x[2i], ...   */
        RX_2i = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 3, 1)); /* odd:  x[2i+1], ... */

        // perform x[2*i+1]/alpha -> store in R_t1
        R_t1 = _mm_div_ps(RX_2i, R_alpha);
        // multiply x[2*i] by x[2*i] -> store in R_t2
        R_t2 = _mm_mul_ps(RX_2, RX_2);
        // add both terms; this is our new y[i] value
        RY = _mm_add_ps(R_t1, R_t2);

        // copy everything to y[i]
        _mm_store_ps(&y[i], RY);
    }
}
void NBodyAlgorithm::calculateAcceleration(const float3 (&posI)[4], const float massJ, const float3 posJ,
                                           __m128 accIx, __m128 accIy, __m128 accIz, float *accI) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    accIx = _mm_add_ps(accIx, aix);
    accIy = _mm_add_ps(accIy, aiy);
    accIz = _mm_add_ps(accIz, aiz);

    // _mm_set_ps placed posI[0] in the highest lane, so the reversed
    // store writes the results back out in posI order.
    _mm_storer_ps(accI, accIx);
    _mm_storer_ps(accI + 4, accIy);
    _mm_storer_ps(accI + 8, accIz);
}
/** returns a kernel averaged from the two kernels between them, sse
 * version
 *
 * @param kernel the return pointer for the kernel
 * @param amp the amp which holds the kernels
 */
void average_kernels_sse(float *kernel, Amp *amp)
{
    int i;
    __m128 right;
    __m128 left;
    __m128 out;
    __m128 leftindex = {1.0f, 2.0f, 3.0f, 4.0f};
    __m128 rightindex = {FOURIER_SIZE - 1.0, FOURIER_SIZE - 2.0,
                         FOURIER_SIZE - 3.0, FOURIER_SIZE - 4.0};
    __m128 fouriersize = _mm_set_ps1((float)FOURIER_SIZE);
    __m128 jumpsize = _mm_set_ps1(4.0f);

    /* stop 4 short of the end: the original `i < FOURIER_SIZE` bound read
     * and wrote past the buffers when FOURIER_SIZE is not a multiple of 4,
     * and left the scalar tail loop below unreachable */
    for (i = 0; i + 3 < FOURIER_SIZE; i = i + 4) {
        left = _mm_loadu_ps(amp->previous_buffer + i);
        right = _mm_loadu_ps(amp->fourier_buffer + i);
        out = _mm_div_ps(_mm_add_ps(_mm_mul_ps(left, leftindex),
                                    _mm_mul_ps(right, rightindex)),
                         fouriersize);
        leftindex = _mm_add_ps(leftindex, jumpsize);
        rightindex = _mm_sub_ps(rightindex, jumpsize);
        _mm_store_ps(kernel + i, out);
    }
    for (; i < FOURIER_SIZE; i++) {
        kernel[i] = (amp->previous_buffer[i] * (i + 1)
                     + amp->fourier_buffer[i] * (FOURIER_SIZE - i - 1))
                    / (FOURIER_SIZE);
    }
}
void NBodyAlgorithm::calculateAcceleration(const float3 (&posI)[4], const float massJ, const float3 posJ,
                                           float3 (&accI)[4]) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        // m128_f32 is MSVC-specific; lane i holds posI[3 - i] because
        // _mm_set_ps fills the register from the highest lane down.
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }
}
void calculateSSE(int start, int end)
{
    int size = end - start + 1;
    // Use 16-byte aligned memory: the __m128 stores through sse_result
    // require it, and aligned accesses avoid split-load penalties.
    float* result = (float*)aligned_alloc(16, size * sizeof(float));

    __m128 x;
    __m128 delta_x = _mm_set_ps1(4.0f);
    __m128 y = _mm_set_ps1(1.0f);
    __m128* sse_result = (__m128*)result;
    const int sse_length = size / 4;
    x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    for (int loop = 0; loop < 100000; ++loop) {
        for (int i = 0; i < sse_length; ++i) {
            __m128 sqrt_result = _mm_sqrt_ps(x);
            sse_result[i] = _mm_div_ps(sqrt_result, x);
            //sse_result[i] = _mm_add_ps(x, y);

            // move x value to next 4 numbers
            x = _mm_add_ps(x, delta_x);
        }
    }
    free(result); // was leaked in the original
}
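Incidentally, sqrt(x)/x simplifies algebraically to 1/sqrt(x), so if this benchmark tolerated ~12-bit accuracy the sqrt and the divide could both be dropped. A sketch of the inner-loop alternative (not what the benchmark measures, which is presumably the divide itself):

// 1/sqrt(x) directly via the reciprocal-square-root estimate:
// ~12-bit accuracy, no sqrtps, no divps.
sse_result[i] = _mm_rsqrt_ps(x);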
static inline void boxVerticesToScreenVertices(vec4f vertices[8], const vec4f& screenCenterMul, vec2i screenCenter) {
#ifdef ARPHEG_ARCH_X86
    __m128 screenSpaceMul = _mm_load_ps((float*)&screenCenterMul.x);
    __m128 screenCenterOffset = _mm_setr_ps(float(screenCenter.x), float(screenCenter.y), 0, 0);
    __m128 nearClip = _mm_setzero_ps();
    for(uint32 i = 0; i < 8; ++i) {
        __m128 hv = _mm_load_ps((float*)(vertices + i));
        __m128 w = _mm_shuffle_ps(hv, hv, _MM_SHUFFLE(3, 3, 3, 3)); // get the w component
        __m128 z = _mm_shuffle_ps(hv, hv, _MM_SHUFFLE(2, 2, 2, 2));
        hv = _mm_div_ps(hv, w);                  // Project XYZW to clip space (divide by w)
        hv = _mm_mul_ps(hv, screenSpaceMul);     // XY to screen space [-width/2,-height/2 -> width/2,height/2]
        hv = _mm_add_ps(hv, screenCenterOffset); // XY to screen space [0,0 -> width,height]
        __m128 mNoNearClip = _mm_cmpge_ps(z, nearClip); // Set to all-0 if near-clipped
        hv = _mm_and_ps(hv, mNoNearClip);
        _mm_store_ps((float*)(vertices + i), hv);
    }
#else
    //TODO
    ScreenSpaceVertex* screenVerts = (ScreenSpaceVertex*)vertices;
    for(uint32 i = 0; i < 8; ++i) {
        vertices[i] = vertices[i] * (1.0f / vertices[i].w);
        auto v = vertices[i] * screenCenterMul;
        screenVerts[i].pos = vec2i(int32(v.x), int32(v.y)) + screenCenter;
    }
#endif
}
static inline __m128 sigmoid_positive_ps( __m128 xin )
{
    union {
        __m128i i;
        int32_t i32[4];
    } i;
    __m128 ex;
    float *ex_elem = (float*) &ex;
    __m128 x1 = _mm_min_ps( xin, tens.ps );

    x1 = _mm_mul_ps( x1, tens.ps );
    i.i = _mm_cvttps_epi32( x1 );
    ex_elem[0] = e[i.i32[0]];
    ex_elem[1] = e[i.i32[1]];
    ex_elem[2] = e[i.i32[2]];
    ex_elem[3] = e[i.i32[3]];
    x1 = _mm_sub_ps( x1, _mm_cvtepi32_ps( i.i ) );
    x1 = _mm_add_ps( x1, tens.ps );
    x1 = _mm_mul_ps( x1, ex );
    x1 = _mm_add_ps( x1, ones.ps );
#ifdef __FAST_MATH__
    return _mm_rcp_ps( x1 );
#else
    return _mm_div_ps( ones.ps, x1 );
#endif
}
inline float4 operator / (const float4 & a, const float4 & b)
{
    float4 res;
    res.m128 = _mm_div_ps(a.m128, b.m128);
    return res;
}
inline void operator()(const IrradianceSample &sample) {
    /* Distance to the positive point source of the dipole */
    const __m128
        lengthSquared = _mm_set1_ps((p - sample.p).lengthSquared()),
        drSqr = _mm_add_ps(zrSqr, lengthSquared),
        dvSqr = _mm_add_ps(zvSqr, lengthSquared),
        dr = _mm_sqrt_ps(drSqr),
        dv = _mm_sqrt_ps(dvSqr),
        one = _mm_set1_ps(1.0f),
        factor = _mm_mul_ps(_mm_set1_ps(0.25f * INV_PI * sample.area * Fdt),
            _mm_set_ps(sample.E[0], sample.E[1], sample.E[2], 0)),
        C1fac = _mm_div_ps(_mm_mul_ps(zr, _mm_add_ps(sigmaTr, _mm_div_ps(one, dr))), drSqr),
        C2fac = _mm_div_ps(_mm_mul_ps(zv, _mm_add_ps(sigmaTr, _mm_div_ps(one, dv))), dvSqr);
    SSEVector temp1(_mm_mul_ps(dr, sigmaTr)), temp2(_mm_mul_ps(dv, sigmaTr));
    const __m128
        exp1 = _mm_set_ps(expf(-temp1.f[3]), expf(-temp1.f[2]), expf(-temp1.f[1]), 0),
        exp2 = _mm_set_ps(expf(-temp2.f[3]), expf(-temp2.f[2]), expf(-temp2.f[1]), 0);
    result.ps = _mm_add_ps(result.ps, _mm_mul_ps(factor,
        _mm_add_ps(_mm_mul_ps(C1fac, exp1), _mm_mul_ps(C2fac, exp2))));
}
static void vectorDiv(float* a, float* b, float* c, size_t n)
{
    __m128 A, B, C;
    // Note: _mm_load_ps/_mm_store_ps require all three pointers to be
    // 16-byte aligned, and the loop assumes n is a multiple of 4.
    for (size_t i = 0; i < n; i += 4) {
        A = _mm_load_ps(&a[i]);
        B = _mm_load_ps(&b[i]);
        C = _mm_div_ps(A, B);
        _mm_store_ps(&c[i], C);
    }
}
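A hypothetical usage sketch (names and sizes are illustrative) that satisfies both preconditions by allocating 16-byte aligned buffers with C11's aligned_alloc:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    enum { N = 16 }; /* multiple of 4 */
    float *a = aligned_alloc(16, N * sizeof(float));
    float *b = aligned_alloc(16, N * sizeof(float));
    float *c = aligned_alloc(16, N * sizeof(float));
    for (int i = 0; i < N; i++) {
        a[i] = (float)(i + 1);
        b[i] = 2.0f;
    }
    vectorDiv(a, b, c, N); /* c[i] = a[i] / b[i] */
    printf("%f\n", c[3]);  /* prints 2.000000 */
    free(a);
    free(b);
    free(c);
    return 0;
}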
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
    const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
    const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
    const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);

    const __m128 _ps_pi    = _mm_set1_ps(pi);
    const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);

    const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
    const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

    __m128 mm1, mm2, mm3;
    __m128 axm, aym;

    __m128 xm = _mm_set1_ps(x);
    __m128 ym = _mm_set1_ps(y);

    axm = _mm_and_ps(xm, _mask_sign_inv);
    aym = _mm_and_ps(ym, _mask_sign_inv);

    mm1 = _mm_min_ps(axm, aym);
    mm2 = _mm_max_ps(axm, aym);
    mm1 = _mm_div_ps(mm1, mm2);
    mm2 = _mm_mul_ps(mm1, mm1);

    mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
    mm3 = _mm_add_ps(mm3, _ps_atan_p1);
    mm3 = _mm_mul_ps(mm3, mm2);
    mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
    mm3 = _mm_mul_ps(mm3, mm2);
    mm3 = _mm_mul_ps(mm3, mm1);
    mm3 = _mm_add_ps(mm3, mm1);

    __m128 mask;

    /* |y| > |x| */
    mask = _mm_cmpgt_ss(aym, axm);
    mm2 = _mm_and_ps(_ps_pi0p5, mask);
    mm1 = _mm_and_ps(_mask_sign_raw, mask);
    mm3 = _mm_xor_ps(mm3, mm1);
    mm3 = _mm_add_ps(mm3, mm2);

    /* x < 0 */
    mask = _mm_and_ps(xm, _mask_sign_raw);
    mm3 = _mm_xor_ps(mm3, mask);
    mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
    mm1 = _mm_and_ps(_ps_pi, mm1);
    mm3 = _mm_add_ps(mm3, mm1);

    /* y < 0 */
    mm1 = _mm_and_ps(ym, _mask_sign_raw);
    mm3 = _mm_xor_ps(mm3, mm1);

    return _mm_cvtss_f32(mm3);
}
SIMDValue SIMDFloat32x4Operation::OpDiv(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
    x86Result.m128_value = _mm_div_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a / b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
/* V_SgDivideOp */
__SIMD _SIMD_div_ps(__SIMD a, __SIMD b)
{
#ifdef USE_SSE
  return _mm_div_ps(a, b);
#elif defined USE_AVX
  return _mm256_div_ps(a, b); /* was misspelled _m256_div_ps */
#elif defined USE_IBM
  return vec_div(a, b);
#endif
}
SIMDValue SIMDFloat32x4Operation::OpReciprocalSqrt(const SIMDValue& value)
{
    X86SIMDValue x86Result;
    X86SIMDValue temp;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    temp.m128_value = _mm_div_ps(X86_ALL_ONES_F4.m128_value, v.m128_value); // temp = 1.0/value
    x86Result.m128_value = _mm_sqrt_ps(temp.m128_value);                    // result = sqrt(1.0/value)

    return X86SIMDValue::ToSIMDValue(x86Result);
}