/** returns a kernel averaged from the two kernels held by the amp, SSE
 * version
 *
 * @param kernel the return pointer for the kernel
 * @param amp the amp which holds the kernels */
void average_kernels_sse(float *kernel, Amp *amp)
{
    int i;
    __m128 right;
    __m128 left;
    __m128 out;
    __m128 leftindex = {1.0f, 2.0f, 3.0f, 4.0f};
    __m128 rightindex = {FOURIER_SIZE - 1.0, FOURIER_SIZE - 2.0,
                         FOURIER_SIZE - 3.0, FOURIER_SIZE - 4.0};
    __m128 fouriersize = _mm_set_ps1((float)FOURIER_SIZE);
    __m128 jumpsize = _mm_set_ps1(4.0f);

    /* Stop four short of FOURIER_SIZE so the vector loop never reads or
     * writes past the end; the scalar loop below handles any remainder. */
    for (i = 0; i + 4 <= FOURIER_SIZE; i = i + 4) {
        left = _mm_loadu_ps(amp->previous_buffer + i);
        right = _mm_loadu_ps(amp->fourier_buffer + i);
        out = _mm_div_ps(_mm_add_ps(_mm_mul_ps(left, leftindex),
                                    _mm_mul_ps(right, rightindex)),
                         fouriersize);
        leftindex = _mm_add_ps(leftindex, jumpsize);
        rightindex = _mm_sub_ps(rightindex, jumpsize);
        _mm_store_ps(kernel + i, out);
    }
    for (; i < FOURIER_SIZE; i++) {
        kernel[i] = (amp->previous_buffer[i] * (i + 1) +
                     amp->fourier_buffer[i] * (FOURIER_SIZE - i - 1)) /
                    (FOURIER_SIZE);
    }
}
static void
inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, the results it gives are rather
     * different from the 1.0 / src1 reference implementation, so do that. */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
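/* For reference only (not part of the function above): the ~12-bit estimate
 * from _mm_rcp_ps can be tightened with one Newton-Raphson step,
 * x1 = x0 * (2 - a * x0). It is still not bit-identical to _mm_div_ps, which
 * is why the code above sticks with true division. Assumes <xmmintrin.h>. */
static inline __m128
rcp_refined_ps (__m128 a)
{
  __m128 x = _mm_rcp_ps (a);                  /* rough 1/a estimate */
  __m128 two = _mm_set_ps1 (2.0f);
  return _mm_mul_ps (x, _mm_sub_ps (two, _mm_mul_ps (a, x)));
}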
static void find_center (const vox_dot set[], size_t n, const struct vox_box *box, vox_dot res)
{
    size_t i;
    __v4sf len = _mm_set_ps1 (n);
    __v4sf sum = _mm_set_ps1 (0.0);
    __v4sf voxel = _mm_load_ps (vox_voxel);
    __v4sf min = _mm_load_ps (box->min);

    /*
     * Subtract bounding box minimal value from voxel coordinates to reduce
     * computational error. I add it again after division by len.
     */
    for (i=0; i<n; i++) sum += (_mm_load_ps (set[i]) - min);
    sum /= len;
    sum += min;

    /*
     * Align the center of division, so any voxel belongs to only one subspace
     * entirely. Faces of voxels may be the exception though.
     */
    __v4sf resv = sum / voxel;
    resv = _mm_ceil_ps (resv) * voxel;
    _mm_store_ps (res, resv);
}
void mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);
    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
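/* For context only: the fields this routine reads from `struct spec`. This is
 * an inferred sketch, not the original definition; the real struct and the
 * exact field types may differ. */
struct spec {
    int width, height;       /* output image size in pixels */
    float xlim[2], ylim[2];  /* complex-plane bounds, [min, max] per axis */
    int iterations;          /* escape-time iteration limit */
    int depth;               /* number of grey levels in the output image */
};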
//////////////////////////////////////////////////////////////////////////////
//
// Set()
//
void ColorValueXmm::Set(F32 r, F32 g, F32 b, F32 a)
{
    R = _mm_set_ps1(r);
    G = _mm_set_ps1(g);
    B = _mm_set_ps1(b);
    A = _mm_set_ps1(a);
}
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ,
                                           __m128 accIx, __m128 accIy, __m128 accIz, float *accI) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    accIx = _mm_add_ps(accIx, aix);
    accIy = _mm_add_ps(accIy, aiy);
    accIz = _mm_add_ps(accIz, aiz);

    _mm_storer_ps(accI, accIx);
    _mm_storer_ps(accI + 4, accIy);
    _mm_storer_ps(accI + 8, accIz);
}
void calculateSSE(int start, int end) {
    int size = end - start + 1;
    // we use aligned memory, because SSE instructions are really slow
    // working on unaligned memory
    float* result = (float*)aligned_alloc(16, size * sizeof(float));
    __m128 x;
    __m128 delta_x = _mm_set_ps1(4.0f);
    __m128 y = _mm_set_ps1(1.0f);
    __m128* sse_result = (__m128*)result;

    const int sse_length = size / 4;
    x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    for (int loop = 0; loop < 100000; ++loop) {
        for (int i = 0; i < sse_length; ++i) {
            __m128 sqrt_result = _mm_sqrt_ps(x);
            sse_result[i] = _mm_div_ps(sqrt_result, x);
            //sse_result[i] = _mm_add_ps(x, y);

            // move x value to next 4 numbers
            x = _mm_add_ps(x, delta_x);
        }
    }
}
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ,
                                           float3(&accI)[4]) {
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }
}
void init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
{
    int     i;
    float   tmp_max = 0;
    float   tmp_sum = 0;
    int     upper4 = (upper / 4) * 4;
    int     rest = upper - upper4;

    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
    vecfloat_union vec_xrpow_max;
    vecfloat_union vec_sum;
    vecfloat_union vec_tmp;

    _mm_prefetch((char *) cod_info->xr, _MM_HINT_T0);
    _mm_prefetch((char *) xrpow, _MM_HINT_T0);

    vec_xrpow_max._m128 = _mm_set_ps1(0);
    vec_sum._m128 = _mm_set_ps1(0);

    for (i = 0; i < upper4; i += 4) {
        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i]));        /* load */
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128);               /* store into xrpow[] */
    }
    vec_tmp._m128 = _mm_set_ps1(0);
    switch (rest) {
    case 3:
        vec_tmp._float[2] = cod_info->xr[upper4 + 2];
        /* fall through */
    case 2:
        vec_tmp._float[1] = cod_info->xr[upper4 + 1];
        /* fall through */
    case 1:
        vec_tmp._float[0] = cod_info->xr[upper4 + 0];
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        switch (rest) {
        case 3:
            xrpow[upper4 + 2] = vec_tmp._float[2];
            /* fall through */
        case 2:
            xrpow[upper4 + 1] = vec_tmp._float[1];
            /* fall through */
        case 1:
            xrpow[upper4 + 0] = vec_tmp._float[0];
            /* fall through */
        default:
            break;
        }
        /* fall through */
    default:
        break;
    }
    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
    {
        float   ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
                   ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
        float   mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
                   ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
        tmp_max = ma > mb ? ma : mb;
    }
    cod_info->xrpow_max = tmp_max;
    *sum = tmp_sum;
}
kw_mat4 kw_div(kw_mat4 m, f32 scale){
    kw_mat4 result;
    result.simd[0] = _mm_div_ps(m.simd[0], _mm_set_ps1(scale));
    result.simd[1] = _mm_div_ps(m.simd[1], _mm_set_ps1(scale));
    result.simd[2] = _mm_div_ps(m.simd[2], _mm_set_ps1(scale));
    result.simd[3] = _mm_div_ps(m.simd[3], _mm_set_ps1(scale));
    return result;
}
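/* Hypothetical variant (not from the original source): broadcast 1/scale once
 * and multiply, trading four divisions for one scalar division plus four
 * multiplies. Assumes the same kw_mat4/f32 types as kw_div above; slightly
 * less accurate than _mm_div_ps for some inputs. */
kw_mat4 kw_div_recip(kw_mat4 m, f32 scale){
    kw_mat4 result;
    __m128 inv = _mm_set_ps1(1.0f / scale);
    result.simd[0] = _mm_mul_ps(m.simd[0], inv);
    result.simd[1] = _mm_mul_ps(m.simd[1], inv);
    result.simd[2] = _mm_mul_ps(m.simd[2], inv);
    result.simd[3] = _mm_mul_ps(m.simd[3], inv);
    return result;
}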
float calcCubicNoiseValSSE(const vec3 p) {
    int ix, iy, iz;
    __m128 fx, fy;
    float fz;

    ix = (int)floor(p[0]);
    fx = _mm_set_ps1(p[0] - ix);
    iy = (int)floor(p[1]);
    fy = _mm_set_ps1(p[1] - iy);
    iz = (int)floor(p[2]);
    fz = p[2] - iz;

    uSIMD k0, k1, k2, k3;
    __m128 out0, out1, out2, out3;
    for(int k = -1; k <= 2; k++) {
        for(int j = -1; j <= 2; j++) {
            k0.a[j+1] = getLatticeVal(ix-1, iy + j, iz + k);
            k1.a[j+1] = getLatticeVal(ix+0, iy + j, iz + k);
            k2.a[j+1] = getLatticeVal(ix+1, iy + j, iz + k);
            k3.a[j+1] = getLatticeVal(ix+2, iy + j, iz + k);
        }
        switch(k) {
        case -1:
            out0 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
            break;
        case 0:
            out1 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
            break;
        case 1:
            out2 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
            break;
        case 2:
            out3 = fourKnotSplineSSE(&fx, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
            break;
        }
    }

    // Transpose the matrix formed by the out vectors.
    __m128 t1 = _mm_movelh_ps(out1, out0);
    __m128 t2 = _mm_movehl_ps(out0, out1);
    __m128 t3 = _mm_movelh_ps(out3, out2);
    __m128 t4 = _mm_movehl_ps(out2, out3);
    k0.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(0, 2, 0, 2));
    k1.m = _mm_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 3, 1, 3));
    k2.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(0, 2, 0, 2));
    k3.m = _mm_shuffle_ps(t2, t4, _MM_SHUFFLE(1, 3, 1, 3));

    uSIMD final_knots;
    final_knots.m = fourKnotSplineSSE(&fy, &(k0.m), &(k1.m), &(k2.m), &(k3.m));
    return clamp(fourKnotSpline(fz, final_knots.a), -1.0f, 1.0f);
}
void experienceNet::normalPDF_sse(float* result, const float* _partitions, float _mean, float _stdDev) {
    /* CODE ADAPTED FROM boost/math/normal.hpp

    RealType exponent = x - mean;
    exponent *= -exponent;
    exponent /= 2 * sd * sd;

    result = exp(exponent);
    result /= sd * sqrt(2 * constants::pi<RealType>());

    return result;
    */

    const __m128& partitions = *(__m128*)_partitions;
    __m128 exponent, tmp, mean, sd;

    /* CODE ADAPTED FROM
     * http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html */
    static const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));

    static const __m128 twos = _mm_set_ps1(2.0f);
    static const __m128 sqrt_pi_2_s = _mm_set_ps1(sqrt(2.0 * M_PI));

    // store mean and sd:
    mean = _mm_load_ps1(&_mean);
    sd = _mm_load_ps1(&_stdDev);

    // exponent = x - mean
    exponent = _mm_sub_ps(partitions, mean);
    // exponent *= -exponent;
    tmp = _mm_xor_ps(exponent, signmask);
    exponent = _mm_mul_ps(exponent, tmp);
    // exponent /= 2 * sd * sd;
    tmp = _mm_mul_ps(sd, sd);
    tmp = _mm_mul_ps(tmp, twos);
    exponent = _mm_div_ps(exponent, tmp);
    // exponent = exp(exponent);
    exponent = _mm_exp_ps(exponent);
    // exponent /= sd * sqrt(2 * pi)
    tmp = _mm_mul_ps(sd, sqrt_pi_2_s);
    tmp = _mm_div_ps(exponent, tmp);

#ifndef NDEBUG
    const float* _result = (float*)&tmp;
    boost::math::normal_distribution<float> cNormal(_mean, _stdDev);
    assert(fastabs(_result[0] - boost::math::pdf(cNormal, _partitions[0])) < 0.001f);
    assert(fastabs(_result[1] - boost::math::pdf(cNormal, _partitions[1])) < 0.001f);
    assert(fastabs(_result[2] - boost::math::pdf(cNormal, _partitions[2])) < 0.001f);
    assert(fastabs(_result[3] - boost::math::pdf(cNormal, _partitions[3])) < 0.001f);
#endif

    // return result:
    _mm_store_ps(result, tmp);
};
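/* Note: _mm_exp_ps is not a baseline SSE intrinsic; it comes from a vector
 * math layer such as Intel SVML or an sse_mathfun-style header, which the
 * routine above assumes is available. As a reference sketch only, each lane
 * is meant to compute the scalar normal PDF below (the same boost-derived
 * formula quoted in the comments; assumes <cmath> and M_PI). */
static float normal_pdf_scalar(float x, float mean, float sd) {
    float e = x - mean;
    e = -(e * e) / (2.0f * sd * sd);
    return std::exp(e) / (sd * std::sqrt(2.0f * (float)M_PI));
}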
static inline __m128
curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f);     // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
      _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
                 _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // c > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float*)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}
/** Computes an upsampling filtering kernel (SSE version, four taps per inner loop)
 *
 * @param itor [in] Interpolator used
 * @param kernel [out] resulting itor->width*2 filter taps (array must be at least (itor->width*2+3)/4*4 floats long)
 * @param norm [out] Kernel norm
 * @param first [out] first input sample index used
 * @param t [in] Interpolated coordinate
 *
 * @return kernel norm
 */
static inline void
compute_upsampling_kernel_sse(
  const struct dt_interpolation* itor,
  float* kernel,
  float* norm,
  int* first,
  float t)
{
  int f = (int)t - itor->width + 1;
  if (first) {
    *first = f;
  }

  /* Find closest integer position and then offset that to match first
   * filtered sample position */
  t = t - (float)f;

  // Prepare t vector to compute four values a loop
  static const __m128 bootstrap = { 0.f, -1.f, -2.f, -3.f};
  static const __m128 iter = { -4.f, -4.f, -4.f, -4.f};
  __m128 vt = _mm_add_ps(_mm_set_ps1(t), bootstrap);
  __m128 vw = _mm_set_ps1((float)itor->width);

  // Prepare counters (math kept stupid for understanding)
  int i = 0;
  int runs = (2*itor->width + 3)/4;

  while (i<runs) {
    // Compute the values
    __m128 vr = itor->funcsse(vw, vt);

    // Save result
    *(__m128*)kernel = vr;

    // Prepare next iteration
    vt = _mm_add_ps(vt, iter);
    kernel += 4;
    i++;
  }

  // compute norm now
  if (norm) {
    float n = 0.f;
    i = 0;
    kernel -= 4*runs;
    while (i<2*itor->width) {
      n += *kernel;
      kernel++;
      i++;
    }
    *norm = n;
  }
}
void lfModifier::ModifyCoord_Dist_Poly3_SSE (void *data, float *iocoord, int count)
{
    // See "Note about PT-based distortion models" at the top of mod-coord.cpp.
    /*
     * If buffer is not aligned, fall back to plain code
     */
    if((uintptr_t)(iocoord) & 0xf)
    {
        return ModifyCoord_Dist_Poly3(data, iocoord, count);
    }

    lfCoordDistCallbackData* cddata = (lfCoordDistCallbackData*) data;

    // Rd = Ru * (1 + k1 * Ru^2)
    __m128 k1_ = _mm_set_ps1 (cddata->Terms [0]);
    __m128 cx = _mm_set_ps1 (cddata->centerX);
    __m128 cy = _mm_set_ps1 (cddata->centerY);
    __m128 cc = _mm_set_ps1 (cddata->coordinate_correction);
    __m128 one = _mm_set_ps1 (1.0f);

    // SSE Loop processes 4 pixels/loop
    int loop_count = count / 4;
    for (int i = 0; i < loop_count ; i++)
    {
        __m128 c0 = _mm_load_ps (&iocoord [8 * i]);
        __m128 c1 = _mm_load_ps (&iocoord [8 * i + 4]);
        __m128 x = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (2, 0, 2, 0));
        __m128 y = _mm_shuffle_ps (c0, c1, _MM_SHUFFLE (3, 1, 3, 1));
        x = _mm_sub_ps(_mm_mul_ps(x, cc), cx);
        y = _mm_sub_ps(_mm_mul_ps(y, cc), cy);

        // Calculate poly3 = k1_ * ru * ru + 1;
        __m128 poly3 = _mm_add_ps (_mm_mul_ps (_mm_add_ps (_mm_mul_ps (x, x), _mm_mul_ps (y, y)), k1_), one);
        x = _mm_add_ps(_mm_mul_ps (x, poly3), cx);
        y = _mm_add_ps(_mm_mul_ps (y, poly3), cy);
        x = _mm_div_ps (x, cc);
        y = _mm_div_ps (y, cc);

        c0 = _mm_unpacklo_ps(x, y);
        c1 = _mm_unpackhi_ps(x, y);
        _mm_store_ps (&iocoord [8 * i], c0);
        _mm_store_ps (&iocoord [8 * i + 4], c1);
    }

    loop_count *= 4;
    int remain = count - loop_count;
    if (remain)
        ModifyCoord_Dist_Poly3 (data, &iocoord [loop_count * 2], remain);
}
static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  __m128 xmm1;
  float max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set_ps1(max);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
template <bool align> void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride,
    const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hue, size_t hueStride)
{
    assert(width >= A);
    if(align)
    {
        assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride));
        assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride));
    }

    const __m128 KF_255_DIV_6 = _mm_set_ps1(Base::KF_255_DIV_6);

    size_t bodyWidth = AlignLo(width, A);
    size_t tail = width - bodyWidth;
    for(size_t row = 0; row < height; row += 1)
    {
        for(size_t col = 0; col < bodyWidth; col += A)
        {
            Store<align>((__m128i*)(hue + col), YuvToHue8(Load<align>((__m128i*)(y + col)),
                Load<align>((__m128i*)(u + col)), Load<align>((__m128i*)(v + col)), KF_255_DIV_6));
        }
        if(tail)
        {
            size_t offset = width - A;
            Store<false>((__m128i*)(hue + offset), YuvToHue8(Load<false>((__m128i*)(y + offset)),
                Load<false>((__m128i*)(u + offset)), Load<false>((__m128i*)(v + offset)), KF_255_DIV_6));
        }
        y += yStride;
        u += uStride;
        v += vStride;
        hue += hueStride;
    }
}
//////////////////////////////////////////////////////////////////////////////
//
// Scale()
//
void ColorValueXmm::Scale()
{
    __m128 _255 = _mm_set_ps1(255.0f);
    R = _mm_mul_ps(R, _255);
    G = _mm_mul_ps(G, _255);
    B = _mm_mul_ps(B, _255);
    A = _mm_mul_ps(A, _255);
}
void depth_convert_w2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
    const uint16_t *src_p = static_cast<const uint16_t *>(src);
    float *dst_p = static_cast<float *>(dst);

    unsigned vec_left = ceil_n(left, 8);
    unsigned vec_right = floor_n(right, 8);

    const __m128 scale_ps = _mm_set_ps1(scale);
    const __m128 offset_ps = _mm_set_ps1(offset);

    __m128 lo, hi;

#define XITER depth_convert_w2f_sse2_xiter
#define XARGS src_p, scale_ps, offset_ps, lo, hi
    if (left != vec_left) {
        XITER(vec_left - 8, XARGS);

        if (vec_left - left > 4) {
            mm_store_left(dst_p + vec_left - 8, lo, vec_left - left - 4);
            _mm_store_ps(dst_p + vec_left - 4, hi);
        } else {
            mm_store_left(dst_p + vec_left - 4, hi, vec_left - left);
        }
    }

    for (unsigned j = vec_left; j < vec_right; j += 8) {
        XITER(j, XARGS);
        _mm_store_ps(dst_p + j + 0, lo);
        _mm_store_ps(dst_p + j + 4, hi);
    }

    if (right != vec_right) {
        XITER(vec_right, XARGS);

        if (right - vec_right > 4) {
            _mm_store_ps(dst_p + vec_right + 0, lo);
            mm_store_right(dst_p + vec_right + 4, hi, right - vec_right - 4);
        } else {
            mm_store_right(dst_p + vec_right, lo, right - vec_right);
        }
    }
#undef XITER
#undef XARGS
}
//internal simd using sse3
void LLMDCTOpt(const float* x, float* y)
{
    float t4, t5, t6, t7;
    float c0, c1, c2, c3;
    float* r = dct_tbl;
    const float invsqrt2 = 0.707107f;  //(float)(1.0f / M_SQRT2);
    const float invsqrt2h = 0.353554f; //invsqrt2*0.5f;

    {
        __m128 mc1 = _mm_load_ps(x);
        __m128 mc2 = _mm_loadr_ps(x + 4);

        __m128 mt1 = _mm_add_ps(mc1, mc2);
        __m128 mt2 = _mm_sub_ps(mc1, mc2); //rev

        mc1 = _mm_addsub_ps(_mm_shuffle_ps(mt1, mt1, _MM_SHUFFLE(1, 1, 0, 0)),
                            _mm_shuffle_ps(mt1, mt1, _MM_SHUFFLE(2, 2, 3, 3)));
        mc1 = _mm_shuffle_ps(mc1, mc1, _MM_SHUFFLE(0, 2, 3, 1));
        _mm_store_ps(y, mc1);
        _mm_store_ps(y + 4, mt2);
    }

    c0 = y[0];
    c1 = y[1];
    c2 = y[2];
    c3 = y[3];
    /*c3=y[0];
    c0=y[1];
    c2=y[2];
    c1=y[3];*/

    t7 = y[4];
    t6 = y[5];
    t5 = y[6];
    t4 = y[7];

    y[0] = c0 + c1;
    y[4] = c0 - c1;
    y[2] = c2 * r[6] + c3 * r[2];
    y[6] = c3 * r[6] - c2 * r[2];

    c3 = t4 * r[3] + t7 * r[5];
    c0 = t7 * r[3] - t4 * r[5];
    c2 = t5 * r[1] + t6 * r[7];
    c1 = t6 * r[1] - t5 * r[7];

    y[5] = c3 - c1;
    y[3] = c0 - c2;
    c0 = (c0 + c2) * invsqrt2;
    c3 = (c3 + c1) * invsqrt2;
    y[1] = c0 + c3;
    y[7] = c0 - c3;

    const __m128 invsqh = _mm_set_ps1(invsqrt2h);
    __m128 my = _mm_load_ps(y);
    _mm_store_ps(y, _mm_mul_ps(my, invsqh));
    my = _mm_load_ps(y + 4);
    _mm_store_ps(y + 4, _mm_mul_ps(my, invsqh));
}
static inline __m128
set_bitmask(unsigned int mask)
{
  union { unsigned int i; float f; } u;
  u.i = mask;
  return _mm_set_ps1(u.f);
}
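/* Usage sketch (illustrative, not from the original source): a broadcast bit
 * pattern like this is typically combined with _mm_and_ps / _mm_xor_ps, e.g.
 * ANDing with 0x7FFFFFFF clears the sign bits, giving a four-lane fabsf(). */
static inline __m128
abs_ps(__m128 v)
{
  return _mm_and_ps(v, set_bitmask(0x7FFFFFFFu));
}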
static double compute_step_prob_simd(unsigned w, unsigned h, float alpha,
                                     struct coef *coef, float *cos, float *obj_gradient)
{
    double prob_dist = 0.;
    unsigned block_w = coef->w / 8;
    unsigned block_h = coef->h / 8;
    for(unsigned block_y = 0; block_y < block_h; block_y++) {
        for(unsigned block_x = 0; block_x < block_w; block_x++) {
            unsigned i = block_y * block_w + block_x;
            float *cosb = &cos[i*64];
            for(unsigned j = 0; j < 64; j+=4) {
                __m128 coef_data = _mm_cvtpi16_ps(*(__m64 *)&(coef->data[i*64+j]));
                __m128 coef_quant_table = _mm_cvtpi16_ps(*(__m64 *)&(coef->quant_table[j]));
                _mm_empty();
                __m128 cosb_j = _mm_load_ps(&cosb[j]);
                cosb_j = cosb_j - coef_data * coef_quant_table;
                __m128 dist = SQR(cosb_j / coef_quant_table);
                prob_dist += dist[0];
                prob_dist += dist[1];
                prob_dist += dist[2];
                prob_dist += dist[3];
                cosb_j = cosb_j / SQR(coef_quant_table);
                _mm_store_ps(&cosb[j], cosb_j);
            }
            idct8x8s(cosb);
            if(coef->w_samp > 1 || coef->h_samp > 1) {
                for(unsigned in_y = 0; in_y < 8; in_y++) {
                    for(unsigned in_x = 0; in_x < 8; in_x++) {
                        unsigned j = in_y * 8 + in_x;
                        unsigned cx = block_x * 8 + in_x;
                        unsigned cy = block_y * 8 + in_y;
                        for(unsigned sy = 0; sy < coef->h_samp; sy++) {
                            for(unsigned sx = 0; sx < coef->w_samp; sx++) {
                                unsigned y = cy * coef->h_samp + sy;
                                unsigned x = cx * coef->w_samp + sx;
                                *p(obj_gradient, x, y, w, h) += alpha * cosb[j];
                            }
                        }
                    }
                }
            } else {
                __m128 malpha = _mm_set_ps1(alpha);
                for(unsigned j = 0; j < 64; j+=4) {
                    unsigned in_y = j / 8;
                    unsigned in_x = j % 8;
                    unsigned x = block_x * 8 + in_x;
                    unsigned y = block_y * 8 + in_y;
                    __m128 obj = _mm_load_ps(&obj_gradient[y*w+x]);
                    __m128 cosb_j = _mm_load_ps(&cosb[j]);
                    obj += malpha * cosb_j;
                    _mm_store_ps(&obj_gradient[y*w+x], obj);
                }
            }
        }
    }
    return 0.5 * prob_dist;
}
void mad(register float *dst, register float *src1, float alpha, float beta, int w)
{
    register int j = 0;
#if CV_SSE
    if (CPU_SUPPORT_SSE1)
    {
        __m128 a, b, c;
        a = _mm_set_ps1(alpha);
        b = _mm_set_ps1(beta);
        for (; j < w - 3; j += 4)
        {
            c = _mm_loadu_ps(src1 + j);
            c = _mm_mul_ps(c, a);
            c = _mm_add_ps(c, b);
            _mm_storeu_ps(dst + j, c);
        }
    }
#endif
    for (; j < w; j++)
        dst[j] = alpha*src1[j] + beta;
}
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
             void *ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const float divider = (float)UINT16_MAX;
  const __m128 dividers = _mm_set_ps1(divider);

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static) shared(ovoid)
#endif
  for(int j = 0; j < roi_out->height; j++)
  {
    const uint16_t *in = ((uint16_t *)ivoid) + (size_t)j * roi_out->width;
    float *out = ((float *)ovoid) + (size_t)j * roi_out->width;

    int i = 0;
    int alignment = ((8 - (j * roi_out->width & (8 - 1))) & (8 - 1));

    // process unaligned pixels
    for( ; i < alignment ; i++, out++, in++)
      *out = ((float)(*in)) / divider;

    // process aligned pixels with SSE
    for( ; i < roi_out->width - (8 - 1); i += 8, in += 8)
    {
      const __m128i input = _mm_load_si128((__m128i *)in);

      __m128i ilo = _mm_unpacklo_epi16(input, _mm_set1_epi16(0));
      __m128i ihi = _mm_unpackhi_epi16(input, _mm_set1_epi16(0));

      __m128 flo = _mm_cvtepi32_ps(ilo);
      __m128 fhi = _mm_cvtepi32_ps(ihi);

      flo = _mm_div_ps(flo, dividers);
      fhi = _mm_div_ps(fhi, dividers);

      _mm_stream_ps(out, flo);
      out += 4;
      _mm_stream_ps(out, fhi);
      out += 4;
    }

    // process the rest
    for( ; i < roi_out->width; i++, out++, in++)
      *out = ((float)(*in)) / divider;
  }
  _mm_sfence();
}
void fir_sse_float(float *in, float *out, int len)
{
    int i = 0;
    __m128 i0, i1, two = _mm_set_ps1(2.0);

    for (; i < len-5; i+=4) {
        i0 = _mm_load_ps(in + i);
        i1 = _mm_loadu_ps(in + (i+1));
        i0 = _mm_add_ps(i0, i1);
        i0 = _mm_div_ps(i0, two);
        _mm_store_ps(out + i, i0);
    }
    for (; i < len-1; ++i)
        out[i] = (in[i+1] + in[i])/2;
}
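/* Usage sketch (hypothetical caller, not from the original source): smooth a
 * block of samples with the 2-tap moving average above. The vector loop uses
 * _mm_load_ps/_mm_store_ps, so both buffers must be 16-byte aligned; the
 * aligned attribute below is GCC/Clang syntax. */
void smooth_block(void)
{
    static float in[1024]  __attribute__((aligned(16)));
    static float out[1024] __attribute__((aligned(16)));
    for (int i = 0; i < 1024; ++i)
        in[i] = (float)i;              /* placeholder input signal */
    fir_sse_float(in, out, 1024);      /* out[i] = (in[i] + in[i+1]) / 2 */
}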
void vertex_t::mult( float x)
{
    float *src = sse2_ptr();
    __m128 fact = _mm_set_ps1( x);

    for( int i = 0, ie = sse2_size(); i < ie; ++i)
    {
        __m128 v0 = _mm_load_ps( src);
        __m128 v1 = _mm_mul_ps( v0, fact);
        _mm_store_ps( src, v1);
        src += 4;
    }
}
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames
        // (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP) {                  \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#endif

    // Scalar path: handles the non-SSE build, strided access, and the tail
    // frames left over after the vector loop.
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}
static void spline_n_4(int i, float t, float *knot, float *splineVal) {
    knot += i + 1;

#ifdef _M_SSE
    const __m128 knot012 = _mm_loadu_ps(&knot[0]);
    const __m128 knot345 = _mm_loadu_ps(&knot[3]);
    const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
    const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));

    const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
    const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
    const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
    const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));

    // It's still faster to use SSE, even with this.
    float MEMORY_ALIGNED16(ff30_41_52[4]);
    float MEMORY_ALIGNED16(ff31_42_32[4]);
    _mm_store_ps(ff30_41_52, f30_41_52);
    _mm_store_ps(ff31_42_32, f31_42_32);

    const float &f30 = ff30_41_52[0];
    const float &f41 = ff30_41_52[1];
    const float &f52 = ff30_41_52[2];
    const float &f31 = ff31_42_32[0];
    const float &f42 = ff31_42_32[1];
    const float &f32 = ff31_42_32[2];
#else
    // TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
    float t0 = (t - knot[0]);
    float t1 = (t - knot[1]);
    float t2 = (t - knot[2]);
    // TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
    float f30 = t0/(knot[3]-knot[0]);
    float f41 = t1/(knot[4]-knot[1]);
    float f52 = t2/(knot[5]-knot[2]);
    float f31 = t1/(knot[3]-knot[1]);
    float f42 = t2/(knot[4]-knot[2]);
    float f32 = t2/(knot[3]-knot[2]);
#endif

    float a = (1-f30)*(1-f31);
    float b = (f31*f41);
    float c = (1-f41)*(1-f42);
    float d = (f42*f52);

    splineVal[0] = a-(a*f32);
    splineVal[1] = 1-a-b+((a+b+c-1)*f32);
    splineVal[2] = b+((1-b-c-d)*f32);
    splineVal[3] = d*f32;
}
void CAEUtil::SSEMulAddArray(float *data, float *add, const float mul, uint32_t count)
{
  const __m128 m = _mm_set_ps1(mul);

  /* work around invalid alignment */
  while ((((uintptr_t)data & 0xF) || ((uintptr_t)add & 0xF)) && count > 0)
  {
    data[0] += add[0] * mul;
    ++add;
    ++data;
    --count;
  }

  uint32_t even = count & ~0x3;
  for (uint32_t i = 0; i < even; i+=4, data+=4, add+=4)
  {
    __m128 ad = _mm_load_ps(add );
    __m128 to = _mm_load_ps(data);
    *(__m128*)data = _mm_add_ps (to, _mm_mul_ps(ad, m));
  }

  if (even != count)
  {
    uint32_t odd = count - even;
    if (odd == 1)
      data[0] += add[0] * mul;
    else
    {
      __m128 ad;
      __m128 to;
      if (odd == 2)
      {
        ad = _mm_setr_ps(add [0], add [1], 0, 0);
        to = _mm_setr_ps(data[0], data[1], 0, 0);
        __m128 ou = _mm_add_ps(to, _mm_mul_ps(ad, m));
        data[0] = ((float*)&ou)[0];
        data[1] = ((float*)&ou)[1];
      }
      else
      {
        ad = _mm_setr_ps(add [0], add [1], add [2], 0);
        to = _mm_setr_ps(data[0], data[1], data[2], 0);
        __m128 ou = _mm_add_ps(to, _mm_mul_ps(ad, m));
        data[0] = ((float*)&ou)[0];
        data[1] = ((float*)&ou)[1];
        data[2] = ((float*)&ou)[2];
      }
    }
  }
}
static void clearDepthSSE(float* begin, size_t length, float value) {
    enum { SimdLaneWidth = 4 };

    assertRelease(uintptr_t(begin) % 16 == 0);
    auto rem = length % SimdLaneWidth;
    length = (length / SimdLaneWidth) * SimdLaneWidth;

    __m128 k = _mm_set_ps1(value);
    for (size_t i = 0; i < length; i += SimdLaneWidth)
        _mm_store_ps(begin + i, k);

    if (rem == 0)
        return;
    for (size_t i = 0; i < rem; i++) {
        begin[length + i] = value;
    }
}
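/* Illustrative alternative (not from the original source): for depth buffers
 * much larger than the cache, non-temporal stores avoid evicting useful data.
 * Same 16-byte alignment requirement as clearDepthSSE above. */
static void clearDepthStream(float* begin, size_t length, float value) {
    enum { SimdLaneWidth = 4 };
    size_t rem = length % SimdLaneWidth;
    length = (length / SimdLaneWidth) * SimdLaneWidth;
    __m128 k = _mm_set_ps1(value);
    for (size_t i = 0; i < length; i += SimdLaneWidth)
        _mm_stream_ps(begin + i, k);   // bypass the cache hierarchy
    _mm_sfence();                      // make the streamed stores visible
    for (size_t i = 0; i < rem; i++)
        begin[length + i] = value;
}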