inline void Sort4Deg6(__m256 llrI, int pos[], int ipos[])
{
    // Scale the 8 float LLRs by 2^26 and truncate to integers so they can be
    // rank-compared exactly.
    int llr[8] __attribute__((aligned(64)));
    const auto v1 = _mm256_set1_ps(67108864.0f);
    const auto v2 = _mm256_mul_ps(v1, llrI);
    _mm256_store_si256((__m256i *)llr, _mm256_cvttps_epi32(v2));

    //register float x0,x1,x2,x3,x4,x5;
    const auto x0 = llr[0];
    const auto x1 = llr[1];
    const auto x2 = llr[2];
    const auto x3 = llr[3];
    const auto x4 = llr[4];
    const auto x5 = llr[5];

    // Rank each of the first 6 values by counting the others it is smaller than;
    // ties are broken by index (<= against earlier elements, < against later ones),
    // so o0..o5 is a permutation of 0..5 and o5 follows from the sum 0+1+...+5 = 15.
    int o0 = (x0<x1) +(x0<x2)+(x0<x3)+(x0<x4)+(x0<x5);
    int o1 = (x1<=x0)+(x1<x2)+(x1<x3)+(x1<x4)+(x1<x5);
    int o2 = (x2<=x0)+(x2<=x1)+(x2<x3)+(x2<x4)+(x2<x5);
    int o3 = (x3<=x0)+(x3<=x1)+(x3<=x2)+(x3<x4)+(x3<x5);
    int o4 = (x4<=x0)+(x4<=x1)+(x4<=x2)+(x4<=x3)+(x4<x5);
    int o5 = 15-(o0+o1+o2+o3+o4);

    // pos maps rank -> element index, ipos maps element index -> rank;
    // slots 6 and 7 are passed through unchanged.
    pos[o0] = 0; pos[o1] = 1; pos[o2] = 2; pos[o3] = 3; pos[o4] = 4; pos[o5] = 5;
    pos[6] = 6; pos[7] = 7;
    ipos[0] = o0; ipos[1] = o1; ipos[2] = o2; ipos[3] = o3; ipos[4] = o4; ipos[5] = o5;
    ipos[6] = 6; ipos[7] = 7;
}
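/* A scalar rank-count sketch (not from the original source) that mirrors the logic
 * above, useful for checking pos[]/ipos[]: an element's output slot is the number of
 * elements it is smaller than, with ties broken by index exactly as in the
 * comparison chains (<= against earlier elements, < against later ones). */
static void rank6_reference(const int v[6], int pos[8], int ipos[8])
{
    for (int i = 0; i < 6; ++i) {
        int rank = 0;
        for (int j = 0; j < 6; ++j) {
            if (j == i)
                continue;
            rank += (j < i) ? (v[i] <= v[j]) : (v[i] < v[j]);
        }
        pos[rank] = i;    /* rank -> element index */
        ipos[i]   = rank; /* element index -> rank */
    }
    pos[6] = ipos[6] = 6;
    pos[7] = ipos[7] = 7;
}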
void sigm_deriv (float *deriv_res, float *sigm_res, int dim)
{
#ifdef __APPLE__
    // Scalar path: derivative of the logistic function, s * (1 - s).
    for (int i = 0; i < dim; i++) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#elif __linux
    // Vector path: process SIMD_WIDTH elements per iteration, scalar tail after.
    int residual = dim % SIMD_WIDTH;
    int stopSIMD = dim - residual;
    __m256 vec_deriv, vec_sigm;
    __m256 vec_one = _mm256_set1_ps(1.f);
    for (int i = 0; i < stopSIMD; i += SIMD_WIDTH) {
        vec_sigm  = _mm256_loadu_ps(sigm_res + i);
        vec_deriv = _mm256_mul_ps(vec_sigm, _mm256_sub_ps(vec_one, vec_sigm));
        _mm256_storeu_ps(deriv_res + i, vec_deriv);
    }
    for (int i = stopSIMD; i < dim; ++i) {
        deriv_res[i] = sigm_res[i] * (1 - sigm_res[i]);
    }
#endif
}
void polynomial(float *ret, const float *const r_values, int num)
{
    // r*r*r*(10+r*(-15+r*6));
    __m256 const_6      = _mm256_set1_ps(6.0f);
    __m256 const_neg_15 = _mm256_set1_ps(-15.0f);
    __m256 const_10     = _mm256_set1_ps(10.0f); // constants
    const int loop_factor = 8;
    for (int i = 0; i < num; i += loop_factor) {
#ifdef USE_IACA
        IACA_START
#endif
        __m256 r;
        __m256 left;
        __m256 right;
        // aligned load of 256 bits r
        r = _mm256_load_ps(&r_values[i]);
        left = _mm256_mul_ps(r, r);                  // r * r
#ifndef __FMA__
        right = _mm256_mul_ps(r, const_6);           // r * 6
        left  = _mm256_mul_ps(left, r);              // r * r * r
        right = _mm256_add_ps(right, const_neg_15);  // -15 + r * 6
        right = _mm256_mul_ps(right, r);             // r * (-15 + r * 6)
        right = _mm256_add_ps(right, const_10);      // 10 + (r * (-15 + r * 6))
#else
        right = _mm256_fmadd_ps(r, const_6, const_neg_15);
        left  = _mm256_mul_ps(left, r);
        right = _mm256_fmadd_ps(r, right, const_10);
#endif
        right = _mm256_mul_ps(right, left);          // r*r*r * (10 + r * (-15 + r * 6))
        _mm256_store_ps(&ret[i], right);             // store 8 values to ret[i]
    }
#ifdef USE_IACA
    IACA_END
#endif
}
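/* Scalar reference for the kernel above (a verification sketch, not part of the
 * original source): evaluates r*r*r*(10 + r*(-15 + r*6)) per element, the quintic
 * "smootherstep" polynomial, so the AVX results can be spot-checked. */
static void polynomial_scalar(float *ret, const float *r_values, int num)
{
    for (int i = 0; i < num; ++i) {
        const float r = r_values[i];
        ret[i] = r * r * r * (10.0f + r * (-15.0f + r * 6.0f));
    }
}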
int RowVec_32f_AVX(const float* src0, const float* _kx, float* dst, int width, int cn, int _ksize)
{
    int i = 0, k;
    for (; i <= width - 8; i += 8) {
        const float* src = src0 + i;
        __m256 f, x0;
        __m256 s0 = _mm256_set1_ps(0.0f);
        // Accumulate _kx[k] * src[k*cn .. k*cn+7] over all kernel taps.
        for (k = 0; k < _ksize; k++, src += cn) {
            f  = _mm256_set1_ps(_kx[k]);
            x0 = _mm256_loadu_ps(src);
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
        }
        _mm256_storeu_ps(dst + i, s0);
    }
    _mm256_zeroupper();
    return i;  // number of columns handled here; the caller finishes the remainder
}
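/* Scalar sketch of the same row filter (my reading of the AVX loop above):
 * dst[i] = sum_k kx[k] * src0[i + k*cn] for the first (width & ~7) outputs. */
static int RowVec_32f_scalar(const float* src0, const float* kx, float* dst,
                             int width, int cn, int ksize)
{
    int i = 0;
    for (; i <= width - 8; i += 8)
        for (int lane = 0; lane < 8; ++lane) {
            float s = 0.0f;
            for (int k = 0; k < ksize; ++k)
                s += kx[k] * src0[i + lane + k * cn];
            dst[i + lane] = s;
        }
    return i;
}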
template <bool align> SIMD_INLINE float SquaredDifferenceSum32f(const float * a, const float * b, size_t size)
{
    if(align)
        assert(Aligned(a) && Aligned(b));

    float sum = 0;
    size_t i = 0;
    size_t alignedSize = AlignLo(size, 8);
    if(alignedSize)
    {
        __m256 _sum = _mm256_setzero_ps();
        for(; i < alignedSize; i += 8)
        {
            __m256 _a = Avx::Load<align>(a + i);
            __m256 _b = Avx::Load<align>(b + i);
            __m256 _d = _mm256_sub_ps(_a, _b);
            _sum = _mm256_add_ps(_sum, _mm256_mul_ps(_d, _d));
        }
        sum += Avx::ExtractSum(_sum);
    }
    for(; i < size; ++i)
        sum += Simd::Square(a[i] - b[i]);
    return sum;
}
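/* Plain-C reference for the same reduction (a verification sketch, not library code):
 * returns the sum over i of (a[i]-b[i])^2 without the 8-wide main loop. */
#include <stddef.h>
static float SquaredDifferenceSum32f_ref(const float * a, const float * b, size_t size)
{
    float sum = 0;
    for (size_t i = 0; i < size; ++i)
    {
        const float d = a[i] - b[i];
        sum += d * d;
    }
    return sum;
}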
void kernel_strmv_u_n_8_lib8(int kmax, float *A, float *x, float *y, int alg) { if(kmax<=0) return; const int lda = 8; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); int k; __m256 zeros, ax_temp, a_00, a_01, a_02, a_03, x_0, x_1, x_2, x_3, y_0, y_0_b, y_0_c, y_0_d, z_0; zeros = _mm256_setzero_ps(); y_0 = _mm256_setzero_ps(); y_0_b = _mm256_setzero_ps(); y_0_c = _mm256_setzero_ps(); y_0_d = _mm256_setzero_ps(); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); x_0 = _mm256_blend_ps( zeros, x_0, 0x01 ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); x_1 = _mm256_blend_ps( zeros, x_1, 0x03 ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); x_2 = _mm256_blend_ps( zeros, x_2, 0x07 ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); x_3 = _mm256_blend_ps( zeros, x_3, 0x0f ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); x_0 = _mm256_blend_ps( zeros, x_0, 0x1f ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); x_1 = _mm256_blend_ps( zeros, x_1, 0x3f ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); x_2 = _mm256_blend_ps( zeros, x_2, 0x7f ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; k=8; for(; k<kmax-7; k+=8) { __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; } for(; 
k<kmax-3; k+=4) { __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; } y_0 = _mm256_add_ps( y_0 , y_0_c ); y_0_b = _mm256_add_ps( y_0_b, y_0_d ); if(kmax%4>=2) { a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); A += 2*lda; x += 2; } y_0 = _mm256_add_ps( y_0 , y_0_b ); if(kmax%2==1) { a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); /* A += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_ps(&y[0], y_0); } else if(alg==1) { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = _mm256_add_ps( z_0, y_0 ); _mm256_storeu_ps(&y[0], z_0); } else // alg==-1 { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = _mm256_sub_ps( z_0, y_0 ); _mm256_storeu_ps(&y[0], z_0); } }
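/* Scalar sketch of what the kernel above computes, per my reading of the blend
 * masks and the alg switch (hedged, not from the original source):
 * y(0:7) = op( T(0:7,0:7) * x(0:7) + A(0:7,8:kmax-1) * x(8:kmax-1) ),
 * where T is the upper-triangular top block, columns are stored contiguously with
 * leading dimension 8, and op stores / adds to / subtracts from y for
 * alg == 0 / 1 / -1. */
static void strmv_u_n_8_ref(int kmax, const float *A, const float *x, float *y, int alg)
{
    float acc[8] = {0};
    for (int c = 0; c < kmax; ++c)
        for (int r = 0; r < 8; ++r)
            if (c >= 8 || r <= c)   /* triangular mask on the first 8 columns */
                acc[r] += A[r + 8 * c] * x[c];
    for (int r = 0; r < 8; ++r)
        y[r] = (alg == 0) ? acc[r] : (alg == 1) ? y[r] + acc[r] : y[r] - acc[r];
}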
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const { float tBox = std::numeric_limits<float>::min(); if (!Intersects(ray, bb, tBox) || tBox > t) return nullptr; const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X); const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y); const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z); const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X); const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y); const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z); union { float dists[MAXSIZE]; __m256 distances[MAXSIZE / NROFLANES]; }; for (int i = 0; i < count; i++) { // Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position; const __m256 e1X = edge1X8[i]; const __m256 e1Y = edge1Y8[i]; const __m256 e1Z = edge1Z8[i]; // Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position; const __m256 e2X = edge2X8[i]; const __m256 e2Y = edge2Y8[i]; const __m256 e2Z = edge2Z8[i]; // Vector3F p = ray.Direction.Cross(e2); const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y)); const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z)); const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X)); // float det = e1.Dot(p); const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX), _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ))); // if (det > -EPSILON && det < EPSILON) // return false; __m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS), _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS)); // float invDet = 1 / det; const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det); // Vector3F r = ray.Origin - triangle.Vertices[0].Position; const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]); const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]); const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]); // float u = r.Dot(p) * invDet; const __m256 u = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rX, pX), _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ)))); // if (u < 0 || u > 1) // return false; mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS)); // Vector3F q = r.Cross(e1); const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y)); const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z)); const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X)); // float v = ray.Direction.Dot(q) * invDet; const __m256 v = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rayDirX, qX), _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ)))); // if (v < 0 || u + v > 1) // return false; mask = _mm256_and_ps(mask, _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS), _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS))); // float tt = e2.Dot(q) * invDet; const __m256 tt = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(e2X, qX), _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ)))); // if (tt > EPSILON) // { // t = tt; // return true; // } // // return false; distances[i] = _mm256_and_ps(tt, mask); } Triangle* triangle = nullptr; for (int i = 0; i < count * NROFLANES; i++) if (dists[i] < t && dists[i] > EPSILON) { t = dists[i]; triangle = triangles[i]; } return triangle; }
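/* Scalar Moller-Trumbore test matching the commented reference steps above
 * (a sketch for cross-checking one lane of the 8-wide version; eps plays the role
 * of EPSILON, e1/e2 are the two triangle edges, v0 the first vertex). */
static int ray_triangle_scalar(const float orig[3], const float dir[3],
                               const float v0[3], const float e1[3], const float e2[3],
                               float eps, float *t_out)
{
    const float px = dir[1]*e2[2] - dir[2]*e2[1];
    const float py = dir[2]*e2[0] - dir[0]*e2[2];
    const float pz = dir[0]*e2[1] - dir[1]*e2[0];
    const float det = e1[0]*px + e1[1]*py + e1[2]*pz;
    if (det > -eps && det < eps) return 0;
    const float inv_det = 1.0f / det;
    const float rx = orig[0] - v0[0], ry = orig[1] - v0[1], rz = orig[2] - v0[2];
    const float u = (rx*px + ry*py + rz*pz) * inv_det;
    if (u < 0.0f || u > 1.0f) return 0;
    const float qx = ry*e1[2] - rz*e1[1];
    const float qy = rz*e1[0] - rx*e1[2];
    const float qz = rx*e1[1] - ry*e1[0];
    const float v = (dir[0]*qx + dir[1]*qy + dir[2]*qz) * inv_det;
    if (v < 0.0f || u + v > 1.0f) return 0;
    const float tt = (e2[0]*qx + e2[1]*qy + e2[2]*qz) * inv_det;
    if (tt <= eps) return 0;
    *t_out = tt;
    return 1;
}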
inline vec8 operator*(vec8 a, vec8 b) { return _mm256_mul_ps(a, b); }
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 8; /* const int bs = 8;*/ __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); int k; __m256 zeros, ax_temp, a_00, a_01, a_02, a_03, x_0, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7; zeros = _mm256_setzero_ps(); y_0 = _mm256_setzero_ps(); y_1 = _mm256_setzero_ps(); y_2 = _mm256_setzero_ps(); y_3 = _mm256_setzero_ps(); y_4 = _mm256_setzero_ps(); y_5 = _mm256_setzero_ps(); y_6 = _mm256_setzero_ps(); y_7 = _mm256_setzero_ps(); k=0; for(; k<kmax-7; k+=8) { x_0 = _mm256_loadu_ps( &x[0] ); __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_1 = _mm256_add_ps( y_1, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_2 = _mm256_add_ps( y_2, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); ax_temp = _mm256_mul_ps( a_03, x_0 ); y_3 = _mm256_add_ps( y_3, ax_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*4] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_4 = _mm256_add_ps( y_4, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*5] ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_5 = _mm256_add_ps( y_5, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*6] ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_6 = _mm256_add_ps( y_6, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*7] ); ax_temp = _mm256_mul_ps( a_03, x_0 ); y_7 = _mm256_add_ps( y_7, ax_temp ); A += sda*lda; x += lda; } x_0 = _mm256_loadu_ps( &x[0] ); a_00 = _mm256_load_ps( &A[0+lda*0] ); a_00 = _mm256_blend_ps( zeros, a_00, 0x01 ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); a_01 = _mm256_blend_ps( zeros, a_01, 0x03 ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_1 = _mm256_add_ps( y_1, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); a_02 = _mm256_blend_ps( zeros, a_02, 0x07 ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_2 = _mm256_add_ps( y_2, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); a_03 = _mm256_blend_ps( zeros, a_03, 0x0f ); ax_temp = _mm256_mul_ps( a_03, x_0 ); y_3 = _mm256_add_ps( y_3, ax_temp ); a_00 = _mm256_load_ps( &A[0+lda*4] ); a_00 = _mm256_blend_ps( zeros, a_00, 0x1f ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_4 = _mm256_add_ps( y_4, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*5] ); a_01 = _mm256_blend_ps( zeros, a_01, 0x3f ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_5 = _mm256_add_ps( y_5, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*6] ); a_02 = _mm256_blend_ps( zeros, a_02, 0x7f ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_6 = _mm256_add_ps( y_6, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*7] ); /* a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/ ax_temp = _mm256_mul_ps( a_03, x_0 ); y_7 = _mm256_add_ps( y_7, ax_temp ); // reduction __m256 z_0; y_0 = _mm256_hadd_ps(y_0, y_1); y_2 = _mm256_hadd_ps(y_2, y_3); y_4 = _mm256_hadd_ps(y_4, y_5); y_6 = _mm256_hadd_ps(y_6, y_7); y_0 = _mm256_hadd_ps(y_0, y_2); y_4 = _mm256_hadd_ps(y_4, y_6); y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20); y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31); y_0 = _mm256_add_ps(y_1, y_2); // store if(alg==0) { _mm256_storeu_ps(&y[0], y_0); } else if(alg==1) { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = 
_mm256_add_ps(z_0, y_0); _mm256_storeu_ps(&y[0], z_0); } else // alg==-1 { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = _mm256_sub_ps(z_0, y_0); _mm256_storeu_ps(&y[0], z_0); } }
iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_ps(fr->ic->k_rf); krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0); crf = _mm256_set1_ps(fr->ic->c_rf); /* Setup water-specific parameters */ inr = nlist->iinr[0]; iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0])); iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1])); iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2])); /* Avoid stupid compiler warnings */ jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0; j_coord_offsetA = 0; j_coord_offsetB = 0; j_coord_offsetC = 0; j_coord_offsetD = 0; j_coord_offsetE = 0; j_coord_offsetF = 0; j_coord_offsetG = 0; j_coord_offsetH = 0; outeriter = 0;
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2, avx_m256_t *c1, avx_m256_t *c2) { avx_m256_t tempa = _ps_sign_mask; avx_m256_t tempb = _ps_inv_sign_mask; avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa); avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa); x1 = _mm256_and_ps(x1, tempb); x2 = _mm256_and_ps(x2, tempb); tempa = _ps_cephes_FOPI; avx_m256_t y1 = _mm256_mul_ps(x1, tempa); avx_m256_t y2 = _mm256_mul_ps(x2, tempa); //avx_m256i_t emm21 = _mm256_cvttps_epi32(y1); //avx_m256i_t emm22 = _mm256_cvttps_epi32(y2); //emm21 = _mm256_add_epi32(emm21, _pi32_1); //emm22 = _mm256_add_epi32(emm22, _pi32_1); avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1)); avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1)); //emm21 = _mm256_and_si256(emm21, _pi32_inv1); //emm22 = _mm256_and_si256(emm22, _pi32_inv1); emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1))); emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1))); y1 = _mm256_cvtepi32_ps(emm21); y2 = _mm256_cvtepi32_ps(emm22); //avx_m256i_t tempia = _pi32_2; //avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia); //avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia); avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2)); avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2)); //avx_m256i_t tempib = _pi32_4; //avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib); //avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib); avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_4))); avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_4))); //avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib); //avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib); avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_4))); avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_4))); //emm01 = _mm256_slli_epi32(emm01, 29); __m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0); __m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1); emm0hi1 = _mm_slli_epi32(emm0hi1, 29); emm0lo1 = _mm_slli_epi32(emm0lo1, 29); emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0); emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1); //emm02 = _mm256_slli_epi32(emm02, 29); __m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0); __m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1); emm0hi2 = _mm_slli_epi32(emm0hi2, 29); emm0lo2 = _mm_slli_epi32(emm0lo2, 29); emm02 = _mm256_insertf128_si256(emm02, emm0hi1, 0); emm02 = _mm256_insertf128_si256(emm02, emm0lo1, 1); //cos_emm01 = _mm256_slli_epi32(cos_emm01, 29); __m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0); __m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1); cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29); cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29); cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0); cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1); //cos_emm02 = _mm256_slli_epi32(cos_emm02, 29); __m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0); __m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1); cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29); cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 
29); cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0); cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1); //tempia = _pi32_2; //tempib = _mm256_setzero_si256(); //emm21 = _mm256_and_si256(emm21, tempia); //emm22 = _mm256_and_si256(emm22, tempia); emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_2))); emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_2))); //cos_emm21 = _mm256_and_si256(cos_emm21, tempia); //cos_emm22 = _mm256_and_si256(cos_emm22, tempia); cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_2))); cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_2))); //emm21 = _mm256_cmpeq_epi32(emm21, tempib); //emm22 = _mm256_cmpeq_epi32(emm22, tempib); emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ)); emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ)); //cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib); //cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib); cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ)); cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ)); avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01); avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02); avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21); avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22); avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01); avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02); avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21); avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22); sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1); sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2); tempa = _ps_minus_cephes_DP123; tempb = _mm256_mul_ps(y2, tempa); tempa = _mm256_mul_ps(y1, tempa); x2 = _mm256_add_ps(x2, tempb); x1 = _mm256_add_ps(x1, tempa); avx_m256_t x21 = _mm256_mul_ps(x1, x1); avx_m256_t x22 = _mm256_mul_ps(x2, x2); avx_m256_t x31 = _mm256_mul_ps(x21, x1); avx_m256_t x32 = _mm256_mul_ps(x22, x2); avx_m256_t x41 = _mm256_mul_ps(x21, x21); avx_m256_t x42 = _mm256_mul_ps(x22, x22); tempa = _ps_coscof_p0; tempb = _ps_sincof_p0; y1 = _mm256_mul_ps(x21, tempa); y2 = _mm256_mul_ps(x22, tempa); avx_m256_t y21 = _mm256_mul_ps(x21, tempb); avx_m256_t y22 = _mm256_mul_ps(x22, tempb); tempa = _ps_coscof_p1; tempb = _ps_sincof_p1; y1 = _mm256_add_ps(y1, tempa); y2 = _mm256_add_ps(y2, tempa); y21 = _mm256_add_ps(y21, tempb); y22 = _mm256_add_ps(y22, tempb); y1 = _mm256_mul_ps(y1, x21); y2 = _mm256_mul_ps(y2, x22); y21 = _mm256_mul_ps(y21, x21); y22 = _mm256_mul_ps(y22, x22); tempa = _ps_coscof_p2; tempb = _ps_sincof_p2; y1 = _mm256_add_ps(y1, tempa); y2 = _mm256_add_ps(y2, tempa); y21 = _mm256_add_ps(y21, tempb); y22 = _mm256_add_ps(y22, tempb); y1 = _mm256_mul_ps(y1, x41); y2 = _mm256_mul_ps(y2, x42); y21 = _mm256_mul_ps(y21, x31); y22 = _mm256_mul_ps(y22, x32); tempa = _ps_0p5; tempb = _ps_1; avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa); avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa); y21 = _mm256_add_ps(y21, x1); y22 = _mm256_add_ps(y22, x2); temp_21 = _mm256_sub_ps(temp_21, tempb); temp_22 = _mm256_sub_ps(temp_22, tempb); y1 = _mm256_sub_ps(y1, temp_21); y2 = _mm256_sub_ps(y2, temp_22); avx_m256_t cos_y1 = y1; avx_m256_t cos_y2 = y2; avx_m256_t cos_y21 = y21; avx_m256_t 
cos_y22 = y22; y1 = _mm256_andnot_ps(emm2f1, y1); y2 = _mm256_andnot_ps(emm2f2, y2); cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1); cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2); y21 = _mm256_and_ps(emm2f1, y21); y22 = _mm256_and_ps(emm2f2, y22); cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21); cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22); y1 = _mm256_add_ps(y1, y21); y2 = _mm256_add_ps(y2, y22); cos_y1 = _mm256_add_ps(cos_y1, cos_y21); cos_y2 = _mm256_add_ps(cos_y2, cos_y22); *s1 = _mm256_xor_ps(y1, sign_bit1); *s2 = _mm256_xor_ps(y2, sign_bit2); *c1 = _mm256_xor_ps(cos_y1, cos_emm0f1); *c2 = _mm256_xor_ps(cos_y2, cos_emm0f2); } // newsincos_ps_dual()
void TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac) { assert (val_arr != 0); // Constants static const int mant_size = 23; static const int exp_bias = 127; static const uint32_t base = (exp_bias + LOGLUT_MIN_L2) << mant_size; static const float val_min = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2); // static const float val_max = float (int64_t (1) << LOGLUT_MAX_L2); static const int frac_size = mant_size - LOGLUT_RES_L2; static const uint32_t frac_mask = (1 << frac_size) - 1; const __m256 zero_f = _mm256_setzero_ps (); const __m256 one_f = _mm256_set1_ps (1); const __m256 frac_mul = _mm256_set1_ps (1.0f / (1 << frac_size)); const __m256 mul_eps = _mm256_set1_ps (1.0f / val_min); const __m256 mask_abs_f = _mm256_load_ps ( reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs) ); const __m256i zero_i = _mm256_setzero_si256 (); const __m256i mask_abs_epi32 = _mm256_set1_epi32 (0x7FFFFFFF); const __m256i one_epi32 = _mm256_set1_epi32 (1); const __m256i base_epi32 = _mm256_set1_epi32 (int (base)); const __m256i frac_mask_epi32 = _mm256_set1_epi32 (frac_mask); const __m256i val_min_epi32 = _mm256_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size); const __m256i val_max_epi32 = _mm256_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size); const __m256i index_max_epi32 = _mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2); const __m256i hsize_epi32 = _mm256_set1_epi32 (LOGLUT_HSIZE); const __m256i mirror_epi32 = _mm256_set1_epi32 (LOGLUT_HSIZE - 1); // It really starts here const __m256 val_f = _mm256_load_ps (reinterpret_cast <const float *> (val_arr)); const __m256 val_a = _mm256_and_ps (val_f, mask_abs_f); const __m256i val_i = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr)); const __m256i val_u = _mm256_and_si256 (val_i, mask_abs_epi32); // Standard path __m256i index_std = _mm256_sub_epi32 (val_u, base_epi32); index_std = _mm256_srli_epi32 (index_std, frac_size); index_std = _mm256_add_epi32 (index_std, one_epi32); __m256i frac_stdi = _mm256_and_si256 (val_u, frac_mask_epi32); __m256 frac_std = _mm256_cvtepi32_ps (frac_stdi); frac_std = _mm256_mul_ps (frac_std, frac_mul); // Epsilon path __m256 frac_eps = _mm256_max_ps (val_a, zero_f); frac_eps = _mm256_mul_ps (frac_eps, mul_eps); // Range cases const __m256i eps_flag_i = _mm256_cmpgt_epi32 (val_min_epi32, val_u); const __m256i std_flag_i = _mm256_cmpgt_epi32 (val_max_epi32, val_u); const __m256 eps_flag_f = _mm256_castsi256_ps (eps_flag_i); const __m256 std_flag_f = _mm256_castsi256_ps (std_flag_i); __m256i index_tmp = fstb::ToolsAvx2::select (std_flag_i, index_std, index_max_epi32); __m256 frac_tmp = fstb::ToolsAvx2::select (std_flag_f, frac_std, one_f); index_tmp = fstb::ToolsAvx2::select (eps_flag_i, zero_i, index_tmp); frac_tmp = fstb::ToolsAvx2::select (eps_flag_f, frac_eps, frac_tmp); // Sign cases const __m256i neg_flag_i = _mm256_srai_epi32 (val_i, 31); const __m256 neg_flag_f = _mm256_castsi256_ps (neg_flag_i); const __m256i index_neg = _mm256_sub_epi32 (mirror_epi32, index_tmp); const __m256i index_pos = _mm256_add_epi32 (hsize_epi32, index_tmp); const __m256 frac_neg = _mm256_sub_ps (one_f, frac_tmp); index = fstb::ToolsAvx2::select (neg_flag_i, index_neg, index_pos); frac = fstb::ToolsAvx2::select (neg_flag_f, frac_neg, frac_tmp); }
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX( const void *poOptions, GUInt32 nPoints, CPL_UNUSED const double *unused_padfX, CPL_UNUSED const double *unused_padfY, CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, double *pdfValue, void* hExtraParamsIn ) { size_t i = 0; GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn; const float* pafX = psExtraParams->pafX; const float* pafY = psExtraParams->pafY; const float* pafZ = psExtraParams->pafZ; const float fEpsilon = 0.0000000000001f; const float fXPoint = (float)dfXPoint; const float fYPoint = (float)dfYPoint; const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon); const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint); const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint); __m256 ymm_nominator = _mm256_setzero_ps(); __m256 ymm_denominator = _mm256_setzero_ps(); int mask = 0; #undef LOOP_SIZE #if defined(__x86_64) || defined(_M_X64) /* This would also work in 32bit mode, but there are only 8 XMM registers */ /* whereas we have 16 for 64bit */ #define LOOP_SIZE 16 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) { __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */ __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x); __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */ __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y); __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ _mm256_mul_ps(ymm_ry, ymm_ry)); __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8), _mm256_mul_ps(ymm_ry_8, ymm_ry_8)); __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8); ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i))); ymm_nominator = _mm256_add_ps(ymm_nominator, _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8))); ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */ ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8); mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */ (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8); if( mask ) break; } #else #define LOOP_SIZE 8 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) { __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps((float*)pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */ __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps((float*)pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */ __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ _mm256_mul_ps(ymm_ry, ymm_ry)); __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ _mm256_mul_ps(ymm_invr2, _mm256_load_ps((float*)pafZ + i))); ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */ mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */ if( mask ) break; } #endif /* Find which i triggered r2 < fEpsilon */ if( mask ) { for(int j = 0; j < LOOP_SIZE; j++ ) { if( mask & (1 << j) ) { (*pdfValue) = (pafZ)[i + j]; // GCC and MSVC need explicit zeroing #if !defined(__clang__) _mm256_zeroupper(); #endif 
return CE_None; } } } #undef LOOP_SIZE /* Get back nominator and denominator values for YMM registers */ float afNominator[8], afDenominator[8]; _mm256_storeu_ps(afNominator, ymm_nominator); _mm256_storeu_ps(afDenominator, ymm_denominator); // MSVC doesn't emit AVX afterwards but may use SSE, so clear upper bits // Other compilers will continue using AVX for the below floating points operations #if defined(_MSC_FULL_VER) _mm256_zeroupper(); #endif float fNominator = afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3] + afNominator[4] + afNominator[5] + afNominator[6] + afNominator[7]; float fDenominator = afDenominator[0] + afDenominator[1] + afDenominator[2] + afDenominator[3] + afDenominator[4] + afDenominator[5] + afDenominator[6] + afDenominator[7]; /* Do the few remaining loop iterations */ for ( ; i < nPoints; i++ ) { const float fRX = pafX[i] - fXPoint; const float fRY = pafY[i] - fYPoint; const float fR2 = fRX * fRX + fRY * fRY; // If the test point is close to the grid node, use the point // value directly as a node value to avoid singularity. if ( fR2 < 0.0000000000001 ) { break; } else { const float fInvR2 = 1.0f / fR2; fNominator += fInvR2 * pafZ[i]; fDenominator += fInvR2; } } if( i != nPoints ) { (*pdfValue) = pafZ[i]; } else if ( fDenominator == 0.0 ) { (*pdfValue) = ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue; } else (*pdfValue) = fNominator / fDenominator; // GCC needs explicit zeroing #if defined(__GNUC__) && !defined(__clang__) _mm256_zeroupper(); #endif return CE_None; }
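/* Scalar sketch of the inverse-distance-to-a-power-2 accumulation the AVX loops
 * above implement (verification aid, not GDAL API): value = sum(z/r^2) / sum(1/r^2),
 * falling back to the sample value when a point coincides with the grid node. */
static double idw2_ref(const float *x, const float *y, const float *z,
                       unsigned n, float xp, float yp, double nodata)
{
    float num = 0.0f, den = 0.0f;
    for (unsigned i = 0; i < n; ++i) {
        const float rx = x[i] - xp, ry = y[i] - yp;
        const float r2 = rx * rx + ry * ry;
        if (r2 < 0.0000000000001f)
            return z[i];             /* node sits on a sample point */
        num += z[i] / r2;
        den += 1.0f / r2;
    }
    return (den == 0.0f) ? nodata : num / den;
}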
int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) { int i = 0, k; const float *S, *S2; const __m128 d4 = _mm_set1_ps(delta); const __m256 d8 = _mm256_set1_ps(delta); for( ; i <= width - 16; i += 16 ) { __m256 f = _mm256_set1_ps(ky[0]); __m256 s0, s1; __m256 x0; S = src[0] + i; s0 = _mm256_loadu_ps(S); #if CV_FMA3 s0 = _mm256_fmadd_ps(s0, f, d8); #else s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8); #endif s1 = _mm256_loadu_ps(S+8); #if CV_FMA3 s1 = _mm256_fmadd_ps(s1, f, d8); #else s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8); #endif for( k = 1; k <= ksize2; k++ ) { S = src[k] + i; S2 = src[-k] + i; f = _mm256_set1_ps(ky[k]); x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2)); #if CV_FMA3 s0 = _mm256_fmadd_ps(x0, f, s0); #else s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f)); #endif x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8)); #if CV_FMA3 s1 = _mm256_fmadd_ps(x0, f, s1); #else s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f)); #endif } _mm256_storeu_ps(dst + i, s0); _mm256_storeu_ps(dst + i + 8, s1); } for( ; i <= width - 4; i += 4 ) { __m128 f = _mm_set1_ps(ky[0]); __m128 x0, s0 = _mm_load_ps(src[0] + i); s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); for( k = 1; k <= ksize2; k++ ) { f = _mm_set1_ps(ky[k]); S = src[k] + i; S2 = src[-k] + i; x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); } _mm_storeu_ps(dst + i, s0); } _mm256_zeroupper(); return i; }
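/* Scalar sketch of the symmetric column filter above (verification aid, not the
 * library's own reference path):
 * dst[i] = delta + ky[0]*src[0][i] + sum_{k=1..ksize2} ky[k]*(src[k][i] + src[-k][i]). */
static void SymmColumnVec_32f_Symm_ref(const float **src, const float *ky, float *dst,
                                       float delta, int width, int ksize2)
{
    for (int i = 0; i < width; ++i) {
        float s = delta + ky[0] * src[0][i];
        for (int k = 1; k <= ksize2; ++k)
            s += ky[k] * (src[k][i] + src[-k][i]);
        dst[i] = s;
    }
}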
void plot(u32 w, u32 h, float x1, float y1, float x2, float y2, float dx, float dy, u32 max_iter = 4096) { assert(w % 8 == 0); // AVX Constants float const constants[] { x1, y1, dx, dy, 1.0f, 4.0f }; __m256 const vx1 = _mm256_broadcast_ss(constants); __m256 const vy1 = _mm256_broadcast_ss(constants + 1); __m256 const vdx = _mm256_broadcast_ss(constants + 2); __m256 const vdy = _mm256_broadcast_ss(constants + 3); __m256 const v1 = _mm256_broadcast_ss(constants + 4); __m256 const v4 = _mm256_broadcast_ss(constants + 5); // Start timing std::chrono::time_point<std::chrono::high_resolution_clock> t1, t2; std::chrono::duration<double> dt; t1 = std::chrono::high_resolution_clock::now(); // Zero line counter __m256 vj = _mm256_xor_ps(v1, v1); for (u32 j = 0; j < h; j++) { for (u32 i = 0; i < w; i += 8) { // Fill column counter float const vi_[8] { i+0.f, i+1.f, i+2.f, i+3.f, i+4.f, i+5.f, i+6.f, i+7.f }; __m256 vi = _mm256_load_ps(vi_); // Compute start point __m256 vx0 = _mm256_mul_ps(vi, vdx); vx0 = _mm256_add_ps(vx0, vx1); __m256 vy0 = _mm256_mul_ps(vj, vdy); vy0 = _mm256_add_ps(vy0, vy1); __m256 vx = vx0; __m256 vy = vy0; __m256 vcount = _mm256_xor_ps(v1, v1); // Zero iteration counter u32 iter = 0; u8 no_overflow = 0; do { // Compute products __m256 vxx = _mm256_mul_ps(vx, vx); __m256 vyy = _mm256_mul_ps(vy, vy); // Check termination condition __m256 vtmp = _mm256_add_ps(vxx, vyy); vtmp = _mm256_cmp_ps(vtmp, v4, _CMP_LT_OQ); no_overflow = _mm256_movemask_ps(vtmp) & 0xff; // Accumulate iteration counter vtmp = _mm256_and_ps(vtmp, v1); vcount = _mm256_add_ps(vcount, vtmp); // Step vtmp = _mm256_mul_ps(vx, vy); vtmp = _mm256_add_ps(vtmp, vtmp); vy = _mm256_add_ps(vtmp, vy0); vtmp = _mm256_sub_ps(vxx, vyy); vx = _mm256_add_ps(vtmp, vx0); ++iter; } while (no_overflow && (iter < max_iter)); for (u32 k = 0; k < 8; k++) { u32 n = ((float *) &vcount)[k] + 0.5f; if (n == max_iter) n = 0; char c = ' '; if (n > 0) { static char const charset[] = ".,c8M@jawrpogOQEPGJ"; c = charset[n % (sizeof(charset) - 1)]; } attron(COLOR_PAIR((n % 7) + 1)); addch(c); attroff(COLOR_PAIR((n % 7) + 1)); if (i + k + 1 == w) addch('\n'); } } // Increment line counter vj = _mm256_add_ps(vj, v1); } // End timing t2 = std::chrono::high_resolution_clock::now(); dt = t2 - t1; std::string info = std::to_string(dt.count() * 1000.0) + "ms"; attron(COLOR_PAIR(1)); printw(info.c_str()); attroff(COLOR_PAIR(1)); }
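/* Scalar sketch of the per-pixel escape-time iteration that the 8-wide loop above
 * performs for one lane (display and colour handling omitted). */
static unsigned mandel_iters(float x0, float y0, unsigned max_iter)
{
    float x = x0, y = y0;
    unsigned n = 0;
    while (n < max_iter && x * x + y * y < 4.0f) {
        const float xx = x * x - y * y + x0;  /* Re(z^2 + c) */
        y = 2.0f * x * y + y0;                /* Im(z^2 + c) */
        x = xx;
        ++n;
    }
    return n;
}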
float nv_vector_norm(const nv_matrix_t *vec, int vec_m) { #if NV_ENABLE_AVX { NV_ALIGNED(float, mm[8], 32); __m256 x, u; int n; int pk_lp = (vec->n & 0xfffffff8); float dp = 0.0f; u = _mm256_setzero_ps(); for (n = 0; n < pk_lp; n += 8) { x = _mm256_load_ps(&NV_MAT_V(vec, vec_m, n)); u = _mm256_add_ps(u, _mm256_mul_ps(x, x)); } _mm256_store_ps(mm, u); dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7]; for (n = pk_lp; n < vec->n; ++n) { dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n); } if (dp > 0.0f) { return sqrtf(dp); } return 0.0f; } #elif NV_ENABLE_SSE2 { NV_ALIGNED(float, mm[4], 16); __m128 x, u; int n; int pk_lp = (vec->n & 0xfffffffc); float dp = 0.0f; u = _mm_setzero_ps(); for (n = 0; n < pk_lp; n += 4) { x = _mm_load_ps(&NV_MAT_V(vec, vec_m, n)); u = _mm_add_ps(u, _mm_mul_ps(x, x)); } _mm_store_ps(mm, u); dp = mm[0] + mm[1] + mm[2] + mm[3]; for (n = pk_lp; n < vec->n; ++n) { dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n); } if (dp > 0.0f) { return sqrtf(dp); } return 0.0f; } #else { int n; float dp = 0.0f; for (n = 0; n < vec->n; ++n) { dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n); } if (dp > 0.0f) { return sqrtf(dp); } return 0.0f; } #endif }
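/* Alternative horizontal sum for the AVX branch above (a sketch; the store-to-array
 * reduction already used is equally valid): add the two 128-bit halves, then fold
 * the remaining 4 lanes with SSE shuffles. */
#include <immintrin.h>
static float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    __m128 s  = _mm_add_ps(lo, hi);              /* 4 partial sums */
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));      /* lanes 0+2, 1+3 */
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));  /* lane 0 + lane 1 */
    return _mm_cvtss_f32(s);
}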
static __m128i cielabv (union hvrgbpix rgb) { __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5); __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0); __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0); __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0); __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]); __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]); __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]); __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]); __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]); __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v)); xvxyz[0] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO))); xvxyz[1] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO))); __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]); __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]); #ifdef __AVX__ __m256 vlab, vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], 0}, vxyz2 = {0, cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]}; vlab = _mm256_sub_ps(vxyz,vxyz2); vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0)); vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0)); vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64)); vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO); __m256i vlabi = _mm256_cvtps_epi32(vlab); return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]); #else __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], 0}; __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], 0}; vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3))); vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0)); vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0)); vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64)); vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO); vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3))); vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0)); vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0)); vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64)); vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO); return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh)); #endif }
float tricub_x86_f(float *src, float *abcd, float x, float y){ float *s; float x0, x1, x2, x3, y0, y1, y2, y3; float dst[4]; #if defined(__AVX2__) && defined(__x86_64__) __m256 v1, v2, v3, v4; __m256 va, vb, vc, vd; __m128 va4, vb4, vc4, vd4; __m128 v128a, v128b; __m128 vy0, vy1, vy2, vy3; #else int i, ni2, ni3, ninj2, ninj3; float va4[4], vb4[4], vc4[4], vd4[4]; ninj2 = ninj + ninj; ninj3 = ninj2 + ninj; ni2 = ni + ni; ni3 = ni2 + ni; #endif #if defined(__AVX2__) && defined(__x86_64__) // ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ==== va = _mm256_broadcast_ss(abcd); // promote constants to vectors vb = _mm256_broadcast_ss(abcd+1); vc = _mm256_broadcast_ss(abcd+2); vd = _mm256_broadcast_ss(abcd+3); s = src; // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3) v128a = _mm_loadu_ps(s); // Z0 row 0 v1 = _mm256_insertf128_ps(v1,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z0 row 1 v1 = _mm256_insertf128_ps(v1,v128b,1); v1 = _mm256_mul_ps(v1,va); // v1 = v1*va s += ninj; v128a = _mm_loadu_ps(s); // Z1 row 0 v2 = _mm256_insertf128_ps(v2,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z1 row 1 v2 = _mm256_insertf128_ps(v2,v128b,1); v1 = _mm256_fmadd_ps(v2,vb,v1); // v1 += v2*vb s += ninj; v128a = _mm_loadu_ps(s); // Z2 row 0 v3 = _mm256_insertf128_ps(v3,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z2 row 1 v3 = _mm256_insertf128_ps(v3,v128b,1); v1 = _mm256_fmadd_ps(v3,vc,v1); // v1 += v3*vc s += ninj; v128a = _mm_loadu_ps(s); // Z3 row 0 v4 = _mm256_insertf128_ps(v4,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z3 row 1 v4 = _mm256_insertf128_ps(v4,v128b,1); v1 = _mm256_fmadd_ps(v4,vd,v1); // v1 += v4*vd // split vector of length 8 into 2 vectors of length 4 vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low) vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high) s = src + 2*ni; // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3) v128a = _mm_loadu_ps(s); // Z0 row 2 v1 = _mm256_insertf128_ps(v1,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z0 row 3 v1 = _mm256_insertf128_ps(v1,v128b,1); v1 = _mm256_mul_ps(v1,va); // v1 = v1*va s += ninj; v128a = _mm_loadu_ps(s); // Z1 row 2 v2 = _mm256_insertf128_ps(v2,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z1 row 3 v2 = _mm256_insertf128_ps(v2,v128b,1); v1 = _mm256_fmadd_ps(v2,vb,v1); // v1 += v2*vb s += ninj; v128a = _mm_loadu_ps(s); // Z2 row 2 v3 = _mm256_insertf128_ps(v3,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z2 row 3 v3 = _mm256_insertf128_ps(v3,v128b,1); v1 = _mm256_fmadd_ps(v3,vc,v1); // v1 += v3*vc s += ninj; v128a = _mm_loadu_ps(s); // Z3 row 2 v4 = _mm256_insertf128_ps(v4,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z3 row 3 v4 = _mm256_insertf128_ps(v4,v128b,1); v1 = _mm256_fmadd_ps(v4,vd,v1); // v1 += v4*vd // split vector of length 8 into 2 vectors of length 4 vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2 (v1 low) vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3 (v1 high) // ==== interpolation along Y, vector length is 4 (4 rows) ==== y0 = cm167*y*(y-one)*(y-two); y1 = cp5*(y+one)*(y-one)*(y-two); y2 = cm5*y*(y+one)*(y-two); y3 = cp167*y*(y+one)*(y-one); va4 = _mm_broadcast_ss(&y0); // promote constants to vectors vb4 = _mm_broadcast_ss(&y1); vc4 = _mm_broadcast_ss(&y2); vd4 = _mm_broadcast_ss(&y3); vy0 = _mm_mul_ps(vy0,va4); // vy0 * va4 vy0 = _mm_fmadd_ps(vy1,vb4,vy0); // += vy1 * vb4 vy0 = _mm_fmadd_ps(vy2,vc4,vy0); // += vy2 * vc4 vy0 = _mm_fmadd_ps(vy3,vd4,vy0); // += vy3 * vd4 _mm_storeu_ps(dst,vy0); // store 4 values along X #else y0 = cm167*y*(y-one)*(y-two); y1 = cp5*(y+one)*(y-one)*(y-two); y2 = cm5*y*(y+one)*(y-two); y3 = 
cp167*y*(y+one)*(y-one); for (i=0 ; i<4 ; i++){ va4[i] = src[i ]*abcd[0] + src[i +ninj]*abcd[1] + src[i +ninj2]*abcd[2] + src[i +ninj3]*abcd[3]; vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] + src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3]; vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] + src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3]; vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] + src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3]; dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3; } #endif // ==== interpolation along x, scalar ==== x0 = cm167*x*(x-one)*(x-two); x1 = cp5*(x+one)*(x-one)*(x-two); x2 = cm5*x*(x+one)*(x-two); x3 = cp167*x*(x+one)*(x-one); return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3); }
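/* The y0..y3 (and x0..x3) factors above are the cubic Lagrange basis functions for
 * nodes {-1, 0, 1, 2}, assuming cm167 = -1/6, cp5 = 1/2, cm5 = -1/2, cp167 = 1/6
 * (my reading of the constant names). Sketch of the 1-D weight computation: */
static void lagrange_cubic_weights(float t, float w[4])
{
    w[0] = -t * (t - 1.0f) * (t - 2.0f) / 6.0f;           /* node -1 */
    w[1] = (t + 1.0f) * (t - 1.0f) * (t - 2.0f) / 2.0f;   /* node  0 */
    w[2] = -t * (t + 1.0f) * (t - 2.0f) / 2.0f;           /* node  1 */
    w[3] = t * (t + 1.0f) * (t - 1.0f) / 6.0f;            /* node  2 */
}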
inline avx_m256_t newexp_ps(avx_m256_t x) { avx_m256_t one = _ps_1; avx_m256_t zero = _ps_0; x = _mm256_min_ps(x, _ps_exp_hi); x = _mm256_max_ps(x, _ps_exp_lo); avx_m256_t temp_2 = _mm256_mul_ps(x, _ps_cephes_LOG2EF); temp_2 = _mm256_add_ps(temp_2, _ps_0p5); avx_m256i_t emm0 = _mm256_cvttps_epi32(temp_2); avx_m256_t temp_1 = _mm256_cvtepi32_ps(emm0); avx_m256_t temp_3 = _mm256_sub_ps(temp_1, temp_2); avx_m256_t mask = _mm256_cmp_ps(temp_3, zero, _CMP_GT_OQ); mask = _mm256_and_ps(mask, one); temp_2 = _mm256_sub_ps(temp_1, mask); emm0 = _mm256_cvttps_epi32(temp_2); temp_1 = _mm256_mul_ps(temp_2, _ps_cephes_exp_C12); x = _mm256_sub_ps(x, temp_1); avx_m256_t x2 = _mm256_mul_ps(x, x); avx_m256_t x3 = _mm256_mul_ps(x2, x); avx_m256_t x4 = _mm256_mul_ps(x2, x2); temp_1 = _mm256_add_ps(x, one); temp_2 = _mm256_mul_ps(x2, _ps_cephes_exp_p5); temp_3 = _mm256_mul_ps(x3, _ps_cephes_exp_p4); temp_1 = _mm256_add_ps(temp_1, temp_2); temp_2 = _mm256_mul_ps(x3, _ps_cephes_exp_p0); temp_1 = _mm256_add_ps(temp_1, temp_3); avx_m256_t temp_4 = _mm256_mul_ps(x, _ps_cephes_exp_p2); temp_3 = _mm256_mul_ps(x2, _ps_cephes_exp_p1); emm0 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm0), _mm256_castsi256_ps(_pi32_0x7f))); temp_2 = _mm256_add_ps(temp_2, temp_3); temp_3 = _mm256_add_ps(temp_3, temp_4); //emm0 = _mm256_slli_epi32(emm0, 23); // convert emm0 into two 128-bit integer vectors // perform shift on both vectors // combine both vectors into 256-bit emm0 __m128i emm0hi = _mm256_extractf128_si256(emm0, 0); __m128i emm0lo = _mm256_extractf128_si256(emm0, 1); emm0hi = _mm_slli_epi32(emm0hi, 23); emm0lo = _mm_slli_epi32(emm0lo, 23); emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0); emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1); avx_m256_t pow2n = _mm256_castsi256_ps(emm0); temp_2 = _mm256_add_ps(temp_2, temp_3); temp_2 = _mm256_mul_ps(temp_2, x4); avx_m256_t y = _mm256_add_ps(temp_1, temp_2); y = _mm256_mul_ps(y, pow2n); return y; } // newexp_ps()
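/* Scalar sketch of the Cephes-style scheme the routine above follows (hedged: the
 * _ps_cephes_* constants are assumed to be the standard Cephes expf values):
 * n = round(x*log2(e)); reduce x by n*ln2 in two parts; evaluate a degree-5
 * polynomial for e^r on the reduced argument; scale by 2^n via the exponent field. */
#include <math.h>
static float exp_ref(float x)
{
    const float n = floorf(x * 1.44269504088896341f + 0.5f);     /* round(x*log2(e)) */
    const float r = x - n * 0.693359375f - n * -2.12194440e-4f;  /* two-part ln2 */
    float y = 1.9875691500e-4f;
    y = y * r + 1.3981999507e-3f;
    y = y * r + 8.3334519073e-3f;
    y = y * r + 4.1665795894e-2f;
    y = y * r + 1.6666665459e-1f;
    y = y * r + 5.0000001201e-1f;
    y = y * r * r + r + 1.0f;
    return ldexpf(y, (int)n);
}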
void run_dct(int width, int height, float *quant, float *input, int32_t *output) { float acosvals[8][8]; /* Calculating cosines is expensive, and there * are only 64 cosines that need to be calculated * so precompute them and cache. */ for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { if (j == 0) { acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5d) * j); } else { acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5d) * j); } } } /* Separate the parallel from the for, so each processor gets its * own copy of the buffers and variables. */ #pragma omp parallel { float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; avload[0] = sqrt(1.0 / 8.0); __m256 row0, row1, row2, row3, row4, row5, row6, row7; __m256 loaderlow, loaderhigh; __m256 temp; __m256 minus128 = _mm256_set1_ps(-128.0); __m256 avxcosloader, avxcos; float avxcosmover; __m256i integer; /* The DCT breaks the image into 8 by 8 blocks and then * transforms them into color frequencies. */ #pragma omp for for (int brow = 0; brow < height / 8; brow++) { for (int bcol = 0; bcol < width / 8; bcol++) { int head_pointer = bcol * 8 + brow * 8 * width; row0 = _mm256_setzero_ps(); row1 = _mm256_setzero_ps(); row2 = _mm256_setzero_ps(); row3 = _mm256_setzero_ps(); row4 = _mm256_setzero_ps(); row5 = _mm256_setzero_ps(); row6 = _mm256_setzero_ps(); row7 = _mm256_setzero_ps(); /* This pair of loops uses AVX instuctions to add the frequency * component from each pixel to all of the buckets at once. Allows * us to do the DCT on a block in 64 iterations of a loop rather * than 64 iterations of 64 iterations of a loop (all 64 pixels affect * all 64 frequencies) */ for (int x = 0; x < 8; x++) { for (int y = 0; y < 4; y++) { loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]); loaderlow = _mm256_add_ps(loaderlow, minus128); loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]); loaderhigh = _mm256_add_ps(loaderhigh, minus128); avxcos = _mm256_loadu_ps(&acosvals[x][0]); loaderlow = _mm256_mul_ps(loaderlow, avxcos); loaderhigh = _mm256_mul_ps(loaderhigh, avxcos); avxcosloader = _mm256_loadu_ps(&acosvals[y][0]); avxcosmover = avxcosloader[0]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row0 = _mm256_add_ps(row0, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row0 = _mm256_add_ps(row0, temp); avxcosmover = avxcosloader[1]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row1 = _mm256_add_ps(row1, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row1 = _mm256_sub_ps(row1, temp); avxcosmover = avxcosloader[2]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row2 = _mm256_add_ps(row2, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row2 = _mm256_add_ps(row2, temp); avxcosmover = avxcosloader[3]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row3 = _mm256_add_ps(row3, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row3 = _mm256_sub_ps(row3, temp); avxcosmover = avxcosloader[4]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row4 = _mm256_add_ps(row4, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row4 = _mm256_add_ps(row4, temp); avxcosmover = avxcosloader[5]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row5 = _mm256_add_ps(row5, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row5 = _mm256_sub_ps(row5, temp); avxcosmover = avxcosloader[6]; avxcos = _mm256_set1_ps(avxcosmover); temp = 
_mm256_mul_ps(loaderlow, avxcos); row6 = _mm256_add_ps(row6, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row6 = _mm256_add_ps(row6, temp); avxcosmover = avxcosloader[7]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row7 = _mm256_add_ps(row7, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row7 = _mm256_sub_ps(row7, temp); } } /* Each frequency stored as a float needs to be divided by * the quantization value, then rounded to the nearest integer. * Also changes the order of the values from pixel order to * each 8 by 8 block stored one after another. */ temp = _mm256_loadu_ps(&quant[0]); row0 = _mm256_div_ps(row0, temp); row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row0); _mm256_storeu_si256(output + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[8]); row1 = _mm256_div_ps(row1, temp); row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row1); _mm256_storeu_si256(output + 8 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[16]); row2 = _mm256_div_ps(row2, temp); row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row2); _mm256_storeu_si256(output + 16 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[24]); row3 = _mm256_div_ps(row3, temp); row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row3); _mm256_storeu_si256(output + 24 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[32]); row4 = _mm256_div_ps(row4, temp); row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row4); _mm256_storeu_si256(output + 32 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[40]); row5 = _mm256_div_ps(row5, temp); row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row5); _mm256_storeu_si256(output + 40 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[48]); row6 = _mm256_div_ps(row6, temp); row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row6); _mm256_storeu_si256(output + 48 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[56]); row7 = _mm256_div_ps(row7, temp); row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row7); _mm256_storeu_si256(output + 56 + (bcol + brow * (width / 8)) * 64, integer); } } } }
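/* Scalar reference for one 8x8 block (a sketch of the same level-shift + DCT-II +
 * quantisation pipeline; the indexing reflects my reading of the AVX code above,
 * which stores the block as out[v*8+u] with v the row frequency and u the column
 * frequency). */
#include <math.h>
#include <stdint.h>
static void dct8x8_ref(const float *in, int stride, const float *quant, int32_t *out)
{
    const double PI_ = 3.14159265358979323846;
    for (int v = 0; v < 8; ++v)           /* frequency along image rows    */
        for (int u = 0; u < 8; ++u) {     /* frequency along image columns */
            double s = 0.0;
            for (int y = 0; y < 8; ++y)
                for (int x = 0; x < 8; ++x) {
                    const double cu = (u == 0) ? sqrt(1.0 / 8.0)
                                               : 0.5 * cos(PI_ / 8.0 * (x + 0.5) * u);
                    const double cv = (v == 0) ? sqrt(1.0 / 8.0)
                                               : 0.5 * cos(PI_ / 8.0 * (y + 0.5) * v);
                    s += (in[y * stride + x] - 128.0) * cu * cv;
                }
            out[v * 8 + u] = (int32_t)lrint(s / quant[v * 8 + u]);
        }
}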
void AVXAccelerator::forcesFor(std::size_t i, std::vector<XY>& forces) const { const std::size_t objs = m_objects->size(); // AVX can be used for 8 element aligned packs. const std::size_t first_simd_idx = (i + 8) & (-8); const std::size_t last_simd_idx = objs & (-8); // pre AVX calculations (for elements before first_simd_idx) std::size_t j = i + 1; for(; j < std::min(first_simd_idx, objs); j++) { const XY force_vector = force(i, j); forces[i] += force_vector; forces[j] += -force_vector; } // AVX calculations (for elements between first_simd_idx and last_simd_idx) for(; j < last_simd_idx; j+=8) { const float G = 6.6732e-11; const float xi = m_objects->getX()[i]; const __m256 x0 = {xi, xi, xi, xi, xi, xi, xi, xi}; const __m256 x1234 = _mm256_load_ps( &m_objects->getX()[j] ); const float yi = m_objects->getY()[i]; const __m256 y0 = {yi, yi, yi, yi, yi, yi, yi, yi}; const __m256 y1234 = _mm256_load_ps( &m_objects->getY()[j] ); const float mi = m_objects->getMass()[i]; const __m256 m0 = {mi, mi, mi, mi, mi, mi, mi, mi}; const __m256 m1234 = _mm256_load_ps( &m_objects->getMass()[j] ); const __m256 dist = utils::distance(x0, y0, x1234, y1234); const __m256 dist2 = _mm256_mul_ps(dist, dist); const __m256 vG = {G, G, G, G, G, G, G, G}; const __m256 vG_m0 = _mm256_mul_ps(vG, m0); const __m256 m1234_dist2 = _mm256_div_ps(m1234, dist2); const __m256 Fg = _mm256_mul_ps(vG_m0, m1234_dist2); utils::vector force_vector = utils::unit_vector(x0, y0, x1234, y1234); force_vector.x = _mm256_mul_ps(force_vector.x, Fg); force_vector.y = _mm256_mul_ps(force_vector.y, Fg); for (int k = 0; k < 8; k++) { forces[i] += XY(force_vector.x[k], force_vector.y[k]); forces[j + k] += XY(-force_vector.x[k], -force_vector.y[k]); } } // post AVX calculations (for elements after last_simd_idx) for(; j < objs; j++) { const XY force_vector = force(i, j); forces[i] += force_vector; forces[j] += -force_vector; } }
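/* Scalar sketch of the pairwise term the AVX block above accumulates (assumption:
 * utils::distance / utils::unit_vector are the usual Euclidean helpers):
 * Fg = G * m_i * m_j / d^2, applied along the unit vector from body i to body j,
 * with the opposite force applied to j. */
#include <math.h>
static void gravity_ref(float xi, float yi, float mi,
                        float xj, float yj, float mj,
                        float *fx, float *fy)
{
    const float G  = 6.6732e-11f;
    const float dx = xj - xi, dy = yj - yi;
    const float d  = sqrtf(dx * dx + dy * dy);
    const float Fg = G * mi * mj / (d * d);
    *fx = Fg * dx / d;   /* force on i, pointing towards j */
    *fy = Fg * dy / d;
}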
template <bool inversion, bool align> void Convert(const uint8_t * src, const __m256 &_1_255, float * dst) { __m128i _src = Invert<inversion>(_mm_loadl_epi64((__m128i*)src)); Avx::Store<align>(dst, _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(_src)), _1_255)); }
void TransLut::process_plane_flt_any_avx2 (uint8_t *dst_ptr, const uint8_t *src_ptr, int stride_dst, int stride_src, int w, int h) { assert (dst_ptr != 0); assert (src_ptr != 0); assert (stride_dst != 0 || h == 1); assert (stride_src != 0 || h == 1); assert (w > 0); assert (h > 0); for (int y = 0; y < h; ++y) { const FloatIntMix * s_ptr = reinterpret_cast <const FloatIntMix *> (src_ptr); TD * d_ptr = reinterpret_cast < TD *> (dst_ptr); for (int x = 0; x < w; x += 8) { union { __m256i _vect; uint32_t _scal [8]; } index; __m256 lerp; TransLut_FindIndexAvx2 <M>::find_index (s_ptr + x, index._vect, lerp); #if 1 // Looks as fast as _mm256_set_ps // G++ complains about sizeof() as argument __m256 val = _mm256_i32gather_ps ( &_lut.use <float> (0), index._vect, 4 // 4 == sizeof (float) ); const __m256 va2 = _mm256_i32gather_ps ( &_lut.use <float> (1), index._vect, 4 // 4 == sizeof (float) ); #else __m256 val = _mm256_set_ps ( _lut.use <float> (index._scal [7] ), _lut.use <float> (index._scal [6] ), _lut.use <float> (index._scal [5] ), _lut.use <float> (index._scal [4] ), _lut.use <float> (index._scal [3] ), _lut.use <float> (index._scal [2] ), _lut.use <float> (index._scal [1] ), _lut.use <float> (index._scal [0] ) ); const __m256 va2 = _mm256_set_ps ( _lut.use <float> (index._scal [7] + 1), _lut.use <float> (index._scal [6] + 1), _lut.use <float> (index._scal [5] + 1), _lut.use <float> (index._scal [4] + 1), _lut.use <float> (index._scal [3] + 1), _lut.use <float> (index._scal [2] + 1), _lut.use <float> (index._scal [1] + 1), _lut.use <float> (index._scal [0] + 1) ); #endif const __m256 dif = _mm256_sub_ps (va2, val); val = _mm256_add_ps (val, _mm256_mul_ps (dif, lerp)); TransLut_store_avx2 (&d_ptr [x], val); } src_ptr += stride_src; dst_ptr += stride_dst; } _mm256_zeroupper (); // Back to SSE state }
iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA; nvdwtype = fr->ntype; vdwparam = fr->nbfp; vdwtype = mdatoms->typeA; vdwgridparam = fr->ljpme_c6grid; sh_lj_ewald = _mm256_set1_ps(fr->ic->sh_lj_ewald); ewclj = _mm256_set1_ps(fr->ewaldcoeff_lj); ewclj2 = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj)); sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald); beta = _mm256_set1_ps(fr->ic->ewaldcoeff_q); beta2 = _mm256_mul_ps(beta,beta); beta3 = _mm256_mul_ps(beta,beta2); ewtab = fr->ic->tabq_coul_FDV0; ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale); ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale); /* Avoid stupid compiler warnings */ jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0; j_coord_offsetA = 0; j_coord_offsetB = 0; j_coord_offsetC = 0;
iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA; nvdwtype = fr->ntype; vdwparam = fr->nbfp; vdwtype = mdatoms->typeA; sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald); beta = _mm256_set1_ps(fr->ic->ewaldcoeff); beta2 = _mm256_mul_ps(beta,beta); beta3 = _mm256_mul_ps(beta,beta2); ewtab = fr->ic->tabq_coul_FDV0; ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale); ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale); /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */ rcutoff_scalar = fr->rcoulomb; rcutoff = _mm256_set1_ps(rcutoff_scalar); rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff); rswitch_scalar = fr->rcoulomb_switch; rswitch = _mm256_set1_ps(rswitch_scalar); /* Setup switch parameters */ d_scalar = rcutoff_scalar-rswitch_scalar;
f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA; sh_ewald = _mm256_set1_ps(fr->ic->sh_ewald); beta = _mm256_set1_ps(fr->ic->ewaldcoeff); beta2 = _mm256_mul_ps(beta,beta); beta3 = _mm256_mul_ps(beta,beta2); ewtab = fr->ic->tabq_coul_FDV0; ewtabscale = _mm256_set1_ps(fr->ic->tabq_scale); ewtabhalfspace = _mm256_set1_ps(0.5/fr->ic->tabq_scale); /* Avoid stupid compiler warnings */ jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0; j_coord_offsetA = 0; j_coord_offsetB = 0; j_coord_offsetC = 0; j_coord_offsetD = 0; j_coord_offsetE = 0; j_coord_offsetF = 0; j_coord_offsetG = 0;
void softmax_finalize_block( float* &output_ptr, __m256 &acc_sum) { // We are not using table of registers and unroll pragmas // due to compiler which have issues with register allocation // and needs special, obvious treatment. Template immediate // arguments matching will remove all conditions in this code. __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9, acc10, acc11, acc12, acc13, acc14, acc15; // Load outputs and perform multiplication. if (T_SIZE >= 1) acc0 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 0 * C_batch_size), acc_sum); if (T_SIZE >= 2) acc1 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 1 * C_batch_size), acc_sum); if (T_SIZE >= 3) acc2 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 2 * C_batch_size), acc_sum); if (T_SIZE >= 4) acc3 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 3 * C_batch_size), acc_sum); if (T_SIZE >= 5) acc4 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 4 * C_batch_size), acc_sum); if (T_SIZE >= 6) acc5 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 5 * C_batch_size), acc_sum); if (T_SIZE >= 7) acc6 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 6 * C_batch_size), acc_sum); if (T_SIZE >= 8) acc7 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 7 * C_batch_size), acc_sum); if (T_SIZE >= 9) acc8 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 8 * C_batch_size), acc_sum); if (T_SIZE >= 10) acc9 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 9 * C_batch_size), acc_sum); if (T_SIZE >= 11) acc10 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 10 * C_batch_size), acc_sum); if (T_SIZE >= 12) acc11 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 11 * C_batch_size), acc_sum); if (T_SIZE >= 13) acc12 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 12 * C_batch_size), acc_sum); if (T_SIZE >= 14) acc13 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 13 * C_batch_size), acc_sum); if (T_SIZE >= 15) acc14 = _mm256_mul_ps(_mm256_loadu_ps(output_ptr + 14 * C_batch_size), acc_sum); // Store results. if (T_SIZE >= 1) _mm256_storeu_ps(output_ptr + 0 * C_batch_size, acc0); if (T_SIZE >= 2) _mm256_storeu_ps(output_ptr + 1 * C_batch_size, acc1); if (T_SIZE >= 3) _mm256_storeu_ps(output_ptr + 2 * C_batch_size, acc2); if (T_SIZE >= 4) _mm256_storeu_ps(output_ptr + 3 * C_batch_size, acc3); if (T_SIZE >= 5) _mm256_storeu_ps(output_ptr + 4 * C_batch_size, acc4); if (T_SIZE >= 6) _mm256_storeu_ps(output_ptr + 5 * C_batch_size, acc5); if (T_SIZE >= 7) _mm256_storeu_ps(output_ptr + 6 * C_batch_size, acc6); if (T_SIZE >= 8) _mm256_storeu_ps(output_ptr + 7 * C_batch_size, acc7); if (T_SIZE >= 9) _mm256_storeu_ps(output_ptr + 8 * C_batch_size, acc8); if (T_SIZE >= 10) _mm256_storeu_ps(output_ptr + 9 * C_batch_size, acc9); if (T_SIZE >= 11) _mm256_storeu_ps(output_ptr + 10 * C_batch_size, acc10); if (T_SIZE >= 12) _mm256_storeu_ps(output_ptr + 11 * C_batch_size, acc11); if (T_SIZE >= 13) _mm256_storeu_ps(output_ptr + 12 * C_batch_size, acc12); if (T_SIZE >= 14) _mm256_storeu_ps(output_ptr + 13 * C_batch_size, acc13); if (T_SIZE >= 15) _mm256_storeu_ps(output_ptr + 14 * C_batch_size, acc14); output_ptr += C_batch_size*T_SIZE; }
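In the snippet above, T_SIZE and C_batch_size are presumably compile-time constants of the surrounding kernel (not shown here), and acc_sum has to hold the per-lane reciprocal of the accumulated sum of exponentials so that the multiplies turn each stored exp(x) into exp(x) / sum. A hedged caller sketch; the names, the assumption that exp(x) is already in the output buffer, and the call form are illustrative only.

void softmax_finalize(float *output, __m256 lane_sums)
{
    // 1 / sum, per lane, so the stored exponentials become normalised probabilities.
    __m256 acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), lane_sums);
    softmax_finalize_block(output, acc_sum);  // scales T_SIZE consecutive blocks in place
}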
void __hv_biquad_f_win32(SignalBiquad *o, hv_bInf_t *_bIn, hv_bInf_t *_bX0, hv_bInf_t *_bX1, hv_bInf_t *_bX2, hv_bInf_t *_bY1, hv_bInf_t *_bY2, hv_bOutf_t bOut) { hv_bInf_t bIn = *_bIn; hv_bInf_t bX0 = *_bX0; hv_bInf_t bX1 = *_bX1; hv_bInf_t bX2 = *_bX2; hv_bInf_t bY1 = *_bY1; hv_bInf_t bY2 = *_bY2; #else void __hv_biquad_f(SignalBiquad *o, hv_bInf_t bIn, hv_bInf_t bX0, hv_bInf_t bX1, hv_bInf_t bX2, hv_bInf_t bY1, hv_bInf_t bY2, hv_bOutf_t bOut) { #endif #if HV_SIMD_AVX __m256 a = _mm256_mul_ps(bIn, bX0); __m256 b = _mm256_mul_ps(o->xm1, bX1); __m256 c = _mm256_mul_ps(o->xm2, bX2); __m256 d = _mm256_add_ps(a, b); __m256 e = _mm256_add_ps(c, d); // bIn*bX0 + o->x1*bX1 + o->x2*bX2 float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0]; float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1]; float y2 = e[2] - y1*bY1[2] - y0*bY2[2]; float y3 = e[3] - y2*bY1[3] - y1*bY2[3]; float y4 = e[4] - y3*bY1[4] - y2*bY2[4]; float y5 = e[5] - y4*bY1[5] - y3*bY2[5]; float y6 = e[6] - y5*bY1[6] - y4*bY2[6]; float y7 = e[7] - y6*bY1[7] - y5*bY2[7]; o->xm2 = o->xm1; o->xm1 = bIn; o->ym1 = y7; o->ym2 = y6; *bOut = _mm256_set_ps(y7, y6, y5, y4, y3, y2, y1, y0); #elif HV_SIMD_SSE __m128 a = _mm_mul_ps(bIn, bX0); __m128 b = _mm_mul_ps(o->xm1, bX1); __m128 c = _mm_mul_ps(o->xm2, bX2); __m128 d = _mm_add_ps(a, b); __m128 e = _mm_add_ps(c, d); const float *const bbe = (float *) &e; const float *const bbY1 = (float *) &bY1; const float *const bbY2 = (float *) &bY2; float y0 = bbe[0] - o->ym1*bbY1[0] - o->ym2*bbY2[0]; float y1 = bbe[1] - y0*bbY1[1] - o->ym1*bbY2[1]; float y2 = bbe[2] - y1*bbY1[2] - y0*bbY2[2]; float y3 = bbe[3] - y2*bbY1[3] - y1*bbY2[3]; o->xm2 = o->xm1; o->xm1 = bIn; o->ym1 = y3; o->ym2 = y2; *bOut = _mm_set_ps(y3, y2, y1, y0); #elif HV_SIMD_NEON float32x4_t a = vmulq_f32(bIn, bX0); float32x4_t b = vmulq_f32(o->xm1, bX1); float32x4_t c = vmulq_f32(o->xm2, bX2); float32x4_t d = vaddq_f32(a, b); float32x4_t e = vaddq_f32(c, d); float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0]; float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1]; float y2 = e[2] - y1*bY1[2] - y0*bY2[2]; float y3 = e[3] - y2*bY1[3] - y1*bY2[3]; o->xm2 = o->xm1; o->xm1 = bIn; o->ym1 = y3; o->ym2 = y2; *bOut = (float32x4_t) {y0, y1, y2, y3}; #else const float y = bIn*bX0 + o->xm1*bX1 + o->xm2*bX2 - o->ym1*bY1 - o->ym2*bY2; o->xm2 = o->xm1; o->xm1 = bIn; o->ym2 = o->ym1; o->ym1 = y; *bOut = y; #endif }
nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; nvdwtype = fr->ntype; vdwparam = fr->nbfp; vdwtype = mdatoms->typeA; rcutoff_scalar = fr->rvdw; rcutoff = _mm256_set1_ps(rcutoff_scalar); rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff); sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6); rvdw = _mm256_set1_ps(fr->rvdw); /* Avoid stupid compiler warnings */ jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0; j_coord_offsetA = 0; j_coord_offsetB = 0; j_coord_offsetC = 0; j_coord_offsetD = 0; j_coord_offsetE = 0; j_coord_offsetF = 0; j_coord_offsetG = 0; j_coord_offsetH = 0;
/*! * \brief Multiply the two given vectors */ ETL_STATIC_INLINE(avx_simd_float) mul(avx_simd_float lhs, avx_simd_float rhs) { return _mm256_mul_ps(lhs.value, rhs.value); }
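A small usage sketch of the wrapper, assuming avx_simd_float exposes its register as .value (as the implementation above already does) and can be brace-constructed from a __m256; the loop and buffer names are illustrative.

void scale_inplace(float *data, const float *factors, size_t n)
{
    for (size_t i = 0; i + 8 <= n; i += 8) {
        const avx_simd_float a{_mm256_loadu_ps(data + i)};
        const avx_simd_float f{_mm256_loadu_ps(factors + i)};
        _mm256_storeu_ps(data + i, mul(a, f).value);   // lane-wise product
    }
}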