double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const
{
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the same calculation as the scalar loop after the SIMD section,
    // but does it on 8 atoms at a time using SIMD instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    float result[8] __attribute__((aligned (32)));  // one __m256 worth of partial sums; 32-byte aligned for _mm256_store_ps

    for (; atom_i < this->n-8; atom_i+=8)
    {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }

    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++)
    {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++)
    {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2)
        {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;

    return pe;
}
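// The kernel above uses _mm256_rcp_ps, which is only accurate to roughly 12 bits.
// A minimal sketch (not part of the original; helper name is mine) of one
// Newton-Raphson refinement step, r' = r * (2 - x*r), in case closer-to-full
// single precision were needed:
#include <immintrin.h>

static inline __m256 rcp_nr_ps(__m256 x)
{
    __m256 r   = _mm256_rcp_ps(x);                 /* ~12-bit reciprocal estimate */
    __m256 two = _mm256_set1_ps(2.0f);
    return _mm256_mul_ps(r, _mm256_sub_ps(two, _mm256_mul_ps(x, r)));
}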
/*---------------------------------------------------------------------------*/
__m256 TTriangle::THit::HitTest8(__m256 mask, const TPoint8& orig, const D3DXVECTOR3& d, HitResult8* result) const
{
    int u, v, w;
    w = ci;
    u = w == 0 ? 1 : 0;
    v = w == 2 ? 1 : 2;

    __m256 nu  = _mm256_broadcast_ss(&this->nu);
    __m256 np  = _mm256_broadcast_ss(&this->np);
    __m256 nv  = _mm256_broadcast_ss(&this->nv);
    __m256 pu  = _mm256_broadcast_ss(&this->pu);
    __m256 pv  = _mm256_broadcast_ss(&this->pv);
    __m256 e0u = _mm256_broadcast_ss(&this->e0u);
    __m256 e0v = _mm256_broadcast_ss(&this->e0v);
    __m256 e1u = _mm256_broadcast_ss(&this->e1u);
    __m256 e1v = _mm256_broadcast_ss(&this->e1v);

    __m256 ou = orig[u];
    __m256 ov = orig[v];
    __m256 ow = orig[w];
    __m256 du = _mm256_broadcast_ss(&d[u]);
    __m256 dv = _mm256_broadcast_ss(&d[v]);
    __m256 dw = _mm256_broadcast_ss(&d[w]);

    __m256 dett = np - (ou*nu + ov*nv + ow);
    __m256 det  = du*nu + dv*nv + dw;
    __m256 Du   = du*dett - (pu-ou)*det;
    __m256 Dv   = dv*dett - (pv-ov)*det;
    __m256 detu = (e1v*Du - e1u*Dv);
    __m256 detv = (e0u*Dv - e0v*Du);

    __m256 tmpdet0 = det - detu - detv;
    __m256 detMask = _mm256_xor_ps(_mm256_xor_ps(tmpdet0, detv) | _mm256_xor_ps(detv, detu), g_one8) > _mm256_setzero_ps();
    mask = mask & detMask;

    __m256 rdet = _mm256_rcp_ps(det);
    result->t = dett * rdet;
    result->u = detu * rdet;
    result->v = detv * rdet;

    return mask & (result->t > _mm256_setzero_ps());
}
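// Hedged usage note (mine, not part of the original class): the per-lane mask
// returned by HitTest8 can be collapsed to a scalar "did any ray hit?" test
// with movemask, one sign bit per lane.
#include <immintrin.h>

static inline int any_lane_set(__m256 mask)
{
    return _mm256_movemask_ps(mask) != 0;
}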
__m256 ori_to_bin_256(
  const __m256& ori,
  const int nbins)
{
  //! For convenience
  const __m256 x2PI  = _mm256_set1_ps(2 * M_PI);
  const __m256 xbins = _mm256_set1_ps(nbins);

  //! Get it positive
  const __m256 mask = _mm256_cmp_ps(ori, _mm256_setzero_ps(), _CMP_LT_OS);

  //! Get the value
  const __m256 val = _mm256_round_ps(applyMask256_ps(mask, ori + x2PI, ori) / x2PI * xbins + _mm256_set1_ps(0.5f), _MM_FROUND_TO_ZERO);

  //! Return the modulo of it
  return val - xbins * _mm256_round_ps(val / xbins, _MM_FROUND_TO_ZERO);
}
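// A scalar reference of the same binning, shown only for clarity (my sketch,
// assumed equivalent to the vector code above; the integer truncation plays the
// role of _MM_FROUND_TO_ZERO).
#include <math.h>

static inline int ori_to_bin(float ori, int nbins)
{
    if (ori < 0.0f)
        ori += 2.0f * (float)M_PI;                              /* get it positive   */
    int bin = (int)(ori / (2.0f * (float)M_PI) * nbins + 0.5f); /* nearest bin index */
    return bin % nbins;                                         /* wrap to [0,nbins) */
}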
float dot_product(const int N, const float *X, const int incX, const float *Y, const int incY)
{
    // Note: this assumes N is a multiple of 8, X and Y are 32-byte aligned,
    // and each 8-element block is contiguous (the stride is only applied
    // between blocks, not inside the vector loads).
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        __m256 xval = _mm256_load_ps(X);
        __m256 yval = _mm256_load_ps(Y);
        __m256 val  = _mm256_mul_ps(xval, yval);
        accum = _mm256_add_ps(val, accum);
    }

    // Reduce the values in accum into the smallest 32-bit subsection
    // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
    __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
        _mm256_extractf128_ps(accum, 1));

    // b0 b1 b2 b3 -> c0 c1 b2 b3
    accum2 = _mm_add_ps(accum2,
        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));

    // Add the high and low halves
    __m128 final_val = _mm_add_ss(_mm_insert_ps(accum2, accum2, 0x4e), accum2);
    return _mm_cvtss_f32(final_val);
}
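// A minimal alternative reduction (my sketch; helper name is not from the
// snippet above): sum the eight lanes of an __m256 with extract/shuffle
// instead of _mm_insert_ps.
#include <immintrin.h>

static inline float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    __m128 s  = _mm_add_ps(lo, hi);                  /* 8 lanes -> 4 */
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));          /* 4 lanes -> 2 */
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));   /* 2 lanes -> 1 */
    return _mm_cvtss_f32(s);
}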
// Rounding half away from zero (equivalent to round() from math.h)
// __m256 contains 8 floats, but to simplify the examples, only 4 will be shown
// Initial values to be used in the examples:
// [-12.49 -0.5 1.5 3.7]
static __m256 c63_mm256_roundhalfawayfromzero_ps(const __m256 initial)
{
  const __m256 sign_mask = _mm256_set1_ps(-0.f);
  const __m256 one_half  = _mm256_set1_ps(0.5f);
  const __m256 all_zeros = _mm256_setzero_ps();
  const __m256 pos_one   = _mm256_set1_ps(1.f);
  const __m256 neg_one   = _mm256_set1_ps(-1.f);

  // Creates a mask based on the sign of the floats, true for negative floats
  // Example: [true true false false]
  __m256 less_than_zero = _mm256_cmp_ps(initial, all_zeros, _CMP_LT_OQ);

  // Returns the integer part of the floats
  // Example: [-12.0 -0.0 1.0 3.0]
  __m256 without_fraction = _mm256_round_ps(initial, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));

  // Returns the fraction part of the floats
  // Example: [-0.49 -0.5 0.5 0.7]
  __m256 fraction = _mm256_sub_ps(initial, without_fraction);

  // Absolute values of the fractions
  // Example: [0.49 0.5 0.5 0.7]
  __m256 fraction_abs = _mm256_andnot_ps(sign_mask, fraction);

  // Compares abs(fractions) to 0.5, true if lower
  // Example: [true false false false]
  __m256 less_than_one_half = _mm256_cmp_ps(fraction_abs, one_half, _CMP_LT_OQ);

  // Blends 1.0 and -1.0 depending on the initial sign of the floats
  // Example: [-1.0 -1.0 1.0 1.0]
  __m256 signed_ones = _mm256_blendv_ps(pos_one, neg_one, less_than_zero);

  // Blends the previous result with zeros depending on the fractions that are lower than 0.5
  // Example: [0.0 -1.0 1.0 1.0]
  __m256 to_add = _mm256_blendv_ps(signed_ones, all_zeros, less_than_one_half);

  // Adds the previous result to the floats without fractions
  // Example: [-12.0 -1.0 2.0 4.0]
  return _mm256_add_ps(without_fraction, to_add);
}
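// An alternative formulation of the same rounding, added here as a hedged
// sketch (mine, not part of the original): truncate x + copysign(0.5, x).
// Inputs within one ULP of a .5 boundary may round differently because of the
// intermediate addition.
#include <immintrin.h>

static inline __m256 round_half_away_alt_ps(__m256 x)
{
    __m256 half = _mm256_set1_ps(0.5f);
    __m256 sign = _mm256_and_ps(x, _mm256_set1_ps(-0.0f));   /* sign bit of x */
    __m256 bias = _mm256_or_ps(half, sign);                  /* +/- 0.5       */
    return _mm256_round_ps(_mm256_add_ps(x, bias),
                           _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}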
template <bool align> SIMD_INLINE float SquaredDifferenceSum32f(const float * a, const float * b, size_t size)
{
    if(align)
        assert(Aligned(a) && Aligned(b));

    float sum = 0;
    size_t i = 0;
    size_t alignedSize = AlignLo(size, 8);
    if(alignedSize)
    {
        __m256 _sum = _mm256_setzero_ps();
        for(; i < alignedSize; i += 8)
        {
            __m256 _a = Avx::Load<align>(a + i);
            __m256 _b = Avx::Load<align>(b + i);
            __m256 _d = _mm256_sub_ps(_a, _b);
            _sum = _mm256_add_ps(_sum, _mm256_mul_ps(_d, _d));
        }
        sum += Avx::ExtractSum(_sum);
    }
    for(; i < size; ++i)
        sum += Simd::Square(a[i] - b[i]);
    return sum;
}
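// Hedged variant of the inner accumulation above, assuming FMA is available
// (helper name is mine and not part of the Simd library API): sum += d*d in a
// single fused multiply-add.
#include <immintrin.h>

static inline __m256 squared_difference_step_fma(const float * a, const float * b, __m256 sum)
{
    __m256 d = _mm256_sub_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b));
    return _mm256_fmadd_ps(d, d, sum);
}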
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const { float tBox = std::numeric_limits<float>::min(); if (!Intersects(ray, bb, tBox) || tBox > t) return nullptr; const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X); const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y); const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z); const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X); const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y); const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z); union { float dists[MAXSIZE]; __m256 distances[MAXSIZE / NROFLANES]; }; for (int i = 0; i < count; i++) { // Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position; const __m256 e1X = edge1X8[i]; const __m256 e1Y = edge1Y8[i]; const __m256 e1Z = edge1Z8[i]; // Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position; const __m256 e2X = edge2X8[i]; const __m256 e2Y = edge2Y8[i]; const __m256 e2Z = edge2Z8[i]; // Vector3F p = ray.Direction.Cross(e2); const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y)); const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z)); const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X)); // float det = e1.Dot(p); const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX), _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ))); // if (det > -EPSILON && det < EPSILON) // return false; __m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS), _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS)); // float invDet = 1 / det; const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det); // Vector3F r = ray.Origin - triangle.Vertices[0].Position; const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]); const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]); const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]); // float u = r.Dot(p) * invDet; const __m256 u = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rX, pX), _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ)))); // if (u < 0 || u > 1) // return false; mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS)); // Vector3F q = r.Cross(e1); const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y)); const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z)); const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X)); // float v = ray.Direction.Dot(q) * invDet; const __m256 v = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rayDirX, qX), _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ)))); // if (v < 0 || u + v > 1) // return false; mask = _mm256_and_ps(mask, _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS), _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS))); // float tt = e2.Dot(q) * invDet; const __m256 tt = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(e2X, qX), _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ)))); // if (tt > EPSILON) // { // t = tt; // return true; // } // // return false; distances[i] = _mm256_and_ps(tt, mask); } Triangle* triangle = nullptr; for (int i = 0; i < count * NROFLANES; i++) if (dists[i] < t && dists[i] > EPSILON) { t = dists[i]; triangle = triangles[i]; } return triangle; }
float nv_vector_norm(const nv_matrix_t *vec, int vec_m) { #if NV_ENABLE_AVX { NV_ALIGNED(float, mm[8], 32); __m256 x, u; int n; int pk_lp = (vec->n & 0xfffffff8); float dp = 0.0f; u = _mm256_setzero_ps(); for (n = 0; n < pk_lp; n += 8) { x = _mm256_load_ps(&NV_MAT_V(vec, vec_m, n)); u = _mm256_add_ps(u, _mm256_mul_ps(x, x)); } _mm256_store_ps(mm, u); dp = mm[0] + mm[1] + mm[2] + mm[3] + mm[4] + mm[5] + mm[6] + mm[7]; for (n = pk_lp; n < vec->n; ++n) { dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n); } if (dp > 0.0f) { return sqrtf(dp); } return 0.0f; } #elif NV_ENABLE_SSE2 { NV_ALIGNED(float, mm[4], 16); __m128 x, u; int n; int pk_lp = (vec->n & 0xfffffffc); float dp = 0.0f; u = _mm_setzero_ps(); for (n = 0; n < pk_lp; n += 4) { x = _mm_load_ps(&NV_MAT_V(vec, vec_m, n)); u = _mm_add_ps(u, _mm_mul_ps(x, x)); } _mm_store_ps(mm, u); dp = mm[0] + mm[1] + mm[2] + mm[3]; for (n = pk_lp; n < vec->n; ++n) { dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n); } if (dp > 0.0f) { return sqrtf(dp); } return 0.0f; } #else { int n; float dp = 0.0f; for (n = 0; n < vec->n; ++n) { dp += NV_MAT_V(vec, vec_m, n) * NV_MAT_V(vec, vec_m, n); } if (dp > 0.0f) { return sqrtf(dp); } return 0.0f; } #endif }
  { 1.1f, 1.1f, 1.1f, 1.1f, 1.1f, 1.1f, 1.1f, 1.1f }, // RCX + 352
  { 1.2f, 1.2f, 1.2f, 1.2f, 1.2f, 1.2f, 1.2f, 1.2f }, // RCX + 384
  { 1.3f, 1.3f, 1.3f, 1.3f, 1.3f, 1.3f, 1.3f, 1.3f }, // RCX + 416
  { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f, 1.4f, 1.4f, 1.4f }, // RCX + 480
  { 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f }, // RCX + 512
} VPU_ALIGN_SUFFIX(32);

// our assembler
vpu::IAssembler* a = g_lib->createAssembler();

// This example demonstrates a simple loop. The equivalent C++ code would look like:
#if 0
__m256 YMM0 = _mm256_setzero_ps();
__m256* R9 = (__m256*)(argument_data + 1);
for (int i = 0; i < 10; ++i)
{
  YMM0 = _mm256_add_ps(YMM0, R9[i]);
}
_mm256_store_ps(argument_data[0], YMM0);
#endif

// start assembling
a->begin();

// we'll accumulate the sum of the array in YMM0
a->setzero(vpu::YMM0);
void run_softmax_int32_float_work_item_batch8x(nn_workload_item *const work_item, uint16_t NoBatch8) { nn_workload_data_t *input_view = work_item->input[0]->output; const auto &arguments = work_item->arguments.forward_softmax_fixedpoint; const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p]; const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1; const auto batch_size_global = work_item->output->parent->lengths.t[NN_DATA_COORD_n]; const auto batch_size = 8; const auto num_full_blocks = output_width / C_max_acc; const auto partial_block_size = output_width % C_max_acc; const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x] * batch_size * NoBatch8; const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p] * batch_size * NoBatch8; const auto out_fraction = arguments.input_fraction; float * input_f = (float*)_mm_malloc(input_width * batch_size * sizeof(float), 64); float * output_f = (float*)_mm_malloc(output_width * batch_size * sizeof(float), 64); auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start + NoBatch8 * 8 * input_width]; //auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start]; auto shift = out_fraction; if (shift > 0) { for (uint32_t i = 0; i < input_width * batch_size; i++) input_f[i] = (float)(input_buffer[i]) / (1 << shift); } else if (shift < 0) { for (uint32_t i = 0; i < input_width* batch_size; i++) input_f[i] = (float)(input_buffer[i]) * (1 << -shift); } else { for (uint32_t i = 0; i < input_width* batch_size; i++) input_f[i] = (float)(input_buffer[i]); } __m256 acc_sum = _mm256_setzero_ps(); { auto input_buffer = input_f; //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width]; auto output_buffer = output_f; for (auto block = 0u; block < num_full_blocks; ++block) { // Run computation. 
softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum); } switch (partial_block_size) { case 0: break; case 1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break; case 2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break; case 3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break; case 4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break; case 5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break; case 6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break; case 7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break; case 8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break; case 9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break; case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break; case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break; case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break; case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break; case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break; default: NN_UNREACHABLE_CODE; } } acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum); { //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width]; auto output_buffer = output_f; for (auto block = 0u; block < num_full_blocks; ++block) { // Run computation. softmax_finalize_block<C_max_acc>(output_buffer, acc_sum); } switch (partial_block_size) { case 0: break; case 1: softmax_finalize_block< 1>(output_buffer, acc_sum); break; case 2: softmax_finalize_block< 2>(output_buffer, acc_sum); break; case 3: softmax_finalize_block< 3>(output_buffer, acc_sum); break; case 4: softmax_finalize_block< 4>(output_buffer, acc_sum); break; case 5: softmax_finalize_block< 5>(output_buffer, acc_sum); break; case 6: softmax_finalize_block< 6>(output_buffer, acc_sum); break; case 7: softmax_finalize_block< 7>(output_buffer, acc_sum); break; case 8: softmax_finalize_block< 8>(output_buffer, acc_sum); break; case 9: softmax_finalize_block< 9>(output_buffer, acc_sum); break; case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break; case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break; case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break; case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break; case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break; default: NN_UNREACHABLE_CODE; } } auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start]; for (auto itrW = 0; itrW < output_width; itrW++) for (auto itr8 = 0; itr8 < C_batch_size; itr8++) output_buffer[itr8 + itrW * batch_size_global + NoBatch8 * C_batch_size] = output_f[itr8 + itrW * C_batch_size]; _mm_free(input_f); _mm_free(output_f); }
void fastGEMM( const float* aptr, size_t astep, const float* bptr, size_t bstep, float* cptr, size_t cstep, int ma, int na, int nb ) { int n = 0; for( ; n <= nb - 16; n += 16 ) { for( int m = 0; m < ma; m += 4 ) { const float* aptr0 = aptr + astep*m; const float* aptr1 = aptr + astep*std::min(m+1, ma-1); const float* aptr2 = aptr + astep*std::min(m+2, ma-1); const float* aptr3 = aptr + astep*std::min(m+3, ma-1); float* cptr0 = cptr + cstep*m; float* cptr1 = cptr + cstep*std::min(m+1, ma-1); float* cptr2 = cptr + cstep*std::min(m+2, ma-1); float* cptr3 = cptr + cstep*std::min(m+3, ma-1); __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); for( int k = 0; k < na; k++ ) { __m256 a0 = _mm256_set1_ps(aptr0[k]); __m256 a1 = _mm256_set1_ps(aptr1[k]); __m256 a2 = _mm256_set1_ps(aptr2[k]); __m256 a3 = _mm256_set1_ps(aptr3[k]); __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n); __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8); d00 = _mm256_fmadd_ps(a0, b0, d00); d01 = _mm256_fmadd_ps(a0, b1, d01); d10 = _mm256_fmadd_ps(a1, b0, d10); d11 = _mm256_fmadd_ps(a1, b1, d11); d20 = _mm256_fmadd_ps(a2, b0, d20); d21 = _mm256_fmadd_ps(a2, b1, d21); d30 = _mm256_fmadd_ps(a3, b0, d30); d31 = _mm256_fmadd_ps(a3, b1, d31); } _mm256_storeu_ps(cptr0 + n, d00); _mm256_storeu_ps(cptr0 + n + 8, d01); _mm256_storeu_ps(cptr1 + n, d10); _mm256_storeu_ps(cptr1 + n + 8, d11); _mm256_storeu_ps(cptr2 + n, d20); _mm256_storeu_ps(cptr2 + n + 8, d21); _mm256_storeu_ps(cptr3 + n, d30); _mm256_storeu_ps(cptr3 + n + 8, d31); } } for( ; n < nb; n++ ) { for( int m = 0; m < ma; m++ ) { const float* aptr0 = aptr + astep*m; float* cptr0 = cptr + cstep*m; float d0 = 0.f; for( int k = 0; k < na; k++ ) d0 += aptr0[k]*bptr[k*bstep + n]; cptr0[n] = d0; } } _mm256_zeroupper(); }
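// A reduced sketch of the scheme used in fastGEMM above: one output row and
// eight columns, broadcasting A and streaming a row of B through FMA
// (function name and signature are illustrative only, not part of the original).
#include <immintrin.h>
#include <stddef.h>

static void gemm_row8(const float* aptr0, const float* bptr, size_t bstep,
                      float* cptr0, int na)
{
    __m256 acc = _mm256_setzero_ps();
    for (int k = 0; k < na; k++)
    {
        __m256 a0 = _mm256_set1_ps(aptr0[k]);            /* broadcast A(m,k)    */
        __m256 b0 = _mm256_loadu_ps(bptr + k*bstep);     /* B(k, n..n+7)        */
        acc = _mm256_fmadd_ps(a0, b0, acc);              /* C(m, n..n+7) += A*B */
    }
    _mm256_storeu_ps(cptr0, acc);
}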
void kernel_strmv_u_n_8_lib8(int kmax, float *A, float *x, float *y, int alg) { if(kmax<=0) return; const int lda = 8; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); int k; __m256 zeros, ax_temp, a_00, a_01, a_02, a_03, x_0, x_1, x_2, x_3, y_0, y_0_b, y_0_c, y_0_d, z_0; zeros = _mm256_setzero_ps(); y_0 = _mm256_setzero_ps(); y_0_b = _mm256_setzero_ps(); y_0_c = _mm256_setzero_ps(); y_0_d = _mm256_setzero_ps(); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); x_0 = _mm256_blend_ps( zeros, x_0, 0x01 ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); x_1 = _mm256_blend_ps( zeros, x_1, 0x03 ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); x_2 = _mm256_blend_ps( zeros, x_2, 0x07 ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); x_3 = _mm256_blend_ps( zeros, x_3, 0x0f ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); x_0 = _mm256_blend_ps( zeros, x_0, 0x1f ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); x_1 = _mm256_blend_ps( zeros, x_1, 0x3f ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); x_2 = _mm256_blend_ps( zeros, x_2, 0x7f ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; k=8; for(; k<kmax-7; k+=8) { __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; } for(; 
k<kmax-3; k+=4) { __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); x_2 = _mm256_broadcast_ss( &x[2] ); ax_temp = _mm256_mul_ps( a_02, x_2 ); y_0_c = _mm256_add_ps( y_0_c, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); x_3 = _mm256_broadcast_ss( &x[3] ); ax_temp = _mm256_mul_ps( a_03, x_3 ); y_0_d = _mm256_add_ps( y_0_d, ax_temp ); A += 4*lda; x += 4; } y_0 = _mm256_add_ps( y_0 , y_0_c ); y_0_b = _mm256_add_ps( y_0_b, y_0_d ); if(kmax%4>=2) { a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); x_1 = _mm256_broadcast_ss( &x[1] ); ax_temp = _mm256_mul_ps( a_01, x_1 ); y_0_b = _mm256_add_ps( y_0_b, ax_temp ); A += 2*lda; x += 2; } y_0 = _mm256_add_ps( y_0 , y_0_b ); if(kmax%2==1) { a_00 = _mm256_load_ps( &A[0+lda*0] ); x_0 = _mm256_broadcast_ss( &x[0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); /* A += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_ps(&y[0], y_0); } else if(alg==1) { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = _mm256_add_ps( z_0, y_0 ); _mm256_storeu_ps(&y[0], z_0); } else // alg==-1 { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = _mm256_sub_ps( z_0, y_0 ); _mm256_storeu_ps(&y[0], z_0); } }
void TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac) { assert (val_arr != 0); // Constants static const int mant_size = 23; static const int exp_bias = 127; static const uint32_t base = (exp_bias + LOGLUT_MIN_L2) << mant_size; static const float val_min = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2); // static const float val_max = float (int64_t (1) << LOGLUT_MAX_L2); static const int frac_size = mant_size - LOGLUT_RES_L2; static const uint32_t frac_mask = (1 << frac_size) - 1; const __m256 zero_f = _mm256_setzero_ps (); const __m256 one_f = _mm256_set1_ps (1); const __m256 frac_mul = _mm256_set1_ps (1.0f / (1 << frac_size)); const __m256 mul_eps = _mm256_set1_ps (1.0f / val_min); const __m256 mask_abs_f = _mm256_load_ps ( reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs) ); const __m256i zero_i = _mm256_setzero_si256 (); const __m256i mask_abs_epi32 = _mm256_set1_epi32 (0x7FFFFFFF); const __m256i one_epi32 = _mm256_set1_epi32 (1); const __m256i base_epi32 = _mm256_set1_epi32 (int (base)); const __m256i frac_mask_epi32 = _mm256_set1_epi32 (frac_mask); const __m256i val_min_epi32 = _mm256_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size); const __m256i val_max_epi32 = _mm256_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size); const __m256i index_max_epi32 = _mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2); const __m256i hsize_epi32 = _mm256_set1_epi32 (LOGLUT_HSIZE); const __m256i mirror_epi32 = _mm256_set1_epi32 (LOGLUT_HSIZE - 1); // It really starts here const __m256 val_f = _mm256_load_ps (reinterpret_cast <const float *> (val_arr)); const __m256 val_a = _mm256_and_ps (val_f, mask_abs_f); const __m256i val_i = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr)); const __m256i val_u = _mm256_and_si256 (val_i, mask_abs_epi32); // Standard path __m256i index_std = _mm256_sub_epi32 (val_u, base_epi32); index_std = _mm256_srli_epi32 (index_std, frac_size); index_std = _mm256_add_epi32 (index_std, one_epi32); __m256i frac_stdi = _mm256_and_si256 (val_u, frac_mask_epi32); __m256 frac_std = _mm256_cvtepi32_ps (frac_stdi); frac_std = _mm256_mul_ps (frac_std, frac_mul); // Epsilon path __m256 frac_eps = _mm256_max_ps (val_a, zero_f); frac_eps = _mm256_mul_ps (frac_eps, mul_eps); // Range cases const __m256i eps_flag_i = _mm256_cmpgt_epi32 (val_min_epi32, val_u); const __m256i std_flag_i = _mm256_cmpgt_epi32 (val_max_epi32, val_u); const __m256 eps_flag_f = _mm256_castsi256_ps (eps_flag_i); const __m256 std_flag_f = _mm256_castsi256_ps (std_flag_i); __m256i index_tmp = fstb::ToolsAvx2::select (std_flag_i, index_std, index_max_epi32); __m256 frac_tmp = fstb::ToolsAvx2::select (std_flag_f, frac_std, one_f); index_tmp = fstb::ToolsAvx2::select (eps_flag_i, zero_i, index_tmp); frac_tmp = fstb::ToolsAvx2::select (eps_flag_f, frac_eps, frac_tmp); // Sign cases const __m256i neg_flag_i = _mm256_srai_epi32 (val_i, 31); const __m256 neg_flag_f = _mm256_castsi256_ps (neg_flag_i); const __m256i index_neg = _mm256_sub_epi32 (mirror_epi32, index_tmp); const __m256i index_pos = _mm256_add_epi32 (hsize_epi32, index_tmp); const __m256 frac_neg = _mm256_sub_ps (one_f, frac_tmp); index = fstb::ToolsAvx2::select (neg_flag_i, index_neg, index_pos); frac = fstb::ToolsAvx2::select (neg_flag_f, frac_neg, frac_tmp); }
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn, void (*sendMessage)(HvBase *, int, const HvMessage *)) { #if HV_SIMD_AVX _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { int n4 = o->windowSize & ~HV_N_SIMD_MASK; __m256 sum = _mm256_setzero_ps(); while (n4) { __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD); __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD); x = _mm256_mul_ps(x, h); sum = _mm256_add_ps(sum, x); n4 -= HV_N_SIMD; } sum = _mm256_hadd_ps(sum,sum); // horizontal sum sum = _mm256_hadd_ps(sum,sum); sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer } #elif HV_SIMD_SSE _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { int n4 = o->windowSize & ~HV_N_SIMD_MASK; __m128 sum = _mm_setzero_ps(); while (n4) { __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD); __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD); x = _mm_mul_ps(x, h); sum = _mm_add_ps(sum, x); n4 -= HV_N_SIMD; } sum = _mm_hadd_ps(sum,sum); // horizontal sum sum = _mm_hadd_ps(sum,sum); sEnv_sendMessage(_c, o, sum[0], sendMessage); } #elif HV_SIMD_NEON vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { int n4 = o->windowSize & ~HV_N_SIMD_MASK; float32x4_t sum = vdupq_n_f32(0.0f); while (n4) { float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD); float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD); x = vmulq_f32(x, h); sum = vaddq_f32(sum, x); n4 -= HV_N_SIMD; } sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage); } #else // HV_SIMD_NONE o->buffer[o->numSamplesInBuffer] = (bIn*bIn); o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { float sum = 0.0f; for (int i = 0; i < o->windowSize; ++i) { sum += (o->hanningWeights[i] * o->buffer[i]); } sEnv_sendMessage(_c, o, sum, sendMessage); } #endif }
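// Why the AVX branch above adds sum[0] and sum[4]: _mm256_hadd_ps works within
// each 128-bit lane, so after two hadds the low-lane total sits in element 0
// and the high-lane total in element 4. A hedged helper (mine) that finishes
// the reduction without indexing into the vector:
#include <immintrin.h>

static inline float hadd_reduce_ps(__m256 sum)
{
    sum = _mm256_hadd_ps(sum, sum);                  /* pairwise sums per lane */
    sum = _mm256_hadd_ps(sum, sum);                  /* lane totals in 0 and 4 */
    __m128 lo = _mm256_castps256_ps128(sum);
    __m128 hi = _mm256_extractf128_ps(sum, 1);
    return _mm_cvtss_f32(_mm_add_ss(lo, hi));
}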
DLL_LOCAL void fname(const float *a00, const float *a01, const float *a02, const float *a11, const float *a12, const float *a22, float *ev0, float *ev1, float *ev2, const size_t len) { const size_t avx_end = len & ~7; __m256 v_inv3 = _mm256_set1_ps(1.0 / 3.0); __m256 v_root3 = _mm256_sqrt_ps(_mm256_set1_ps(3.0)); __m256 two = _mm256_set1_ps(2.0); __m256 half = _mm256_set1_ps(0.5); __m256 zero = _mm256_setzero_ps(); for (size_t i = 0; i < avx_end; i += 8) { __m256 v_a00 = _mm256_loadu_ps(a00 + i); __m256 v_a01 = _mm256_loadu_ps(a01 + i); __m256 v_a02 = _mm256_loadu_ps(a02 + i); __m256 v_a11 = _mm256_loadu_ps(a11 + i); __m256 v_a12 = _mm256_loadu_ps(a12 + i); __m256 v_a22 = _mm256_loadu_ps(a22 + i); __m256 c0 = _avx_sub(_avx_sub(_avx_sub(_avx_add(_avx_mul(_avx_mul(v_a00, v_a11), v_a22), _avx_mul(_avx_mul(_avx_mul(two, v_a01), v_a02), v_a12)), _avx_mul(_avx_mul(v_a00, v_a12), v_a12)), _avx_mul(_avx_mul(v_a11, v_a02), v_a02)), _avx_mul(_avx_mul(v_a22, v_a01), v_a01)); __m256 c1 = _avx_sub(_avx_add(_avx_sub(_avx_add(_avx_sub(_avx_mul(v_a00, v_a11), _avx_mul(v_a01, v_a01)), _avx_mul(v_a00, v_a22)), _avx_mul(v_a02, v_a02)), _avx_mul(v_a11, v_a22)), _avx_mul(v_a12, v_a12)); __m256 c2 = _avx_add(_avx_add(v_a00, v_a11), v_a22); __m256 c2Div3 = _avx_mul(c2, v_inv3); __m256 aDiv3 = _avx_mul(_avx_sub(c1, _avx_mul(c2, c2Div3)), v_inv3); aDiv3 = _mm256_min_ps(aDiv3, zero); __m256 mbDiv2 = _avx_mul(half, _avx_add(c0, _avx_mul(c2Div3, _avx_sub(_avx_mul(_avx_mul(two, c2Div3), c2Div3), c1)))); __m256 q = _avx_add(_avx_mul(mbDiv2, mbDiv2), _avx_mul(_avx_mul(aDiv3, aDiv3), aDiv3)); q = _mm256_min_ps(q, zero); __m256 magnitude = _mm256_sqrt_ps(_avx_neg(aDiv3)); __m256 angle = _avx_mul(atan2_256_ps(_mm256_sqrt_ps(_avx_neg(q)), mbDiv2), v_inv3); __m256 cs, sn; sincos256_ps(angle, &sn, &cs); __m256 r0 = _avx_add(c2Div3, _avx_mul(_avx_mul(two, magnitude), cs)); __m256 r1 = _avx_sub(c2Div3, _avx_mul(magnitude, _avx_add(cs, _avx_mul(v_root3, sn)))); __m256 r2 = _avx_sub(c2Div3, _avx_mul(magnitude, _avx_sub(cs, _avx_mul(v_root3, sn)))); __m256 v_r0_tmp = _mm256_min_ps(r0, r1); __m256 v_r1_tmp = _mm256_max_ps(r0, r1); __m256 v_r0 = _mm256_min_ps(v_r0_tmp, r2); __m256 v_r2_tmp = _mm256_max_ps(v_r0_tmp, r2); __m256 v_r1 = _mm256_min_ps(v_r1_tmp, v_r2_tmp); __m256 v_r2 = _mm256_max_ps(v_r1_tmp, v_r2_tmp); _mm256_storeu_ps(ev2 + i, v_r0); _mm256_storeu_ps(ev1 + i, v_r1); _mm256_storeu_ps(ev0 + i, v_r2); } for (size_t i = avx_end; i < len; ++i) { float inv3 = 1.0 / 3.0; float root3 = sqrt(3.0); float c0 = a00[i] * a11[i] * a22[i] + 2.0 * a01[i] * a02[i] * a12[i] - a00[i] * a12[i] * a12[i] - a11[i] * a02[i] * a02[i] - a22[i] * a01[i] * a01[i]; float c1 = a00[i] * a11[i] - a01[i] * a01[i] + a00[i] * a22[i] - a02[i] * a02[i] + a11[i] * a22[i] - a12[i] * a12[i]; float c2 = a00[i] + a11[i] + a22[i]; float c2Div3 = c2 * inv3; float aDiv3 = (c1 - c2 * c2Div3) * inv3; if (aDiv3 > 0.0) aDiv3 = 0.0; float mbDiv2 = 0.5 * (c0 + c2Div3 * (2.0 * c2Div3 * c2Div3 - c1)); float q = mbDiv2 * mbDiv2 + aDiv3 * aDiv3 * aDiv3; if (q > 0.0) q = 0.0; float magnitude = sqrt(-aDiv3); float angle = atan2(sqrt(-q), mbDiv2) * inv3; float cs = cos(angle); float sn = sin(angle); float r0 = (c2Div3 + 2.0 * magnitude * cs); float r1 = (c2Div3 - magnitude * (cs + root3 * sn)); float r2 = (c2Div3 - magnitude * (cs - root3 * sn)); if (r0 < r1) swap(&r0, &r1); if (r0 < r2) swap(&r0, &r2); if (r1 < r2) swap(&r1, &r2); ev0[i] = r0; ev1[i] = r1; ev2[i] = r2; } }
while( i-- ) { sad += Math::abs( *src1++ - *src2++ ); } // Zero upper half of AVX registers to avoid AVX-SSE transition penalties _mm256_zeroupper( ); return sad; } float SIMDAVX::SSD( const float* src1, const float* src2, const size_t n ) const { size_t i = n >> 3; __m256 a, b, diff, sqr, sum; sum = _mm256_setzero_ps( ); if( ( ( size_t ) src1 | ( size_t ) src2 ) & 0x1f ) { while( i-- ) { a = _mm256_loadu_ps( src1 ); b = _mm256_loadu_ps( src2 ); diff = _mm256_sub_ps( a, b ); sqr = _mm256_mul_ps( diff, diff ); sum = _mm256_add_ps( sum, sqr ); src1 += 8; src2 += 8; } } else { while( i-- ) { a = _mm256_load_ps( src1 ); b = _mm256_load_ps( src2 ); diff = _mm256_sub_ps( a, b ); sqr = _mm256_mul_ps( diff, diff );
void calc_dnn_fma(float *dst, float *src, float *w, float *b, int out, int in, float *fstore) { #ifdef HAS_SIMD_FMA float *s; int i, j; int n = in / 8; for (i = 0; i + 3 < out; i += 4) { float *w2, *w3, *w4; __m256 x1 = _mm256_setzero_ps(); __m256 x2 = _mm256_setzero_ps(); __m256 x3 = _mm256_setzero_ps(); __m256 x4 = _mm256_setzero_ps(); w2 = w + in; w3 = w2 + in; w4 = w3 + in; s = src; for (j = 0; j < n; j++) { __m256 vs = _mm256_load_ps(s); __m256 vw1 = _mm256_load_ps(w); __m256 vw2 = _mm256_load_ps(w2); __m256 vw3 = _mm256_load_ps(w3); __m256 vw4 = _mm256_load_ps(w4); x1 = _mm256_fmadd_ps(vs, vw1, x1); x2 = _mm256_fmadd_ps(vs, vw2, x2); x3 = _mm256_fmadd_ps(vs, vw3, x3); x4 = _mm256_fmadd_ps(vs, vw4, x4); s += 8; w += 8; w2 += 8; w3 += 8; w4 += 8; } _mm256_store_ps(fstore, x1); *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++); _mm256_store_ps(fstore, x2); *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++); _mm256_store_ps(fstore, x3); *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++); _mm256_store_ps(fstore, x4); *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++); w = w4; } /* process last <4 nodes */ for (; i < out; i++) { __m256 x = _mm256_setzero_ps(); s = src; for (j = 0; j < n; j++) { __m256 vs = _mm256_load_ps(s); __m256 v = _mm256_load_ps(w); x = _mm256_fmadd_ps(vs, v, x); s += 8; w += 8; } _mm256_store_ps(fstore, x); *(dst++) = fstore[0] + fstore[1] + fstore[2] + fstore[3] + fstore[4] + fstore[5] + fstore[6] + fstore[7] + *(b++); } #endif /* HAS_SIMD_FMA */ }
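// The FMA kernel above assumes `in` is a multiple of 8. A hedged sketch
// (mine; requires AVX2 for the integer compare) of a masked tail load that
// zero-fills the lanes past the end instead of over-reading:
#include <immintrin.h>

static inline __m256 load_tail_ps(const float *p, int rem /* 1..7 */)
{
    __m256i idx  = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i keep = _mm256_cmpgt_epi32(_mm256_set1_epi32(rem), idx); /* lane < rem */
    return _mm256_maskload_ps(p, keep);              /* masked-off lanes read as 0 */
}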
__m256 mm256_cos_ps(__m256 x) { __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y; __m256i emm0, emm2; /* take the absolute value */ x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI); /* store the integer part of y in mm0 */ emm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1); emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1); y = _mm256_cvtepi32_ps(emm2); emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2); /* get the swap sign flag */ emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4); emm0 = _mm256_slli_epi32(emm0, 29); /* get the polynom selection mask */ emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2); emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); __m256 sign_bit = _mm256_castsi256_ps(emm0); __m256 poly_mask = _mm256_castsi256_ps(emm2); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m256*)m256_ps_minus_cephes_DP1; xmm2 = *(__m256*)m256_ps_minus_cephes_DP2; xmm3 = *(__m256*)m256_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(__m256*)m256_ps_coscof_p0; __m256 z = _mm256_mul_ps(x,x); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(__m256*)m256_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m256 y2 = *(__m256*)m256_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm256_and_ps(xmm3, y2); //, xmm3); y = _mm256_andnot_ps(xmm3, y); y = _mm256_add_ps(y,y2); /* update the sign */ y = _mm256_xor_ps(y, sign_bit); _mm256_zeroupper(); return y; }
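// Hedged usage sketch for mm256_cos_ps above: evaluate eight angles at once
// and compare against libm (input values are illustrative only).
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

static void cos_demo(void)
{
    float in[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 2.5f, 3.0f, 3.14159265f};
    float out[8];
    _mm256_storeu_ps(out, mm256_cos_ps(_mm256_loadu_ps(in)));
    for (int i = 0; i < 8; ++i)
        printf("cos(%g): simd=%g libm=%g\n", in[i], out[i], cosf(in[i]));
}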
static inline void rectifier_kernel_avx(float *a, const size_t blocks)
{
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps( &a[i*8], _mm256_max_ps( _mm256_load_ps( &a[i*8] ), _mm256_setzero_ps() ) );
    }
}
static inline void rectifier_kernel_avx10(float *a, const size_t blocks)
{
    for (size_t i = 0; i < blocks; ++i) {
        _mm256_store_ps( &a[i*8*10 + 0*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 0*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 1*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 1*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 2*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 2*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 3*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 3*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 4*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 4*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 5*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 5*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 6*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 6*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 7*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 7*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 8*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 8*8] ), _mm256_setzero_ps() ) );
        _mm256_store_ps( &a[i*8*10 + 9*8], _mm256_max_ps( _mm256_load_ps( &a[i*8*10 + 9*8] ), _mm256_setzero_ps() ) );
    }
}
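// A hedged variant of the rectifier kernels above (mine, not part of the
// original): leaky ReLU with a slope in (0, 1), reusing the same max() trick,
// since max(x, slope*x) equals x for x >= 0 and slope*x for x < 0.
#include <immintrin.h>
#include <stddef.h>

static inline void leaky_rectifier_kernel_avx(float *a, const size_t blocks, const float slope)
{
    const __m256 k = _mm256_set1_ps(slope);          /* assumes 0 < slope < 1 */
    for (size_t i = 0; i < blocks; ++i) {
        __m256 x = _mm256_load_ps(&a[i*8]);
        _mm256_store_ps(&a[i*8], _mm256_max_ps(x, _mm256_mul_ps(x, k)));
    }
}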
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg) { if(kmax<=0) return; const int lda = 8; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); int k, k_left, ii; float k_left_d; const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5}; float temp_space[8] = {}; __m256 mask, zeros, temp, a_00, a_01, a_02, a_03, x_n_0, x_n_1, x_n_2, x_n_3, y_n_0, x_t_0, y_t_0, y_t_1, y_t_2, y_t_3; mask = _mm256_loadu_ps( mask_f ); zeros = _mm256_setzero_ps(); x_n_0 = _mm256_broadcast_ss( &x_n[0] ); x_n_1 = _mm256_broadcast_ss( &x_n[1] ); x_n_2 = _mm256_broadcast_ss( &x_n[2] ); x_n_3 = _mm256_broadcast_ss( &x_n[3] ); if(alg==-1) // TODO xor { x_n_0 = _mm256_sub_ps( zeros, x_n_0 ); x_n_1 = _mm256_sub_ps( zeros, x_n_1 ); x_n_2 = _mm256_sub_ps( zeros, x_n_2 ); x_n_3 = _mm256_sub_ps( zeros, x_n_3 ); } y_t_0 = _mm256_setzero_ps(); y_t_1 = _mm256_setzero_ps(); y_t_2 = _mm256_setzero_ps(); y_t_3 = _mm256_setzero_ps(); k=0; // corner if(tri==1) { k_left = kna-k; k_left_d = 8.0 - k_left; /*printf("\nk_left = %d\n", k_left);*/ /* y_n_0 = _mm_load_ps( &y_n[0] );*/ /* y_n_0 = _mm_setzero_ps();*/ x_t_0 = _mm256_loadu_ps( &x_t[0] ); x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) ); /*_mm256_storeu_ps( temp_space, x_t_0 ); */ /*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/ /*exit(1);*/ a_00 = _mm256_loadu_ps( &A[0+lda*0] ); a_00 = _mm256_blend_ps( a_00, zeros, 0x00 ); /* temp = _mm256_mul_ps( a_00, x_n_0 );*/ /* y_n_0 = _mm256_add_ps( y_n_0, temp );*/ y_n_0 = _mm256_mul_ps( a_00, x_n_0 ); a_00 = _mm256_blend_ps( a_00, zeros, 0x01 ); temp = _mm256_mul_ps( a_00, x_t_0 ); y_t_0 = _mm256_add_ps( y_t_0, temp ); a_01 = _mm256_loadu_ps( &A[0+lda*1] ); a_01 = _mm256_blend_ps( a_01, zeros, 0x01 ); temp = _mm256_mul_ps( a_01, x_n_1 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); a_01 = _mm256_blend_ps( a_01, zeros, 0x03 ); temp = _mm256_mul_ps( a_01, x_t_0 ); y_t_1 = _mm256_add_ps( y_t_1, temp ); a_02 = _mm256_loadu_ps( &A[0+lda*2] ); a_02 = _mm256_blend_ps( a_02, zeros, 0x03 ); temp = _mm256_mul_ps( a_02, x_n_2 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); a_02 = _mm256_blend_ps( a_02, zeros, 0x07 ); temp = _mm256_mul_ps( a_02, x_t_0 ); y_t_2 = _mm256_add_ps( y_t_2, temp ); a_03 = _mm256_loadu_ps( &A[0+lda*3] ); a_03 = _mm256_blend_ps( a_03, zeros, 0x07 ); temp = _mm256_mul_ps( a_03, x_n_3 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); a_03 = _mm256_blend_ps( a_03, zeros, 0x0f ); temp = _mm256_mul_ps( a_03, x_t_0 ); y_t_3 = _mm256_add_ps( y_t_3, temp ); /*_mm256_storeu_ps( temp_space, y_n_0 ); */ /*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/ y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) ); /*_mm256_storeu_ps( temp_space, y_n_0 ); */ /*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/ x_t_0 = _mm256_loadu_ps( &y_n[0] ); y_n_0 = _mm256_add_ps( y_n_0, x_t_0 ); _mm256_storeu_ps( &y_n[0], y_n_0 ); A += k_left; y_n += k_left; x_t += k_left; k += k_left; } if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7} /* for(; k<kna; k++)*/ { k_left = kna-k; k_left_d = 8.0 - k_left; /*printf("\nk_left = %d\n", 
k_left);*/ /* y_n_0 = _mm_load_ps( &y_n[0] );*/ /* y_n_0 = _mm_setzero_ps();*/ x_t_0 = _mm256_loadu_ps( &x_t[0] ); x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) ); /*_mm256_storeu_ps( temp_space, x_t_0 ); */ /*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/ a_00 = _mm256_loadu_ps( &A[0+lda*0] ); a_01 = _mm256_loadu_ps( &A[0+lda*1] ); a_02 = _mm256_loadu_ps( &A[0+lda*2] ); a_03 = _mm256_loadu_ps( &A[0+lda*3] ); /* temp = _mm256_mul_ps( a_00, x_n_0 );*/ /* y_n_0 = _mm256_add_ps( y_n_0, temp );*/ y_n_0 = _mm256_mul_ps( a_00, x_n_0 ); temp = _mm256_mul_ps( a_00, x_t_0 ); y_t_0 = _mm256_add_ps( y_t_0, temp ); temp = _mm256_mul_ps( a_01, x_n_1 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_01, x_t_0 ); y_t_1 = _mm256_add_ps( y_t_1, temp ); temp = _mm256_mul_ps( a_02, x_n_2 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_02, x_t_0 ); y_t_2 = _mm256_add_ps( y_t_2, temp ); temp = _mm256_mul_ps( a_03, x_n_3 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_03, x_t_0 ); y_t_3 = _mm256_add_ps( y_t_3, temp ); y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) ); x_t_0 = _mm256_loadu_ps( &y_n[0] ); y_n_0 = _mm256_add_ps( y_n_0, x_t_0 ); _mm256_storeu_ps( &y_n[0], y_n_0 ); /* _mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/ /* _mm256_storeu_ps( temp_space, y_n_0 );*/ /* for(ii=0; ii<k_left; ii++)*/ /* y_n[ii] = temp_space[ii];*/ /*printf("\nk_left = %d\n", k_left);*/ /*exit(1);*/ A += k_left; y_n += k_left; x_t += k_left; k += k_left; } if(kna>0 || tri==1) { A += (sda-1)*lda; } for(; k<kmax-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); y_n_0 = _mm256_loadu_ps( &y_n[0] ); x_t_0 = _mm256_loadu_ps( &x_t[0] ); a_00 = _mm256_load_ps( &A[0+lda*0] ); temp = _mm256_mul_ps( a_00, x_n_0 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_00, x_t_0 ); y_t_0 = _mm256_add_ps( y_t_0, temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); temp = _mm256_mul_ps( a_01, x_n_1 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_01, x_t_0 ); y_t_1 = _mm256_add_ps( y_t_1, temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); temp = _mm256_mul_ps( a_02, x_n_2 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_02, x_t_0 ); y_t_2 = _mm256_add_ps( y_t_2, temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); temp = _mm256_mul_ps( a_03, x_n_3 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_03, x_t_0 ); y_t_3 = _mm256_add_ps( y_t_3, temp ); _mm256_storeu_ps( &y_n[0], y_n_0 ); A += sda*lda; y_n += 8; x_t += 8; } if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7} { k_left = kmax-k; k_left_d = 8.0 - k_left; /* y_n_0 = _mm_load_ps( &y_n[0] );*/ /* y_n_0 = _mm_setzero_ps();*/ x_t_0 = _mm256_loadu_ps( &x_t[0] ); x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) ); /*printf("\nk_left2 = %d\n", k_left, kmax, k);*/ a_00 = _mm256_load_ps( &A[0+lda*0] ); /*printf("\nk_left2 = %d\n", k_left);*/ a_01 = _mm256_load_ps( &A[0+lda*1] ); a_02 = _mm256_load_ps( &A[0+lda*2] ); a_03 = _mm256_load_ps( &A[0+lda*3] ); /* temp = _mm256_mul_ps( a_00, x_n_0 );*/ /* y_n_0 = _mm256_add_ps( y_n_0, temp );*/ y_n_0 = _mm256_mul_ps( a_00, x_n_0 ); temp = _mm256_mul_ps( a_00, x_t_0 ); y_t_0 = _mm256_add_ps( y_t_0, 
temp ); temp = _mm256_mul_ps( a_01, x_n_1 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_01, x_t_0 ); y_t_1 = _mm256_add_ps( y_t_1, temp ); temp = _mm256_mul_ps( a_02, x_n_2 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_02, x_t_0 ); y_t_2 = _mm256_add_ps( y_t_2, temp ); temp = _mm256_mul_ps( a_03, x_n_3 ); y_n_0 = _mm256_add_ps( y_n_0, temp ); temp = _mm256_mul_ps( a_03, x_t_0 ); y_t_3 = _mm256_add_ps( y_t_3, temp ); y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) ); x_t_0 = _mm256_loadu_ps( &y_n[0] ); y_n_0 = _mm256_add_ps( y_n_0, x_t_0 ); _mm256_storeu_ps( &y_n[0], y_n_0 ); /* _mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/ /* _mm256_storeu_ps( temp_space, y_n_0 );*/ /* for(ii=0; ii<k_left; ii++)*/ /* y_n[ii] = temp_space[ii];*/ /* A += 1;*/ /* y_n += 1;*/ /* x_t += 1;*/ } // reduction __m128 z_0, z_1; y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1); y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3); y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2); y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01); z_0 = _mm256_castps256_ps128(y_t_0); z_1 = _mm256_castps256_ps128(y_t_1); z_1 = _mm_add_ps(z_0, z_1); if(alg==1) { z_0 = _mm_loadu_ps( &y_t[0] ); z_0 = _mm_add_ps(z_0, z_1); _mm_storeu_ps( &y_t[0], z_0 ); } else // alg==-1 { z_0 = _mm_loadu_ps( &y_t[0] ); z_0 = _mm_sub_ps(z_0, z_1); _mm_storeu_ps( &y_t[0], z_0 ); } }
static inline __m256 gen_zero(void) { return _mm256_setzero_ps(); }
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 8; /* const int bs = 8;*/ __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); int k; __m256 zeros, ax_temp, a_00, a_01, a_02, a_03, x_0, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7; zeros = _mm256_setzero_ps(); y_0 = _mm256_setzero_ps(); y_1 = _mm256_setzero_ps(); y_2 = _mm256_setzero_ps(); y_3 = _mm256_setzero_ps(); y_4 = _mm256_setzero_ps(); y_5 = _mm256_setzero_ps(); y_6 = _mm256_setzero_ps(); y_7 = _mm256_setzero_ps(); k=0; for(; k<kmax-7; k+=8) { x_0 = _mm256_loadu_ps( &x[0] ); __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); a_00 = _mm256_load_ps( &A[0+lda*0] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_1 = _mm256_add_ps( y_1, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_2 = _mm256_add_ps( y_2, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); ax_temp = _mm256_mul_ps( a_03, x_0 ); y_3 = _mm256_add_ps( y_3, ax_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00 = _mm256_load_ps( &A[0+lda*4] ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_4 = _mm256_add_ps( y_4, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*5] ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_5 = _mm256_add_ps( y_5, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*6] ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_6 = _mm256_add_ps( y_6, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*7] ); ax_temp = _mm256_mul_ps( a_03, x_0 ); y_7 = _mm256_add_ps( y_7, ax_temp ); A += sda*lda; x += lda; } x_0 = _mm256_loadu_ps( &x[0] ); a_00 = _mm256_load_ps( &A[0+lda*0] ); a_00 = _mm256_blend_ps( zeros, a_00, 0x01 ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_0 = _mm256_add_ps( y_0, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*1] ); a_01 = _mm256_blend_ps( zeros, a_01, 0x03 ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_1 = _mm256_add_ps( y_1, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*2] ); a_02 = _mm256_blend_ps( zeros, a_02, 0x07 ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_2 = _mm256_add_ps( y_2, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*3] ); a_03 = _mm256_blend_ps( zeros, a_03, 0x0f ); ax_temp = _mm256_mul_ps( a_03, x_0 ); y_3 = _mm256_add_ps( y_3, ax_temp ); a_00 = _mm256_load_ps( &A[0+lda*4] ); a_00 = _mm256_blend_ps( zeros, a_00, 0x1f ); ax_temp = _mm256_mul_ps( a_00, x_0 ); y_4 = _mm256_add_ps( y_4, ax_temp ); a_01 = _mm256_load_ps( &A[0+lda*5] ); a_01 = _mm256_blend_ps( zeros, a_01, 0x3f ); ax_temp = _mm256_mul_ps( a_01, x_0 ); y_5 = _mm256_add_ps( y_5, ax_temp ); a_02 = _mm256_load_ps( &A[0+lda*6] ); a_02 = _mm256_blend_ps( zeros, a_02, 0x7f ); ax_temp = _mm256_mul_ps( a_02, x_0 ); y_6 = _mm256_add_ps( y_6, ax_temp ); a_03 = _mm256_load_ps( &A[0+lda*7] ); /* a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/ ax_temp = _mm256_mul_ps( a_03, x_0 ); y_7 = _mm256_add_ps( y_7, ax_temp ); // reduction __m256 z_0; y_0 = _mm256_hadd_ps(y_0, y_1); y_2 = _mm256_hadd_ps(y_2, y_3); y_4 = _mm256_hadd_ps(y_4, y_5); y_6 = _mm256_hadd_ps(y_6, y_7); y_0 = _mm256_hadd_ps(y_0, y_2); y_4 = _mm256_hadd_ps(y_4, y_6); y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20); y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31); y_0 = _mm256_add_ps(y_1, y_2); // store if(alg==0) { _mm256_storeu_ps(&y[0], y_0); } else if(alg==1) { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = 
_mm256_add_ps(z_0, y_0); _mm256_storeu_ps(&y[0], z_0); } else // alg==-1 { z_0 = _mm256_loadu_ps( &y[0] ); z_0 = _mm256_sub_ps(z_0, y_0); _mm256_storeu_ps(&y[0], z_0); } }
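// The reduction at the end of kernel_strmv_u_t_8_lib8 collapses eight
// accumulators (one per output element) into a single result vector. A hedged
// standalone sketch of that exact hadd/permute2f128 pattern (helper name is mine):
#include <immintrin.h>

static inline __m256 reduce8_to_vec(__m256 y0, __m256 y1, __m256 y2, __m256 y3,
                                    __m256 y4, __m256 y5, __m256 y6, __m256 y7)
{
    y0 = _mm256_hadd_ps(y0, y1);
    y2 = _mm256_hadd_ps(y2, y3);
    y4 = _mm256_hadd_ps(y4, y5);
    y6 = _mm256_hadd_ps(y6, y7);
    y0 = _mm256_hadd_ps(y0, y2);                       /* sums 0..3, split across lanes */
    y4 = _mm256_hadd_ps(y4, y6);                       /* sums 4..7, split across lanes */
    __m256 lo = _mm256_permute2f128_ps(y0, y4, 0x20);  /* low lanes of both   */
    __m256 hi = _mm256_permute2f128_ps(y0, y4, 0x31);  /* high lanes of both  */
    return _mm256_add_ps(lo, hi);                      /* element i = full sum of y_i */
}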
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) { avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask); x = _mm256_and_ps(x, _ps_inv_sign_mask); avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI); //avx_m256i_t emm2 = _mm256_cvttps_epi32(y); //emm2 = _mm256_add_epi32(emm2, _pi32_1); avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1)); //emm2 = _mm256_and_si256(emm2, _pi32_inv1); emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1))); y = _mm256_cvtepi32_ps(emm2); //avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2); avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2)); //avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4); avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_4))); //avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4); avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_4))); //emm0 = _mm256_slli_epi32(emm0, 29); __m128i emm0hi = _mm256_extractf128_si256(emm0, 0); __m128i emm0lo = _mm256_extractf128_si256(emm0, 1); emm0hi = _mm_slli_epi32(emm0hi, 29); emm0lo = _mm_slli_epi32(emm0lo, 29); emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0); emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1); //cos_emm0 = _mm256_slli_epi32(cos_emm0, 29); __m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0); __m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1); cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29); cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29); cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0); cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1); //emm2 = _mm256_and_si256(emm2, _pi32_2); emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_2))); //cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2); cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_2))); //emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ)); //cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256()); cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ)); avx_m256_t emm0f = _mm256_castsi256_ps(emm0); avx_m256_t emm2f = _mm256_castsi256_ps(emm2); avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0); avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2); sign_bit = _mm256_xor_ps(sign_bit, emm0f); avx_m256_t temp_2 = _ps_minus_cephes_DP123; temp_2 = _mm256_mul_ps(y, temp_2); x = _mm256_add_ps(x, temp_2); avx_m256_t x2 = _mm256_mul_ps(x, x); avx_m256_t x3 = _mm256_mul_ps(x2, x); avx_m256_t x4 = _mm256_mul_ps(x2, x2); y = _ps_coscof_p0; avx_m256_t y2 = _ps_sincof_p0; y = _mm256_mul_ps(y, x2); y2 = _mm256_mul_ps(y2, x2); y = _mm256_add_ps(y, _ps_coscof_p1); y2 = _mm256_add_ps(y2, _ps_sincof_p1); y = _mm256_mul_ps(y, x2); y2 = _mm256_mul_ps(y2, x2); y = _mm256_add_ps(y, _ps_coscof_p2); y2 = _mm256_add_ps(y2, _ps_sincof_p2); y = _mm256_mul_ps(y, x4); y2 = _mm256_mul_ps(y2, x3); temp_2 = _mm256_mul_ps(x2, _ps_0p5); y2 = _mm256_add_ps(y2, x); temp_2 = _mm256_sub_ps(temp_2, _ps_1); y = _mm256_sub_ps(y, temp_2); avx_m256_t cos_y = y; avx_m256_t cos_y2 = y2; y = _mm256_andnot_ps(emm2f, y); cos_y = _mm256_andnot_ps(cos_emm2f, cos_y); y2 = _mm256_and_ps(emm2f, y2); cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2); y = 
_mm256_add_ps(y, y2); cos_y = _mm256_add_ps(cos_y, cos_y2); *s = _mm256_xor_ps(y, sign_bit); *c = _mm256_xor_ps(cos_y, cos_emm0f); } // newsincos_ps()
void sLine_onMessage(HvBase *_c, SignalLine *o, int letIn, const HvMessage * const m, void *sendMessage) { if (msg_isFloat(m,0)) { if (msg_isFloat(m,1)) { // new ramp int n = ctx_millisecondsToSamples(_c, msg_getFloat(m,1)); #if HV_SIMD_AVX float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7]; // current output value float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample o->n = _mm_set_epi32(n-3, n-2, n-1, n); o->x = _mm256_set_ps(x+7.0f*s, x+6.0f*s, x+5.0f*s, x+4.0f*s, x+3.0f*s, x+2.0f*s, x+s, x); o->m = _mm256_set1_ps(8.0f*s); o->t = _mm256_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_SSE float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; float s = (msg_getFloat(m,0) - x) / ((float) n); // slope per sample o->n = _mm_set_epi32(n-3, n-2, n-1, n); o->x = _mm_set_ps(x+3.0f*s, x+2.0f*s, x+s, x); o->m = _mm_set1_ps(4.0f*s); o->t = _mm_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_NEON float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; float s = (msg_getFloat(m,0) - x) / ((float) n); o->n = (int32x4_t) {n, n-1, n-2, n-3}; o->x = (float32x4_t) {x, x+s, x+2.0f*s, x+3.0f*s}; o->m = vdupq_n_f32(4.0f*s); o->t = vdupq_n_f32(msg_getFloat(m,0)); #else // HV_SIMD_NONE o->x = (o->n > 0) ? (o->x + o->m) : o->t; // new current value o->n = n; // new distance to target o->m = (msg_getFloat(m,0) - o->x) / ((float) n); // slope per sample o->t = msg_getFloat(m,0); #endif } else { // Jump to value #if HV_SIMD_AVX o->n = _mm_setzero_si128(); o->x = _mm256_set1_ps(msg_getFloat(m,0)); o->m = _mm256_setzero_ps(); o->t = _mm256_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_SSE o->n = _mm_setzero_si128(); o->x = _mm_set1_ps(msg_getFloat(m,0)); o->m = _mm_setzero_ps(); o->t = _mm_set1_ps(msg_getFloat(m,0)); #elif HV_SIMD_NEON o->n = vdupq_n_s32(0); o->x = vdupq_n_f32(0.0f); o->m = vdupq_n_f32(0.0f); o->t = vdupq_n_f32(0.0f); #else // HV_SIMD_NONE o->n = 0; o->x = msg_getFloat(m,0); o->m = 0.0f; o->t = msg_getFloat(m,0); #endif } } else if (msg_compareSymbol(m,0,"stop")) { // Stop line at current position #if HV_SIMD_AVX // note o->n[1] is a 64-bit integer; two packed 32-bit ints. We only want to know if the high int is positive, // which can be done simply by testing the long int for positiveness. float x = (o->n[1] > 0) ? (o->x[7] + (o->m[7]/8.0f)) : o->t[7]; o->n = _mm_setzero_si128(); o->x = _mm256_set1_ps(x); o->m = _mm256_setzero_ps(); o->t = _mm256_set1_ps(x); #elif HV_SIMD_SSE float x = (o->n[1] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; o->n = _mm_setzero_si128(); o->x = _mm_set1_ps(x); o->m = _mm_setzero_ps(); o->t = _mm_set1_ps(x); #elif HV_SIMD_NEON float x = (o->n[3] > 0) ? (o->x[3] + (o->m[3]/4.0f)) : o->t[3]; o->n = vdupq_n_s32(0); o->x = vdupq_n_f32(x); o->m = vdupq_n_f32(0.0f); o->t = vdupq_n_f32(x); #else // HV_SIMD_NONE o->n = 0; o->x += o->m; o->m = 0.0f; o->t = o->x; #endif } }
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2, avx_m256_t *c1, avx_m256_t *c2) { avx_m256_t tempa = _ps_sign_mask; avx_m256_t tempb = _ps_inv_sign_mask; avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa); avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa); x1 = _mm256_and_ps(x1, tempb); x2 = _mm256_and_ps(x2, tempb); tempa = _ps_cephes_FOPI; avx_m256_t y1 = _mm256_mul_ps(x1, tempa); avx_m256_t y2 = _mm256_mul_ps(x2, tempa); //avx_m256i_t emm21 = _mm256_cvttps_epi32(y1); //avx_m256i_t emm22 = _mm256_cvttps_epi32(y2); //emm21 = _mm256_add_epi32(emm21, _pi32_1); //emm22 = _mm256_add_epi32(emm22, _pi32_1); avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1)); avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1)); //emm21 = _mm256_and_si256(emm21, _pi32_inv1); //emm22 = _mm256_and_si256(emm22, _pi32_inv1); emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1))); emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1))); y1 = _mm256_cvtepi32_ps(emm21); y2 = _mm256_cvtepi32_ps(emm22); //avx_m256i_t tempia = _pi32_2; //avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia); //avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia); avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2)); avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2)); //avx_m256i_t tempib = _pi32_4; //avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib); //avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib); avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_4))); avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_4))); //avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib); //avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib); avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_4))); avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_4))); //emm01 = _mm256_slli_epi32(emm01, 29); __m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0); __m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1); emm0hi1 = _mm_slli_epi32(emm0hi1, 29); emm0lo1 = _mm_slli_epi32(emm0lo1, 29); emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0); emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1); //emm02 = _mm256_slli_epi32(emm02, 29); __m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0); __m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1); emm0hi2 = _mm_slli_epi32(emm0hi2, 29); emm0lo2 = _mm_slli_epi32(emm0lo2, 29); emm02 = _mm256_insertf128_si256(emm02, emm0hi1, 0); emm02 = _mm256_insertf128_si256(emm02, emm0lo1, 1); //cos_emm01 = _mm256_slli_epi32(cos_emm01, 29); __m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0); __m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1); cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29); cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29); cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0); cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1); //cos_emm02 = _mm256_slli_epi32(cos_emm02, 29); __m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0); __m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1); cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29); cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 
29); cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0); cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1); //tempia = _pi32_2; //tempib = _mm256_setzero_si256(); //emm21 = _mm256_and_si256(emm21, tempia); //emm22 = _mm256_and_si256(emm22, tempia); emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_2))); emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_2))); //cos_emm21 = _mm256_and_si256(cos_emm21, tempia); //cos_emm22 = _mm256_and_si256(cos_emm22, tempia); cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_2))); cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_2))); //emm21 = _mm256_cmpeq_epi32(emm21, tempib); //emm22 = _mm256_cmpeq_epi32(emm22, tempib); emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ)); emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ)); //cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib); //cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib); cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ)); cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ)); avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01); avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02); avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21); avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22); avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01); avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02); avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21); avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22); sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1); sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2); tempa = _ps_minus_cephes_DP123; tempb = _mm256_mul_ps(y2, tempa); tempa = _mm256_mul_ps(y1, tempa); x2 = _mm256_add_ps(x2, tempb); x1 = _mm256_add_ps(x1, tempa); avx_m256_t x21 = _mm256_mul_ps(x1, x1); avx_m256_t x22 = _mm256_mul_ps(x2, x2); avx_m256_t x31 = _mm256_mul_ps(x21, x1); avx_m256_t x32 = _mm256_mul_ps(x22, x2); avx_m256_t x41 = _mm256_mul_ps(x21, x21); avx_m256_t x42 = _mm256_mul_ps(x22, x22); tempa = _ps_coscof_p0; tempb = _ps_sincof_p0; y1 = _mm256_mul_ps(x21, tempa); y2 = _mm256_mul_ps(x22, tempa); avx_m256_t y21 = _mm256_mul_ps(x21, tempb); avx_m256_t y22 = _mm256_mul_ps(x22, tempb); tempa = _ps_coscof_p1; tempb = _ps_sincof_p1; y1 = _mm256_add_ps(y1, tempa); y2 = _mm256_add_ps(y2, tempa); y21 = _mm256_add_ps(y21, tempb); y22 = _mm256_add_ps(y22, tempb); y1 = _mm256_mul_ps(y1, x21); y2 = _mm256_mul_ps(y2, x22); y21 = _mm256_mul_ps(y21, x21); y22 = _mm256_mul_ps(y22, x22); tempa = _ps_coscof_p2; tempb = _ps_sincof_p2; y1 = _mm256_add_ps(y1, tempa); y2 = _mm256_add_ps(y2, tempa); y21 = _mm256_add_ps(y21, tempb); y22 = _mm256_add_ps(y22, tempb); y1 = _mm256_mul_ps(y1, x41); y2 = _mm256_mul_ps(y2, x42); y21 = _mm256_mul_ps(y21, x31); y22 = _mm256_mul_ps(y22, x32); tempa = _ps_0p5; tempb = _ps_1; avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa); avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa); y21 = _mm256_add_ps(y21, x1); y22 = _mm256_add_ps(y22, x2); temp_21 = _mm256_sub_ps(temp_21, tempb); temp_22 = _mm256_sub_ps(temp_22, tempb); y1 = _mm256_sub_ps(y1, temp_21); y2 = _mm256_sub_ps(y2, temp_22); avx_m256_t cos_y1 = y1; avx_m256_t cos_y2 = y2; avx_m256_t cos_y21 = y21; avx_m256_t 
cos_y22 = y22; y1 = _mm256_andnot_ps(emm2f1, y1); y2 = _mm256_andnot_ps(emm2f2, y2); cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1); cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2); y21 = _mm256_and_ps(emm2f1, y21); y22 = _mm256_and_ps(emm2f2, y22); cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21); cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22); y1 = _mm256_add_ps(y1, y21); y2 = _mm256_add_ps(y2, y22); cos_y1 = _mm256_add_ps(cos_y1, cos_y21); cos_y2 = _mm256_add_ps(cos_y2, cos_y22); *s1 = _mm256_xor_ps(y1, sign_bit1); *s2 = _mm256_xor_ps(y2, sign_bit2); *c1 = _mm256_xor_ps(cos_y1, cos_emm0f1); *c2 = _mm256_xor_ps(cos_y2, cos_emm0f2); } // newsincos_ps_dual()
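/*
 * A small test harness for the routines above (an assumption-laden sketch:
 * it takes avx_m256_t to be __m256 and the _ps_* constants to be defined in
 * the translation unit, as in the usual Cephes-style AVX sin/cos ports).
 * It compares newsincos_ps() against libm on one vector of inputs.
 */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

void check_newsincos(void)
{
    float in[8] __attribute__((aligned(32))) =
        { -3.0f, -1.5f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 3.0f };
    float s[8] __attribute__((aligned(32)));
    float c[8] __attribute__((aligned(32)));

    __m256 x = _mm256_load_ps(in), vs, vc;
    newsincos_ps(x, &vs, &vc);
    _mm256_store_ps(s, vs);
    _mm256_store_ps(c, vc);

    for (int i = 0; i < 8; ++i)
        printf("x=% .2f  sin err=% .2e  cos err=% .2e\n",
               in[i], s[i] - sinf(in[i]), c[i] - cosf(in[i]));
}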
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX( const void *poOptions, GUInt32 nPoints, CPL_UNUSED const double *unused_padfX, CPL_UNUSED const double *unused_padfY, CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, double *pdfValue, void* hExtraParamsIn ) { size_t i = 0; GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn; const float* pafX = psExtraParams->pafX; const float* pafY = psExtraParams->pafY; const float* pafZ = psExtraParams->pafZ; const float fEpsilon = 0.0000000000001f; const float fXPoint = (float)dfXPoint; const float fYPoint = (float)dfYPoint; const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon); const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint); const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint); __m256 ymm_nominator = _mm256_setzero_ps(); __m256 ymm_denominator = _mm256_setzero_ps(); int mask = 0; #undef LOOP_SIZE #if defined(__x86_64) || defined(_M_X64) /* This would also work in 32bit mode, but there are only 8 XMM registers */ /* whereas we have 16 for 64bit */ #define LOOP_SIZE 16 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) { __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */ __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x); __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */ __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y); __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ _mm256_mul_ps(ymm_ry, ymm_ry)); __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8), _mm256_mul_ps(ymm_ry_8, ymm_ry_8)); __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8); ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i))); ymm_nominator = _mm256_add_ps(ymm_nominator, _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8))); ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */ ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8); mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */ (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8); if( mask ) break; } #else #define LOOP_SIZE 8 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) { __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps((float*)pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */ __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps((float*)pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */ __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ _mm256_mul_ps(ymm_ry, ymm_ry)); __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ _mm256_mul_ps(ymm_invr2, _mm256_load_ps((float*)pafZ + i))); ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */ mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */ if( mask ) break; } #endif /* Find which i triggered r2 < fEpsilon */ if( mask ) { for(int j = 0; j < LOOP_SIZE; j++ ) { if( mask & (1 << j) ) { (*pdfValue) = (pafZ)[i + j]; // GCC and MSVC need explicit zeroing #if !defined(__clang__) _mm256_zeroupper(); #endif 
return CE_None; } } } #undef LOOP_SIZE /* Get back nominator and denominator values for YMM registers */ float afNominator[8], afDenominator[8]; _mm256_storeu_ps(afNominator, ymm_nominator); _mm256_storeu_ps(afDenominator, ymm_denominator); // MSVC doesn't emit AVX afterwards but may use SSE, so clear upper bits // Other compilers will continue using AVX for the below floating points operations #if defined(_MSC_FULL_VER) _mm256_zeroupper(); #endif float fNominator = afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3] + afNominator[4] + afNominator[5] + afNominator[6] + afNominator[7]; float fDenominator = afDenominator[0] + afDenominator[1] + afDenominator[2] + afDenominator[3] + afDenominator[4] + afDenominator[5] + afDenominator[6] + afDenominator[7]; /* Do the few remaining loop iterations */ for ( ; i < nPoints; i++ ) { const float fRX = pafX[i] - fXPoint; const float fRY = pafY[i] - fYPoint; const float fR2 = fRX * fRX + fRY * fRY; // If the test point is close to the grid node, use the point // value directly as a node value to avoid singularity. if ( fR2 < 0.0000000000001 ) { break; } else { const float fInvR2 = 1.0f / fR2; fNominator += fInvR2 * pafZ[i]; fDenominator += fInvR2; } } if( i != nPoints ) { (*pdfValue) = pafZ[i]; } else if ( fDenominator == 0.0 ) { (*pdfValue) = ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue; } else (*pdfValue) = fNominator / fDenominator; // GCC needs explicit zeroing #if defined(__GNUC__) && !defined(__clang__) _mm256_zeroupper(); #endif return CE_None; }
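/*
 * The early exit above hinges on _mm256_movemask_ps, which packs the sign
 * bit of each compare lane into an 8-bit integer; scanning that mask then
 * recovers which point tripped the r2 < epsilon test.  The same pattern in
 * isolation (hypothetical helper, not part of GDAL): find the first element
 * whose square is below eps2, assuming 'a' is 32-byte aligned and n is a
 * multiple of 8.
 */
#include <immintrin.h>

static int first_square_below(const float *a, int n, float eps2)
{
    const __m256 veps = _mm256_set1_ps(eps2);
    for (int i = 0; i < n; i += 8) {
        __m256 v  = _mm256_load_ps(a + i);
        __m256 r2 = _mm256_mul_ps(v, v);
        int mask  = _mm256_movemask_ps(_mm256_cmp_ps(r2, veps, _CMP_LT_OS));
        if (mask) {                        /* some lane satisfied the test */
            for (int j = 0; j < 8; ++j)
                if (mask & (1 << j))
                    return i + j;          /* index of the first such lane */
        }
    }
    return -1;
}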
void run_dct(int width, int height, float *quant, float *input, int32_t *output) { float acosvals[8][8]; /* Calculating cosines is expensive, and there * are only 64 cosines that need to be calculated * so precompute them and cache. */ for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { if (j == 0) { acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5d) * j); } else { acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5d) * j); } } } /* Separate the parallel from the for, so each processor gets its * own copy of the buffers and variables. */ #pragma omp parallel { float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5}; avload[0] = sqrt(1.0 / 8.0); __m256 row0, row1, row2, row3, row4, row5, row6, row7; __m256 loaderlow, loaderhigh; __m256 temp; __m256 minus128 = _mm256_set1_ps(-128.0); __m256 avxcosloader, avxcos; float avxcosmover; __m256i integer; /* The DCT breaks the image into 8 by 8 blocks and then * transforms them into color frequencies. */ #pragma omp for for (int brow = 0; brow < height / 8; brow++) { for (int bcol = 0; bcol < width / 8; bcol++) { int head_pointer = bcol * 8 + brow * 8 * width; row0 = _mm256_setzero_ps(); row1 = _mm256_setzero_ps(); row2 = _mm256_setzero_ps(); row3 = _mm256_setzero_ps(); row4 = _mm256_setzero_ps(); row5 = _mm256_setzero_ps(); row6 = _mm256_setzero_ps(); row7 = _mm256_setzero_ps(); /* This pair of loops uses AVX instuctions to add the frequency * component from each pixel to all of the buckets at once. Allows * us to do the DCT on a block in 64 iterations of a loop rather * than 64 iterations of 64 iterations of a loop (all 64 pixels affect * all 64 frequencies) */ for (int x = 0; x < 8; x++) { for (int y = 0; y < 4; y++) { loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]); loaderlow = _mm256_add_ps(loaderlow, minus128); loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]); loaderhigh = _mm256_add_ps(loaderhigh, minus128); avxcos = _mm256_loadu_ps(&acosvals[x][0]); loaderlow = _mm256_mul_ps(loaderlow, avxcos); loaderhigh = _mm256_mul_ps(loaderhigh, avxcos); avxcosloader = _mm256_loadu_ps(&acosvals[y][0]); avxcosmover = avxcosloader[0]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row0 = _mm256_add_ps(row0, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row0 = _mm256_add_ps(row0, temp); avxcosmover = avxcosloader[1]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row1 = _mm256_add_ps(row1, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row1 = _mm256_sub_ps(row1, temp); avxcosmover = avxcosloader[2]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row2 = _mm256_add_ps(row2, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row2 = _mm256_add_ps(row2, temp); avxcosmover = avxcosloader[3]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row3 = _mm256_add_ps(row3, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row3 = _mm256_sub_ps(row3, temp); avxcosmover = avxcosloader[4]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row4 = _mm256_add_ps(row4, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row4 = _mm256_add_ps(row4, temp); avxcosmover = avxcosloader[5]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row5 = _mm256_add_ps(row5, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row5 = _mm256_sub_ps(row5, temp); avxcosmover = avxcosloader[6]; avxcos = _mm256_set1_ps(avxcosmover); temp = 
_mm256_mul_ps(loaderlow, avxcos); row6 = _mm256_add_ps(row6, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row6 = _mm256_add_ps(row6, temp); avxcosmover = avxcosloader[7]; avxcos = _mm256_set1_ps(avxcosmover); temp = _mm256_mul_ps(loaderlow, avxcos); row7 = _mm256_add_ps(row7, temp); temp = _mm256_mul_ps(loaderhigh, avxcos); row7 = _mm256_sub_ps(row7, temp); } } /* Each frequency stored as a float needs to be divided by * the quantization value, then rounded to the nearest integer. * Also changes the order of the values from pixel order to * each 8 by 8 block stored one after another. */ temp = _mm256_loadu_ps(&quant[0]); row0 = _mm256_div_ps(row0, temp); row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row0); _mm256_storeu_si256(output + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[8]); row1 = _mm256_div_ps(row1, temp); row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row1); _mm256_storeu_si256(output + 8 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[16]); row2 = _mm256_div_ps(row2, temp); row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row2); _mm256_storeu_si256(output + 16 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[24]); row3 = _mm256_div_ps(row3, temp); row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row3); _mm256_storeu_si256(output + 24 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[32]); row4 = _mm256_div_ps(row4, temp); row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row4); _mm256_storeu_si256(output + 32 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[40]); row5 = _mm256_div_ps(row5, temp); row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row5); _mm256_storeu_si256(output + 40 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[48]); row6 = _mm256_div_ps(row6, temp); row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row6); _mm256_storeu_si256(output + 48 + (bcol + brow * (width / 8)) * 64, integer); temp = _mm256_loadu_ps(&quant[56]); row7 = _mm256_div_ps(row7, temp); row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT); integer = _mm256_cvttps_epi32(row7); _mm256_storeu_si256(output + 56 + (bcol + brow * (width / 8)) * 64, integer); } } } }
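/*
 * One detail of the quantize/store step above: _mm256_storeu_si256 is
 * declared to take a __m256i*, while 'output' is an int32_t*, so most
 * compilers want an explicit cast at the call site.  A minimal sketch of a
 * single row's divide / round / convert / store with the cast spelled out
 * (hypothetical helper mirroring the inline code above).
 */
#include <immintrin.h>
#include <stdint.h>

static void quantize_row(__m256 row, const float *quant_row, int32_t *dst)
{
    row = _mm256_div_ps(row, _mm256_loadu_ps(quant_row));   /* divide by quant table */
    row = _mm256_round_ps(row, _MM_FROUND_TO_NEAREST_INT);  /* round to nearest */
    __m256i q = _mm256_cvttps_epi32(row);                   /* truncation is safe: already rounded */
    _mm256_storeu_si256((__m256i *)dst, q);                 /* store 8 int32 coefficients */
}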
void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item) { nn_workload_data_t *input_view = work_item->input[0]->output; const auto &arguments = work_item->arguments.forward_softmax_fixedpoint; const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p]; const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1; const auto num_full_blocks = output_width / C_data_stride; const auto partial_block_size = (output_width / C_simd_width) % C_max_acc; const auto subsimd_block_size = output_width % C_simd_width; const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x]; const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p]; const auto out_fraction = arguments.input_fraction; float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64); auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start]; auto shift = out_fraction; if (shift > 0) { for (uint32_t i = 0; i < input_width; i++) input_f[i] = (float)(input_buffer[i]) / (1 << shift); } else if (shift < 0) { for (uint32_t i = 0; i < input_width; i++) input_f[i] = (float)(input_buffer[i]) * (1 << -shift); } else { for (uint32_t i = 0; i < input_width; i++) input_f[i] = (float)(input_buffer[i]); } __m256 acc_sum = _mm256_setzero_ps(); float subsimd_sum = 0.0f; { auto input_buffer = input_f; auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start]; for (auto block = 0u; block < num_full_blocks; ++block) { // Run computation. softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum); } switch (partial_block_size) { case 0: break; case 1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break; case 2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break; case 3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break; case 4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break; case 5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break; case 6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break; case 7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break; case 8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break; case 9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break; case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break; case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break; case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break; case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break; case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break; default: NN_UNREACHABLE_CODE; } switch (subsimd_block_size) { case 0: break; case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break; case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break; case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break; case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break; case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break; case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break; case 
7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break; default: NN_UNREACHABLE_CODE; } } { __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum); intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7)); intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum); intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum); acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum)); subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0)); acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum); subsimd_sum = 1.0f / subsimd_sum; } { auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start]; for (auto block = 0u; block < num_full_blocks; ++block) { // Run computation. softmax_finalize_block<C_max_acc>(output_buffer, acc_sum); } switch (partial_block_size) { case 0: break; case 1: softmax_finalize_block< 1>(output_buffer, acc_sum); break; case 2: softmax_finalize_block< 2>(output_buffer, acc_sum); break; case 3: softmax_finalize_block< 3>(output_buffer, acc_sum); break; case 4: softmax_finalize_block< 4>(output_buffer, acc_sum); break; case 5: softmax_finalize_block< 5>(output_buffer, acc_sum); break; case 6: softmax_finalize_block< 6>(output_buffer, acc_sum); break; case 7: softmax_finalize_block< 7>(output_buffer, acc_sum); break; case 8: softmax_finalize_block< 8>(output_buffer, acc_sum); break; case 9: softmax_finalize_block< 9>(output_buffer, acc_sum); break; case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break; case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break; case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break; case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break; case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break; default: NN_UNREACHABLE_CODE; } switch (subsimd_block_size) { case 0: break; case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break; case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break; case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break; case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break; case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break; case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break; case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break; default: NN_UNREACHABLE_CODE; } } _mm_free(input_f); }
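/*
 * The block above reduces the eight partial sums with two _mm256_hadd_ps
 * passes plus a cross-lane permute (note _mm256_permutevar8x32_ps is an
 * AVX2 instruction).  An equivalent horizontal-sum helper that stays within
 * AVX/SSE3, shown as a standalone sketch; the normalization factor is then
 * simply 1.0f / hsum256_ps(acc_sum).
 */
#include <immintrin.h>

static float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);     /* lower four floats */
    __m128 hi = _mm256_extractf128_ps(v, 1);   /* upper four floats */
    __m128 s  = _mm_add_ps(lo, hi);            /* 4 partial sums */
    s = _mm_hadd_ps(s, s);                     /* 2 partial sums */
    s = _mm_hadd_ps(s, s);                     /* total in every lane */
    return _mm_cvtss_f32(s);
}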
// PRE: all vectors aligned, // imag_c = [i1,i1,...,i4,i4] // vec = [v1r,v1i,...,v4r,v4i] // component-wise multiplication // POST: returns [-i1*v1i,i1*v1r,...,-i4*v4i,i4*v4r] inline __m256 avx_multiply_float_imag_(const __m256& imag_c, const __m256& vec) { static const __m256 zero = _mm256_setzero_ps(); __m256 vec1 = _mm256_mul_ps(imag_c,vec); vec1 = _mm256_permute_ps(vec1,0xB1); return _mm256_addsub_ps(zero,vec1); }
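/*
 * Paired with a real-part product, the permute/addsub trick above yields a
 * full packed complex multiply.  A companion sketch under the same layout
 * assumptions: vec = [v1r,v1i,...,v4r,v4i], real_c = [a,a,...],
 * imag_c = [b,b,...]; each element of the result is (a+bi)*(vr+vi*i).
 */
#include <immintrin.h>

inline __m256 avx_multiply_float_complex_(const __m256& real_c,
                                          const __m256& imag_c,
                                          const __m256& vec) {
    __m256 re = _mm256_mul_ps(real_c, vec);   /* [a*vr, a*vi, ...] */
    __m256 im = _mm256_mul_ps(imag_c, vec);   /* [b*vr, b*vi, ...] */
    im = _mm256_permute_ps(im, 0xB1);         /* swap pairs -> [b*vi, b*vr, ...] */
    return _mm256_addsub_ps(re, im);          /* even: a*vr - b*vi, odd: a*vi + b*vr */
}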