/**
 * Element-wise product d[i] = a[i] * b[i] over the first gNumFloats floats,
 * handled eight lanes at a time with AVX.
 *
 * Inputs are read with aligned loads and the result is written with a
 * non-temporal (streaming) store, so all three pointers have to be 32-byte
 * aligned. Every step processes a full group of 8 floats; there is no scalar
 * tail for a count that is not a multiple of 8.
 *
 * @param d destination array
 * @param a first input array
 * @param b second input array
 */
void AlignedAvxNonTemporalMult(float* d, float const* a, float const* b)
{
    int offset = 0;
    while (offset < gNumFloats)
    {
        const __m256 lhs = _mm256_load_ps(a + offset);
        const __m256 rhs = _mm256_load_ps(b + offset);
        _mm256_stream_ps(d + offset, _mm256_mul_ps(lhs, rhs));
        offset += 8;
    }
}
/**
 * In-place rectifier (ReLU): every element x of a[0..len) becomes x if x > 0
 * and 0 otherwise.
 *
 * The array is processed in three phases:
 *   1. a scalar head that walks up to the first 32-byte boundary,
 *   2. an AVX body that handles 8 floats per step with an aligned load and a
 *      non-temporal (streaming) store,
 *   3. a scalar tail for whatever is left.
 *
 * Any length is accepted, including 0 and lengths shorter than one vector:
 * the head loop is bounded by the end of the array, so a short or misaligned
 * buffer is never overrun.
 *
 * NOTE: streaming stores are weakly ordered; a caller that hands the buffer to
 * another thread right away may need its own store fence.
 *
 * @param a   array to rectify in place
 * @param len number of floats in a
 */
void rectifier_avx_3(float *a, const size_t len)
{
    float *p = a;
    float *const end = a + len;

    /* Scalar head: stop at the first 32-byte boundary or at the end of the
     * data, whichever comes first. */
    for (; p < end && (uintptr_t)p % 32 != 0; ++p) {
        *p = *p > 0.0f ? *p : 0.0f;
    }

    /* Vector body: p is 32-byte aligned here whenever at least 8 floats
     * remain. Comparing the remaining distance avoids forming p + 8 beyond
     * the end of the array. */
    for (; end - p >= 8; p += 8) {
        _mm256_stream_ps(p, _mm256_max_ps(_mm256_load_ps(p), _mm256_setzero_ps()));
    }

    /* Scalar tail: fewer than 8 floats left. */
    for (; p < end; ++p) {
        *p = *p > 0.0f ? *p : 0.0f;
    }
}
/// Appends a copy of *entry at the tail slot of the current block.
///
/// The element is copied one SIMD line (KNOB_SIMD_WIDTH floats) at a time using
/// an aligned AVX load and a non-temporal store; sizeof(T) must be a whole
/// number of such lines (enforced at compile time). When the current block
/// becomes full, the queue moves on to the next block in mBlocks, or allocates
/// a new block from the arena when none is left. No fence or lock is issued here.
///
/// @param arena  allocator used when a new block is needed
/// @param entry  element to copy into the queue
/// @return always true
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
    const uint32_t simdLinesPerEntry = sizeof(T) / (KNOB_SIMD_WIDTH*4);
    static_assert(simdLinesPerEntry * KNOB_SIMD_WIDTH * 4 == sizeof(T), "FIFO element size should be multiple of SIMD width.");

    const float* pSrc = (const float*)entry;
    float* pDst = (float*)&mCurBlock[mTail];

    // Copies the line-th SIMD line of the element from source to destination.
    auto copyLine = [&](int32_t line)
    {
        const auto floatOffset = line*KNOB_SIMD_WIDTH;
        _mm256_stream_ps(pDst + floatOffset, _mm256_load_ps(pSrc + floatOffset));
    };
    UnrollerL<0, simdLinesPerEntry, 1>::step(copyLine);

    ++mTail;
    if (mTail == mBlockSize)
    {
        // Current block is full: advance to the next one, growing the block
        // list when the queue has run out of previously allocated blocks.
        const bool haveSpareBlock = (++mCurBlockIdx < mBlocks.size());
        if (!haveSpareBlock)
        {
            T* freshBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
            SWR_ASSERT(freshBlock);

            mBlocks.push_back(freshBlock);
            mCurBlock = freshBlock;
        }
        else
        {
            mCurBlock = mBlocks[mCurBlockIdx];
        }

        mTail = 0;
    }

    ++mNumEntries;
    return true;
}
/*!
 * \brief Store a packed vector of single-precision complex numbers to memory
 * with a non-temporal (streaming) store.
 * \param memory Destination; it is reinterpreted as float* and handed to
 * _mm256_stream_ps, which requires a 32-byte aligned address
 * \param value The packed vector whose underlying register is written
 */
ETL_STATIC_INLINE(void) stream(etl::complex<float>* memory, avx_simd_complex_float<etl::complex<float>> value) {
    auto* raw = reinterpret_cast<float*>(memory);
    _mm256_stream_ps(raw, value.value);
}
/*!
 * \brief Store a packed vector of single-precision floats to memory with a
 * non-temporal (streaming) store.
 * \param memory Destination passed to _mm256_stream_ps, which requires a
 * 32-byte aligned address
 * \param value The packed vector whose underlying register is written
 */
ETL_STATIC_INLINE(void) stream(float* memory, avx_simd_float value) {
    const auto& lanes = value.value;
    _mm256_stream_ps(memory, lanes);
}
void trya(){ Timer tt; const int c = 10000000; const int c2= 2*c; double summm; //float * const a1 = new float[c]; //float * const a2 = new float[c]; //float * const a3 = new float[c]; float * const a1 = (float *)_mm_malloc(2*c*sizeof(float), 32); float * const a3 = (float *)_mm_malloc(8*c*sizeof(float), 32); float * const a4 = (float *)_mm_malloc(c*sizeof(float), 32); float * const a5 = (float *)_mm_malloc(c*sizeof(float), 32);; //register float always_in = 5; //register float always_in2 = 6; //register float always_in3 = 7; float * fck = (float *)_mm_malloc(8*sizeof(float), 32); fck[0] = 1; fck[1] = 1; fck[2] = 1; fck[3] = 1; fck[4] = 1; fck[5] = 1; fck[6] = 1; fck[7] = 1; float * fck2 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck3 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck4 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck5 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck6 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck7 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck8 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck9 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck10 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck11 = (float *)_mm_malloc(8*sizeof(float), 32); float * fck12 = (float *)_mm_malloc(8*sizeof(float), 32); fck2[0] = 1; fck2[1] = 1; fck2[2] = 1; fck2[3] = 1; fck2[4] = 1; fck2[5] = 1; fck2[6] = 1; fck2[7] = 1; fck3[0] = 1; fck3[1] = 1; fck3[2] = 1; fck3[3] = 1; fck3[4] = 1; fck3[5] = 1; fck3[6] = 1; fck3[7] = 1; fck4[0] = rand(); fck4[1] = rand(); fck4[2] = rand(); fck4[3] = rand(); fck4[4] = rand(); fck4[5] = rand(); fck4[6] = rand(); fck4[7] = rand(); fck5[0] = rand(); fck5[1] = rand(); fck5[2] = rand(); fck5[3] = rand(); fck5[4] = rand(); fck5[5] = rand(); fck5[6] = rand(); fck5[7] = rand(); fck6[0] = rand(); fck6[1] = rand(); fck6[2] = rand(); fck6[3] = rand(); fck6[4] = rand(); fck6[5] = rand(); fck6[6] = rand(); fck6[7] = rand(); fck7[0] = rand(); fck7[1] = rand(); fck7[2] = 
rand(); fck7[3] = rand(); fck7[4] = rand(); fck7[5] = rand(); fck7[6] = rand(); fck7[7] = rand(); fck8[0] = rand(); fck8[1] = rand(); fck8[2] = rand(); fck8[3] = rand(); fck8[4] = rand(); fck8[5] = rand(); fck8[6] = rand(); fck8[7] = rand(); fck9[0] = rand(); fck9[1] = rand(); fck9[2] = rand(); fck9[3] = rand(); fck9[4] = rand(); fck9[5] = rand(); fck9[6] = rand(); fck9[7] = rand(); fck10[0] = rand(); fck10[1] = rand(); fck10[2] = rand(); fck10[3] = rand(); fck10[4] = rand(); fck10[5] = rand(); fck10[6] = rand(); fck10[7] = rand(); fck11[0] = rand(); fck11[1] = rand(); fck11[2] = rand(); fck11[3] = rand(); fck11[4] = rand(); fck11[5] = rand(); fck11[6] = rand(); fck11[7] = rand(); fck12[0] = rand(); fck12[1] = rand(); fck12[2] = rand(); fck12[3] = rand(); fck12[4] = rand(); fck12[5] = rand(); fck12[6] = rand(); fck12[7] = rand(); //std::cout << "~~~~~~~~~" << std::endl; register __m256 a_fck = _mm256_load_ps(fck); register __m256 b_fck = _mm256_load_ps(fck2); register __m256 c_fck = _mm256_load_ps(fck3); register __m256 d_fck = _mm256_load_ps(fck4); register __m256 e_fck = _mm256_load_ps(fck5); register __m256 f_fck = _mm256_load_ps(fck6); register __m256 g_fck = _mm256_load_ps(fck7); register __m256 h_fck = _mm256_load_ps(fck8); register __m256 i_fck = _mm256_load_ps(fck9); register __m256 j_fck = _mm256_load_ps(fck10); register __m256 k_fck = _mm256_load_ps(fck11); register __m256 l_fck = _mm256_load_ps(fck12); //std::cout << "+++++++++" << std::endl; for(int i=0;i<c2;i++){ a1[i] = rand(); //a2[i] = rand(); } for(int i=0;i<c;i++){ a4[i] = rand(); a5[i] = rand(); a3[i] = a1[i]; } //std::cout << "#########" << std::endl; tt.restart(); __m256 b_i, c_i, d_i, e_i; __m256 out_i, out_i2, out_i3, out_i4, out_i5, out_i6, out_i7, out_i8, out_i9, out_i10, out_i11, out_i12; const float * pLoad = a4; float * pStore = a3; b_i = _mm256_loadu_ps(pLoad); const int SKIP = 8; for(int i=0;i<c;i+=SKIP){ out_i = _mm256_mul_ps(b_i, a_fck); out_i2 = _mm256_mul_ps(b_i, b_fck); out_i3 = 
_mm256_mul_ps(b_i, c_fck); out_i4 = _mm256_mul_ps(b_i, d_fck); out_i5 = _mm256_mul_ps(b_i, e_fck); out_i6 = _mm256_mul_ps(b_i, f_fck); out_i7 = _mm256_mul_ps(b_i, g_fck); out_i8 = _mm256_mul_ps(b_i, h_fck); out_i9 = _mm256_mul_ps(b_i, i_fck); out_i10 = _mm256_mul_ps(b_i, j_fck); out_i11 = _mm256_mul_ps(b_i, k_fck); out_i12 = _mm256_mul_ps(b_i, l_fck); out_i = _mm256_add_ps(out_i, out_i2); out_i3 = _mm256_add_ps(out_i3, out_i4); out_i5 = _mm256_add_ps(out_i5, out_i6); out_i7 = _mm256_add_ps(out_i7, out_i8); out_i9 = _mm256_add_ps(out_i9, out_i10); out_i11 = _mm256_add_ps(out_i11, out_i12); out_i2 = _mm256_add_ps(out_i, out_i3); out_i5 = _mm256_add_ps(out_i5, out_i7); out_i9 = _mm256_add_ps(out_i9, out_i11); out_i = _mm256_add_ps(out_i2, out_i5); out_i = _mm256_add_ps(out_i, out_i9); //_mm256_storeu_ps(pStore, out_i); _mm256_stream_ps(pStore, out_i); //_mm256_stream_ps(pStore, out_i2); //_mm256_stream_ps(pStore, out_i3); //_mm256_stream_ps(pStore, out_i4); pStore = pStore + 8; pLoad = pLoad + SKIP; } double ttt = 1.0*tt.elapsed(); std::cout << ttt << std::endl; summm = 0.0; for(int i=0;i<c;i++){ summm += a3[i]; } std::cout << summm << std::endl; }
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn, void (*sendMessage)(HvBase *, int, const HvMessage *)) { #if HV_SIMD_AVX _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { int n4 = o->windowSize & ~HV_N_SIMD_MASK; __m256 sum = _mm256_setzero_ps(); while (n4) { __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD); __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD); x = _mm256_mul_ps(x, h); sum = _mm256_add_ps(sum, x); n4 -= HV_N_SIMD; } sum = _mm256_hadd_ps(sum,sum); // horizontal sum sum = _mm256_hadd_ps(sum,sum); sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer } #elif HV_SIMD_SSE _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { int n4 = o->windowSize & ~HV_N_SIMD_MASK; __m128 sum = _mm_setzero_ps(); while (n4) { __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD); __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD); x = _mm_mul_ps(x, h); sum = _mm_add_ps(sum, x); n4 -= HV_N_SIMD; } sum = _mm_hadd_ps(sum,sum); // horizontal sum sum = _mm_hadd_ps(sum,sum); sEnv_sendMessage(_c, o, sum[0], sendMessage); } #elif HV_SIMD_NEON vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { int n4 = o->windowSize & ~HV_N_SIMD_MASK; float32x4_t sum = vdupq_n_f32(0.0f); while (n4) { float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD); float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD); x = vmulq_f32(x, h); sum = vaddq_f32(sum, x); n4 -= HV_N_SIMD; } sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage); } #else // HV_SIMD_NONE o->buffer[o->numSamplesInBuffer] = (bIn*bIn); 
o->numSamplesInBuffer += HV_N_SIMD; if (o->numSamplesInBuffer >= o->windowSize) { float sum = 0.0f; for (int i = 0; i < o->windowSize; ++i) { sum += (o->hanningWeights[i] * o->buffer[i]); } sEnv_sendMessage(_c, o, sum, sendMessage); } #endif }