void experienceNet::addExperience_feature_sse(float cVal, float magnitude, size_t iFeature) {
  assert(cVal >= 0 && cVal <= 1.0);
  static const __m128 addOffsets = _mm_set_ps(0.0, 1.0, 2.0, 3.0);

  floatIterator_t cVals = vals.begin() + offsets[iFeature];

  const __m128 partitions = _mm_load_ps1(&partition);
  const __m128 maxNormalRecips = _mm_load_ps1(&maxNormalRecip);
  const __m128 magnitudes = _mm_load_ps1(&magnitude);
  __m128 cPartitions, result;

  for (size_t iTap = 0; iTap < eSet.numTaps; iTap += 4, cVals += 4) {
    // set to iTap,iTap,iTap,iTap
    cPartitions = _mm_set1_ps(iTap);
    // iTap+0, iTap+1, iTap+2, iTap+3
    cPartitions = _mm_add_ps(cPartitions, addOffsets);
    // partition[0], partition[1], partition[2], partition[3]
    cPartitions = _mm_mul_ps(cPartitions, partitions);

    // compute PDF of the normal distribution:
    normalPDF_sse((float*)&result, (float*)&cPartitions, cVal, eSet.extrapolation);

    // divide by maxNormal, multiply by magnitude:
    result = _mm_mul_ps(result, maxNormalRecips);
    result = _mm_mul_ps(result, magnitudes);

    // add recency/histogram:
    expAdd((float*)&*cVals, (float*)&*cVals, (float*)&result);
  }
}
void experienceNet::normalPDF_sse(float* result, const float* _partitions, float _mean, float _stdDev) {
  /* CODE ADAPTED FROM boost/math/normal.hpp:
     RealType exponent = x - mean;
     exponent *= -exponent;
     exponent /= 2 * sd * sd;
     result = exp(exponent);
     result /= sd * sqrt(2 * constants::pi<RealType>());
     return result; */
  const __m128& partitions = *(__m128*)_partitions;
  __m128 exponent, tmp, mean, sd;

  /* CODE ADAPTED FROM http://fastcpp.blogspot.com/2011/03/changing-sign-of-float-values-using-sse.html */
  static const __m128 signmask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
  static const __m128 twos = _mm_set_ps1(2.0f);
  static const __m128 sqrt_pi_2_s = _mm_set_ps1(sqrt(2.0 * M_PI));

  // store mean and sd:
  mean = _mm_load_ps1(&_mean);
  sd = _mm_load_ps1(&_stdDev);

  // exponent = x - mean
  exponent = _mm_sub_ps(partitions, mean);
  // exponent *= -exponent;
  tmp = _mm_xor_ps(exponent, signmask);
  exponent = _mm_mul_ps(exponent, tmp);
  // exponent /= 2 * sd * sd;
  tmp = _mm_mul_ps(sd, sd);
  tmp = _mm_mul_ps(tmp, twos);
  exponent = _mm_div_ps(exponent, tmp);
  // exponent = exp(exponent);
  exponent = _mm_exp_ps(exponent);
  // exponent /= sd * sqrt(2 * pi)
  tmp = _mm_mul_ps(sd, sqrt_pi_2_s);
  tmp = _mm_div_ps(exponent, tmp);

#ifndef NDEBUG
  const float* _result = (float*)&tmp;
  boost::math::normal_distribution<float> cNormal(_mean, _stdDev);
  assert(fastabs(_result[0] - boost::math::pdf(cNormal, _partitions[0])) < 0.001f);
  assert(fastabs(_result[1] - boost::math::pdf(cNormal, _partitions[1])) < 0.001f);
  assert(fastabs(_result[2] - boost::math::pdf(cNormal, _partitions[2])) < 0.001f);
  assert(fastabs(_result[3] - boost::math::pdf(cNormal, _partitions[3])) < 0.001f);
#endif

  // return result:
  _mm_store_ps(result, tmp);
};
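For cross-checking the vectorized math, here is a minimal scalar sketch of what each SSE lane computes (normalPDF_scalar is a hypothetical helper added for illustration, not part of the experienceNet class; note also that _mm_exp_ps above is a vendor/SVML-style intrinsic rather than a baseline SSE instruction):

#include <cmath>

// Minimal scalar sketch of the per-lane computation in normalPDF_sse.
// normalPDF_scalar is a hypothetical helper used only for illustration.
static float normalPDF_scalar(float x, float mean, float stdDev) {
    const float kSqrt2Pi = 2.50662827463f;       // sqrt(2 * pi)
    float exponent = x - mean;                   // x - mean
    exponent *= -exponent;                       // -(x - mean)^2
    exponent /= 2.0f * stdDev * stdDev;          // / (2 * sd^2)
    return std::exp(exponent) / (stdDev * kSqrt2Pi);
}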
static void scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
static void SSE2_StereoMixToFloat(const int32 *pSrc, float *pOut1, float *pOut2, uint32 nCount, const float _i2fc)
//----------------------------------------------------------------------------------------------------------------
{
  __m128 i2fc = _mm_load_ps1(&_i2fc);
  const __m128i *in = reinterpret_cast<const __m128i *>(pSrc);

  // We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
  nCount = (nCount + 3) / 4;
  do
  {
    __m128i i1 = _mm_loadu_si128(in);      // Load four integer values, LRLR
    __m128i i2 = _mm_loadu_si128(in + 1);  // Load four integer values, LRLR
    in += 2;
    __m128 f1 = _mm_cvtepi32_ps(i1);       // Convert to four floats, LRLR
    __m128 f2 = _mm_cvtepi32_ps(i2);       // Convert to four floats, LRLR
    f1 = _mm_mul_ps(f1, i2fc);             // Apply int->float factor
    f2 = _mm_mul_ps(f2, i2fc);             // Apply int->float factor
    __m128 fl = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(2, 0, 2, 0));  // LRLR+LRLR => LLLL
    __m128 fr = _mm_shuffle_ps(f1, f2, _MM_SHUFFLE(3, 1, 3, 1));  // LRLR+LRLR => RRRR
    _mm_storeu_ps(pOut1, fl);              // Store four float values, LLLL
    _mm_storeu_ps(pOut2, fr);              // Store four float values, RRRR
    pOut1 += 4;
    pOut2 += 4;
  } while(--nCount);
}
static void SSE2_FloatToStereoMix(const float *pIn1, const float *pIn2, int32 *pOut, uint32 nCount, const float _f2ic)
//--------------------------------------------------------------------------------------------------------------------
{
  __m128 f2ic = _mm_load_ps1(&_f2ic);
  __m128i *out = reinterpret_cast<__m128i *>(pOut);

  // We may read beyond the wanted length... this works because we know that we will always work on our buffers of size MIXBUFFERSIZE
  nCount = (nCount + 3) / 4;
  do
  {
    __m128 fl = _mm_loadu_ps(pIn1);        // Load four float values, LLLL
    __m128 fr = _mm_loadu_ps(pIn2);        // Load four float values, RRRR
    pIn1 += 4;
    pIn2 += 4;
    fl = _mm_mul_ps(fl, f2ic);             // Apply float->int factor
    fr = _mm_mul_ps(fr, f2ic);             // Apply float->int factor
    __m128 f1 = _mm_unpacklo_ps(fl, fr);   // LL__+RR__ => LRLR
    __m128 f2 = _mm_unpackhi_ps(fl, fr);   // __LL+__RR => LRLR
    __m128i i1 = _mm_cvtps_epi32(f1);      // Convert to four ints
    __m128i i2 = _mm_cvtps_epi32(f2);      // Convert to four ints
    _mm_storeu_si128(out, i1);             // Store four int values, LRLR
    _mm_storeu_si128(out + 1, i2);         // Store four int values, LRLR
    out += 2;
  } while(--nCount);
}
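As a plain-C reference for the pair of mixers above, a scalar sketch of the float-to-int direction follows (names are illustrative; std::lrintf mirrors the default round-to-nearest behaviour of _mm_cvtps_epi32):

#include <cmath>
#include <cstdint>

// Scalar sketch of SSE2_FloatToStereoMix: scale, interleave L/R, convert to int.
static void FloatToStereoMix_scalar(const float *inL, const float *inR,
                                    int32_t *out, uint32_t count, float f2ic) {
    for (uint32_t i = 0; i < count; ++i) {
        out[2 * i + 0] = (int32_t)std::lrintf(inL[i] * f2ic);   // left sample
        out[2 * i + 1] = (int32_t)std::lrintf(inR[i] * f2ic);   // right sample
    }
}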
Vect4D_SIMD Vect4D_SIMD::operator * (const Matrix_SIMD &M)
{
  /*
      x Values =  x*m.m0    x*m.m1    x*m.m2    x*m.m3
                    +         +         +         +
      y Values =  y*m.m4    y*m.m5    y*m.m6    y*m.m7
                    +         +         +         +
      z Values =  z*m.m8    z*m.m9    z*m.m10   z*m.m11
                    +         +         +         +
      W Values =  w*m.m12   w*m.m13   w*m.m14   w*m.m15

               =   C.x       C.y       C.z       C.w     (added from top to bottom)
  */
  __m128 xVals = _mm_mul_ps(M.v0.m, _mm_load_ps1(&x));
  __m128 yVals = _mm_mul_ps(M.v1.m, _mm_load_ps1(&y));
  __m128 zVals = _mm_mul_ps(M.v2.m, _mm_load_ps1(&z));
  __m128 wVals = _mm_mul_ps(M.v3.m, _mm_load_ps1(&w));

  return Vect4D_SIMD(_mm_add_ps(_mm_add_ps(xVals, yVals), _mm_add_ps(zVals, wVals)));
};
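Written out in scalar form, the broadcast pattern above is the usual row-weighted sum; the sketch below assumes a row-major float[16] matrix, which is an assumption about Matrix_SIMD's layout rather than something stated in the snippet:

// Scalar sketch of the row-broadcast vector * matrix product.
// Assumes row-major storage: row 0 = m[0..3], row 1 = m[4..7], and so on.
static void vec4_mul_mat4(const float v[4], const float m[16], float out[4]) {
    for (int c = 0; c < 4; ++c) {
        out[c] = v[0] * m[0 * 4 + c]    // x scales row 0
               + v[1] * m[1 * 4 + c]    // y scales row 1
               + v[2] * m[2 * 4 + c]    // z scales row 2
               + v[3] * m[3 * 4 + c];   // w scales row 3
    }
}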
void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF, int imgFOfssetH, int imgFOfssetW, float* filter, float *imgFloatSrc, float *imgFloatDst)
{
  //1.
  const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
  //2.
  const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0);
  //3.
  __declspec(align(16)) __m128 filter_l[FILTER_SIZE];

#pragma omp parallel for
  for (auto i = 0; i < FILTER_SIZE; i++)
  {
    // load the same value into all 4 floats
    // float -> m128 conversion
    filter_l[i] = _mm_load_ps1(filter + i);
  }

  const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2;
  const auto imgWidthbyte = imgWidth << 2;
  const auto imgWidthFbyte = imgWidthF << 2;
  const auto imgLengthbyte = imgHeight * imgWidthbyte;

  //4.
  register __declspec(align(16)) __m128 a_sse;
  //8. reg
  register __declspec(align(16)) __m128 r_sse;

#pragma omp parallel for
  for (auto row = 0; row < imgLengthbyte; row += 4)
  {
    // accumulator for the RGBA components
    r_sse = _mm_setzero_ps();

    // convolution over every component
    for (auto y = 0; y < FILTER_H; y++)
    {
      r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)),      filter_l[5 * y]));
      r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)),  filter_l[1 + 5 * y]));
      r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)),  filter_l[2 + 5 * y]));
      r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y]));
      r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y]));
    }

    a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte);

    // clamp the result of the computation to the 0-255 range
    // write the output pixel
    _mm_store_ps(imgFloatDst + rw_base + row,
                 _mm_min_ps(const_255,
                            _mm_add_ps(a_sse,
                                       _mm_max_ps(const_0,
                                                  _mm_sub_ps(a_sse,
                                                             _mm_min_ps(const_255, _mm_max_ps(const_0, r_sse)))))));
  }
}
void experienceNet::decayExperience() {
  if (mostlyEQ(eSet.decay, 1.0f)) { return; } // exit early if no change

  const __m128 decays = _mm_load_ps1(&eSet.decay);

  for (__m128* cVal = (__m128*)&vals.front(), *endVal = (__m128*)(&vals.front() + vals.size()); cVal != endVal; ++cVal) {
    *cVal = _mm_mul_ps(*cVal, decays);
  };

  /*// TODO: SIMD
  for (size_t iVal = 0; iVal != vals.size(); ++iVal) {
    vals[iVal] *= eSet.decay;
  }*/
};
SSE_FUNCTION static void scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *val;
  }
}
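A short usage sketch for the kernel above (the buffer setup is hypothetical; the destination does not need to start 16-byte aligned because the scalar prologue and epilogue handle the unaligned head and tail):

#include <vector>

// Hypothetical driver: adds a constant offset to every element of a buffer.
static void example_scalaradd() {
    std::vector<float> src(1000, 1.0f), dst(1000, 0.0f);
    float offset = 2.5f;
    scalaradd_f32_ns_sse(dst.data(), src.data(), &offset, (int)src.size());
    // afterwards: dst[i] == src[i] + offset for every i
}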
/*---------------------------------------------------------------------------*/
__m128 TTriangle::THit::HitTest4(__m128 mask, const TPoint4& orig, const D3DXVECTOR3& d, HitResult4* result) const
{
  int u, v, w;
  w = ci;
  u = w == 0 ? 1 : 0;
  v = w == 2 ? 1 : 2;

  __m128 nu = _mm_load_ps1(&this->nu);
  __m128 np = _mm_load_ps1(&this->np);
  __m128 nv = _mm_load_ps1(&this->nv);
  __m128 pu = _mm_load_ps1(&this->pu);
  __m128 pv = _mm_load_ps1(&this->pv);
  __m128 e0u = _mm_load_ps1(&this->e0u);
  __m128 e0v = _mm_load_ps1(&this->e0v);
  __m128 e1u = _mm_load_ps1(&this->e1u);
  __m128 e1v = _mm_load_ps1(&this->e1v);

  __m128 ou = orig[u];
  __m128 ov = orig[v];
  __m128 ow = orig[w];

  __m128 du = _mm_load_ps1(&d[u]);
  __m128 dv = _mm_load_ps1(&d[v]);
  __m128 dw = _mm_load_ps1(&d[w]);

  __m128 dett = np - (ou*nu + ov*nv + ow);
  __m128 det = du*nu + dv*nv + dw;
  __m128 Du = du*dett - (pu - ou)*det;
  __m128 Dv = dv*dett - (pv - ov)*det;
  __m128 detu = (e1v*Du - e1u*Dv);
  __m128 detv = (e0u*Dv - e0v*Du);

  __m128 tmpdet0 = det - detu - detv;

  __m128 detMask = _mm_xor_ps(_mm_xor_ps(tmpdet0, detv) | _mm_xor_ps(detv, detu), g_one4) > _mm_setzero_ps();
  mask = mask & detMask;

  __m128 rdet = _mm_rcp_ps(det);
  result->t = dett * rdet;
  result->u = detu * rdet;
  result->v = detv * rdet;

  return mask & (result->t > _mm_setzero_ps());
}
inline void sse_micro_kernel<float>(float const *buffer_A, float const *buffer_B, float *buffer_C, vcl_size_t num_micro_slivers, vcl_size_t mr, vcl_size_t nr) { assert( (mr == MR_F) && (nr == NR_F) && bool("mr and nr obtained by 'get_block_sizes()' in 'matrix_operations.hpp' and given to 'avx_micro_kernel()' do not match with MR_F/NR_F defined in 'gemm_avx_micro_kernel.hpp' ") ); __m128 xmm0 , xmm1 , xmm2 , xmm3 ; __m128 xmm4 , xmm5 , xmm6 , xmm7 ; __m128 xmm8 , xmm9 , xmm10, xmm11; __m128 xmm12, xmm13, xmm14, xmm15; for (vcl_size_t l=0; l<num_micro_slivers; ++l) { xmm0 = _mm_load_ps(buffer_B+l*NR_F); xmm1 = _mm_load_ps(buffer_B+l*NR_F+4); xmm2 = _mm_load_ps1(buffer_A+l*MR_F); xmm3 = _mm_mul_ps(xmm0, xmm2); xmm4 = _mm_mul_ps(xmm1, xmm2); xmm2 = _mm_load_ps1(buffer_A+l*MR_F+1); xmm5 = _mm_mul_ps(xmm0, xmm2); xmm6 = _mm_mul_ps(xmm1, xmm2); xmm2 = _mm_load_ps1(buffer_A+l*MR_F+2); xmm7 = _mm_mul_ps(xmm0, xmm2); xmm8 = _mm_mul_ps(xmm1, xmm2); xmm2 = _mm_load_ps1(buffer_A+l*MR_F+3); xmm9 = _mm_mul_ps(xmm0, xmm2); xmm10 = _mm_mul_ps(xmm1, xmm2); xmm2 = _mm_load_ps1(buffer_A+l*MR_F+4); xmm11 = _mm_mul_ps(xmm0, xmm2); xmm12 = _mm_mul_ps(xmm1, xmm2); xmm2 = _mm_load_ps1(buffer_A+l*MR_F+5); xmm13 = _mm_mul_ps(xmm0, xmm2); xmm14 = _mm_mul_ps(xmm1, xmm2); /* free registers by storing their results */ xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(0)); xmm15 = _mm_add_ps(xmm15, xmm3); _mm_store_ps(buffer_C+C0_ROW_F(0), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(0)); xmm15 = _mm_add_ps(xmm15, xmm4); _mm_store_ps(buffer_C+C1_ROW_F(0), xmm15); /* continue calculating */ xmm2 = _mm_load_ps1(buffer_A+l*MR_F+6); xmm3 = _mm_mul_ps(xmm0, xmm2); xmm4 = _mm_mul_ps(xmm1, xmm2); /* free registers by storing their results */ xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(1)); xmm15 = _mm_add_ps(xmm15, xmm5); _mm_store_ps(buffer_C+C0_ROW_F(1), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(1)); xmm15 = _mm_add_ps(xmm15, xmm6); _mm_store_ps(buffer_C+C1_ROW_F(1), xmm15); /* continue calculating */ xmm2 = _mm_load_ps1(buffer_A+l*MR_F+7); xmm5 = _mm_mul_ps(xmm0, xmm2); xmm6 = _mm_mul_ps(xmm1, xmm2); /* store the rest of the results */ xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(2)); xmm15 = _mm_add_ps(xmm15, xmm7); _mm_store_ps(buffer_C+C0_ROW_F(2), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(2)); xmm15 = _mm_add_ps(xmm15, xmm8); _mm_store_ps(buffer_C+C1_ROW_F(2), xmm15); xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(3)); xmm15 = _mm_add_ps(xmm15, xmm9); _mm_store_ps(buffer_C+C0_ROW_F(3), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(3)); xmm15 = _mm_add_ps(xmm15, xmm10); _mm_store_ps(buffer_C+C1_ROW_F(3), xmm15); xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(4)); xmm15 = _mm_add_ps(xmm15, xmm11); _mm_store_ps(buffer_C+C0_ROW_F(4), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(4)); xmm15 = _mm_add_ps(xmm15, xmm12); _mm_store_ps(buffer_C+C1_ROW_F(4), xmm15); xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(5)); xmm15 = _mm_add_ps(xmm15, xmm13); _mm_store_ps(buffer_C+C0_ROW_F(5), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(5)); xmm15 = _mm_add_ps(xmm15, xmm14); _mm_store_ps(buffer_C+C1_ROW_F(5), xmm15); xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(6)); xmm15 = _mm_add_ps(xmm15, xmm3); _mm_store_ps(buffer_C+C0_ROW_F(6), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(6)); xmm15 = _mm_add_ps(xmm15, xmm4); _mm_store_ps(buffer_C+C1_ROW_F(6), xmm15); xmm15 = _mm_load_ps(buffer_C+C0_ROW_F(7)); xmm15 = _mm_add_ps(xmm15, xmm5); _mm_store_ps(buffer_C+C0_ROW_F(7), xmm15); xmm15 = _mm_load_ps(buffer_C+C1_ROW_F(7)); xmm15 = _mm_add_ps(xmm15, xmm6); _mm_store_ps(buffer_C+C1_ROW_F(7), 
xmm15); }//for }//sse_micro_kernel()
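Stripped of the register scheduling, the micro-kernel above is a sequence of rank-1 updates accumulated into a small C tile; a scalar sketch follows (the plain row-major addressing of buffer_C is an assumption standing in for the C0_ROW_F/C1_ROW_F macros, and mr/nr correspond to MR_F/NR_F):

// Scalar sketch of the packed GEMM micro-kernel: for each micro-sliver l,
// accumulate the outer product of an mr-long column of A with an nr-long
// row of B into the mr x nr tile of C (row-major layout assumed here).
static void micro_kernel_scalar(const float *A, const float *B, float *C,
                                size_t num_micro_slivers, size_t mr, size_t nr) {
    for (size_t l = 0; l < num_micro_slivers; ++l)
        for (size_t i = 0; i < mr; ++i)
            for (size_t j = 0; j < nr; ++j)
                C[i * nr + j] += A[l * mr + i] * B[l * nr + j];
}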
static void FilterAdaptationSSE2(aec_t *aec, float *fft, float ef[2][PART_LEN1]) { int i, j; for (i = 0; i < NR_PART; i++) { int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1); int pos = i * PART_LEN1; // Check for wrap if (i + aec->xfBufBlockPos >= NR_PART) { xPos -= NR_PART * PART_LEN1; } #ifdef UNCONSTR for (j = 0; j < PART_LEN1; j++) { aec->wfBuf[pos + j][0] += MulRe(aec->xfBuf[xPos + j][0], -aec->xfBuf[xPos + j][1], ef[j][0], ef[j][1]); aec->wfBuf[pos + j][1] += MulIm(aec->xfBuf[xPos + j][0], -aec->xfBuf[xPos + j][1], ef[j][0], ef[j][1]); } #else // Process the whole array... for (j = 0; j < PART_LEN; j+= 4) { // Load xfBuf and ef. const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]); const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]); const __m128 ef_re = _mm_loadu_ps(&ef[0][j]); const __m128 ef_im = _mm_loadu_ps(&ef[1][j]); // Calculate the product of conjugate(xfBuf) by ef. // re(conjugate(a) * b) = aRe * bRe + aIm * bIm // im(conjugate(a) * b)= aRe * bIm - aIm * bRe const __m128 a = _mm_mul_ps(xfBuf_re, ef_re); const __m128 b = _mm_mul_ps(xfBuf_im, ef_im); const __m128 c = _mm_mul_ps(xfBuf_re, ef_im); const __m128 d = _mm_mul_ps(xfBuf_im, ef_re); const __m128 e = _mm_add_ps(a, b); const __m128 f = _mm_sub_ps(c, d); // Interleave real and imaginary parts. const __m128 g = _mm_unpacklo_ps(e, f); const __m128 h = _mm_unpackhi_ps(e, f); // Store _mm_storeu_ps(&fft[2*j + 0], g); _mm_storeu_ps(&fft[2*j + 4], h); } // ... and fixup the first imaginary entry. fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], -aec->xfBuf[1][xPos + PART_LEN], ef[0][PART_LEN], ef[1][PART_LEN]); aec_rdft_inverse_128(fft); memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN); // fft scaling { float scale = 2.0f / PART_LEN2; const __m128 scale_ps = _mm_load_ps1(&scale); for (j = 0; j < PART_LEN; j+=4) { const __m128 fft_ps = _mm_loadu_ps(&fft[j]); const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); _mm_storeu_ps(&fft[j], fft_scale); } } aec_rdft_forward_128(fft); { float wt1 = aec->wfBuf[1][pos]; aec->wfBuf[0][pos + PART_LEN] += fft[1]; for (j = 0; j < PART_LEN; j+= 4) { __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0)); const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1)); wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re); _mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im); } aec->wfBuf[1][pos] = wt1; } #endif // UNCONSTR } }
static void FilterAdaptationSSE2( int num_partitions, int x_fft_buf_block_pos, float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float e_fft[2][PART_LEN1], float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) { float fft[PART_LEN2]; int i, j; for (i = 0; i < num_partitions; i++) { int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1); int pos = i * PART_LEN1; // Check for wrap if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // Process the whole array... for (j = 0; j < PART_LEN; j += 4) { // Load x_fft_buf and e_fft. const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]); const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]); const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]); const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]); // Calculate the product of conjugate(x_fft_buf) by e_fft. // re(conjugate(a) * b) = aRe * bRe + aIm * bIm // im(conjugate(a) * b)= aRe * bIm - aIm * bRe const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re); const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im); const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im); const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re); const __m128 e = _mm_add_ps(a, b); const __m128 f = _mm_sub_ps(c, d); // Interleave real and imaginary parts. const __m128 g = _mm_unpacklo_ps(e, f); const __m128 h = _mm_unpackhi_ps(e, f); // Store _mm_storeu_ps(&fft[2 * j + 0], g); _mm_storeu_ps(&fft[2 * j + 4], h); } // ... and fixup the first imaginary entry. fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN], e_fft[0][PART_LEN], e_fft[1][PART_LEN]); aec_rdft_inverse_128(fft); memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN); // fft scaling { float scale = 2.0f / PART_LEN2; const __m128 scale_ps = _mm_load_ps1(&scale); for (j = 0; j < PART_LEN; j += 4) { const __m128 fft_ps = _mm_loadu_ps(&fft[j]); const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); _mm_storeu_ps(&fft[j], fft_scale); } } aec_rdft_forward_128(fft); { float wt1 = h_fft_buf[1][pos]; h_fft_buf[0][pos + PART_LEN] += fft[1]; for (j = 0; j < PART_LEN; j += 4) { __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]); __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]); const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1)); wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re); _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im); } h_fft_buf[1][pos] = wt1; } } }
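The four multiplies and two add/sub steps in the inner loop come straight from the conjugate-product identity quoted in the comments; in scalar form (an illustrative helper, not the WebRTC MulRe/MulIm functions themselves):

// Scalar sketch of conjugate(x) * e as used in the filter adaptation loop:
//   re(conj(a) * b) = aRe * bRe + aIm * bIm
//   im(conj(a) * b) = aRe * bIm - aIm * bRe
static void conj_mul(float aRe, float aIm, float bRe, float bIm,
                     float *outRe, float *outIm) {
    *outRe = aRe * bRe + aIm * bIm;
    *outIm = aRe * bIm - aIm * bRe;
}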
HRESULT CGraphics::Update(void) { AUTO_SECTION(m_UpdateSection); FLOAT factor = min(GetElapsedTime() * m_Speed,1.0f); if(IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE)) { // Update the levels for(UINT i = 0; i < VISUALIZATION_BARCOUNT; i += 4) { //SSELerpArr(m_Levels + i,m_Levels + i,m_LevelsBuffer + i,factor); _mm_store_ps(m_Levels + i,_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_load_ps(m_LevelsBuffer + i),_mm_load_ps(m_Levels + i)),_mm_load_ps1(&factor)),_mm_load_ps(m_Levels + i))); } // Update the waveform for(UINT i = 0; i < SA_BUFFER_SIZE; i += 4) { //SSELerpArr(m_Waveform + i,m_Waveform + i,m_WaveformBuffer + i,factor); _mm_store_ps(m_Waveform + i,_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_load_ps(m_WaveformBuffer + i),_mm_load_ps(m_Waveform + i)),_mm_load_ps1(&factor)),_mm_load_ps(m_Waveform + i))); } } else { // Update the levels for(UINT i = 0; i < VISUALIZATION_BARCOUNT; ++i) m_Levels[i] = m_Levels[i] + min(GetElapsedTime() * m_Speed,1.0f) * (m_LevelsBuffer[i] - m_Levels[i]); // Update the waveform for(UINT i = 0; i < SA_BUFFER_SIZE; ++i) m_Waveform[i] = m_Waveform[i] + min(GetElapsedTime() * m_Speed,1.0f) * (m_WaveformBuffer[i] - m_Waveform[i]); } // Go through all the peaks and update each for(UINT i = 0; i < VISUALIZATION_BARCOUNT; ++i) { // Update the position and velocity if(m_Peaks[i].timeout <= 0.0f) { m_Peaks[i].position += m_Peaks[i].velocity * min(GetElapsedTime(),1.0f); m_Peaks[i].velocity += m_PeakGravity * min(GetElapsedTime(),1.0f); } else m_Peaks[i].timeout -= min(GetElapsedTime(),1.0f); // Check if it has collided with a bar if(m_Peaks[i].position < m_Levels[i]) { m_Peaks[i].position = m_Levels[i]; m_Peaks[i].velocity = 0.0f; m_Peaks[i].timeout = m_PeakTimeout; } } return S_OK; }
void mBior53::transcols(char** dest, char** sour, unsigned int w, unsigned int h) const { float fz = 0.0f; int n; float s, d; __m128 ms, md; unsigned int h2 = h / 2; const vec1D& tH = gettH(); const vec1D& tG = gettG(); for (unsigned int x = 0; x < w / 4; x++) { //x<w/4 x = 4*x for (unsigned int k = 0; k < h2; k++) { ms = _mm_load_ss(&fz); md = ms; for (int m = -2; m <= 2; m++) { n = 2 * k + m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); ms = _mm_add_ps(ms, _mm_mul_ps(_mm_load_ps1(tH.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x])))); } for (int m = 0; m <= 2; m++) { n = 2 * k + m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); md = _mm_add_ps(md, _mm_mul_ps(_mm_load_ps1(tG.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x])))); } if (4*x < w / 2) { if ((w / 2) - (4*x) >= 4) mmxround4(&dest[k][4*x], ms); else mmxround4TH(&dest[k][4*x], ms, (w / 2) - (4*x)); //skip first from LL part 10/2-4=1 [lo] o o o o * | * * * o o [hi] } else mmxround4TH(&dest[k][4*x], ms); mmxround4TH(&dest[k+h2][4*x], md); } } _mm_empty(); //odd remainder for (unsigned int x = w - (w % 4); x < w; x++) { for (unsigned int k = 0; k < h2; k++) { s = 0; d = 0; for (int m = -2; m <= 2; m++) { n = 2 * k + m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); s += tH[m] * float(sour[n][x]); } for (int m = 0; m <= 2; m++) { n = 2 * k + m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); d += tG[m] * float(sour[n][x]); } if (x < w / 2) dest[k][x] = mmxround(s); else dest[k][x] = mmxroundTH(s); //is this needed? hi band were TH'ed on transrows dest[k+h2][x] = mmxroundTH(d); //is this needed? hi band were TH'ed on transrows on x>w/2 } } }
void SaxpyKernel::SaxPy_SSE(float* out, float a, float* x, float* y, int n)
{
  // Validate alignment
  /*
  if( size_t(out) % 16 != 0 ||
      size_t(x) % 16 != 0 ||
      size_t(y) % 16 != 0 ||
      n % 4 != 0 )
  {
    printf("Input data is not aligned, cannot use SSE acceleration!");
    return;
  }
  */
  __m128 _x1, _x2, _o1, _o2, _a;
  _a = _mm_load_ps1(&a);

  int i = 0;
  //_mm_prefetch((const char*)x, _MM_HINT_T0);
  //_mm_prefetch((const char*)y, _MM_HINT_T0);
  for(i = 0; i < n; i += 8){
    _x1 = _mm_load_ps(x);
    _x2 = _mm_load_ps(x+4);
    _o1 = _x1;
    _o2 = _x2;
    for(int j = 0; j < 6; ++j)
    {
      _x1 = _mm_mul_ps(_o1, _a); _o1 = _mm_add_ps(_x1, _o1);
      _x2 = _mm_mul_ps(_o2, _a); _o2 = _mm_add_ps(_x2, _o2);
      _x1 = _mm_mul_ps(_o1, _a); _o1 = _mm_add_ps(_x1, _o1);
      _x2 = _mm_mul_ps(_o2, _a); _o2 = _mm_add_ps(_x2, _o2);
      _x1 = _mm_mul_ps(_o1, _a); _o1 = _mm_add_ps(_x1, _o1);
      _x2 = _mm_mul_ps(_o2, _a); _o2 = _mm_add_ps(_x2, _o2);
      _x1 = _mm_mul_ps(_o1, _a); _o1 = _mm_add_ps(_x1, _o1);
      _x2 = _mm_mul_ps(_o2, _a); _o2 = _mm_add_ps(_x2, _o2);
      _x1 = _mm_mul_ps(_o1, _a); _o1 = _mm_add_ps(_x1, _o1);
      _x2 = _mm_mul_ps(_o2, _a); _o2 = _mm_add_ps(_x2, _o2);
    }
    _mm_stream_ps(out, _o1);
    _mm_stream_ps(out+4, _o2);
    x += 8;
    out += 8;
  }
  if( n % 2 == 1){
    __m128 _x = _mm_load_ps(x);
    __m128 _y = _mm_load_ps(y);
    __m128 _o = _mm_mul_ps(_x, _a);
    _o = _mm_add_ps(_o, _y);
    _mm_store_ps(out, _o);
  }
}
int main() { float *arr = get_arr(); // [4, 3, 2, 1] float *uarr = get_uarr(); // [5, 4, 3, 2] float *arr2 = get_arr2(); // [4, 3, 2, 1] float *uarr2 = get_uarr2(); // [5, 4, 3, 2] __m128 a = get_a(); // [8, 6, 4, 2] __m128 b = get_b(); // [1, 2, 3, 4] // Check that test data is like expected. Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned. Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned. // Test that aeq itself works and does not trivially return true on everything. Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false); #ifdef TEST_M64 Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false); #endif // SSE1 Load instructions: aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address. aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide. aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest. aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1 aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest. aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest. aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order. aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address. // SSE1 Set instructions: aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands. aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded. aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher. aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1 aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order. aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register. // SSE1 Move instructions: aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b. aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output. aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output. // SSE1 Store instructions: #ifdef TEST_M64 /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value. /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64. #endif _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address. 
_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory. _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1 _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory. _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output. _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address. #ifdef TEST_M64 /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint. #endif _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint. // SSE1 Arithmetic instructions: aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add. aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a. aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div. aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a. aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul. aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a. #ifdef TEST_M64 __m64 m1 = get_m1(); /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts. /*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16. __m64 m2 = get_m2(); /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar. /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8. #endif aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub. aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a. // SSE1 Elementary Math functions: #ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass. aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x. aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged. aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x). aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged. #endif aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x). 
aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged. __m128 i1 = get_i1(); __m128 i2 = get_i2(); // SSE1 Logical instructions: #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2 aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR #endif // SSE1 Compare instructions: // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp == aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged. aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >= aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged. aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp > aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged. aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <= aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged. aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp < aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged. aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp != aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged. aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >= aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged. aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not > aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged. aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <= aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged. aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not < aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged. __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN] __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0] aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan. 
aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged. // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan. #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged. #endif Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int. Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int. Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int. Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int. Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int. Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int. // The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP // exception when one of the input operands is either a QNaN or a SNaN. #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1); #endif Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0); Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0); Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1); Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1); #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0); #endif // SSE1 Convert instructions: __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 e = get_e(); // [INF, -INF, 2.5, 3.5] __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808] #ifdef TEST_M64 /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128. /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64. #endif aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128. aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss. #ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions. Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int. Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32. 
#endif #ifdef TEST_M64 /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged. /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float. /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128. /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64. /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64. /*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64. /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128. #endif aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged. Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float. Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64. #endif Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32. Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64. #endif Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64. #ifndef __EMSCRIPTEN__ // TODO: Not implemented. // SSE1 General support: unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask); unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode); unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode); unsigned int csr = _mm_getcsr(); _mm_setcsr(csr); unsigned char dummyData[4096]; _mm_prefetch(dummyData, _MM_HINT_T0); _mm_prefetch(dummyData, _MM_HINT_T1); _mm_prefetch(dummyData, _MM_HINT_T2); _mm_prefetch(dummyData, _MM_HINT_NTA); _mm_sfence(); #endif // SSE1 Misc instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64. /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8. #endif Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels. // SSE1 Probability/Statistics instructions: #ifdef TEST_M64 /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s. 
/*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16. /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s. /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8. // SSE1 Special Math instructions: /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16. /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8. /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // is an alias to _mm_min_pi16. /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // is an alias to _mm_min_pu8. #endif // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max. aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged. aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min. aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged. // SSE1 Swizzle instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64. /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16. /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64. /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16. /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64. /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16. #endif aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Transposing a matrix via the xmmintrin.h-provided intrinsic. __m128 c0 = a; // [8, 6, 4, 2] __m128 c1 = b; // [1, 2, 3, 4] __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5] _MM_TRANSPOSE4_PS(c0, c1, c2, c3); aeq(c0, 2.5f, 4.5f, 4.f, 2.f); aeq(c1, 4.5f, 3.5f, 3.f, 4.f); aeq(c2, 6.5f, 2.5f, 2.f, 6.f); aeq(c3, 8.5f, 1.5f, 1.f, 8.f); // All done! if (numFailures == 0) printf("Success!\n"); else printf("%d tests failed!\n", numFailures); }
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi) { __m128 vx = _mm_load_ps1(&v.x); __m128 vy = _mm_load_ps1(&v.y); __m128 vz = _mm_load_ps1(&v.z); __m128 po = _mm_setzero_ps(); // p __m128 nx = _mm_setzero_ps(); __m128 ny = _mm_setzero_ps(); __m128 nz = _mm_setzero_ps(); __m128 akkur = _mm_setzero_ps(); __m128 akkug = _mm_setzero_ps(); __m128 akkub = _mm_setzero_ps(); __m128 akkua = _mm_setzero_ps(); __m128 s255 = _mm_set_ps1(255.0f); sBool good = 0; for(sInt i=0;i<fi.pn4;i++) { const T::SimdType *part = fi.parts4 + i; __m128 dx = _mm_sub_ps(vx,part->x); __m128 dy = _mm_sub_ps(vy,part->y); __m128 dz = _mm_sub_ps(vz,part->z); __m128 ddx = _mm_mul_ps(dx,dx); __m128 ddy = _mm_mul_ps(dy,dy); __m128 ddz = _mm_mul_ps(dz,dz); __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz); if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0) { __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4); __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps()); po = _mm_add_ps(po,pp3); // p = p+pp; __m128 pp4 = _mm_mul_ps(pp3,pp3); // pp*pp nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx)); // n += d*(pp*pp) ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy)); nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz)); if(T::Color==1) { akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr)); akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg)); akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb)); good = 1; } } } sF32 p = 0; sVector30 n; _MM_TRANSPOSE4_PS(po,nx,ny,nz); __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po); n.x = r.m128_f32[1]; n.y = r.m128_f32[2]; n.z = r.m128_f32[3]; p = r.m128_f32[0]; if(p==0) n.Init(0,0,0); else n.UnitFast(); pot.x = n.x; pot.y = n.y; pot.z = n.z; pot.w = p-fi.iso; if(T::Color) { if(good) { r = _mm_mul_ss(s255,_mm_rcp_ss(r)); // r = _mm_rcp_ss(r); _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua); __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua); r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00)); __m128i r3 = _mm_cvtps_epi32(r2); r3 = _mm_packs_epi32(r3,r3); __m128i r4 = _mm_packus_epi16(r3,r3); pot.c = r4.m128i_u32[0]|0xff000000; } else { pot.c = 0; } } }
void RNMarchingCubesBase<T>::RenderT(sInt start,sInt count,sInt thread) { for(sInt i_=start;i_<start+count;i_++) { HashContainer *hc = ThreadHashConts[i_]; PartContainer *con = hc->FirstPart; const sInt s = 1<<base; const sInt m = (s+1); const sInt mm = (s+1)*(s+1); sF32 S = Para.GridSize/s; sVector31 tpos(hc->IX*Para.GridSize,hc->IY*Para.GridSize,hc->IZ*Para.GridSize); // sInt size = (s+2)*(s+1)*(s+1); typename T::FieldType *pot = PotData[thread]; funcinfo fi; // calculate potential and normal sClear(fi); fi.tresh = 1/(Para.Influence*Para.Influence); fi.treshf = 1.0f/fi.tresh-0.00001f; fi.iso = Para.IsoValue; // reorganize array for SIMD sInt pn4 = 0; PartContainer *cp = con; while(cp) { pn4 += (cp->Count+3)/4; cp = cp->Next; } fi.tresh4 = _mm_load_ps1(&fi.tresh); fi.treshf4 = _mm_load_ps1(&fi.treshf); fi.one = _mm_set_ps1(1.0f); fi.epsilon = _mm_set_ps1(0.01f); fi.pn4 = pn4; fi.parts4 = SimdParts[thread]; sInt i4 = 0; typename T::PartType far; far.x = 1024*1024; far.y = 0; far.z = 0; cp = con; while(cp) { sInt pn = cp->Count; typename T::PartType *p = cp->Parts; switch(pn&3) { case 1: p[pn+2] = far; case 2: p[pn+1] = far; case 3: p[pn+0] = far; case 0: break; } for(sInt i=0;i<(pn+3)/4;i++) { fi.parts4[i4].x.m128_f32[0] = p[0].x; fi.parts4[i4].x.m128_f32[1] = p[1].x; fi.parts4[i4].x.m128_f32[2] = p[2].x; fi.parts4[i4].x.m128_f32[3] = p[3].x; fi.parts4[i4].y.m128_f32[0] = p[0].y; fi.parts4[i4].y.m128_f32[1] = p[1].y; fi.parts4[i4].y.m128_f32[2] = p[2].y; fi.parts4[i4].y.m128_f32[3] = p[3].y; fi.parts4[i4].z.m128_f32[0] = p[0].z; fi.parts4[i4].z.m128_f32[1] = p[1].z; fi.parts4[i4].z.m128_f32[2] = p[2].z; fi.parts4[i4].z.m128_f32[3] = p[3].z; if(T::Color) { fi.parts4[i4].cr.m128_f32[0] = ((p[0].c>>16)&255)/255.0f; fi.parts4[i4].cr.m128_f32[1] = ((p[1].c>>16)&255)/255.0f; fi.parts4[i4].cr.m128_f32[2] = ((p[2].c>>16)&255)/255.0f; fi.parts4[i4].cr.m128_f32[3] = ((p[3].c>>16)&255)/255.0f; fi.parts4[i4].cg.m128_f32[0] = ((p[0].c>> 8)&255)/255.0f; fi.parts4[i4].cg.m128_f32[1] = ((p[1].c>> 8)&255)/255.0f; fi.parts4[i4].cg.m128_f32[2] = ((p[2].c>> 8)&255)/255.0f; fi.parts4[i4].cg.m128_f32[3] = ((p[3].c>> 8)&255)/255.0f; fi.parts4[i4].cb.m128_f32[0] = ((p[0].c>> 0)&255)/255.0f; fi.parts4[i4].cb.m128_f32[1] = ((p[1].c>> 0)&255)/255.0f; fi.parts4[i4].cb.m128_f32[2] = ((p[2].c>> 0)&255)/255.0f; fi.parts4[i4].cb.m128_f32[3] = ((p[3].c>> 0)&255)/255.0f; } p+=4; i4++; } cp = cp->Next; } sVERIFY(i4==fi.pn4); // pass 1: skip every second vertex for(sInt z=0;z<s+1;z++) { for(sInt y=0;y<s+1;y++) { for(sInt x=0;x<s+1;x++) { sVector31 v = sVector30(x,y,z) * S + tpos; func(v,pot[z*mm+y*m+x],fi); } } } // subdivision schemes if(subdiv==0) // none { // i don't understand, but manually inlining this makes things a bit faster... 
// MC.March(Para.BaseGrid,pot,S,tpos); switch(base) { case 0: MC.March_0_1(pot,S,tpos,thread); break; case 1: MC.March_1_1(pot,S,tpos,thread); break; case 2: MC.March_2_1(pot,S,tpos,thread); break; case 3: MC.March_3_1(pot,S,tpos,thread); break; case 4: MC.March_4_1(pot,S,tpos,thread); break; case 5: MC.March_5_1(pot,S,tpos,thread); break; default: sVERIFYFALSE; } } else // subdiv once { typename T::FieldType pot2[4][3][3]; sVector31 v; typename T::FieldType pot2y[s][4]; sInt lastyz[s]; for(sInt i=0;i<s;i++) lastyz[i] = -2; for(sInt z=0;z<s;z++) { sInt LastY = -2; for(sInt y=0;y<s;y++) { sInt LastX = -2; for(sInt x=0;x<s;x++) { sU32 flo,ma,mo; flo = *(sU32 *)&pot[(z+0)*mm+(y+0)*m+(x+0)].w; ma = flo; mo = flo; flo = *(sU32 *)&pot[(z+0)*mm+(y+0)*m+(x+1)].w; ma &= flo; mo |= flo; flo = *(sU32 *)&pot[(z+0)*mm+(y+1)*m+(x+0)].w; ma &= flo; mo |= flo; flo = *(sU32 *)&pot[(z+0)*mm+(y+1)*m+(x+1)].w; ma &= flo; mo |= flo; flo = *(sU32 *)&pot[(z+1)*mm+(y+0)*m+(x+0)].w; ma &= flo; mo |= flo; flo = *(sU32 *)&pot[(z+1)*mm+(y+0)*m+(x+1)].w; ma &= flo; mo |= flo; flo = *(sU32 *)&pot[(z+1)*mm+(y+1)*m+(x+0)].w; ma &= flo; mo |= flo; flo = *(sU32 *)&pot[(z+1)*mm+(y+1)*m+(x+1)].w; ma &= flo; mo |= flo; if((ma&0x80000000)==0 && (mo&0x80000000)!=0) { // get the dots we already have pot2[0][0][0] = pot[(z+0)*mm+(y+0)*m+(x+0)]; pot2[0][0][2] = pot[(z+0)*mm+(y+0)*m+(x+1)]; pot2[0][2][0] = pot[(z+0)*mm+(y+1)*m+(x+0)]; pot2[0][2][2] = pot[(z+0)*mm+(y+1)*m+(x+1)]; pot2[2][0][0] = pot[(z+1)*mm+(y+0)*m+(x+0)]; pot2[2][0][2] = pot[(z+1)*mm+(y+0)*m+(x+1)]; pot2[2][2][0] = pot[(z+1)*mm+(y+1)*m+(x+0)]; pot2[2][2][2] = pot[(z+1)*mm+(y+1)*m+(x+1)]; // reuse last x2 for current x0 if(LastX==x-1) { pot2[1][0][0] = pot2[1][0][2]; pot2[0][1][0] = pot2[0][1][2]; pot2[1][1][0] = pot2[1][1][2]; pot2[2][1][0] = pot2[2][1][2]; pot2[1][2][0] = pot2[1][2][2]; } else { v = sVector30(x+0.0f,y+0.0f,z+0.5f) * S + tpos; func(v,pot2[1][0][0],fi); v = sVector30(x+0.0f,y+0.5f,z+0.0f) * S + tpos; func(v,pot2[0][1][0],fi); v = sVector30(x+0.0f,y+0.5f,z+0.5f) * S + tpos; func(v,pot2[1][1][0],fi); v = sVector30(x+0.0f,y+0.5f,z+1.0f) * S + tpos; func(v,pot2[2][1][0],fi); v = sVector30(x+0.0f,y+1.0f,z+0.5f) * S + tpos; func(v,pot2[1][2][0],fi); } LastX = x; // resuse last y2 for current y0 if(LastY==y-1 && lastyz[x]==z) { pot2[0][0][1] = pot2y[x][0]; pot2[1][0][1] = pot2y[x][1]; pot2[2][0][1] = pot2y[x][2]; pot2[1][0][2] = pot2y[x][3]; } else { v = sVector30(x+0.5f,y+0.0f,z+0.0f) * S + tpos; func(v,pot2[0][0][1],fi); v = sVector30(x+0.5f,y+0.0f,z+0.5f) * S + tpos; func(v,pot2[1][0][1],fi); v = sVector30(x+0.5f,y+0.0f,z+1.0f) * S + tpos; func(v,pot2[2][0][1],fi); v = sVector30(x+1.0f,y+0.0f,z+0.5f) * S + tpos; func(v,pot2[1][0][2],fi); } v = sVector30(x+0.5f,y+1.0f,z+0.0f) * S + tpos; func(v,pot2[0][2][1],fi); pot2y[x][0] = pot2[0][2][1]; v = sVector30(x+0.5f,y+1.0f,z+0.5f) * S + tpos; func(v,pot2[1][2][1],fi); pot2y[x][1] = pot2[1][2][1]; v = sVector30(x+0.5f,y+1.0f,z+1.0f) * S + tpos; func(v,pot2[2][2][1],fi); pot2y[x][2] = pot2[2][2][1]; v = sVector30(x+1.0f,y+1.0f,z+0.5f) * S + tpos; func(v,pot2[1][2][2],fi); pot2y[x][3] = pot2[1][2][2]; LastY = y; lastyz[x] = z; // do the rest, don't bother caching v = sVector30(x+0.5f,y+0.5f,z+0.0f) * S + tpos; func(v,pot2[0][1][1],fi); v = sVector30(x+0.5f,y+0.5f,z+0.5f) * S + tpos; func(v,pot2[1][1][1],fi); v = sVector30(x+0.5f,y+0.5f,z+1.0f) * S + tpos; func(v,pot2[2][1][1],fi); v = sVector30(x+1.0f,y+0.5f,z+0.0f) * S + tpos; func(v,pot2[0][1][2],fi); v = sVector30(x+1.0f,y+0.5f,z+0.5f) * S + tpos; 
func(v,pot2[1][1][2],fi); v = sVector30(x+1.0f,y+0.5f,z+1.0f) * S + tpos; func(v,pot2[2][1][2],fi); // render it MC.March_1_1(&pot2[0][0][0],S/2,tpos+sVector30(x*S,y*S,z*S),thread); } } } } } } }
ibMtx4& ibMtx4::Invert() { f32* src = &data.a[0][0]; __m128 minor0, minor1, minor2, minor3; __m128 row0, row1, row2, row3; __m128 det, tmp1; #if !defined NDEBUG || defined STATIC // Suppress RTC error for uninit vars f32 init = 0.f; row3 = row1 = tmp1 = _mm_load_ps1( &init ); #endif // NDEBUG tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4)); row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12)); row0 = _mm_shuffle_ps(tmp1, row1, 0x88); row1 = _mm_shuffle_ps(row1, tmp1, 0xDD); tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)); row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14)); row2 = _mm_shuffle_ps(tmp1, row3, 0x88); row3 = _mm_shuffle_ps(row3, tmp1, 0xDD); // ----------------------------------------------- tmp1 = _mm_mul_ps(row2, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_mul_ps(row1, tmp1); minor1 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0); minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1); minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row1, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0); minor3 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1)); minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3); minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); row2 = _mm_shuffle_ps(row2, row2, 0x4E); minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0); minor2 = _mm_mul_ps(row0, tmp1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1)); minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2); minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row1); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row3); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1)); minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1); minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1)); // ----------------------------------------------- tmp1 = _mm_mul_ps(row0, row2); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1); minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1); minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1)); tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E); minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1)); minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3); // ----------------------------------------------- det = _mm_mul_ps(row0, minor0); det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det); det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det); tmp1 = _mm_rcp_ss(det); det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))); det = _mm_shuffle_ps(det, det, 0x00); minor0 = _mm_mul_ps(det, minor0); _mm_storel_pi((__m64*)(src), 
minor0); _mm_storeh_pi((__m64*)(src+2), minor0); minor1 = _mm_mul_ps(det, minor1); _mm_storel_pi((__m64*)(src+4), minor1); _mm_storeh_pi((__m64*)(src+6), minor1); minor2 = _mm_mul_ps(det, minor2); _mm_storel_pi((__m64*)(src+ 8), minor2); _mm_storeh_pi((__m64*)(src+10), minor2); minor3 = _mm_mul_ps(det, minor3); _mm_storel_pi((__m64*)(src+12), minor3); _mm_storeh_pi((__m64*)(src+14), minor3); return *this; }
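One detail of the inverse routine above worth calling out: after _mm_rcp_ss, the line computing det from tmp1 applies one Newton-Raphson step to sharpen the roughly 12-bit reciprocal approximation of the determinant. A scalar sketch of that refinement:

// One Newton-Raphson iteration refining an approximate reciprocal r ~= 1/d:
//   r' = r * (2 - d * r) == 2*r - d*r*r
static float refine_recip(float d, float r) {
    return r + r - d * r * r;
}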
void mBior53::synthcols(char** dest, char** sour, unsigned int w, unsigned int h) const //w,h of the LO part { float fz = 0.0f; float mul2 = 2.0f; int n; float s2k, s2k1; __m128 ms2k, ms2k1; unsigned int w2 = 2 * w; const vec1D& H2m = getH2m(); const vec1D& G2m = getG2m(); const vec1D& H2m1 = getH2m1(); const vec1D& G2m1 = getG2m1(); for (unsigned int x = 0; x < w2 / 4; x++) { //x<w2/2 x = 4*x for (unsigned int k = 0; k < h; k++) { ms2k = _mm_load_ss(&fz); ms2k1 = ms2k; for (int m = 0; m <= 0; m++) { //s2k even H n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); ms2k = _mm_add_ps(ms2k, _mm_mul_ps(_mm_load_ps1(H2m.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x])))); } for (int m = 0; m <= 1; m++) { //s2k even G n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); ms2k = _mm_add_ps(ms2k, _mm_mul_ps(_mm_load_ps1(G2m.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n+h][4*x])))); } for (int m = -1; m <= 0; m++) { //s2k1 odd H n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); ms2k1 = _mm_add_ps(ms2k1, _mm_mul_ps(_mm_load_ps1(H2m1.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n][4*x])))); } for (int m = -1; m <= 1; m++) { //s2k1 odd G n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); ms2k1 = _mm_add_ps(ms2k1, _mm_mul_ps(_mm_load_ps1(G2m1.data(m)), _mm_cvtpi8_ps(*(__m64 *)(&sour[n+h][4*x])))); } __m128 mmul2 = _mm_load_ps1(&mul2); mmxround4(&dest[2*k][4*x], _mm_mul_ps(ms2k, mmul2)); mmxround4(&dest[2*k+1][4*x], _mm_mul_ps(ms2k1, mmul2)); } } _mm_empty(); //odd remainder for (unsigned int x = w2 - (w2 % 4); x < w2; x++) { for (unsigned int k = 0; k < h; k++) { s2k = 0; s2k1 = 0; for (int m = H2m.first(); m <= H2m.last(); m++) { //s2k even H n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); s2k += H2m[m] * float(sour[n][x]); } for (int m = G2m.first(); m <= G2m.last(); m++) { //s2k even G n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); s2k += G2m[m] * float(sour[n+h][x]); } for (int m = H2m1.first(); m <= H2m1.last(); m++) { //s2k1 odd H n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); s2k1 += H2m1[m] * float(sour[n][x]); } for (int m = G2m1.first(); m <= G2m1.last(); m++) { //s2k1 odd G n = k - m; if (n < 0) n = 0 - n; if (n >= (int)h) n -= 2 * (1 + n - h); s2k1 += G2m1[m] * float(sour[n+h][x]); } dest[2*k][x] = mmxround(2.0f * s2k); dest[2*k+1][x] = mmxround(2.0f * s2k1); } } }