void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF, int imgFOfssetH, int imgFOfssetW, float* filter, float *imgFloatSrc, float *imgFloatDst) { //1. const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0); //2. const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0); //3. __declspec(align(16)) __m128 filter_l[FILTER_SIZE]; #pragma omp parallel for for (auto i = 0; i < FILTER_SIZE; i++) { //mind a 4 floatba ugyanazt tölti // float -> m128 konverzió filter_l[i] = _mm_load_ps1(filter + i); } const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2; const auto imgWidthbyte = imgWidth << 2; const auto imgWidthFbyte = imgWidthF << 2; const auto imgLengthbyte = imgHeight * imgWidthbyte; //4. register __declspec(align(16)) __m128 a_sse; //8. reg register __declspec(align(16)) __m128 r_sse; #pragma omp parallel for for (auto row = 0; row < imgLengthbyte; row += 4) { // RGBA komponensek akkumulátora r_sse = _mm_setzero_ps(); // konvolúció minden komponensre for (auto y = 0; y < FILTER_H; y++ ) { r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)), filter_l[5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)), filter_l[1 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)), filter_l[2 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y])); } a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte); //számítás eredményének limitálása 0-255 közé // kimenetí pixel írása _mm_store_ps(imgFloatDst + rw_base + row, _mm_min_ps(const_255, _mm_add_ps(a_sse, _mm_max_ps(const_0, _mm_sub_ps(a_sse, _mm_min_ps(const_255, _mm_max_ps(const_0, r_sse))))))); } 
}
// Sums up to 100000 floats from the 16-byte-aligned array `a`.
// Each value is processed as sqrt(x) then squared back (the original authors
// noted they chose this round-trip deliberately "for precision").
// A chunk containing a 0.0 acts as an end-of-data sentinel: the loop stops
// and that chunk is NOT added (matching the original break placement).
//
// BUGFIX: the original used _mm_pow2_ps() and _mm_compare_ps(), neither of
// which is a real SSE intrinsic (the file did not compile). The square is
// now _mm_mul_ps(x, x) and the sentinel test is
// _mm_movemask_ps(_mm_cmpeq_ps(...)), which sets a bit per zero lane.
float sumar(float *a)
{
    float sumaF[4] __attribute__((aligned(16)));
    __m128 sumas = _mm_set1_ps(0);  // four aligned partial sums, start at 0
    __m128 calculo;
    int i;
    __m128 aux;
    for (i = 0; i < 100000; i += 4) {
        aux = _mm_load_ps(&a[i]);               // requires 16-byte alignment
        calculo = _mm_sqrt_ps(aux);             // sqrt of 4 floats in parallel
        calculo = _mm_mul_ps(calculo, calculo); // square back: sqrt(x)^2 ~= x
        // stop when the array has no more valid values (zero sentinel)
        if (_mm_movemask_ps(_mm_cmpeq_ps(aux, _mm_setzero_ps()))) break;
        sumas = _mm_add_ps(sumas, calculo);
    }
    _mm_store_ps(sumaF, sumas);
    return sumaF[0] + sumaF[1] + sumaF[2] + sumaF[3];
}
// aOutput[i] += aInput[i] * aScale, SSE version.
// Both buffers must be 16-byte aligned and aSize a multiple of 16 samples
// (enforced by the ASSERT_* macros); 16 samples = four vectors per iteration.
void AudioBufferAddWithScale_SSE(const float* aInput, float aScale, float* aOutput, uint32_t aSize)
{
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  const __m128 gain = _mm_load1_ps(&aScale);

  for (unsigned i = 0; i < aSize; i += 16) {
    // scale four input vectors
    __m128 scaled0 = _mm_mul_ps(_mm_load_ps(&aInput[i]), gain);
    __m128 scaled1 = _mm_mul_ps(_mm_load_ps(&aInput[i + 4]), gain);
    __m128 scaled2 = _mm_mul_ps(_mm_load_ps(&aInput[i + 8]), gain);
    __m128 scaled3 = _mm_mul_ps(_mm_load_ps(&aInput[i + 12]), gain);

    // accumulate into the output
    _mm_store_ps(&aOutput[i], _mm_add_ps(_mm_load_ps(&aOutput[i]), scaled0));
    _mm_store_ps(&aOutput[i + 4], _mm_add_ps(_mm_load_ps(&aOutput[i + 4]), scaled1));
    _mm_store_ps(&aOutput[i + 8], _mm_add_ps(_mm_load_ps(&aOutput[i + 8]), scaled2));
    _mm_store_ps(&aOutput[i + 12], _mm_add_ps(_mm_load_ps(&aOutput[i + 12]), scaled3));
  }
}
// Float32x4::__construct(double, double, double, double)
// Parses four doubles, narrows them to float and stores them in a freshly
// allocated 16-byte-aligned __m128 owned by the object.
PHP_METHOD(Float32x4, __construct)
{
	double lanes[4] = php_float32x4_empty;
	float flanes[4] = php_float32x4_empty;
	php_float32x4_t *p = php_float32x4_fetch();

	if (zend_parse_parameters_throw(ZEND_NUM_ARGS(), "dddd",
			&lanes[0], &lanes[1], &lanes[2], &lanes[3]) != SUCCESS) {
		return;
	}

	flanes[0] = (float) lanes[0];
	flanes[1] = (float) lanes[1];
	flanes[2] = (float) lanes[2];
	flanes[3] = (float) lanes[3];

	// posix_memalign returns 0 on success (SUCCESS is presumably 0 -- confirm)
	if (posix_memalign( (void**) &p->v, 16, sizeof(__m128)) != SUCCESS) {
		zend_throw_exception_ex(php_float32x4_exception_ce, 0, "memory alignment error");
		// BUGFIX: must bail out here -- the original fell through and
		// dereferenced the unallocated p->v right after throwing.
		return;
	}

	*p->v = _mm_load_ps (flanes);
}
// Normalizes row `vm` of matrix `v` to unit L2 norm in place.
// SSE2 path handles the length rounded down to a multiple of 4, with a
// scalar tail for the remainder. A zero vector is left untouched.
void nv_vector_normalize_L2(nv_matrix_t *v, int vm)
{
#if NV_ENABLE_SSE2
	{
		const int i_lp = (v->n & 0xfffffffc);
		__m128 x, u;
		NV_ALIGNED(float, mm[4], 16);
		int i;
		float dp;

		// dp = sum of squares (vector part + scalar tail)
		u = _mm_setzero_ps();
		for (i = 0; i < i_lp; i += 4) {
			x = _mm_load_ps(&NV_MAT_V(v, vm, i));
			u = _mm_add_ps(u, _mm_mul_ps(x, x));
		}
		_mm_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3];
		for (i = i_lp; i < v->n; ++i) {
			dp += NV_MAT_V(v, vm, i) * NV_MAT_V(v, vm, i);
		}
		if (dp > 0.0f) {
			const float scale = 1.0f / sqrtf(dp);
			x = _mm_set1_ps(scale);
			for (i = 0; i < i_lp; i += 4) {
				_mm_store_ps(&NV_MAT_V(v, vm, i),
					     _mm_mul_ps(*(const __m128*)&NV_MAT_V(v, vm, i), x));
			}
			for (i = i_lp; i < v->n; ++i) {
				// BUGFIX: the tail used to multiply by dp (the squared
				// norm) instead of 1/sqrt(dp), so the last (n % 4)
				// elements were scaled wrongly.
				NV_MAT_V(v, vm, i) *= scale;
			}
		}
	}
#else
	float norm = nv_vector_norm(v, vm);
	if (norm > 0.0f) {
		float scale = 1.0f / norm;
		nv_vector_muls(v, vm, v, vm, scale);
	}
#endif
}
// Evaluates a cosine (Fourier) series sum_i coeffs[i]*cos(i*phi) using the
// Chebyshev/Clenshaw-style recurrence cos((i+1)phi) = 2cos(phi)cos(i phi) - cos((i-1)phi).
// The vector branch processes four terms per iteration; it assumes the
// coefficient array is padded to a multiple of 4 past nCoeffs -- confirm
// with the callers.
Float evalFourier(const float *coeffs, size_t nCoeffs, Float phi) {
#if FOURIER_SCALAR == 1
	double cosPhi      = std::cos((double) phi),
	       cosPhi_prev = cosPhi,
	       cosPhi_cur  = 1.0,
	       value       = 0.0;

	for (size_t i=0; i<nCoeffs; ++i) {
		value += coeffs[i] * cosPhi_cur;

		double cosPhi_next = 2.0*cosPhi*cosPhi_cur - cosPhi_prev;
		cosPhi_prev = cosPhi_cur;
		cosPhi_cur = cosPhi_next;
	}

	return (Float) value;
#else
	double cosPhi = std::cos((double) phi);

	__m256d cosPhi_prev = _mm256_set1_pd(cosPhi),
	        cosPhi_cur  = _mm256_set1_pd(1.0),
	        value       = _mm256_set_sd((double) coeffs[0]),
	        factorPhi_prev, factorPhi_cur;

	initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

	for (size_t i=1; i<nCoeffs; i+=4) {
		// BUGFIX: the loop starts at i == 1, so coeffs+i is generally NOT
		// 16-byte aligned; the previous _mm_load_ps faulted on aligned
		// builds. Use the unaligned load.
		__m256d coeff = _mm256_cvtps_pd(_mm_loadu_ps(coeffs+i));

		__m256d cosPhi_next = _mm256_add_pd(_mm256_mul_pd(factorPhi_prev, cosPhi_prev),
		                                    _mm256_mul_pd(factorPhi_cur,  cosPhi_cur));

		value = _mm256_add_pd(value, _mm256_mul_pd(cosPhi_next, coeff));

		cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
		cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
	}

	return (Float) simd::hadd(value);
#endif
}
int nv_vector_eq(const nv_matrix_t *vec1, int j1, const nv_matrix_t *vec2, int j2) { NV_ASSERT(vec1->n == vec2->n); #if NV_ENABLE_SSE2 { __m128 xmm; int i = 0; int eq; int pk_lp = (vec1->n & 0xfffffffc); for (i = 0; i < pk_lp; i += 4) { xmm = _mm_load_ps(&NV_MAT_V(vec2, j2, i)); xmm = _mm_cmpneq_ps(xmm, *(const __m128 *)&NV_MAT_V(vec1, j1, i)); eq = _mm_movemask_ps(xmm); if (eq != 0) { return 0; } } for (i = pk_lp; i < vec1->n; ++i) { if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) { return 0; } } return 1; } #else { int i; for (i = 0; i < vec1->n; ++i) { if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) { return 0; } } return 1; } #endif }
// Sums _n floats from the 16-byte-aligned array a.
// The bulk is done four lanes at a time; the remainder is added scalar.
//
// BUGFIX: n was computed as `_n - _n % 3`, i.e. rounded down to a multiple
// of 3, not 4 -- which trips the function's own `(n & 3) == 0` assert and
// would misalign the vector loop. It is now `_n & ~3`. The horizontal add
// also used SSE3 _mm_hadd_ps; it is rewritten with SSE2 shuffles so the
// function works on any SSE2 target.
float vsum(const float *a, int _n)
{
    float sum;
    const int n = _n & ~3;            /* round down to a multiple of 4 */
    __m128 acc = _mm_setzero_ps();
    assert((n & 3) == 0);
    assert(((uintptr_t)a & 15) == 0); /* aligned loads below */

    for (int i = 0; i < n; i += 4) {
        acc = _mm_add_ps(acc, _mm_load_ps(&a[i]));
    }

    /* horizontal add of the four lanes (SSE2 only) */
    acc = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));       /* [a+c, b+d, ..] */
    acc = _mm_add_ss(acc, _mm_shuffle_ps(acc, acc, 0x55)); /* lane0 = a+b+c+d */
    _mm_store_ss(&sum, acc);

    /* scalar tail */
    for (int i = n; i < _n; i++) {
        sum += a[i];
    }
    return sum;
}
// For each sample: if it is unflagged in the mask AND finite, copy it to the
// output with weight 1; otherwise write 0 with weight 0.
// NOTE(review): the aligned loads/stores and the rowPtr[0..3] accesses assume
// the image width is a multiple of 4 and rows are 16-byte aligned -- confirm
// with the Image2D/Mask2D allocation.
void HighPassFilter::setFlaggedValuesToZeroAndMakeWeightsSSE(const Image2DCPtr &inputImage, const Image2DPtr &outputImage, const Mask2DCPtr &inputMask, const Image2DPtr &weightsOutput)
{
	const size_t width = inputImage->Width();
	const __m128i zero4i = _mm_set_epi32(0, 0, 0, 0);
	const __m128 zero4 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
	const __m128 one4 = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
	for(size_t y=0;y<inputImage->Height();++y)
	{
		const bool *rowPtr = inputMask->ValuePtr(0, y);
		const float *inputPtr = inputImage->ValuePtr(0, y);
		float *outputPtr = outputImage->ValuePtr(0, y);
		float *weightsPtr = weightsOutput->ValuePtr(0, y);
		const float *end = inputPtr + width;
		while(inputPtr < end)
		{
			// Assign each integer to one bool in the mask
			// Convert false to 0xFFFFFFFF and true to 0
			// (i.e. all-ones lanes mark *usable* samples)
			__m128 conditionMask = _mm_castsi128_ps(
				_mm_cmpeq_epi32(_mm_set_epi32(rowPtr[3] || !isfinite(inputPtr[3]), rowPtr[2] || !isfinite(inputPtr[2]), rowPtr[1] || !isfinite(inputPtr[1]), rowPtr[0] || !isfinite(inputPtr[0])), zero4i));
			// weight = 1 where usable, 0 elsewhere
			_mm_store_ps(weightsPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, one4),
				_mm_andnot_ps(conditionMask, zero4)
			));
			// output = input where usable, 0 elsewhere
			_mm_store_ps(outputPtr, _mm_or_ps(
				_mm_and_ps(conditionMask, _mm_load_ps(inputPtr)),
				_mm_andnot_ps(conditionMask, zero4)
			));
			rowPtr += 4;
			outputPtr += 4;
			inputPtr += 4;
			weightsPtr += 4;
		}
	}
}
/* Converts `samples` gamma-2.2 floats in src to linear in dst, returning the
 * number of samples processed. Uses the SIMD gamma conversion four floats at
 * a time (aligned path when both pointers are 16-byte aligned, unaligned
 * otherwise), then finishes the remainder with the scalar babl conversion.
 *
 * FIX: the vector loops used `samples > 4`, which needlessly dropped the
 * final full vector to the scalar tail; `samples >= 4` processes it in SIMD.
 * The output is identical either way. */
static inline long
conv_yF_gamma_yF_linear (const float *src, float *dst, long samples)
{
  long total = samples;
  const __v4sf *s = (const __v4sf*)src;
        __v4sf *d = (__v4sf*)dst;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      /* both buffers 16-byte aligned */
      while (samples >= 4)
        {
          __v4sf rgba0 = _mm_load_ps ((float *)s++);
          rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
          _mm_store_ps ((float *)d++, rgba0);
          samples -= 4;
        }
    }
  else
    {
      while (samples >= 4)
        {
          __v4sf rgba0 = _mm_loadu_ps ((float *)s++);
          rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
          _mm_storeu_ps ((float *)d++, rgba0);
          samples -= 4;
        }
    }
  src = (const float *)s;
  dst = (float *)d;

  /* scalar tail (samples < 4 left) */
  while (samples--)
    {
      *dst++ = babl_gamma_2_2_to_linear (*src++);
    }
  return total;
}
void pow_fmath(const Mat& src, const float a, Mat & dest) { if (dest.empty())dest.create(src.size(), CV_32F); int width = src.cols; int height = src.rows; int size = src.size().area(); int i = 0; const float* s = src.ptr<float>(0); float* d = dest.ptr<float>(0); const __m128 ma = _mm_set1_ps(a); for (i = 0; i <= size - 4; i += 4) { _mm_store_ps(d + i, _mm_pow_ps(_mm_load_ps(s + i), ma)); } for (; i < size; i++) { d[i] = cv::pow(s[i], a); } }
// Maps one 4-float pixel to histogram bin indices and increments the bins:
// index = clamp(pixel * mul, 0, bins_count - 1), converted to int32.
// The histogram is interleaved with stride 4 (one slot per channel); only
// lanes 0..2 are accumulated -- lane 3 is presumably alpha and deliberately
// skipped, TODO confirm with the caller.
inline static void histogram_helper_cs_rgb_helper_process_pixel_m128(
    const dt_dev_histogram_collection_params_t *const histogram_params,
    const float *pixel, uint32_t *histogram)
{
  const __m128 scale = _mm_set1_ps(histogram_params->mul);
  const __m128 val_min = _mm_setzero_ps();
  const __m128 val_max = _mm_set1_ps(histogram_params->bins_count - 1);
  assert(dt_is_aligned(pixel, 16));
  const __m128 input = _mm_load_ps(pixel);
  const __m128 scaled = _mm_mul_ps(input, scale);
  const __m128 clamped = _mm_max_ps(_mm_min_ps(scaled, val_max), val_min);
  // float -> int32 conversion (rounds per the current FP rounding mode)
  const __m128i indexes = _mm_cvtps_epi32(clamped);
  __m128i values __attribute__((aligned(16)));
  _mm_store_si128(&values, indexes);
  const uint32_t *valuesi = (uint32_t *)(&values);
  histogram[4 * valuesi[0]]++;
  histogram[4 * valuesi[1] + 1]++;
  histogram[4 * valuesi[2] + 2]++;
}
// In-place multiply of an audio buffer by mulVal.
// If the buffer is 16-byte aligned, the bulk (rounded down to a multiple of
// 4 samples) is done with SSE; whatever remains -- the tail, or the whole
// buffer when unaligned -- is done scalar.
// FIX: removed the unused locals `sum` and `totalFloatsStore`.
void MultiplyAudioBuffer(float *buffer, int totalFloats, float mulVal)
{
    if((UPARAM(buffer) & 0xF) == 0)
    {
        UINT alignedFloats = totalFloats & 0xFFFFFFFC;
        __m128 sseMulVal = _mm_set_ps1(mulVal);

        for(UINT i=0; i<alignedFloats; i += 4)
        {
            __m128 sseScaledVals = _mm_mul_ps(_mm_load_ps(buffer+i), sseMulVal);
            _mm_store_ps(buffer+i, sseScaledVals);
        }

        buffer += alignedFloats;
        totalFloats -= alignedFloats;
    }

    // scalar path / tail
    for(int i=0; i<totalFloats; i++)
        buffer[i] *= mulVal;
}
// Exercises a 16 MiB heap_new() allocation with stores/loads of increasing
// width (16/32/64/128 bit) and reports success if no alignment fault fired.
int main()
{
	const uint32_t len = 16 * 1024 * 1024;
	uint32_t i;
	uint8_t* p = (uint8_t*)heap_new(len);
	if (!p) {
		printf("out of memory!\n");
		return 1;
	}
	printf("Test 16bit stores\n");
	for (i = 0;i < len;i += 2)
		*(uint16_t*)&p[i] = i & 0xffff;
	printf("Test 32bit stores\n");
	for (i = 0;i < len;i += 4)
		*(uint32_t*)&p[i] = i;
	printf("Test 64bit stores\n");
	for (i = 0;i < len;i += 8)
		*(uint64_t*)&p[i] = i;
	printf("Test 128bit loads/stores\n");
	// _mm_load_ps/_mm_store_ps require 16-byte alignment -- this step only
	// passes if heap_new() returns 16-byte-aligned memory (TODO confirm).
	__m128 v ;
	for (i = 0;i < len;i += 16) {
		v = _mm_load_ps((float*)(p + i));
		_mm_store_ps((float*)(p + i),v);
	}
	heap_delete(p);
	printf("Got no unaligned addr exception!\n");
	return 0;
}
// calculate 2^n for each input sample. // void MLProcExp2::process(const int frames) { static const ml::Symbol preciseSym("precise"); const MLSignal& x1 = getInput(1); MLSignal& y1 = getOutput(); if (mParamsChanged) { mPrecise = getParam(preciseSym); mParamsChanged = false; } if(mPrecise) // scalar code { for (int n=0; n<frames; ++n) { y1[n] = pow(2.f, x1[n]); } } else { const MLSample* px1 = x1.getConstBuffer(); MLSample* py1 = y1.getBuffer(); int c = frames >> kMLSamplesPerSSEVectorBits; __m128 vx1, vr; for (int n = 0; n < c; ++n) { vx1 = _mm_load_ps(px1); vr = exp2Approx4(vx1); _mm_store_ps(py1, vr); px1 += kSSEVecSize; py1 += kSSEVecSize; } } }
// Exposure module: out = (in - black) * scale on all four channels,
// where scale = 1 / (white - black) and white derives from the exposure
// parameter. Also rescales the pipe's processed_maximum accordingly.
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_exposure_data_t *d = (dt_iop_exposure_data_t *)piece->data;
  dt_iop_exposure_gui_data_t *g = (dt_iop_exposure_gui_data_t *)self->gui_data;

  // deflicker mode computes its parameters per-image, just before processing
  if(d->mode == EXPOSURE_MODE_DEFLICKER)
  {
    commit_params_late(self, piece);
  }

  const float black = d->black;
  const float white = exposure2white(d->exposure);
  const int ch = piece->colors;
  const float scale = 1.0/(white - black);
  const __m128 blackv = _mm_set1_ps(black);
  const __m128 scalev = _mm_set1_ps(scale);
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out,i,o) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)i) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)o) + (size_t)ch*k*roi_out->width;
    // NOTE: operator- / operator* on __m128 is a GCC/Clang vector extension
    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
      _mm_store_ps(out, (_mm_load_ps(in)-blackv)*scalev);
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(i, o, roi_out->width, roi_out->height);

  // keep the pipeline's notion of the channel maxima in sync
  for(int k=0; k<3; k++)
    piece->pipe->processed_maximum[k] *= scale;

  if(g != NULL && self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW)
  {
    g->deflicker_computed_exposure = d->exposure;
  }
}
// Demo driver: runs 4x4 matrix multiply and transpose with scalar (FPU),
// SSE (__m128) and AVX (__m256) implementations and prints each result.
int main(int argc, const char * argv[])
{
	// the two input matrices and the output, 32-byte aligned so both the
	// SSE (16-byte) and AVX (32-byte) aligned loads below are legal
	ALIGN32 float a1[ 16 ] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
	ALIGN32 float a2[ 16 ] = { 15, 12, 4, 7, 9, 0, 3, 13, 6, 10, 1, 8, 5, 11, 2, 14 };
	ALIGN32 float aout[ 16 ];
	// same matrices as four SSE rows ...
	__m128 x1[ 4 ] = { _mm_load_ps( a1 ), _mm_load_ps( a1 + 4 ), _mm_load_ps( a1 + 8 ), _mm_load_ps( a1 + 12 ) };
	__m128 x2[ 4 ] = { _mm_load_ps( a2 ), _mm_load_ps( a2 + 4 ), _mm_load_ps( a2 + 8 ), _mm_load_ps( a2 + 12 ) };
	__m128 xout[ 4 ];
	// ... and as two AVX half-matrices
	__m256 y1[ 2 ] = { _mm256_load_ps( a1 ), _mm256_load_ps( a1 + 8 ) };
	__m256 y2[ 2 ] = { _mm256_load_ps( a2 ), _mm256_load_ps( a2 + 8 ) };
	__m256 yout[ 2 ];
	std::cout << "FPU Mult" << std::endl;
	mul( a1, a2, aout );
	trace( aout, 4 );
	std::cout << "SSE Mult" << std::endl;
	mulX4( x1, x2, xout );
	trace( xout, 4 );
	// NOTE(review): label says "AVX2" while the transpose below says "AVX";
	// the __m256 loads here only need AVX -- confirm which mulX8 requires.
	std::cout << "AVX2 Mult" << std::endl;
	mulX8( y1, y2, yout );
	trace( yout, 4 );
	std::cout << "FPU Transpose" << std::endl;
	transpose( a1, aout );
	trace( aout, 4 );
	std::cout << "SSE Transpose" << std::endl;
	transposeX4( x1, xout );
	trace( xout, 4 );
	std::cout << "AVX Transpose" << std::endl;
	transposeX8( y1, yout );
	trace( yout, 4 );
	return 0;
}
// vec0[m0] = vec1[m1] + v (element-wise scalar add); rows must have equal
// length. SSE2 path handles the multiple-of-4 prefix, scalar code the tail.
void nv_vector_adds(nv_matrix_t *vec0, int m0, const nv_matrix_t *vec1, int m1, float v)
{
	NV_ASSERT(vec1->n == vec0->n);

#if NV_ENABLE_SSE2
	{
		const int lim = (vec1->n & 0xfffffffc);
		const __m128 vv = _mm_set1_ps(v);
		int n;
#ifdef _OPENMP
//#pragma omp parallel for
#endif
		for (n = 0; n < lim; n += 4) {
			_mm_store_ps(&NV_MAT_V(vec0, m0, n),
				     _mm_add_ps(_mm_load_ps(&NV_MAT_V(vec1, m1, n)), vv));
		}
		// scalar tail
		for (n = lim; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + v;
		}
	}
#else
	{
		int n;
		for (n = 0; n < vec1->n; ++n) {
			NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + v;
		}
	}
#endif
}
void operator()(CONTAINER *target, const CONTAINER& oldSelf, const CONTAINER& neighbor) { __m128 forceOffset = _mm_set1_ps(FORCE_OFFSET); #ifndef NO_OMP #pragma omp parallel for schedule(static) #endif for (int j = 0; j < CONTAINER_SIZE; ++j) { __m128 neighborPosX = _mm_set1_ps(neighbor.posX[j]); __m128 neighborPosY = _mm_set1_ps(neighbor.posY[j]); __m128 neighborPosZ = _mm_set1_ps(neighbor.posZ[j]); for (int i = 0; i < CONTAINER_SIZE; i+=4) { __m128 oldSelfPosX = _mm_load_ps(oldSelf.posX + i); __m128 oldSelfPosY = _mm_load_ps(oldSelf.posY + i); __m128 oldSelfPosZ = _mm_load_ps(oldSelf.posZ + i); __m128 myVelX = _mm_load_ps(oldSelf.velX + i); __m128 myVelY = _mm_load_ps(oldSelf.velY + i); __m128 myVelZ = _mm_load_ps(oldSelf.velZ + i); __m128 deltaX = _mm_sub_ps(oldSelfPosX, neighborPosX); __m128 deltaY = _mm_sub_ps(oldSelfPosY, neighborPosY); __m128 deltaZ = _mm_sub_ps(oldSelfPosZ, neighborPosZ); __m128 dist2 = _mm_add_ps(forceOffset, _mm_mul_ps(deltaX, deltaX)); dist2 = _mm_add_ps(dist2, _mm_mul_ps(deltaY, deltaY)); dist2 = _mm_add_ps(dist2, _mm_mul_ps(deltaZ, deltaZ)); __m128 force = _mm_rsqrt_ps(dist2); myVelX = _mm_add_ps(myVelX, _mm_mul_ps(force, deltaX)); myVelY = _mm_add_ps(myVelY, _mm_mul_ps(force, deltaY)); myVelZ = _mm_add_ps(myVelZ, _mm_mul_ps(force, deltaZ)); _mm_store_ps(target->velX + i, myVelX); _mm_store_ps(target->velY + i, myVelY); _mm_store_ps(target->velZ + i, myVelZ); } } }
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
// http://tbp.berkeley.edu/~alexkan/seti/
//
// Multiplies the complex input signal by a quadratic-phase chirp
// exp(i * srate * t^2): the vector loop computes sin/cos four samples at a
// time via range reduction + polynomial approximation + two angle doublings,
// and a scalar loop with libm sin/cos handles the last (n % 4) samples.
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int ul_NumDataPoints,
  double sample_rate
) {
  int i;

  // chirp rate index 0 means no chirp: straight copy
  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) );
    return 0;
  }

  int vEnd;
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  // adding/subtracting +-2^52 rounds a double to the nearest integer,
  // used below for range reduction
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);  // sample indices i, i+1
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);  // sample indices i+2, i+3
    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4), SS3), y), SS2), y), SS1), x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3), CC2), y), CC1), y), ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data: complex multiply (re,im) * (c,s) using duplicated
    // cos/sin lanes and SSE3 addsub
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values (non-temporal: bypass the cache)
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  // handle tail elements with scalar code
  for ( ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i * 0.5;
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];
    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;
  return 0;
}
// dest = alpha*src1 + (1-alpha)*src2, per pixel, with SSE for the bulk.
// Grayscale sources are promoted to BGR when the channel counts differ, and
// alpha (CV_8U/CV_32F/CV_64F) is normalized to float in [0,1].
// NOTE(review): the 3-channel SIMD path has no scalar tail loop, so the last
// (area % 4) pixels are left unblended there; the aligned loads also assume
// the Mat buffers are 16-byte aligned and continuous -- confirm.
void alphaBlend(const Mat& src1, const Mat& src2, const Mat& alpha,Mat& dest)
{
	int T;
	Mat s1,s2;
	// destination takes the richer type of the two sources
	if(src1.channels()<=src2.channels())T=src2.type();
	else T=src1.type();

	if(dest.empty())dest=Mat::zeros(src1.size(),T);
	if(src1.channels()==src2.channels())
	{
		s1=src1;
		s2=src2;
	}
	else if(src2.channels()==3)
	{
		cvtColor(src1,s1,CV_GRAY2BGR);
		s2=src2;
	}
	else
	{
		cvtColor(src2,s2,CV_GRAY2BGR);
		s1=src1;
	}
	Mat a;
	// normalize alpha to CV_32F in [0,1]
	if(alpha.type()==CV_8U) alpha.convertTo(a,CV_32F,1.0/255.0);
	else if(alpha.type()==CV_32F || alpha.type()==CV_64F) alpha.convertTo(a,CV_32F);
	if(dest.channels()==3)
	{
		// blend each of the three planes separately in float, then merge
		vector<Mat> ss1,ss2;
		vector<Mat> ss1f(3),ss2f(3);
		split(s1,ss1);
		split(s2,ss2);
		for(int c=0;c<3;c++)
		{
			ss1[c].convertTo(ss1f[c],CV_32F);
			ss2[c].convertTo(ss2f[c],CV_32F);
		}
		{
			float* s1r = ss1f[0].ptr<float>(0);
			float* s2r = ss2f[0].ptr<float>(0);
			float* s1g = ss1f[1].ptr<float>(0);
			float* s2g = ss2f[1].ptr<float>(0);
			float* s1b = ss1f[2].ptr<float>(0);
			float* s2b = ss2f[2].ptr<float>(0);
			float* al = a.ptr<float>(0);
			const int size = src1.size().area()/4;
			const __m128 ones = _mm_set1_ps(1.0f);
			for(int i=size;i--;)
			{
				const __m128 msa = _mm_load_ps(al);
				const __m128 imsa = _mm_sub_ps(ones,msa);  // 1 - alpha
				__m128 ms1 = _mm_load_ps(s1r);
				__m128 ms2 = _mm_load_ps(s2r);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1r,ms1);//store ss1f
				ms1 = _mm_load_ps(s1g);
				ms2 = _mm_load_ps(s2g);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1g,ms1);//store ss1f
				ms1 = _mm_load_ps(s1b);
				ms2 = _mm_load_ps(s2b);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1b,ms1);//store ss1f
				al+=4,s1r+=4,s2r+=4,s1g+=4,s2g+=4,s1b+=4,s2b+=4;
			}
			for(int c=0;c<3;c++)
			{
				ss1f[c].convertTo(ss1[c],CV_8U);
			}
			merge(ss1,dest);
		}
	}
	else if(dest.channels()==1)
	{
		Mat ss1f,ss2f;
		s1.convertTo(ss1f,CV_32F);
		s2.convertTo(ss2f,CV_32F);
		{
			float* s1r = ss1f.ptr<float>(0);
			float* s2r = ss2f.ptr<float>(0);
			float* al = a.ptr<float>(0);
			const int size = src1.size().area()/4;
			const int nn = src1.size().area() - size*4;
			const __m128 ones = _mm_set1_ps(1.0f);
			for(int i=size;i--;)
			{
				const __m128 msa = _mm_load_ps(al);
				const __m128 imsa = _mm_sub_ps(ones,msa);
				__m128 ms1 = _mm_load_ps(s1r);
				__m128 ms2 = _mm_load_ps(s2r);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1r,ms1);//store ss1f
				al+=4,s1r+=4,s2r+=4;
			}
			// scalar tail for the remaining (area % 4) pixels
			for(int i=nn;i--;)
			{
				*s1r = *al * *s1r + (1.0f-*al)* *s2r;
				al++,s1r++,s2r++;
			}
			ss1f.convertTo(dest,CV_8U);
		}
	}
}
// Zone-system module: remaps lightness (channel 0) into user-defined zones.
// In the GUI preview pipe it additionally gauss-blurs the L channel and fills
// g->preview_buffer with the per-pixel zone index for the overlay.
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  float *in;
  float *out;
  dt_iop_zonesystem_gui_data_t *g = NULL;
  dt_iop_zonesystem_data_t *data = (dt_iop_zonesystem_data_t*)piece->data;

  guchar *buffer = NULL;
  // preview pipe: (re)allocate the zone-index buffer; g->lock stays held
  // until the preview has been filled below
  if( self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW )
  {
    g = (dt_iop_zonesystem_gui_data_t *)self->gui_data;
    dt_pthread_mutex_lock(&g->lock);
    if(g->preview_buffer)
      g_free (g->preview_buffer);

    buffer = g->preview_buffer = g_malloc (roi_in->width*roi_in->height);
    g->preview_width=roi_out->width;
    g->preview_height=roi_out->height;
  }

  /* calculate zonemap */
  const int size = data->size;
  float zonemap[MAX_ZONE_SYSTEM_SIZE]= {-1};
  _iop_zonesystem_calculate_zonemap (data, zonemap);
  const int ch = piece->colors;

  /* if gui and have buffer lets gaussblur and fill buffer with zone indexes */
  if( self->dev->gui_attached && g && buffer)
  {
    /* setup gaussian kernel (normalized so the weights sum to 1) */
    const int radius = 8;
    const int rad = MIN(radius, ceilf(radius * roi_in->scale / piece->iscale));
    const int wd = 2*rad+1;
    float mat[wd*wd];
    float *m;
    const float sigma2 = (2.5*2.5)*(radius*roi_in->scale/piece->iscale)*(radius*roi_in->scale/piece->iscale);
    float weight = 0.0f;

    memset(mat, 0, wd*wd*sizeof(float));

    m = mat;
    for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++)
        weight += *m = expf(- (l*l + k*k)/(2.f*sigma2));
    m = mat;
    for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++)
        *m /= weight;

    /* gauss blur the L channel */
#ifdef _OPENMP
    #pragma omp parallel for default(none) private(in, out, m) shared(mat, ivoid, ovoid, roi_out, roi_in) schedule(static)
#endif
    for(int j=rad; j<roi_out->height-rad; j++)
    {
      in = ((float *)ivoid) + ch*(j*roi_in->width + rad);
      out = ((float *)ovoid) + ch*(j*roi_out->width + rad);
      for(int i=rad; i<roi_out->width-rad; i++)
      {
        for(int c=0; c<3; c++) out[c] = 0.0f;

        float sum = 0.0;
        m = mat;
        for(int l=-rad; l<=rad; l++)
        {
          float *inrow = in + ch*(l*roi_in->width-rad);
          for(int k=-rad; k<=rad; k++,inrow+=ch,m++)
            sum += *m * inrow[0];
        }
        out[0] = sum;
        out += ch;
        in += ch;
      }
    }

    /* create zonemap preview from the blurred L channel */
//     in = (float *)ivoid;
    out = (float *)ovoid;
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out,out,buffer,g,zonemap) schedule(static)
#endif
    for (int k=0; k<roi_out->width*roi_out->height; k++)
    {
      buffer[k] = _iop_zonesystem_zone_index_from_lightness (out[ch*k]/100.0f, zonemap, size);
    }

    dt_pthread_mutex_unlock(&g->lock);
  }

  /* process the image */
  in = (float *)ivoid;
  out = (float *)ovoid;

  const float rzscale = (size-1)/100.0f;

  float zonemap_offset[MAX_ZONE_SYSTEM_SIZE]= {-1};
  float zonemap_scale[MAX_ZONE_SYSTEM_SIZE]= {-1};

  // precompute scale and offset
  for (int k=0; k < size-1; k++) zonemap_scale[k] = (zonemap[k+1]-zonemap[k])*(size-1);
  for (int k=0; k < size-1; k++) zonemap_offset[k] = 100.0f * ((k+1)*zonemap[k] - k*zonemap[k+1]) ;

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(roi_out, in, out, zonemap_scale,zonemap_offset) schedule(static)
#endif
  for (int j=0; j<roi_out->height; j++)
    for (int i=0; i<roi_out->width; i++)
    {
      /* remap lightness into zonemap and apply lightness */
      const float *inp = in + ch*(j*roi_out->width+i);
      float *outp = out + ch*(j*roi_out->width+i);

      const int rz = CLAMPS(inp[0]*rzscale, 0, size-2);  // zone index

      const float zs = ((rz > 0) ? (zonemap_offset[rz]/inp[0]) : 0) + zonemap_scale[rz];

      // scale all four channels by zs (non-temporal store)
      _mm_stream_ps(outp,_mm_mul_ps(_mm_load_ps(inp),_mm_set1_ps(zs)));
    }

  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
// One radix-2 FFT pass (positive-sign twiddles) for the ido > 1 case,
// SSE version: processes two complex values per vector, four per iteration.
// The butterfly sum is stored directly; the difference is rotated by the
// twiddle factors wa via a shuffled complex multiply.
static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1,
                              const complex_t *cc, complex_t *ch, const complex_t *wa)
{
    uint16_t i, k, ah, ac;

    for (k = 0; k < l1; k++)
    {
        ah = k*ido;
        ac = 2*k*ido;

        for (i = 0; i < ido; i+=4)
        {
            __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
            __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
            __m128 w1, w2, w3, w4;

            // load two pairs of complex inputs plus the twiddle factors
            m1 = _mm_load_ps(&RE(cc[ac+i]));
            m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
            m5 = _mm_load_ps(&RE(cc[ac+i+2]));
            m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
            w1 = _mm_load_ps(&RE(wa[i]));
            w3 = _mm_load_ps(&RE(wa[i+2]));

            // butterfly: sum -> ch directly, difference -> twiddle rotation
            m3 = _mm_add_ps(m1, m2);
            m15 = _mm_add_ps(m5, m6);
            m4 = _mm_sub_ps(m1, m2);
            m16 = _mm_sub_ps(m5, m6);

            _mm_store_ps(&RE(ch[ah+i]), m3);
            _mm_store_ps(&RE(ch[ah+i+2]), m15);

            // re/im-swapped twiddles for the cross terms of the complex mul
            w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
            w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));

            m7 = _mm_mul_ps(m4, w1);
            m17 = _mm_mul_ps(m16, w3);
            m8 = _mm_mul_ps(m4, w2);
            m18 = _mm_mul_ps(m16, w4);

            // gather the real products / imaginary products, combine, and
            // re-interleave into (re, im) order
            m9 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
            m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
            m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
            m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));

            m11 = _mm_add_ps(m9, m10);
            m21 = _mm_add_ps(m19, m20);
            m12 = _mm_sub_ps(m9, m10);
            m22 = _mm_sub_ps(m19, m20);

            m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
            m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));

            m14 = _mm_unpacklo_ps(m12, m13);
            m24 = _mm_unpacklo_ps(m22, m23);

            _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
            _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
        }
    }
}
// Computes a 64-bit perceptual hash for a pixmap: scale to 32x32, convert to
// grayscale, take the 8x8 lowest-frequency DCT coefficients, and set one bit
// per coefficient above the mean (DC term excluded from the mean).
phash phash_for_pixmap(const QPixmap& pixmap)
{
    static bool cos_table_initialized = false;
    ALIGN(16, static float cos_table[8][8][32][32]);
    ALIGN(16, float intensity[32][32]);

    // NOTE(review): lazy init of the static table is not thread-safe if this
    // can be called from multiple threads concurrently -- confirm.
    if(!cos_table_initialized) {
        cos_table_initialized = true;
        // 32x32 DCT, though we are only interested in the top left 8x8, representing lowest frequencies in the image
        for(int u = 0; u < 8; u++) {
            for(int v = 0; v < 8; v++) {
                for(int y = 0; y < 32; y++) {
                    for(int x = 0; x < 32; x++) {
                        cos_table[v][u][y][x] = cosf(M_PI / 32.0f * (x + 0.5f) * u) * cosf(M_PI / 32.0f * (y + 0.5f) * v);
                    }
                }
            }
        }
    }

    // Scale down to 32x32
    QImage image = pixmap.scaled(32, 32, Qt::IgnoreAspectRatio, Qt::SmoothTransformation).toImage();

    float dct[64];
    int counter = 0;

    // Convert to grayscale -- 0.2126/0.7152/0.0722 are the BT.709 luma
    // weights; the dot product is done in SSE with a horizontal add
    const __m128 luminance = _mm_set_ps(.0f, 0.2126f, 0.7152f, 0.0722f);
    for(int y = 0; y < 32; y++) {
        for(int x = 0; x < 32; x++) {
            QRgb pixel = image.pixel(x, y);
            __m128 p = _mm_set_ps(0, qRed(pixel), qGreen(pixel), qBlue(pixel));
            __m128 v = _mm_mul_ps(luminance, p);
            __m128 t = _mm_add_ps(v, _mm_movehl_ps(v, v));
            __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
            _mm_store_ss(&intensity[y][x], sum);
        }
    }

    // DCT: dct[8*u+v] = sum over all pixels of intensity * cos_table[v][u]
    for(int u = 0; u < 8; u++) {
        for(int v = 0; v < 8; v++) {
            __m128 acc = _mm_setzero_ps();
            for(int y = 0; y < 32; y++) {
                for(int x = 0; x < 32; x+=4) {
                    __m128 in = _mm_load_ps(&intensity[y][x]);
                    __m128 cos = _mm_load_ps(&cos_table[v][u][y][x]);
                    __m128 out = _mm_mul_ps(in, cos);
                    acc = _mm_add_ps(out, acc);
                }
            }
            // reduce the four partial sums to the final coefficient
            __m128 t = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
            __m128 sum = _mm_add_ss(t, _mm_shuffle_ps(t, t, 1));
            _mm_store_ss(&dct[counter++], sum);
        }
    }

    // Mean, skip first one (the DC coefficient)
    float mean = 0.0;
    for(int i = 1; i < 64; i++) {
        mean += dct[i];
    }
    mean /= 63;

    // Calculate the final hash: bit i set when dct[i] is above the mean
    phash hash = 0;
    for(int i = 0; i < 64; i++) {
        phash val = dct[i] > mean;
        hash |= val << i;
    }
    return hash;
}
// Field-aware factorization machine kernel over one instance.
// do_update == false: returns the sum over all feature pairs (j1,j2) of
// v1*v2 * dot(w[j1][f2], w[j2][f1]) (the FFM prediction term).
// do_update == true: performs one SGD step on each interacting pair of
// latent vectors (L2 term lambda*w plus kappa*v1*v2*w_other, step size
// eta / sqrt(accumulated g^2), i.e. AdaGrad-style) and returns 0.
// W layout: for each (feature, field) a block of num_factors weights
// followed by num_factors squared-gradient accumulators (hence the *2 in
// align0). num_factors is assumed to be a multiple of 4 -- confirm.
float FFM::wTx(const ffm_node *instance, const unsigned int & size, float kappa, float eta, float lambda, bool do_update)
{
    long long align0 = (long long)parameters.num_factors*2;
    long long align1 = (long long)num_fields*align0;

    __m128 XMMkappa = _mm_set1_ps(kappa);
    __m128 XMMeta = _mm_set1_ps(eta);
    __m128 XMMlambda = _mm_set1_ps(lambda);
    __m128 XMMt = _mm_setzero_ps();

    for( unsigned int n1 = 0; n1 < size; n1++ )
    {
        int j1 = instance[n1].index;
        int f1 = instance[n1].field_index;
        float v1 = instance[n1].value;
        if(j1 >= num_features || f1 >= num_fields) continue;  // skip out-of-range nodes

        for( unsigned int n2 = n1 + 1; n2 < size; n2++ )
        {
            int j2 = instance[n2].index;
            int f2 = instance[n2].field_index;
            float v2 = instance[n2].value;
            if(j2 >= num_features || f2 >= num_fields) continue;

            // latent vector of feature j1 for field f2, and vice versa
            float *w1 = W + j1*align1 + f2*align0;
            float *w2 = W + j2*align1 + f1*align0;

            __m128 XMMv = _mm_set1_ps(v1*v2);

            if(do_update)
            {
                __m128 XMMkappav = _mm_mul_ps(XMMkappa, XMMv);

                // squared-gradient accumulators live right after the weights
                float *wg1 = w1 + parameters.num_factors;
                float *wg2 = w2 + parameters.num_factors;
                for(int d = 0; d < parameters.num_factors; d += 4)
                {
                    __m128 XMMw1 = _mm_load_ps(w1+d);
                    __m128 XMMw2 = _mm_load_ps(w2+d);
                    __m128 XMMwg1 = _mm_load_ps(wg1+d);
                    __m128 XMMwg2 = _mm_load_ps(wg2+d);

                    // g = lambda*w + kappa*v1*v2*w_other
                    __m128 XMMg1 = _mm_add_ps(_mm_mul_ps(XMMlambda, XMMw1), _mm_mul_ps(XMMkappav, XMMw2));
                    __m128 XMMg2 = _mm_add_ps(_mm_mul_ps(XMMlambda, XMMw2), _mm_mul_ps(XMMkappav, XMMw1));

                    // accumulate g^2, then step by eta * g / sqrt(accum)
                    // (_mm_rsqrt_ps is an approximation)
                    XMMwg1 = _mm_add_ps(XMMwg1, _mm_mul_ps(XMMg1, XMMg1));
                    XMMwg2 = _mm_add_ps(XMMwg2, _mm_mul_ps(XMMg2, XMMg2));
                    XMMw1 = _mm_sub_ps(XMMw1, _mm_mul_ps(XMMeta, _mm_mul_ps(_mm_rsqrt_ps(XMMwg1), XMMg1)));
                    XMMw2 = _mm_sub_ps(XMMw2, _mm_mul_ps(XMMeta, _mm_mul_ps(_mm_rsqrt_ps(XMMwg2), XMMg2)));

                    _mm_store_ps(w1+d, XMMw1);
                    _mm_store_ps(w2+d, XMMw2);
                    _mm_store_ps(wg1+d, XMMwg1);
                    _mm_store_ps(wg2+d, XMMwg2);
                }
            }
            else
            {
                for(int d = 0; d < parameters.num_factors; d += 4)
                {
                    __m128 XMMw1 = _mm_load_ps(w1+d);
                    __m128 XMMw2 = _mm_load_ps(w2+d);
                    XMMt = _mm_add_ps(XMMt, _mm_mul_ps(_mm_mul_ps(XMMw1, XMMw2), XMMv));
                }
            }
        }
    }

    if(do_update)
        return 0;

    // horizontal sum of the four partial dot products (SSE3 hadd)
    XMMt = _mm_hadd_ps(XMMt, XMMt);
    XMMt = _mm_hadd_ps(XMMt, XMMt);
    float t;
    _mm_store_ss(&t, XMMt);
    return t;
}
/* Color-output stage: converts Lab pixels in `ivoid` to output RGB in `ovoid`.
 *
 * Two code paths:
 *  - matrix path (d->cmatrix[0] is not NaN): Lab -> XYZ -> RGB through the
 *    3x3 matrix in d->cmatrix using SSE, then a per-channel output curve
 *    (LUT below 1.0, extrapolation polynomial at or above 1.0).
 *  - lcms path (d->cmatrix[0] is NaN): per-row conversion through the lcms
 *    transform d->xform, with optional softproof gamut-check marking.
 *
 * Pixels are `ch` floats wide (ch = piece->colors). Behavior is unchanged;
 * the code fixes are renaming the channel-loop variable that shadowed the
 * pixel index `i`, and removing an unused row pointer in the curve loop.
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    // Matrix codepath: Lab -> XYZ -> RGB with SSE, non-temporal stores.
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      float *in = (float*)ivoid + ch*roi_in->width *j;
      float *out = (float*)ovoid + ch*roi_out->width*j;
      // Matrix columns; the top lane is 0, so the 4th output float is 0.
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);
      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        // t = m0*X + m1*Y + m2*Z (each matrix column scaled by one XYZ lane).
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),
                                    _mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),
                                               _mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));
        _mm_stream_ps(out,t);
      }
    }
    _mm_sfence(); // flush the non-temporal stores before the next pass reads `out`

    // Apply the per-channel output curve in place on the RGB just written.
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      float *out = (float*)ovoid + ch*roi_out->width*j;
      for(int i=0; i<roi_out->width; i++, out+=ch )
      {
        // FIX: this loop previously redeclared `i`, shadowing the pixel index.
        for(int c=0; c<3; c++)
          if (d->lut[c][0] >= 0.0f)
          {
            // LUT inside [0,1), extrapolation polynomial from 1.0 upward.
            out[c] = (out[c] < 1.0f) ? lerp_lut(d->lut[c], out[c]) : dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]);
          }
      }
    }
  }
  else
  {
    // lcms codepath: pack each row into tightly packed 3-float Lab, run the
    // transform once per row, then unpack the RGB result.
    float *in = (float*)ivoid;
    float *out = (float*)ovoid;
    const int rowsize=roi_out->width * 3;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(out, roi_out, in)
#endif
    for (int k=0; k<roi_out->height; k++)
    {
      float Lab[rowsize];
      float rgb[rowsize];
      const int m=(k*(roi_out->width*ch));
      for (int l=0; l<roi_out->width; l++)
      {
        int li=3*l,ii=ch*l;
        Lab[li+0] = in[m+ii+0];
        Lab[li+1] = in[m+ii+1];
        Lab[li+2] = in[m+ii+2];
      }
      cmsDoTransform (d->xform, Lab, rgb, roi_out->width);
      for (int l=0; l<roi_out->width; l++)
      {
        int oi=ch*l, ri=3*l;
        if(gamutcheck && (rgb[ri+0] < 0.0f || rgb[ri+1] < 0.0f || rgb[ri+2] < 0.0f))
        {
          // Softproof gamut check: paint out-of-gamut pixels as (0,1,1).
          out[m+oi+0] = 0.0f;
          out[m+oi+1] = 1.0f;
          out[m+oi+2] = 1.0f;
        }
        else
        {
          out[m+oi+0] = rgb[ri+0];
          out[m+oi+1] = rgb[ri+1];
          out[m+oi+2] = rgb[ri+2];
        }
      }
    }
  }

  // Preserve the mask/alpha channel for mask display mode.
  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
/* Gaussian blur of a 4-channel float image using a recursive (IIR) filter:
 * one vertical pass (into g->buf) followed by one horizontal pass (into out).
 * Each pass runs a forward and a backward recursion whose coefficients
 * (a0..a3, b1, b2, coefp, coefn) come from compute_gauss_params() for the
 * requested sigma/order. All four channels of a pixel are processed at once
 * in one __m128; inputs are clamped to [g->min, g->max] per channel before
 * entering the recursion.
 * NOTE(review): _mm_load_ps/_mm_store_ps require 16-byte aligned rows —
 * presumably guaranteed by the dt_gaussian_t allocator; confirm. */
void dt_gaussian_blur_4c( dt_gaussian_t *g, float *in, float *out)
{
    const int width = g->width;
    const int height = g->height;
    const int ch = 4; // fixed 4 floats per pixel in this variant
    assert(g->channels == 4);

    float a0, a1, a2, a3, b1, b2, coefp, coefn;
    compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

    // Per-channel clamp bounds, channel 0 in the low lane.
    const __m128 Labmax = _mm_set_ps(g->max[3], g->max[2], g->max[1], g->max[0]);
    const __m128 Labmin = _mm_set_ps(g->min[3], g->min[2], g->min[1], g->min[0]);

    // Intermediate buffer for the vertical pass result.
    float *temp = g->buf;

    // vertical blur column by column
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(in,out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
    for(int i=0; i<width; i++)
    {
        // Recursion state: x* = inputs, y* = outputs (previous one/two steps).
        __m128 xp = _mm_setzero_ps();
        __m128 yb = _mm_setzero_ps();
        __m128 yp = _mm_setzero_ps();
        __m128 xc = _mm_setzero_ps();
        __m128 yc = _mm_setzero_ps();
        __m128 xn = _mm_setzero_ps();
        __m128 xa = _mm_setzero_ps();
        __m128 yn = _mm_setzero_ps();
        __m128 ya = _mm_setzero_ps();

        // forward filter: seed the recursion with the first pixel of the column
        xp = MMCLAMPPS(_mm_load_ps(in+i*ch), Labmin, Labmax);
        yb = _mm_mul_ps(_mm_set_ps1(coefp), xp);
        yp = yb;

        for(int j=0; j<height; j++)
        {
            int offset = (i + j * width)*ch;
            xc = MMCLAMPPS(_mm_load_ps(in+offset), Labmin, Labmax);
            // yc = a0*xc + a1*xp - b1*yp - b2*yb
            yc = _mm_add_ps(_mm_mul_ps(xc, _mm_set_ps1(a0)), _mm_sub_ps(_mm_mul_ps(xp, _mm_set_ps1(a1)), _mm_add_ps(_mm_mul_ps(yp, _mm_set_ps1(b1)), _mm_mul_ps(yb, _mm_set_ps1(b2)))));
            _mm_store_ps(temp+offset, yc);
            xp = xc;
            yb = yp;
            yp = yc;
        }

        // backward filter: seed with the last pixel of the column
        xn = MMCLAMPPS(_mm_load_ps(in+((height - 1) * width + i)*ch), Labmin, Labmax);
        xa = xn;
        yn = _mm_mul_ps(_mm_set_ps1(coefn), xn);
        ya = yn;

        for(int j=height - 1; j > -1; j--)
        {
            int offset = (i + j * width)*ch;
            xc = MMCLAMPPS(_mm_load_ps(in+offset), Labmin, Labmax);
            // yc = a2*xn + a3*xa - b1*yn - b2*ya (anti-causal half)
            yc = _mm_add_ps(_mm_mul_ps(xn, _mm_set_ps1(a2)), _mm_sub_ps(_mm_mul_ps(xa, _mm_set_ps1(a3)), _mm_add_ps(_mm_mul_ps(yn, _mm_set_ps1(b1)), _mm_mul_ps(ya, _mm_set_ps1(b2)))));
            xa = xn;
            xn = xc;
            ya = yn;
            yn = yc;
            // Sum of causal + anti-causal halves.
            _mm_store_ps(temp+offset, _mm_add_ps(_mm_load_ps(temp+offset), yc));
        }
    }

    // horizontal blur line by line (reads temp, writes out)
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
    for(int j=0; j<height; j++)
    {
        __m128 xp = _mm_setzero_ps();
        __m128 yb = _mm_setzero_ps();
        __m128 yp = _mm_setzero_ps();
        __m128 xc = _mm_setzero_ps();
        __m128 yc = _mm_setzero_ps();
        __m128 xn = _mm_setzero_ps();
        __m128 xa = _mm_setzero_ps();
        __m128 yn = _mm_setzero_ps();
        __m128 ya = _mm_setzero_ps();

        // forward filter: seed with the first pixel of the row
        xp = MMCLAMPPS(_mm_load_ps(temp+j*width*ch), Labmin, Labmax);
        yb = _mm_mul_ps(_mm_set_ps1(coefp), xp);
        yp = yb;

        for(int i=0; i<width; i++)
        {
            int offset = (i + j * width)*ch;
            xc = MMCLAMPPS(_mm_load_ps(temp+offset), Labmin, Labmax);
            yc = _mm_add_ps(_mm_mul_ps(xc, _mm_set_ps1(a0)), _mm_sub_ps(_mm_mul_ps(xp, _mm_set_ps1(a1)), _mm_add_ps(_mm_mul_ps(yp, _mm_set_ps1(b1)), _mm_mul_ps(yb, _mm_set_ps1(b2)))));
            _mm_store_ps(out+offset, yc);
            xp = xc;
            yb = yp;
            yp = yc;
        }

        // backward filter: seed with the last pixel of the row
        xn = MMCLAMPPS(_mm_load_ps(temp+((j + 1)*width - 1)*ch), Labmin, Labmax);
        xa = xn;
        yn = _mm_mul_ps(_mm_set_ps1(coefn), xn);
        ya = yn;

        for(int i=width - 1; i > -1; i--)
        {
            int offset = (i + j * width)*ch;
            xc = MMCLAMPPS(_mm_load_ps(temp+offset), Labmin, Labmax);
            yc = _mm_add_ps(_mm_mul_ps(xn, _mm_set_ps1(a2)), _mm_sub_ps(_mm_mul_ps(xa, _mm_set_ps1(a3)), _mm_add_ps(_mm_mul_ps(yn, _mm_set_ps1(b1)), _mm_mul_ps(ya, _mm_set_ps1(b2)))));
            xa = xn;
            xn = xc;
            ya = yn;
            yn = yc;
            _mm_store_ps(out+offset, _mm_add_ps(_mm_load_ps(out+offset), yc));
        }
    }
}
/* One middle butterfly stage of the 128-point complex FFT, SSE2 version.
 * `a` holds 64 interleaved complex values (re, im) = 128 floats.
 * Each loop iteration processes two adjacent complex elements at once:
 * two complex numbers are loaded with 64-bit loads and packed side by side
 * into one __m128. The first loop handles the k = 0 group (trivial twiddles
 * except wk1r from cftmdl_wk1r); the second block handles the k = 64 group
 * with twiddle factors from the rdft_wk{1,2,3}{r,i} tables.
 * NOTE(review): names and tables match the Ooura rdft layout used by this
 * file's sibling routines — confirm against the scalar reference cftmdl. */
static void cftmdl_128_SSE2(float* a)
{
    const int l = 8;
    // Sign mask that negates alternating lanes (used to form +/-i * x).
    const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
    int j0;

    __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
    for (j0 = 0; j0 < l; j0 += 2)
    {
        // Load two complex values from each of the four butterfly inputs.
        const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
        const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
        const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
        const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
        // Pack the two 64-bit pairs into one vector: [a+0, a+32] and [a+8, a+40].
        const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), _MM_SHUFFLE(1, 0, 1, 0));
        const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), _MM_SHUFFLE(1, 0, 1, 0));
        // x0 = a0 + a1, x1 = a0 - a1 (two complex values per vector).
        __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
        const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

        const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
        const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
        const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
        const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
        const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), _MM_SHUFFLE(1, 0, 1, 0));
        const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), _MM_SHUFFLE(1, 0, 1, 0));
        // x2 = a2 + a3, x3 = a2 - a3.
        const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
        const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

        // Radix-4 combination of the sums.
        const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
        const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

        // Swap re/im of x3 and flip alternating signs: multiplication by +/-i.
        const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
        const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
        const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
        const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

        // Combine the second complex value of each result and scale by wk1r.
        const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
        const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
        const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
        const __m128 yy3 = _mm_add_ps(yy0, yy2);
        const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);

        // Scatter the results back as 64-bit (one complex value) stores.
        _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
        _mm_storel_epi64( (__m128i*)&a[j0 + 32], _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
        _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
        _mm_storel_epi64( (__m128i*)&a[j0 + 48], _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
        // The store above put im before re; negate to fix the sign of im.
        a[j0 + 48] = -a[j0 + 48];
        _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
        _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
        _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
        _mm_storel_epi64( (__m128i*)&a[j0 + 56], _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
    }

    {
        // Second group: offset k = 64, with non-trivial twiddle factors.
        int k = 64;
        int k1 = 2;
        int k2 = 2 * k1;
        const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
        const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
        const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
        const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
        const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
        wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
        for (j0 = k; j0 < l + k; j0 += 2)
        {
            // Same load/pack pattern as the first loop.
            const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
            const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
            const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
            const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
            const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), _MM_SHUFFLE(1, 0, 1, 0));
            const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), _MM_SHUFFLE(1, 0, 1, 0));
            __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
            const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

            const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
            const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
            const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
            const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
            const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), _MM_SHUFFLE(1, 0, 1, 0));
            const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), _MM_SHUFFLE(1, 0, 1, 0));
            const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
            const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

            const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
            const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

            // Complex multiply xx1 by wk2: re part via wk2r, swapped-lane
            // part via wk2i (the sign handling is baked into the tables).
            const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
            const __m128 xx3 = _mm_mul_ps(wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
            const __m128 xx4 = _mm_add_ps(xx2, xx3);

            const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
            const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
            const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
            const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

            // Complex multiply (x1 + i*x3) by wk1 ...
            const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
            const __m128 xx11 = _mm_mul_ps( wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), _MM_SHUFFLE(2, 3, 0, 1))));
            const __m128 xx12 = _mm_add_ps(xx10, xx11);
            // ... and (x1 - i*x3) by wk3.
            const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
            const __m128 xx21 = _mm_mul_ps( wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), _MM_SHUFFLE(2, 3, 0, 1))));
            const __m128 xx22 = _mm_add_ps(xx20, xx21);

            // Scatter the four results, high complex value to the +32 slot.
            _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
            _mm_storel_epi64( (__m128i*)&a[j0 + 32], _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
            _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
            _mm_storel_epi64( (__m128i*)&a[j0 + 48], _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
            _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
            _mm_storel_epi64( (__m128i*)&a[j0 + 40], _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
            _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
            _mm_storel_epi64( (__m128i*)&a[j0 + 56], _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
        }
    }
}
/* Post-processing for the inverse 128-point real FFT (rftbsub), SSE2.
 * Recombines the half-spectrum pairs (a[j2], a[128-j2]) using the twiddle
 * table at rdft_w + 32: wkr = 0.5 - c[32-j1], wki = c[j1]. The main loop
 * processes four pairs per iteration; a scalar loop finishes the remainder.
 * The sign flips on a[1] and a[65] conjugate the packed Nyquist/DC terms.
 * NOTE(review): mirrors the scalar rftbsub of the Ooura rdft — confirm. */
static void rftbsub_128_SSE2(float* a)
{
    const float* c = rdft_w + 32;
    int j1, j2, k1, k2;
    float wkr, wki, xr, xi, yr, yi;
    static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    const __m128 mm_half = _mm_load_ps(k_half);

    a[1] = -a[1];

    // Vectorized code (four pairs at once).
    // Note: the commented numbers are indexes for the first loop iteration.
    for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8)
    {
        // Load 'wk': wkr comes from the mirrored end of the table (hence the
        // lane reversal), wki straight from the front.
        const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
        const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
        const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
        const __m128 wkr_ = _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
        const __m128 wki_ = c_j1; // 1, 2, 3, 4,

        // Load and shuffle 'a': de-interleave the forward run (j2) and the
        // mirrored run (128 - j2) into separate even/odd-index vectors.
        const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
        const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
        const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
        const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
        const __m128 a_j2_p0 = _mm_shuffle_ps( a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
        const __m128 a_j2_p1 = _mm_shuffle_ps( a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
        const __m128 a_k2_p0 = _mm_shuffle_ps( a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
        const __m128 a_k2_p1 = _mm_shuffle_ps( a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,

        // Calculate 'x'.
        const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); // 2-126, 4-124, 6-122, 8-120,
        const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); // 3-127, 5-125, 7-123, 9-121,

        // Calculate product into 'y'.
        // yr = wkr * xr + wki * xi;
        // yi = wkr * xi - wki * xr;
        const __m128 a_ = _mm_mul_ps(wkr_, xr_);
        const __m128 b_ = _mm_mul_ps(wki_, xi_);
        const __m128 c_ = _mm_mul_ps(wkr_, xi_);
        const __m128 d_ = _mm_mul_ps(wki_, xr_);
        const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
        const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,

        // Update 'a'.
        // a[j2 + 0] = a[j2 + 0] - yr;
        // a[j2 + 1] = yi - a[j2 + 1];
        // a[k2 + 0] = yr + a[k2 + 0];
        // a[k2 + 1] = yi - a[k2 + 1];
        const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
        const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
        const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120,
        const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,

        // Shuffle in right order and store (re-interleave; the mirrored run
        // needs an extra 64-bit swap to restore ascending order).
        const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); // 2, 3, 4, 5,
        const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); // 6, 7, 8, 9,
        const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); // 122, 123, 120, 121,
        const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); // 126, 127, 124, 125,
        const __m128 a_k2_0n = _mm_shuffle_ps( a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
        const __m128 a_k2_4n = _mm_shuffle_ps( a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
        _mm_storeu_ps(&a[0 + j2], a_j2_0n);
        _mm_storeu_ps(&a[4 + j2], a_j2_4n);
        _mm_storeu_ps(&a[122 - j2], a_k2_0n);
        _mm_storeu_ps(&a[126 - j2], a_k2_4n);
    }

    // Scalar code for the remaining items.
    for (; j2 < 64; j1 += 1, j2 += 2)
    {
        k2 = 128 - j2;
        k1 = 32 - j1;
        wkr = 0.5f - c[k1];
        wki = c[j1];
        xr = a[j2 + 0] - a[k2 + 0];
        xi = a[j2 + 1] + a[k2 + 1];
        yr = wkr * xr + wki * xi;
        yi = wkr * xi - wki * xr;
        a[j2 + 0] = a[j2 + 0] - yr;
        a[j2 + 1] = yi - a[j2 + 1];
        a[k2 + 0] = yr + a[k2 + 0];
        a[k2 + 1] = yi - a[k2 + 1];
    }

    a[65] = -a[65];
}
/* First butterfly stage of the 128-point complex FFT, SSE2 version.
 * Each iteration consumes 16 floats (8 complex values = two complex values
 * per __m128 lane pair), forms radix-4 butterflies, applies the twiddle
 * factors wk1/wk2/wk3 read from the rdft_wk* tables at index k2, and writes
 * the results back in place with unaligned stores.
 * NOTE(review): matches the vectorized cft1st of the Ooura rdft — the
 * twiddle tables are assumed to be precomputed elsewhere in this file. */
static void cft1st_128_SSE2(float* a)
{
    // Sign mask negating alternating lanes: implements multiplication by i
    // on two packed complex values at once (after a re/im lane swap).
    const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
    int j, k2;

    for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4)
    {
        __m128 a00v = _mm_loadu_ps(&a[j + 0]);
        __m128 a04v = _mm_loadu_ps(&a[j + 4]);
        __m128 a08v = _mm_loadu_ps(&a[j + 8]);
        __m128 a12v = _mm_loadu_ps(&a[j + 12]);
        // Regroup so each vector holds the same butterfly input for two
        // consecutive butterflies: a0|a1, a2|a3 pairs.
        __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
        __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
        __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
        __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));

        // Twiddle factors for this group.
        const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
        const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
        const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
        const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
        const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
        const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);

        // Radix-4 butterfly sums/differences.
        __m128 x0v = _mm_add_ps(a01v, a23v);
        const __m128 x1v = _mm_sub_ps(a01v, a23v);
        const __m128 x2v = _mm_add_ps(a45v, a67v);
        const __m128 x3v = _mm_sub_ps(a45v, a67v);
        __m128 x0w;

        a01v = _mm_add_ps(x0v, x2v);
        x0v = _mm_sub_ps(x0v, x2v);
        // x0w = x0 with re/im swapped, for the complex multiply below.
        x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
        {
            // a45 = wk2 * (x0 - x2): complex multiply via re-part and
            // swapped-lane part (signs baked into the wk2i table).
            const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
            const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
            a45v = _mm_add_ps(a45_0v, a45_1v);
        }
        {
            __m128 a23_0v, a23_1v;
            // x3s = i * x3 (lane swap + alternating sign flip).
            const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
            const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
            // a23 = wk1 * (x1 + i*x3).
            x0v = _mm_add_ps(x1v, x3s);
            x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
            a23_0v = _mm_mul_ps(wk1rv, x0v);
            a23_1v = _mm_mul_ps(wk1iv, x0w);
            a23v = _mm_add_ps(a23_0v, a23_1v);
            // Reuse x0v/x0w for (x1 - i*x3), consumed by the wk3 block below.
            x0v = _mm_sub_ps(x1v, x3s);
            x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
        }
        {
            // a67 = wk3 * (x1 - i*x3).
            const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
            const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
            a67v = _mm_add_ps(a67_0v, a67_1v);
        }

        // Undo the initial regrouping and store back in original order.
        a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
        a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
        a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
        a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
        _mm_storeu_ps(&a[j + 0], a00v);
        _mm_storeu_ps(&a[j + 4], a04v);
        _mm_storeu_ps(&a[j + 8], a08v);
        _mm_storeu_ps(&a[j + 12], a12v);
    }
}