/**
 * Evaluate an even (cosine-only) Fourier series for three color channels
 * at the azimuthal angle \c phi.
 *
 * Only the luminance (Y), red (R) and blue (B) channels are stored as
 * coefficients; the green channel is reconstructed from the linear
 * combination below.
 *
 * \param coeffs  Three coefficient arrays (Y, R, B), each of length
 *                \c nCoeffs. NOTE(review): the SIMD path reads with
 *                \c _mm_load_ps in groups of four starting at index 1,
 *                so storage is assumed aligned and zero-padded — confirm
 *                against the allocation site.
 * \param nCoeffs Number of Fourier coefficients per channel
 * \param phi     Angle at which the series is evaluated
 * \return        The evaluated color value
 */
Color3 evalFourier3(float * const coeffs[3], size_t nCoeffs, Float phi) {
#if FOURIER_SCALAR == 1
    /* Scalar fallback: generate cos(i*phi) incrementally via the Chebyshev
       three-term recurrence cos(i*phi) = 2 cos(phi) cos((i-1)phi) - cos((i-2)phi)
       instead of calling std::cos() once per coefficient. */
    double cosPhi      = std::cos((double) phi),
           cosPhi_prev = cosPhi,
           /* Fix: double literal (was 1.0f) so the scalar path matches the
              double-precision SIMD branch below bit-for-bit at startup */
           cosPhi_cur  = 1.0;

    double Y = 0, R = 0, B = 0;

    for (size_t i=0; i<nCoeffs; ++i) {
        Y += coeffs[0][i] * cosPhi_cur;
        R += coeffs[1][i] * cosPhi_cur;
        B += coeffs[2][i] * cosPhi_cur;

        double cosPhi_next = 2*cosPhi*cosPhi_cur - cosPhi_prev;
        cosPhi_prev = cosPhi_cur;
        cosPhi_cur  = cosPhi_next;
    }

    /* Reconstruct green from Y/R/B. Fix: double literals (were float
       literals 1.39829f etc.), matching the SIMD branch and sampleFourier3
       so that both compile-time paths agree. */
    double G = 1.39829*Y - 0.100913*B - 0.297375*R;

    return Color3((Float) R, (Float) G, (Float) B);
#else
    /* Vectorized variant: the recurrence is advanced four cosine terms at a
       time; factorPhi_prev/cur encode the matrix power of the recurrence
       (set up by initializeRecurrence). Term 0 is folded into the initial
       accumulators via _mm256_set_sd. */
    double cosPhi = std::cos((double) phi);

    __m256d
        cosPhi_prev = _mm256_set1_pd(cosPhi),
        cosPhi_cur  = _mm256_set1_pd(1.0),
        Y           = _mm256_set_sd((double) coeffs[0][0]),
        R           = _mm256_set_sd((double) coeffs[1][0]),
        B           = _mm256_set_sd((double) coeffs[2][0]),
        factorPhi_prev, factorPhi_cur;

    initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

    for (size_t i=1; i<nCoeffs; i+=4) {
        /* Next four cos(i*phi) terms from the previous block of terms */
        __m256d cosPhi_next = _mm256_add_pd(
            _mm256_mul_pd(factorPhi_prev, cosPhi_prev),
            _mm256_mul_pd(factorPhi_cur, cosPhi_cur));

        Y = _mm256_add_pd(Y, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[0]+i))));
        R = _mm256_add_pd(R, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[1]+i))));
        B = _mm256_add_pd(B, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[2]+i))));

        /* Broadcast the last two terms of the block as seeds for the next */
        cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
        cosPhi_cur  = _mm256_splat3_pd(cosPhi_next);
    }

    /* Horizontally reduce all three accumulators in one pass */
    MM_ALIGN32 struct { double Y; double R; double B; double unused; } tmp;
    simd::hadd(Y, R, B, _mm256_setzero_pd(), (double *) &tmp);

    double G = 1.39829*tmp.Y -0.100913*tmp.B - 0.297375*tmp.R;

    return Color3((Float) tmp.R, (Float) G, (Float) tmp.B);
#endif
}
/**
 * De-interleave an array of single-precision complex samples into separate
 * double-precision I (real) and Q (imaginary) output arrays.
 *
 * Processes four complex samples (eight floats) per AVX iteration, then
 * handles the remaining 0-3 samples with a scalar tail loop.
 *
 * \param src  Input array of \c len complex samples (interleaved re/im floats)
 * \param dstI Output array receiving \c len real parts as doubles
 * \param dstQ Output array receiving \c len imaginary parts as doubles
 * \param len  Number of complex samples
 */
void ComplexToDouble(Complex *src, double *dstI, double *dstQ, const unsigned int len) {
    /* The all-lanes load mask is loop-invariant — build it once instead of
       re-materializing _mm_set_epi32 on every iteration */
    const __m128i loadMask = _mm_set_epi32(SET_1, SET_1, SET_1, SET_1);

    for (unsigned int i=0; i+4<=len; i+=4) {
        /* Load two complex samples each: A = [r0 i0 r1 i1], B = [r2 i2 r3 i3] */
        const __m128 avxA = _mm_maskload_ps((float*)(src+i), loadMask);
        const __m128 avxB = _mm_maskload_ps((float*)(src+i+2), loadMask);

        /* Widen float -> double (four lanes each) */
        const __m256d avxA_D = _mm256_cvtps_pd(avxA);
        const __m256d avxB_D = _mm256_cvtps_pd(avxB);

        /* Regroup 128-bit halves: X = [r0 i0 r2 i2], Y = [r1 i1 r3 i3] */
        const __m256d avxX_D = _mm256_permute2f128_pd(avxA_D, avxB_D, 0x20);
        const __m256d avxY_D = _mm256_permute2f128_pd(avxA_D, avxB_D, 0x31);

        /* Per-lane shuffle completes the de-interleave:
           R = [r0 r1 r2 r3], I = [i0 i1 i2 i3] */
        const __m256d avxR_D = _mm256_shuffle_pd(avxX_D, avxY_D, 0x00);
        const __m256d avxI_D = _mm256_shuffle_pd(avxX_D, avxY_D, 0x0f);

        _mm256_storeu_pd(dstI+i, avxR_D);
        _mm256_storeu_pd(dstQ+i, avxI_D);
    }

    /* Scalar tail: remaining len % 4 samples */
    for (unsigned int i=len-(len&0x03); i<len; ++i) {
        dstI[i] = static_cast<double>(src[i].m_real);
        dstQ[i] = static_cast<double>(src[i].m_imag);
    }
}
/**
 * Evaluate an even (cosine-only) Fourier series at the angle \c phi.
 *
 * \param coeffs  Coefficient array of length \c nCoeffs. NOTE(review): the
 *                SIMD path loads groups of four with \c _mm_load_ps starting
 *                at index 1, so aligned, padded storage is assumed — confirm
 *                at the allocation site.
 * \param nCoeffs Number of coefficients
 * \param phi     Evaluation angle
 * \return        Value of the series at \c phi
 */
Float evalFourier(const float *coeffs, size_t nCoeffs, Float phi) {
#if FOURIER_SCALAR == 1
    /* Generate cos(k*phi) terms on the fly with the Chebyshev three-term
       recurrence cos(k*phi) = 2 cos(phi) cos((k-1)phi) - cos((k-2)phi) */
    const double cp = std::cos((double) phi);

    double termPrev = cp;
    double termCur  = 1.0;
    double accum    = 0.0;

    for (size_t k = 0; k < nCoeffs; ++k) {
        accum += coeffs[k] * termCur;

        const double termNext = 2.0*cp*termCur - termPrev;
        termPrev = termCur;
        termCur  = termNext;
    }

    return (Float) accum;
#else
    /* Vectorized variant: the recurrence advances four terms per step using
       factor vectors prepared by initializeRecurrence(); coefficient 0 is
       folded into the initial accumulator via _mm256_set_sd */
    const double cp = std::cos((double) phi);

    __m256d termPrev = _mm256_set1_pd(cp);
    __m256d termCur  = _mm256_set1_pd(1.0);
    __m256d accum    = _mm256_set_sd((double) coeffs[0]);
    __m256d facPrev, facCur;

    initializeRecurrence(cp, facPrev, facCur);

    for (size_t k = 1; k < nCoeffs; k += 4) {
        /* Next block of four cosine terms */
        __m256d termNext = _mm256_add_pd(
            _mm256_mul_pd(facPrev, termPrev),
            _mm256_mul_pd(facCur, termCur));

        /* Widen four float coefficients and accumulate */
        __m256d coeffVec = _mm256_cvtps_pd(_mm_load_ps(coeffs + k));
        accum = _mm256_add_pd(accum, _mm256_mul_pd(termNext, coeffVec));

        /* Seed the next block from the last two terms of this one */
        termPrev = _mm256_splat2_pd(termNext);
        termCur  = _mm256_splat3_pd(termNext);
    }

    return (Float) simd::hadd(accum);
#endif
}
/**
 * Importance-sample the azimuthal angle \c phi proportionally to the
 * luminance channel of a three-channel Fourier series, and evaluate the
 * color value and PDF at the sampled angle.
 *
 * The CDF (the integrated sine series) is inverted with a Newton iteration
 * safeguarded by bisection brackets [a, c]; the derivative of the CDF is
 * the Fourier series itself, so the final Newton derivative doubles as the
 * luminance Y at the sampled angle.
 *
 * \param coeffs  Three coefficient arrays (Y, R, B) of length \c nCoeffs.
 *                NOTE(review): the SIMD path reads in aligned groups of four
 *                past index 0, so padded storage is assumed — confirm.
 * \param recip   Precomputed reciprocals 1/j used to integrate the cosine
 *                series into a sine series
 * \param nCoeffs Number of Fourier coefficients per channel
 * \param sample  Uniform random sample in [0, 1)
 * \param pdf     (out) PDF of the sampled angle w.r.t. solid angle in phi
 * \param phi     (out) Sampled azimuthal angle
 * \return        Color value at \c phi, weighted by coeff0/Y and 2*pi
 */
Color3 sampleFourier3(float * const coeffs[3], const double *recip, size_t nCoeffs,
        Float sample, Float &pdf, Float &phi) {
    /* Exploit even symmetry: fold the sample into [0, 0.5] and remember
       whether the resulting angle must be mirrored to [pi, 2pi] */
    bool flip = false;
    if (sample < 0.5f) {
        sample *= 2.0f;
    } else {
        sample = 1.0f - 2.0f * (sample - 0.5f);
        flip = true;
    }

    int iterations = 0;

    /* Bisection bracket [a, c] over [0, pi]; y is the CDF target value.
       coeff0 * pi is the total (unnormalized) mass of the luminance CDF. */
    double a = 0.0,
           c = math::Pi_d,
           coeff0 = coeffs[0][0],
           y = coeff0*math::Pi_d*sample,
           deriv = 0.0,
           b = 0.5 * math::Pi_d,
           cosB = 0,
           sinB = 1;

    if (nCoeffs > 10 && sample != 0 && sample != 1) {
        /* Heuristic initial guess: approximate the distribution by a
           Gaussian whose stddev is estimated from the coefficient decay,
           then invert via the normal quantile function */
        float stddev = std::sqrt(2.0f / 3.0f *
            std::log(coeffs[0][1] / coeffs[0][2]));

        if (std::isfinite(stddev)) {
            b = std::min(c, (double) math::normal_quantile(
                0.5f + sample / 2) * stddev);
            cosB = std::cos(b);
            sinB = std::sqrt(1 - cosB * cosB);
        }
    }

#if FOURIER_SCALAR != 1
    /* Recurrence factor vectors; also reused by the R/B evaluation below */
    __m256d factorB_prev, factorB_cur;
#endif

    /* Safeguarded Newton iteration: evaluate CDF(b) - y ('value') and
       CDF'(b) = series(b) ('deriv') in one pass over the coefficients */
    while (true) {
#if FOURIER_SCALAR == 1
        /* Joint sine/cosine Chebyshev recurrences:
           sin(j b) integrates the series (CDF), cos(j b) differentiates */
        double cosB_prev = cosB, sinB_prev = -sinB,
               sinB_cur = 0.0, cosB_cur = 1.0,
               value = coeff0 * b;

        deriv = coeff0;

        for (size_t j=1; j<nCoeffs; ++j) {
            double sinB_next = 2.0*cosB*sinB_cur - sinB_prev,
                   cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
                   coeff     = (double) coeffs[0][j];

            /* recip[j] = 1/j: integral of cos(j x) is sin(j x)/j */
            value += coeff * recip[j] * sinB_next;
            deriv += coeff * cosB_next;

            sinB_prev = sinB_cur; sinB_cur = sinB_next;
            cosB_prev = cosB_cur; cosB_cur = cosB_next;
        }
#else
        /* Vectorized: both recurrences advance four terms per step */
        initializeRecurrence(cosB, factorB_prev, factorB_cur);

        __m256d
            sinB_prev = _mm256_set1_pd(-sinB),
            sinB_cur  = _mm256_set1_pd(0.0),
            cosB_prev = _mm256_set1_pd(cosB),
            cosB_cur  = _mm256_set1_pd(1.0),
            value_vec = _mm256_set_sd(coeff0 * b),
            deriv_vec = _mm256_set_sd(coeff0);

        for (size_t j=1; j<nCoeffs; j+=4) {
            __m128 coeff_vec_f = _mm_load_ps(coeffs[0]+j);
            __m256d recip_vec = _mm256_load_pd(recip+j);
            __m256d coeff_vec = _mm256_cvtps_pd(coeff_vec_f);

            __m256d sinB_next = _mm256_add_pd(
                _mm256_mul_pd(factorB_prev, sinB_prev),
                _mm256_mul_pd(factorB_cur, sinB_cur));

            __m256d cosB_next = _mm256_add_pd(
                _mm256_mul_pd(factorB_prev, cosB_prev),
                _mm256_mul_pd(factorB_cur, cosB_cur));

            value_vec = _mm256_add_pd(value_vec, _mm256_mul_pd(
                _mm256_mul_pd(recip_vec, coeff_vec), sinB_next));
            deriv_vec = _mm256_add_pd(deriv_vec, _mm256_mul_pd(
                coeff_vec, cosB_next));

            /* Broadcast last two terms as seeds for the next block */
            sinB_prev = _mm256_splat2_pd(sinB_next);
            cosB_prev = _mm256_splat2_pd(cosB_next);
            sinB_cur = _mm256_splat3_pd(sinB_next);
            cosB_cur = _mm256_splat3_pd(cosB_next);
        }

        double value = simd::hadd(value_vec);
        deriv = simd::hadd(deriv_vec);
#endif
        value -= y;

        /* Converged (relative to coeff0) or iteration budget exhausted */
        if (std::abs(value) <= 1e-5 * coeff0 || ++iterations > 20)
            break;
        else if (value > 0.0)
            c = b;
        else
            a = b;

        /* Newton step, falling back to bisection when it leaves [a, c] */
        b -= value / deriv;

        if (!(b >= a && b <= c))
            b = 0.5f * (a + c);

        cosB = std::cos(b);
        sinB = std::sqrt(1-cosB*cosB);
    }

    /* CDF'(b) is the luminance at the sampled angle */
    double Y = deriv;

    /* Undo the symmetry fold applied to the sample */
    if (flip)
        b = 2.0*math::Pi_d - b;

    pdf = (Float) (math::InvTwoPi_d * Y / coeff0);
    phi = (Float) b;

    /* Evaluate the R and B channels at the sampled angle (cosine series) */
#if FOURIER_SCALAR == 1
    double cosB_prev = cosB, cosB_cur = 1.0;
    double R = coeffs[1][0];
    double B = coeffs[2][0];

    for (size_t j=1; j<nCoeffs; ++j) {
        double cosB_next = 2.0*cosB*cosB_cur - cosB_prev,
               coeffR = (double) coeffs[1][j],
               coeffB = (double) coeffs[2][j];

        R += coeffR * cosB_next;
        B += coeffB * cosB_next;

        cosB_prev = cosB_cur;
        cosB_cur = cosB_next;
    }
#else
    /* Reuses factorB_prev/factorB_cur from the last Newton iteration
       (cosB was last updated consistently with them) */
    __m256d
        cosB_prev = _mm256_set1_pd(cosB),
        cosB_cur  = _mm256_set1_pd(1.0),
        R_vec     = _mm256_set_sd(coeffs[1][0]),
        B_vec     = _mm256_set_sd(coeffs[2][0]);

    for (size_t j=1; j<nCoeffs; j+=4) {
        __m128 coeff_R_vec_f = _mm_load_ps(coeffs[1]+j);
        __m128 coeff_B_vec_f = _mm_load_ps(coeffs[2]+j);
        __m256d coeff_R_vec = _mm256_cvtps_pd(coeff_R_vec_f);
        __m256d coeff_B_vec = _mm256_cvtps_pd(coeff_B_vec_f);

        __m256d cosB_next = _mm256_add_pd(
            _mm256_mul_pd(factorB_prev, cosB_prev),
            _mm256_mul_pd(factorB_cur, cosB_cur));

        R_vec = _mm256_add_pd(R_vec, _mm256_mul_pd(coeff_R_vec, cosB_next));
        B_vec = _mm256_add_pd(B_vec, _mm256_mul_pd(coeff_B_vec, cosB_next));

        cosB_prev = _mm256_splat2_pd(cosB_next);
        cosB_cur = _mm256_splat3_pd(cosB_next);
    }

    double R = simd::hadd(R_vec);
    double B = simd::hadd(B_vec);
#endif

    /* Reconstruct green from Y/R/B, then weight so that the returned value
       divided by the PDF gives the correct estimator */
    double G = 1.39829 * Y - 0.100913 * B - 0.297375 * R;

    return Color3((Float) R, (Float) G, (Float) B) * (2 * math::Pi) * (Float) (coeff0 / Y);
}
/**
 * Calculate all values in one step per pixel. Requires grabbing the
 * neighboring pixels.
 *
 * Computes a per-pixel weight from three factors: local contrast (stddev of
 * the RGB triple), well-exposedness (Gaussian of distance from mid-gray per
 * channel), and a 4-neighbor Laplacian of the greyscale image. The COST_INC_*
 * macros only account for operation counts; they do not affect the result.
 *
 * \param im     Interleaved image buffer of doubles (RGB per pixel)
 * \param center Index of the current pixel's R component in \c im
 * \param top,left,right,bottom  Indices of the 4-neighbors' R components
 * \param mask1110          Load mask selecting the first three lanes (R,G,B)
 * \param rgb0W             Greyscale weights [rw, gw, bw, 0]
 * \param onehalf           Broadcast constant 0.5
 * \param minustwelvehalf   Broadcast constant -12.5
 * \return Combined weight t9 = |contrast| * |exposedness| * |laplacian| + 1e-12
 */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110, const __m256d rgb0W, const __m256d onehalf,
        const __m256d minustwelvehalf){

    /* Scalar reference (kept for documentation):
    // double r = im[center];
    // double g = im[center+1];
    // double b = im[center+2];
    // double r1 = im[top];
    // double g1 = im[top+1];
    // double b1 = im[top+2];
    // double r2 = im[left];
    // double g2 = im[left+1];
    // double b2 = im[left+2];
    // double r3 = im[right];
    // double g3 = im[right+1];
    // double b3 = im[right+2];
    // double r4 = im[bottom];
    // double g4 = im[bottom+1];
    // double b4 = im[bottom+2];
    */

    /* Center pixel uses a masked load (only R,G,B lanes); neighbors use
       unaligned full loads — their 4th lane is ignored because rgb0W has a
       zero weight there */
    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));
    COST_INC_LOAD(20);

    /* Per-channel greyscale products (still need a horizontal sum):
    // double grey = rw * r + gw * g + bw * b;
    // double grey1 = rw * r1 + gw * g1 + bw * b1;
    // double grey2 = rw * r2 + gw * g2 + bw * b2;
    // double grey3 = rw * r3 + gw * g3 + bw * b3;
    // double grey4 = rw * r4 + gw * g4 + bw * b4;
    */
    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
    /* c_hsum = r + g + b (4th lane of c is zeroed by the masked load) */
    __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
    __m256d c_h = _mm256_hadd_pd(c,c_perm);
    __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
    __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
    double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
    double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
    double c_hsum = c_hsum_lo + c_hsum_hi;

    //AVX: double: horizontal add for 1 vector
    /* greyc_hsum = greyscale value of the center pixel */
    __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
    __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
    __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
    __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
    double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
    double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
    double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    /* Reduce grey1..grey4 to one vector holding [grey1, grey2, grey3, grey4] */
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 = _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
    /* grey1234_sum = grey1 + grey2 + grey3 + grey4 */
    __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
    __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
    __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
    __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
    double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
    double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
    double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;
    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    /* Mean of the center pixel's RGB triple */
    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

    /* Deviation from the mean, per channel:
    // double rmu = r-mu;
    // double gmu = g-mu;
    // double bmu = b-mu;
    */
    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

    /* Deviation from mid-gray 0.5, per channel:
    // double rz = r-0.5;
    // double gz = g-0.5;
    // double bz = b-0.5;
    */
    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

    /* Squared deviations:
    // double rzrz = rz*rz;
    // double gzgz = gz*gz;
    // double bzbz = bz*bz;
    */
    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

    /* Well-exposedness: exp(-12.5 * dz^2) per channel. exp_ps operates on
       single precision, hence the double->float->double round trip:
    // double re = exp(-12.5*rzrz);
    // double ge = exp(-12.5*gzgz);
    // double be = exp(-12.5*bzbz);
    */
    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);
    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);
    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

    /* Contrast: stddev of the RGB triple
    // double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    */
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);
    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);
    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;
    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);
    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

    /* Product of the three exposedness terms:
    // double t3 = re*ge*be;
    */
    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi
    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);
    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;
    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    /* Combine contrast and exposedness */
    double t5 = t2 * t4;
    COST_INC_MUL(1);

    /* 4-neighbor Laplacian of the greyscale image:
    // double t6 = -4.0*grey+grey1+grey2+grey3+grey4;
    */
    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;
    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX
    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);

    /* Epsilon keeps the weight strictly positive for later normalization */
    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);
    return t9;
}