static double evaluateGTRCATPROT_SAVE (int *ex1, int *ex2, int *cptr, int *wptr, double *x1, double *x2, double *tipVector, unsigned char *tipX1, int n, double *diagptable_start, const boolean fastScaling, double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap) { double sum = 0.0, term, *diagptable, *left, *right, *left_ptr = x1, *right_ptr = x2; int i, l; if(tipX1) { for (i = 0; i < n; i++) { left = &(tipVector[20 * tipX1[i]]); if(isGap(x2_gap, i)) right = x2_gapColumn; else { right = right_ptr; right_ptr += 20; } diagptable = &diagptable_start[20 * cptr[i]]; __m128d tv = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d lv = _mm_load_pd(&left[l]); __m128d rv = _mm_load_pd(&right[l]); __m128d mul = _mm_mul_pd(lv, rv); __m128d dv = _mm_load_pd(&diagptable[l]); tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv)); } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); term = LOG(term); sum += wptr[i] * term; } } else { for (i = 0; i < n; i++) { if(isGap(x1_gap, i)) left = x1_gapColumn; else { left = left_ptr; left_ptr += 20; } if(isGap(x2_gap, i)) right = x2_gapColumn; else { right = right_ptr; right_ptr += 20; } diagptable = &diagptable_start[20 * cptr[i]]; __m128d tv = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d lv = _mm_load_pd(&left[l]); __m128d rv = _mm_load_pd(&right[l]); __m128d mul = _mm_mul_pd(lv, rv); __m128d dv = _mm_load_pd(&diagptable[l]); tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv)); } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); term = LOG(term); sum += wptr[i] * term; } } return sum; }
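The per-site inner loops in these kernels all follow the same pattern: accumulate 20 element-wise products in a two-lane __m128d and collapse the two lanes with a single _mm_hadd_pd at the end. A minimal, self-contained sketch of that reduction (hypothetical helper name dot3_20; assumes 16-byte-aligned inputs, as the aligned loads in the kernel above do):

#include <pmmintrin.h>   // SSE3: _mm_hadd_pd

// Hypothetical helper: sum_l a[l]*b[l]*c[l] over 20 aligned doubles,
// mirroring the per-site reduction used in the kernel above.
static double dot3_20(const double *a, const double *b, const double *c)
{
    __m128d acc = _mm_setzero_pd();
    for (int l = 0; l < 20; l += 2)
    {
        __m128d ab = _mm_mul_pd(_mm_load_pd(&a[l]), _mm_load_pd(&b[l]));
        acc = _mm_add_pd(acc, _mm_mul_pd(ab, _mm_load_pd(&c[l])));
    }
    acc = _mm_hadd_pd(acc, acc);   // both lanes now hold lane0 + lane1
    double term;
    _mm_storel_pd(&term, acc);     // extract the scalar sum
    return term;
}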
static double evaluateGTRGAMMAPROT (int *ex1, int *ex2, int *wptr, double *x1, double *x2, double *tipVector, unsigned char *tipX1, int n, double *diagptable, const boolean fastScaling) { double sum = 0.0, term; int i, j, l; double *left, *right; assertionError = 0; if(tipX1) { for (i = 0; i < n; i++) { __m128d tv = _mm_setzero_pd(); left = &(tipVector[20 * tipX1[i]]); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; right = &(x2[80 * i + 20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); // [JH] sometimes term contains -0.0000.... which causes the log to become NaN if(term < 0.0) { assertionError = 1; problemCount++; printf("tipX1 i=%d, term=%E\n", i, term); term = fabs(term); } if(fastScaling) term = LOG(0.25 * term); else term = LOG(0.25 * term) + (ex2[i] * LOG(minlikelihood)); sum += wptr[i] * term; } } else { for (i = 0; i < n; i++) { __m128d tv = _mm_setzero_pd(); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; left = &(x1[80 * i + 20 * j]); right = &(x2[80 * i + 20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); // [JH] sometimes term contains -0.0000.... which causes the log to become NaN if(term < 0.0) { assertionError = 1; problemCount++; printf("nontip term=%E\n", term); term = fabs(term); } if(fastScaling) term = LOG(0.25 * term); else term = LOG(0.25 * term) + ((ex1[i] + ex2[i])*LOG(minlikelihood)); sum += wptr[i] * term; } } return sum; }
static double evaluateGTRCATPROT (int *ex1, int *ex2, int *cptr, int *wptr, double *x1, double *x2, double *tipVector, unsigned char *tipX1, int n, double *diagptable_start, const boolean fastScaling) { double sum = 0.0, term; double *diagptable, *left, *right; int i, l; if(tipX1) { for (i = 0; i < n; i++) { left = &(tipVector[20 * tipX1[i]]); right = &(x2[20 * i]); diagptable = &diagptable_start[20 * cptr[i]]; #ifdef __SIM_SSE3 __m128d tv = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d lv = _mm_load_pd(&left[l]); __m128d rv = _mm_load_pd(&right[l]); __m128d mul = _mm_mul_pd(lv, rv); __m128d dv = _mm_load_pd(&diagptable[l]); tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv)); } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); #else for(l = 0, term = 0.0; l < 20; l++) term += left[l] * right[l] * diagptable[l]; #endif term = LOG(term); sum += wptr[i] * term; } } else { for (i = 0; i < n; i++) { left = &x1[20 * i]; right = &x2[20 * i]; diagptable = &diagptable_start[20 * cptr[i]]; #ifdef __SIM_SSE3 __m128d tv = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) { __m128d lv = _mm_load_pd(&left[l]); __m128d rv = _mm_load_pd(&right[l]); __m128d mul = _mm_mul_pd(lv, rv); __m128d dv = _mm_load_pd(&diagptable[l]); tv = _mm_add_pd(tv, _mm_mul_pd(mul, dv)); } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); #else for(l = 0, term = 0.0; l < 20; l++) term += left[l] * right[l] * diagptable[l]; #endif term = LOG(term); sum += wptr[i] * term; } } return sum; }
static double evaluateGTRGAMMAPROT_perSite (int *ex1, int *ex2, int *wptr, double *x1, double *x2, unsigned char *tipX1, int n, int *perSiteAA, siteAAModels *siteProtModel) { double sum = 0.0, term; int i, j, l; double *left, *right; if(tipX1) { for (i = 0; i < n; i++) { double *tipVector = siteProtModel[perSiteAA[i]].tipVector, *diagptable = siteProtModel[perSiteAA[i]].left; __m128d tv = _mm_setzero_pd(); left = &(tipVector[20 * tipX1[i]]); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; right = &(x2[80 * i + 20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); term = LOG(0.25 * term); sum += wptr[i] * term; } } else { for (i = 0; i < n; i++) { double *diagptable = siteProtModel[perSiteAA[i]].left; __m128d tv = _mm_setzero_pd(); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; left = &(x1[80 * i + 20 * j]); right = &(x2[80 * i + 20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); term = LOG(0.25 * term); sum += wptr[i] * term; } } return sum; }
static double evaluateGTRGAMMAPROT_GAPPED_SAVE (int *ex1, int *ex2, int *wptr, double *x1, double *x2, double *tipVector, unsigned char *tipX1, int n, double *diagptable, const boolean fastScaling, double *x1_gapColumn, double *x2_gapColumn, unsigned int *x1_gap, unsigned int *x2_gap) { double sum = 0.0, term; int i, j, l; double *left, *right, *x1_ptr = x1, *x2_ptr = x2, *x1v, *x2v; if(tipX1) { for (i = 0; i < n; i++) { if(x2_gap[i / 32] & mask32[i % 32]) x2v = x2_gapColumn; else { x2v = x2_ptr; x2_ptr += 80; } __m128d tv = _mm_setzero_pd(); left = &(tipVector[20 * tipX1[i]]); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; right = &(x2v[20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); if(fastScaling) term = LOG(0.25 * term); else term = LOG(0.25 * term) + (ex2[i] * LOG(minlikelihood)); sum += wptr[i] * term; } } else { for (i = 0; i < n; i++) { if(x1_gap[i / 32] & mask32[i % 32]) x1v = x1_gapColumn; else { x1v = x1_ptr; x1_ptr += 80; } if(x2_gap[i / 32] & mask32[i % 32]) x2v = x2_gapColumn; else { x2v = x2_ptr; x2_ptr += 80; } __m128d tv = _mm_setzero_pd(); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; left = &(x1v[20 * j]); right = &(x2v[20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); if(fastScaling) term = LOG(0.25 * term); else term = LOG(0.25 * term) + ((ex1[i] + ex2[i])*LOG(minlikelihood)); sum += wptr[i] * term; } } return sum; }
__m128d test_mm_hadd_pd(__m128d A, __m128d B) {
  // CHECK-LABEL: test_mm_hadd_pd
  // CHECK: call <2 x double> @llvm.x86.sse3.hadd.pd
  // CHECK-ASM: haddpd %xmm{{.*}}, %xmm{{.*}}
  return _mm_hadd_pd(A, B);
}
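For reference, the intrinsic under test packs the pairwise sums of its two operands: the low lane of the result is A[0]+A[1] and the high lane is B[0]+B[1]. A small sanity-check sketch (not part of the test file above):

#include <pmmintrin.h>
#include <cstdio>

int main()
{
    __m128d A = _mm_setr_pd(1.0, 2.0);   // lanes: {1, 2}
    __m128d B = _mm_setr_pd(3.0, 4.0);   // lanes: {3, 4}
    __m128d H = _mm_hadd_pd(A, B);       // {A0+A1, B0+B1} = {3, 7}
    double out[2];
    _mm_storeu_pd(out, H);
    std::printf("%f %f\n", out[0], out[1]);   // prints 3.000000 7.000000
    return 0;
}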
static inline void computeVectorGTRGAMMAPROT(double *lVector, int *eVector, double *gammaRates, int i, double qz, double rz, traversalInfo *ti, double *EIGN, double *EI, double *EV, double *tipVector, unsigned char **yVector, int mxtips) { double *x1, *x2, *x3; int s, pNumber = ti->pNumber, rNumber = ti->rNumber, qNumber = ti->qNumber, index1[4], index2[4]; x3 = &(lVector[80 * (pNumber - mxtips)]); switch(ti->tipCase) { case TIP_TIP: x1 = &(tipVector[20 * yVector[qNumber][i]]); x2 = &(tipVector[20 * yVector[rNumber][i]]); for(s = 0; s < 4; s++) { index1[s] = 0; index2[s] = 0; } break; case TIP_INNER: x1 = &(tipVector[20 * yVector[qNumber][i]]); x2 = &( lVector[80 * (rNumber - mxtips)]); for(s = 0; s < 4; s++) index1[s] = 0; for(s = 0; s < 4; s++) index2[s] = s; break; case INNER_INNER: x1 = &(lVector[80 * (qNumber - mxtips)]); x2 = &(lVector[80 * (rNumber - mxtips)]); for(s = 0; s < 4; s++) { index1[s] = s; index2[s] = s; } break; default: assert(0); } { double e1[20] __attribute__ ((aligned (BYTE_ALIGNMENT))), e2[20] __attribute__ ((aligned (BYTE_ALIGNMENT))), d1[20] __attribute__ ((aligned (BYTE_ALIGNMENT))), d2[20] __attribute__ ((aligned (BYTE_ALIGNMENT))), lz1, lz2; int l, k, scale, j; for(j = 0; j < 4; j++) { lz1 = qz * gammaRates[j]; lz2 = rz * gammaRates[j]; e1[0] = 1.0; e2[0] = 1.0; for(l = 1; l < 20; l++) { e1[l] = EXP(EIGN[l] * lz1); e2[l] = EXP(EIGN[l] * lz2); } for(l = 0; l < 20; l+=2) { __m128d d1v = _mm_mul_pd(_mm_load_pd(&x1[20 * index1[j] + l]), _mm_load_pd(&e1[l])); __m128d d2v = _mm_mul_pd(_mm_load_pd(&x2[20 * index2[j] + l]), _mm_load_pd(&e2[l])); _mm_store_pd(&d1[l], d1v); _mm_store_pd(&d2[l], d2v); } __m128d zero = _mm_setzero_pd(); for(l = 0; l < 20; l+=2) _mm_store_pd(&x3[j * 20 + l], zero); for(l = 0; l < 20; l++) { double *ev = &EV[l * 20]; __m128d ump_x1v = _mm_setzero_pd(); __m128d ump_x2v = _mm_setzero_pd(); __m128d x1px2v; for(k = 0; k < 20; k+=2) { __m128d eiv = _mm_load_pd(&EI[20 * l + k]); __m128d d1v = _mm_load_pd(&d1[k]); __m128d d2v = _mm_load_pd(&d2[k]); ump_x1v = _mm_add_pd(ump_x1v, _mm_mul_pd(d1v, eiv)); ump_x2v = _mm_add_pd(ump_x2v, _mm_mul_pd(d2v, eiv)); } ump_x1v = _mm_hadd_pd(ump_x1v, ump_x1v); ump_x2v = _mm_hadd_pd(ump_x2v, ump_x2v); x1px2v = _mm_mul_pd(ump_x1v, ump_x2v); for(k = 0; k < 20; k+=2) { __m128d ex3v = _mm_load_pd(&x3[j * 20 + k]); __m128d EVV = _mm_load_pd(&ev[k]); ex3v = _mm_add_pd(ex3v, _mm_mul_pd(x1px2v, EVV)); _mm_store_pd(&x3[j * 20 + k], ex3v); } } } scale = 1; for(l = 0; scale && (l < 80); l++) scale = ((x3[l] < minlikelihood) && (x3[l] > minusminlikelihood)); if(scale) { __m128d twoto = _mm_set_pd(twotothe256, twotothe256); for(l = 0; l < 80; l+=2) { __m128d ex3v = _mm_mul_pd(_mm_load_pd(&x3[l]),twoto); _mm_store_pd(&x3[l], ex3v); } *eVector = *eVector + 1; } return; } }
static double evaluateGTRGAMMAPROT (int *ex1, int *ex2, int *wptr, double *x1, double *x2, double *tipVector, unsigned char *tipX1, int n, double *diagptable, const boolean fastScaling) { double sum = 0.0, term; int i, j, l; double *left, *right; if(tipX1) { for (i = 0; i < n; i++) { __m128d tv = _mm_setzero_pd(); left = &(tipVector[20 * tipX1[i]]); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; right = &(x2[80 * i + 20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); if(fastScaling) term = LOG(0.25 * term); else term = LOG(0.25 * term) + (ex2[i] * LOG(minlikelihood)); sum += wptr[i] * term; } } else { for (i = 0; i < n; i++) { __m128d tv = _mm_setzero_pd(); for(j = 0, term = 0.0; j < 4; j++) { double *d = &diagptable[j * 20]; left = &(x1[80 * i + 20 * j]); right = &(x2[80 * i + 20 * j]); for(l = 0; l < 20; l+=2) { __m128d mul = _mm_mul_pd(_mm_load_pd(&left[l]), _mm_load_pd(&right[l])); tv = _mm_add_pd(tv, _mm_mul_pd(mul, _mm_load_pd(&d[l]))); } } tv = _mm_hadd_pd(tv, tv); _mm_storel_pd(&term, tv); if(fastScaling) term = LOG(0.25 * term); else term = LOG(0.25 * term) + ((ex1[i] + ex2[i])*LOG(minlikelihood)); sum += wptr[i] * term; } } return sum; }
Point2d CameraATAN::Project(const Point3d& p3d)
{
    if(p3d.z<=0) return Point2d(-1,-1);

#ifdef __SSE3__
    if(useDistortion)
    {
        __m128d xy=(__m128d){p3d.x,p3d.y};
        if(p3d.z!=1.)
        {
            // Normalize by depth: the scalar path below divides x and y by z,
            // so the SIMD path must divide as well (subtracting z would be wrong).
            xy=_mm_div_pd(xy,(__m128d){p3d.z,p3d.z});
        }
        __m128d xy2=_mm_mul_pd(xy,xy);
        xy2=_mm_hadd_pd(xy2,xy2);          // x*x + y*y in both lanes
        xy2=_mm_sqrt_pd(xy2);
        double r=((Point2d*)&xy2)->x;

        if(r < 0.001 || d == 0.0)
            r=1.0;
        else
            r=(d_inv* atan(r * tan2w) / r);

        xy=_mm_mul_pd((__m128d){fx,fy},xy);
        xy=_mm_mul_pd(xy,(__m128d){r,r});
        xy=_mm_add_pd(xy,(__m128d){cx,cy});
        return *(Point2d*)&xy;
    }
    else
    {
        if(p3d.z==1.)
        {
            __m128d xy = _mm_setr_pd(p3d.x,p3d.y);
            xy=_mm_add_pd(_mm_setr_pd(cx,cy),_mm_mul_pd(xy,(__m128d){fx,fy}));
            return *(Point2d*)&xy;
        }
        else if(p3d.z>0)
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#else
    if(useDistortion)
    {
        double X=p3d.x,Y=p3d.y;
        if(p3d.z!=1.)
        {
            double z_inv=1./p3d.z;
            X*=z_inv; Y*=z_inv;
        }

        double r= sqrt(X*X+Y*Y);
        if(r < 0.001 || d == 0.0)
            r= 1.0;
        else
            r=(d_inv* atan(r * tan2w) / r);

        return Point2d(cx + fx * r * X,cy + fy * r * Y);
    }
    else
    {
        if(p3d.z==1.)
        {
            return Point2d(fx*p3d.x+cx,fy*p3d.y+cy);
        }
        else
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#endif
    return Point2d(-1,-1); // keep the compiler happy
}
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint) { #if CHECK_FUNCTIONAL Vector3d param(EPoint); #endif AVX2TABLETYPE *mp; // TODO FIXME - global statistics reference // Stats[Calls_To_DNoise]++; const __m256d ONE_PD = _mm256_set1_pd(1.0); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d ss; __m256d blend; __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)); // blend = _mm256_blend_pd(iii, jjj, 0); INCSUMAVX_VECTOR(mp, ss, iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 2); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)); blend = _mm256_blend_pd(iii, jjj, 6); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 4); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 12); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)); // blend = _mm256_blend_pd(iii, jjj, 14); INCSUMAVX_VECTOR(mp, ss, jjj); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 10); INCSUMAVX_VECTOR(mp, ss, blend); mp = 
&AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)); blend = _mm256_blend_pd(iii, jjj, 8); INCSUMAVX_VECTOR(mp, ss, blend); __m256d xy = _mm256_hadd_pd(x,y); __m128d xy_up = _mm256_extractf128_pd(xy,1); xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up); _mm_storeu_pd(&result[X],xy_up); __m128d z_up = _mm256_extractf128_pd(z,1); z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up); z_up = _mm_hadd_pd(z_up,z_up); result[Z] = _mm_cvtsd_f64(z_up); #if CHECK_FUNCTIONAL { Vector3d portable_res; PortableDNoise(portable_res , param); if (fabs(portable_res[X] - result[X]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise X error"); } if (fabs(portable_res[Y] - result[Y]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Y error"); } if (fabs(portable_res[Z] - result[Z]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Z error"); } } #endif _mm256_zeroupper(); return; }
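The tail of the routine above reduces two 4-lane accumulators with a single _mm256_hadd_pd: after the hadd, the low 128-bit half holds {x0+x1, y0+y1} and the high half holds {x2+x3, y2+y3}, so adding the two halves yields both full sums side by side, ready for one unaligned store. A standalone sketch of just that reduction (hypothetical helper name reduce2_store; AVX assumed):

#include <immintrin.h>

// Illustrative helper: reduce two 4-lane accumulators to {sum(x), sum(y)}
// and write them to two adjacent doubles, as done for result[X]/result[Y] above.
static void reduce2_store(__m256d x, __m256d y, double *out /* out[0], out[1] */)
{
    __m256d xy = _mm256_hadd_pd(x, y);              // {x0+x1, y0+y1, x2+x3, y2+y3}
    __m128d lo = _mm256_castpd256_pd128(xy);        // {x0+x1, y0+y1}
    __m128d hi = _mm256_extractf128_pd(xy, 1);      // {x2+x3, y2+y3}
    _mm_storeu_pd(out, _mm_add_pd(lo, hi));         // {sum(x), sum(y)}
}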
int main(void) { // std::cout<<std::endl<<" Compute inner product..."<<std::endl<<std::endl; // INIT VECTOR //double vec1 [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_)));//__declspec(align(n)) //double vec2 [_PBM_SIZE] __attribute__((aligned(_CBSIM_DBL_ALIGN_))); //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec1 [_PBM_SIZE]; //__declspec(align(_CBSIM_DBL_ALIGN_)) double vec2 [_PBM_SIZE]; //double *vec1 = _aligned_malloc(_PBM_SIZE*sizeof *vec1,_CBSIM_DBL_ALIGN_); //double *vec2 = _aligned_malloc(_PBM_SIZE*sizeof *vec2,_CBSIM_DBL_ALIGN_); double *vec1 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32); double *vec2 =(double *)_mm_malloc(sizeof(double)*_PBM_SIZE,32); double result = 0.0; // tbb::tick_count t1, t2; int loopsToDo = 10000; for (int i=0 ; i < _PBM_SIZE ; i++) { vec1[i] = static_cast<double>(i)*0.01; vec2[i] = static_cast<double>(i)*0.01; } // SERIAL *********************************************************************************** // t1 = tbb::tick_count::now(); for (int z=0 ; z < loopsToDo ; z++) { //__m256d ymm0; //__m256d ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;//, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18; //ymm0 = _mm256_setzero_pd(); // accumulator //double res0 = 0.0, res1 = 0.0, res2 = 0.0, res3 = 0.0; //__m256d acc = _mm256_setzero_pd(); //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0, 0.0, 0.0}; result = 0.0; //double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; for (int i=0 ; i < _PBM_SIZE; i+=8) { /* __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m128d xmm1 = _mm256_extractf128_pd(ymm3,0); __m128d xmm2 = _mm256_extractf128_pd(ymm3,1); __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); */ __assume_aligned(&vec1[0],32); __assume_aligned(&vec2[0],32); __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m256d ymm4 = _mm256_load_pd(&vec1[i+4]); __m256d ymm5 = _mm256_load_pd(&vec2[i+4]); __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 ); __m256d ymm7 = _mm256_add_pd( ymm3, ymm6); __m128d xmm1 = _mm256_extractf128_pd(ymm7,0); __m128d xmm2 = _mm256_extractf128_pd(ymm7,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); //__m256d ymm0 = _mm256_add_pd( ymm0, ymm7); /* //__assume_aligned(&vec1[0],32); //__assume_aligned(&vec2[0],32); __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); __m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m256d ymm4 = _mm256_load_pd(&vec1[i+4]); __m256d ymm5 = _mm256_load_pd(&vec2[i+4]); //__m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 ); //__m256d ymm7 = _mm256_add_pd( ymm3, ymm6); __m256d ymm6 = _mm256_fmadd_pd (ymm4,ymm5,ymm3); //ymm0 = _mm256_add_pd( ymm0, ymm7); __m128d xmm1 = _mm256_extractf128_pd(ymm6,0); __m128d xmm2 = _mm256_extractf128_pd(ymm6,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); //_mm256_store_pd(&res[0],ymm6); //result_SIMD_INTRINSICS += (res[0] + res[1]) + (res[2] + res[3]); */ //#define _VER_AVX #ifdef _VER_AVX __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec2[i]); 
__m256d ymm3 = _mm256_mul_pd( ymm1, ymm2 ); __m256d ymm4 = _mm256_load_pd(&vec1[i+4]); __m256d ymm5 = _mm256_load_pd(&vec2[i+4]); __m256d ymm6 = _mm256_mul_pd( ymm4, ymm5 ); __m256d ymm7 = _mm256_load_pd(&vec1[i+8]); __m256d ymm8 = _mm256_load_pd(&vec2[i+8]); __m256d ymm9 = _mm256_mul_pd( ymm7, ymm8 ); __m256d ymm10 = _mm256_load_pd(&vec1[i+12]); __m256d ymm11 = _mm256_load_pd(&vec2[i+12]); __m256d ymm12 = _mm256_mul_pd( ymm10, ymm11 ); __m256d ymm13 = _mm256_add_pd( ymm3, ymm6); __m256d ymm14 = _mm256_add_pd( ymm9, ymm12); __m256d ymm15 = _mm256_add_pd( ymm13, ymm14); __m128d xmm1 = _mm256_extractf128_pd(ymm15,0); __m128d xmm2 = _mm256_extractf128_pd(ymm15,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res_SIMD_INTRINSICS[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res_SIMD_INTRINSICS[0],xmm3); result += (res_SIMD_INTRINSICS[0] + res_SIMD_INTRINSICS[1]); //ymm0 = _mm256_add_pd( ymm0, ymm13); //ymm0 = _mm256_add_pd( ymm0, ymm14); #endif //#define _VER_AVX2 #ifdef _VER_AVX2 __m256d ymm1 = _mm256_load_pd(&vec1[i]); __m256d ymm2 = _mm256_load_pd(&vec1[i+4]); __m256d ymm3 = _mm256_load_pd(&vec1[i+8]); __m256d ymm4 = _mm256_load_pd(&vec1[i+12]); //__m256d ymm13 = _mm256_load_pd(&vec1[i+16]); //__m256d ymm14 = _mm256_load_pd(&vec1[i+20]); //__m256d ymm15 = _mm256_load_pd(&vec1[i+24]); //__m256d ymm16 = _mm256_load_pd(&vec1[i+28]); __m256d ymm5 = _mm256_load_pd(&vec2[i]); __m256d ymm6 = _mm256_load_pd(&vec2[i+4]); __m256d ymm7 = _mm256_load_pd(&vec2[i+8]); __m256d ymm8 = _mm256_load_pd(&vec2[i+12]); //__m256d ymm17 = _mm256_load_pd(&vec2[i+16]); //__m256d ymm18 = _mm256_load_pd(&vec2[i+20]); //__m256d ymm19 = _mm256_load_pd(&vec2[i+24]); //__m256d ymm20 = _mm256_load_pd(&vec2[i+28]); __m256d ymm9 = _mm256_mul_pd(ymm1,ymm5); __m256d ymm10 = _mm256_fmadd_pd(ymm2,ymm6,ymm9); //__m256d ymm11 = _mm256_mul_pd(ymm3,ymm7); __m256d ymm11 = _mm256_fmadd_pd(ymm3,ymm7,ymm10); __m256d ymm12 = _mm256_fmadd_pd(ymm4,ymm8,ymm11); //ymm12 = _mm256_hadd_pd(ymm10,ymm12); //__m256d ymm21 = _mm256_fmadd_pd(ymm13,ymm17,ymm12); //__m256d ymm22 = _mm256_fmadd_pd(ymm14,ymm18,ymm21); //__m256d ymm23 = _mm256_fmadd_pd(ymm15,ymm19,ymm22); //__m256d ymm24 = _mm256_fmadd_pd(ymm16,ymm20,ymm23); __m128d xmm1 = _mm256_extractf128_pd(ymm12,0); __m128d xmm2 = _mm256_extractf128_pd(ymm12,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); #endif /* // Performing 4 dot product at one time ymm1 = _mm256_load_pd(&vec1[i]); // x[0] ymm2 = _mm256_load_pd(&vec1[i+4]); // x[1] ymm3 = _mm256_load_pd(&vec1[i+8]); // x[2] ymm4 = _mm256_load_pd(&vec1[i+12]); // x[3] ymm5 = _mm256_load_pd(&vec2[i]); // y[0] ymm6 = _mm256_load_pd(&vec2[i+4]); // y[1] ymm7 = _mm256_load_pd(&vec2[i+8]); // y[2] ymm8 = _mm256_load_pd(&vec2[i+12]); // y[3] ymm9 = _mm256_mul_pd( ymm1, ymm5 ); // xy0 ymm10 = _mm256_mul_pd( ymm2, ymm6 ); // xy1 ymm11 = _mm256_hadd_pd( ymm9, ymm10 ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13 ymm12 = _mm256_mul_pd( ymm3, ymm7 ); // xy2 ymm13 = _mm256_mul_pd( ymm4, ymm8 ); // xy3 ymm14 = _mm256_hadd_pd( ymm12, ymm13 ); // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33 ymm15 = _mm256_permute2f128_pd( ymm11, ymm14, 0x21 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31 ymm1 = _mm256_blend_pd( ymm11, ymm14, 0b1100); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33 ymm2 = _mm256_add_pd( ymm15, ymm1 
); ymm0 = _mm256_add_pd( ymm0, ymm2 ); */ /* __m256d x[4], y[4]; x[0] = _mm256_load_pd(&vec1[i]); x[1] = _mm256_load_pd(&vec1[i+4]); x[2] = _mm256_load_pd(&vec1[i+8]); x[3] = _mm256_load_pd(&vec1[i+12]); y[0] = _mm256_load_pd(&vec2[i]); y[1] = _mm256_load_pd(&vec2[i+4]); y[2] = _mm256_load_pd(&vec2[i+8]); y[3] = _mm256_load_pd(&vec2[i+12]); __m256d xy0 = _mm256_mul_pd( x[0], y[0] ); __m256d xy1 = _mm256_mul_pd( x[1], y[1] ); // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13 __m256d temp01 = _mm256_hadd_pd( xy0, xy1 ); __m256d xy2 = _mm256_mul_pd( x[2], y[2] ); __m256d xy3 = _mm256_mul_pd( x[3], y[3] ); // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33 __m256d temp23 = _mm256_hadd_pd( xy2, xy3 ); // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31 __m256d swapped = _mm256_permute2f128_pd( temp01, temp23, 0x21 ); // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33 __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100); __m256d dotproduct = _mm256_add_pd( swapped, blended ); */ //ymm0 = _mm256_add_pd(ymm0,dotproduct); /* __m128d xmm1 = _mm256_extractf128_pd(dotproduct,0); __m128d xmm2 = _mm256_extractf128_pd(dotproduct,1);; __m128d xmm3 = _mm_hadd_pd(xmm1,xmm2); double res[2] __attribute__((aligned(_CBSIM_DBL_ALIGN_))) = {0.0, 0.0}; _mm_store_pd(&res[0],xmm3); //_mm256_store_pd(&res[0],ymm12); result += (res[0] + res[1]);// + (res[2] + res[3]); */ // _mm256_store_pd(&res[0],dotproduct); // result += (res[0] + res[1]) + (res[2] + res[3]); //result_SIMD_INTRINSICS += dotproduct[0] + dotproduct[1] + dotproduct[2] + dotproduct[3]; //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))); //_mm256_store_pd(&res[0],ymm0); //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; //double* res = (double*)&ymm0; //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; } //double* res = (double*)&ymm0; //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; //double res[4] __attribute__((aligned(_CBSIM_DBL_ALIGN_))); //_mm256_store_pd(&res[0],ymm0); //result_SIMD_INTRINSICS += res[0] + res[1] + res[2] + res[3]; } // t2 = tbb::tick_count::now(); // double exec_time = 1000.0*(t2-t1).seconds(); //std::cout << std::setiosflags(std::ios::fixed) << std::setprecision(5); std::cout<<std::endl<<"RESULTS: " <<std::endl; std::cout<<"result_intrin ----------: "<< result << std::endl; //std::cout<<"result_intrin ----------: "<< result << ", time: " << 1000.0*(t2-t1).seconds() << " ms" << std::endl; std::cout<<std::endl<<"Program end. "<<std::endl<<std::endl; return 0; }
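Most of the experiments in main() above are variations on one idea: multiply in 4-wide chunks, keep a 256-bit running accumulator, and perform the horizontal reduction only once, after the loop, rather than a hadd per iteration. A minimal version of that structure (names are illustrative; assumes the length is a multiple of 4 and 32-byte-aligned inputs, matching the _mm_malloc(..., 32) allocations above):

#include <immintrin.h>

// Illustrative sketch: dot product with one horizontal reduction after the loop.
static double dot_avx(const double *a, const double *b, int n /* multiple of 4 */)
{
    __m256d acc = _mm256_setzero_pd();
    for (int i = 0; i < n; i += 4)
        acc = _mm256_add_pd(acc, _mm256_mul_pd(_mm256_load_pd(&a[i]),
                                               _mm256_load_pd(&b[i])));
    __m128d lo = _mm256_castpd256_pd128(acc);
    __m128d hi = _mm256_extractf128_pd(acc, 1);
    __m128d s  = _mm_add_pd(lo, hi);     // {acc0+acc2, acc1+acc3}
    s = _mm_hadd_pd(s, s);               // both lanes hold the full sum
    return _mm_cvtsd_f64(s);
}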
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator) { AVX2TABLETYPE *mp; DBL sum = 0.0; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator == kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } const __m256d ONE_PD = _mm256_set1_pd(1); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d sumr = _mm256_setzero_pd(); __m256d sumr1 = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, 
_MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj); { sumr = _mm256_add_pd(sumr, sumr1); __m128d sumr_up = _mm256_extractf128_pd(sumr,1); sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up); sumr_up = _mm_hadd_pd(sumr_up,sumr_up); sum = _mm_cvtsd_f64(sumr_up); } if (noise_generator == kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. */ sum += 1.05242; sum *= 0.48985582; /*sum *= 0.5; sum += 0.5;*/ if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; } else { sum = sum + 0.5; /* range at this point -0.5 - 0.5... */ if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; } #if CHECK_FUNCTIONAL { DBL orig_sum = PortableNoise(EPoint, noise_generator); if (fabs(orig_sum - sum) >= EPSILON) { throw POV_EXCEPTION_STRING("Noise error"); } } #endif _mm256_zeroupper(); return (sum); }
/** * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels. */ FORCE_INLINE double single_pixel( double *im, int center, int top, int left, int right, int bottom, const __m256i mask1110, const __m256d rgb0W, const __m256d onehalf, const __m256d minustwelvehalf){ // double r = im[center]; // double g = im[center+1]; // double b = im[center+2]; // double r1 = im[top]; // double g1 = im[top+1]; // double b1 = im[top+2]; // double r2 = im[left]; // double g2 = im[left+1]; // double b2 = im[left+2]; // double r3 = im[right]; // double g3 = im[right+1]; // double b3 = im[right+2]; // double r4 = im[bottom]; // double g4 = im[bottom+1]; // double b4 = im[bottom+2]; __m256d c = _mm256_maskload_pd(&(im[center]),mask1110); __m256d c1 = _mm256_loadu_pd(&(im[top])); __m256d c2 = _mm256_loadu_pd(&(im[left])); __m256d c3 = _mm256_loadu_pd(&(im[right])); __m256d c4 = _mm256_loadu_pd(&(im[bottom])); COST_INC_LOAD(20); // double grey = rw * r + gw * g + bw * b; // double grey1 = rw * r1 + gw * g1 + bw * b1; // double grey2 = rw * r2 + gw * g2 + bw * b2; // double grey3 = rw * r3 + gw * g3 + bw * b3; // double grey4 = rw * r4 + gw * g4 + bw * b4; __m256d greyc = _mm256_mul_pd(c,rgb0W); __m256d grey1 = _mm256_mul_pd(c1,rgb0W); __m256d grey2 = _mm256_mul_pd(c2,rgb0W); __m256d grey3 = _mm256_mul_pd(c3,rgb0W); __m256d grey4 = _mm256_mul_pd(c4,rgb0W); //AVX: double: horizontal add for 1 vector __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2 __m256d c_h = _mm256_hadd_pd(c,c_perm); __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi double c_hsum_lo = _mm_cvtsd_f64(c_h_lo); double c_hsum_hi = _mm_cvtsd_f64(c_h_hi); double c_hsum = c_hsum_lo + c_hsum_hi; //AVX: double: horizontal add for 1 vector __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2 __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm); __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo); double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi); double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi; //AVX: _m256d: horizontal add for 4 vectors at once __m256d grey12 = _mm256_hadd_pd(grey1,grey2); __m256d grey34 = _mm256_hadd_pd(grey3,grey4); __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011 __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2 __m256d grey_1234 = _mm256_add_pd(grey_1234_perm, grey_1234_blend); //AVX: double: horizontal add for 1 vector __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2 __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm); __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo); double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi); double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi; COST_INC_ADD(10); //+ operations wasted on AVX COST_INC_MUL(15); //+ operations wasted on AVX double mu = c_hsum / 3.0; COST_INC_ADD(2); COST_INC_DIV(1); // double rmu = r-mu; // double gmu = g-mu; // double bmu = b-mu; __m256d c_mu = _mm256_set1_pd(mu); __m256d c_rgbmu = _mm256_sub_pd(c,c_mu); COST_INC_ADD(3); //+1 operations wasted on AVX // double rz = r-0.5; // double gz = g-0.5; // double bz = b-0.5; __m256d c_rgbz = _mm256_sub_pd(c,onehalf); COST_INC_ADD(3); //+1 
operations wasted on AVX // double rzrz = rz*rz; // double gzgz = gz*gz; // double bzbz = bz*bz; __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz); COST_INC_MUL(3); //+1 operations wasted on AVX // double re = exp(-12.5*rzrz); // double ge = exp(-12.5*gzgz); // double be = exp(-12.5*bzbz); __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq); __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp); __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps); __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps); COST_INC_EXP(3); COST_INC_MUL(3); //+1 operations wasted on AVX // double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0); __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu); __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo); double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi); double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum); double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo; double t1_tmp2 = t1_tmp1 / 3.0; double t1 = sqrt(t1_tmp2); COST_INC_SQRT(1); COST_INC_ADD(3); COST_INC_MUL(3); //+1 operations wasted on AVX COST_INC_DIV(1); double t2 = fabs(t1); COST_INC_ABS(1); // double t3 = re*ge*be; __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo); double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi); __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped); double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo; COST_INC_MUL(2); double t4 = fabs(t3); COST_INC_ABS(1); double t5 = t2 * t4; COST_INC_MUL(1); // double t6 = -4.0*grey+grey1+grey2+grey3+grey4; double minusfour_times_grey = -4.0*greyc_hsum; double t6 = minusfour_times_grey+grey1234_sum; COST_INC_MUL(1); COST_INC_ADD(2); //2 operations saved due to AVX double t7 = fabs(t6); COST_INC_ABS(1); double t8 = t5 * t7; COST_INC_MUL(1); double t9 = t8 + 1.0E-12; COST_INC_ADD(1); return t9; }
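single_pixel above also demonstrates the four-accumulators-at-once reduction: two _mm256_hadd_pd calls produce the pairwise sums of four vectors, and a blend plus a 128-bit lane swap line the matching halves up so that one final add yields all four totals in a single register. A standalone sketch of just that step (hypothetical helper name hsum4):

#include <immintrin.h>

// Illustrative: returns {sum(v0), sum(v1), sum(v2), sum(v3)} in one __m256d,
// using the hadd/blend/permute combination from single_pixel above.
static __m256d hsum4(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
{
    __m256d s01  = _mm256_hadd_pd(v0, v1);  // {v0_0+v0_1, v1_0+v1_1, v0_2+v0_3, v1_2+v1_3}
    __m256d s23  = _mm256_hadd_pd(v2, v3);  // {v2_0+v2_1, v3_0+v3_1, v2_2+v2_3, v3_2+v3_3}
    __m256d blnd = _mm256_blend_pd(s01, s23, 0b1100);       // low pair of s01, high pair of s23
    __m256d perm = _mm256_permute2f128_pd(s01, s23, 0x21);  // high pair of s01, low pair of s23
    return _mm256_add_pd(blnd, perm);
}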
// it moves vertically across blocks void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; double *tA, *tx; int k; __m256d tmp0, a_00_10_20_30, x_0_1_2_3, y_00; y_00 = _mm256_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); A += 4 + (sda-1)*lda; x += 4; } __m128d tm0, a_00_10, a_01_11, x_0_1, y_0, y_1, y_0_1; tm0 = _mm256_extractf128_pd( y_00, 0x1 ); y_0 = _mm256_castpd256_pd128( y_00 ); y_0 = _mm_add_pd( y_0, tm0 ); if(k<kmax-1) { x_0_1 = _mm_loadu_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); tm0 = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd( y_0, tm0 ); A += 2; x += 2; } x_0_1 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tm0 = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd( y_0, tm0 ); y_0 = _mm_hadd_pd( y_0, y_0 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator) { DBL x, y, z; DBL *mp; int ix, iy, iz; int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash; DBL sum; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator==kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } x = EPoint[X]; y = EPoint[Y]; z = EPoint[Z]; /* its equivalent integer lattice point. */ /* ix = (int)x; iy = (int)y; iz = (long)z; */ /* JB fix for the range problem */ __m128d xy = _mm_setr_pd(x, y); __m128d zn = _mm_set_sd(z); __m128d epsy = _mm_set1_pd(1.0 - EPSILON); __m128d xy_e = _mm_sub_pd(xy, epsy); __m128d zn_e = _mm_sub_sd(zn, epsy); __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy)); __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn)); __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0); __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ); __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy)); __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn)); const __m128i fff = _mm_set1_epi32(0xfff); __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff); __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff); ix = _mm_extract_epi32(i_xy, 0); iy = _mm_extract_epi32(i_xy, 1); iz = _mm_extract_epi32(i_zn, 0); ixiy_hash = Hash2d(ix, iy); jxiy_hash = Hash2d(ix + 1, iy); ixjy_hash = Hash2d(ix, iy + 1); jxjy_hash = Hash2d(ix + 1, iy + 1); mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)]; DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)]; DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)]; DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)]; DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)]; DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)]; DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)]; DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)]; const __m128d three = _mm_set1_pd(3.0); const __m128d two = _mm_set1_pd(2.0); const __m128d one = _mm_set1_pd(1.0); __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy); __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy); __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn); __m128d jx_mm = _mm_sub_pd(ix_mm, one); __m128d jy_mm = _mm_sub_pd(iy_mm, one); __m128d jz_mm = _mm_sub_pd(iz_mm, one); __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three)); __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three)); __m128d mm_tz = _mm_sub_pd(one, mm_sz); __m128d mm_txy = _mm_sub_pd(one, mm_sxy); __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy); __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy); __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy); __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm); __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm; __m128d int_sum1 = _mm_setzero_pd(); s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz); INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz); INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz); INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, 
mm_sz); INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1); int_sum1 = _mm_hadd_pd(int_sum1, int_sum1); if(noise_generator==kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. */ const __m128d r2 = _mm_set_sd(0.48985582); const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582); int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2); } else { int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5)); } int_sum1 = _mm_min_sd(one, int_sum1); int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1); _mm_store_sd(&sum, int_sum1); return (sum); }
__m128d test_mm_hadd_pd(__m128d A, __m128d B) {
  // CHECK-LABEL: test_mm_hadd_pd
  // CHECK: call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  return _mm_hadd_pd(A, B);
}
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint) { DBL x, y, z; int ix, iy, iz; int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash; // TODO FIXME - global statistics reference // Stats[Calls_To_DNoise]++; x = EPoint[X]; y = EPoint[Y]; z = EPoint[Z]; /* its equivalent integer lattice point. */ /*ix = (int)x; iy = (int)y; iz = (int)z; x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/ /* JB fix for the range problem */ __m128d xy = _mm_setr_pd(x, y); __m128d zn = _mm_set_sd(z); __m128d epsy = _mm_set1_pd(1.0 - EPSILON); __m128d xy_e = _mm_sub_pd(xy, epsy); __m128d zn_e = _mm_sub_sd(zn, epsy); __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy)); __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn)); __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0); __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ); __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy)); __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn)); const __m128i fff = _mm_set1_epi32(0xfff); __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff); __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff); ix = _mm_extract_epi32(i_xy, 0); iy = _mm_extract_epi32(i_xy, 1); iz = _mm_extract_epi32(i_zn, 0); ixiy_hash = Hash2d(ix, iy); jxiy_hash = Hash2d(ix + 1, iy); ixjy_hash = Hash2d(ix, iy + 1); jxjy_hash = Hash2d(ix + 1, iy + 1); DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)]; DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)]; DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)]; DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)]; DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)]; DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)]; DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)]; DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)]; const __m128d three = _mm_set1_pd(3.0); const __m128d two = _mm_set1_pd(2.0); const __m128d one = _mm_set1_pd(1.0); __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy); __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy); __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn); __m128d jx_mm = _mm_sub_pd(ix_mm, one); __m128d jy_mm = _mm_sub_pd(iy_mm, one); __m128d jz_mm = _mm_sub_pd(iz_mm, one); __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three)); __m128d mm_tz = _mm_sub_pd(one, mm_sz); __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three)); __m128d mm_txy = _mm_sub_pd(one, mm_sxy); __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy); __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy); __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy); __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz); __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz); __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz); __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz); __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p; __m128d sum_X_Y = _mm_setzero_pd(); __m128d sum__Z = _mm_setzero_pd(); __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz); INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y); __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz); INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y); __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz); INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y); __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz); INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, 
sum_X_Y); __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz); INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y); __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz); INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y); __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz); INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y); __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz); INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y); __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm); INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z); INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z); INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z); INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z); sum__Z = _mm_hadd_pd(sum__Z, sum__Z); _mm_storeu_pd(*result, sum_X_Y); _mm_store_sd(&result[Z], sum__Z); }
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C) { //#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__)) #ifdef OGDF_SSE3_EXTENSIONS const int n = C.numberOfNodes(); #ifdef _OPENMP const int work = 256; const int nThreadsRep = min(omp_get_max_threads(), 1 + n*n/work); const int nThreadsPrev = min(omp_get_max_threads(), 1 + n /work); #endif const double k = m_idealEdgeLength; const double kSquare = k*k; const double c_rep = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal const double minDist = 10e-6;//100*DBL_EPSILON; const double minDistSquare = minDist*minDist; double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); __m128d mm_kSquare = _mm_set1_pd(kSquare); __m128d mm_minDist = _mm_set1_pd(minDist); __m128d mm_minDistSquare = _mm_set1_pd(minDistSquare); __m128d mm_c_rep = _mm_set1_pd(c_rep); #pragma omp parallel num_threads(nThreadsRep) { double tx = m_txNull; double ty = m_tyNull; int cF = 1; for(int i = 1; i <= m_iterations; i++) { // repulsive forces #pragma omp for for(int v = 0; v < n; ++v) { __m128d mm_disp_xv = _mm_setzero_pd(); __m128d mm_disp_yv = _mm_setzero_pd(); __m128d mm_xv = _mm_set1_pd(C.m_x[v]); __m128d mm_yv = _mm_set1_pd(C.m_y[v]); int u; for(u = 0; u+1 < v; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } int uStart = u+2; if(u == v) ++u; if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } for(u = uStart; u < n; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, 
_mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv); mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv); _mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep)); _mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep)); } // attractive forces #pragma omp single for(int e = 0; e < C.numberOfEdges(); ++e) { int v = C.m_src[e]; int u = C.m_tgt[e]; double delta_x = C.m_x[v] - C.m_x[u]; double delta_y = C.m_y[v] - C.m_y[u]; double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y)); disp_x[v] -= delta_x * dist / k; disp_y[v] -= delta_y * dist / k; disp_x[u] += delta_x * dist / k; disp_y[u] += delta_y * dist / k; } // limit the maximum displacement to the temperature (m_tx,m_ty) __m128d mm_tx = _mm_set1_pd(tx); __m128d mm_ty = _mm_set1_pd(ty); #pragma omp for nowait for(int v = 0; v < n-1; v += 2) { __m128d mm_disp_xv = _mm_load_pd(&disp_x[v]); __m128d mm_disp_yv = _mm_load_pd(&disp_y[v]); __m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd( _mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv)) )); _mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]), _mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx)) )); _mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]), _mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty)) )); } #pragma omp single nowait { if(n % 2) { int v = n-1; double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v])); C.m_x[v] += disp_x[v] / dist * min(dist,tx); C.m_y[v] += disp_y[v] / dist * min(dist,ty); } } cool(tx,ty,cF); #pragma omp barrier } } System::alignedMemoryFree(disp_x); System::alignedMemoryFree(disp_y); #else mainStep(C); #endif }
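The repulsive-force loop above keeps per-node x and y displacements in two-lane accumulators, handles the odd leftover element with the _sd variants, and collapses each accumulator only once with _mm_hadd_pd before the scaled store. The same structure in isolation (illustrative helper name sum_sse3; 16-byte-aligned input assumed, as for C.m_x/C.m_y above):

#include <pmmintrin.h>

// Illustrative: sum n doubles with a two-lane accumulator, odd tail handled
// with the scalar _sd forms, final collapse via _mm_hadd_pd.
static double sum_sse3(const double *a, int n)
{
    __m128d acc = _mm_setzero_pd();
    int i = 0;
    for (; i + 1 < n; i += 2)
        acc = _mm_add_pd(acc, _mm_load_pd(&a[i]));
    if (i < n)                                   // odd element left over
        acc = _mm_add_sd(acc, _mm_load_sd(&a[i]));
    acc = _mm_hadd_pd(acc, acc);                 // lane0 + lane1 in both lanes
    return _mm_cvtsd_f64(acc);
}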