static void test() { typedef typename Vec::EntryType VecT; MemT *data = Vc::malloc<MemT, Vc::AlignOnCacheline>(128); for (size_t i = 0; i < 128; ++i) { data[i] = static_cast<MemT>(i - 64); } for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { Vec v; if (i % (2 * Vec::Size) == 0) { v = Vec(&data[i]); } else if (i % Vec::Size == 0) { v = Vec(&data[i], Vc::Aligned); } else { v = Vec(&data[i], Vc::Unaligned); } for (size_t j = 0; j < Vec::Size; ++j) { COMPARE(v[j], static_cast<VecT>(data[i + j])) << " " << TypeInfo<MemT>::string(); } } for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { Vec v; if (i % (2 * Vec::Size) == 0) { v.load(&data[i]); } else if (i % Vec::Size == 0) { v.load(&data[i], Vc::Aligned); } else { v.load(&data[i], Vc::Unaligned); } for (size_t j = 0; j < Vec::Size; ++j) { COMPARE(v[j], static_cast<VecT>(data[i + j])) << " " << TypeInfo<MemT>::string(); } } for (size_t i = 0; i < 128 - Vec::Size + 1; ++i) { Vec v; if (i % (2 * Vec::Size) == 0) { v = Vec(&data[i], Vc::Streaming); } else if (i % Vec::Size == 0) { v = Vec(&data[i], Vc::Streaming | Vc::Aligned); } else { v = Vec(&data[i], Vc::Streaming | Vc::Unaligned); } for (size_t j = 0; j < Vec::Size; ++j) { COMPARE(v[j], static_cast<VecT>(data[i + j])) << " " << TypeInfo<MemT>::string(); } } ADD_PASS() << "loadCvt: load " << TypeInfo<MemT>::string() << "* as " << TypeInfo<Vec>::string(); LoadCvt<Vec, typename SupportedConversions<VecT, MemT>::Next>::test(); }
template<typename Vec> void loadArray() { typedef typename Vec::EntryType T; typedef typename Vec::IndexType I; enum loadArrayEnum { count = 256 * 1024 / sizeof(T) }; Vc::Memory<Vec, count> array; for (int i = 0; i < count; ++i) { array[i] = i; } const I indexesFromZero(IndexesFromZero); const Vec offsets(indexesFromZero); for (int i = 0; i < count; i += Vec::Size) { const T *const addr = &array[i]; Vec ii(i); ii += offsets; Vec a(addr); COMPARE(a, ii); Vec b = Vec::Zero(); b.load(addr); COMPARE(b, ii); } }
/**
 * Same check as a plain array load test, but with loadArrayShortCount
 * entries (a constant defined outside this function) and with the per-lane
 * offsets taken from ushort_v::IndexesFromZero() converted to Vec.
 *
 * The buffer is filled with 0, 1, 2, ... and every vector-sized chunk is read
 * back through the pointer constructor and through load().
 */
template<typename Vec> void loadArrayShort()
{
    typedef typename Vec::EntryType T;

    Vc::Memory<Vec, loadArrayShortCount> array;
    for (int pos = 0; pos < loadArrayShortCount; ++pos) {
        array[pos] = pos;
    }

    // Per-lane offsets, converted from the unsigned short index sequence.
    const Vec laneOffsets = static_cast<Vec>(ushort_v::IndexesFromZero());

    for (int start = 0; start < loadArrayShortCount; start += Vec::Size) {
        const T *const chunk = &array[start];

        // Expected content of the chunk: start, start + 1, ...
        Vec ii(start);
        ii += laneOffsets;

        // Load through the constructor.
        Vec a(chunk);
        COMPARE(a, ii);

        // Load through load(), starting from a zeroed vector.
        Vec b = Vec::Zero();
        b.load(chunk);
        COMPARE(b, ii);
    }
}
/*make sure quartic polynomial is monotonic*/ inline void filter_pqm_monotonicity(Vec *values, uint k, Vec &fv_l, Vec &fv_r, Vec &fd_l, Vec &fd_r){ const Vec root_outside = Vec(100.0); //fixed values give to roots clearly outside [0,1], or nonexisting ones*/ /*second derivative coefficients, eq 23 in white et al.*/ Vec b0 = 60.0 * values[k] - 24.0 * fv_r - 36.0 * fv_l + 3.0 * (fd_r - 3.0 * fd_l); Vec b1 = -360.0 * values[k] + 36.0 * fd_l - 24.0 * fd_r + 168.0 * fv_r + 192.0 * fv_l; Vec b2 = 360.0 * values[k] + 30.0 * (fd_r - fd_l) - 180.0 * (fv_l + fv_r); /*let's compute sqrt value to be used for computing roots. If we take sqrt of negaitve numbers, then we instead set a value that will make the root to be +-100 which is well outside range of[0,1]. We also guard the sqrt against sqrt with negative numbers by doing a max*/ const Vec sqrt_val = select(b1 * b1 - 4 * b0 * b2 < 0.0, b1 + 200.0 * b2, sqrt(max(b1 * b1- 4 * b0 * b2, 0.0))); //compute roots. Division is safe with vectorclass (=inf) const Vec root1 = (-b1 + sqrt_val) / (2 * b2); const Vec root2 = (-b1 - sqrt_val) / (2 * b2); /*PLM slope, MC limiter*/ Vec plm_slope_l = 2.0 * (values[k] - values[k - 1]); Vec plm_slope_r = 2.0 * (values[k + 1] - values[k]); Vec slope_sign = plm_slope_l + plm_slope_r; //it also has some magnitude, but we will only use its sign. /*first derivative coefficients*/ const Vec c0 = fd_l; const Vec c1 = b0; const Vec c2 = b1 / 2.0; const Vec c3 = b2 / 3.0; //compute both slopes at inflexion points, at least one of these //is with [0..1]. 
If the root is not in this range, we //simplify later if statements by setting it to the plm slope //sign Vec root1_slope = select(root1 >= 0.0 && root1 <= 1.0, c0 + root1 * ( c1 + root1 * (c2 + root1 * c3 ) ), slope_sign); Vec root2_slope = select(root2 >= 0.0 && root2 <= 1.0, c0 + root2 * ( c1 + root2 * (c2 + root2 * c3 ) ), slope_sign); Vecb fixInflexion = root1_slope * slope_sign < 0.0 || root2_slope * slope_sign < 0.0; if (horizontal_or (fixInflexion) ){ Realv valuesa[VECL]; Realv fva_l[VECL]; Realv fva_r[VECL]; Realv fda_l[VECL]; Realv fda_r[VECL]; Realv slope_signa[VECL]; values[k].store(valuesa); fv_l.store(fva_l); fd_l.store(fda_l); fv_r.store(fva_r); fd_r.store(fda_r); slope_sign.store(slope_signa); //todo store and then load data to avoid inserts (is it beneficial...?) //serialized the handling of inflexion points, these do not happen for smooth regions for(uint i = 0;i < VECL; i++) { if(fixInflexion[i]){ //need to collapse, at least one inflexion point has wrong //sign. if(fabs(plm_slope_l[i]) <= fabs(plm_slope_r[i])) { //collapse to left edge (eq 21) fda_l[i] = 1.0 / 3.0 * ( 10 * valuesa[i] - 2.0 * fva_r[i] - 8.0 * fva_l[i]); fda_r[i] = -10.0 * valuesa[i] + 6.0 * fva_r[i] + 4.0 * fva_l[i]; //check if PLM slope is consistent (eq 28 & 29) if (slope_signa[i] * fda_l[i] < 0) { fda_l[i] = 0; fva_r[i] = 5 * valuesa[i] - 4 * fva_l[i]; fda_r[i] = 20 * (valuesa[i] - fva_l[i]); } else if (slope_signa[i] * fda_r[i] < 0) { fda_r[i] = 0; fva_l[i] = 0.5 * (5 * valuesa[i] - 3 * fva_r[i]); fda_l[i] = 10.0 / 3.0 * (-valuesa[i] + fva_r[i]); } } else { //collapse to right edge (eq 21) fda_l[i] = 10.0 * valuesa[i] - 6.0 * fva_l[i] - 4.0 * fva_r[i]; fda_r[i] = 1.0 / 3.0 * ( - 10.0 * valuesa[i] + 2 * fva_l[i] + 8 * fva_r[i]); //check if PLM slope is consistent (eq 28 & 29) if (slope_signa[i] * fda_l[i] < 0) { fda_l[i] = 0; fva_r[i] = 0.5 * ( 5 * valuesa[i] - 3 * fva_l[i]); fda_r[i] = 10.0 / 3.0 * (valuesa[i] - fva_l[i]); } else if (slope_signa[i] * fda_r[i] < 0) { fda_r[i] 
= 0; fva_l[i] = 5 * valuesa[i] - 4 * fva_r[i]; fda_l[i] = 20.0 * ( - valuesa[i] + fva_r[i]); } } } } fv_l.load(fva_l); fd_l.load(fda_l); fv_r.load(fva_r); fd_r.load(fda_r); } }