// Builds an expression approximating erf(x_full) for Float(32) inputs.
//
// The input is split into sign and magnitude; two polynomial-based
// approximations are evaluated on the magnitude and one of them is
// selected depending on whether the magnitude exceeds 1. The sign is
// multiplied back in at the end and the whole expression is passed through
// common_subexpression_elimination before being returned.
//
// Raises a user error if x_full is not of type Float(32).
Expr halide_erf(Expr x_full) {
    user_assert(x_full.type() == Float(32)) << "halide_erf only works for Float(32)";

    // Separate the sign from the magnitude.
    Expr sign = select(x_full < 0, -1.0f, 1.0f);
    Expr mag = abs(x_full);

    // Branch for magnitudes above 1: an approximation very similar to one
    // from Abramowitz and Stegun, of the form 1 - P(x)^-16.
    float tail_coeffs[] = {0.0000818502f,
                           -0.0000026500f,
                           0.0009353904f,
                           0.0081960206f,
                           0.0430054424f,
                           0.0703310579f,
                           1.0f};
    Expr tail = evaluate_polynomial(mag, tail_coeffs,
                                    sizeof(tail_coeffs) / sizeof(tail_coeffs[0]));
    tail = 1.0f - pow(tail, -16);

    // Branch for magnitudes up to 1: an odd polynomial (a polynomial in
    // x^2 multiplied by x), similar to the Taylor expansion of erf.
    float core_coeffs[] = {-0.0005553339f,
                           0.0048937243f,
                           -0.0266849239f,
                           0.1127890132f,
                           -0.3761207240f,
                           1.1283789803f};
    Expr core = evaluate_polynomial(mag * mag, core_coeffs,
                                    sizeof(core_coeffs) / sizeof(core_coeffs[0]));
    core *= mag;

    // Pick the branch by magnitude, then restore the sign.
    Expr unsigned_result = select(mag > 1.0f, tail, core);
    return common_subexpression_elimination(sign * unsigned_result);
}
// Builds an expression approximating exp(x_full) for inputs whose element
// type is Float(32) (any lane count).
//
// The input is reduced by subtracting k * ln(2), with k = floor(x / ln(2)),
// a polynomial is evaluated on the reduced value, and the result is scaled
// by 2^k constructed directly from the float bit pattern. Exponents that
// fall outside the representable range are replaced by +inf or zero.
Expr halide_exp(Expr x_full) {
    Type type = x_full.type();
    internal_assert(type.element_of() == Float(32));

    // ln(2) is carried as two parts that are subtracted one after the other.
    float ln2_hi = 0.6931457519f;
    float ln2_lo = 1.4286067653e-6f;
    float inv_ln2 = 1.0f / logf(2.0f);

    // k = floor(x / ln 2), kept both as a float and as an int32.
    Expr k_real = floor(x_full * inv_ln2);
    Expr k = cast(Int(32, type.lanes()), k_real);

    // Reduced argument: x - k * ln 2, applied in two steps.
    Expr reduced = x_full - k_real * ln2_hi;
    reduced -= k_real * ln2_lo;

    float poly_coeffs[] = {
        0.00031965933071842413f,
        0.00119156835564003744f,
        0.00848988645943932717f,
        0.04160188091348320655f,
        0.16667983794100929562f,
        0.49999899033463041098f,
        1.0f,
        1.0f};
    Expr result = evaluate_polynomial(reduced, poly_coeffs,
                                      sizeof(poly_coeffs) / sizeof(poly_coeffs[0]));

    // Compute 2^k: add the exponent bias, shift the bits up into the
    // exponent field and reinterpret the integer as a float.
    int fpbias = 127;
    Expr biased = k + fpbias;
    Expr two_to_the_k = reinterpret(type, biased << 23);
    result *= two_to_the_k;

    // Catch overflow (biased exponent >= 255) and underflow (<= 0).
    Expr inf = Call::make(type, "inf_f32", {}, Call::PureExtern);
    result = select(biased < 255, result, inf);
    result = select(biased > 0, result, make_zero(type));

    // The construction above introduces lots of common subexpressions.
    return common_subexpression_elimination(result);
}
// Builds an expression approximating the natural log of x_full for inputs
// whose element type is Float(32) (any lane count).
//
// Negative inputs yield nan and zero yields -inf. All other inputs are
// range-reduced via range_reduce_log into a mantissa-like value and an
// exponent; a polynomial in (reduced - 1) is evaluated and
// exponent * ln(2) is added back.
Expr halide_log(Expr x_full) {
    Type type = x_full.type();
    internal_assert(type.element_of() == Float(32));

    // Inputs that need a special result.
    Expr is_negative = x_full < 0.0f;  // log of a negative returns nan
    Expr is_zero = x_full == 0.0f;     // log of zero is -inf
    Expr exceptional = is_negative | is_zero;

    // Feed 1.0f through the main path for the special inputs so that it
    // does not produce nans or infs; the real answer is patched in below.
    Expr safe_input = select(exceptional, make_one(type), x_full);
    Expr reduced, exponent;
    range_reduce_log(safe_input, &reduced, &exponent);

    // Very close to the Taylor series for log about 1, but tuned to have
    // minimum relative error in the reduced domain (0.75 - 1.5).
    float poly_coeffs[] = {
        0.05111976432738144643f,
        -0.11793923497136414580f,
        0.14971993724699017569f,
        -0.16862004708254804686f,
        0.19980668101718729313f,
        -0.24991211576292837737f,
        0.33333435275479328386f,
        -0.50000106292873236491f,
        1.0f,
        0.0f};
    Expr offset = reduced - 1.0f;
    Expr result = evaluate_polynomial(offset, poly_coeffs,
                                      sizeof(poly_coeffs) / sizeof(poly_coeffs[0]));
    result += cast(type, exponent) * logf(2.0);

    // Substitute the special values where required.
    Expr nan = Call::make(type, "nan_f32", {}, Call::PureExtern);
    Expr neg_inf = Call::make(type, "neg_inf_f32", {}, Call::PureExtern);
    Expr special_value = select(is_negative, nan, neg_inf);
    result = select(exceptional, special_value, result);

    // The construction above introduces lots of common subexpressions.
    return common_subexpression_elimination(result);
}
// For each of the five evaluation-domain implementations, runs FFT on a
// fixed 4-coefficient polynomial and checks that every output equals the
// polynomial evaluated at the corresponding domain element. A domain whose
// construction or use throws DomainSizeException or InvalidSizeException
// is reported and skipped.
TYPED_TEST(EvaluationDomainTest, FFT) {
    const size_t m = 4;
    std::vector<TypeParam> f = { 2, 5, 3, 8 };

    std::shared_ptr<evaluation_domain<TypeParam> > domain;
    for (int key = 0; key < 5; key++) {
        try {
            // Select the domain implementation under test.
            switch (key) {
            case 0:
                domain.reset(new basic_radix2_domain<TypeParam>(m));
                break;
            case 1:
                domain.reset(new extended_radix2_domain<TypeParam>(m));
                break;
            case 2:
                domain.reset(new step_radix2_domain<TypeParam>(m));
                break;
            case 3:
                domain.reset(new geometric_sequence_domain<TypeParam>(m));
                break;
            case 4:
                domain.reset(new arithmetic_sequence_domain<TypeParam>(m));
                break;
            }

            // Transform a copy so the original coefficients stay available.
            std::vector<TypeParam> transformed(f);
            domain->FFT(transformed);

            // Collect all domain elements before making any comparison.
            std::vector<TypeParam> points(m);
            for (size_t i = 0; i < m; ++i) {
                points[i] = domain->get_domain_element(i);
            }

            // Each FFT output must match direct evaluation at that point.
            for (size_t i = 0; i < m; ++i) {
                TypeParam expected = evaluate_polynomial(m, f, points[i]);
                EXPECT_TRUE(expected == transformed[i]);
            }
        }
        catch (DomainSizeException &e) {
            printf("%s - skipping\n", e.what());
        }
        catch (InvalidSizeException &e) {
            printf("%s - skipping\n", e.what());
        }
    }
}
// Evaluates the coefficient array `a` as a polynomial in z*z by forwarding
// to evaluate_polynomial with the squared argument.
inline V evaluate_even_polynomial(const lslboost::array<T,N>& a, const V& z)
{
   const V z_squared(z * z);
   return evaluate_polynomial(a, z_squared);
}
// Returns poly[0] + z * Q(z*z), where Q is the polynomial formed by the
// remaining count-1 coefficients starting at poly[1].
// poly[0] is read unconditionally and count-1 is passed on, so callers
// must supply count >= 1.
inline U evaluate_odd_polynomial(const T* poly, U z, std::size_t count)
{
   const U z_squared(z * z);
   return poly[0] + z * evaluate_polynomial(poly + 1, z_squared, count - 1);
}
// Evaluates the built-in coefficient array `a` as a polynomial in z*z by
// forwarding to evaluate_polynomial with the squared argument.
inline V evaluate_even_polynomial(const T(&a)[N], const V& z)
{
   const V z_squared(z * z);
   return evaluate_polynomial(a, z_squared);
}
// Evaluates the `count` coefficients at `poly` as a polynomial in z*z by
// forwarding to evaluate_polynomial with the squared argument.
inline U evaluate_even_polynomial(const T* poly, U z, std::size_t count)
{
   const U z_squared(z * z);
   return evaluate_polynomial(poly, z_squared, count);
}
// Forwards to the three-argument evaluate_polynomial, passing Tag::value as
// the third argument. The unnamed Tag pointer parameter is never read; it
// presumably exists only so the caller can convey the coefficient count
// through the Tag type (NOTE(review): confirm against the call sites).
inline V evaluate_polynomial_c_imp(const T* a, const V& val, const Tag*) { return evaluate_polynomial(a, val, Tag::value); }
//
// Shared implementation of the error function and its complement.
//
// Returns erf(z) when `invert` is false and erfc(z) = 1 - erf(z) when
// `invert` is true. Negative arguments are reflected onto the positive
// axis; positive arguments are dispatched to one of several rational
// approximations (evaluated with evaluate_polynomial) chosen by range.
//
// A NaN argument is returned unchanged. Without the explicit check every
// range comparison below is false for NaN, so control would reach the
// final "underflow" branch and yield 1 (erf) or 0 (erfc).
//
double erf_imp(double z, bool invert)
{
   // NaN is the only value that compares unequal to itself: propagate it.
   if(z != z)
      return z;

   if(z < 0)
   {
      if(!invert)
         return -erf_imp(-z, invert);        // erf(-z) = -erf(z)
      else if(z < -0.5)
         return 2 - erf_imp(-z, invert);     // erfc(-z) = 2 - erfc(z)
      else
         return 1 + erf_imp(-z, false);      // erfc(-z) = 1 + erf(z)
   }

   double result;

   //
   // Big bunch of selection statements now to pick
   // which implementation to use,
   // try to put most likely options first:
   //
   if(z < 0.5)
   {
      //
      // We're going to calculate erf:
      //
      if(z == 0)
      {
         result = 0.0;
      }
      else if(z < 1e-10)
      {
         // Linear approximation for very small arguments.
         result = static_cast<double>(z * 1.125f + z * 0.003379167095512573896158903121545171688L);
      }
      else
      {
         // Maximum Deviation Found:                     1.561e-17
         // Expected Error Term:                         1.561e-17
         // Maximum Relative Change in Control Points:   1.155e-04
         // Max Error found at double precision =        2.961182e-17
         static const double Y = 1.044948577880859375;
         static const double P[] = {
            0.0834305892146531832907L,
            -0.338165134459360935041L,
            -0.0509990735146777432841L,
            -0.00772758345802133288487L,
            -0.000322780120964605683831L,
         };
         static const double Q[] = {
            1L,
            0.455004033050794024546L,
            0.0875222600142252549554L,
            0.00858571925074406212772L,
            0.000370900071787748000569L,
         };
         result = z * (Y + evaluate_polynomial(P, z * z) / evaluate_polynomial(Q, z * z));
      }
   }
   else if((z < 14) || ((z < 28) && invert))
   {
      //
      // We'll be calculating erfc:
      //
      invert = !invert;
      if(z < 1.5f)
      {
         // Maximum Deviation Found:                     3.702e-17
         // Expected Error Term:                         3.702e-17
         // Maximum Relative Change in Control Points:   2.845e-04
         // Max Error found at double precision =        4.841816e-17
         static const double Y = 0.405935764312744140625;
         static const double P[] = {
            -0.098090592216281240205L,
            0.178114665841120341155L,
            0.191003695796775433986L,
            0.0888900368967884466578L,
            0.0195049001251218801359L,
            0.00180424538297014223957L,
         };
         static const double Q[] = {
            1L,
            1.84759070983002217845L,
            1.42628004845511324508L,
            0.578052804889902404909L,
            0.12385097467900864233L,
            0.0113385233577001411017L,
            0.337511472483094676155e-5L,
         };
         result = Y + evaluate_polynomial(P, z - 0.5) / evaluate_polynomial(Q, z - 0.5);
         result *= std::exp(-z * z) / z;
      }
      else if(z < 2.5f)
      {
         // Max Error found at double precision =        6.599585e-18
         // Maximum Deviation Found:                     3.909e-18
         // Expected Error Term:                         3.909e-18
         // Maximum Relative Change in Control Points:   9.886e-05
         static const double Y = 0.50672817230224609375;
         static const double P[] = {
            -0.0243500476207698441272L,
            0.0386540375035707201728L,
            0.04394818964209516296L,
            0.0175679436311802092299L,
            0.00323962406290842133584L,
            0.000235839115596880717416L,
         };
         static const double Q[] = {
            1L,
            1.53991494948552447182L,
            0.982403709157920235114L,
            0.325732924782444448493L,
            0.0563921837420478160373L,
            0.00410369723978904575884L,
         };
         result = Y + evaluate_polynomial(P, z - 1.5) / evaluate_polynomial(Q, z - 1.5);
         result *= std::exp(-z * z) / z;
      }
      else if(z < 4.5f)
      {
         // Maximum Deviation Found:                     1.512e-17
         // Expected Error Term:                         1.512e-17
         // Maximum Relative Change in Control Points:   2.222e-04
         // Max Error found at double precision =        2.062515e-17
         static const double Y = 0.5405750274658203125;
         static const double P[] = {
            0.00295276716530971662634L,
            0.0137384425896355332126L,
            0.00840807615555585383007L,
            0.00212825620914618649141L,
            0.000250269961544794627958L,
            0.113212406648847561139e-4L,
         };
         static const double Q[] = {
            1L,
            1.04217814166938418171L,
            0.442597659481563127003L,
            0.0958492726301061423444L,
            0.0105982906484876531489L,
            0.000479411269521714493907L,
         };
         result = Y + evaluate_polynomial(P, z - 3.5) / evaluate_polynomial(Q, z - 3.5);
         result *= std::exp(-z * z) / z;
      }
      else
      {
         // Max Error found at double precision =        2.997958e-17
         // Maximum Deviation Found:                     2.860e-17
         // Expected Error Term:                         2.859e-17
         // Maximum Relative Change in Control Points:   1.357e-05
         static const double Y = 0.5579090118408203125;
         static const double P[] = {
            0.00628057170626964891937L,
            0.0175389834052493308818L,
            -0.212652252872804219852L,
            -0.687717681153649930619L,
            -2.5518551727311523996L,
            -3.22729451764143718517L,
            -2.8175401114513378771L,
         };
         static const double Q[] = {
            1L,
            2.79257750980575282228L,
            11.0567237927800161565L,
            15.930646027911794143L,
            22.9367376522880577224L,
            13.5064170191802889145L,
            5.48409182238641741584L,
         };
         result = Y + evaluate_polynomial(P, 1 / z) / evaluate_polynomial(Q, 1 / z);
         result *= std::exp(-z * z) / z;
      }
   }
   else
   {
      //
      // Any value of z larger than 28 will underflow to zero:
      //
      result = 0;
      invert = !invert;
   }

   // `invert` now says whether `result` holds the complement of what the
   // caller asked for.
   if(invert)
   {
      result = 1 - result;
   }

   return result;
}
//
// Common implementation behind the inverse erf and inverse erfc functions.
// The original note describes this as the version for 80-bit long doubles
// and smaller types.
//
// The code reads `p` in the first branch and `q` in all others, selecting
// a rational approximation by range:
//    p <= 0.5          : approximation in p
//    q >= 0.25         : approximation in q - 0.25
//    otherwise         : approximation in sqrt(-log(q)), split into five
//                        sub-ranges
// NOTE(review): callers presumably pass q = 1 - p with both in [0, 1];
// that relationship is not checked here.
//
double erf_inv_imp(double p, double q)
{
   if(p <= 0.5)
   {
      //
      // Evaluate inverse erf using the rational approximation:
      //
      // x = p(p+10)(Y+R(p))
      //
      // Where Y is a constant, and R(p) is optimised for a low
      // absolute error compared to |Y|.
      //
      // double: Max error found: 2.001849e-18
      // long double: Max error found: 1.017064e-20
      // Maximum Deviation Found (actual error term at infinite precision) 8.030e-21
      //
      static const float Y = 0.0891314744949340820313f;
      static const double P[] = {
         -0.000508781949658280665617L,
         -0.00836874819741736770379L,
         0.0334806625409744615033L,
         -0.0126926147662974029034L,
         -0.0365637971411762664006L,
         0.0219878681111168899165L,
         0.00822687874676915743155L,
         -0.00538772965071242932965L
      };
      static const double Q[] = {
         1,
         -0.970005043303290640362L,
         -1.56574558234175846809L,
         1.56221558398423026363L,
         0.662328840472002992063L,
         -0.71228902341542847553L,
         -0.0527396382340099713954L,
         0.0795283687341571680018L,
         -0.00233393759374190016776L,
         0.000886216390456424707504L
      };
      double scale = p * (p + 10);
      double ratio = evaluate_polynomial(P, p) / evaluate_polynomial(Q, p);
      return scale * Y + scale * ratio;
   }

   if(q >= 0.25)
   {
      //
      // Rational approximation for 0.5 > q >= 0.25
      //
      // x = sqrt(-2*log(q)) / (Y + R(q))
      //
      // Where Y is a constant, and R(q) is optimised for a low
      // absolute error compared to Y.
      //
      // double : Max error found: 7.403372e-17
      // long double : Max error found: 6.084616e-20
      // Maximum Deviation Found (error term) 4.811e-20
      //
      static const float Y = 2.249481201171875f;
      static const double P[] = {
         -0.202433508355938759655L,
         0.105264680699391713268L,
         8.37050328343119927838L,
         17.6447298408374015486L,
         -18.8510648058714251895L,
         -44.6382324441786960818L,
         17.445385985570866523L,
         21.1294655448340526258L,
         -3.67192254707729348546L
      };
      static const double Q[] = {
         1L,
         6.24264124854247537712L,
         3.9713437953343869095L,
         -28.6608180499800029974L,
         -20.1432634680485188801L,
         48.5609213108739935468L,
         10.8268667355460159008L,
         -22.6436933413139721736L,
         1.72114765761200282724L
      };
      double scale = sqrt(-2 * log(q));
      double shifted = q - 0.25;
      double ratio = evaluate_polynomial(P, shifted) / evaluate_polynomial(Q, shifted);
      return scale / (Y + ratio);
   }

   //
   // For q < 0.25 we have a series of rational approximations all
   // of the general form:
   //
   // let: x = sqrt(-log(q))
   //
   // Then the result is given by:
   //
   // x(Y+R(x-B))
   //
   // where Y is a constant, B is the lowest value of x for which
   // the approximation is valid, and R(x-B) is optimised for a low
   // absolute error compared to Y.
   //
   // Note that almost all code will really go through the first
   // or maybe second approximation. After that we're dealing with very
   // small input values indeed: 80 and 128 bit long double's go all the
   // way down to ~ 1e-5000 so the "tail" is rather long...
   //
   double t = sqrt(-log(q));
   if(t < 3)
   {
      // Max error found: 1.089051e-20
      static const float Y = 0.807220458984375f;
      static const double P[] = {
         -0.131102781679951906451L,
         -0.163794047193317060787L,
         0.117030156341995252019L,
         0.387079738972604337464L,
         0.337785538912035898924L,
         0.142869534408157156766L,
         0.0290157910005329060432L,
         0.00214558995388805277169L,
         -0.679465575181126350155e-6L,
         0.285225331782217055858e-7L,
         -0.681149956853776992068e-9L
      };
      static const double Q[] = {
         1,
         3.46625407242567245975L,
         5.38168345707006855425L,
         4.77846592945843778382L,
         2.59301921623620271374L,
         0.848854343457902036425L,
         0.152264338295331783612L,
         0.01105924229346489121L
      };
      double shifted = t - 1.125;
      double ratio = evaluate_polynomial(P, shifted) / evaluate_polynomial(Q, shifted);
      return Y * t + ratio * t;
   }
   else if(t < 6)
   {
      // Max error found: 8.389174e-21
      static const float Y = 0.93995571136474609375f;
      static const double P[] = {
         -0.0350353787183177984712L,
         -0.00222426529213447927281L,
         0.0185573306514231072324L,
         0.00950804701325919603619L,
         0.00187123492819559223345L,
         0.000157544617424960554631L,
         0.460469890584317994083e-5L,
         -0.230404776911882601748e-9L,
         0.266339227425782031962e-11L
      };
      static const double Q[] = {
         1L,
         1.3653349817554063097L,
         0.762059164553623404043L,
         0.220091105764131249824L,
         0.0341589143670947727934L,
         0.00263861676657015992959L,
         0.764675292302794483503e-4L
      };
      double shifted = t - 3;
      double ratio = evaluate_polynomial(P, shifted) / evaluate_polynomial(Q, shifted);
      return Y * t + ratio * t;
   }
   else if(t < 18)
   {
      // Max error found: 1.481312e-19
      static const float Y = 0.98362827301025390625f;
      static const double P[] = {
         -0.0167431005076633737133L,
         -0.00112951438745580278863L,
         0.00105628862152492910091L,
         0.000209386317487588078668L,
         0.149624783758342370182e-4L,
         0.449696789927706453732e-6L,
         0.462596163522878599135e-8L,
         -0.281128735628831791805e-13L,
         0.99055709973310326855e-16L
      };
      static const double Q[] = {
         1L,
         0.591429344886417493481L,
         0.138151865749083321638L,
         0.0160746087093676504695L,
         0.000964011807005165528527L,
         0.275335474764726041141e-4L,
         0.282243172016108031869e-6L
      };
      double shifted = t - 6;
      double ratio = evaluate_polynomial(P, shifted) / evaluate_polynomial(Q, shifted);
      return Y * t + ratio * t;
   }
   else if(t < 44)
   {
      // Max error found: 5.697761e-20
      static const float Y = 0.99714565277099609375f;
      static const double P[] = {
         -0.0024978212791898131227L,
         -0.779190719229053954292e-5L,
         0.254723037413027451751e-4L,
         0.162397777342510920873e-5L,
         0.396341011304801168516e-7L,
         0.411632831190944208473e-9L,
         0.145596286718675035587e-11L,
         -0.116765012397184275695e-17L
      };
      static const double Q[] = {
         1L,
         0.207123112214422517181L,
         0.0169410838120975906478L,
         0.000690538265622684595676L,
         0.145007359818232637924e-4L,
         0.144437756628144157666e-6L,
         0.509761276599778486139e-9L
      };
      double shifted = t - 18;
      double ratio = evaluate_polynomial(P, shifted) / evaluate_polynomial(Q, shifted);
      return Y * t + ratio * t;
   }
   else
   {
      // Max error found: 1.279746e-20
      static const float Y = 0.99941349029541015625f;
      static const double P[] = {
         -0.000539042911019078575891L,
         -0.28398759004727721098e-6L,
         0.899465114892291446442e-6L,
         0.229345859265920864296e-7L,
         0.225561444863500149219e-9L,
         0.947846627503022684216e-12L,
         0.135880130108924861008e-14L,
         -0.348890393399948882918e-21L
      };
      static const double Q[] = {
         1L,
         0.0845746234001899436914L,
         0.00282092984726264681981L,
         0.468292921940894236786e-4L,
         0.399968812193862100054e-6L,
         0.161809290887904476097e-8L,
         0.231558608310259605225e-11L
      };
      double shifted = t - 44;
      double ratio = evaluate_polynomial(P, shifted) / evaluate_polynomial(Q, shifted);
      return Y * t + ratio * t;
   }
}