/* Compute high-accuracy product of two double-double operands, taking full
   advantage of FMA. In the absence of underflow and overflow, the maximum
   relative error observed with 10 billion test cases was
   5.238480533564479e-32 (~= 2**-103.9125). */
__device__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b)
{
    dbldbl t, z;
    double e;
    t.y = __dmul_rn (a.y, b.y);
    t.x = __fma_rn (a.y, b.y, -t.y);
    t.x = __fma_rn (a.x, b.x, t.x);
    t.x = __fma_rn (a.y, b.x, t.x);
    t.x = __fma_rn (a.x, b.y, t.x);
    z.y = e = __dadd_rn (t.y, t.x);
    z.x = __dadd_rn (t.y - e, t.x);
    return z;
}
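/* One way to read the code above (an interpretation, not from the original
   source): writing the operands as a = a.y + a.x and b = b.y + b.x, with the
   tails much smaller than the heads, the exact product expands as

       a * b = a.y*b.y + a.x*b.x + a.y*b.x + a.x*b.y.

   The first FMA captures a.y*b.y without error (head t.y plus residual in
   t.x), the remaining three terms are folded into the tail t.x by further
   FMAs, and the final two additions renormalize the sum into the head/tail
   pair (z.y, z.x). */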
/* Compute error-free product of two doubles. Take full advantage of FMA. */
__device__ __forceinline__ dbldbl mul_double_to_dbldbl (double a, double b)
{
    dbldbl z;
    z.y = __dmul_rn (a, b);
    z.x = __fma_rn (a, b, -z.y);
    return z;
}
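/* A minimal, self-contained sketch (not part of the library; it would live in
   its own test .cu file) of the error-free transformation used above: after
   hi = a * b is rounded to nearest, a single FMA recovers the rounding error
   exactly, so the pair (hi, lo) represents a * b with no loss, barring
   underflow and overflow. */
#include <cstdio>

__global__ void two_prod_demo (double a, double b)
{
    const double hi = __dmul_rn (a, b);      /* rounded product            */
    const double lo = __fma_rn (a, b, -hi);  /* exact rounding error of hi */
    printf ("hi = %.17g  lo = %.17g\n", hi, lo);
}

int main (void)
{
    const double x = 1.0 + 9.313225746154785e-10;     /* 1 + 2^-30         */
    two_prod_demo<<<1, 1>>>(x, x);   /* x*x needs more than 53 bits, lo != 0 */
    cudaDeviceSynchronize ();
    return 0;
}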
/* Compute high-accuracy square root of a double-double number. Newton-Raphson
   iteration based on equation 4 from a paper by Alan Karp and Peter Markstein,
   High Precision Division and Square Root, ACM TOMS, vol. 23, no. 4, December
   1997, pp. 561-589. In the absence of underflow and overflow, the maximum
   relative error observed with 10 billion test cases was
   3.7564109505601846e-32 (~= 2**-104.3923). */
__device__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a)
{
    dbldbl t, z;
    double e, y, s, r;
    r = rsqrt (a.y);
    if (a.y == 0.0) r = 0.0;
    y = __dmul_rn (a.y, r);
    s = __fma_rn (y, -y, a.y);
    r = __dmul_rn (0.5, r);
    z.y = e = __dadd_rn (s, a.x);
    z.x = __dadd_rn (s - e, a.x);
    t.y = __dmul_rn (r, z.y);
    t.x = __fma_rn (r, z.y, -t.y);
    t.x = __fma_rn (r, z.x, t.x);
    r = __dadd_rn (y, t.y);
    s = __dadd_rn (y - r, t.y);
    s = __dadd_rn (s, t.x);
    z.y = e = __dadd_rn (r, s);
    z.x = __dadd_rn (r - e, s);
    return z;
}
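/* One way to read the refinement above: with r ~= 1/sqrt(a.y) from the
   hardware rsqrt, the code forms y = a.y * r and applies a single,
   division-free Newton-Raphson correction

       sqrt(a) ~= y + (r / 2) * ((a.y - y*y) + a.x),

   where (a.y - y*y) is computed exactly by the FMA and a.x carries the tail
   of the input; the remaining additions renormalize the result into the
   head/tail pair (z.y, z.x). */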
/* Compute high-accuracy reciprocal square root of a double-double number.
   Based on Newton-Raphson iteration. In the absence of underflow and overflow,
   the maximum relative error observed with 10 billion test cases was
   6.4937771666026349e-32 (~= 2**-103.6026). */
__device__ __forceinline__ dbldbl rsqrt_dbldbl (dbldbl a)
{
    dbldbl z;
    double r, s, e;
    r = rsqrt (a.y);
    e = __dmul_rn (a.y, r);
    s = __fma_rn (e, -r, 1.0);
    e = __fma_rn (a.y, r, -e);
    s = __fma_rn (e, -r, s);
    e = __dmul_rn (a.x, r);
    s = __fma_rn (e, -r, s);
    e = 0.5 * r;
    z.y = __dmul_rn (e, s);
    z.x = __fma_rn (e, s, -z.y);
    s = __dadd_rn (r, z.y);
    r = __dadd_rn (r, -s);
    r = __dadd_rn (r, z.y);
    r = __dadd_rn (r, z.x);
    z.y = e = __dadd_rn (s, r);
    z.x = __dadd_rn (s - e, r);
    return z;
}
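/* Similarly, the function above can be read as one Newton-Raphson step for
   the reciprocal square root, seeded by the hardware rsqrt: with r ~= 1/sqrt(a),
   it forms s ~= 1 - a*r*r (folding in the tail a.x through the FMAs) and returns

       1/sqrt(a) ~= r + (r / 2) * s,

   renormalized into the head/tail pair (z.y, z.x). */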
/* Compute high-accuracy quotient of two double-double operands, using Newton-
   Raphson iteration. Based on: T. Nagai, H. Yoshida, H. Kuroda, Y. Kanada.
   Fast Quadruple Precision Arithmetic Library on Parallel Computer SR11000/J2.
   In Proceedings of the 8th International Conference on Computational Science,
   ICCS '08, Part I, pp. 446-455. In the absence of underflow and overflow, the
   maximum relative error observed with 10 billion test cases was
   1.0161322480099059e-31 (~= 2**-102.9566). */
__device__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b)
{
    dbldbl t, z;
    double e, r;
    r = 1.0 / b.y;
    t.y = __dmul_rn (a.y, r);
    e = __fma_rn (b.y, -t.y, a.y);
    t.y = __fma_rn (r, e, t.y);
    t.x = __fma_rn (b.y, -t.y, a.y);
    t.x = __dadd_rn (a.x, t.x);
    t.x = __fma_rn (b.x, -t.y, t.x);
    e = __dmul_rn (r, t.x);
    t.x = __fma_rn (b.y, -e, t.x);
    t.x = __fma_rn (r, t.x, e);
    z.y = e = __dadd_rn (t.y, t.x);
    z.x = __dadd_rn (t.y - e, t.x);
    return z;
}
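/* A hypothetical usage sketch (not from the original source), assuming the
   dbldbl type and the helpers above are in scope: combine the error-free
   product with the double-double division to evaluate (a*b)/(c*d) to roughly
   quadruple precision before rounding back to double. The function name is
   illustrative only. */
__device__ __forceinline__ double quotient_of_products (double a, double b,
                                                        double c, double d)
{
    const dbldbl num = mul_double_to_dbldbl (a, b);  /* exact a*b          */
    const dbldbl den = mul_double_to_dbldbl (c, d);  /* exact c*d          */
    const dbldbl q   = div_dbldbl (num, den);        /* ~106-bit quotient  */
    return q.y;    /* the head is the double result; q.x holds the tail    */
}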
/* One block step of the one-sided Jacobi method on the column block G, with
   the accumulated transformations kept in V. Each group of threads sharing
   the same y handles the pivot column pair (p, q) prescribed by the strategy
   table _strat0, while x indexes the rows covered by one warp. The per-block
   rotation counts are finally added to the global convergence counter _cvg. */
MYDEVFN unsigned dDefJacL0
  (volatile double *const G,
   volatile double *const V,
   const unsigned x,
   const unsigned y)
{
#if __CUDA_ARCH__ >= 300
#else // Fermi: no warp shuffles, so reduce through shared memory
  const unsigned y2 = (y << 1u);
  volatile double *const shPtr = &(F32(V, 0u, y2));
#endif // __CUDA_ARCH__

  unsigned blk_transf_s = 0u, blk_transf_b = 0u;

  for (unsigned swp = 0u; swp < _nSwp; ++swp) {
    int swp_transf_s = 0, swp_transf_b = 0;

    for (unsigned step = 0u; step < _STRAT0_STEPS; ++step) {
      const unsigned p = _strat0[step][y][0u], q = _strat0[step][y][1u];

      double Dp = +0.0;
      double Dq = +0.0;
      double Apq = +0.0;

      const double Gp = F32(G, x, p);
      const double Gq = F32(G, x, q);
      const double Vp = F32(V, x, p);
      const double Vq = F32(V, x, q);
      __syncthreads();

      // per-thread partial sums of the pivot pair's Gram entries:
      // Dp ~ Gp'Gp, Dq ~ Gq'Gq, Apq ~ Gp'Gq
      Dp = __fma_rn(Gp, Gp, Dp);
      Dq = __fma_rn(Gq, Gq, Dq);
      Apq = __fma_rn(Gp, Gq, Apq);

      // reduce the partial sums over the 32 rows handled by the warp
#if __CUDA_ARCH__ >= 300
      Dp = dSum32(Dp);
      Dq = dSum32(Dq);
      Apq = dSum32(Apq);
#else // Fermi
      Dp = dSum32(Dp, shPtr, x);
      Dq = dSum32(Dq, shPtr, x);
      Apq = dSum32(Apq, shPtr, x);
#endif // __CUDA_ARCH__

      const double Dp_ = __dsqrt_rn(Dp), Dq_ = __dsqrt_rn(Dq),
        Apq_ = fabs(Apq), Dpq_ = Dp_ * Dq_;

      // rotate only when the off-diagonal entry is not negligible relative
      // to the product of the column norms
      const int transf_s = !(Apq_ < (Dpq_ * HYPJAC_MYTOL));
      // __syncthreads_count tallies all threads' votes; shifting right by
      // WARP_SZ_LGi (log2 of the warp size) converts that into the number of
      // column pairs transformed in this step
      swp_transf_s += (__syncthreads_count(transf_s) >> WARP_SZ_LGi);

      int transf_b = 0;
      if (transf_s) {
        double c, t;
        transf_b = dRotT(Apq, Dp, Dq, c, t);
        const double t_ = -t;
        if (transf_b) {
          // "big" rotation: the cosine c differs from 1, so scale by it
          F32(G, x, p) = c * __fma_rn(t_, Gq, Gp);
          F32(G, x, q) = c * __fma_rn(t, Gp, Gq);
          F32(V, x, p) = c * __fma_rn(t_, Vq, Vp);
          F32(V, x, q) = c * __fma_rn(t, Vp, Vq);
        }
        else {
          // "small" rotation: the cosine is taken as 1
          F32(G, x, p) = __fma_rn(t_, Gq, Gp);
          F32(G, x, q) = __fma_rn(t, Gp, Gq);
          F32(V, x, p) = __fma_rn(t_, Vq, Vp);
          F32(V, x, q) = __fma_rn(t, Vp, Vq);
        }
      }
      else { // must restore V
        F32(V, x, p) = Vp;
        F32(V, x, q) = Vq;
      }
      swp_transf_b += (__syncthreads_count(transf_b) >> WARP_SZ_LGi);
    }

    if (swp_transf_s) {
      blk_transf_s += static_cast<unsigned>(swp_transf_s);
      blk_transf_b += static_cast<unsigned>(swp_transf_b);
    }
    else // no rotations applied in this sweep: the block has converged
      break;
  }

  if (!y && !x && blk_transf_s) {
    // thread (0,0) publishes the block's counts: "big" rotations go into the
    // high 32 bits, all applied rotations into the low 32 bits of *_cvg
    if (blk_transf_b) {
      unsigned long long blk_transf = blk_transf_b;
      blk_transf <<= 32u;
      blk_transf |= blk_transf_s;
      atomicAdd((unsigned long long*)_cvg, blk_transf);
    }
    else
      atomicAdd((unsigned*)_cvg, blk_transf_s);
  }
  __syncthreads();
  return blk_transf_s;
}
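/* A hypothetical host-side helper (not from the original source) illustrating
   how the packed convergence counter written above could be split apart:
   the kernel accumulates the count of "big" rotations into the high 32 bits
   and the count of all applied rotations into the low 32 bits of the 64-bit
   word at _cvg, so the host can separate them after copying the counter back. */
static inline void unpack_cvg (const unsigned long long cvg,
                               unsigned &transf_s, unsigned &transf_b)
{
  transf_s = static_cast<unsigned>(cvg & 0xFFFFFFFFull);  /* all applied rotations */
  transf_b = static_cast<unsigned>(cvg >> 32u);           /* "big" rotations       */
}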