Example #1
0
/* High-accuracy product of two double-double operands, taking full
   advantage of FMA. In the absence of underflow and overflow, the maximum
   relative error observed with 10 billion test cases was 5.238480533564479e-32
   (~= 2**-103.9125).
*/
__device__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b)
{
    dbldbl res;
    double hi, lo, s;

    /* Exact product of the two high parts: hi + lo == a.y * b.y. */
    hi = __dmul_rn (a.y, b.y);
    lo = __fma_rn (a.y, b.y, -hi);
    /* Fold the low-order and cross terms into the error word. The order of
       these accumulations affects the final rounding; do not reorder. */
    lo = __fma_rn (a.x, b.x, lo);
    lo = __fma_rn (a.y, b.x, lo);
    lo = __fma_rn (a.x, b.y, lo);
    /* Renormalize (hi, lo) into a canonical double-double result. */
    res.y = s = __dadd_rn (hi, lo);
    res.x = __dadd_rn (hi - s, lo);
    return res;
}
Example #2
0
/* Error-free transformation of a double product into a double-double:
   z.y + z.x == a * b exactly. Relies on FMA to recover the rounding error. */
__device__ __forceinline__ dbldbl mul_double_to_dbldbl (double a, double b)
{
    dbldbl prod;
    const double hi = __dmul_rn (a, b);
    prod.y = hi;
    prod.x = __fma_rn (a, b, -hi);  /* exact rounding error of a * b */
    return prod;
}
Example #3
0
/* Compute high-accuracy square root of a double-double number. Newton-Raphson
   iteration based on equation 4 from a paper by Alan Karp and Peter Markstein,
   High Precision Division and Square Root, ACM TOMS, vol. 23, no. 4, December
   1997, pp. 561-589. In the absence of underflow and overflow, the maximum
   relative error observed with 10 billion test cases was
   3.7564109505601846e-32 (~= 2**-104.3923).

   NOTE: the exact sequence and rounding of the operations below is part of
   the algorithm; do not reorder or "simplify" any statement.
*/
__device__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a)
{
    dbldbl t, z;
    double e, y, s, r;
    /* Low-accuracy starting approximation r ~= 1/sqrt(a.y). */
    r = rsqrt (a.y);
    /* rsqrt(0) is +inf; force r = 0 so sqrt of a zero input yields zero. */
    if (a.y == 0.0) r = 0.0;
    /* y ~= sqrt(a.y); s captures the exact residual a.y - y*y via FMA. */
    y = __dmul_rn (a.y, r);
    s = __fma_rn (y, -y, a.y);
    r = __dmul_rn (0.5, r);   /* r becomes 0.5/sqrt(a.y), the NR scale factor */
    /* Combine the residual with the low part of the input: z = s + a.x,
       renormalized into high/low words. */
    z.y = e = __dadd_rn (s, a.x);
    z.x = __dadd_rn (s - e, a.x);
    /* Correction term t = r * z, as a double-double product. */
    t.y = __dmul_rn (r, z.y);
    t.x = __fma_rn (r, z.y, -t.y);
    t.x = __fma_rn (r, z.x, t.x);
    /* y + t is the corrected root; renormalize via two-sum steps. */
    r = __dadd_rn (y, t.y);
    s = __dadd_rn (y - r, t.y);
    s = __dadd_rn (s, t.x);
    z.y = e = __dadd_rn (r, s);
    z.x = __dadd_rn (r - e, s);
    return z;
}
Example #4
0
/* Compute high-accuracy reciprocal square root of a double-double number.
   Based on Newton-Raphson iteration. In the absence of underflow and overflow,
   the maximum relative error observed with 10 billion test cases was
   6.4937771666026349e-32 (~= 2**-103.6026)

   NOTE: the exact sequence and rounding of the operations below is part of
   the algorithm; do not reorder or "simplify" any statement.
*/
__device__ __forceinline__ dbldbl rsqrt_dbldbl (dbldbl a)
{
    dbldbl z;
    double r, s, e;
    /* Low-accuracy starting approximation r ~= 1/sqrt(a.y). */
    r = rsqrt (a.y);
    /* Newton-Raphson residual s ~= 1 - a*r*r, built with FMAs so the
       rounding error of each partial product is recovered exactly. */
    e = __dmul_rn (a.y, r);
    s = __fma_rn (e, -r, 1.0);
    e = __fma_rn (a.y, r, -e);   /* exact rounding error of a.y * r */
    s = __fma_rn (e, -r, s);
    e = __dmul_rn (a.x, r);      /* contribution of the low part a.x */
    s = __fma_rn (e, -r, s);
    /* NR correction 0.5 * r * s, split into high (z.y) and low (z.x) words. */
    e = 0.5 * r;
    z.y = __dmul_rn (e, s);
    z.x = __fma_rn (e, s, -z.y);
    /* Add the correction to r and renormalize: s is the rounded sum,
       r accumulates the leftover low-order pieces. */
    s = __dadd_rn (r, z.y);
    r = __dadd_rn (r, -s);
    r = __dadd_rn (r, z.y);
    r = __dadd_rn (r, z.x);
    z.y = e = __dadd_rn (s, r);
    z.x = __dadd_rn (s - e, r);
    return z;
}
Example #5
0
/* Compute high-accuracy quotient of two double-double operands, using Newton-
   Raphson iteration. Based on: T. Nagai, H. Yoshida, H. Kuroda, Y. Kanada.
   Fast Quadruple Precision Arithmetic Library on Parallel Computer SR11000/J2.
   In Proceedings of the 8th International Conference on Computational Science,
   ICCS '08, Part I, pp. 446-455. In the absence of underflow and overflow, the
   maximum relative error observed with 10 billion test cases was
   1.0161322480099059e-31 (~= 2**-102.9566).

   NOTE: the exact sequence and rounding of the operations below is part of
   the algorithm; do not reorder or "simplify" any statement.
*/
__device__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b)
{
    dbldbl t, z;
    double e, r;
    /* Approximate reciprocal of the divisor's high part. */
    r = 1.0 / b.y;
    /* t.y ~= a.y / b.y, refined by one NR step using the FMA residual e. */
    t.y = __dmul_rn (a.y, r);
    e = __fma_rn (b.y, -t.y, a.y);
    t.y = __fma_rn (r, e, t.y);
    /* Remainder of the full double-double dividend after subtracting
       t.y * b: high-part residual, then a.x, then the b.x contribution. */
    t.x = __fma_rn (b.y, -t.y, a.y);
    t.x = __dadd_rn (a.x, t.x);
    t.x = __fma_rn (b.x, -t.y, t.x);
    /* Low-order quotient correction e ~= remainder / b.y, with one more
       NR refinement folded in. */
    e = __dmul_rn (r, t.x);
    t.x = __fma_rn (b.y, -e, t.x);
    t.x = __fma_rn (r, t.x, e);
    /* Renormalize (t.y, t.x) into a canonical double-double result. */
    z.y = e = __dadd_rn (t.y, t.x);
    z.x = __dadd_rn (t.y - e, t.x);
    return z;
}
Example #6
0
/* NOTE(review): appears to run the in-block sweeps of a one-sided Jacobi-type
   process over column pairs of G, accumulating the applied rotations in V --
   confirm against the definitions of _strat0 and dRotT. Thread (x, y) handles
   element x of the column pair that the strategy table assigns to row group y.
   Presumably F32(A, x, c) indexes element x of column c -- TODO confirm.
   Returns this block's total count of applied transformations; the counts are
   also published to the global counter _cvg by thread (0, 0). */
MYDEVFN unsigned dDefJacL0
(volatile double *const G,
 volatile double *const V,
 const unsigned x,
 const unsigned y)
{
#if __CUDA_ARCH__ >= 300
#else // Fermi
  // Pre-Kepler GPUs lack shuffle, so the dSum32 overload below reduces
  // through memory; shPtr gives each y-slice its own scratch region carved
  // out of V (which is why V must be restored when no rotation is applied).
  const unsigned y2 = (y << 1u);
  volatile double *const shPtr = &(F32(V, 0u, y2));
#endif // __CUDA_ARCH__

  // Totals over all sweeps performed by this block:
  // _s counts applied transformations, _b those that also applied the
  // cosine scaling (see the transf_b branch below).
  unsigned
    blk_transf_s = 0u,
    blk_transf_b = 0u;

  for (unsigned swp = 0u; swp < _nSwp; ++swp) {
    // Per-sweep counters; a sweep with zero transformations ends the loop.
    int
      swp_transf_s = 0,
      swp_transf_b = 0;

    for (unsigned step = 0u; step < _STRAT0_STEPS; ++step) {
      // Pivot column pair (p, q) for this step, from the strategy table.
      const unsigned
        p = _strat0[step][y][0u],
        q = _strat0[step][y][1u];

      double Dp = +0.0;
      double Dq = +0.0;
      double Apq = +0.0;

      // This thread's elements of columns p and q of G and V.
      const double Gp = F32(G, x, p);
      const double Gq = F32(G, x, q);

      const double Vp = F32(V, x, p);
      const double Vq = F32(V, x, q);

      // All loads must complete before any thread writes G/V below (and, on
      // Fermi, before dSum32 reuses V's storage as reduction scratch).
      __syncthreads();

      // Per-thread partial Gram entries: Dp = Gp^2, Dq = Gq^2, Apq = Gp*Gq.
      Dp = __fma_rn(Gp, Gp, Dp);
      Dq = __fma_rn(Gq, Gq, Dq);
      Apq = __fma_rn(Gp, Gq, Apq);

#if __CUDA_ARCH__ >= 300
      // Kepler+: dSum32 reduces across the 32 x-lanes (presumably via
      // shuffles -- no scratch pointer needed).
      Dp = dSum32(Dp);
      Dq = dSum32(Dq);
      Apq = dSum32(Apq);
#else // Fermi
      // Fermi: same reduction through the scratch region at shPtr.
      Dp = dSum32(Dp, shPtr, x);
      Dq = dSum32(Dq, shPtr, x);
      Apq = dSum32(Apq, shPtr, x);
#endif // __CUDA_ARCH

      // Column norms and off-diagonal magnitude for the rotation test.
      const double
        Dp_ = __dsqrt_rn(Dp),
        Dq_ = __dsqrt_rn(Dq),
        Apq_ = fabs(Apq),
        Dpq_ = Dp_ * Dq_;

      // Rotate unless |Apq| < ||g_p|| * ||g_q|| * tol (negated so NaNs also
      // trigger a transformation attempt).
      const int transf_s = !(Apq_ < (Dpq_ * HYPJAC_MYTOL));
      // __syncthreads_count tallies threads with transf_s != 0 block-wide;
      // >> WARP_SZ_LGi converts threads to rotations, assuming the decision
      // is uniform across the 2^WARP_SZ_LGi x-lanes of a row -- TODO confirm.
      swp_transf_s += (__syncthreads_count(transf_s) >> WARP_SZ_LGi);

      int transf_b = 0;
      if (transf_s) {
        // dRotT yields the rotation parameters (c, t); its return value
        // selects between the scaled and unscaled update forms below.
        double c, t;
        transf_b = dRotT(Apq, Dp, Dq, c, t);
        const double t_ = -t;

        if (transf_b) {
          // Full rotation: scale by the cosine c.
          F32(G, x, p) = c * __fma_rn(t_, Gq, Gp);
          F32(G, x, q) = c * __fma_rn(t, Gp, Gq);
          F32(V, x, p) = c * __fma_rn(t_, Vq, Vp);
          F32(V, x, q) = c * __fma_rn(t, Vp, Vq);
        }
        else {
          // Small-angle form: cosine scaling skipped.
          F32(G, x, p) = __fma_rn(t_, Gq, Gp);
          F32(G, x, q) = __fma_rn(t, Gp, Gq);
          F32(V, x, p) = __fma_rn(t_, Vq, Vp);
          F32(V, x, q) = __fma_rn(t, Vp, Vq);
        }
      }
      else {
        // must restore V (on Fermi dSum32 clobbered it via shPtr;
        // G is left untouched on this path)
        F32(V, x, p) = Vp;
        F32(V, x, q) = Vq;
      }

      // Block-wide barrier + count of "big" transformations this step.
      swp_transf_b += (__syncthreads_count(transf_b) >> WARP_SZ_LGi);
    }

    if (swp_transf_s) {
      blk_transf_s += static_cast<unsigned>(swp_transf_s);
      blk_transf_b += static_cast<unsigned>(swp_transf_b);
    }
    else
      break;  // sweep converged: no transformations, so stop early
  }

  // A single thread publishes this block's counts to the global counter.
  // NOTE(review): presumably _cvg is a 64-bit word holding the big count in
  // its high half and the small count in its low half -- confirm at the host.
  if (!y && !x && blk_transf_s) {
    if (blk_transf_b) {
      // Pack both counts into one 64-bit atomic add.
      unsigned long long blk_transf = blk_transf_b;
      blk_transf <<= 32u;
      blk_transf |= blk_transf_s;
      atomicAdd((unsigned long long*)_cvg, blk_transf);
    }
    else
      // Only the low 32-bit word needs updating.
      atomicAdd((unsigned*)_cvg, blk_transf_s);
  }

  __syncthreads();
  return blk_transf_s;
}