////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//  Pi
void Pi_BSR(BigFloat &P,BigFloat &Q,BigFloat &R,uint32_t a,uint32_t b,size_t p){
    //  Binary Splitting recursion for the Chudnovsky Formula.

    if (b - a == 1){
        //  P = (13591409 + 545140134 b)(2b-1)(6b-5)(6b-1) (-1)^b
        P = BigFloat(b).mul(545140134);
        P = P.add(BigFloat(13591409));
        P = P.mul(2*b - 1);
        P = P.mul(6*b - 5);
        P = P.mul(6*b - 1);
        if (b % 2 == 1)
            P.negate();

        //  Q = 10939058860032000 * b^3
        Q = BigFloat(b);
        Q = Q.mul(Q).mul(Q).mul(26726400).mul(409297880);

        //  R = (2b-1)(6b-5)(6b-1)
        R = BigFloat(2*b - 1);
        R = R.mul(6*b - 5);
        R = R.mul(6*b - 1);

        return;
    }

    uint32_t m = (a + b) / 2;

    BigFloat P0,Q0,R0,P1,Q1,R1;
    Pi_BSR(P0,Q0,R0,a,m,p);
    Pi_BSR(P1,Q1,R1,m,b,p);

    P = P0.mul(Q1,p).add(P1.mul(R0,p),p);
    Q = Q0.mul(Q1,p);
    R = R0.mul(R1,p);
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
//  Pi
void Pi_BSR(BigFloat &P,BigFloat &Q,BigFloat &R,uint32_t a,uint32_t b,size_t p,int tds = 1){
    //  Binary Splitting recursion for the Chudnovsky Formula.

    if (b - a == 1){
        //  P = (13591409 + 545140134 b)(2b-1)(6b-5)(6b-1) (-1)^b
        P = BigFloat(b).mul(545140134);
        P = P.add(BigFloat(13591409));
        P = P.mul(2*b - 1);
        P = P.mul(6*b - 5);
        P = P.mul(6*b - 1);
        if (b % 2 == 1)
            P.negate();

        //  Q = 10939058860032000 * b^3
        Q = BigFloat(b);
        Q = Q.mul(Q).mul(Q).mul(26726400).mul(409297880);

        //  R = (2b-1)(6b-5)(6b-1)
        R = BigFloat(2*b - 1);
        R = R.mul(6*b - 5);
        R = R.mul(6*b - 1);

        return;
    }

    uint32_t m = (a + b) / 2;

    BigFloat P0,Q0,R0,P1,Q1,R1;

    if (b - a < 1000 || tds < 2){
        //  No more threads.
        Pi_BSR(P0,Q0,R0,a,m,p);
        Pi_BSR(P1,Q1,R1,m,b,p);
    }else{
        //  Run sub-recursions in parallel.
        int tds0 = tds / 2;
        int tds1 = tds - tds0;
#pragma omp parallel num_threads(2)
        {
            int tid = omp_get_thread_num();
            if (tid == 0){
                Pi_BSR(P0,Q0,R0,a,m,p,tds0);
            }
            if (tid != 0 || omp_get_num_threads() < 2){
                Pi_BSR(P1,Q1,R1,m,b,p,tds1);
            }
        }
    }

    P = P0.mul(Q1,p,tds).add(P1.mul(R0,p,tds),p);
    Q = Q0.mul(Q1,p,tds);
    R = R0.mul(R1,p,tds);
}
BigFloat invsqrt(uint32_t x,size_t p,int tds){
    //  Compute inverse square root using Newton's Method.

    //            (  r0^2 * x - 1  )
    //  r1 = r0 - (----------------) * r0
    //            (       2        )

    if (x == 0)
        throw "Divide by Zero";

    //  End of recursion. Generate starting point.
    if (p == 0){
        double val = 1. / sqrt((double)x);

        int64_t exponent = 0;

        //  Scale
        while (val < 1000000000.){
            val *= 1000000000.;
            exponent--;
        }

        //  Rebuild a BigFloat.
        uint64_t val64 = (uint64_t)val;

        BigFloat out;
        out.sign = true;

        out.T = std::unique_ptr<uint32_t[]>(new uint32_t[2]);
        out.T[0] = (uint32_t)(val64 % 1000000000);
        out.T[1] = (uint32_t)(val64 / 1000000000);
        out.L = 2;
        out.exp = exponent;

        return out;
    }

    //  Half the precision
    size_t s = p / 2 + 1;
    if (p == 1) s = 0;
    if (p == 2) s = 1;

    //  Recurse at half the precision
    BigFloat T = invsqrt(x,s,tds);

    BigFloat temp = T.mul(T,p);     //  r0^2
    temp = temp.mul(x,p,tds);       //  r0^2 * x
    temp = temp.sub(BigFloat(1),p); //  r0^2 * x - 1
    temp = temp.mul(500000000);         //  (r0^2 * x - 1) / 2
    temp.exp--;
    temp = temp.mul(T,p,tds);       //  (r0^2 * x - 1) / 2 * r0
    return T.sub(temp,p);           //  r0 - (r0^2 * x - 1) / 2 * r0
}