LowRankVector::LowRankVector(integer Ur, integer Uc, integer Drc, integer Vr, integer Vc)
{
	StieVector dU(Ur, Uc);
	EucVector dD(Drc, Drc);
	StieVector dV(Vr, Vc);

	Element **Elems = new Element *[3];
	Elems[0] = &dU;
	Elems[1] = &dD;
	Elems[2] = &dV;
	integer *powsintev = new integer[4];
	powsintev[0] = 0;
	powsintev[1] = 1;
	powsintev[2] = 2;
	powsintev[3] = 3;

	ProductElementInitialization(Elems, 3, powsintev, 3);

	delete[] powsintev;
	delete[] Elems;
}
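
A minimal usage sketch (an assumption based only on the constructor above, following ROPTLIB-style conventions in which U and V are Stiefel blocks and D is a Euclidean block):

	// Hypothetical: tangent-vector container for a rank-2 factorization
	// U*D*V' of a 5-by-7 matrix, i.e. U is 5x2, D is 2x2, V is 7x2.
	LowRankVector dX(5, 2, 2, 7, 2);
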
Example #2
TEST( Vector, DataReadView )
{
    std::vector< int > v( 5ul, 3 );
    bolt::amp::device_vector< int, concurrency::array_view > dV( v );
    EXPECT_EQ( 5, dV.size( ) );
    dV[ 0 ] = 1;
    dV[ 1 ] = 2;
    dV[ 2 ] = 3;
    dV[ 3 ] = 4;
    dV[ 4 ] = 5;

    // Any device_vector pointer can be used.
    bolt::amp::device_vector< int, concurrency::array_view >::pointer mySP = dV.data( );

    EXPECT_EQ( 1, mySP[ 0 ] );
    EXPECT_EQ( 2, mySP[ 1 ] );
    EXPECT_EQ( 3, mySP[ 2 ] );
    EXPECT_EQ( 4, mySP[ 3 ] );
    EXPECT_EQ( 5, mySP[ 4 ] );

}
Example #3
int _tmain( int argc, _TCHAR* argv[ ] )
{
    const size_t vecSize = 10;
    bolt::cl::device_vector< int > dV( vecSize );

    bolt::cl::device_vector< int >::iterator myIter = dV.begin( );
    std::cout << "\nDevice vector EXAMPLE \n";
    //  Iterator arithmetic supported
    *myIter = 1;
    ++myIter;
    *myIter = 2;
    myIter++;
    *myIter = 3;
    myIter += 1;
    *(myIter + 0) = 4;
    *(myIter + 1) = 5;
    myIter += 1;

    //  Operator [] on the container supported
    dV[ 5 ] = 6;
    dV[ 6 ] = 7;

    //  The .data() method maps internal GPU buffer to host accessible memory, and keeps the memory mapped
    bolt::cl::device_vector< int >::pointer pdV = dV.data( );

    //  These are fast writes to host accessible memory
    pdV[ 7 ] = 8;
    pdV[ 8 ] = 9;
    pdV[ 9 ] = 10;

    //  Unmaps the GPU buffer, updating the contents of GPU memory
    pdV.reset( );

    std::cout << "Device Vector contents: " << std::endl;
    for( size_t i = 0; i < vecSize; ++i )
    {
        std::cout << dV[ i ] << ", ";
    }
    return 0;
}
Example #4
TEST( VectorReverseIterator, ArithmeticAndEqual )
{
    bolt::BCKND::device_vector< int > dV( 5 );
    EXPECT_EQ( 5, dV.size( ) );

    bolt::BCKND::device_vector< int >::reverse_iterator myIter = dV.rbegin( );
    *myIter = 1;
    ++myIter;
    *myIter = 2;
    myIter++;
    *myIter = 3;
    myIter += 1;
    *(myIter + 0) = 4;
    *(myIter + 1) = 5;
    myIter += 1;

    EXPECT_EQ( 1, dV[ 4 ] );
    EXPECT_EQ( 2, dV[ 3 ] );
    EXPECT_EQ( 3, dV[ 2 ] );
    EXPECT_EQ( 4, dV[ 1 ] );
    EXPECT_EQ( 5, dV[ 0 ] );
}
Example #5
TEST( DeviceVector, Assign )
{
    bolt::BCKND::device_vector< int > dV( 3, 98 );
    std::vector< int > stdV( 3, 98 );
    EXPECT_EQ(dV[0], stdV[0]);
    dV.assign(10, 89 );
    stdV.assign(10, 89 );
    for(int i=0; i<10; i++)
    {
        EXPECT_EQ(stdV[i], dV[i]);
    }
    dV.assign(5, 98 );
    stdV.assign(5, 98 );
    EXPECT_EQ(stdV.capacity(), dV.capacity());
    EXPECT_EQ(stdV.size(), dV.size());

    for(int i=0; i<5; i++)
    {
        EXPECT_EQ(stdV[i], dV[i]);
    }
    std::vector<float> stdFloatVect(1024);
    for(int i=0; i<1024; i++)
    {
        stdFloatVect[i] = (float)i;
    }
    bolt::BCKND::device_vector<float> dvFloatVect(stdFloatVect.begin(), 1024);
    for(int i=0; i<1024; i++)
    {
        EXPECT_FLOAT_EQ((float)i, dvFloatVect[i]);
    }
    dvFloatVect.assign(1022, 89 );
    stdFloatVect.assign(1022, 89 );
    for(int i=0; i<1022; i++)
    {
        /*The loop only runs to 1022 elements because assign() resized both
        vectors to 1022; indexing past that would be out of bounds*/
        EXPECT_FLOAT_EQ(stdFloatVect[i], dvFloatVect[i]);
    }
}
Example #6
static PyObject*
PyCosmoObject_dV_vec(struct PyCosmoObject* self, PyObject* args) {
    PyObject* zObj=NULL, *resObj=NULL;
    double *z, *res;
    npy_intp n, i;

    if (!PyArg_ParseTuple(args, (char*)"O", &zObj)) {
        return NULL;
    }

    n = PyArray_SIZE(zObj);
    z = (double* )PyArray_DATA(zObj);

    resObj = PyArray_ZEROS(1, &n, NPY_FLOAT64, 0);
    res = (double* )PyArray_DATA(resObj);

    for (i=0; i<n; i++) {
        res[i] = dV(self->cosmo, z[i]); 
    }

    return resObj;

}
Example #7
TEST( VectorReference, OperatorValueType )
{
    bolt::BCKND::device_vector< int > dV( 5 );

    dV[ 0 ] = 1;
    dV[ 1 ] = 2;
    dV[ 2 ] = 3;
    dV[ 3 ] = 4;
    dV[ 4 ] = 5;

    std::vector< int > readBack( 5 );
    readBack[ 0 ] = dV[ 0 ];
    readBack[ 1 ] = dV[ 1 ];
    readBack[ 2 ] = dV[ 2 ];
    readBack[ 3 ] = dV[ 3 ];
    readBack[ 4 ] = dV[ 4 ];

    EXPECT_EQ( readBack[ 0 ], dV[ 0 ] );
    EXPECT_EQ( readBack[ 1 ], dV[ 1 ] );
    EXPECT_EQ( readBack[ 2 ], dV[ 2 ] );
    EXPECT_EQ( readBack[ 3 ], dV[ 3 ] );
    EXPECT_EQ( readBack[ 4 ], dV[ 4 ] );
}
Example #8
TEST( Vector, Erase )
{
    bolt::BCKND::device_vector< int > dV( 5 );
    EXPECT_EQ( 5, dV.size( ) );

    dV[ 0 ] = 1;
    dV[ 1 ] = 2;
    dV[ 2 ] = 3;
    dV[ 3 ] = 4;
    dV[ 4 ] = 5;

    bolt::BCKND::device_vector< int >::iterator myIter = dV.begin( );
    myIter += 2;

    bolt::BCKND::device_vector< int >::iterator myResult = dV.erase( myIter );
    EXPECT_EQ( 4, dV.size( ) );
    EXPECT_EQ( 4, *myResult );

    EXPECT_EQ( 1, dV[ 0 ] );
    EXPECT_EQ( 2, dV[ 1 ] );
    EXPECT_EQ( 4, dV[ 2 ] );
    EXPECT_EQ( 5, dV[ 3 ] );
}
Example #9
	/// <summary>
	/// Computes a gradient-based focus measure: the mean of the squared
	/// maximum of horizontal and vertical absolute differences, normalized
	/// by the largest possible squared difference (255^2) for 8-bit input.
	/// </summary>
	/// <returns>The focus measure value</returns>
	double BasicFM::computeROGR()
	{
		if (checkInput()) {

			cv::Mat dH = mSrcImg(cv::Range::all(), cv::Range(1, mSrcImg.cols)) - mSrcImg(cv::Range::all(), cv::Range(0, mSrcImg.cols - 1));
			cv::Mat dV = mSrcImg(cv::Range(1, mSrcImg.rows), cv::Range::all()) - mSrcImg(cv::Range(0, mSrcImg.rows - 1), cv::Range::all());
			dH = cv::abs(dH);
			dV = cv::abs(dV);

			cv::Mat FM = cv::max(dH(cv::Range(0, dH.rows - 1), cv::Range::all()), dV(cv::Range::all(), cv::Range(0, dV.cols - 1)));
			FM = FM.mul(FM);

			cv::Scalar m = cv::mean(FM);
			cv::Mat tmp;
			FM.convertTo(tmp, CV_32F);


            double r = 255.0*255.0;
			//mVal = r > 0 ? m[0] / r : m[0];
			mVal = m[0] / r;
		}

		return mVal;
	}
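
Read directly off the code above, the measure is

\[
\mathrm{FM} = \max\bigl(\lvert dH\rvert, \lvert dV\rvert\bigr)^{2},
\qquad
mVal = \frac{\operatorname{mean}(\mathrm{FM})}{255^{2}},
\]

where dH and dV are the horizontal and vertical first differences of the source image, so the mean is normalized by the largest possible squared difference for 8-bit input.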
Example #10
/**
    Purpose
    -------
    DLAHR2 reduces the first NB columns of a real general n-by-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by DGEHRD.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.

    @param[in]
    k       INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    @param[in]
    nb      INTEGER
            The number of columns to be reduced.

    @param[in,out]
    A       DOUBLE_PRECISION array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE_PRECISION array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    @param[out]
    T       DOUBLE_PRECISION array, dimension (LDT,NB)
            The upper triangular matrix T.

    @param[in]
    ldt     INTEGER
            The leading dimension of the array T.  LDT >= NB.

    @param[out]
    Y       DOUBLE_PRECISION array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    @param[in]
    ldy     INTEGER
            The leading dimension of the array Y. LDY >= N.

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

    @verbatim
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )
    @endverbatim

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    @ingroup magma_dgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_dlahr2_m(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    double *A, magma_int_t lda,
    double *tau,
    double *T, magma_int_t ldt,
    double *Y, magma_int_t ldy,
    struct dgehrd_data* data )
{
    #define  A(  i, j ) ( A + (i) + (j)*lda)
    #define  Y(  i, j ) ( Y + (i) + (j)*ldy)
    #define  T(  i, j ) ( T + (i) + (j)*ldt)
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)

    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double tmp;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t ione = 1;
    
    magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
    magma_int_t n_k_i_1, n_k;
    double scale;

    magma_int_t i;
    double ei = MAGMA_D_ZERO;

    magma_int_t info_data = 0;
    magma_int_t *info = &info_data;
    if (n < 0) {
        *info = -1;
    } else if (k < 0 || k >= n) {
        *info = -2;
    } else if (nb < 1 || nb > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldt < nb) {
        *info = -8;
    } else if (ldy < max(1,n)) {
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return 0;
    
    // zero out current top block of V on all GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv );
    }
    
    // set all Y=0
    lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Finish applying I - V * T * V' on right
            tmp = MAGMA_D_NEGATE( tau[i-1] );
            blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_dcopy( &i,
                           A(k+1,i), &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_dtrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_dgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1), &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_dgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_dtrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_daxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i), &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_dlarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // compute yi = A vi = sum_g A{d} vi{d}
        nblocks = (n-1) / nb / ngpu + 1;
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magmablasSetKernelStream( data->streams[d] );
            
            // dV(k+i+1:n-1, i) = VA(k+i:n, i)
            magma_dsetvector_async( n_k_i_1,
                                    A(k+i+1,i), 1,
                                    dV(d, k+i+1, i), 1, data->streams[d] );
            
            // copy column of dV -> dVd, using block cyclic distribution.
            // This assumes V and Vd have been padded so that
            // a 2D matrix copy doesn't access them out-of-bounds
            gblock = k / nb;
            lblock = gblock / ngpu;
            lgid   = gblock % ngpu;
            if ( d < lgid ) {
                lblock += 1;
            }
            // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
            magmablas_dlacpy( MagmaFull, nb, nblocks-lblock,
                              dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
                              dVd(d, 0    + lblock*nb,      i), nb );
            
            // convert global indices (k) to local indices (dk)
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            
            // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
            // skip if matrix is empty
            // each GPU copies to different temporary vector in Y,
            // which are summed in separate loop below
            if ( dn-dki1 > 0 ) {
                magma_dgemv( MagmaNoTrans, n-k, dn-dki1,
                             c_one,  dA (d, k,    dki1), ldda,
                                     dVd(d, dki1,    i), 1,
                             c_zero, dY (d, k,       i), 1 );
                
                // copy vector to host, storing in column nb+d of Y
                // as temporary space (Y has >= nb+ngpu columns)
                magma_dgetvector_async( n-k,
                                        dY(d, k, i), 1,
                                        Y(k, nb+d),  1, data->streams[d] );
            }
        }
        
        // while GPU is doing above Ag*v...
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_D_NEGATE( tau[i] );
        blasf77_dgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i), &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];
        
        // apply reflectors to next column, A(i+1), on right only.
        // one axpy will be required to finish this, in the next iteration above
        if ( i > 0 && i+1 < nb ) {
            // Update next column, A(k:n,i+1), applying Q on right.
            // One axpy will be required to finish this, in the next iteration
            // above, after yi is computed.
            // This updates one more row than LAPACK does (row k),
            // making block above panel an even multiple of nb.
            // Use last column of T as workspace, w.
            magma_int_t i1 = i+1;
            
            // If complex, conjugate row of V, and undo afterwards
            // (a no-op for real data; compiled out by the #if below)
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
            // T is now rectangular, so we use gemv instead of trmv as in lapack.
            blasf77_dgemv( "No trans", &i, &i1,
                           &c_one,  T(0,0), &ldt,
                                    A(k+i1,0), &lda,
                           &c_zero, T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_dlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            
            // A(k:n, i+1) -= Y(k:n, 0:i) * w
            blasf77_dgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0), &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i1), &ione );
        }
        
        // yi = sum_g yi{d}
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magma_queue_sync( data->streams[d] );
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            if ( dn-dki1 > 0 ) {
                // yi = yi + yi{d}
                blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
            }
        }
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;
    
    // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        
        // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
        // skip if matrix is empty
        // each GPU copies to different temporary block in Y,
        // which are summed in separate loop below
        if ( dn-dki1 > 0 ) {
            magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1,
                         c_one,  dA (d, 0,    dki1), ldda,
                                 dVd(d, dki1,    0), ldvd,
                         c_zero, dY (d, 0,       0), ldda );
            
            // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
            // as temporary space (Y has nb + nb*ngpu columns)
            magma_dgetmatrix_async( k, nb,
                                    dY(d, 0, 0),  ldda,
                                    Y(0,nb+nb*d), ldy, data->streams[d] );
        }
    }
    
    // Y = sum_g Y{d}
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_queue_sync( 0 );
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        if ( dn-dki1 > 0 ) {
            // Y = Y + Am V
            for( i = 0; i < nb; ++i ) {
                blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
            }
        }
    }
    
    // copy Y and T matrices to GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] );
        magma_dsetmatrix_async( nb, nb, T, nb, dTi(d),      nb,   data->streams[d] );
    }

    return 0;
} /* magma_dlahr2 */
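
A compact restatement of the per-column update in the i > 0 branch above (read off the inline comments): writing the current panel column as b, the code forms w = T' (V' b) with two trmv calls plus a gemv for the rectangular part, then overwrites

\[
b \leftarrow b - V\,w = (I - V\,T'\,V')\,b,
\]

exploiting the fact that the top block V1 of V is unit lower triangular.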
Example #11
/**
    Purpose
    -------
    DGEHRD reduces a DOUBLE PRECISION general matrix A to upper Hessenberg form H by
    an orthogonal similarity transformation:  Q' * A * Q = H . This version
    stores the triangular matrices used in the factorization so that they can
    be applied directly (i.e., without being recomputed) later. As a result,
    the application of Q is much faster.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    ilo     INTEGER
    @param[in]
    ihi     INTEGER
            It is assumed that A is already upper triangular in rows
            and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
            set by a previous call to DGEBAL; otherwise they should be
            set to 1 and N respectively. See Further Details.
            1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the N-by-N general matrix to be reduced.
            On exit, the upper triangle and the first subdiagonal of A
            are overwritten with the upper Hessenberg matrix H, and the
            elements below the first subdiagonal, with the array TAU,
            represent the orthogonal matrix Q as a product of elementary
            reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
            zero.

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= N*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE PRECISION array on the GPU, dimension NB*N,
            where NB is the optimal blocksize. It stores the NB*NB blocks
            of the triangular T matrices used in the reduction.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ---------------
    The matrix Q is represented as a product of (ihi-ilo) elementary
    reflectors

       Q = H(ilo) H(ilo+1) . . . H(ihi-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
    exit in A(i+2:ihi,i), and tau in TAU(i).

    The contents of A are illustrated by the following example, with
    n = 7, ilo = 2 and ihi = 6:

    @verbatim
    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    (  a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  v4  h   h   h )
    (                         a )    (                          a )
    @endverbatim

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    
    This version stores the T matrices in dT, for later use in magma_dorghr.

    @ingroup magma_dgeev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgehrd(
    magma_int_t n, magma_int_t ilo, magma_int_t ihi,
    double *A, magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    magmaDouble_ptr dT,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)

    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dwork, ((i_) + (j_)*ldda + nb*ldda*2)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dV(i_,j_)  dwork, ((i_) + (j_)*ldda + nb*ldda)
    #define dwork(i_)  dwork, ((i_))
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dV(i_,j_) (dV    + (i_) + (j_)*ldda)
    #define dwork(i_) (dwork + (i_))
    #endif

    // Constants
    const double c_one  = MAGMA_D_ONE;
    const double c_zero = MAGMA_D_ZERO;

    // Local variables
    magma_int_t nb = magma_get_dgehrd_nb( n );
    magma_int_t ldda = magma_roundup( n, 32 );

    magma_int_t i, nh, iws;
    magma_int_t iinfo;
    magma_int_t lquery;

    *info = 0;
    iws = n*nb;
    work[0] = magma_dmake_lwork( iws );

    lquery = (lwork == -1);
    if (n < 0) {
        *info = -1;
    } else if (ilo < 1 || ilo > max(1,n)) {
        *info = -2;
    } else if (ihi < min(ilo,n) || ihi > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (lwork < iws && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    // Adjust from 1-based indexing
    ilo -= 1;
    
    // Quick return if possible
    nh = ihi - ilo;
    if (nh <= 1) {
        work[0] = c_one;
        return *info;
    }

    // Now requires lwork >= iws; else dT won't be computed in unblocked code.
    // If not enough workspace, use unblocked code
    //if ( lwork < iws ) {
    //    nb = 1;
    //}

    if (nb == 1 || nb > nh) {
        // Use unblocked code below
        i = ilo;
    }
    else {
        // Use blocked code
        magma_queue_t queue;
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queue );
        
        // GPU workspace is:
        //   nb*ldda for dwork for dlahru
        //   nb*ldda for dV
        //   n*ldda  for dA
        magmaDouble_ptr dwork;
        if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        double *dV = dwork + nb*ldda;
        double *dA = dwork + nb*ldda*2;
        
        double *T;
        magma_dmalloc_cpu( &T, nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        
        // zero first block of V, which is lower triangular
        magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue );
        
        // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero
        for (i = 0; i < ilo; ++i)
            tau[i] = c_zero;
        
        for (i = max(0,ihi-1); i < n-1; ++i)
            tau[i] = c_zero;
        
        assert( nb % 4 == 0 );
        for (i=0; i < nb*nb; i += 4)
            T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero;
        
        magmablas_dlaset( MagmaFull, nb, n, c_zero, c_zero, dT(0,0), nb, queue );
        
        // Copy the matrix to the GPU
        magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue );
        
        for (i = ilo; i < ihi-1 - nb; i += nb) {
            // Reduce columns i:i+nb-1 to Hessenberg form, returning the
            // matrices V and T of the block reflector H = I - V*T*V'
            // which performs the reduction, and also the matrix Y = A*V*T
            
            // Get the current panel (no need for the 1st iteration)
            magma_dgetmatrix( ihi-i, nb,
                              dA(i,i-ilo), ldda,
                              A(i,i), lda, queue );
            
            // add 1 to i for 1-based index
            magma_dlahr2( ihi, i+1, nb,
                          dA(0,i-ilo), ldda,
                          dV(0,0),     ldda,
                          A(0,i),      lda,
                          &tau[i], T, nb, work, n, queue );
            
            // Copy T from the CPU to dT on the GPU
            magma_dsetmatrix( nb, nb, T, nb, dT(0,i-ilo), nb, queue );
            
            magma_dlahru( n, ihi, i, nb,
                          A(0,i),      lda,
                          dA(0,i-ilo), ldda, // dA
                          dA(i,i-ilo), ldda, // dY, stored over current panel
                          dV(0,0),     ldda,
                          dT(0,i-ilo), dwork(0), queue );
        }
        
        // Copy remainder to host
        magma_dgetmatrix( n, n-i,
                          dA(0,i-ilo), ldda,
                          A(0,i), lda, queue );
        
        magma_free( dwork );
        magma_free_cpu( T );
        
        magma_queue_destroy( queue );
    }

    // Use unblocked code to reduce the rest of the matrix
    // add 1 to i for 1-based index
    i += 1;
    lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo);
    work[0] = magma_dmake_lwork( iws );

    return *info;
} /* magma_dgehrd */
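
A minimal call sketch for the workspace-query convention documented above (a sketch only, using routines that already appear in this listing; error handling and matrix initialization omitted):

    magma_int_t n = 1000, lda = n, info;
    magma_int_t nb = magma_get_dgehrd_nb( n );
    double *A, *tau, *work, work_query[1];
    magmaDouble_ptr dT;
    magma_dmalloc_cpu( &A,   lda*n );
    magma_dmalloc_cpu( &tau, n-1 );
    magma_dmalloc( &dT, nb*n );  // NB*N, per the dT parameter description
    // lwork = -1: query the optimal workspace size, returned in work_query[0]
    magma_dgehrd( n, 1, n, A, lda, tau, work_query, -1, dT, &info );
    magma_int_t lwork = (magma_int_t) work_query[0];
    magma_dmalloc_cpu( &work, lwork );
    magma_dgehrd( n, 1, n, A, lda, tau, work, lwork, dT, &info );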
Example #12
double DDphi(double N, double phi, double Dphi, double V0, double q, double phi0)
/* second derivative of the scalar field phi with respect to e-fold number N; */
/* used when evaluating and integrating phi */
{
	return -(3 -(Dphi*Dphi)/2.)*Dphi -(dV(phi, V0, q, phi0)/(2*V(phi, V0, q, phi0)))*(6 -(Dphi*Dphi));
}
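
For reference, DDphi implements the background field equation in e-fold time N (a transcription of the return statement above, with dV = V'):

\[
\frac{d^2\phi}{dN^2}
= -\left(3 - \frac{1}{2}\Bigl(\frac{d\phi}{dN}\Bigr)^{2}\right)\frac{d\phi}{dN}
  \;-\; \frac{V'(\phi)}{2\,V(\phi)}\left(6 - \Bigl(\frac{d\phi}{dN}\Bigr)^{2}\right),
\]

which is the standard inflationary Klein–Gordon equation rewritten with the e-fold number as the time variable.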
Example #13
TIMESTAMP node::postsync(TIMESTAMP t0) 
{
	OBJECT *hdr = OBJECTHDR(this);
	node *swing = hdr->parent?OBJECTDATA(hdr->parent,node):this;
	complex dV(0.0);
	complex YY = Ys + complex(G,B);
	// copy values that might get updated while we work on this object
	complex old_YVs = YVs;
#ifdef HYBRID
	swing->del_inj_residual(this);
#endif
	if (!YY.IsZero() || type==SWING)
	{
		switch (type) {
		case PV:
			S.Im() = ((~V*(YY*V-old_YVs)).Im());
			if (Qmin_MVAR<Qmax_MVAR && S.Im()<Qmin_MVAR) 
				S.Im() = Qmin_MVAR;
			else if (Qmax_MVAR>Qmin_MVAR && S.Im()>Qmax_MVAR) 
				S.Im() = Qmax_MVAR;
			//else
			{
				complex Vnew = (-(~S/~V) + old_YVs) / YY;
				Vnew.SetPolar(V.Mag(),Vnew.Arg());
#ifdef HYBRID
				if (Vstdev>0)
				{
					double pr = swing->get_obs_probability();
					swing->del_obs_residual(this);
					dV = Vobs*(1-pr) + (Vnew)*pr - V;
					V += dV;
					swing->add_obs_residual(this);
				}
				else
#endif
				{
					dV = Vnew - V;
					V = Vnew;
				}
				break;
			}
			/* continue with PQ solution */
		case PQ:
			if (!V.IsZero())
			{
				complex Vnew = (-(~S/~V) + old_YVs) / YY;
#ifdef HYBRID
				if (Vstdev>0) // need to consider observation
				{
					double pr = swing->get_obs_probability();
					swing->del_obs_residual(this);
					dV = Vobs*(1-pr) + (Vnew)*pr - V;
					V += dV;
					swing->add_obs_residual(this);
				}
				else // no observation 
#endif
				{
					dV = (Vnew - V)*acceleration_factor;
					V += dV;
				}
				V.Notation() = A;
			}
			break;
		case SWING:
			S = ~(~V*(YY*V - YVs));
			S.Notation() = J;
			break;
		default:
			/* unknown type fails */
			gl_error("invalid bus type");
			return TS_ZERO;
		}
	}
#ifdef HYBRID
	swing->add_inj_residual(this);
#endif

#ifdef _DEBUG
	// node debugging
	if (debug_node>0)
	{
		OBJECT* obj = OBJECTHDR(this);
		static int first=-1;
		if (first==-1) first = obj->id;
		if (obj->id==first)
		{
			printf("\n");
			printf("Node           Type  V                 Vobs              Stdev    Power             G        B        dV       Pr{Vobs} r2     Sr2\n");
			printf("============== ===== ================= ================= ======== ================= ======== ======== ======== ======== ====== ======\n");
		}
		if (((debug_node&1)==1 && dV.Mag()>convergence_limit )  // only on dV
#ifdef HYBRID
			|| ((debug_node&2)==2 && Vstdev>0 ) // only on observation
			|| ((debug_node&4)==4 && get_inj_residual()>0.001)// non-zero power residual
#endif
			)
		{
			printf("%2d (%-9.9s) %5s %+8.4f%+8.3fd %+8.4f%+8.3fd %8.5f %+8.4f%+8.4fj %+8.5f ", 
				obj->id, obj->name, type==SWING?"SWING":(type==PQ?"PQ   ":"PV"),
				V.Mag(),V.Arg()*180/3.1416, 
				Vobs.Mag(), Vobs.Arg()*180/3.1416, Vstdev,
				S.Re(), S.Im(), 
				G, B,
				dV.Mag());
#ifdef HYBRID
			printf("%+8.5f %8.5f ", get_obs_probability(), r2);
			if (Vstdev>0)
				printf("%8.5f %6.3f %6.3f\n", get_obs_probability(), r2, get_inj_residual());
			else
				printf("   --      --   %6.3f\n",get_inj_residual());
#else
			printf("\n");
#endif
		}
	}
#endif // _DEBUG

	// send dV through all links
	LINKLIST *item;
	for (item=linklist; item!=NULL; item=item->next)
		item->data->apply_dV(hdr,dV);

	if (dV.Mag()>convergence_limit)
		return t0; /* did not converge, hold the clock */
	else
		return TS_NEVER; /* converged, no further updates needed */
}
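
Written out, the PQ branch above is a Gauss–Seidel-style voltage update (an interpretation of the code, in which ~ is complex conjugation, YY is the self-admittance, and YVs accumulates the neighbor contributions):

\[
V_{\mathrm{new}} = \frac{-\overline{(S/V)} + \mathrm{YVs}}{\mathrm{YY}},
\qquad
V \leftarrow V + \alpha\,(V_{\mathrm{new}} - V),
\]

with α the acceleration_factor; the node keeps iterating (returns t0) until |dV| falls below convergence_limit.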
Example #14
int Look_txt()
{
	TCHAR filter[] =     TEXT("Ghemical MD results File (*.txt)\0*.txt\0")
						 TEXT("All Files (*.*)\0*.*\0");
	TCHAR fpath[1024];
	TCHAR filename[1024];

	filename[0] = '\0';  // start with an empty file name
	{
		DWORD nFilterIndex;

		vector<string> names;
		vector<string> *pnames = &names;
		vector<vector<double> > vectors;
		vectors.reserve(2000000);

		while (OpenFileDlg(0, filter, fpath, nFilterIndex) == S_OK)
		{		
			ReadDatFile(NULL, fpath, filename, &vectors, pnames);
			pnames = NULL;

			printf("\nfilename %s\n\n", filename);

			int cols = names.size();
			int rows = vectors.size();
			
#if WRITE_LOCKED_FORCES
			int cMom = 4 - 1;
			int cVx = 5 - 1;
			int cFxup = 14 - 1;
			int cFxdw = 17 - 1;

			int cVxup = 8 - 1;
			int cVxdw = 11 - 1;
#endif
#if WRITE_WORKED_FORCES
			int cMom = 4 - 1;

			int cVx = 5 - 1;
			int cVxup = 14 - 1;
			int cVxdw = 17 - 1;

			int cVx_wk_up = 8 - 1;
			int cVx_wk_dw = 11 - 1;

			int cFx_wk_up = 20 - 1;
			int cFx_wk_dw = 23 - 1;

#endif

			vector<double> means(cols, 0.0);


			printf("vectors.size() = %d\n",rows);
			printf("names.size() = %d\n", cols);

			for (vector<vector<double> >::iterator it = vectors.begin();
			it != vectors.end(); it++)
			{
				for (int c = 0; c < cols; c++)
				{
					means[c] += (*it)[c];
				}
			}

			for (int c = 0; c < cols; c++)
			{
				means[c] /= rows;
				printf("mean(%s) = %f\n", names[c].c_str(), means[c]);
			}

#if WRITE_LOCKED_FORCES || WRITE_WORKED_FORCES
			int r0 = 0;

			cout << "enter r0\n";
			cin >> r0;
#endif

#if WRITE_LOCKED_FORCES
			vector<double> dF(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				dF[r-r0] = vectors[r][cFxup] - vectors[r][cFxdw];
			}

			Statistika (dF, "dF");

			vector<double> Mom(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				Mom[r-r0] = vectors[r][cMom];
			}

			Statistika (Mom, "Mom");

			vector<double> dV(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				dV[r-r0] = vectors[r][cVxup] - vectors[r][cVxdw];
			}

			Statistika (dV, "dV");

			vector<double> Vx(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				Vx[r-r0] = vectors[r][cVx];
			}

			Statistika (Vx, "Vx");
#endif
#if WRITE_WORKED_FORCES
			vector<double> dF_wk(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				dF_wk[r-r0] = vectors[r][cFx_wk_up] - vectors[r][cFx_wk_dw];
			}

			Statistika (dF_wk, "dF_wk");


			vector<double> dV_wk(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				dV_wk[r-r0] = vectors[r][cVx_wk_up] - vectors[r][cVx_wk_dw];
			}

			Statistika (dV_wk, "dV_wk");

			//if (!worked[n1])
			vector<double> Mom(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				Mom[r-r0] = vectors[r][cMom];
			}

			Statistika (Mom, "Mom");

			vector<double> dV(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				dV[r-r0] = vectors[r][cVxup] - vectors[r][cVxdw];
			}

			Statistika (dV, "dV");

			vector<double> Vx(rows-r0);
			for (int r = r0; r < rows; r++)
			{
				Vx[r-r0] = vectors[r][cVx];
			}

			Statistika (Vx, "Vx");
#endif
		}
	}
	/*else
	{
		DWORD nFilterIndex;
		if (SaveFileDlg(0, filename, filter, nFilterIndex) == S_OK)
		{
			SetDlgItemText(ref->hDlg,IDC_EDIT_TRAJFILE2, filename);
		}	
	}*/
	
	printf("Hello World!\n");
	return 0;

}
Example #15
extern "C" magma_int_t
magma_clahr2(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaFloatComplex *dA, magmaFloatComplex *dV,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *T, magma_int_t ldt,
    magmaFloatComplex *Y, magma_int_t ldy )
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    CLAHR2 reduces the first NB columns of a complex general n-by-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by CGEHRD.

    Arguments
    =========
    N       (input) INTEGER
            The order of the matrix A.

    K       (input) INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    NB      (input) INTEGER
            The number of columns to be reduced.

    dA      (input/output) COMPLEX array on the GPU, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements in rows K:N of the first NB columns are
            overwritten with the matrix Y.

    DV      (output) COMPLEX array on the GPU, dimension (N, NB)
            On exit this contains the Householder vectors of the transformation.

    A       (input/output) COMPLEX array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    TAU     (output) COMPLEX array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    T       (output) COMPLEX array, dimension (LDT,NB)
            The upper triangular matrix T.

    LDT     (input) INTEGER
            The leading dimension of the array T.  LDT >= NB.

    Y       (output) COMPLEX array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    LDY     (input) INTEGER
            The leading dimension of the array Y. LDY >= N.

    Further Details
    ===============
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    =====================================================================    */

    #define  A( i, j ) ( A + (i) + (j)*lda)
    #define  Y( i, j ) ( Y + (i) + (j)*ldy)
    #define  T( i, j ) ( T + (i) + (j)*ldt)
    #define dA( i, j ) (dA + (i) + (j)*ldda)
    #define dV( i, j ) (dV + (i) + (j)*ldda)
    
    magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t ldda = lda;
    magma_int_t ione = 1;
    
    magma_int_t n_k_i_1, n_k;
    magmaFloatComplex scale;

    magma_int_t i;
    magmaFloatComplex ei = MAGMA_C_ZERO;

    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return 0;
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Update A(k:n-1,i); Update i-th column of A - Y * T * V'
            // This updates one more row than LAPACK does (row k),
            // making the block above the panel an even multiple of nb.
            // Use last column of T as workspace, w.
            // w(0:i-1, nb-1) = VA(k+i, 0:i-1)'
            blasf77_ccopy( &i,
                           A(k+i,0),  &lda,
                           T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            // If complex, conjugate row of V.
            lapackf77_clacgv(&i, T(0,nb-1), &ione);
            #endif
            
            // w = T(0:i-1, 0:i-1) * w
            blasf77_ctrmv( "Upper", "No trans", "No trans", &i,
                           T(0,0),    &ldt,
                           T(0,nb-1), &ione );
            
            // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w
            blasf77_cgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0),    &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i),    &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_ccopy( &i,
                           A(k+1,i),  &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_ctrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_cgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1),  &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_ctrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_cgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1),  &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_ctrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_caxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i),    &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_clarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i)
        magma_csetvector( n_k_i_1,
                          A(k+i+1,i), 1,
                          dV(i+1,i),  1 );
        
        // Compute Y(k+1:n,i) = A vi
        // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i)
        magma_cgemv( MagmaNoTrans, n_k, n_k_i_1,
                     c_one,  dA(k,i+1), ldda,
                             dV(i+1,i),   ione,
                     c_zero, dA(k,i),     ione );
        
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_C_NEGATE( tau[i]);
        blasf77_cgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i),     &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_ctrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];

        // Y(k:n-1, i) = dA(k:n-1, i)
        magma_cgetvector( n-k,
                          dA(k,i), 1,
                          Y(k,i),  1 );
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;

    return 0;
} // magma_clahr2
Example #16
bool VStableSolve::isVStable()
{
    Variable x1,x2,a1,a2;//,a;
//    double a=10;
//    double _ii[2][2]={{-10,-10},{-10,10}};
//    IntervalVector a(2,_ii);
    Function f_sup("f_sup.txt");
    Function f_inf("f_inf.txt");
    Function V("V.txt");
    Function dV("gradV.txt");

    NumConstraint c1_c(x1,x2,a1,a2,a1*dV(x1,x2)[0]+a2*dV(x1,x2)[1]>=0);

    NumConstraint c21_c(x1,x2,a1,a2,f_sup(x1,x2)[0]-a1>=0);
    NumConstraint c22_c(x1,x2,a1,a2,f_sup(x1,x2)[1]-a2>=0);

    NumConstraint c31_c(x1,x2,a1,a2,a1-f_inf(x1,x2)[0]>=0);
    NumConstraint c32_c(x1,x2,a1,a2,a2-f_inf(x1,x2)[1]>=0);

    NumConstraint c41_c(x1,x2,V(x1,x2)>=0);
    NumConstraint c42_c(x1,x2,V(x1,x2)<=v_bar);

    CtcFwdBwd c1(c1_c);

    CtcFwdBwd c21(c21_c);
    CtcFwdBwd c22(c22_c);
    CtcCompo c2(c21,c22);

    CtcFwdBwd c31(c31_c);
    CtcFwdBwd c32(c32_c);
    CtcCompo c3(c31, c32);

    CtcFwdBwd c41(c41_c);
    CtcFwdBwd c42(c42_c);
    CtcCompo c4(c41,c42);

    CtcCompo cOut1(c1,c2);
    CtcCompo cOut2(c3,c4);

    CtcCompo cOut(cOut1,cOut2);

    // Build the initial box.
    IntervalVector box(4);
    box[0]=Interval(-10,10);
    box[1]=Interval(-10,10);
    box[2]=Interval::ALL_REALS;
    box[3]=Interval::ALL_REALS;

    // Build the way boxes will be bisected.
    // "LargestFirst" means that the dimension bisected
    // is always the largest one.
    LargestFirst lf;

    stack<IntervalVector> s;
    s.push(box);

    while (!s.empty()) {
        // Get a copy of the current box (on top of the stack)
        IntervalVector box=s.top();


//        qDebug() << "a[0]= [" <<box[2].lb() << "; " << box[2].ub() << "], a[1]= [" << box[3].lb() << "; " <<box[3].ub() << "]" << endl;
        // Remove the box from the stack
        s.pop();
        try {

//            // Remove the part that is inside
////            contract_and_draw(cOut,box,Qt::darkBlue,Qt::cyan);
            IntervalVector initbox=box;       // get a copy
            try {
//                cOut.contract(box);
//                if (box==initbox) return;     // nothing contracted.
                if (box != initbox) {
                    IntervalVector* rest;
//                    int n=initbox.diff(box,rest); // calculate the set difference
//                    for (int i=0; i<n; i++) {     // display the boxes
//                        frame.DrawBox(rest[i][0],rest[i][1],QPen(pencolor),QBrush(brushcolor));
//                    }
//                    delete[] rest;
                }
            } catch(EmptyBoxException&) {
//                frame.DrawBox(initbox[0],initbox[1],QPen(pencolor),QBrush(brushcolor));
            }


//            if (box.is_empty()) { continue; }

//            // Check if the box is small enough
            if (box.max_diam()<0.0001){//epsilon) {
                return false; // an uncertain situation was found: conclude not V-stable
//                frame.DrawBox(box[0],box[1],QPen(Qt::yellow),QBrush(Qt::yellow));
            } else {
                // otherwise, bisect it and
                // push the two subboxes on the stack.
                pair<IntervalVector,IntervalVector> boxes=lf.bisect(box);
                s.push(boxes.first);
                s.push(boxes.second);
            }
        } catch(EmptyBoxException&) { }
    }

return true;
}
Example #17
Sivia::Sivia(Frame& frame, double epsilon) : frame(frame) {

    // Create the function we want to apply SIVIA on.
    Variable x1,x2,a1,a2;

    Function f_sup("f_sup.txt");
    Function f_inf("f_inf.txt");
    Function V("V.txt");
    Function dV("gradV.txt");

    NumConstraint c1_c(x1,x2,a1,a2,a1*dV(x1,x2)[0]+a2*dV(x1,x2)[1]>=0);

    NumConstraint c21_c(x1,x2,a1,a2,f_sup(x1,x2)[0]-a1>=0);
    NumConstraint c22_c(x1,x2,a1,a2,f_sup(x1,x2)[1]-a2>=0);

    NumConstraint c31_c(x1,x2,a1,a2,a1-f_inf(x1,x2)[0]>=0);
    NumConstraint c32_c(x1,x2,a1,a2,a2-f_inf(x1,x2)[1]>=0);

    NumConstraint c41_c(x1,x2,V(x1,x2)>=0);
    NumConstraint c42_c(x1,x2,V(x1,x2)<=v_bar);

    CtcFwdBwd c1(c1_c);

    CtcFwdBwd c21(c21_c);
    CtcFwdBwd c22(c22_c);
    CtcCompo c2(c21,c22);

    CtcFwdBwd c31(c31_c);
    CtcFwdBwd c32(c32_c);
    CtcCompo c3(c31, c32);

    CtcFwdBwd c41(c41_c);
    CtcFwdBwd c42(c42_c);
    CtcCompo c4(c41,c42);

    CtcCompo cOut1(c1,c2);
    CtcCompo cOut2(c3,c4);

    CtcCompo cOut(cOut1,cOut2);

    NumConstraint c1_cIn(x1,x2,a1,a2,a1*dV(x1,x2)[0]+a2*dV(x1,x2)[1]>=0);

    NumConstraint c21_cIn(x1,x2,a1,a2,f_sup(x1,x2)[0]-a1<0);
    NumConstraint c22_cIn(x1,x2,a1,a2,f_sup(x1,x2)[1]-a2<0);

    NumConstraint c31_cIn(x1,x2,a1,a2,a1-f_inf(x1,x2)[0]<0);
    NumConstraint c32_cIn(x1,x2,a1,a2,a2-f_inf(x1,x2)[1]<0);

    NumConstraint c41_cIn(x1,x2,V(x1,x2)<0);
    NumConstraint c42_cIn(x1,x2,V(x1,x2)>v_bar);

    CtcFwdBwd c1_In(c1_cIn);

    CtcFwdBwd c21_In(c21_cIn);
    CtcFwdBwd c22_In(c22_cIn);
    CtcUnion c2_In(c21_In, c22_In);

    CtcFwdBwd c31_In(c31_cIn);
    CtcFwdBwd c32_In(c32_cIn);
    CtcUnion c3_In(c31_In, c32_In);

    CtcFwdBwd c41_In(c41_cIn);
    CtcFwdBwd c42_In(c42_cIn);
    CtcUnion c4_In(c41_In, c42_In);

    CtcUnion cIn1(c1_In, c2_In);
    CtcUnion cIn2(c3_In, c4_In);

    CtcUnion cIn(cIn1, cIn2);

    // Build the initial box.
    IntervalVector box(4);
    box[0]=Interval(-10,10);
    box[1]=Interval(-10,10);
    box[2]=Interval::ALL_REALS;
    box[3]=Interval::ALL_REALS;

    // Build the way boxes will be bisected.
    // "LargestFirst" means that the dimension bisected
    // is always the largest one.
    LargestFirst lf;

    stack<IntervalVector> s;
    s.push(box);

    while (!s.empty()) {
        // Get a copy of the current box (on top of the stack)
        IntervalVector box=s.top();


//        qDebug() << "a[0]= [" <<box[2].lb() << "; " << box[2].ub() << "], a[1]= [" << box[3].lb() << "; " <<box[3].ub() << "]" << endl;
        // Remove the box from the stack
        s.pop();
        try {
            // Remove the part that is outside
            contract_and_draw(cIn,box,Qt::magenta,Qt::red);
            if (box.is_empty()) { continue; }

            // Remove the part that is inside
            contract_and_draw(cOut,box,Qt::darkBlue,Qt::cyan);
            if (box.is_empty()) { continue; }

            // Check if the box is small enough
            if (box.max_diam()<epsilon) {
                frame.DrawBox(box[0],box[1],QPen(Qt::yellow),QBrush(Qt::yellow));
            } else {
                // otherwise, bisect it and
                // push the two subboxes on the stack.
                pair<IntervalVector,IntervalVector> boxes=lf.bisect(box);
                s.push(boxes.first);
                s.push(boxes.second);
            }
        } catch(EmptyBoxException&) { }
    }


    frame.Save("paving");
}
Example #18
void ChromoConditions::recalculateSSConcentrations()
{
    mSSConcentrations.clear();
    if (gradient().empty())
    {
        return;
    }
    
    // If the gradient is isocratic, add a single point only.
    if ((gradient().size() == 2) 
         && (gradient().front().concentrationB() 
             == gradient().back().concentrationB()))
    {
        mSSConcentrations.push_back(
            (100.0 - gradient().front().concentrationB()) / 100.0 
            * secondSolventConcentrationA() 
            + gradient().front().concentrationB() / 100.0
            * secondSolventConcentrationB());
        return;
    }

    double secondSolventConcentrationPump = 0.0;
    double time = dV() / 2.0 / flowRate();
    int segmentNum = -1;
    double localSlope = 0.0;
    double initialSSConcentration, finalSSConcentration, initialTime,
           finalTime, pumpedConcentration;

    while (true)
    {
        if (time > gradient()[segmentNum+1].time())
        {
            if (segmentNum < int(gradient().size() - 1))
            {
                segmentNum += 1;
                initialSSConcentration =
                    (100.0 - gradient()[segmentNum].concentrationB()) / 100.0 
                    * secondSolventConcentrationA() 
                    + gradient()[segmentNum].concentrationB() / 100.0
                    * secondSolventConcentrationB();
                finalSSConcentration =
                    (100.0 - gradient()[segmentNum+1].concentrationB()) / 100.0 
                    * secondSolventConcentrationA() 
                    + gradient()[segmentNum+1].concentrationB() / 100.0
                    * secondSolventConcentrationB();
                initialTime = gradient()[segmentNum].time();
                finalTime = gradient()[segmentNum+1].time();
                localSlope = (finalSSConcentration - initialSSConcentration) 
                    / (finalTime - initialTime);
            }
            else
            {
                break;
            }
        }

        pumpedConcentration = localSlope * (time - initialTime) 
                              + initialSSConcentration;

        // If mixingCorrection is enabled, calculate second solvent concentrations
        // from the equation 
        // d[SS] / dt = flowRate / (V0 + VP) * ([SS]pump - [SS])
        // Otherwise, [SS] == [SS]pump
        if (mixingCorrection())
        {
            // Special case for the first time point.
            if (time == dV() / 2.0 / flowRate())
            {
                // mSSConcentrations is still empty at the first time point,
                // so calling back() here would be undefined behaviour; use
                // initialSSConcentration as the previous value instead.
                mSSConcentrations.push_back(
                    initialSSConcentration
                    + dV() / (columnInterstitialVolume() + columnPoreVolume())
                    * (pumpedConcentration - initialSSConcentration) / 2.0);
            }
            else
            {
                mSSConcentrations.push_back(
                    mSSConcentrations.back() 
                    + dV() / (columnInterstitialVolume() + columnPoreVolume())
                    * (pumpedConcentration - mSSConcentrations.back()));
            }
        }
        else
        {
            mSSConcentrations.push_back(pumpedConcentration);
        }
        time += dV() / flowRate();
    }
}
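The mixingCorrection branch above is an explicit Euler step on d[SS]/dt = flowRate / (V0 + VP) * ([SS]pump - [SS]) with time step dt = dV / flowRate, so each step adds dV / (V0 + VP) * ([SS]pump - [SS]). A standalone numeric sketch of that update follows; all values are hypothetical.

#include <cstdio>

int main() {
    const double dV = 0.1, V0 = 1.0, VP = 0.5;   // hypothetical volumes
    const double ssPump = 80.0;                  // pumped concentration
    double ss = 5.0;                             // concentration in the column
    for (int step = 0; step < 50; ++step)
        ss += dV / (V0 + VP) * (ssPump - ss);    // relaxes toward ssPump
    std::printf("[SS] after 50 steps: %.3f\n", ss);
    return 0;
}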
Example #19
0
//-----------------------------------------------------------------------
void WJordanSolver::IClamp()
{
		m_V[0] = -0.075;
		m_Ca[0] = 0;
		for(int i = 0; i <= 4; ++i)
			m_gatevars[i] = 1 / ( 1 + exp( ( m_vhalf_x[i] - m_V[0] ) / m_k_x[i] ) );

		m_gatevars[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - m_Ca[0] ) / m_k_x[5] ) );

		// Now do main loop
		for( unsigned  int i = 1; i < m_numpoints; ++i )
		{
			float k1[7], k2[7], k3[7], k4[7];
			float h_2 = m_deltat / 2;
			float tempgate[6];
			float tempCa, tempV;

			//step 1 - get k1 values
			for(int j = 0; j <= 4; ++j){
				k1[j] = dx(m_gatevars[j], m_V[i-1], m_T_x[j], m_vhalf_x[j], m_k_x[j]);
			}
			k1[5] = dCa( m_gxs, m_gatevars, m_vrevs, m_alphaCa, m_V[i-1], m_TCa, m_thiCa, m_Ca[i-1] );
			k1[6] = dV( m_gxs, m_gatevars, m_vrevs, m_alphaCa, m_I[i], m_Cm, m_V[i-1]);

			//step 2  - get k2 values
			tempV = m_V[i-1] + k1[6] * h_2;
			tempCa = m_Ca[i-1] + k1[5] * h_2;
			for(int j = 0; j <= 4; ++j){
				tempgate[j] =  (m_gatevars[j] + k1[j] * h_2);
				k2[j] = dx(tempgate[j], tempV, m_T_x[j], m_vhalf_x[j], m_k_x[j]);
			}
			tempgate[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa) / m_k_x[5] ) );
			k2[5] = dCa( m_gxs, tempgate, m_vrevs, m_alphaCa, tempV, m_TCa, m_thiCa, tempCa );
			k2[6] = dV (m_gxs, tempgate, m_vrevs, m_alphaCa, m_I[i], m_Cm, tempV );

			//step 3  - get k3 values
			tempV = m_V[i-1] + k2[6] * h_2;
			tempCa = m_Ca[i-1] + k2[5] * h_2;
			for(int j = 0; j <= 4; ++j){
				tempgate[j] =  (m_gatevars[j] + k2[j] * h_2);
				k3[j] = dx(tempgate[j], tempV, m_T_x[j], m_vhalf_x[j], m_k_x[j]);
			}
			tempgate[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa ) / m_k_x[5] ) );
			k3[5] = dCa( m_gxs, tempgate, m_vrevs, m_alphaCa, tempV, m_TCa, m_thiCa, tempCa );
			k3[6] = dV( m_gxs, tempgate, m_vrevs, m_alphaCa, m_I[i], m_Cm, tempV );

			//step 4  - get k4 values
			tempV = m_V[i-1] + k3[6] * h_2 * 2;
			tempCa = m_Ca[i-1] + k3[5] * h_2 * 2;
			for(int j = 0; j <= 4; ++j){
				tempgate[j] =  ( m_gatevars[j] + k3[j] * h_2 * 2 );
				k4[j] = dx(tempgate[j], tempV, m_T_x[j], m_vhalf_x[j], m_k_x[j]);
			}
			tempgate[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa ) / m_k_x[5] ) );
			k4[5] = dCa( m_gxs, tempgate, m_vrevs, m_alphaCa, tempV, m_TCa, m_thiCa, tempCa );
			k4[6] = dV( m_gxs, tempgate, m_vrevs, m_alphaCa, m_I[i], m_Cm, tempV );

			// now we must combine them to get final value
			for(int j = 0; j <= 4; ++j){
				m_gatevars[j] += ( m_deltat / 6 ) * ( k1[j] + 2 * k2[j] + 2 * k3[j] + k4[j] );
			}
			m_Ca[i] = m_Ca[i-1] + ( m_deltat / 6 ) * ( k1[5] + 2 * k2[5] + 2 * k3[5] + k4 [5] );
			// The instantaneous Ca gate should track the updated m_Ca[i],
			// not the k4 stage value tempCa.
			m_gatevars[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - m_Ca[i] ) / m_k_x[5] ) );
			m_V[i] = m_V[i-1] + ( m_deltat / 6 ) * ( k1[6] + 2 * k2[6] + 2 * k3[6] + k4[6] );
		}
		return;
}
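The main loop above is a classical fourth-order Runge-Kutta step applied componentwise to the gates, Ca, and V (h_2 = dt/2 for the two midpoint stages, h_2 * 2 for the full step). For reference, here is the same scheme on a single scalar ODE, the toy equation y' = -y:

#include <cstdio>

static double f(double /*t*/, double y) { return -y; }  // toy ODE y' = -y

int main() {
    double y = 1.0, t = 0.0;
    const double h = 0.01;
    for (int i = 0; i < 100; ++i) {
        double k1 = f(t,         y);
        double k2 = f(t + h / 2, y + k1 * h / 2);
        double k3 = f(t + h / 2, y + k2 * h / 2);
        double k4 = f(t + h,     y + k3 * h);
        y += (h / 6) * (k1 + 2 * k2 + 2 * k3 + k4);
        t += h;
    }
    std::printf("y(1) = %.8f, exact exp(-1) = %.8f\n", y, 0.36787944);
    return 0;
}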
Example #20
0
	void Integration::SODE()
	{
		const int  NOR = Global::NOR;
		double eps = Global::eps;
		double t0 = Global::t0;
		double te = Global::te;
		double step = Global::step;
		double SVi[6]; for (int i = 0; i < 6; i++) SVi[i] = Global::SV(i, 0);

		char lineRad[300];
		vector<double> Rado(NOR);
		vector<double> h(NOR);
		vector<vector<double> > C(NOR, vector<double>(NOR));
		vector<triple > X1(NOR);
		vector<triple > V1(NOR);
		//vector<triple > Fi(NOR);

		vector <triple> A1(NOR), A2(NOR);
		vector<vector<double>> Dh(NOR, vector<double>(NOR));
		vector<triple> Alp(NOR);
		vector<triple> dF(NOR);

		double tout = t0;

		triple F0, X(SVi[0], SVi[1], SVi[2]), V(SVi[3], SVi[4], SVi[5]), Fi;

		FILE*f = fopen("Radau.txt", "r");

		fosv = fopen("sv_J2000.out", "w");
		//fprintf(fosv,"year month day hms(UTC) TDB(sec) interval(days) X Y Z Vx Vy Vz \n");

		foel = fopen("elts_J2000.out", "w");
		//	fprintf(foel,"year month day hms(UTC) TDB(sec) interval(days) A E I NODE W M \n");
		foSvEcl = fopen("sv_ECLIPJ2000.out", "w");
		foelEcl = fopen("elts_ECLIPJ2000.out", "w");
		fosvR = fopen("sv_IAUplanet.out", "w");
		//	fprintf(fosvR,"year month day hms(UTC) TDB(sec) interval(days) X Y Z Vx Vy Vz \n");

		foBL = fopen("BL.out", "w");
		//	fprintf(foBL,"year month day hms(UTC) TDB(sec) interval(days) L B H \n");

		foNEU = fopen("NEU.out", "w");
		//	fprintf(foNEU,"year month day hms(UTC) TDB(sec) interval(days) N E U \n");


		fvisi = fopen("visibility.out", "w");

		fo3bg = fopen("3body_geodetic.out", "w");
		// Find the longitude of the ascending node at t0

		//	if(Global::b_out_elts_planet || Global::b_out_sv_planet || Global::b_out_el_IAUPlanet==true){ 

		double poss[6];
		double lt1, dlt;
		triple Zorb;
		double Zpl[3];
		double TimeNode = t0;

		spkacs_c(Global::IDC, t0, "J2000", "NONE", 10, poss, &lt1, &dlt);

		triple Xv = triple(poss[0], poss[1], poss[2]);
		triple Vv = triple(poss[3], poss[4], poss[5]);
		Xv = Xv / Xv.getAbs();
		Vv = Vv / Vv.getAbs();
		Zorb = Xv&Vv / sin(triple::getAngle(Xv, Vv));

		double Z[3] = { Zorb[0], Zorb[1], Zorb[2] };

		trpos(t0, 1, Global::IDC, Z, Zpl);
		triple ZorbP = triple(Zpl);
		triple PolP = triple(0.0, 0.0, 1.0);

		triple Node = ZorbP&PolP;
		Node = Node / Node.getAbs();

		double NodeA = atan2(Node[1], Node[0]);
		if (NodeA < 0.0) NodeA = NodeA + 2 * pi;


		// Planetocentric gravitational constant
		double mu = Global::mu;

		int ii = 1;
		int jj = 0;

		// Read the Radau spacings; test fscanf's return value rather than
		// feof() so the last line is not processed twice.
		while (fscanf(f, "%299s", lineRad) == 1)
		{
			if (ii > Misc::sum(NOR - 1) && ii <= Misc::sum(NOR))
			{
				Rado[jj] = atof(lineRad); jj++;
			}
			ii++;
		}
		fclose(f);
		// backward integration
		if (te < t0) { step = -step; Global::Discr = -Global::Discr; }

		//System::Threading::Thread Worker ^ = gcnew System::Threading::Thread(delegate() {  });
		// the integrator itself:
		while (abs(te - t0) != 0e0)
		{
			if (abs(te - t0) < 1e-12) break;
			if (step > 0)
			{
				if (t0 + step > te) step = te - t0;
			}
			else
			{
				if (t0 + step < te) step = te - t0;
			}

			for (int i = 0; i < NOR; i++) h[i] = step*Rado[i];

			for (int k = 0; k < NOR; k++)
			{
				Dh[k][k] = h[k];
				for (int j = 0; j < k; j++)
				{
					Dh[k][j] = h[k] - h[j];
				}
			}

			// Stirling numbers
			for (int i = 0; i < NOR; i++)
			{
				C[i][i] = 1.0;
				if (i > 0) C[i][0] = -h[i - 1] * C[i - 1][0];
				for (int j = 1; j < i; j++) C[i][j] = C[i - 1][j - 1] - h[i - 1] * C[i - 1][j];
			}

			//////////////////////////////////////////////////////		
			F0 = Force::force_SODE(t0, X, V);

			for (int j = 0; j < NOR; j++)
			{
				X1[j] = X + V*h[j] + F0*h[j] * h[j] / 2;
				V1[j] = V + F0*h[j];
				///////////////////////////////////////////////
				Fi = Force::force_SODE(t0 + h[j], X1[j], V1[j]);

				dF[j] = Fi - F0;
			}

			Alp[0] = dF[0] / h[0];
			for (int k = 1; k < NOR; k++)
			{
				Alp[k] = dF[k] / h[k];
				for (int j = 0; j < k; j++)
				{
					Alp[k] = (Alp[k] - Alp[j]) / Dh[k][j];
				}
			}

			for (int k = 0; k < NOR; k++)
			{
				A1[k] = triple(0., 0., 0.);
				for (int i = k; i < NOR; i++) A1[k] = A1[k] + Alp[i] * C[i][k];
			}

			int ij = 0;
			for (;;)
			{
				for (int k = 0; k < NOR; k++)
				{
					triple 	v2 = A1[0] * 0.5;
					triple 	x2 = A1[0] * (1.0 / 6.0);

					for (int i = 1; i < NOR; i++)
					{
						v2 = v2 + A1[i] * (pow(h[k], i) / (i + 2));
						x2 = x2 + A1[i] * (pow(h[k], i) / (i + 2) / (i + 3));
					}

					v2 = V1[k] + v2*(pow(h[k], 2));
					x2 = X1[k] + x2*(pow(h[k], 3));
					//////////////////////////////////////////////
					Fi = Force::force_SODE(t0 + h[k], x2, v2);

					dF[k] = Fi - F0;
				}

				Alp[0] = dF[0] / h[0];
				for (int k = 1; k < NOR; k++)
				{
					Alp[k] = dF[k] / h[k];
					for (int j = 0; j < k; j++)
					{
						Alp[k] = (Alp[k] - Alp[j]) / Dh[k][j];
					}
				}

				for (int k = 0; k < NOR; k++)
				{
					A2[k] = triple(0.0, 0., 0.);
					for (int i = k; i < NOR; i++) A2[k] = A2[k] + Alp[i] * C[i][k];
				}

				vector <triple> dV(NOR);
				double max = 0;
				vector<double> mdV(NOR);
				for (int i = 0; i < NOR; i++)
				{
					dV[i] = A2[i] - A1[i];
					mdV[i] = dV[i].getAbs();
					if (max < abs(mdV[i])) max = abs(mdV[i]);
				}
				if (max < eps && ij > 2) goto exit;
				if (ij > 7) goto exit;

				A1 = A2;
				ij++;

			}

		exit:   triple dv = A1[0] * 0.5;
			triple dx = A1[0] * (1.0 / 6.0);

			for (int i = 1; i < NOR; i++)
			{
				dv = dv + A1[i] * (pow(step, i) / (i + 2.0));
				dx = dx + A1[i] * (pow(step, i) / (i + 2.0) / (i + 3.0));
			}

			// write out the results:
			//int ik=int(t0-ts)%Global::Discr;
			if (Global::Discr != 0)
			{
				while (abs(tout - t0) < abs(step))
				{
					double hout = tout - t0;

					triple dvo = A1[0] * 0.5;
					triple dxo = A1[0] * (1.0 / 6.0);
					for (int i = 1; i < NOR; i++)
					{
						dvo = dvo + A1[i] * (pow(hout, i) / (i + 2.0));
						dxo = dxo + A1[i] * (pow(hout, i) / (i + 2.0) / (i + 3.0));
					}

					triple Xout = X + V*hout + F0*(hout*hout / 2.0) + dxo*(hout*hout*hout);
					triple Vout = V + F0*hout + dvo*(hout*hout);

					write(tout, Xout, Vout);

					tout += Global::Discr;
				}
			}

			X = X + V*step + F0*(step*step / 2.0) + dx*(step*step*step);
			V = V + F0*step + dv*(step*step);


			t0 += step;

		};

		write(t0, X, V);

		fclose(fosv);
		fclose(foel);

		fclose(foSvEcl);
		fclose(foelEcl);


		fclose(fosvR);
		//fclose(foelR);

		//fclose(fosvp);
		//fclose(foelp);

		fclose(foBL);

		fclose(foNEU);

		fclose(fvisi);
		fclose(fo3bg);

	};
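The Alp recurrence inside SODE builds Newton divided differences of the force increments dF over the substep offsets h; Dh caches the pairwise differences h[k] - h[j]. A scalar version of the same recurrence, with made-up node data:

#include <cstdio>
#include <vector>

int main() {
    std::vector<double> h  = {0.1, 0.4, 0.7};    // hypothetical substep offsets
    std::vector<double> dF = {0.02, 0.11, 0.23}; // force increments at the nodes
    const int n = (int)h.size();
    std::vector<double> alp(n);
    alp[0] = dF[0] / h[0];
    for (int k = 1; k < n; ++k) {
        alp[k] = dF[k] / h[k];
        for (int j = 0; j < k; ++j)
            alp[k] = (alp[k] - alp[j]) / (h[k] - h[j]);  // divided difference
    }
    for (int k = 0; k < n; ++k)
        std::printf("alp[%d] = %.6f\n", k, alp[k]);
    return 0;
}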
Example #21
0
extern "C" void magma_zbulge_applyQ(
    magma_int_t WANTZ, magma_side_t SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB,
    magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE,
    magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T,
    magma_int_t *INFO, magmaDoubleComplex *dV, magmaDoubleComplex *dT,
    magmaDoubleComplex *dE, magma_int_t copytype )
{
    //%===========================
    //%   local variables
    //%===========================
    magmaDoubleComplex c_zero = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one  = MAGMA_Z_ONE;
    
    magma_int_t LDT, LDV, firstcolj;
    magma_int_t bg, nbGblk, rownbm, k, m, n;
    magma_int_t st, ed, fst, vlen, vnb, colj, len;
    magma_int_t blkid, vpos, taupos, tpos;
    //magmaDoubleComplex *WORK;
    magma_int_t LWORK;
    magma_int_t  cur_blksiz, avai_blksiz, ncolinvolvd;
    magma_int_t  nbgr, colst, coled, versionL, versionR;
    magma_int_t blkcnt=-1;

    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    *INFO=0;
    versionL = 113;
    versionR = 92;
    LDT      = Vblksiz;
    LDV      = NB+Vblksiz-1;
    //blklen = LDV*Vblksiz;
    nbGblk   = plasma_ceildiv((N-1), Vblksiz);
    //magma_zmalloc_cpu( &WORK, LWORK );

    /* find the sizes of the matrices T and V */
    findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV);
    /* Copy E, V, and T to the GPU in dE, dV, and dT,
     * depending on copytype:
     * 1: copy only V
     * 2: copy V and T
     * 3: copy V, T, and E
     * */
    if (copytype > 0) magma_zsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV );
    if (copytype > 1) magma_zsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT );
    if (copytype > 2) magma_zsetmatrix( N, NE, E, N, dE, N );
    magmaDoubleComplex *dwork;
    //ldwork  = NE;
    LWORK   = 2*N*max(Vblksiz, 64);
    if (MAGMA_SUCCESS != magma_zmalloc( &dwork, LWORK )) {
        printf ("!!!!  magma_zbulge_applyQ magma_alloc failed for: dwork\n" );
        exit(-1);
    }

    /* SIDE LEFT  means apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> traverse the Vs in reverse order, from q_n to q_1.
     *            E is split by rows, so each apply works on a block of rows (horizontal block). */
    /* SIDE RIGHT means apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> traverse the Vs in normal order (forward), from q_1 to q_n.
     *            E is split by columns, so each apply works on a block of columns (vertical block). */

    /* WANTZ = 1 means E is the IDENTITY, so form Q using the optimized update.
     *         Use the reverse order, from the small q to the large one,
     *         i.e. from q_n to q_1, applying Left updates to the Identity.
     *         Use versionL 113 because version 114 would update the whole
     *         matrix rather than working in increasing order.
     * WANTZ = 2 means E is a full matrix that must be updated from the Left
     *         or Right, so use the normal update.
     * */
    if (WANTZ == 1) {
        versionL=113;
        SIDE = MagmaLeft;
        //set the matrix to Identity here to avoid copying it from the CPU
        magmablas_zlaset( MagmaFull, N, N, c_zero, c_one, dE, N );
    }
    


    printf("  APPLY Q_v115 GPU with  N %d   NB %d   Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n",
           (int) N, (int) NB, (int) Vblksiz, SIDE, (int) versionL, (int) versionR, (int) WANTZ);


#if defined(USESTREAM)
    magma_int_t N2=N/2;
    magma_int_t N1=N-N2;
    printf("using stream\n");
    magma_queue_t stream[2];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
#endif
    

    if (SIDE == MagmaLeft) {
        if (versionL == 113) {
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                if (bg == nbGblk)
                    rownbm = plasma_ceildiv((N-(firstcolj)), NB);  // last blk has size=1 used for complex to handle A(N,N-1)
                else
                    rownbm = plasma_ceildiv((N-(firstcolj+1)), NB);
                
                for (m = rownbm; m > 0; m--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop
                    fst  = (rownbm -m)*NB+colj +1;
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colst     = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        if (WANTZ == 1) {
                            len =  N-colst;
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len);
                        } else {
                            magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        }
                    }
                }
            }
        } else if (versionL == 114) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = rownbm; m > 0; m--) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz=min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = nbgr; n > 0; n--) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colst+1, vpos+1, taupos+1);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        }
    } else if (SIDE == MagmaRight) {
        if (versionR == 91) {
            for (bg =1; bg <= nbGblk; bg++) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = plasma_ceildiv((N-(firstcolj+1)), NB);
                if (bg == nbGblk) rownbm    = plasma_ceildiv((N-(firstcolj)), NB);  // last blk has size=1 used for complex to handle A(N,N-1)
                for (m = 1; m <= rownbm; m++) {
                    vlen = 0;
                    vnb  = 0;
                    // for k=0; I compute the fst and then can remove it from the loop
                    colj = (bg-1)*Vblksiz;
                    fst  = (rownbm -m)*NB+colj +1;
                    for (k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -m)*NB+colj +1;
                        ed   = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=k+1;
                    }
                    colj     = (bg-1)*Vblksiz;
                    findVTpos(N, NB, Vblksiz, colj, fst, &vpos, &taupos, &tpos, &blkid);
                    //printf("voici bg %d m %d  vlen %d  vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colj, vpos, taupos);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        } else if (versionR == 92) {
            rownbm = plasma_ceildiv((N-1), NB);
            for (m = 1; m <= rownbm; m++) {
                ncolinvolvd = min(N-1, m*NB);
                avai_blksiz=min(Vblksiz, ncolinvolvd);
                nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz);
                for (n = 1; n <= nbgr; n++) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz);
                    colst = (n-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -m)*NB+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -m)*NB+colj +1;
                        ed = min(st+NB-1, N-1);
                        if (st > ed) break;
                        if ((st == ed) && (colj != N-2)) break;
                        vlen=ed-fst+1;
                        vnb=vnb+1;
                    }
                    findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid);
                    if ((vlen > 0) && (vnb > 0)) {
                        #if defined(USESTREAM)
                        magmablasSetKernelStream(stream[0]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1);
                        magmablasSetKernelStream(stream[1]);
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2);
                        #else
                        magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE);
                        #endif
                    }
                }
            }
        }
    } else {
        printf("ERROR SIDE %d\n", SIDE);
    }

#if defined(USESTREAM)
    magma_queue_destroy( stream[0] );
    magma_queue_destroy( stream[1] );
#endif
    magmablasSetKernelStream( orig_stream );
}
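All the block counts above (nbGblk, rownbm, nbgr) come from plasma_ceildiv which, judging by its use here, is plain integer ceiling division. A self-contained equivalent with two spot checks:

#include <cstdio>

static int ceildiv(int a, int b) { return (a + b - 1) / b; }  // assumes a >= 0, b > 0

int main() {
    std::printf("ceildiv(10, 4) = %d\n", ceildiv(10, 4));  // 3
    std::printf("ceildiv(12, 4) = %d\n", ceildiv(12, 4));  // 3
    return 0;
}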
Example #22
0
/**
    Purpose
    -------
    CUNMLQ overwrites the general complex M-by-N matrix C with

    @verbatim
                             SIDE = MagmaLeft     SIDE = MagmaRight
    TRANS = MagmaNoTrans:    Q * C                C * Q
    TRANS = Magma_ConjTrans: Q**H * C             C * Q**H
    @endverbatim

    where Q is a complex unitary matrix defined as the product of k
    elementary reflectors

          Q = H(k)**H . . . H(2)**H H(1)**H

    as returned by CGELQF. Q is of order M if SIDE = MagmaLeft and of order N
    if SIDE = MagmaRight.

    Arguments
    ---------
    @param[in]
    side    magma_side_t
      -     = MagmaLeft:      apply Q or Q**H from the Left;
      -     = MagmaRight:     apply Q or Q**H from the Right.

    @param[in]
    trans   magma_trans_t
      -     = MagmaNoTrans:    No transpose, apply Q;
      -     = Magma_ConjTrans: Conjugate transpose, apply Q**H.

    @param[in]
    m       INTEGER
            The number of rows of the matrix C. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix C. N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines
            the matrix Q.
            If SIDE = MagmaLeft,  M >= K >= 0;
            if SIDE = MagmaRight, N >= K >= 0.

    @param[in]
    A       COMPLEX array, dimension
                (LDA,M) if SIDE = MagmaLeft,
                (LDA,N) if SIDE = MagmaRight.
            The i-th row must contain the vector which defines the
            elementary reflector H(i), for i = 1,2,...,k, as returned by
            CGELQF in the first k rows of its array argument A.
            A is modified by the routine but restored on exit.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A. LDA >= max(1,K).

    @param[in]
    tau     COMPLEX array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by CGELQF.

    @param[in,out]
    C       COMPLEX array, dimension (LDC,N)
            On entry, the M-by-N matrix C.
            On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q.

    @param[in]
    ldc     INTEGER
            The leading dimension of the array C. LDC >= max(1,M).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.
            If SIDE = MagmaLeft,  LWORK >= max(1,N);
            if SIDE = MagmaRight, LWORK >= max(1,M).
            For optimum performance
            if SIDE = MagmaLeft,  LWORK >= N*NB;
            if SIDE = MagmaRight, LWORK >= M*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_cgelqf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cunmlq(
    magma_side_t side, magma_trans_t trans,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *tau,
    magmaFloatComplex *C, magma_int_t ldc,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)
    #define dC(i_,j_) (dC + (i_) + (j_)*lddc)
    #define dV(i_,j_) (dV + (i_) + (j_)*ib)
    #define dT(i_,j_) (dT + (i_) + (j_)*ib)
    #define dwork(i_) (dwork + (i_))

    magmaFloatComplex *T, *T2;
    magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step;
    magma_int_t iinfo, ldwork, lwkopt;
    magma_int_t left, notran, lquery;
    magma_trans_t transt;

    *info = 0;
    left   = (side  == MagmaLeft);
    notran = (trans == MagmaNoTrans);
    lquery = (lwork == -1);

    /* NQ is the order of Q and NW is the minimum dimension of WORK */
    if (left) {
        nq = m;
        nw = n;
    } else {
        nq = n;
        nw = m;
    }
    
    /* Test the input arguments */
    if (! left && side != MagmaRight) {
        *info = -1;
    } else if (! notran && trans != Magma_ConjTrans) {
        *info = -2;
    } else if (m < 0) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (k < 0 || k > nq) {
        *info = -5;
    } else if (lda < max(1,k)) {
        *info = -7;
    } else if (ldc < max(1,m)) {
        *info = -10;
    } else if (lwork < max(1,nw) && ! lquery) {
        *info = -12;
    }

    if (*info == 0) {
        nb = magma_get_cgelqf_nb( min( m, n ));
        lwkopt = max(1,nw)*nb;
        work[0] = MAGMA_C_MAKE( lwkopt, 0 );
    }

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0 || k == 0) {
        work[0] = MAGMA_C_ONE;
        return *info;
    }

    ldwork = nw;
    
    if (nb >= k) {
        /* Use CPU code */
        lapackf77_cunmlq( lapack_side_const(side), lapack_trans_const(trans),
            &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo);
    }
    else {
        /* Use hybrid CPU-GPU code */
        /* Allocate work space on the GPU.
         * nw*nb  for dwork (m or n) by nb
         * nq*nb  for dV    (n or m) by nb
         * nb*nb  for dT
         * lddc*n for dC.
         */
        magma_int_t lddc = ((m+31)/32)*32;
        magmaFloatComplex_ptr dwork, dV, dT, dC;
        magma_cmalloc( &dwork, (nw + nq + nb)*nb + lddc*n );
        if ( dwork == NULL ) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dV = dwork + nw*nb;
        dT = dV    + nq*nb;
        dC = dT    + nb*nb;
        
        /* work space on CPU.
         * nb*nb for T
         * nb*nb for T2, used to save and restore diagonal block of panel  */
        magma_cmalloc_cpu( &T, 2*nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        T2 = T + nb*nb;
        
        /* Copy matrix C from the CPU to the GPU */
        magma_csetmatrix( m, n, C, ldc, dC(0,0), lddc );
        
        if ( (left && notran) || (! left && ! notran) ) {
            i1 = 0;
            i2 = k;
            step = nb;
        } else {
            i1 = ((k - 1) / nb)*nb;
            i2 = 0;
            step = -nb;
        }

        // silence "uninitialized" warnings
        mi = 0;
        ni = 0;
        
        if (left) {
            ni = n;
            jc = 0;
        } else {
            mi = m;
            ic = 0;
        }

        if (notran) {
            transt = Magma_ConjTrans;
        } else {
            transt = MagmaNoTrans;
        }

        for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) {
            ib = min(nb, k - i);
            
            /* Form the triangular factor of the block reflector
               H = H(i) H(i + 1) . . . H(i + ib-1) */
            nq_i = nq - i;
            lapackf77_clarft("Forward", "Rowwise", &nq_i, &ib,
                             A(i,i), &lda, &tau[i], T, &ib);

            /* 1) set upper triangle of panel in A to identity,
               2) copy the panel from A to the GPU, and
               3) restore A                                      */
            cpanel_to_q( MagmaLower, ib, A(i,i), lda, T2 );
            magma_csetmatrix( ib, nq_i,  A(i,i), lda, dV(0,0), ib );
            cq_to_panel( MagmaLower, ib, A(i,i), lda, T2 );
            
            if (left) {
                /* H or H**H is applied to C(i:m,1:n) */
                mi = m - i;
                ic = i;
            }
            else {
                /* H or H**H is applied to C(1:m,i:n) */
                ni = n - i;
                jc = i;
            }
            
            /* Apply H or H**H; First copy T to the GPU */
            magma_csetmatrix( ib, ib, T, ib, dT(0,0), ib );
            magma_clarfb_gpu( side, transt, MagmaForward, MagmaRowwise,
                              mi, ni, ib,
                              dV(0,0), ib,
                              dT(0,0), ib,
                              dC(ic,jc), lddc,
                              dwork(0), ldwork );
        }
        magma_cgetmatrix( m, n, dC(0,0), lddc, C, ldc );
        
        magma_free( dwork );
        magma_free_cpu( T );
    }
    work[0] = MAGMA_C_MAKE( lwkopt, 0 );
    
    return *info;
} /* magma_cunmlq */
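The dC/dV/dT/dwork macros at the top of magma_cunmlq are the standard column-major offset idiom: element (i, j) of a matrix with leading dimension ld lives at base + i + j*ld. A tiny standalone illustration of the same pattern:

#include <cstdio>

int main() {
    const int ld = 4;                 // leading dimension (allocated rows)
    double buf[ld * 3];               // a 4 x 3 column-major matrix
    #define M(i_, j_) (buf + (i_) + (j_) * ld)
    for (int j = 0; j < 3; ++j)
        for (int i = 0; i < 4; ++i)
            *M(i, j) = i + 10.0 * j;
    std::printf("M(2,1) = %.1f\n", *M(2, 1));  // prints 12.0
    #undef M
    return 0;
}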
Example #23
0
/**
    Purpose
    -------
    CLAHRU is an auxiliary MAGMA routine that is used in CGEHRD to update
    the trailing sub-matrices after the reductions of the corresponding
    panels.
    See further details below.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    ihi     INTEGER
            Last row to update. Same as IHI in cgehrd.

    @param[in]
    k       INTEGER
            Number of rows of the matrix Am (see details below)

    @param[in]
    nb      INTEGER
            Block size

    @param[out]
    A       COMPLEX array, dimension (LDA,N-K)
            On entry, the N-by-(N-K) general matrix to be updated. The
            computation is done on the GPU. After Am is updated on the GPU
            only Am(1:NB) is transferred to the CPU - to update the
            corresponding Am matrix. See Further Details below.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[in,out]
    data    Structure with pointers to dA, dT, dV, dW, dY
            which are distributed across multiple GPUs.

    Further Details
    ---------------
    This implementation follows the algorithm and notations described in:

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.

    The difference is that here Am is computed on the GPU.
    M is renamed Am, G is renamed Ag.

    @ingroup magma_cgeev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_clahru_m(
    magma_int_t n, magma_int_t ihi, magma_int_t k, magma_int_t nb,
    magmaFloatComplex *A, magma_int_t lda,
    struct cgehrd_data* data )
{
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dW(  d, i, j ) (data->W [d] + (i) + (j)*ldda)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)
    
    magmaFloatComplex c_zero    = MAGMA_C_ZERO;
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t d;
    magma_int_t dk, dkhi, dknb, dn;
    
    magma_int_t info = 0;
    if (n < 0) {
        info = -1;
    } else if (ihi < 0 || ihi > n) {
        info = -2;
    } else if (k < 0 || k > n) {
        info = -3;
    } else if (nb < 1 || nb > n) {
        info = -4;
    } else if (lda < max(1,n)) {
        info = -6;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k,    ihi, &dk,   &dkhi );
        magma_indices_1D_bcyclic( nb, ngpu, d, k+nb, n,   &dknb, &dn   );
        
        // -----
        // on right, A := A Q = A - A V T V'
        // Update Am = Am - Am V T Vd' = Am - Ym Wd', with Wd = Vd T'
        // Wd = Vd T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)'
        // Vd and Wd are the portions corresponding to the block cyclic distribution
        magma_cgemm( MagmaNoTrans, MagmaConjTrans, dkhi-dk, nb, nb,
                     c_one,  dVd(d, dk, 0), ldvd,
                             dTi(d),        nb,
                     c_zero, dW (d, dk, 0), ldda );
        
        // Am = Am - Ym Wd' = A(0:k-1, k:ihi-1) - Ym(0:k-1, 0:nb-1) * W(k:ihi-1, 0:nb-1)'
        magma_cgemm( MagmaNoTrans, MagmaConjTrans, k, dkhi-dk, nb,
                     c_neg_one, dY(d, 0,  0),  ldda,
                                dW(d, dk, 0),  ldda,
                     c_one,     dA(d, 0,  dk), ldda );

        // -----
        // on right, A := A Q = A - A V T V'
        // Update Ag = Ag - Ag V T V' = Ag - Yg Wd'
        // Ag = Ag - Yg Wd' = A(k:ihi-1, nb:ihi-k-1) - Y(k:ihi-1, 0:nb-1) * W(k+nb:ihi-1, 0:nb-1)'
        magma_cgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, dkhi-dknb, nb,
                     c_neg_one, dY(d, k,    0),    ldda,
                                dW(d, dknb, 0),    ldda,
                     c_one,     dA(d, k,    dknb), ldda );
        
        // -----
        // on left, A := Q' A = A - V T' V' A
        // Ag2 = Ag2 - V T' V' Ag2 = W Yg, with W = V T' and Yg = V' Ag2
        // Note that Ag is A(k:ihi, nb+1:ihi-k)
        // while    Ag2 is A(k:ihi, nb+1: n -k)
        
        // here V and W are the whole matrices, not just block cyclic portion
        // W = V T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)'
        // TODO would it be cheaper to compute the whole matrix and
        // copy the block cyclic portions to another workspace?
        magma_cgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, nb, nb,
                     c_one,  dV (d, k, 0), ldv,
                             dTi(d),       nb,
                     c_zero, dW (d, k, 0), ldda );
        
        // Z = V(k:ihi-1, 0:nb-1)' * A(k:ihi-1, nb:n-k-1);  Z is stored over Y
        magma_cgemm( MagmaConjTrans, MagmaNoTrans, nb, dn-dknb, ihi-k,
                     c_one,  dV(d, k, 0),    ldv,
                             dA(d, k, dknb), ldda,
                     c_zero, dY(d, 0, 0),    nb );
        
        // Ag2 = Ag2 - W Z = A(k:ihi-1, k+nb:n-1) - W(k+nb:n-1, 0:nb-1) * Z(0:nb-1, k+nb:n-1)
        magma_cgemm( MagmaNoTrans, MagmaNoTrans, ihi-k, dn-dknb, nb,
                     c_neg_one, dW(d, k, 0),    ldda,
                                dY(d, 0, 0),    nb,
                     c_one,     dA(d, k, dknb), ldda );
    }
    
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    
    return info;
}
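magma_clahru_m leans on magma_indices_1D_bcyclic to convert global column ranges into per-GPU local ones. Below is a sketch of the standard 1-D block-cyclic mapping the surrounding comments describe (this mirrors the comments, not the MAGMA source itself): block k/nb is owned by GPU (k/nb) % ngpu, at local column ((k/nb)/ngpu)*nb + k%nb.

#include <cstdio>

int main() {
    const int nb = 4, ngpu = 3;                  // example block size and GPU count
    for (int k = 0; k < 24; k += 5) {
        int blk   = k / nb;                      // global block index
        int dev   = blk % ngpu;                  // owning GPU
        int local = (blk / ngpu) * nb + k % nb;  // column index on that GPU
        std::printf("global %2d -> gpu %d, local %2d\n", k, dev, local);
    }
    return 0;
}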
Example #24
0
extern "C" magma_int_t
magma_zlahr2_m(
    magma_int_t n, magma_int_t k, magma_int_t nb,
    magmaDoubleComplex *A, magma_int_t lda,
    magmaDoubleComplex *tau,
    magmaDoubleComplex *T, magma_int_t ldt,
    magmaDoubleComplex *Y, magma_int_t ldy,
    struct zgehrd_data* data )
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    ZLAHR2 reduces the first NB columns of a complex general n-by-(n-k+1)
    matrix A so that elements below the k-th subdiagonal are zero. The
    reduction is performed by an orthogonal similarity transformation
    Q' * A * Q. The routine returns the matrices V and T which determine
    Q as a block reflector I - V*T*V', and also the matrix Y = A * V.
    (Note this is different than LAPACK, which computes Y = A * V * T.)

    This is an auxiliary routine called by ZGEHRD.

    Arguments
    =========
    N       (input) INTEGER
            The order of the matrix A.

    K       (input) INTEGER
            The offset for the reduction. Elements below the k-th
            subdiagonal in the first NB columns are reduced to zero.
            K < N.

    NB      (input) INTEGER
            The number of columns to be reduced.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements on and above the k-th subdiagonal in
            the first NB columns are overwritten with the corresponding
            elements of the reduced matrix; the elements below the k-th
            subdiagonal, with the array TAU, represent the matrix Q as a
            product of elementary reflectors. The other columns of A are
            unchanged. See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    TAU     (output) COMPLEX_16 array, dimension (NB)
            The scalar factors of the elementary reflectors. See Further
            Details.

    T       (output) COMPLEX_16 array, dimension (LDT,NB)
            The upper triangular matrix T.

    LDT     (input) INTEGER
            The leading dimension of the array T.  LDT >= NB.

    Y       (output) COMPLEX_16 array, dimension (LDY,NB)
            The n-by-nb matrix Y.

    LDY     (input) INTEGER
            The leading dimension of the array Y. LDY >= N.

    dA      (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N-K+1)
            On entry, the n-by-(n-k+1) general matrix A.
            On exit, the elements in rows K:N of the first NB columns are
            overwritten with the matrix Y.

    DV      (output) COMPLEX_16 array on the GPU, dimension (N, NB)
            On exit this contains the Householder vectors of the transformation.

    Further Details
    ===============
    The matrix Q is represented as a product of nb elementary reflectors

       Q = H(1) H(2) . . . H(nb).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in
    A(i+k+1:n,i), and tau in TAU(i).

    The elements of the vectors v together form the (n-k+1)-by-nb matrix
    V which is needed, with T and Y, to apply the transformation to the
    unreduced part of the matrix, using an update of the form:
    A := (I - V*T*V') * (A - Y*T*V').

    The contents of A on exit are illustrated by the following example
    with n = 7, k = 3 and nb = 2:

       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( a   a   a   a   a )
       ( h   h   a   a   a )
       ( v1  h   a   a   a )
       ( v1  v2  a   a   a )
       ( v1  v2  a   a   a )

    where "a" denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    =====================================================================    */

    #define  A(  i, j ) ( A + (i) + (j)*lda)
    #define  Y(  i, j ) ( Y + (i) + (j)*ldy)
    #define  T(  i, j ) ( T + (i) + (j)*ldt)
    #define dA(  d, i, j ) (data->A [d] + (i) + (j)*ldda)
    #define dTi( d       ) (data->Ti[d])
    #define dV(  d, i, j ) (data->V [d] + (i) + (j)*ldv )
    #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd)
    #define dY(  d, i, j ) (data->Y [d] + (i) + (j)*ldda)

    magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex tmp;

    magma_int_t ngpu = data->ngpu;
    magma_int_t ldda = data->ldda;
    magma_int_t ldv  = data->ldv;
    magma_int_t ldvd = data->ldvd;
    
    magma_int_t ione = 1;
    
    magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid;
    magma_int_t n_k_i_1, n_k;
    magmaDoubleComplex scale;

    magma_int_t i;
    magmaDoubleComplex ei = MAGMA_Z_ZERO;

    magma_int_t info_data = 0;
    magma_int_t *info = &info_data;
    if (n < 0) {
        *info = -1;
    } else if (k < 0 || k >= n) {
        *info = -2;
    } else if (nb < 1 || nb > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (ldt < nb) {
        *info = -8;
    } else if (ldy < max(1,n)) {
        *info = -10;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    
    // adjust from 1-based indexing
    k -= 1;

    // Function Body
    if (n <= 1)
        return 0;
    
    // zero out current top block of V on all GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        magmablas_zlaset( MagmaUpperLower, nb, nb, dV(d,k,0), ldv );
    }
    
    // set all Y=0
    lapackf77_zlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy );
    
    for (i = 0; i < nb; ++i) {
        n_k_i_1 = n - k - i - 1;
        n_k     = n - k;
        
        if (i > 0) {
            // Finish applying I - V * T * V' on right
            tmp = MAGMA_Z_NEGATE( tau[i-1] );
            blasf77_zaxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione );
            
            // Apply I - V * T' * V' to this column (call it b) from the
            // left, using the last column of T as workspace, w.
            //
            // Let  V = ( V1 )   and   b = ( b1 )   (first i-1 rows)
            //          ( V2 )             ( b2 )
            // where V1 is unit lower triangular
            
            // w := b1 = A(k+1:k+i, i)
            blasf77_zcopy( &i,
                           A(k+1,i), &ione,
                           T(0,nb-1), &ione );
            
            // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w
            blasf77_ztrmv( "Lower", "Conj", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i)
            blasf77_zgemv( "Conj", &n_k_i_1, &i,
                           &c_one, A(k+i+1,0), &lda,
                                   A(k+i+1,i), &ione,
                           &c_one, T(0,nb-1), &ione );
            
            // w := T'*w = T(0:i-1, 0:i-1)' * w
            blasf77_ztrmv( "Upper", "Conj", "Non-unit", &i,
                           T(0,0), &ldt,
                           T(0,nb-1), &ione );
            
            // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w
            blasf77_zgemv( "No trans", &n_k_i_1, &i,
                           &c_neg_one, A(k+i+1,0), &lda,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k+i+1,i), &ione );
            
            // w := V1*w = VA(k+1:k+i, 0:i-1) * w
            blasf77_ztrmv( "Lower", "No trans", "Unit", &i,
                           A(k+1,0), &lda,
                           T(0,nb-1), &ione );
            
            // b1 := b1 - w = A(k+1:k+i-1, i) - w
            blasf77_zaxpy( &i,
                           &c_neg_one, T(0,nb-1), &ione,
                                       A(k+1,i), &ione );
            
            // Restore diagonal element, saved below during previous iteration
            *A(k+i,i-1) = ei;
        }
        
        // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i)
        lapackf77_zlarfg( &n_k_i_1,
                          A(k+i+1,i),
                          A(k+i+2,i), &ione, &tau[i] );
        // Save diagonal element and set to one, to simplify multiplying by V
        ei = *A(k+i+1,i);
        *A(k+i+1,i) = c_one;

        // compute yi = A vi = sum_g A{d} vi{d}
        nblocks = (n-1) / nb / ngpu + 1;
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magmablasSetKernelStream( data->streams[d] );
            
            // dV(k+i+1:n-1, i) = VA(k+i:n, i)
            magma_zsetvector_async( n_k_i_1,
                                    A(k+i+1,i), 1,
                                    dV(d, k+i+1, i), 1, data->streams[d] );
            
            // copy column of dV -> dVd, using block cyclic distribution.
            // This assumes V and Vd have been padded so that
            // a 2D matrix copy doesn't access them out-of-bounds
            gblock = k / nb;
            lblock = gblock / ngpu;
            lgid   = gblock % ngpu;
            if ( d < lgid ) {
                lblock += 1;
            }
            // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix
            magmablas_zlacpy( 'F', nb, nblocks-lblock,
                              dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu,
                              dVd(d, 0    + lblock*nb     , i), nb );
            
            // convert global indices (k) to local indices (dk)
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            
            // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i)
            // skip if matrix is empty
            // each GPU copies to different temporary vector in Y,
            // which are summed in separate loop below
            if ( dn-dki1 > 0 ) {
                magma_zgemv( 'N', n-k, dn-dki1,
                             c_one,  dA (d, k   , dki1), ldda,
                                     dVd(d, dki1,    i), 1,
                             c_zero, dY (d, k   ,    i), 1 );
                
                // copy vector to host, storing in column nb+d of Y
                // as temporary space (Y has >= nb+ngpu columns)
                magma_zgetvector_async( n-k,
                                        dY(d, k, i), 1,
                                        Y(k, nb+d),  1, data->streams[d] );
            }
        }
        
        // while GPU is doing above Ag*v...
        // Compute T(0:i,i) = [ -tau T V' vi ]
        //                    [  tau         ]
        // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i)
        scale = MAGMA_Z_NEGATE( tau[i] );
        blasf77_zgemv( "Conj", &n_k_i_1, &i,
                       &scale,  A(k+i+1,0), &lda,
                                A(k+i+1,i), &ione,
                       &c_zero, T(0,i), &ione );
        // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i)
        blasf77_ztrmv( "Upper", "No trans", "Non-unit", &i,
                       T(0,0), &ldt,
                       T(0,i), &ione );
        *T(i,i) = tau[i];
        
        // apply reflectors to next column, A(i+1), on right only.
        // one axpy will be required to finish this, in the next iteration above
        if ( i > 0 && i+1 < nb ) {
            // Update next column, A(k:n,i+1), applying Q on right.
            // One axpy will be required to finish this, in the next iteration
            // above, after yi is computed.
            // This updates one more row than LAPACK does (row k),
            // making block above panel an even multiple of nb.
            // Use last column of T as workspace, w.
            magma_int_t i1 = i+1;
            
            // If complex, conjugate row of V, and undo afterwards
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_zlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)'
            // T is now rectangular, so we use gemv instead of trmv as in lapack.
            blasf77_zgemv( "No trans", &i, &i1,
                           &c_one,  T(0,0), &ldt,
                                    A(k+i1,0), &lda,
                           &c_zero, T(0,nb-1), &ione );
            #if defined(PRECISION_z) || defined(PRECISION_c)
            lapackf77_zlacgv( &i1,  A(k+i1,0), &lda );
            #endif
            
            // A(k:n, i+1) -= Y(k:n, 0:i) * w
            blasf77_zgemv( "No trans", &n_k, &i,
                           &c_neg_one, Y(k,0), &ldy,
                                       T(0,nb-1), &ione,
                           &c_one,     A(k,i1), &ione );
        }
        
        // yi = sum_g yi{d}
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            magma_queue_sync( data->streams[d] );
            magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn );
            if ( dn-dki1 > 0 ) {
                // yi = yi + yi{d}
                blasf77_zaxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione );
            }
        }
    }
    // Restore diagonal element
    *A(k+nb,nb-1) = ei;
    
    // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:)
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magmablasSetKernelStream( data->streams[d] );
        
        // convert global indices (k) to local indices (dk)
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        
        // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :)
        // skip if matrix is empty
        // each GPU copies to different temporary block in Y,
        // which are summed in separate loop below
        if ( dn-dki1 > 0 ) {
            magma_zgemm( 'N', 'N', k, nb, dn-dki1,
                         c_one,  dA (d, 0   , dki1), ldda,
                                 dVd(d, dki1,    0), ldvd,
                         c_zero, dY (d, 0   ,    0), ldda );
            
            // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y
            // as temporary space (Y has nb + nb*ngpu columns)
            magma_zgetmatrix_async( k, nb,
                                    dY(d, 0, 0),  ldda,
                                    Y(0,nb+nb*d), ldy, data->streams[d] );
        }
    }
    
    // Y = sum_g Y{d}
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_queue_sync( 0 );
        magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn );
        if ( dn-dki1 > 0 ) {
            // Y = Y + Am V
            for( i = 0; i < nb; ++i ) {
                blasf77_zaxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione );
            }
        }
    }
    
    // copy Y and T matrices to GPUs
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_zsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] );
        magma_zsetmatrix_async( nb, nb, T, nb, dTi(d),      nb,   data->streams[d] );
    }

    return 0;
} // magma_zlahr2
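Each elementary reflector generated above has the form H(i) = I - tau * v * v' with a leading 1 in v, so applying one reflector to a vector costs a dot product plus an axpy. A real-valued sketch with arbitrary numbers:

#include <cstdio>

int main() {
    const int n = 4;
    double v[n] = {1.0, 0.5, -0.25, 0.0};  // reflector vector, leading 1
    double x[n] = {2.0, 1.0,  3.0,  4.0};
    double tau  = 1.2;                     // hypothetical scalar factor
    double dot = 0.0;
    for (int i = 0; i < n; ++i) dot += v[i] * x[i];        // w = v' x
    for (int i = 0; i < n; ++i) x[i] -= tau * dot * v[i];  // x := x - tau v w
    std::printf("Hx = [%g, %g, %g, %g]\n", x[0], x[1], x[2], x[3]);
    return 0;
}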