LowRankVector::LowRankVector(integer Ur, integer Uc, integer Drc, integer Vr, integer Vc) { StieVector dU(Ur, Uc); EucVector dD(Drc, Drc); StieVector dV(Vr, Vc); Element **Elems = new Element *[3]; Elems[0] = &dU; Elems[1] = &dD; Elems[2] = &dV; integer *powsintev = new integer[4]; powsintev[0] = 0; powsintev[1] = 1; powsintev[2] = 2; powsintev[3] = 3; ProductElementInitialization(Elems, 3, powsintev, 3); delete[] powsintev; delete[] Elems; };
TEST( Vector, DataReadView ) { std::vector< int > v( 5ul, 3 ); bolt::amp::device_vector< int, concurrency::array_view > dV( v ); EXPECT_EQ( 5, dV.size( ) ); dV[ 0 ] = 1; dV[ 1 ] = 2; dV[ 2 ] = 3; dV[ 3 ] = 4; dV[ 4 ] = 5; // Any device_vector pointer can be used. bolt::amp::device_vector< int, concurrency::array_view >::pointer mySP = dV.data( ); EXPECT_EQ( 1, mySP[ 0 ] ); EXPECT_EQ( 2, mySP[ 1 ] ); EXPECT_EQ( 3, mySP[ 2 ] ); EXPECT_EQ( 4, mySP[ 3 ] ); EXPECT_EQ( 5, mySP[ 4 ] ); }
int _tmain( int argc, _TCHAR* argv[ ] ) { const size_t vecSize = 10; bolt::cl::device_vector< int > dV( vecSize ); bolt::cl::device_vector< int >::iterator myIter = dV.begin( ); std::cout << "\nDevice vector EXAMPLE \n"; // Iterator arithmetic supported *myIter = 1; ++myIter; *myIter = 2; myIter++; *myIter = 3; myIter += 1; *(myIter + 0) = 4; *(myIter + 1) = 5; myIter += 1; // Operator [] on the container suported dV[ 5 ] = 6; dV[ 6 ] = 7; // The .data() method maps internal GPU buffer to host accessible memory, and keeps the memory mapped bolt::cl::device_vector< int >::pointer pdV = dV.data( ); // These are fast writes to host accessible memory pdV[ 7 ] = 8; pdV[ 8 ] = 9; pdV[ 9 ] = 10; // Unmaps the GPU buffer, updating the contents of GPU memory pdV.reset( ); std::cout << "Device Vector contents: " << std::endl; for( size_t i = 0; i < vecSize; ++i ) { std::cout << dV[ i ] << ", "; } return 0; }
TEST( VectorReverseIterator, ArithmeticAndEqual ) { bolt::BCKND::device_vector< int > dV( 5 ); EXPECT_EQ( 5, dV.size( ) ); bolt::BCKND::device_vector< int >::reverse_iterator myIter = dV.rbegin( ); *myIter = 1; ++myIter; *myIter = 2; myIter++; *myIter = 3; myIter += 1; *(myIter + 0) = 4; *(myIter + 1) = 5; myIter += 1; EXPECT_EQ( 1, dV[ 4 ] ); EXPECT_EQ( 2, dV[ 3 ] ); EXPECT_EQ( 3, dV[ 2 ] ); EXPECT_EQ( 4, dV[ 1 ] ); EXPECT_EQ( 5, dV[ 0 ] ); }
TEST( DeviceVector, Assign ) { bolt::BCKND::device_vector< int > dV( 3, 98 ); std::vector< int > stdV( 3, 98 ); EXPECT_EQ(dV[0], stdV[0]); dV.assign(10, 89 ); stdV.assign(10, 89 ); for(int i=0; i<10; i++) { EXPECT_EQ(stdV[i], dV[i]); } dV.assign(5, 98 ); stdV.assign(5, 98 ); EXPECT_EQ(stdV.capacity(), dV.capacity()); EXPECT_EQ(stdV.size(), dV.size()); for(int i=0; i<5; i++) { EXPECT_EQ(stdV[i], dV[i]); } std::vector<float> stdFloatVect(1024); for(int i=0; i<1024; i++) { stdFloatVect[i] = (float)i; } bolt::BCKND::device_vector<float> dvFloatVect(stdFloatVect.begin(), 1024); for(int i=0; i<1024; i++) { EXPECT_FLOAT_EQ((float)i, dvFloatVect[i]); } dvFloatVect.assign(1022, 89 ); stdFloatVect.assign(1022, 89 ); for(int i=0; i<1022; i++) { /*This loop is iterating to 1022 elements only because stdFloatVect[i] throws an exception if accessing beyond 1022 elements*/ EXPECT_FLOAT_EQ(stdFloatVect[i], dvFloatVect[i]); } }
static PyObject* PyCosmoObject_dV_vec(struct PyCosmoObject* self, PyObject* args) { PyObject* zObj=NULL, *resObj=NULL;; double *z, *res; npy_intp n, i; if (!PyArg_ParseTuple(args, (char*)"O", &zObj)) { return NULL; } n = PyArray_SIZE(zObj); z = (double* )PyArray_DATA(zObj); resObj = PyArray_ZEROS(1, &n, NPY_FLOAT64, 0); res = (double* )PyArray_DATA(resObj); for (i=0; i<n; i++) { res[i] = dV(self->cosmo, z[i]); } return resObj; }
TEST( VectorReference, OperatorValueType ) { bolt::BCKND::device_vector< int > dV( 5 ); dV[ 0 ] = 1; dV[ 1 ] = 2; dV[ 2 ] = 3; dV[ 3 ] = 4; dV[ 4 ] = 5; std::vector< int > readBack( 5 ); readBack[ 0 ] = dV[ 0 ]; readBack[ 1 ] = dV[ 1 ]; readBack[ 2 ] = dV[ 2 ]; readBack[ 3 ] = dV[ 3 ]; readBack[ 4 ] = dV[ 4 ]; EXPECT_EQ( readBack[ 0 ], dV[ 0 ] ); EXPECT_EQ( readBack[ 1 ], dV[ 1 ] ); EXPECT_EQ( readBack[ 2 ], dV[ 2 ] ); EXPECT_EQ( readBack[ 3 ], dV[ 3 ] ); EXPECT_EQ( readBack[ 4 ], dV[ 4 ] ); }
TEST( Vector, Erase ) { bolt::BCKND::device_vector< int > dV( 5 ); EXPECT_EQ( 5, dV.size( ) ); dV[ 0 ] = 1; dV[ 1 ] = 2; dV[ 2 ] = 3; dV[ 3 ] = 4; dV[ 4 ] = 5; bolt::BCKND::device_vector< int >::iterator myIter = dV.begin( ); myIter += 2; bolt::BCKND::device_vector< int >::iterator myResult = dV.erase( myIter ); EXPECT_EQ( 4, dV.size( ) ); EXPECT_EQ( 4, *myResult ); EXPECT_EQ( 1, dV[ 0 ] ); EXPECT_EQ( 2, dV[ 1 ] ); EXPECT_EQ( 4, dV[ 2 ] ); EXPECT_EQ( 5, dV[ 3 ] ); }
/// <summary> /// Computes Brenner's focus measure and determines the ratio of the median/mean. /// </summary> /// <returns>The focus measure value</returns> double BasicFM::computeROGR() { if (checkInput()) { cv::Mat dH = mSrcImg(cv::Range::all(), cv::Range(1, mSrcImg.cols)) - mSrcImg(cv::Range::all(), cv::Range(0, mSrcImg.cols - 1)); cv::Mat dV = mSrcImg(cv::Range(1, mSrcImg.rows), cv::Range::all()) - mSrcImg(cv::Range(0, mSrcImg.rows - 1), cv::Range::all()); dH = cv::abs(dH); dV = cv::abs(dV); cv::Mat FM = cv::max(dH(cv::Range(0, dH.rows - 1), cv::Range::all()), dV(cv::Range::all(), cv::Range(0, dV.cols - 1))); FM = FM.mul(FM); cv::Scalar m = cv::mean(FM); cv::Mat tmp; FM.convertTo(tmp, CV_32F); double r = 255.0*255.0; //mVal = r > 0 ? m[0] / r : m[0]; mVal = m[0] / r; } return mVal; }
/** Purpose ------- DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by DGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T DOUBLE_PRECISION array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_dgeev_aux ********************************************************************/ extern "C" magma_int_t magma_dlahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *T, magma_int_t ldt, double *Y, magma_int_t ldy, struct dgehrd_data* data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; double scale; magma_int_t i; double ei = MAGMA_D_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_D_NEGATE( tau[i-1] ); blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_dcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_dtrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_dgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_dgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_dtrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_daxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_dlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_dsetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_dlacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_dgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_D_NEGATE( tau[i] ); blasf77_dgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. // This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If real, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_dgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_dgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_dgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_dsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } return 0; } /* magma_dlahr2 */
/** Purpose ------- DGEHRD reduces a DOUBLE PRECISION general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to DGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) DOUBLE PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE PRECISION array on the GPU, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices in dT, for later use in magma_dorghr. @ingroup magma_dgeev_comp ********************************************************************/ extern "C" magma_int_t magma_dgehrd( magma_int_t n, magma_int_t ilo, magma_int_t ihi, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magmaDouble_ptr dT, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda*2) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dV(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda) #define dwork(i_) dwork, ((i_)) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dV(i_,j_) (dV + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) #endif // Constants const double c_one = MAGMA_D_ONE; const double c_zero = MAGMA_D_ZERO; // Local variables magma_int_t nb = magma_get_dgehrd_nb( n ); magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t i, nh, iws; magma_int_t iinfo; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = magma_dmake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < iws && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // Now requires lwork >= iws; else dT won't be computed in unblocked code. // If not enough workspace, use unblocked code //if ( lwork < iws ) { // nb = 1; //} if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // GPU workspace is: // nb*ldda for dwork for dlahru // nb*ldda for dV // n*ldda for dA magmaDouble_ptr dwork; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } double *dV = dwork + nb*ldda; double *dA = dwork + nb*ldda*2; double *T; magma_dmalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; assert( nb % 4 == 0 ); for (i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; magmablas_dlaset( MagmaFull, nb, n, c_zero, c_zero, dT(0,0), nb, queue ); // Copy the matrix to the GPU magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_dgetmatrix( ihi-i, nb, dA(i,i-ilo), ldda, A(i,i), lda, queue ); // add 1 to i for 1-based index magma_dlahr2( ihi, i+1, nb, dA(0,i-ilo), ldda, dV(0,0), ldda, A(0,i), lda, &tau[i], T, nb, work, n, queue ); // Copy T from the CPU to dT on the GPU magma_dsetmatrix( nb, nb, T, nb, dT(0,i-ilo), nb, queue ); magma_dlahru( n, ihi, i, nb, A(0,i), lda, dA(0,i-ilo), ldda, // dA dA(i,i-ilo), ldda, // dY, stored over current panel dV(0,0), ldda, dT(0,i-ilo), dwork(0), queue ); } // Copy remainder to host magma_dgetmatrix( n, n-i, dA(0,i-ilo), ldda, A(0,i), lda, queue ); magma_free( dwork ); magma_free_cpu( T ); magma_queue_destroy( queue ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_dmake_lwork( iws ); return *info; } /* magma_dgehrd */
double DDphi(double N, double phi, double Dphi, double V0, double q, double phi0) /* the value of the second derivative of the scalar field phi with respect to e-fold N */ /* it is used when we are evaluating and integrating phi */ { return -(3 -(Dphi*Dphi)/2.)*Dphi -(dV(phi, V0, q, phi0)/(2*V(phi, V0, q, phi0)))*(6 -(Dphi*Dphi)); }
TIMESTAMP node::postsync(TIMESTAMP t0) { OBJECT *hdr = OBJECTHDR(this); node *swing = hdr->parent?OBJECTDATA(hdr->parent,node):this; complex dV(0.0); complex YY = Ys + complex(G,B); // copy values that might get updated while we work on this object complex old_YVs = YVs; #ifdef HYBRID swing->del_inj_residual(this); #endif if (!YY.IsZero() || type==SWING) { switch (type) { case PV: S.Im() = ((~V*(YY*V-old_YVs)).Im()); if (Qmin_MVAR<Qmax_MVAR && S.Im()<Qmin_MVAR) S.Im() = Qmin_MVAR; else if (Qmax_MVAR>Qmin_MVAR && S.Im()>Qmax_MVAR) S.Im() = Qmax_MVAR; //else { complex Vnew = (-(~S/~V) + old_YVs) / YY; Vnew.SetPolar(V.Mag(),Vnew.Arg()); #ifdef HYBRID if (Vstdev>0) { double pr = swing->get_obs_probability(); swing->del_obs_residual(this); dV = Vobs*(1-pr) + (Vnew)*pr - V; V += dV; swing->add_obs_residual(this); } else #endif { dV = Vnew - V; V = Vnew; } break; } /* continue with PQ solution */ case PQ: if (!V.IsZero()) { complex Vnew = (-(~S/~V) + old_YVs) / YY; #ifdef HYBRID if (Vstdev>0) // need to consider observation { double pr = swing->get_obs_probability(); swing->del_obs_residual(this); dV = Vobs*(1-pr) + (Vnew)*pr - V; V += dV; swing->add_obs_residual(this); } else // no observation #endif { dV = (Vnew - V)*acceleration_factor; V += dV; } V.Notation() = A; } break; case SWING: S = ~(~V*(YY*V - YVs)); S.Notation() = J; break; default: /* unknown type fails */ gl_error("invalid bus type"); return TS_ZERO; } } #ifdef HYBRID swing->add_inj_residual(this); #endif #ifdef _DEBUG // node debugging if (debug_node>0) { OBJECT* obj = OBJECTHDR(this); static int first=-1; if (first==-1) first = obj->id; if (obj->id==first) { printf("\n"); printf("Node Type V Vobs Stdev Power G B dV Pr{Vobs} r2 Sr2\n"); printf("============== ===== ================= ================= ======== ================= ======== ======== ======== ======== ====== ======\n"); } if (((debug_node&1)==1 && dV.Mag()>convergence_limit ) // only on dV #ifdef HYBRID || ((debug_node&2)==2 && Vstdev>0 ) // only on observation || ((debug_node&4)==4 && get_inj_residual()>0.001)// non-zero power residual #endif ) { printf("%2d (%-9.9s) %5s %+8.4f%+8.3fd %+8.4f%+8.3fd %8.5f %+8.4f%+8.4fj %+8.5f ", obj->id, obj->name, type==SWING?"SWING":(type==PQ?"PQ ":"PV"), V.Mag(),V.Arg()*180/3.1416, Vobs.Mag(), Vobs.Arg()*180/3.1416, Vstdev, S.Re(), S.Im(), G, B, dV.Mag()); #ifdef HYBRID printf("%+8.5f %8.5f ", get_obs_probability(), r2); if (Vstdev>0) printf("%8.5f %6.3f %6.3f\n", get_obs_probability(), r2, get_inj_residual()); else printf(" -- -- %6.3f\n",get_inj_residual()); #else printf("\n"); #endif } } #endif // _DEBUG // send dV through all links LINKLIST *item; for (item=linklist; item!=NULL; item=item->next) item->data->apply_dV(hdr,dV); if (dV.Mag()>convergence_limit) return t0; /* did not converge, hold the clock */ else return TS_NEVER; /* converged, no further updates needed */ }
int Look_txt() { TCHAR filter[] = TEXT("Ghemical MD results File (*.txt)\0*.txt\0") TEXT("All Files (*.*)\0*.*\0"); TCHAR fpath[1024]; TCHAR filename[1024]; sprintf(filename, "\0"); { DWORD nFilterIndex; vector<string> names; vector<string> *pnames = &names; vector<vector<double> > vectors; vectors.reserve(2000000); while (OpenFileDlg(0, filter, fpath, nFilterIndex) == S_OK) { ReadDatFile(NULL, fpath, filename, &vectors, pnames); pnames = NULL; printf("\nfilename %s\n\n", filename); int cols = names.size(); int rows = vectors.size(); #if WRITE_LOCKED_FORCES int cMom = 4 - 1; int cVx = 5 - 1; int cFxup = 14 - 1; int cFxdw = 17 - 1; int cVxup = 8 - 1; int cVxdw = 11 - 1; #endif #if WRITE_WORKED_FORCES int cMom = 4 - 1; int cVx = 5 - 1; int cVxup = 14 - 1; int cVxdw = 17 - 1; int cVx_wk_up = 8 - 1; int cVx_wk_dw = 11 - 1; int cFx_wk_up = 20 - 1; int cFx_wk_dw = 23 - 1; #endif vector<double> means(cols, 0.0); printf("vectors.size() = %d\n",rows); printf("names.size() = %d\n", cols); for (vector<vector<double> >::iterator it = vectors.begin(); it != vectors.end(); it++) { for (int c = 0; c < cols; c++) { means[c] += (*it).operator [](c); } } for (int c = 0; c < cols; c++) { means[c] /= rows; printf("mean(%s) = %f\n", names[c].c_str(), means[c]); } #if WRITE_LOCKED_FORCES || WRITE_WORKED_FORCES int r0 = 0; cout << "enter r0\n"; cin >> r0; #endif #if WRITE_LOCKED_FORCES vector<double> dF(rows-r0); for (int r = r0; r < rows; r++) { dF[r-r0] = vectors[r][cFxup] - vectors[r][cFxdw]; } Statistika (dF, "dF"); vector<double> Mom(rows-r0); for (r = r0; r < rows; r++) { Mom[r-r0] = vectors[r][cMom]; } Statistika (Mom, "Mom"); vector<double> dV(rows-r0); for (r = r0; r < rows; r++) { dV[r-r0] = vectors[r][cVxup] - vectors[r][cVxdw]; } Statistika (dV, "dV"); vector<double> Vx(rows-r0); for (r = r0; r < rows; r++) { Vx[r-r0] = vectors[r][cVx]; } Statistika (Vx, "Vx"); #endif #if WRITE_WORKED_FORCES vector<double> dF_wk(rows-r0); for (int r = r0; r < rows; r++) { dF_wk[r-r0] = vectors[r][cFx_wk_up] - vectors[r][cFx_wk_dw]; } Statistika (dF_wk, "dF_wk"); vector<double> dV_wk(rows-r0); for (r = r0; r < rows; r++) { dV_wk[r-r0] = vectors[r][cVx_wk_up] - vectors[r][cVx_wk_dw]; } Statistika (dV_wk, "dV_wk"); //if (!worked[n1]) vector<double> Mom(rows-r0); for (r = r0; r < rows; r++) { Mom[r-r0] = vectors[r][cMom]; } Statistika (Mom, "Mom"); vector<double> dV(rows-r0); for (r = r0; r < rows; r++) { dV[r-r0] = vectors[r][cVxup] - vectors[r][cVxdw]; } Statistika (dV, "dV"); vector<double> Vx(rows-r0); for (r = r0; r < rows; r++) { Vx[r-r0] = vectors[r][cVx]; } Statistika (Vx, "Vx"); #endif } } /*else { DWORD nFilterIndex; if (SaveFileDlg(0, filename, filter, nFilterIndex) == S_OK) { SetDlgItemText(ref->hDlg,IDC_EDIT_TRAJFILE2, filename); } }*/ printf("Hello World!\n"); return 0; }
extern "C" magma_int_t magma_clahr2( magma_int_t n, magma_int_t k, magma_int_t nb, magmaFloatComplex *dA, magmaFloatComplex *dV, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *T, magma_int_t ldt, magmaFloatComplex *Y, magma_int_t ldy ) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by CGEHRD. Arguments ========= N (input) INTEGER The order of the matrix A. K (input) INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. NB (input) INTEGER The number of columns to be reduced. dA (input/output) COMPLEX array on the GPU, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements in rows K:N of the first NB columns are overwritten with the matrix Y. DV (output) COMPLEX array on the GPU, dimension (N, NB) On exit this contains the Householder vectors of the transformation. A (input/output) COMPLEX array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. T (output) COMPLEX array, dimension (LDT,NB) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= NB. Y (output) COMPLEX array, dimension (LDY,NB) The n-by-nb matrix Y. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( i, j ) (dA + (i) + (j)*ldda) #define dV( i, j ) (dV + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ldda = lda; magma_int_t ione = 1; magma_int_t n_k_i_1, n_k; magmaFloatComplex scale; magma_int_t i; magmaFloatComplex ei = MAGMA_C_ZERO; // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Update A(k:n-1,i); Update i-th column of A - Y * T * V' // This updates one more row than LAPACK does (row k), // making the block above the panel an even multiple of nb. // Use last column of T as workspace, w. // w(0:i-1, nb-1) = VA(k+i, 0:i-1)' blasf77_ccopy( &i, A(k+i,0), &lda, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) // If complex, conjugate row of V. lapackf77_clacgv(&i, T(0,nb-1), &ione); #endif // w = T(0:i-1, 0:i-1) * w blasf77_ctrmv( "Upper", "No trans", "No trans", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_ccopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ctrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_cgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ctrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_cgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ctrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_caxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_clarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i) magma_csetvector( n_k_i_1, A(k+i+1,i), 1, dV(i+1,i), 1 ); // Compute Y(k+1:n,i) = A vi // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i) magma_cgemv( MagmaNoTrans, n_k, n_k_i_1, c_one, dA(k,i+1), ldda, dV(i+1,i), ione, c_zero, dA(k,i), ione ); // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_C_NEGATE( tau[i]); blasf77_cgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ctrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // Y(k:n-1, i) = dA(k:n-1, i) magma_cgetvector( n-k, dA(k,i), 1, Y(k,i), 1 ); } // Restore diagonal element *A(k+nb,nb-1) = ei; return 0; } // magma_clahr2
bool VStableSolve::isVStable() { Variable x1,x2,a1,a2;//,a; // double a=10; // double _ii[2][2]={{-10,-10},{-10,10}}; // IntervalVector a(2,_ii); Function f_sup("f_sup.txt"); Function f_inf("f_inf.txt"); Function V("V.txt"); Function dV("gradV.txt"); NumConstraint c1_c(x1,x2,a1,a2,a1*dV(x1,x2)[0]+a2*dV(x1,x2)[1]>=0); NumConstraint c21_c(x1,x2,a1,a2,f_sup(x1,x2)[0]-a1>=0); NumConstraint c22_c(x1,x2,a1,a2,f_sup(x1,x2)[1]-a2>=0); NumConstraint c31_c(x1,x2,a1,a2,a1-f_inf(x1,x2)[0]>=0); NumConstraint c32_c(x1,x2,a1,a2,a2-f_inf(x1,x2)[1]>=0); NumConstraint c41_c(x1,x2,V(x1,x2)>=0); NumConstraint c42_c(x1,x2,V(x1,x2)<=v_bar); CtcFwdBwd c1(c1_c); CtcFwdBwd c21(c21_c); CtcFwdBwd c22(c22_c); CtcCompo c2(c21,c22); CtcFwdBwd c31(c31_c); CtcFwdBwd c32(c32_c); CtcCompo c3(c31, c32); CtcFwdBwd c41(c41_c); CtcFwdBwd c42(c42_c); CtcCompo c4(c41,c42); CtcCompo cOut1(c1,c2); CtcCompo cOut2(c3,c4); CtcCompo cOut(cOut1,cOut2); // Build the initial box. IntervalVector box(4); box[0]=Interval(-10,10); box[1]=Interval(-10,10); box[2]=Interval::ALL_REALS; box[3]=Interval::ALL_REALS; // Build the way boxes will be bisected. // "LargestFirst" means that the dimension bisected // is always the largest one. LargestFirst lf; stack<IntervalVector> s; s.push(box); while (!s.empty()) { // Get a copy of the current box (on top of the stack) IntervalVector box=s.top(); // qDebug() << "a[0]= [" <<box[2].lb() << "; " << box[2].ub() << "], a[1]= [" << box[3].lb() << "; " <<box[3].ub() << "]" << endl; // Remove the box from the stack s.pop(); try { // // Remove the part that is inside //// contract_and_draw(cOut,box,Qt::darkBlue,Qt::cyan); IntervalVector initbox=box; // get a copy try { // cOut.contract(box); // if (box==initbox) return; // nothing contracted. if(box!=initbox){ IntervalVector* rest; // int n=initbox.diff(box,rest); // calculate the set difference // for (int i=0; i<n; i++) { // display the boxes // frame.DrawBox(rest[i][0],rest[i][1],QPen(pencolor),QBrush(brushcolor)); // } // delete[] rest; } } catch(EmptyBoxException&) { // frame.DrawBox(initbox[0],initbox[1],QPen(pencolor),QBrush(brushcolor)); } // if (box.is_empty()) { continue; } // // Check if the box is small enough if (box.max_diam()<0.0001){//epsilon) { return false; // il y a une situation incertaine: on conclut non-V-Stable // frame.DrawBox(box[0],box[1],QPen(Qt::yellow),QBrush(Qt::yellow)); } else { // otherwise, bisect it and // push the two subboxes on the stack. pair<IntervalVector,IntervalVector> boxes=lf.bisect(box); s.push(boxes.first); s.push(boxes.second); } } catch(EmptyBoxException&) { } } return true; }
Sivia::Sivia(Frame& frame, double epsilon) : frame(frame) { // Create the function we want to apply SIVIA on. Variable x1,x2,a1,a2; Function f_sup("f_sup.txt"); Function f_inf("f_inf.txt"); Function V("V.txt"); Function dV("gradV.txt"); NumConstraint c1_c(x1,x2,a1,a2,a1*dV(x1,x2)[0]+a2*dV(x1,x2)[1]>=0); NumConstraint c21_c(x1,x2,a1,a2,f_sup(x1,x2)[0]-a1>=0); NumConstraint c22_c(x1,x2,a1,a2,f_sup(x1,x2)[1]-a2>=0); NumConstraint c31_c(x1,x2,a1,a2,a1-f_inf(x1,x2)[0]>=0); NumConstraint c32_c(x1,x2,a1,a2,a2-f_inf(x1,x2)[1]>=0); NumConstraint c41_c(x1,x2,V(x1,x2)>=0); NumConstraint c42_c(x1,x2,V(x1,x2)<=v_bar); CtcFwdBwd c1(c1_c); CtcFwdBwd c21(c21_c); CtcFwdBwd c22(c22_c); CtcCompo c2(c21,c22); CtcFwdBwd c31(c31_c); CtcFwdBwd c32(c32_c); CtcCompo c3(c31, c32); CtcFwdBwd c41(c41_c); CtcFwdBwd c42(c42_c); CtcCompo c4(c41,c42); CtcCompo cOut1(c1,c2); CtcCompo cOut2(c3,c4); CtcCompo cOut(cOut1,cOut2); NumConstraint c1_cIn(x1,x2,a1,a2,a1*dV(x1,x2)[0]+a2*dV(x1,x2)[1]>=0); NumConstraint c21_cIn(x1,x2,a1,a2,f_sup(x1,x2)[0]-a1<0); NumConstraint c22_cIn(x1,x2,a1,a2,f_sup(x1,x2)[1]-a2<0); NumConstraint c31_cIn(x1,x2,a1,a2,a1-f_inf(x1,x2)[0]<0); NumConstraint c32_cIn(x1,x2,a1,a2,a2-f_inf(x1,x2)[1]<0); NumConstraint c41_cIn(x1,x2,V(x1,x2)<0); NumConstraint c42_cIn(x1,x2,V(x1,x2)>v_bar); CtcFwdBwd c1_In(c1_cIn); CtcFwdBwd c21_In(c21_cIn); CtcFwdBwd c22_In(c22_cIn); CtcUnion c2_In(c21_In, c22_In); CtcFwdBwd c31_In(c31_cIn); CtcFwdBwd c32_In(c32_cIn); CtcUnion c3_In(c31_In, c32_In); CtcFwdBwd c41_In(c41_cIn); CtcFwdBwd c42_In(c42_cIn); CtcUnion c4_In(c41_In, c42_In); CtcUnion cIn1(c1_In, c2_In); CtcUnion cIn2(c3_In, c4_In); CtcUnion cIn(cIn1, cIn2); // Build the initial box. IntervalVector box(4); box[0]=Interval(-10,10); box[1]=Interval(-10,10); box[2]=Interval::ALL_REALS; box[3]=Interval::ALL_REALS; // Build the way boxes will be bisected. // "LargestFirst" means that the dimension bisected // is always the largest one. LargestFirst lf; stack<IntervalVector> s; s.push(box); while (!s.empty()) { // Get a copy of the current box (on top of the stack) IntervalVector box=s.top(); // qDebug() << "a[0]= [" <<box[2].lb() << "; " << box[2].ub() << "], a[1]= [" << box[3].lb() << "; " <<box[3].ub() << "]" << endl; // Remove the box from the stack s.pop(); try { // Remove the part that is outside contract_and_draw(cIn,box,Qt::magenta,Qt::red); if (box.is_empty()) { continue; } // Remove the part that is inside contract_and_draw(cOut,box,Qt::darkBlue,Qt::cyan); if (box.is_empty()) { continue; } // Check if the box is small enough if (box.max_diam()<epsilon) { frame.DrawBox(box[0],box[1],QPen(Qt::yellow),QBrush(Qt::yellow)); } else { // otherwise, bisect it and // push the two subboxes on the stack. pair<IntervalVector,IntervalVector> boxes=lf.bisect(box); s.push(boxes.first); s.push(boxes.second); } } catch(EmptyBoxException&) { } } frame.Save("paving"); }
void ChromoConditions::recalculateSSConcentrations() { mSSConcentrations.clear(); if (gradient().empty()) { return; } // If the gradient is isocratic, add a single point only. if ((gradient().size() == 2) && (gradient().front().concentrationB() == gradient().back().concentrationB())) { mSSConcentrations.push_back( (100.0 - gradient().front().concentrationB()) / 100.0 * secondSolventConcentrationA() + gradient().front().concentrationB() / 100.0 * secondSolventConcentrationB()); return; } double secondSolventConcentrationPump = 0.0; double time = dV() / 2.0 / flowRate(); int segmentNum = -1; double localSlope = 0.0; double initialSSConcentration, finalSSConcentration, initialTime, finalTime, pumpedConcentration; while (true) { if (time > gradient()[segmentNum+1].time()) { if (segmentNum < int(gradient().size() - 1)) { segmentNum += 1; initialSSConcentration = (100.0 - gradient()[segmentNum].concentrationB()) / 100.0 * secondSolventConcentrationA() + gradient()[segmentNum].concentrationB() / 100.0 * secondSolventConcentrationB(); finalSSConcentration = (100.0 - gradient()[segmentNum+1].concentrationB()) / 100.0 * secondSolventConcentrationA() + gradient()[segmentNum+1].concentrationB() / 100.0 * secondSolventConcentrationB(); initialTime = gradient()[segmentNum].time(); finalTime = gradient()[segmentNum+1].time(); localSlope = (finalSSConcentration - initialSSConcentration) / (finalTime - initialTime); } else { break; } } pumpedConcentration = localSlope * (time - initialTime) + initialSSConcentration; // If mixingCorrection is enabled calculate second solvent concentrations // from the equation // d[SS] / dt = flowRate / (V0 + VP) * ([SS]pump - [SS]) // Otherwise, [SS] == [SS]pump if (mixingCorrection()) { // Special case for the first time point. if (time == dV() / 2.0 / flowRate()) { mSSConcentrations.push_back( initialSSConcentration + + dV() / (columnInterstitialVolume() + columnPoreVolume()) * (pumpedConcentration - mSSConcentrations.back()) / 2.0); } else { mSSConcentrations.push_back( mSSConcentrations.back() + dV() / (columnInterstitialVolume() + columnPoreVolume()) * (pumpedConcentration - mSSConcentrations.back())); } } else { mSSConcentrations.push_back(pumpedConcentration); } time += dV() / flowRate(); } }
//----------------------------------------------------------------------- void WJordanSolver::IClamp() { m_V[0] = -0.075; m_Ca[0] = 0; for(int i = 0; i <= 4; ++i) m_gatevars[i] = 1 / ( 1 + exp( ( m_vhalf_x[i] - m_V[0] ) / m_k_x[i] ) ); m_gatevars[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - m_Ca[0] ) / m_k_x[5] ) ); // Now do main loop for( unsigned int i = 1; i < m_numpoints; ++i ) { float k1[7], k2[7], k3[7], k4[7]; float h_2 = m_deltat / 2; float tempgate[6]; float tempCa, tempV; //step 1 - get k1 values for(int j = 0; j <= 4; ++j){ k1[j] = dx(m_gatevars[j], m_V[i-1], m_T_x[j], m_vhalf_x[j], m_k_x[j]); } k1[5] = dCa( m_gxs, m_gatevars, m_vrevs, m_alphaCa, m_V[i-1], m_TCa, m_thiCa, m_Ca[i-1] ); k1[6] = dV( m_gxs, m_gatevars, m_vrevs, m_alphaCa, m_I[i], m_Cm, m_V[i-1]); //step 2 - get k2 values tempV = m_V[i-1] + k1[6] * h_2; tempCa = m_Ca[i-1] + k1[5] * h_2; for(int j = 0; j <= 4; ++j){ tempgate[j] = (m_gatevars[j] + k1[j] * h_2); k2[j] = dx(tempgate[j], tempV, m_T_x[j], m_vhalf_x[j], m_k_x[j]); } tempgate[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa) / m_k_x[5] ) ); k2[5] = dCa( m_gxs, tempgate, m_vrevs, m_alphaCa, tempV, m_TCa, m_thiCa, tempCa ); k2[6] = dV (m_gxs, tempgate, m_vrevs, m_alphaCa, m_I[i], m_Cm, tempV ); //step 3 - get k3 values tempV = m_V[i-1] + k2[6] * h_2; tempCa = m_Ca[i-1] + k2[5] * h_2; for(int j = 0; j <= 4; ++j){ tempgate[j] = (m_gatevars[j] + k2[j] * h_2); k3[j] = dx(tempgate[j], tempV, m_T_x[j], m_vhalf_x[j], m_k_x[j]); } tempgate[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa ) / m_k_x[5] ) ); k3[5] = dCa( m_gxs, tempgate, m_vrevs, m_alphaCa, tempV, m_TCa, m_thiCa, tempCa ); k3[6] = dV( m_gxs, tempgate, m_vrevs, m_alphaCa, m_I[i], m_Cm, tempV ); //step 4 - get k4 values tempV = m_V[i-1] + k3[6] * h_2 * 2; tempCa = m_Ca[i-1] + k3[5] * h_2 * 2; for(int j = 0; j <= 4; ++j){ tempgate[j] = ( m_gatevars[j] + k3[j] * h_2 * 2 ); k4[j] = dx(tempgate[j], tempV, m_T_x[j], m_vhalf_x[j], m_k_x[j]); } tempgate[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa ) / m_k_x[5] ) ); k4[5] = dCa( m_gxs, tempgate, m_vrevs, m_alphaCa, tempV, m_TCa, m_thiCa, tempCa ); k4[6] = dV( m_gxs, tempgate, m_vrevs, m_alphaCa, m_I[i], m_Cm, tempV ); // now we must combine them to get final value for(int j = 0; j <= 4; ++j){ m_gatevars[j] += ( m_deltat / 6 ) * ( k1[j] + 2 * k2[j] + 2 * k3[j] + k4[j] ); } m_Ca[i] = m_Ca[i-1] + ( m_deltat / 6 ) * ( k1[5] + 2 * k2[5] + 2 * k3[5] + k4 [5] ); m_gatevars[5] = 1 / ( 1 + exp( ( m_vhalf_x[5] - tempCa ) / m_k_x[5] ) ); m_V[i] = m_V[i-1] + ( m_deltat / 6 ) * ( k1[6] + 2 * k2[6] + 2 * k3[6] + k4[6] ); } return; }
void Integration::SODE() { const int NOR = Global::NOR; double eps = Global::eps; double t0 = Global::t0; double te = Global::te; double step = Global::step; double SVi[6]; for (int i = 0; i < 6; i++) SVi[i] = Global::SV(i, 0); char lineRad[300]; vector<double> Rado(NOR); vector<double> h(NOR); vector<vector<double> > C(NOR, vector<double>(NOR)); vector<triple > X1(NOR); vector<triple > V1(NOR); //vector<triple > Fi(NOR); vector <triple> A1(NOR), A2(NOR); vector<vector<double>> Dh(NOR, vector<double>(NOR)); vector<triple> Alp(NOR); vector<triple> dF(NOR); double tout = t0; triple F0, X(SVi[0], SVi[1], SVi[2]), V(SVi[3], SVi[4], SVi[5]), Fi; FILE*f = fopen("Radau.txt", "r"); fosv = fopen("sv_J2000.out", "w"); //fprintf(fosv,"year month day hms(UTC) TDB(sec) interval(days) X Y Z Vx Vy Vz \n"); foel = fopen("elts_J2000.out", "w"); // fprintf(foel,"year month day hms(UTC) TDB(sec) interval(days) A E I NODE W M \n"); foSvEcl = fopen("sv_ECLIPJ2000.out", "w"); foelEcl = fopen("elts_ECLIPJ2000.out", "w"); fosvR = fopen("sv_IAUplanet.out", "w"); // fprintf(fosvR,"year month day hms(UTC) TDB(sec) interval(days) X Y Z Vx Vy Vz \n"); foBL = fopen("BL.out", "w"); // fprintf(foBL,"year month day hms(UTC) TDB(sec) interval(days) L B H \n"); foNEU = fopen("NEU.out", "w"); // fprintf(foNEU,"year month day hms(UTC) TDB(sec) interval(days) N E U \n"); fvisi = fopen("visibility.out", "w"); fo3bg = fopen("3body_geodetic.out", "w"); //Ќј’ќ∆ƒ≈Ќ»≈ ƒќЋ√ќ“џ ¬ќ—’ќƒяў≈√ќ ”«Ћј Ќј “0 // if(Global::b_out_elts_planet || Global::b_out_sv_planet || Global::b_out_el_IAUPlanet==true){ double poss[6]; double lt1, dlt; triple Zorb; double Zpl[3]; double TimeNode = t0; spkacs_c(Global::IDC, t0, "J2000", "NONE", 10, poss, <1, &dlt); triple Xv = triple(poss[0], poss[1], poss[2]); triple Vv = triple(poss[3], poss[4], poss[5]); Xv = Xv / Xv.getAbs(); Vv = Vv / Vv.getAbs(); Zorb = Xv&Vv / sin(triple::getAngle(Xv, Vv)); double Z[3] = { Zorb[0], Zorb[1], Zorb[2] }; trpos(t0, 1, Global::IDC, Z, Zpl); triple ZorbP = triple(Zpl); triple PolP = triple(0.0, 0.0, 1.0); triple Node = ZorbP&PolP; Node = Node / Node.getAbs(); double NodeA = atan2(Node[1], Node[0]); if (NodeA < 0.0) NodeA = NodeA + 2 * pi; // ѕЋјЌ≈“ќ÷≈Ќ“–»„≈— јя √–ј¬»“ј÷»ќЌЌјя ѕќ—“ќяЌЌјя double mu = Global::mu; int ii = 1; int jj = 0; while (!feof(f)) { fscanf(f, "%s\n", lineRad); if (ii > Misc::sum(NOR - 1) && ii <= Misc::sum(NOR)) { Rado[jj] = atof(lineRad); jj++; } else {} ii++; } //интегрирование назад if (te < t0) { step = -step; Global::Discr = -Global::Discr; } //System::Threading::Thread Worker ^ = gcnew System::Threading::Thread(delegate() { }); //собственно, сам интегртор: while (abs(te - t0) != 0e0) { if (abs(te - t0) < 1e-12) break; if (step > 0) { if (t0 + step > te) step = te - t0; } else { if (t0 + step < te) step = te - t0; } for (int i = 0; i < NOR; i++) h[i] = step*Rado[i]; for (int k = 0; k < NOR; k++) { Dh[k][k] = h[k]; for (int j = 0; j < k; j++) { Dh[k][j] = h[k] - h[j]; } } //числа Cтирлинга for (int i = 0; i < NOR; i++) { C[i][i] = 1.0; if (i > 0) C[i][0] = -h[i - 1] * C[i - 1][0]; for (int j = 1; j < i; j++) C[i][j] = C[i - 1][j - 1] - h[i - 1] * C[i - 1][j]; } ////////////////////////////////////////////////////// F0 = Force::force_SODE(t0, X, V); for (int j = 0; j < NOR; j++) { X1[j] = X + V*h[j] + F0*h[j] * h[j] / 2; V1[j] = V + F0*h[j]; /////////////////////////////////////////////// Fi = Force::force_SODE(t0 + h[j], X1[j], V1[j]); dF[j] = Fi - F0; } Alp[0] = dF[0] / h[0]; for (int k = 1; k < NOR; k++) { Alp[k] = dF[k] / h[k]; for (int j = 0; j < k; j++) { Alp[k] = (Alp[k] - Alp[j]) / Dh[k][j]; } } for (int k = 0; k < NOR; k++) { A1[k] = triple(0., 0., 0.); for (int i = k; i < NOR; i++) A1[k] = A1[k] + Alp[i] * C[i][k]; } int ij = 0; for (;;) { for (int k = 0; k < NOR; k++) { triple v2 = A1[0] * 0.5; triple x2 = A1[0] * (1.0 / 6.0); for (int i = 1; i < NOR; i++) { v2 = v2 + A1[i] * (pow(h[k], i) / (i + 2)); x2 = x2 + A1[i] * (pow(h[k], i) / (i + 2) / (i + 3)); } v2 = V1[k] + v2*(pow(h[k], 2)); x2 = X1[k] + x2*(pow(h[k], 3)); ////////////////////////////////////////////// Fi = Force::force_SODE(t0 + h[k], x2, v2); dF[k] = Fi - F0; } Alp[0] = dF[0] / h[0]; for (int k = 1; k < NOR; k++) { Alp[k] = dF[k] / h[k]; for (int j = 0; j < k; j++) { Alp[k] = (Alp[k] - Alp[j]) / Dh[k][j]; } } for (int k = 0; k < NOR; k++) { A2[k] = triple(0.0, 0., 0.); for (int i = k; i < NOR; i++) A2[k] = A2[k] + Alp[i] * C[i][k]; } vector <triple> dV(NOR); double max = 0; vector<double> mdV(NOR); for (int i = 0; i < NOR; i++) { dV[i] = A2[i] - A1[i]; mdV[i] = dV[i].getAbs(); if (max <abs( mdV[i])) max = abs(mdV[i]); } if (max < eps && ij>2) goto exit; if (ij > 7) goto exit; A1 = A2; ij++; } exit: triple dv = A1[0] * 0.5; triple dx = A1[0] * (1.0 / 6.0); for (int i = 1; i < NOR; i++) { dv = dv + A1[i] * (pow(step, i) / (i + 2.0)); dx = dx + A1[i] * (pow(step, i) / (i + 2.0) / (i + 3.0)); } //запись результатов: //int ik=int(t0-ts)%Global::Discr; if (Global::Discr != 0) { while (abs(tout - t0) < abs(step)) { double hout = tout - t0; triple dvo = A1[0] * 0.5; triple dxo = A1[0] * (1.0 / 6.0); for (int i = 1; i < NOR; i++) { dvo = dvo + A1[i] * (pow(hout, i) / (i + 2.0)); dxo = dxo + A1[i] * (pow(hout, i) / (i + 2.0) / (i + 3.0)); } triple Xout = X + V*hout + F0*(hout*hout / 2.0) + dxo*(hout*hout*hout); triple Vout = V + F0*hout + dvo*(hout*hout); write(tout, Xout, Vout); tout += Global::Discr; } } X = X + V*step + F0*(step*step / 2.0) + dx*(step*step*step); V = V + F0*step + dv*(step*step); t0 += step; }; write(t0, X, V); fclose(fosv); fclose(foel); fclose(foSvEcl); fclose(foelEcl); fclose(fosvR); //fclose(foelR); //fclose(fosvp); //fclose(foelp); fclose(foBL); fclose(foNEU); fclose(fvisi); fclose(fo3bg); };
extern "C" void magma_zbulge_applyQ( magma_int_t WANTZ, magma_side_t SIDE, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaDoubleComplex *E, magma_int_t LDE, magmaDoubleComplex *V, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t *INFO, magmaDoubleComplex *dV, magmaDoubleComplex *dT, magmaDoubleComplex *dE, magma_int_t copytype ) { //%=========================== //% local variables //%=========================== magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t LDT, LDV, firstcolj; magma_int_t bg, nbGblk, rownbm, k, m, n; magma_int_t st, ed, fst, vlen, vnb, colj, len; magma_int_t blkid, vpos, taupos, tpos; //magmaDoubleComplex *WORK; magma_int_t LWORK; magma_int_t cur_blksiz, avai_blksiz, ncolinvolvd; magma_int_t nbgr, colst, coled, versionL, versionR; magma_int_t blkcnt=-1; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *INFO=0; versionL = 113; versionR = 92; LDT = Vblksiz; LDV = NB+Vblksiz-1; //blklen = LDV*Vblksiz; nbGblk = plasma_ceildiv((N-1), Vblksiz); //magma_zmalloc_cpu( &WORK, LWORK ); /* find the size of the matrix T V*/ findVTsiz(N, NB, Vblksiz, &blkcnt, &LDV); /* Copy E & V & T to the GPU in dE and dV and dT * depending on copytype: * 1: mean copy only V * 2: mean copy V and T * 3: mean copy V, T and E * */ if (copytype > 0) magma_zsetmatrix( LDV, blkcnt*Vblksiz, V, LDV, dV, LDV ); if (copytype > 1) magma_zsetmatrix( LDT, blkcnt*Vblksiz, T, LDT, dT, LDT ); if (copytype > 2) magma_zsetmatrix( N, NE, E, N, dE, N ); magmaDoubleComplex *dwork; //ldwork = NE; LWORK = 2*N*max(Vblksiz, 64); if (MAGMA_SUCCESS != magma_zmalloc( &dwork, LWORK )) { printf ("!!!! magma_zbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * Also E is splitten by col meaning each apply consist in a block of col (vertical block) */ /* WANTZ = 1 meaning E is IDENTITY so form Q using optimized update. * So we use the reverse order from small q to large one, * so from q_n to q_1 so Left update to Identity. * Use versionL 113 because in 114 we need to update the whole matrix and not in icreasing order. * WANTZ = 2 meaning E is a full matrix and need to be updated from Left or Right so use normal update * */ if (WANTZ == 1) { versionL=113; SIDE = MagmaLeft; //set the matrix to Identity here to avoid copying it from the CPU magmablas_zlaset( MagmaFull, N, N, c_zero, c_one, dE, N ); } printf(" APPLY Q_v115 GPU with N %d NB %d Vblksiz %d SIDE %c versionL %d versionR %d WANTZ %d \n", (int) N, (int) NB, (int) Vblksiz, SIDE, (int) versionL, (int) versionR, (int) WANTZ); #if defined(USESTREAM) magma_int_t N2=N/2; magma_int_t N1=N-N2; printf("using stream\n"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); #endif if (SIDE == MagmaLeft) { if (versionL == 113) { for (bg = nbGblk; bg > 0; bg--) { firstcolj = (bg-1)*Vblksiz + 1; if (bg == nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)), NB); // last blk has size=1 used for complex to handle A(N,N-1) else rownbm = plasma_ceildiv((N-(firstcolj+1)), NB); for (m = rownbm; m > 0; m--) { vlen = 0; vnb = 0; colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop fst = (rownbm -m)*NB+colj +1; for (k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=k+1; } colst = (bg-1)*Vblksiz; findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", (int) bg, (int) m, (int) vlen, (int) vnb, (int) colst+1, (int) vpos+1, (int) taupos+1); if ((vlen > 0) && (vnb > 0)) { if (WANTZ == 1) { len = N-colst; magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, len, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,colst), LDE, dwork, len); } else { magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE); } } } } } else if (versionL == 114) { rownbm = plasma_ceildiv((N-1), NB); for (m = rownbm; m > 0; m--) { ncolinvolvd = min(N-1, m*NB); avai_blksiz=min(Vblksiz, ncolinvolvd); nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz); for (n = nbgr; n > 0; n--) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz); colst = (n-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -m)*NB+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=vnb+1; } findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colst+1, vpos+1, taupos+1); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N1, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, N2, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,N1), LDE, &dwork[N1*Vblksiz], N2); #else magma_zlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, vlen, NE, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(fst,0), LDE, dwork, NE); #endif } } } } } else if (SIDE == MagmaRight) { if (versionR == 91) { for (bg =1; bg <= nbGblk; bg++) { firstcolj = (bg-1)*Vblksiz + 1; rownbm = plasma_ceildiv((N-(firstcolj+1)), NB); if (bg == nbGblk) rownbm = plasma_ceildiv((N-(firstcolj)), NB); // last blk has size=1 used for complex to handle A(N,N-1) for (m = 1; m <= rownbm; m++) { vlen = 0; vnb = 0; // for k=0; I compute the fst and then can remove it from the loop colj = (bg-1)*Vblksiz; fst = (rownbm -m)*NB+colj +1; for (k=0; k < Vblksiz; k++) { colj = (bg-1)*Vblksiz + k; st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=k+1; } colj = (bg-1)*Vblksiz; findVTpos(N, NB, Vblksiz, colj, fst, &vpos, &taupos, &tpos, &blkid); //printf("voici bg %d m %d vlen %d vnb %d fcolj %d vpos %d taupos %d \n", bg, m, vlen, vnb, colj, vpos, taupos); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2); #else magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE); #endif } } } } else if (versionR == 92) { rownbm = plasma_ceildiv((N-1), NB); for (m = 1; m <= rownbm; m++) { ncolinvolvd = min(N-1, m*NB); avai_blksiz=min(Vblksiz, ncolinvolvd); nbgr = plasma_ceildiv(ncolinvolvd, avai_blksiz); for (n = 1; n <= nbgr; n++) { vlen = 0; vnb = 0; cur_blksiz = min(ncolinvolvd-(n-1)*avai_blksiz, avai_blksiz); colst = (n-1)*avai_blksiz; coled = colst + cur_blksiz -1; fst = (rownbm -m)*NB+colst +1; for (colj=colst; colj <= coled; colj++) { st = (rownbm -m)*NB+colj +1; ed = min(st+NB-1, N-1); if (st > ed) break; if ((st == ed) && (colj != N-2)) break; vlen=ed-fst+1; vnb=vnb+1; } findVTpos(N, NB, Vblksiz, colst, fst, &vpos, &taupos, &tpos, &blkid); if ((vlen > 0) && (vnb > 0)) { #if defined(USESTREAM) magmablasSetKernelStream(stream[0]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N1, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, N1); magmablasSetKernelStream(stream[1]); magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, N2, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(N1, fst), LDE, &dwork[N1*Vblksiz], N2); #else magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, vlen, vnb, dV(vpos), LDV, dT(tpos), LDT, dE(0, fst), LDE, dwork, NE); #endif } } } } } else { printf("ERROR SIDE %d\n", SIDE); } #if defined(USESTREAM) magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); #endif magmablasSetKernelStream( orig_stream ); }
/** Purpose ------- CUNMLQ overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complexunitary matrix defined as the product of k elementary reflectors Q = H(k)**H . . . H(2)**H H(1)**H as returned by CGELQF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX array, dimension (LDA,M) if SIDE = MagmaLeft, (LDA,N) if SIDE = MagmaRight. The i-th row must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGELQF in the first k rows of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,K). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGELQF. @param[in,out] C COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgelqf_comp ********************************************************************/ extern "C" magma_int_t magma_cunmlq( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define dV(i_,j_) (dV + (i_) + (j_)*ib) #define dT(i_,j_) (dT + (i_) + (j_)*ib) #define dwork(i_) (dwork + (i_)) magmaFloatComplex *T, *T2; magma_int_t i, i1, i2, ib, ic, jc, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; magma_int_t left, notran, lquery; magma_trans_t transt; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,k)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_cgelqf_nb( min( m, n )); lwkopt = max(1,nw)*nb; work[0] = MAGMA_C_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_C_ONE; return *info; } ldwork = nw; if (nb >= k) { /* Use CPU code */ lapackf77_cunmlq( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = ((m+31)/32)*32; magmaFloatComplex_ptr dwork, dV, dT, dC; magma_cmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_cmalloc_cpu( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, C, ldc, dC(0,0), lddc ); if ( (left && notran) || (! left && ! notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb)*nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 0; } else { mi = m; ic = 0; } if (notran) { transt = Magma_ConjTrans; } else { transt = MagmaNoTrans; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i) H(i + 1) . . . H(i + ib-1) */ nq_i = nq - i; lapackf77_clarft("Forward", "Rowwise", &nq_i, &ib, A(i,i), &lda, &tau[i], T, &ib); /* 1) set upper triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ cpanel_to_q( MagmaLower, ib, A(i,i), lda, T2 ); magma_csetmatrix( ib, nq_i, A(i,i), lda, dV(0,0), ib ); cq_to_panel( MagmaLower, ib, A(i,i), lda, T2 ); if (left) { /* H or H**H is applied to C(i:m,1:n) */ mi = m - i; ic = i; } else { /* H or H**H is applied to C(1:m,i:n) */ ni = n - i; jc = i; } /* Apply H or H**H; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dT(0,0), ib ); magma_clarfb_gpu( side, transt, MagmaForward, MagmaRowwise, mi, ni, ib, dV(0,0), ib, dT(0,0), ib, dC(ic,jc), lddc, dwork(0), ldwork ); } magma_cgetmatrix( m, n, dC(0,0), lddc, C, ldc ); magma_free( dwork ); magma_free_cpu( T ); } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); return *info; } /* magma_cunmlq */
/** Purpose ------- CLAHRU is an auxiliary MAGMA routine that is used in CGEHRD to update the trailing sub-matrices after the reductions of the corresponding panels. See further details below. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ihi INTEGER Last row to update. Same as IHI in cgehrd. @param[in] k INTEGER Number of rows of the matrix Am (see details below) @param[in] nb INTEGER Block size @param[out] A COMPLEX array, dimension (LDA,N-K) On entry, the N-by-(N-K) general matrix to be updated. The computation is done on the GPU. After Am is updated on the GPU only Am(1:NB) is transferred to the CPU - to update the corresponding Am matrix. See Further Details below. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- This implementation follows the algorithm and notations described in: S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. The difference is that here Am is computed on the GPU. M is renamed Am, G is renamed Ag. @ingroup magma_cgeev_aux ********************************************************************/ extern "C" magma_int_t magma_clahru_m( magma_int_t n, magma_int_t ihi, magma_int_t k, magma_int_t nb, magmaFloatComplex *A, magma_int_t lda, struct cgehrd_data* data ) { #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dW( d, i, j ) (data->W [d] + (i) + (j)*ldda) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t d; magma_int_t dk, dkhi, dknb, dn; magma_int_t info = 0; if (n < 0) { info = -1; } else if (ihi < 0 || ihi > n) { info = -2; } else if (k < 0 || k > n) { info = -3; } else if (nb < 1 || nb > n) { info = -4; } else if (lda < max(1,n)) { info = -6; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k, ihi, &dk, &dkhi ); magma_indices_1D_bcyclic( nb, ngpu, d, k+nb, n, &dknb, &dn ); // ----- // on right, A := A Q = A - A V T V' // Update Am = Am - Am V T Vd' = Am - Ym Wd', with Wd = Vd T' // Wd = Vd T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)' // Vd and Wd are the portions corresponding to the block cyclic dkstribution magma_cgemm( MagmaNoTrans, MagmaConjTrans, dkhi-dk, nb, nb, c_one, dVd(d, dk, 0), ldvd, dTi(d), nb, c_zero, dW (d, dk, 0), ldda ); // Am = Am - Ym Wd' = A(0:k-1, k:ihi-1) - Ym(0:k-1, 0:nb-1) * W(k:ihi-1, 0:nb-1)' magma_cgemm( MagmaNoTrans, MagmaConjTrans, k, dkhi-dk, nb, c_neg_one, dY(d, 0, 0), ldda, dW(d, dk, 0), ldda, c_one, dA(d, 0, dk), ldda ); // ----- // on right, A := A Q = A - A V T V' // Update Ag = Ag - Ag V T V' = Ag - Yg Wd' // Ag = Ag - Yg Wd' = A(k:ihi-1, nb:ihi-k-1) - Y(k:ihi-1, 0:nb-1) * W(k+nb:ihi-1, 0:nb-1)' magma_cgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, dkhi-dknb, nb, c_neg_one, dY(d, k, 0), ldda, dW(d, dknb, 0), ldda, c_one, dA(d, k, dknb), ldda ); // ----- // on left, A := Q' A = A - V T' V' A // Ag2 = Ag2 - V T' V' Ag2 = W Yg, with W = V T' and Yg = V' Ag2 // Note that Ag is A(k:ihi, nb+1:ihi-k) // while Ag2 is A(k:ihi, nb+1: n -k) // here V and W are the whole matrices, not just block cyclic portion // W = V T' = V(k:ihi-1, 0:nb-1) * T(0:nb-1, 0:nb-1)' // TODO would it be cheaper to compute the whole matrix and // copy the block cyclic portions to another workspace? magma_cgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, nb, nb, c_one, dV (d, k, 0), ldv, dTi(d), nb, c_zero, dW (d, k, 0), ldda ); // Z = V(k:ihi-1, 0:nb-1)' * A(k:ihi-1, nb:n-k-1); Z is stored over Y magma_cgemm( MagmaConjTrans, MagmaNoTrans, nb, dn-dknb, ihi-k, c_one, dV(d, k, 0), ldv, dA(d, k, dknb), ldda, c_zero, dY(d, 0, 0), nb ); // Ag2 = Ag2 - W Z = A(k:ihi-1, k+nb:n-1) - W(k+nb:n-1, 0:nb-1) * Z(0:nb-1, k+nb:n-1) magma_cgemm( MagmaNoTrans, MagmaNoTrans, ihi-k, dn-dknb, nb, c_neg_one, dW(d, k, 0), ldda, dY(d, 0, 0), nb, c_one, dA(d, k, dknb), ldda ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return info; }
extern "C" magma_int_t magma_zlahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *tau, magmaDoubleComplex *T, magma_int_t ldt, magmaDoubleComplex *Y, magma_int_t ldy, struct zgehrd_data* data ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZLAHR2 reduces the first NB columns of a complex general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by ZGEHRD. Arguments ========= N (input) INTEGER The order of the matrix A. K (input) INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. NB (input) INTEGER The number of columns to be reduced. A (input/output) COMPLEX_16 array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX_16 array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. T (output) COMPLEX_16 array, dimension (LDT,NB) The upper triangular matrix T. LDT (input) INTEGER The leading dimension of the array T. LDT >= NB. Y (output) COMPLEX_16 array, dimension (LDY,NB) The n-by-nb matrix Y. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements in rows K:N of the first NB columns are overwritten with the matrix Y. DV (output) COMPLEX_16 array on the GPU, dimension (N, NB) On exit this contains the Householder vectors of the transformation. Further Details =============== The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; magmaDoubleComplex scale; magma_int_t i; magmaDoubleComplex ei = MAGMA_Z_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_zlaset( MagmaUpperLower, nb, nb, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_zlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_Z_NEGATE( tau[i-1] ); blasf77_zaxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_zcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_ztrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_zgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_ztrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_zgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_ztrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_zaxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_zlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_zsetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_zlacpy( 'F', nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb , i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_zgemv( 'N', n-k, dn-dki1, c_one, dA (d, k , dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k , i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_zgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_Z_NEGATE( tau[i] ); blasf77_zgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_ztrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. // This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If complex, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_zgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_zlacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_zgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_zaxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_zgemm( 'N', 'N', k, nb, dn-dki1, c_one, dA (d, 0 , dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0 , 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_zgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_zaxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_zsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_zsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } return 0; } // magma_zlahr2