/** Purpose ------- DGEHRD2 reduces a DOUBLE PRECISION general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to DGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) DOUBLE PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_dgeev_comp ********************************************************************/ extern "C" magma_int_t magma_dgehrd2( magma_int_t n, magma_int_t ilo, magma_int_t ihi, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda*2) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dV(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda) #define dwork(i_) dwork, ((i_)) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dV(i_,j_) (dV + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) #endif // Constants const double c_one = MAGMA_D_ONE; const double c_zero = MAGMA_D_ZERO; // Local variables magma_int_t nb = magma_get_dgehrd_nb( n ); magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t i, nh, iws; magma_int_t iinfo; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = magma_dmake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // GPU workspace is: // nb*ldda for dwork for dlahru // nb*ldda for dV // n*ldda for dA // nb*nb for dT magmaDouble_ptr dwork; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } double *dV = dwork + nb*ldda; double *dA = dwork + nb*ldda*2; double *dT = dwork + nb*ldda*2 + n*ldda; double *T; magma_dmalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; assert( nb % 4 == 0 ); for (i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; // Copy the matrix to the GPU magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_dgetmatrix( ihi-i, nb, dA(i,i-ilo), ldda, A(i,i), lda, queue ); // add 1 to i for 1-based index magma_dlahr2( ihi, i+1, nb, dA(0,i-ilo), ldda, dV(0,0), ldda, A(0,i), lda, &tau[i], T, nb, work, n, queue ); // Copy T from the CPU to dT on the GPU magma_dsetmatrix( nb, nb, T, nb, dT(0,0), nb, queue ); magma_dlahru( n, ihi, i, nb, A(0,i), lda, dA(0,i-ilo), ldda, // dA dA(i,i-ilo), ldda, // dY, stored over current panel dV(0,0), ldda, dT(0,0), dwork, queue ); } // Copy remainder to host magma_dgetmatrix( n, n-i, dA(0,i-ilo), ldda, A(0,i), lda, queue ); magma_free( dwork ); magma_free_cpu( T ); magma_queue_destroy( queue ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_dmake_lwork( iws ); return *info; } /* magma_dgehrd2 */
extern "C" magma_int_t magma_dpidr_merge( magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_n_one = MAGMA_D_NEG_ONE; // internal user parameters const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const double angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; double residual; double nrm; double nrmb; double nrmr; double nrmt; double rho; double om; double gamma; double fk; // matrices and vectors magma_d_matrix dxs = {Magma_CSR}; magma_d_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_d_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_d_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_d_matrix dU = {Magma_CSR}; magma_d_matrix dM = {Magma_CSR}, hMdiag = {Magma_CSR}; magma_d_matrix df = {Magma_CSR}; magma_d_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_d_matrix dc = {Magma_CSR}; magma_d_matrix dv = {Magma_CSR}; magma_d_matrix dlu = {Magma_CSR}; magma_d_matrix dskp = {Magma_CSR}, hskp = {Magma_CSR}; magma_d_matrix dalpha = {Magma_CSR}, halpha = {Magma_CSR}; magma_d_matrix dbeta = {Magma_CSR}, hbeta = {Magma_CSR}; double *d1 = NULL, *d2 = NULL; // chronometry real_Double_t tempo1, tempo2; // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_dnrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_dscal( x->num_rows, MAGMA_D_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_dvinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_dvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_dresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_dvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_dlarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_dmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_dmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_dqr(P1), QR factorization CHECK( magma_dqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_dnrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_dscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_dmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_dmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_dvinit( &hskp, Magma_CPU, 4, 1, c_zero, queue )); CHECK( magma_dvinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_dvinit( &halpha, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_dvinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_dvinit( &hbeta, Magma_CPU, s, 1, c_zero, queue )); CHECK( magma_dvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_dmalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_dmalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_dmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_dvinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_dvinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_dcopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_dvinit( &dG, Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_dvinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_dvinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_dvinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_dvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_dvinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_dvinit( &hMdiag, Magma_CPU, s, 1, c_zero, queue )); magmablas_dlaset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_dvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_dvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = 0 CHECK( magma_dvinit( &dv, Magma_DEV, dr.num_rows, 1, c_zero, queue )); // lu = 0 CHECK( magma_dvinit( &dlu, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_D_ONE; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r magma_dgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queue ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; // c(k:s) = M(k:s,k:s) \ f(k:s) magma_dcopyvector( sk, &df.dval[k], 1, &dc.dval[k], 1, queue ); magma_dtrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk, &dM.dval[k*dM.ld+k], dM.ld, &dc.dval[k], 1, queue ); // v = r - G(:,k:s) c(k:s) magma_dcopyvector( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); magmablas_dgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, &dG.dval[k*dG.ld], dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // U(:,k) = om * v + U(:,k:s) c(k:s) magmablas_dgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queue ); magma_dcopyvector( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queue ); // G(:,k) = A U(:,k) dGcol.dval = dG.dval + k * dG.ld; CHECK( magma_d_spmv( c_one, A, dv, c_zero, dGcol, queue )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) halpha.val[i] = magma_ddot( dP.num_rows, &dP.dval[i*dP.ld], 1, &dG.dval[k*dG.ld], 1, queue ); // alpha = alpha / M(i,i) halpha.val[i] = halpha.val[i] / hMdiag.val[i]; // G(:,k) = G(:,k) - alpha * G(:,i) magma_daxpy( dG.num_rows, -halpha.val[i], &dG.dval[i*dG.ld], 1, &dG.dval[k*dG.ld], 1, queue ); } // non-first s iteration if ( k > 0 ) { // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) magma_dsetvector( k, halpha.val, 1, dalpha.dval, 1, queue ); magmablas_dgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, &dU.dval[k*dU.ld], 1, queue ); } // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) magma_dgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], &dG.dval[k*dG.ld], d1, d2, &dM.dval[k*dM.ld+k], queue ); magma_dgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag.val[k], 1, queue ); // check M(k,k) == 0 if ( MAGMA_D_EQUAL(hMdiag.val[k], MAGMA_D_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) magma_dgetvector( 1, &df.dval[k], 1, &fk, 1, queue ); hbeta.val[k] = fk / hMdiag.val[k]; // check for nan if ( magma_d_isnan( hbeta.val[k] ) || magma_d_isinf( hbeta.val[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // r = r - beta * G(:,k) magma_daxpy( dr.num_rows, -hbeta.val[k], &dG.dval[k*dG.ld], 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // |r| nrmr = magma_dnrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // x = x + beta * U(:,k) magma_daxpy( x->num_rows, hbeta.val[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queue ); // smoothing operation //--------------------------------------- // t = rs - r magma_didr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queue ); // t't // t'rs CHECK( magma_dgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queue )); magma_dgetvector( 2, &dskp.dval[2], 1, &hskp.val[2], 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = hskp.val[3] / hskp.val[2]; // rs = rs - gamma * (rs - r) magma_daxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_didr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queue ); // |rs| nrmr = magma_dnrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) magma_daxpy( sk-1, -hbeta.val[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queue ); } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // update solution approximation x // x = x + U(:,1:s) * beta(1:s) magma_dsetvector( s, hbeta.val, 1, dbeta.dval, 1, queue ); magmablas_dgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queue ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // v = r magma_dcopy( dr.num_rows, dr.dval, 1, dv.dval, 1, queue ); // preconditioning operation // v = L \ v; // v = U \ v; CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queue )); CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queue )); // t = A v CHECK( magma_d_spmv( c_one, A, dv, c_zero, dt, queue )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // t't // t'r CHECK( magma_dgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queue )); magma_dgetvector( 2, dskp.dval, 1, hskp.val, 1, queue ); // |t| nrmt = magma_dsqrt( MAGMA_D_REAL(hskp.val[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_D_REAL(hskp.val[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp.val[1] / hskp.val[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_D_EQUAL(om, MAGMA_D_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // update approximation vector // x = x + om * v magma_daxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queue ); // update residual vector // r = r - om * t magma_daxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queue ); // smoothing disabled if ( smoothing <= 0 ) { // residual norm nrmr = magma_dnrm2( dr.num_rows, dr.dval, 1, queue ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r magma_didr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queue ); // t't // t'rs CHECK( magma_dgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queue )); magma_dgetvector( 2, &dskp.dval[2], 1, &hskp.val[2], 1, queue ); // gamma = (t' * rs) / (t' * t) gamma = hskp.val[3] / hskp.val[2]; // rs = rs - gamma * (rs - r) magma_daxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queue ); // xs = xs - gamma * (xs - x) magma_didr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queue ); // |rs| nrmr = magma_dnrm2( drs.num_rows, drs.dval, 1, queue ); //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_dcopyvector( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_dcopyvector( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_dresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_dmfree( &dxs, queue ); magma_dmfree( &drs, queue ); magma_dmfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_dmfree( &dr, queue ); magma_dmfree( &dP, queue ); magma_dmfree( &dP1, queue ); magma_dmfree( &dG, queue ); magma_dmfree( &dGcol, queue ); magma_dmfree( &dU, queue ); magma_dmfree( &dM, queue ); magma_dmfree( &hMdiag, queue ); magma_dmfree( &df, queue ); magma_dmfree( &dt, queue ); magma_dmfree( &dc, queue ); magma_dmfree( &dv, queue ); magma_dmfree( &dlu, queue ); magma_dmfree( &dskp, queue ); magma_dmfree( &dalpha, queue ); magma_dmfree( &dbeta, queue ); magma_dmfree( &hskp, queue ); magma_dmfree( &halpha, queue ); magma_dmfree( &hbeta, queue ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_dpidr_merge */ }
extern "C" magma_int_t magma_dgehrd(magma_int_t n, magma_int_t ilo, magma_int_t ihi, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, double *dT, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DGEHRD reduces a DOUBLE_PRECISION general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to DGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) DOUBLE_PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) DOUBLE_PRECISION array on the GPU, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices in dT, for later use in magma_dorghr. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define dA( i, j ) (dA + (i) + (j-ilo)*ldda) double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t nb = magma_get_dgehrd_nb(n); magma_int_t ldda = n; // assumed in dlahru magma_int_t nh, iws; magma_int_t iinfo; magma_int_t ldwork; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = MAGMA_D_MAKE( iws, 0 ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // GPU workspace is: // nb*ldda for dwork for dlahru // nb*ldda for dV // n*ldda for dA double *dwork; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } double *dV = dwork + nb*ldda; double *dA = dwork + nb*ldda*2; ldwork = n; magma_int_t i; double *T, *dTi; magma_dmalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular dzero_nbxnb_block(nb, dV, ldda); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for(i = 0; i < ilo; ++i) tau[i] = c_zero; for(i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; for(i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; magmablas_dlaset( 'F', nb, n, dT, nb ); // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // Copy the matrix to the GPU magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA, ldda ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_dgetmatrix( ihi-i, nb, dA(i,i), ldda, A (i,i), lda ); // add 1 to i for 1-based index magma_dlahr2( ihi, i+1, nb, dA(0,i), dV, A (0,i), lda, &tau[i], T, nb, work, ldwork); // Copy T from the CPU to dT on the GPU dTi = dT + (i - ilo)*nb; magma_dsetmatrix( nb, nb, T, nb, dTi, nb ); magma_dlahru( n, ihi, i, nb, A (0,i), lda, dA(0,i), // dA dA(i,i), // dY, stored over current panel dV, dTi, dwork ); } // Copy remainder to host magma_dgetmatrix( n, n-i, dA(0,i), ldda, A (0,i), lda ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = MAGMA_D_MAKE( iws, 0 ); magma_free( dwork ); magma_free_cpu( T ); return *info; } /* magma_dgehrd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlaset Code is very similar to testing_dlacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R; magmaDouble_ptr d_A; double offdiag, diag; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N offdiag diag CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("================================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { for( int ival = 0; ival < 4; ++ival ) { // test combinations of zero & non-zero: // ival offdiag diag // 0 0 0 // 1 0 3.14 // 2 1.23 0 // 3 1.23 3.14 offdiag = MAGMA_D_MAKE( 1.2345, 6.7890 ) * (ival / 2); diag = MAGMA_D_MAKE( 3.1415, 2.7183 ) * (ival % 2); M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = roundup( M, opts.roundup ); size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(double) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(double) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, double, size ); TESTING_MALLOC_CPU( h_R, double, size ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); //magmablas_dlaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_dlaset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_dlaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_dlaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_dprint( M, N, h_A, lda ); printf( "dA=" ); magma_dprint_gpu( M, N, d_A, ldda ); } /* ===================================================================== Check the result =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, lda ); blasf77_daxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5s %5d %5d %7.2f %4.2f %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, real(offdiag), real(diag), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/** Purpose ------- DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by DGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T DOUBLE_PRECISION array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_dgeev_aux ********************************************************************/ extern "C" magma_int_t magma_dlahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *T, magma_int_t ldt, double *Y, magma_int_t ldy, struct dgehrd_data *data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; double scale; magma_int_t i; double ei = MAGMA_D_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_D_NEGATE( tau[i-1] ); blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_dcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_dtrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_dgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_dgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_dtrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_daxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_dlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_dsetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_dlacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_dgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_D_NEGATE( tau[i] ); blasf77_dgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. // This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If real, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_dgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_dgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_dgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_dsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dlahr2 */