extern "C" magma_int_t magma_cunmql(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *c, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CUNMQL overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'C': Q**H * C C * Q**H where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by CGEQLF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'C': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQLF. C (input/output) COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; magma_int_t i__4, i__; magmaFloatComplex *T; magma_int_t i1, i2, i3, ib, nb, mi, ni, nq, nw; magma_int_t iinfo, ldwork, lwkopt=0; int lquery, left, notran; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = max(1,n); } else { nq = n; nw = max(1,m); } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(trans_, "C")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } if (*info == 0) { if (m == 0 || n == 0) { lwkopt = 1; } else { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = nw * nb; } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); if (lwork < nw && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } /* Allocate work space on the GPU */ magmaFloatComplex *dwork, *dc; magma_cmalloc( &dc, (m)*(n) ); magma_cmalloc( &dwork, 2*(m + 64)*64 ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, c, ldc, dc, m ); /* work space on CPU */ if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &T, 2*nb*nb ) ) { magma_free( dc ); magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_cunmql(side_, trans_, &m, &n, &k, a, &lda, tau, c, &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ((left && notran) || (! left && ! notran)) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } for (i__ = i1; (i3 < 0 ? i__ >= i2 : i__ <= i2); i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ i__4 = nq - k + i__ + ib - 1; lapackf77_clarft("Backward", "Columnwise", &i__4, &ib, &a[(i__-1) * lda], &lda, &tau[i__-1], T, &ib); /* 1) Put 0s in the lower triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ cpanel_to_q('L', ib, &a[i__-1 + (i__-1) * lda], lda, T+ib*ib); magma_csetmatrix( i__4, ib, &a[(i__-1) * lda], lda, dwork, i__4 ); cq_to_panel('L', ib, &a[i__-1 + (i__-1) * lda], lda, T+ib*ib); if (left) { /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i__ + ib - 1; } else { /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i__ + ib - 1; } /* Apply H or H'; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dwork+i__4*ib, ib ); magma_clarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dwork, i__4, dwork+i__4*ib, ib, dc, m, dwork+i__4*ib + ib*ib, ldwork); } magma_cgetmatrix( m, n, dc, m, c, ldc ); } work[0] = MAGMA_C_MAKE( lwkopt, 0 ); magma_free( dc ); magma_free( dwork ); magma_free_pinned( T); return *info; } /* magma_cunmql */
extern "C" int calc_numerical_range(magmaFloatComplex *M, magma_int_t M_lead_dim, float _from, float _step, magma_int_t _steps, magmaFloatComplex *pts) { magma_int_t idx = 0, rslt = 0; magmaFloatComplex p, scalar; std::complex<float> vtmp; float j; magmaFloatComplex *dA = nullptr; magmaFloatComplex *dAth = NULL, *dAthT = NULL, *dX = NULL, *dY = NULL; float *dE = NULL; //float *hE = NULL; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2 * M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = 1 + 5 * M_lead_dim + 2 * M_lead_dim*M_lead_dim; // MagmaVec liwork = (3 + 5 * M_lead_dim); // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); magma_cmalloc_pinned(&work, lwork); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAth, lda*M_lead_dim); magma_cmalloc_pinned(&dAthT, lda*M_lead_dim); magma_smalloc_pinned(&dE, M_lead_dim); //magma_smalloc_cpu(&hE, M_lead_dim); magma_cmalloc_pinned(&dX, M_lead_dim); magma_cmalloc_pinned(&dY, M_lead_dim); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); // th=[0:resolution:2*pi] j = _from; for (idx = 0; idx < _steps; idx++) { //scalar = exp( 1im * -j); vtmp.real( 0.0f ); vtmp.imag( -j ); //vtmp = _FCbuild(0.0f, -j); //printf("vtmp = %f + i%f\n", vtmp._Val[0], vtmp._Val[1]); vtmp = exp(vtmp); scalar.x = vtmp.real(); scalar.y = vtmp.imag(); //printf("scalar = %f + i%f\n", scalar.x, scalar.y); magma_ccopy(lda * M_lead_dim, dA, 1, dAth, 1, queue); // Ath = exp(1im * -j) * As magma_cscal(lda * M_lead_dim, scalar, dAth, 1, queue); //magma_cprint_gpu(N, N, dA, lda); //magma_cprint_gpu(N, N, dAth, lda); // AthT = (Ath + Ath') magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dAth, M_lead_dim, dAthT, M_lead_dim, queue); magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dAth, M_lead_dim, dAthT, M_lead_dim, queue); // AthT = AthT / 2 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAthT, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); // e, r = eig(AthT) rslt = magma_cheevd(MagmaVec, MagmaLower, M_lead_dim, dAthT, lda, dE, work, lwork, rwork, lrwork, iwork, liwork, &info); magma_sync_wtime(queue); //printf("magma_cheevd info=%d\n", info); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); //magma_sprint_gpu(M_lead_dim, 1, dE, M_lead_dim); //magma_sgetvector(M_lead_dim, dE, 1, hE, 1, queue); //printf("%f %f\n", hE[0], hE[1]); // p = r[:,s]' * A * r[:,s] // r = r[:,s] magma_ccopy( M_lead_dim, dAthT + (M_lead_dim*(M_lead_dim-1)), 1, // dAthT + (N), where (N) is a column offset dX, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dX, M_lead_dim); // pp = A * r[:,s] magma_cgemv(MagmaNoTrans, M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, lda, dX, 1, MAGMA_C_MAKE(0.0f, 0.0f), dY, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dY, M_lead_dim); // p = r' * pp p = magma_cdotc(M_lead_dim, dX, 1, dY, 1, queue); magma_sync_wtime(queue); pts[idx] = p; //printf("p = %f %fi\n", p.x, p.y); j += _step; } // end of for (idx = 0; idx < _steps; idx++) magma_free_pinned(dY); magma_free_pinned(dX); //magma_free_cpu(hE); magma_free_pinned(dE); magma_free_pinned(dAthT); magma_free_pinned(dAth); magma_free_pinned(dA); magma_free_pinned(work); magma_free_cpu(rwork); magma_free_cpu(iwork); //magma_free_cpu(w); //magma_free_cpu(A); return rslt; }
magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; magma_cmalloc_pinned( &H, (ldh+1)*ldh ); magma_cmalloc_pinned( &y, ldh ); magma_cmalloc_pinned( &HH, ldh*ldh ); magma_cmalloc_pinned( &h1, ldh ); // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; magmaFloatComplex *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_cmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_cmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_ccopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_ccopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r ); // r = A q[k] if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_cdotc(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); } magma_csetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->val, 1); magma_c_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_caxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r); magma_c_vfree(&q); // free GPU streams and events //magma_queue_destroy( stream[0] ); //magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_cgmres */
/***************************************************************************//** Purpose ------- CGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This is a GPU interface of the routine. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dlA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_geqrf *******************************************************************************/ extern "C" magma_int_t magma_cgeqrf2_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dlA[], magma_int_t ldda, magmaFloatComplex *tau, magma_int_t *info ) { #define dlA(dev, i, j) (dlA[dev] + (i) + (j)*(ldda)) #define hpanel(i) (hpanel + (i)) // set to NULL to make cleanup easy: free(NULL) does nothing. magmaFloatComplex *dwork[MagmaMaxGPUs]={NULL}, *dpanel[MagmaMaxGPUs]={NULL}; magmaFloatComplex *hwork=NULL, *hpanel=NULL; magma_queue_t queues[MagmaMaxGPUs][2]={{NULL}}; magma_event_t panel_event[MagmaMaxGPUs]={NULL}; magma_int_t i, j, min_mn, dev, ldhpanel, lddwork, rows; magma_int_t ib, nb; magma_int_t lhwork, lwork; magma_int_t panel_dev, i_local, i_nb_local, n_local[MagmaMaxGPUs], la_dev, dpanel_offset; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } min_mn = min(m,n); if (min_mn == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); nb = magma_get_cgeqrf_nb( m, n ); /* dwork is (n*nb) --- for T (nb*nb) and clarfb work ((n-nb)*nb) --- * + dpanel (ldda*nb), on each GPU. * I think clarfb work could be smaller, max(n_local[:]). * Oddly, T and clarfb work get stacked on top of each other, both with lddwork=n. * on GPU that owns panel, set dpanel = dlA(dev,i,i_local). * on other GPUs, set dpanel = dwork[dev] + dpanel_offset. */ lddwork = n; dpanel_offset = lddwork*nb; for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if ( MAGMA_SUCCESS != magma_cmalloc( &(dwork[dev]), (lddwork + ldda)*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } } /* hwork is MAX( workspace for cgeqrf (n*nb), two copies of T (2*nb*nb) ) * + hpanel (m*nb). * for last block, need 2*n*nb total. */ ldhpanel = m; lhwork = max( n*nb, 2*nb*nb ); lwork = max( lhwork + ldhpanel*nb, 2*n*nb ); if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; goto CLEANUP; } hpanel = hwork + lhwork; /* Set the number of local n for each GPU */ for( dev=0; dev < ngpu; dev++ ) { n_local[dev] = ((n/nb)/ngpu)*nb; if (dev < (n/nb) % ngpu) n_local[dev] += nb; else if (dev == (n/nb) % ngpu) n_local[dev] += n % nb; } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_create( dev, &queues[dev][0] ); magma_queue_create( dev, &queues[dev][1] ); magma_event_create( &panel_event[dev] ); } if ( nb < min_mn ) { /* Use blocked code initially */ // Note: as written, ib cannot be < nb. for( i = 0; i < min_mn-nb; i += nb ) { /* Set the GPU number that holds the current panel */ panel_dev = (i/nb) % ngpu; /* Set the local index where the current panel is (j == i) */ i_local = i/(nb*ngpu)*nb; ib = min(min_mn-i, nb); rows = m-i; /* Send current panel to the CPU, after panel_event indicates it has been updated */ magma_setdevice( panel_dev ); magma_queue_wait_event( queues[panel_dev][1], panel_event[panel_dev] ); magma_cgetmatrix_async( rows, ib, dlA(panel_dev, i, i_local), ldda, hpanel(i), ldhpanel, queues[panel_dev][1] ); magma_queue_sync( queues[panel_dev][1] ); // Factor panel lapackf77_cgeqrf( &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %lld\n", (long long) *info ); } // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . H(i+ib-1) lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, hpanel(i), &ldhpanel, tau+i, hwork, &ib ); magma_cpanel_to_q( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); // Send the current panel back to the GPUs for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if (dev == panel_dev) dpanel[dev] = dlA(dev, i, i_local); else dpanel[dev] = dwork[dev] + dpanel_offset; magma_csetmatrix_async( rows, ib, hpanel(i), ldhpanel, dpanel[dev], ldda, queues[dev][0] ); } for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } // TODO: if magma_cpanel_to_q copied whole block, wouldn't need to restore // -- just send the copy to the GPUs. // TODO: also, could zero out the lower triangle and use Azzam's larfb w/ gemm. /* Restore the panel */ magma_cq_to_panel( MagmaUpper, ib, hpanel(i), ldhpanel, hwork + ib*ib ); if (i + ib < n) { /* Send the T matrix to the GPU. */ for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_csetmatrix_async( ib, ib, hwork, ib, dwork[dev], lddwork, queues[dev][0] ); } la_dev = (panel_dev+1) % ngpu; for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); if (dev == la_dev && i+nb < min_mn-nb) { // If not last panel, // for look-ahead panel, apply H' to A(i:m,i+ib:i+2*ib) i_nb_local = (i+nb)/(nb*ngpu)*nb; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); magma_event_record( panel_event[dev], queues[dev][0] ); // for trailing matrix, apply H' to A(i:m,i+2*ib:n) magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-(i_nb_local+ib), ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local+ib), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); } else { // for trailing matrix, apply H' to A(i:m,i+ib:n) i_nb_local = i_local; if (dev <= panel_dev) { i_nb_local += ib; } magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n_local[dev]-i_nb_local, ib, dpanel[dev], ldda, // V dwork[dev], lddwork, // T dlA(dev, i, i_nb_local), ldda, // C dwork[dev]+ib, lddwork, // work queues[dev][0] ); } } // Restore top of panel (after larfb is done) magma_setdevice( panel_dev ); magma_csetmatrix_async( ib, ib, hpanel(i), ldhpanel, dlA(panel_dev, i, i_local), ldda, queues[panel_dev][0] ); } } } else { i = 0; } /* Use unblocked code to factor the last or only block row. */ if (i < min_mn) { rows = m-i; for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % ngpu; i_local = j/(nb*ngpu)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_cgetmatrix( rows, ib, dlA(panel_dev, i, i_local), ldda, hwork + (j-i)*rows, rows, queues[panel_dev][0] ); } // needs lwork >= 2*n*nb: // needs (m-i)*(n-i) for last block row, bounded by nb*n. // needs (n-i)*nb for cgeqrf work, bounded by n*nb. ib = n-i; // total columns in block row lhwork = lwork - ib*rows; lapackf77_cgeqrf( &rows, &ib, hwork, &rows, tau+i, hwork + ib*rows, &lhwork, info ); if ( *info != 0 ) { fprintf( stderr, "error %lld\n", (long long) *info ); } for( j=i; j < n; j += nb ) { panel_dev = (j/nb) % ngpu; i_local = j/(nb*ngpu)*nb; ib = min( n-j, nb ); magma_setdevice( panel_dev ); magma_csetmatrix( rows, ib, hwork + (j-i)*rows, rows, dlA(panel_dev, i, i_local), ldda, queues[panel_dev][0] ); } } CLEANUP: // free(NULL) does nothing. for( dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); magma_queue_destroy( queues[dev][0] ); magma_queue_destroy( queues[dev][1] ); magma_event_destroy( panel_event[dev] ); magma_free( dwork[dev] ); } magma_free_pinned( hwork ); magma_setdevice( orig_dev ); return *info; } /* magma_cgeqrf2_mgpu */
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; magmaFloatComplex *d_lAT[MagmaMaxGPUs]; magmaFloatComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t queues[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* create the queues */ for( d=0; d < ngpu; d++ ) { magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); } /* Function Body */ nb = magma_get_cgetrf_nb( m, n ); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ magma_device_t orig_dev; magma_getdevice( &orig_dev ); maxm = magma_roundup( m, 32 ); if ( ngpu > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 ); lddat = magma_ceildiv( n, nb ); /* number of block columns */ lddat = magma_ceildiv( lddat, ngpu ); /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = magma_roundup( lddat, 32 ); /* make it a multiple of 32 */ for (i=0; i < ngpu; i++) { magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] ); } for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_queue_sync(queues[i][0]); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*ngpu )) { for (i=0; i < ngpu; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and queues */ magma_cgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, queues, info); /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ctranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] ); magma_queue_sync(queues[d][0]); magma_queue_sync(queues[d][1]); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); } /* end of for d=1,..,ngpu */ magma_setdevice( orig_dev ); magma_free_pinned( work ); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_destroy( queues[d][0] ); magma_queue_destroy( queues[d][1] ); } return *info; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaFloatComplex *A, *B, *C, *C2, *LU; magmaFloatComplex *dA, *dB, *dC1, *dC2; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.5, 0.1 ); magmaFloatComplex beta = MAGMA_C_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_cmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_cmalloc( &dA, size ); assert( err == 0 ); err = magma_cmalloc( &dB, size ); assert( err == 0 ); err = magma_cmalloc( &dC1, size ); assert( err == 0 ); err = magma_cmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test CSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetmatrix( m, n, A, ld, dB, ld ); magma_cswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_cswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_cgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_clange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "cswap diff %.2g\n", error ); } else { printf( "cswap skipped for n < 3\n" ); } // ----- test ICAMAX // get argmax of column of A magma_csetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_icamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIcamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "icamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test CGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetvector( maxn, B, 1, dB, 1 ); magma_csetvector( maxn, C, 1, dC1, 1 ); magma_csetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_cgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasCaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMV( m, n ) / 1e9; printf( "cgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetvector( m, B, 1, dB, 1 ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_chemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMV( m ) / 1e9; printf( "chemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_clacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_cgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_csetmatrix( m, m, LU, ld, dA, ld ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ctrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ctrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test CGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_csetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_csetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMM( m, n, k ) / 1e9; printf( "cgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetmatrix( m, n, B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_chemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMM( side[is], m, n ) / 1e9; printf( "chemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_csetmatrix( n, k, A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHERK( k, n ) / 1e9; printf( "cherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_csetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHER2K( k, n ) / 1e9; printf( "cher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. t2 = magma_sync_wtime( 0 ); cublasCtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRMM( side[is], m, n ) / 1e9; printf( "ctrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test CTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( side[is], m, n ) / 1e9; printf( "ctrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
/** Purpose ------- CUNMQL overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by CGEQLF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQLF. @param[in,out] C COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_cunmql( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) magmaFloatComplex *T, *T2; magma_int_t i, i1, i2, ib, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; *info = 0; bool left = (side == MagmaLeft); bool notran = (trans == MagmaNoTrans); bool lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_cgelqf_nb( m, n ); lwkopt = max(1,nw)*nb; work[0] = magma_cmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_C_ONE; return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_cunmql( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo ); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = magma_roundup( m, 32 ); magmaFloatComplex *dwork, *dV, *dT, *dC; magma_cmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_cmalloc_pinned( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, C, ldc, dC, lddc, queue ); if ( (left && notran) || (! left && ! notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ nq_i = nq - k + i + ib; lapackf77_clarft("Backward", "Columnwise", &nq_i, &ib, A(0,i), &lda, &tau[i], T, &ib); /* 1) set lower triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ magma_cpanel_to_q( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); magma_csetmatrix( nq_i, ib, A(0,i), lda, dV, nq_i, queue ); magma_cq_to_panel( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); if (left) { /* H or H**H is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i + ib; } else { /* H or H**H is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i + ib; } /* Apply H or H**H; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dT, ib, queue ); magma_clarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dV, nq_i, dT, ib, dC, lddc, dwork, ldwork, queue ); } magma_cgetmatrix( m, n, dC, lddc, C, ldc, queue ); magma_queue_destroy( queue ); magma_free( dwork ); magma_free_pinned( T ); } work[0] = magma_cmake_lwork( lwkopt ); return *info; } /* magma_cunmql */
/** Purpose ------- CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu) On entry, the Hermitian matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); magmaFloatComplex *work; bool upper = (uplo == MagmaUpper); magmaFloatComplex *dwork[MagmaMaxGPUs]; magma_queue_t queues[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_cpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. */ magma_setdevice(0); magma_queue_create( 0, &queues[0][0] ); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( n, n, d_lA[0], ldda, work, n, queues[0][0] ); lapackf77_cpotrf(uplo_, &n, work, &n, info); magma_csetmatrix( n, n, work, n, d_lA[0], ldda, queues[0][0] ); magma_free_pinned( work ); magma_queue_destroy( queues[0][0] ); } else { lddp = magma_roundup( n, nb ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) { magma_queue_create( d, &queues[d][j] ); } for( j=0; j < 5; j++ ) { magma_event_create( &event[d][j] ); } } magma_setdevice(0); h = 1; //ngpu; //magma_ceildiv( n, nb ); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three queues */ magma_cpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, queues, event, info); } else { /* with three queues */ magma_cpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, queues, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( queues[d][j] ); magma_queue_destroy( queues[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_cpotrf_mgpu */
/** Purpose ------- CGEQRF3 computes a QR factorization of a complex M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them and the corresponding parts of the upper triangular R are stored separately in dT. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] dT (workspace) COMPLEX array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_cgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal matrices for the R matrix. The rest of the array is used as workspace. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqrf3_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex_ptr dT, magma_int_t *info ) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (a_1)*nb) #define d_ref(a_1) (dT + ( minmn+(a_1))*nb) #define dd_ref(a_1) (dT + (2*minmn+(a_1))*nb) #define work(a_1) (work + (a_1)) #define hwork (work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; magmaFloatComplex *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_cgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(magmaFloatComplex)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_cgetmatrix_async( rows, ib, dA(i,i), ldda, work(i), ldwork, stream[1] ); if (i > 0) { /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, dA(old_i, old_i ), ldda, dT(old_i), nb, dA(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_csetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_cgeqrf(&rows, &ib, work(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut. */ magma_queue_sync( stream[0] ); csplit_diag_block3(ib, work(i), ldwork, ut); magma_csetmatrix( rows, ib, work(i), ldwork, dA(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_csetmatrix( ib, ib, hwork, ib, dT(i), nb ); if (i+nb < k-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, dA(i, i ), ldda, dT(i), nb, dA(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_csetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_cgetmatrix( rows, ib, dA(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_csetmatrix( rows, ib, work, rows, dA(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_cgeqrf_gpu */
// ---------------------------------------- int main( int argc, char** argv ) { TESTING_INIT(); //real_Double_t t_m, t_c, t_f; magma_int_t ione = 1; magmaFloatComplex *A, *B; float diff, error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld; magmaFloatComplex x2_m, x2_c; // complex x for magma, cblas/fortran blas respectively float x_m, x_c; // x for magma, cblas/fortran blas respectively magma_opts opts; parse_opts( argc, argv, &opts ); opts.tolerance = max( 100., opts.tolerance ); float tol = opts.tolerance * lapackf77_slamch("E"); gTol = tol; printf( "!! Calling these CBLAS and Fortran BLAS sometimes crashes (segfault), which !!\n" "!! is why we use wrappers. It does not necesarily indicate a bug in MAGMA. !!\n" "\n" "Diff compares MAGMA wrapper to CBLAS and BLAS function; should be exactly 0.\n" "Error compares MAGMA implementation to CBLAS and BLAS function; should be ~ machine epsilon.\n" "\n" ); float total_diff = 0.; float total_error = 0.; int inc[] = { 1 }; //{ -2, -1, 1, 2 }; //{ 1 }; //{ -1, 1 }; int ninc = sizeof(inc)/sizeof(*inc); for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; for( int iincx = 0; iincx < ninc; ++iincx ) { magma_int_t incx = inc[iincx]; for( int iincy = 0; iincy < ninc; ++iincy ) { magma_int_t incy = inc[iincy]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d, incx = %d, incy = %d\n", (int) m, (int) n, (int) k, (int) incx, (int) incy ); printf( "Function MAGMA CBLAS BLAS Diff Error\n" " msec msec msec\n" ); // allocate matrices // over-allocate so they can be any combination of // {m,n,k} * {abs(incx), abs(incy)} by // {m,n,k} * {abs(incx), abs(incy)} maxn = max( max( m, n ), k ) * max( abs(incx), abs(incy) ); ld = max( 1, maxn ); size = ld*maxn; magma_cmalloc_pinned( &A, size ); assert( A != NULL ); magma_cmalloc_pinned( &B, size ); assert( B != NULL ); // initialize matrices lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); printf( "Level 1 BLAS ----------------------------------------------------------\n" ); // ----- test SCASUM // get one-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scasum( m, A(0,j), incx ); x_c = cblas_scasum( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scasum( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scasum", diff, error ); total_diff += diff; total_error += error; } // ----- test SCNRM2 // get two-norm of column j of A if ( incx > 0 && incx == incy ) { // positive, no incy diff = 0; error = 0; for( int j = 0; j < k; ++j ) { x_m = magma_cblas_scnrm2( m, A(0,j), incx ); x_c = cblas_scnrm2( m, A(0,j), incx ); diff += fabs( x_m - x_c ); x_c = blasf77_scnrm2( &m, A(0,j), &incx ); error += fabs( (x_m - x_c) / (m*x_c) ); } output( "scnrm2", diff, error ); total_diff += diff; total_error += error; } // ----- test CDOTC // dot columns, Aj^H Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotc_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotc( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotc( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotc", diff, error ); total_diff += diff; total_error += error; total_error += error; // ----- test CDOTU // dot columns, Aj^T * Bj diff = 0; error = 0; for( int j = 0; j < k; ++j ) { // MAGMA implementation, not just wrapper x2_m = magma_cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); // crashes on MKL 11.1.2, ILP64 #if ! defined( MAGMA_WITH_MKL ) #ifdef COMPLEX cblas_cdotu_sub( m, A(0,j), incx, B(0,j), incy, &x2_c ); #else x2_c = cblas_cdotu( m, A(0,j), incx, B(0,j), incy ); #endif error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif // crashes on MacOS 10.9 #if ! defined( __APPLE__ ) x2_c = blasf77_cdotu( &m, A(0,j), &incx, B(0,j), &incy ); error += fabs( x2_m - x2_c ) / fabs( m*x2_c ); #endif } output( "cdotu", diff, error ); total_diff += diff; total_error += error; // tell user about disabled functions #if defined( MAGMA_WITH_MKL ) printf( "cblas_cdotc and cblas_cdotu disabled with MKL (segfaults)\n" ); #endif #if defined( __APPLE__ ) printf( "blasf77_cdotc and blasf77_cdotu disabled on MacOS (segfaults)\n" ); #endif // cleanup magma_free_pinned( A ); magma_free_pinned( B ); fflush( stdout ); }}} // itest, incx, incy // TODO use average error? printf( "sum diffs = %8.2g, MAGMA wrapper compared to CBLAS and Fortran BLAS; should be exactly 0.\n" "sum errors = %8.2e, MAGMA implementation compared to CBLAS and Fortran BLAS; should be ~ machine epsilon.\n\n", total_diff, total_error ); if ( total_diff != 0. ) { printf( "some tests failed diff == 0.; see above.\n" ); } else { printf( "all tests passed diff == 0.\n" ); } TESTING_FINALIZE(); int status = (total_diff != 0.); return status; }
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, (dA_offset + (i_) + (j_)*ldda) #define dAT(i_, j_) dAT, (dAT_offset + (i_)*lddat + (j_)) #define dAP(i_, j_) dAP, ( (i_) + (j_)*maxm) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #define dAT(i_, j_) (dAT + (i_)*lddat + (j_)) #define dAP(i_, j_) (dAP + (i_) + (j_)*maxm) #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, minmn; magma_int_t i, j, jb, rows, lddat, ldwork; magmaFloatComplex_ptr dAT=NULL, dAP=NULL; magmaFloatComplex *work=NULL; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ minmn = min( m, n ); nb = magma_get_cgetrf_nb( m, n ); magma_queue_t queues[2] = { NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } magma_cgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] ); lapackf77_cgetrf( &m, &n, work, &m, ipiv, info ); magma_csetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); work=NULL; } else { /* Use hybrid blocked code. */ maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; lddat = ldda; magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { lddat = maxn; // N-by-M if (MAGMA_SUCCESS != magma_cmalloc( &dAT, lddat*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magmablas_ctranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } magma_queue_sync( queues[0] ); // finish transpose ldwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, ldwork*nb )) { *info = MAGMA_ERR_HOST_ALLOC; goto cleanup; } for( j=0; j < minmn-nb; j += nb ) { // get j-th panel from device magmablas_ctranspose( nb, m-j, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] ); magma_queue_sync( queues[1] ); // wait for transpose magma_cgetmatrix_async( m-j, nb, dAP(0,0), maxm, work, ldwork, queues[0] ); if ( j > 0 ) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-(j+nb), nb, c_one, dAT(j-nb, j-nb), lddat, dAT(j-nb, j+nb), lddat, queues[1] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(j+nb), m-j, nb, c_neg_one, dAT(j-nb, j+nb), lddat, dAT(j, j-nb), lddat, c_one, dAT(j, j+nb), lddat, queues[1] ); } // do the cpu part rows = m - j; magma_queue_sync( queues[0] ); // wait to get work lapackf77_cgetrf( &rows, &nb, work, &ldwork, ipiv+j, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j; // send j-th panel to device magma_csetmatrix_async( m-j, nb, work, ldwork, dAP, maxm, queues[0] ); for( i=j; i < j + nb; ++i ) { ipiv[i] += j; } magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + nb, ipiv, 1, queues[1] ); magma_queue_sync( queues[0] ); // wait to set dAP magmablas_ctranspose( m-j, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] ); // do the small non-parallel computations (next panel update) if ( j + nb < minmn - nb ) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+nb), lddat, queues[1] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+nb), nb, c_neg_one, dAT(j, j+nb), lddat, dAT(j+nb, j ), lddat, c_one, dAT(j+nb, j+nb), lddat, queues[1] ); } else { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-(j+nb), nb, c_one, dAT(j, j ), lddat, dAT(j, j+nb), lddat, queues[1] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(j+nb), m-(j+nb), nb, c_neg_one, dAT(j, j+nb), lddat, dAT(j+nb, j ), lddat, c_one, dAT(j+nb, j+nb), lddat, queues[1] ); } } jb = min( m-j, n-j ); if ( jb > 0 ) { rows = m - j; magmablas_ctranspose( jb, rows, dAT(j,j), lddat, dAP(0,0), maxm, queues[1] ); magma_cgetmatrix( rows, jb, dAP(0,0), maxm, work, ldwork, queues[1] ); // do the cpu part lapackf77_cgetrf( &rows, &jb, work, &ldwork, ipiv+j, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j; for( i=j; i < j + jb; ++i ) { ipiv[i] += j; } magmablas_claswp( n, dAT(0,0), lddat, j + 1, j + jb, ipiv, 1, queues[1] ); // send j-th panel to device magma_csetmatrix( rows, jb, work, ldwork, dAP(0,0), maxm, queues[1] ); magmablas_ctranspose( rows, jb, dAP(0,0), maxm, dAT(j,j), lddat, queues[1] ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-j-jb, jb, c_one, dAT(j,j), lddat, dAT(j,j+jb), lddat, queues[1] ); } // undo transpose if ( m == n ) { magmablas_ctranspose_inplace( m, dAT(0,0), lddat, queues[1] ); } else { magmablas_ctranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[1] ); } } cleanup: magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dAP ); if (m != n) { magma_free( dAT ); } magma_free_pinned( work ); return *info; } /* magma_cgetrf_gpu */
/** Purpose ------- CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu) On entry, the Hermitian matrix dA distributed over GPUs (dl_A[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrf_mgpu_right( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0) magmaFloatComplex z_one = MAGMA_C_MAKE( 1.0, 0.0 ); magmaFloatComplex mz_one = MAGMA_C_MAKE( -1.0, 0.0 ); float one = 1.0; float m_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevj, prevtrsmrows=0, nqueue = 5; magmaFloatComplex *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; magmaFloatComplex *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, igpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t queues[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_cpotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code. magma_cgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_cpotrf( uplo_, &n, panel, &ldpanel, info); magma_csetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < ngpu; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / ngpu) * nb; if (d < (n / nb) % ngpu) n_local[d] += nb; else if (d == (n / nb) % ngpu) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_cmalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[d] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < nqueue; j++ ) { magma_queue_create( &queues[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (ngpu == 4) crosspoint = n; else if (ngpu == 3) crosspoint = n; else if (ngpu == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = magma_wtime(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_cgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, queues[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = magma_wtime()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % ngpu; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( queues[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = magma_wtime(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ? tmppanel0 : tmppanel1; blasf77_cgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &mz_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &z_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = magma_wtime() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = magma_wtime(); #endif lapackf77_cpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = magma_wtime() - tcchol; ttot_cchol += tcchol; tctrsm = magma_wtime(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_ctrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &z_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = magma_wtime() - tctrsm; ttot_ctrsm += tctrsm; tctm = magma_wtime(); #endif d = (id + 1) % ngpu; // send current panel to gpus for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_csetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, queues[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % ngpu; for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); } #if defined (ENABLE_TIMER) tctm = magma_wtime() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % ngpu; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * ngpu) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = magma_wtime(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream( queues[d][STREAM_ID(j_local2)] ); #define CHERK_ON_DIAG #ifdef CHERK_ON_DIAG magma_cherk( MagmaLower, MagmaNoTrans, nb, nb, m_one, dlpanel, ldda, one, dlA(d, j + nb, j_local2), ldda); magma_cgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, mz_one, dlpanel+nb, ldda, dlpanel, ldda, z_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_cgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, mz_one, dlpanel, ldda, dlpanel, ldda, z_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = magma_wtime() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( queues[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_cgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, queues[d][0] ); tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < ngpu; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) therk[d] = magma_wtime(); #endif //magmablasSetKernelStream( queues[d] ); //magma_cherk(MagmaLower, MagmaNoTrans, n - offset, nb, // m_one, dlpanel, ldda, // one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef CHERK_ON_DIAG magma_cherk_mgpu #else magma_cherk_mgpu2 #endif (ngpu, MagmaLower, MagmaNoTrans, nb, n - offset, nb, m_one, dlpanels, ldda, 0, one, d_lA, ldda, offset, nqueue, queues ); #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = magma_wtime() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; prevj = j; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % ngpu, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( id=0; id < nqueue; id++ ) { magma_queue_sync( queues[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % ngpu; // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = magma_wtime(); #endif magma_cgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); lapackf77_cpotrf(MagmaLowerStr, &rows, panel(j), &ldpanel, info); magma_csetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = magma_wtime() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < nqueue; j++ ) { magma_queue_destroy( queues[d][j] ); } magma_free( d_lP[d] ); } } // end of not lapack // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_cpotrf_mgpu_right */
/** Purpose ------- CHEGST_GPU reduces a complex Hermitian-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H) If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L. B must have been previously factorized as U**H*U or L*L**H by CPOTRF. Arguments --------- @param[in] itype INTEGER = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H); = 2 or 3: compute U*A*U**H or L**H*A*L. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored and B is factored as U**H*U; - = MagmaLower: Lower triangle of A is stored and B is factored as L*L**H. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] dA COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the transformed matrix, stored in the same format as A. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] dB COMPLEX array, dimension (LDB,N) The triangular factor from the Cholesky factorization of B, as returned by CPOTRF. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_chegst_gpu( magma_int_t itype, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t *info) { #define A(i, j) (w + (j)*lda + (i)) #define B(i, j) (w + nb*lda + (j)*ldb + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dB(i, j) (dB + (j)*lddb + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nb; magma_int_t k, kb, kb2; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF; magmaFloatComplex *w; magma_int_t lda; magma_int_t ldb; float d_one = 1.0; int upper = (uplo == MagmaUpper); /* Test the input parameters. */ *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! upper && uplo != MagmaLower) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_chegst_nb(n); lda = nb; ldb = nb; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &w, 2*nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_queue_create( &stream[2] ); /* Use hybrid blocked code */ if (itype == 1) { if (upper) { kb = min(n,nb); /* Compute inv(U')*A*inv(U) */ magma_cgetmatrix_async( kb, kb, dB(0, 0), lddb, B(0, 0), nb, stream[2] ); magma_cgetmatrix_async( kb, kb, dA(0, 0), ldda, A(0, 0), nb, stream[1] ); for (k = 0; k < n; k += nb) { kb = min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the upper triangle of A(k:n,k:n) */ magma_queue_sync( stream[2] ); magma_queue_sync( stream[1] ); lapackf77_chegst( &itype, uplo_, &kb, A(0,0), &lda, B(0,0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { // Start copying the new B block magma_cgetmatrix_async( kb2, kb2, dB(k+kb, k+kb), lddb, B(0, 0), nb, stream[2] ); magma_ctrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k,k), lddb, dA(k,k+kb), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_cher2k(MagmaUpper, MagmaConjTrans, n-k-kb, kb, c_neg_one, dA(k,k+kb), ldda, dB(k,k+kb), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(0, 0), lda, stream[1] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_ctrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k+kb,k+kb), lddb, dA(k,k+kb), ldda); } } magma_queue_sync( stream[0] ); } else { kb = min(n,nb); /* Compute inv(L)*A*inv(L') */ magma_cgetmatrix_async( kb, kb, dB(0, 0), lddb, B(0, 0), nb, stream[2] ); magma_cgetmatrix_async( kb, kb, dA(0, 0), ldda, A(0, 0), nb, stream[1] ); for (k = 0; k < n; k += nb) { kb= min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the lower triangle of A(k:n,k:n) */ magma_queue_sync( stream[2] ); magma_queue_sync( stream[1] ); lapackf77_chegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { // Start copying the new B block magma_cgetmatrix_async( kb2, kb2, dB(k+kb, k+kb), lddb, B(0, 0), nb, stream[2] ); magma_ctrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k,k), lddb, dA(k+kb,k), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_cher2k(MagmaLower, MagmaNoTrans, n-k-kb, kb, c_neg_one, dA(k+kb,k), ldda, dB(k+kb,k), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(0, 0), lda, stream[1] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k+kb,k+kb), lddb, dA(k+kb,k), ldda); } } } magma_queue_sync( stream[0] ); } else { if (upper) { /* Compute U*A*U' */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dB(k, k), lddb, B(0, 0), nb, stream[2] ); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, k, kb, c_one, dB(0,0), lddb, dA(0,k), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_queue_sync( stream[1] ); } magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(0, 0), lda, stream[0] ); if (k > 0) { magma_cher2k(MagmaUpper, MagmaNoTrans, k, kb, c_one, dA(0,k), ldda, dB(0,k), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_ctrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, k, kb, c_one, dB(k,k), lddb, dA(0,k), ldda); } magma_queue_sync( stream[2] ); magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } else { /* Compute L'*A*L */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dB(k, k), lddb, B(0, 0), nb, stream[2] ); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, kb, k, c_one, dB(0,0), lddb, dA(k,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_queue_sync( stream[1] ); } magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(0, 0), lda, stream[0] ); if (k > 0) { magma_cher2k(MagmaLower, MagmaConjTrans, k, kb, c_one, dA(k,0), ldda, dB(k,0), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_ctrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, kb, k, c_one, dB(k,k), lddb, dA(k,0), ldda); } magma_queue_sync( stream[2] ); magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_queue_destroy( stream[2] ); magma_free_pinned( w ); return *info; } /* magma_chegst_gpu */