// init a zero matrix on GPU to store X'*X void GpuSolver::add_data ( magma_int_t m, magmaDouble_ptr X, magmaDouble_ptr T ) { real_Double_t time; magmaDouble_ptr dX=NULL, dT=NULL; magma_dmalloc( &dX, m*n ); magma_dmalloc( &dT, m*nrhs ); if ( dX == NULL || dT == NULL ) { fprintf( stderr, "malloc failed - not enough GPU or system memory?\n" ); goto cleanup; } magma_dsetmatrix( m, n, X, m, dX, m ); magma_dsetmatrix( m, nrhs, T, m, dT, m ); time = magma_sync_wtime( NULL ); magma_dgemm( MagmaTrans, MagmaNoTrans, n, nrhs, m, 1, dX, m, dT, m, 1, dB, n ); magma_dgemm( MagmaTrans, MagmaNoTrans, n, n, m, 1, dX, m, dX, m, 1, dA, n ); time = magma_sync_wtime( NULL ) - time; fprintf( stdout, "added data in %f sec\n", time ); cleanup: magma_free( dX ); magma_free( dT ); };
void magma_task_dev_dgemm(Schedule* sched_obj ) { magma_int_t deviceID; magma_trans_t transA; magma_trans_t transB; magma_int_t m; magma_int_t n; magma_int_t k; double alpha; double *dA; magma_int_t lda; double *dB; magma_int_t ldb; double beta; double *dC; magma_int_t ldc; #if (dbglevel >=1) ca_trace_start(); #endif schedule_unpack_args_14(sched_obj, deviceID, transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc); magma_setdevice(deviceID); #if (dbglevel==10) ca_dbg_printMat_transpose_gpu(m, k, dA, lda, "A before magma_dgemm"); ca_dbg_printMat_transpose_gpu(k, n, dB, ldb, "B before magma_dgemm"); ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C before magma_dgemm"); #endif pthread_mutex_lock(&mutex_compute_stream); magmablasSetKernelStream(compute_stream[deviceID]); magma_dgemm( transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc ); //task_magma_dgemm(MagmaNoTrans,MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD); pthread_mutex_unlock(&mutex_compute_stream); #if (dbglevel==10) ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C after magma_dgemm"); #endif #if (dbglevel >=1) ca_trace_end_gpu('S'); ca_trace_end_cpu('C'); #endif }
void magma_task_dgemm(Schedule* sched_obj ) { magma_trans_t transA; magma_trans_t transB; magma_int_t m; magma_int_t n; magma_int_t k; double alpha; double *dA; magma_int_t lda; double *dB; magma_int_t ldb; double beta; double *dC; magma_int_t ldc; #if (dbglevel >=1) ca_trace_start(); #endif schedule_unpack_args_13(sched_obj,transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc); #if (dbglevel==10) ca_dbg_printMat_transpose_gpu(m, k, dA, lda, "A before magma_dgemm"); ca_dbg_printMat_transpose_gpu(k, n, dB, ldb, "B before magma_dgemm"); ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C before magma_dgemm"); #endif magma_dgemm( transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc ); //magma_task_dgemm('N','N', gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD); #if (dbglevel==10) ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C after magma_dgemm"); #endif #if (dbglevel >=1) ca_trace_end_1gpu('S'); ca_trace_end_cpu('C'); #endif }
void gmm_magma(const Tensor_core<double,2>& A, const Tensor_core<double,2>& B, Tensor_core<double,2>& C, char TRANSA, char TRANSB, double alpha, double beta) { int AL0 = A.rank(0); int AL1 = A.rank(1); int BL0 = B.rank(0); int BL1 = B.rank(1); int CL0 = C.rank(0); int CL1 = C.rank(1); magma_int_t M, N, K, LDA, LDB, LDC; magma_trans_t transA=magma_trans_const(TRANSA), transB=magma_trans_const(TRANSB); magmaDouble_ptr d_A, d_B, d_C; //Set LDA, LDB, and LDC, round up to multiple of 32 for best GPU performance LDA = ((AL0+31)/32)*32; LDB = ((BL0+31)/32)*32; LDC = ((CL0+31)/32)*32; // Allocate memory for the matrices on GPU magma_dmalloc(&d_A, LDA*AL1 ); magma_dmalloc(&d_B, LDB*BL1 ); magma_dmalloc(&d_C, LDC*CL1 ); // Copy data from host (CPU) to device (GPU) magma_dsetmatrix( AL0, AL1, A.data(), AL0, d_A, LDA ); magma_dsetmatrix( BL0, BL1, B.data(), BL0, d_B, LDB ); if( abs(beta)>1e-32 ) magma_dsetmatrix( CL0, CL1, C.data(), CL0, d_C, LDC ); //Call magma_sgemm M=( TRANSA=='N' || TRANSA=='n' ) ? AL0:AL1; K=( TRANSA=='N' || TRANSA=='n' ) ? AL1:AL0; N=( TRANSB=='N' || TRANSB=='n' ) ? BL1:BL0; magma_dgemm(transA, transB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta,d_C, LDC); // Copy solution from device (GPU) to host (CPU) magma_dgetmatrix(CL0, CL1, d_C, LDC, C.data(), CL0); // Free memory on GPU magma_free(d_A); magma_free(d_B); magma_free(d_C); }
/** Purpose ------- DLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to DLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] k INTEGER The number of terms in the rational function to be solved by DLAED4. K >= 0. @param[in] n INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N > K). @param[in] n1 INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. @param[out] d DOUBLE PRECISION array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. @param[out] Q DOUBLE PRECISION array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. @param[in] ldq INTEGER The leading dimension of the array Q. LDQ >= max(1,N). @param[in] rho DOUBLE PRECISION The value of the parameter in the rank one update equation. RHO >= 0 required. @param[in,out] dlamda DOUBLE PRECISION array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. @param[in] Q2 DOUBLE PRECISION array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. TODO what is LDQ2? @param[in] indx INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see DLAED2). The rows of the eigenvectors found by DLAED4 must be likewise permuted before the matrix multiply can take place. @param[in] ctot INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. @param[in,out] w DOUBLE PRECISION array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. @param s (workspace) DOUBLE PRECISION array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. @param[out] indxq INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. @param dwork (workspace) DOUBLE PRECISION array, dimension (3*N*N/2+3*N) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from dlaex1. @param[in] vl DOUBLE PRECISION @param[in] vu DOUBLE PRECISION if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_dsyev_aux ********************************************************************/ extern "C" magma_int_t magma_dlaex3(magma_int_t k, magma_int_t n, magma_int_t n1, double* d, double* Q, magma_int_t ldq, double rho, double* dlamda, double* Q2, magma_int_t* indx, magma_int_t* ctot, double* w, double* s, magma_int_t* indxq, double* dwork, magma_range_t range, double vl, double vu, magma_int_t il, magma_int_t iu, magma_int_t* info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) double d_one = 1.; double d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; double* dq2= dwork; double* ds = dq2 + n*(n/2+1); double* dq = ds + n*(n/2+1); magma_int_t lddq = n/2 + 1; magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2; double temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_dsetvector_async( lq2, Q2, 1, dq2, 1, NULL ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) { #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_dlamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_dvrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_dirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_dcopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_dcopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_dnrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } if (*info != 0) return *info; timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// magma_timer_t time=0; timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_dlamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_dlaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_dlamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) magma_dvrange(k, d, &iil, &iiu, vl, vu); else if (indeig) magma_dirange(k, indxq, &iil, &iiu, il, iu); else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_dcopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_dcopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_dnrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } timer_stop( time ); timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. timer_start( time ); magma_queue_sync( NULL ); if (rk != 0) { if ( n23 != 0 ) { if (rk < magma_get_dlaed3_k()) { lapackf77_dlacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_dgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_dsetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, ds, n23 ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, &dq2[iq2], n2, ds, n23, d_zero, dq, lddq); magma_dgetmatrix( n2, rk, dq, lddq, Q(n1,iil-1), ldq ); } } else lapackf77_dlaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { if (rk < magma_get_dlaed3_k()) { lapackf77_dlacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_dgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_dsetmatrix( n12, rk, Q(0,iil-1), ldq, ds, n12 ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dq2, n1, ds, n12, d_zero, dq, lddq); magma_dgetmatrix( n1, rk, dq, lddq, Q(0,iil-1), ldq ); } } else lapackf77_dlaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } timer_stop( time ); timer_printf( "gemms = %6.2f\n", time ); return *info; } /* magma_dlaex3 */
extern "C" magma_int_t magma_dpotrf_gpu(char uplo, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = 'U', or dA = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb; char uplo_[2] = {uplo, 0}; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *work; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); *info = 0; if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_dpotrf_nb(n); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] ); magma_queue_sync( stream[1] ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j<n; j+=nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_dsyrk(MagmaUpper, MagmaTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { /* Compute the current block row. */ magma_dgemm(MagmaTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j<n; j+=nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { magma_dgemm( MagmaNoTrans, MagmaTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm(MagmaRight, MagmaLower, MagmaTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } return *info; } /* magma_dpotrf_gpu */
extern "C" magma_int_t magma_dsytrf_nopiv(magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.6.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= DSYTRF_nopiv computes the LDLt factorization of a real symmetric matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = U\*\*H * D * U, if UPLO = 'U', or A = L * D * L\*\*H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U\*\*H*U or A = L*L\*\*H. Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ double zone = MAGMA_D_ONE; double mzone = MAGMA_D_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, ldda, nb, ib, iinfo; magmaDouble_ptr dA; magmaDouble_ptr dW; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; ldda = ((n+31)/32)*32; nb = magma_get_dsytrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization if ((MAGMA_SUCCESS != magma_dmalloc(&dA, n *ldda)) || (MAGMA_SUCCESS != magma_dmalloc(&dW, nb*ldda))) { /* alloc failed so call the non-GPU-resident version */ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, (CUstream_st**)stream ); //if (nb <= 1 || nb >= n) //{ // lapackf77_dpotrf(uplo_, &n, a, &lda, info); //} else { /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. // copy matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(j+jb, jb, A(0, j), lda, dA(0, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th column of U back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(j, jb, dA(0, j), ldda, A(0, j), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_dcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // copy the matrix to GPU for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async((n-j), jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); } // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_dgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), lda, stream[0]); trace_gpu_end( 0, 0 ); // copy j-th row of L back to CPU magma_queue_wait_event( stream[1], event ); trace_gpu_start( 0, 1, "get", "get" ); magma_dgetmatrix_async(jb, j, dA(j, 0), ldda, A(j, 0), lda, stream[1]); trace_gpu_end( 0, 1 ); // factorize the diagonal block magma_queue_sync(stream[0]); trace_cpu_start( 0, "potrf", "potrf" ); dsytrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), lda, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_dsetmatrix_async(jb, jb, A(j, j), lda, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_dcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_dlascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); magma_event_record( event, stream[0] ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_dgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); } trace_gpu_end( 0, 0 ); } } } } trace_finalize( "dsytrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free(dW); magma_free(dA); return MAGMA_SUCCESS; } /* magma_dsytrf_nopiv */
/** Purpose ------- DSGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. DSGEQRSV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] dB DOUBLE PRECISION array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[out] dX DOUBLE PRECISION array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGEQRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value @ingroup magma_dgels_driver ********************************************************************/ extern "C" magma_int_t magma_dsgeqrsv_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, magma_int_t ldda, magmaDouble_ptr dB, magma_int_t lddb, magmaDouble_ptr dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *hworkd; float *hworks; double *tau; float *stau; magmaDouble_ptr dworkd; magmaFloat_ptr dworks; magmaDouble_ptr dR, dT; magmaFloat_ptr dSA, dSX, dST; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_sgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_smalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_smalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_dlag2s( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_sgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_slag2d( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_dlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( m, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_dlag2s( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_sgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_dsaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_dlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( m, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for dgeqrf */ nb = magma_get_dgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_dmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for dgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_dmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_dgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_dgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
extern "C" magma_int_t magma_dlobpcg( magma_d_sparse_matrix A, magma_d_solver_par *solver_par ){ #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_d_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_dtranspose( m, n, X, m, blockW, n ); \ magma_d_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_d_spmv(alpha, A, x, beta, ax ); \ magmablas_dtranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); double *blockX = solver_par->eigenvectors; double *evalues = solver_par->eigenvalues; double *dwork, *hwork; double *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; double *gramA, *gramB, *gramM; double *gevectors, *h_gramB; double *pointer, *origX = blockX; double *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_dmalloc_pinned( &hwork , lwork ); magma_dmalloc( &blockAX , m*n ); magma_dmalloc( &blockAR , m*n ); magma_dmalloc( &blockAP , m*n ); magma_dmalloc( &blockR , m*n ); magma_dmalloc( &blockP , m*n ); magma_dmalloc( &blockW , m*n ); magma_dmalloc( &dwork , m*n ); magma_dmalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === double residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO; double *residualNorms, *condestGhistory, condestG; double *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_dmalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_dmalloc_cpu(&condestGhistory, maxIterations+1); magma_dmalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); double *hW; magma_dmalloc_pinned(&hW, n*n); magma_dmalloc_pinned(&gevectors, 9*n*n); magma_dmalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_dmalloc(&gramM, n * n); magma_dmalloc(&gramA, 9 * n * n); magma_dmalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_dmalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial vectors orthonormal === magma_dgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info ); //magma_dorthomgs( m, n, blockX ); magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); // === Update X = X * evectors === magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockX); // === Update AX = AX * evectors === magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockAX); condestGhistory[1] = 7.82; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); /* for(int i=0; i<n; i++){ magma_daxpy(m, MAGMA_D_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1); } */ #if defined(PRECISION_z) || defined(PRECISION_d) magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #else magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #endif magma_dlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu); magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === remove the residuals corresponding to already converged evectors magma_dcompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residulas: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) // magmablas_dlacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m); /* // === make the preconditioned residuals orthogonal to X magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m); */ // === make the active preconditioned residuals orthonormal magma_dgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info ); //magma_dorthomgs( m, cBlockSize, blockR ); // === compute AR magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR ); if (!restart) { // === compact P & AP as well magma_dcompactActive(m, n, blockP, m, activeMask); magma_dcompactActive(m, n, blockAP, m, activeMask); /* // === make P orthogonal to X ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m); // === make P orthogonal to R ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_dgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info ); //magma_dorthomgs( m, cBlockSize, blockP ); //magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP ); magma_dsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize); // magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, // m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #else magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #endif } iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize))); double condestGmean = 0.; for(int i = 0; i<iterationNumber-iter+1; i++) condestGmean += condestGhistory[i]; condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize; /* --- The Raileight-Ritz method for [X R P] ----------------------- [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e., GramA GramB / X'AX X'AR X'AP \ / X'X X'R X'P \ | R'AX R'AR R'AP | y = evalues | R'X R'R R'P | \ P'AX P'AR P'AP / \ P'X P'R P'P / ----------------------------------------------------------------- */ // === assemble GramB; first, set it to I magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram); // identity if (!restart) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram); } magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); lapackf77_dsyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, #endif info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", (int) iterationNumber); } // === assemble GramA; first, set it to I magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram); // identity magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram); if (!restart) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP, m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_D_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_dsygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_dsetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_dgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_daxpy(m*n, c_one, blockP, 1, blockX, 1); magma_dlobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_daxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_dlobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // double res; // magma_dgetmatrix(1, 1, // (double*)residualNorms(0, iterationNumber), 1, // (double*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_daxpy(m, MAGMA_D_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_dlacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== double *hresidualNorms; magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_dgetmatrix(n, iterationNumber, (double*)residualNorms, n, (double*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
void magmablas_dsyr2k_mgpu_spec( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, double alpha, double *dA[], magma_int_t lda, magma_int_t aoffset, double *dB[], magma_int_t ldb, magma_int_t boffset, double beta, double *dC[], magma_int_t ldc, magma_int_t coffset, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) ) #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) ) #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc) /* Check arguments */ magma_int_t info = 0; if ( uplo != MagmaLower ) { info = -1; // 'u' not yet handled } else if ( trans != MagmaNoTrans ) { info = -2; // 'c' not yet handled } else if ( n < 0 ) { info = -3; } else if ( k < 0 ) { info = -4; } else if ( ((trans == MagmaNoTrans) && lda < max(1,n)) || ((trans == MagmaTrans) && lda < max(1,k)) ) { info = -7; } else if ( aoffset < 0 || aoffset > lda ) { info = -8; } else if ( ((trans == MagmaNoTrans) && ldb < max(1,n)) || ((trans == MagmaTrans) && ldb < max(1,k)) ) { info = -10; } else if ( boffset < 0 || boffset > ldb ) { info = -11; } else if ( ldc < max(1,n) ) { info = -13; } else if ( coffset < 0 || coffset > ldc ) { info = -14; } else if ( ngpu <= 0 ) { info = -15; } else if ( nb <= 0 ) { info = -16; } else if ( nstream <= 0 ) { info = -18; } if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } const double c_one = MAGMA_D_ONE; double cbeta = MAGMA_D_MAKE( beta, 0. ); magma_int_t ib, ioff, iblock, idev, di, s; magma_device_t cdev; magma_queue_t cqueue; magma_getdevice( &cdev ); magmablasGetKernelStream( &cqueue ); // loop over all blocks // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C // blockoffset is offset within first block; for subsequent blocks it is 0 magma_int_t blockoffset = coffset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + coffset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nstream; magmablasSetKernelStream( streams[ idev ][ s ] ); // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i] //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k, alpha, dA(idev,0,0), lda, dB(idev,i,0), ldb, cbeta, dC(idev,coffset,di), ldc ); blockoffset = 0; } // second loop does C = (alpha)*B*A' + C_hat alpha = MAGMA_D_CNJG( alpha ); blockoffset = coffset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + coffset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nstream; magmablasSetKernelStream( streams[ idev ][ s ] ); // C[i:n,i] += (alpha) * B[i:n,0] * A[i,0]' //printf( "dgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k, alpha, dB(idev,0,0), ldb, dA(idev,i,0), lda, c_one, dC(idev,coffset,di), ldc ); blockoffset = 0; } magma_setdevice( cdev ); magmablasSetKernelStream( cqueue ); }
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_gpu(magma_uplo_t uplo, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i, j) (dA + (j)*ldda + (i)) magma_int_t j, jb, nb; const char* uplo_ = lapack_uplo_const( uplo ); double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *work; double d_one = 1.0; double d_neg_one = -1.0; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_dpotrf_nb(n); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] ); magma_queue_sync( stream[1] ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { /* Compute the current block row. */ magma_dgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { magma_dgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dpotrf_gpu */
/***************************************************************************//** Purpose ------- DLARFB applies a real block reflector H or its transpose H^H to a DOUBLE PRECISION m by n matrix C, from the left. __Note that this function assumes__ that the upper part of dV is 0 because it is referenced. Same for upper/lower part of dT. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply H or H^H from the Left - = MagmaRight: apply H or H^H from the Right @param[in] trans magma_trans_t - = MagmaNoTrans: apply H (No transpose) - = MagmaTrans: apply H^H (Conjugate transpose) @param[in] direct magma_direct_t Indicates how H is formed from a product of elementary reflectors - = MagmaForward: H = H(1) H(2) . . . H(k) (Forward) - = MagmaBackward: H = H(k) . . . H(2) H(1) (Backward) @param[in] storev magma_storev_t Indicates how the vectors which define the elementary reflectors are stored: - = MagmaColumnwise: Columnwise - = MagmaRowwise: Rowwise @param[in] m INTEGER The number of rows of the matrix C. @param[in] n INTEGER The number of columns of the matrix C. @param[in] k INTEGER The order of the matrix T (= the number of elementary reflectors whose product defines the block reflector). @param[in] dV DOUBLE PRECISION array on the GPU, dimension (LDDV,K) if STOREV = MagmaColumnwise (LDDV,M) if STOREV = MagmaRowwise and SIDE = MagmaLeft (LDDV,N) if STOREV = MagmaRowwise and SIDE = MagmaRight The matrix V. See further details. @param[in] lddv INTEGER The leading dimension of the array V. If STOREV = MagmaColumnwise and SIDE = MagmaLeft, LDDV >= max(1,M); if STOREV = MagmaColumnwise and SIDE = MagmaRight, LDDV >= max(1,N); if STOREV = MagmaRowwise, LDDV >= K. @param[in] dT DOUBLE PRECISION array on the GPU, dimension (LDDT,K) The triangular k by k matrix T in the representation of the block reflector. @param[in] lddt INTEGER The leading dimension of the array T. LDDT >= K. @param[in,out] dC DOUBLE PRECISION array on the GPU, dimension (LDDC,N) On entry, the m by n matrix C. On exit, C is overwritten by H*C, or H^H*C, or C*H, or C*H^H. @param[in] lddc INTEGER The leading dimension of the array C. LDDC >= max(1,M). @param dwork (workspace) DOUBLE PRECISION array, dimension (LDWORK,K) @param[in] ldwork INTEGER The leading dimension of the array WORK. If SIDE = MagmaLeft, LDWORK >= max(1,N); if SIDE = MagmaRight, LDWORK >= max(1,M); @param dworkvt (workspace) DOUBLE PRECISION array, dimension (LDWORKT,K) @param[in] ldworkvt INTEGER The leading dimension of the array WORKVT. LDWORKVT >= max(1,min(M,N)); @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- The shape of the matrix V and the storage of the vectors which define the H(i) is best illustrated by the following example with n = 5 and k = 3. All elements including 0's and 1's are stored, unlike LAPACK. DIRECT = MagmaForward and DIRECT = MagmaForward and STOREV = MagmaColumnwise: STOREV = MagmaRowwise: V = ( 1 0 0 ) V = ( 1 v1 v1 v1 v1 ) ( v1 1 0 ) ( 0 1 v2 v2 v2 ) ( v1 v2 1 ) ( 0 0 1 v3 v3 ) ( v1 v2 v3 ) ( v1 v2 v3 ) DIRECT = MagmaBackward and DIRECT = MagmaBackward and STOREV = MagmaColumnwise: STOREV = MagmaRowwise: V = ( v1 v2 v3 ) V = ( v1 v1 1 0 0 ) ( v1 v2 v3 ) ( v2 v2 v2 1 0 ) ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) ( 0 1 v3 ) ( 0 0 1 ) @ingroup magma_larfb *******************************************************************************/ extern "C" magma_int_t magma_dlarfb_gpu_gemm_q( magma_side_t side, magma_trans_t trans, magma_direct_t direct, magma_storev_t storev, magma_int_t m, magma_int_t n, magma_int_t k, magmaDouble_const_ptr dV, magma_int_t lddv, magmaDouble_const_ptr dT, magma_int_t lddt, magmaDouble_ptr dC, magma_int_t lddc, magmaDouble_ptr dwork, magma_int_t ldwork, magmaDouble_ptr dworkvt, magma_int_t ldworkvt, magma_queue_t queue ) { /* Constants */ const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t info = 0; /* Function Body */ if (m <= 0 || n <= 0) { return info; } /* Local variables */ magma_int_t ldwvt = (m > n ? k : m); magma_int_t ldw; if ( side == MagmaLeft ) { ldw = k; } else { ldw = m; } // whether V is stored transposed or not magma_trans_t notransV, transV; if (storev == MagmaColumnwise) { notransV = MagmaNoTrans; transV = MagmaTrans; } else { notransV = MagmaTrans; transV = MagmaNoTrans; } if ( side == MagmaLeft ) { // Form H C or H^H C // Comments assume H C. // When forming H^H C, T gets transposed via trans. // W = V^H C magma_dgemm( transV, MagmaNoTrans, k, n, m, c_one, dV, lddv, dC, lddc, c_zero, dwork, ldw, queue ); if (m <= n) { // W2 = V T magma_dgemm( notransV, trans, m, k, k, c_one, dV, lddv, dT, lddt, c_zero, dworkvt, ldwvt, queue ); // C = C - W2 W = C - V T V^H C = (I - V T V^H) C = H C magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, k, c_neg_one, dworkvt, ldwvt, dwork, ldw, c_one, dC, lddc, queue ); } else { // W2 = T W = T V^H C magma_dgemm( trans, MagmaNoTrans, k, n, k, c_one, dT, lddt, dwork, ldw, c_zero, dworkvt, ldwvt, queue ); // C = C - V W2 = C - V T V^H C = (I - V T V^H) C = H C magma_dgemm( notransV, MagmaNoTrans, m, n, k, c_neg_one, dV, lddv, dworkvt, ldwvt, c_one, dC, lddc, queue ); } } else { // Form C H or C H^H // Comments assume C H. // When forming C H^H, T gets transposed via trans. // W = C V magma_dgemm( MagmaNoTrans, notransV, m, k, n, c_one, dC, lddc, dV, lddv, c_zero, dwork, ldw, queue ); if (m <= n) { // W2 = W T = C V T magma_dgemm( MagmaNoTrans, trans, m, k, k, c_one, dwork, ldw, dT, lddt, c_zero, dworkvt, ldwvt, queue ); // C = C - W2 V^H = C - C V T V^H = C (I - V T V^H) = C H magma_dgemm( MagmaNoTrans, transV, m, n, k, c_neg_one, dworkvt, ldwvt, dV, lddv, c_one, dC, lddc, queue ); } else { // W2 = T V^H magma_dgemm( trans, transV, k, n, k, c_one, dT, lddt, dV, lddv, c_zero, dworkvt, ldwvt, queue ); // C = C - W W2 = C - C V T V^H = C (I - V T V^H) = C H magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, k, c_neg_one, dwork, ldw, dworkvt, ldwvt, c_one, dC, lddc, queue ); } } return info; } /* magma_dlarfb */
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, double *dT, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define dA(a_1,a_2) (dA + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_1) (dT + ((a_1)-1)*(lddt)) int ldda = ((n+31)/32)*32; int lddt = nb; double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; int i; int lwkopt; int lquery; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); double *dA; if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); /* Use the first panel of dA as work space */ double *dwork = dA + n*ldda; double *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif magma_queue_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); stream[2] = 0; // default stream trace_init( 1, 1, 3, stream ); double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); magmablasSetKernelStream( stream[0] ); magma_event_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //magma_event_create(&Pupdate_event); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Copy the matrix to the GPU */ if (1 <= n-nb) { trace_gpu_start( 0, 0, "set", "set A" ); magma_dsetmatrix_async( (n-nb), (n-nb), A(nb+1, nb+1), lda, dA(nb+1, nb+1), ldda, stream[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( stream[0] ); magma_queue_wait_event(stream[1], Pupdate_event); //, 0); magma_dgetmatrix_async( (pm+pn), pn, dA( i, i), ldda, A ( i, i), lda, stream[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "her2k", "her2k" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dA(indi_old+pn_old, indj_old), ldda, dW + pn_old, pm_old, d_one, dA(indi_old+pn_old, indi_old+pn_old), ldda); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( stream[1] ); trace_cpu_end( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dA(indi, indj), ldda, stream[0] ); /* Send the triangular factor T to the GPU */ magma_dsetmatrix_async( pk, pk, hT, nb, dT(i), lddt, stream[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has been finished // because below we made a restore dq_to_panel and this restore need // to ensure that the copy has been finished. we did it here to allow // overlapp of restore with next gemm and symm. magma_queue_sync( stream[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dA(indi, indj), ldda, dT(i), lddt, c_zero, dwork, pm); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T. dW = A*dwork */ trace_gpu_start( 0, 2, "hemm", "X = A*work" ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dA(indi, indi), ldda, dwork, pm, c_zero, dW, pm); trace_gpu_end( 0, 2 ); /* restore the panel */ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dA(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dA(indi, indj), ldda, dW, pm, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dW, pm, dA(indi, indj), ldda, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); magma_event_record(Pupdate_event, stream[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "her2k", "her2k last iteration" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dA(indi, indj), ldda, dW, pm, d_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb) { dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_dgetmatrix( pk, pk, dA(n-pk+1, n-pk+1), ldda, A(n-pk+1, n-pk+1), lda ); trace_gpu_end( 0, 2 ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "dsytrd_sy2sb.svg", "trace.css" ); magma_event_destroy( Pupdate_event ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); return *info; } /* magma_dsytrd_sy2sb */
extern "C" magma_int_t magma_dgeqrs_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, double *tau, magmaDouble_ptr dT, size_t dT_offset, magmaDouble_ptr dB, size_t dB_offset, magma_int_t lddb, double *hwork, magma_int_t lwork, magma_queue_t queue, magma_int_t *info) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by DGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by DGEQRF_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) DOUBLE_PRECISION array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_DGEQRF_GPU. DB (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) DOUBLE_PRECISION array that is the output (the 6th argument) of magma_dgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) DOUBLE_PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_dgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define d_ref(a_1) dT, (dT_offset + (lddwork+(a_1))*nb) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magmaDouble_ptr dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_dgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_D_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_dormqr_gpu( MagmaLeft, MagmaConjTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, dB_offset, lddb, hwork, lwork, dT, dT_offset, nb, queue, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; int ldtwork; size_t dwork_offset = 0; if (nb < k) { dwork = dT; dwork_offset = dT_offset+2*lddwork*nb; } else { ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; magma_dmalloc( &dwork, ldtwork ); } // To do: Why did we have this line originally; seems to be a bug (Stan)? //dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_dormqr_gpu, hwork contains // the last block of A and B (i.e., C in dormqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_dtrsm and drop the dsetmatrix. if ( nrhs == 1 ) { blasf77_dtrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_dtrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_dsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork, dwork_offset+i, lddwork, queue ); // update c if (nrhs == 1) magma_dgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, 1, c_one, dB, dB_offset, 1, queue ); else magma_dgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset + i, lddwork, c_one, dB, dB_offset, lddb, queue ); int start = i-nb; if (nb < k) { for (i = start; i >=0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_dgemv( MagmaNoTrans, ib, ib, c_one, d_ref(i), ib, dB, dB_offset+i, 1, c_zero, dwork, dwork_offset+i, 1, queue ); magma_dgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, 1, c_one, dB, dB_offset, 1, queue ); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, d_ref(i), ib, dB, dB_offset+i, lddb, c_zero, dwork, dwork_offset+i, lddwork, queue ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork, dwork_offset+i, lddwork, c_one, dB, dB_offset, lddb, queue ); } } } } magma_dcopymatrix( (n), nrhs, dwork, dwork_offset, lddwork, dB, dB_offset, lddb, queue ); if (nb >= k) magma_free(dwork); magma_queue_sync( queue ); return *info; }
extern "C" magma_int_t magma_dpotrf2_mgpu( magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDouble_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, magmaDouble_ptr *d_lP, magma_int_t lddp, double *a, magma_int_t lda, magma_int_t h, magma_queue_t *queues, magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; magmaDouble_ptr dlpanel; size_t dlpanel_offset; magma_int_t n_local[MagmaMaxGPUs], ldpanel; magma_event_t events[MagmaMaxGPUs]; *info = 0; if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) { *info = -1; } else if (n < 0) { *info = -2; } else if ((uplo != MagmaUpper) && num_gpus*ldda < max(1,n)) { *info = -4; } else if ((uplo == MagmaUpper) && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } { for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (uplo == MagmaUpper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } } /* Use blocked code. */ if (uplo == MagmaUpper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for(j=0;j<m;j+=nb){ /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if(j>0){ magma_queue_sync(queues[id*2]); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for(dd=0;dd<num_gpus;dd++){ if(d != id){ //magma_queue_sync(queues[2*d]); magma_dsetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, queues[d*2], NULL ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ if( j > 0 ) { magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda, queues[2*id+1]); } /* send the diagonal to cpu */ magma_queue_sync(queues[2*id+1]);// wait for syrk magma_dgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, queues[2*id], NULL); if(j>0){ /* Compute the local block column of the panel. */ d = (j/nb+1)%num_gpus; for(dd=0;dd<num_gpus;dd++){ j_local2 = j_local+1; if(d>id) j_local2 --; nb0 = nb*j_local2; if(n_local[d]>nb0){ /* wait for the off-diagonal */ if(d!=id){ dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_sync(queues[d*2]); }else{ dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_dgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda, queues[2*d+1]); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_queue_sync( queues[id*2] ); // wait for the diagonal lapackf77_dpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, buf); ldpanel = lddp; } magma_dsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[d*2], NULL); d = (d+1)%num_gpus; } } else { magma_dsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[id*2], NULL ); } /* panel-factorize the off-diagonal */ if((j+jb)<n){ d = (j/nb+1)%num_gpus; for(dd=0;dd<num_gpus;dd++){ /* next column */ j_local2 = j_local+1; if(d>id) j_local2--; if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_queue_sync( queues[2*d]); // wait for the diagonal if(j+jb < m && d == (j/nb+1)%num_gpus){ /* owns the next column, look-ahead the column */ magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*d+1]); /* send the column to cpu */ if(j+jb < m){ magma_queue_sync(queues[2*d+1]); // wait for lookahead magma_dgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, queues[2*d], NULL); } /* update the remaining blocks */ nb2 = nb2 - nb0; magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, queues[2*d+1]); }else if(nb2 > 0){ /* update the entire trailing matrix */ magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[d*2+1]); } d = (d+1)%num_gpus; } } } } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if( j > 0 ) { /* needed on pluto... */ //magma_setdevice(id); magma_queue_sync( queues[id*2] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { /* wait for it on CPU */ //magma_queue_sync( queues[d*2] ); /* send it to GPU */ magma_dsetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, queues[d*2], NULL ); clFlush(queues[d*2]); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ if( j > 0 ) { magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda, queues[id*2+1]); magma_queue_sync( queues[id*2+1] ); // wait for syrk } /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { if( d != id ) { //dlpanel = dlPT(d); dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_sync(queues[d*2]); } else { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(nb*j_local, 0); ldpanel = ldda; } magma_dgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, dlpanel_offset, ldpanel, c_one, dlA(d, nb0, j), ldda, queues[d*2+1]); } d = (d+1)%num_gpus; } } /* send the diagonal to cpu */ magma_dgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, queues[id*2], &events[id] ); clFlush(queues[id*2]); /* factor the diagonal */ magma_queue_sync( queues[id*2] ); lapackf77_dpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { printf("row number: %d\n", (int) j); *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { //dlpanel = dlPT(d); dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, buf); ldpanel = nb; } magma_dsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[d*2], NULL ); clFlush(queues[d*2]); d = (d+1)%num_gpus; } } else { magma_dsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[id*2], NULL ); clFlush(queues[id*2]); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = d_lA[d]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { //dlpanel = dlPT(d); dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_queue_sync(queues[d*2]); // wait for the diagonal if( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[d*2+1]); /* send the column to cpu */ if( j+jb < n ) { magma_queue_sync( queues[d*2+1] ); // wait for lookahead magma_dgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[d*2], NULL); clFlush(queues[d*2]); } /* update the remaining blocks */ nb2 = nb2 - nb0; magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, queues[d*2+1]); } else if( nb2 > 0 ) { /* update the entire trailing matrix */ magma_dtrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[d*2+1]); } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[d*2] ); magma_queue_sync( queues[d*2+1] ); } } /* end of not lapack */ return *info; } /* magma_dpotrf_mgpu */
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by DGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by DGEQRF_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau DOUBLE_PRECISION array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_DGEQRF_GPU. @param[in,out] dB DOUBLE_PRECISION array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT DOUBLE_PRECISION array that is the output (the 6th argument) of magma_dgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) DOUBLE_PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_dgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgels_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrs_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, double *tau, double *dT, double *dB, magma_int_t lddb, double *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_dgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_D_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_dormqr_gpu( MagmaLeft, MagmaTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; if (nb < k) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_dormqr_gpu, hwork contains // the last block of A and B (i.e., C in dormqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_dtrsm and drop the dsetmatrix. if ( nrhs == 1 ) { blasf77_dtrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_dtrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_dsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork ); // update c if (nrhs == 1) magma_dgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); else magma_dgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); int start = i-nb; if (nb < k) { for (i = start; i >= 0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_dgemv( MagmaNoTrans, ib, ib, c_one, dT(i), ib, dB+i, 1, c_zero, dwork+i, 1); magma_dgemv( MagmaNoTrans, i, ib, c_neg_one, dA(0, i), ldda, dwork + i, 1, c_one, dB, 1); } else { magma_dgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, dT(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork); magma_dgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, dA(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); } } } } magma_dcopymatrix( (n), nrhs, dwork, lddwork, dB, lddb ); return *info; }
/***************************************************************************//** Purpose ------- DLAUUM computes the product U * U^H or L^H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. If UPLO = MagmaUpper then the upper triangle of the result is stored, overwriting the factor U in A. If UPLO = MagmaLower then the lower triangle of the result is stored, overwriting the factor L in A. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the triangular factor stored in the array A is upper or lower triangular: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the triangular factor U or L. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the triangular factor U or L. On exit, if UPLO = MagmaUpper, the upper triangle of A is overwritten with the upper triangle of the product U * U^H; if UPLO = MagmaLower, the lower triangle of A is overwritten with the lower triangle of the product L^H * L. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value @ingroup magma_lauum *******************************************************************************/ extern "C" magma_int_t magma_dlauum( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { #define A(i_, j_) ( A + (i_) + (j_)*lda ) #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const double c_one = MAGMA_D_ONE; const double d_one = MAGMA_D_ONE; const char* uplo_ = lapack_uplo_const( uplo ); /* Local variables */ magma_int_t i, ib, ldda, nb; magmaDouble_ptr dA; bool upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if (n == 0) return *info; nb = magma_get_dpotrf_nb( n ); ldda = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= n) { lapackf77_dlauum( uplo_, &n, A, &lda, info ); } else if (upper) { /* Compute the product U * U^H. */ // Computing 2nd block column (diagonal & above): // [ u11 u12 u13 ] [ u11^H ] [ ... u12*u22^H + u13*u23^H ... ] // [ u22 u23 ] * [ u12^H u22^H ] = [ ... u22*u22^H + u23*u23^H ... ] // [ u33 ] [ u13^H u23^H u33^H ] [ ... ... ... ] for (i=0; i < n; i += nb) { ib = min( nb, n-i ); // Send diagonl block, u22 // This must finish before lauum below magma_dsetmatrix( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); // Send right of diagonl block, u23 magma_dsetmatrix_async( ib, n-i-ib, A(i,i+ib), lda, dA(i,i+ib), ldda, queues[1] ); // u12 = u12 * u22^H magma_dtrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0,i), ldda, queues[0] ); // u22 = u22 * u22^H lapackf77_dlauum( MagmaUpperStr, &ib, A(i,i), &lda, info ); magma_dsetmatrix_async( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); if (i+ib < n) { // wait for u23 magma_queue_sync( queues[1] ); // u12 += u13 * u23^H magma_dgemm( MagmaNoTrans, MagmaConjTrans, i, ib, n-i-ib, c_one, dA(0,i+ib), ldda, dA(i,i+ib), ldda, c_one, dA(0,i), ldda, queues[0] ); // u22 += u23 * u23^H magma_dsyrk( MagmaUpper, MagmaNoTrans, ib, n-i-ib, d_one, dA(i,i+ib), ldda, d_one, dA(i,i), ldda, queues[0] ); } // Get diagonal block & above of current column from device // This could be on a different queue -- not needed until return magma_dgetmatrix_async( i+ib, ib, dA(0,i), ldda, A(0,i), lda, queues[0] ); } } else { /* Compute the product L^H * L. */ for (i=0; i < n; i += nb) { ib = min( nb, n-i ); magma_dsetmatrix( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); magma_dsetmatrix_async( n-i-ib, ib, A(i+ib,i), lda, dA(i+ib,i), ldda, queues[1] ); magma_dtrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i,0), ldda, queues[0] ); lapackf77_dlauum( MagmaLowerStr, &ib, A(i,i), &lda, info ); magma_dsetmatrix_async( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); if (i+ib < n) { magma_queue_sync( queues[1] ); magma_dgemm( MagmaConjTrans, MagmaNoTrans, ib, i, n-i-ib, c_one, dA(i+ib,i), ldda, dA(i+ib,0), ldda, c_one, dA(i,0), ldda, queues[0] ); magma_dsyrk( MagmaLower, MagmaConjTrans, ib, n-i-ib, d_one, dA(i+ib,i), ldda, d_one, dA(i,i), ldda, queues[0] ); } magma_dgetmatrix_async( ib, i+ib, dA(i,0), ldda, A(i,0), lda, queues[0] ); } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; }
/** Purpose ------- DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by DGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T DOUBLE_PRECISION array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_dgeev_aux ********************************************************************/ extern "C" magma_int_t magma_dlahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *T, magma_int_t ldt, double *Y, magma_int_t ldy, struct dgehrd_data* data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; double scale; magma_int_t i; double ei = MAGMA_D_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_D_NEGATE( tau[i-1] ); blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_dcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_dtrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_dgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_dgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_dtrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_daxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_dlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_dsetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_dlacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_dgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_D_NEGATE( tau[i] ); blasf77_dgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. // This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If real, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_dgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_dgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_dgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_dsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } return 0; } /* magma_dlahr2 */
/** Purpose ------- DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. It uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_)*nb + (j_)*nb*ldda + dA_offset) #define dAT(i_, j_) dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset) #define dwork(i_) dwork, (i_) #else #define dA(i_, j_) ( dA + (i_)*nb + (j_)*nb*ldda) #define dAT(i_, j_) ( dAT + (i_)*nb*lddat + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif // Constants const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; // Local variables double *work; magmaDouble_ptr dA, dAT, dwork; magma_int_t iinfo, nb; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_dgetrf_nb( m, n ); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_dgetrf( &m, &n, A, &lda, ipiv, info ); } else { /* Use hybrid blocked code. */ magma_int_t maxm, maxn, ldda, lddat, maxdim; magma_int_t i, j, rows, cols, s = min(m, n)/nb; maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); maxdim = max( maxm, maxn ); lddat = maxn; ldda = maxm; /* set number of GPUs */ magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magma_queue_t queues[2] = { NULL, NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); /* check the memory requirement */ size_t mem_size = magma_queue_mem_size( queues[0] ); mem_size /= sizeof(double); magma_int_t h = 1+(2+ngpu); magma_int_t ngpu2 = ngpu; magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( ngpu > ceil((double)NB/nb) ) { ngpu2 = (magma_int_t)ceil((double)NB/nb); h = 1+(2+ngpu2); NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); } if ( ngpu2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place // dwork is nb*maxm for panel, and maxdim*maxdim for A if (MAGMA_SUCCESS != magma_dmalloc( &dwork, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; ldda = lddat = maxdim; magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); dAT = dA; magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place // dwork is nb*maxm for panel, and maxm*maxn for A if (MAGMA_SUCCESS != magma_dmalloc( &dwork, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dwork ); magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } lapackf77_dgetrf( &m, &nb, work, &lda, ipiv, &iinfo ); for( j = 0; j < s; j++ ) { // get j-th panel from device cols = maxm - j*nb; if (j > 0) { magmablas_dtranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] ); magma_queue_sync( queues[0] ); magma_dgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); // do the cpu part rows = m - j*nb; magma_queue_sync( queues[1] ); lapackf77_dgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo ); } if (*info == 0 && iinfo > 0) *info = iinfo + j*nb; // put j-th panel onto device magma_dsetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] ); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); magma_queue_sync( queues[1] ); magmablas_dtranspose( cols, nb, dwork(0), cols, dAT(j,j), lddat, queues[0] ); // do the small non-parallel computations (next panel update) if (s > (j+1)) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] ); magma_dgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] ); magma_queue_sync( queues[0] ); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo ); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); // put j-th panel onto device magma_dsetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] ); magmablas_dtranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), lddat, dAT(s, s)+nb0, lddat, queues[0] ); } // undo transpose if (maxdim*maxdim < 2*maxm*maxn) { magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); magma_dgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] ); } else { magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_dgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] ); magma_free( dAT ); } magma_free( dwork ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); } return *info; } /* magma_dgetrf */
extern "C" magma_int_t magma_dlarfb_gpu( magma_side_t side, magma_trans_t trans, magma_direct_t direct, magma_storev_t storev, magma_int_t m, magma_int_t n, magma_int_t k, magmaDouble_ptr dV, size_t dV_offset, magma_int_t ldv, magmaDouble_ptr dT, size_t dT_offset, magma_int_t ldt, magmaDouble_ptr dC, size_t dC_offset, magma_int_t ldc, magmaDouble_ptr dwork, size_t dwork_offset, magma_int_t ldwork, magma_queue_t queue) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DLARFB applies a real block reflector H or its transpose H' to a DOUBLE_PRECISION m by n matrix C, from the left. Arguments ========= SIDE (input) CHARACTER = 'L': apply H or H' from the Left = 'R': apply H or H' from the Right TRANS (input) CHARACTER = 'N': apply H (No transpose) = 'C': apply H' (Conjugate transpose) DIRECT (input) CHARACTER Indicates how H is formed from a product of elementary reflectors = 'F': H = H(1) H(2) . . . H(k) (Forward) = 'B': H = H(k) . . . H(2) H(1) (Backward) STOREV (input) CHARACTER Indicates how the vectors which define the elementary reflectors are stored: = 'C': Columnwise = 'R': Rowwise M (input) INTEGER The number of rows of the matrix C. N (input) INTEGER The number of columns of the matrix C. K (input) INTEGER The order of the matrix T (= the number of elementary reflectors whose product defines the block reflector). DV (input) DOUBLE_PRECISION array, dimension (LDV,K) The matrix V. See further details. LDV (input) INTEGER The leading dimension of the array V. LDV >= max(1,M); DT (input) DOUBLE_PRECISION array, dimension (LDT,K) The triangular k by k matrix T in the representation of the block reflector. LDT (input) INTEGER The leading dimension of the array T. LDT >= K. DC (input/output) DOUBLE_PRECISION array, dimension (LDC,N) On entry, the m by n matrix C. On exit, C is overwritten by H*C. LDC (input) INTEGER The leading dimension of the array C. LDA >= max(1,M). WORK (workspace) DOUBLE_PRECISION array, dimension (LDWORK,K) LDWORK (input) INTEGER The leading dimension of the array WORK. If SIDE == 'L', LDWORK >= max(1,N); if SIDE == 'R', LDWORK >= max(1,M); =================================================================== */ /* TODO: replace with updated larfb_gpu from CUDA MAGMA */ #define dV(i) dV, (i) #define dT(i) dT, (i) #define dC(i) dC, (i) #define dwork(i) dwork, (i) double c_zero = MAGMA_D_MAKE( 0.0, 0.0 ); double c_one = MAGMA_D_MAKE( 1.0, 0.0 ); double c_neg_one = MAGMA_D_MAKE( -1.0, 0.0 ); if (m <= 0 || n <= 0) { return MAGMA_SUCCESS; } magma_trans_t transt; if (trans == MagmaNoTrans) transt = MagmaConjTrans; else transt = MagmaNoTrans; if ( side == MagmaLeft ) { if ( storev == MagmaColumnwise ) { magma_dgemm( MagmaConjTrans, MagmaNoTrans, n, k, m, c_one, dC(dC_offset), ldc, dV(dV_offset), ldv, c_zero, dwork(dwork_offset), ldwork, queue); if (direct == MagmaForward) magma_dtrmm( MagmaRight, MagmaUpper, transt, MagmaNonUnit, n, k, c_one, dT(dT_offset), ldt, dwork(dwork_offset), ldwork, queue); else magma_dtrmm( MagmaRight, MagmaLower, transt, MagmaNonUnit, n, k, c_one, dT(dT_offset), ldt, dwork(dwork_offset), ldwork, queue); magma_dgemm( MagmaNoTrans, MagmaConjTrans, m, n, k, c_neg_one, dV(dV_offset), ldv, dwork(dwork_offset), ldwork, c_one, dC(dC_offset), ldc, queue); } else { magma_dgemm( MagmaNoTrans, MagmaConjTrans, m, k, n, c_one, dC(dC_offset), ldc, dV(dV_offset), ldv, c_zero, dwork(dwork_offset), ldwork, queue); magma_dtrmm( MagmaRight, MagmaUpper, transt, MagmaNonUnit, m, k, c_one, dT(dT_offset), ldt, dwork(dwork_offset), ldwork, queue); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, k, c_neg_one, dwork(dwork_offset), ldwork, dV(dV_offset), ldv, c_one, dC(dC_offset), ldc, queue); } } else { /* Case side == 'R' */ if ( storev == MagmaColumnwise ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, k, n, c_one, dC(dC_offset), ldc, dV(dV_offset), ldv, c_zero, dwork(dwork_offset), ldwork, queue); // ??? ldwork replaced by k for case n < k if (direct == MagmaForward) magma_dtrmm( MagmaRight, MagmaUpper, transt, MagmaNonUnit, m, k, c_one, dT(dT_offset), ldt, dwork(dwork_offset), ldwork, queue); else magma_dtrmm( MagmaRight, MagmaLower, transt, MagmaNonUnit, m, k, c_one, dT(dT_offset), ldt, dwork(dwork_offset), ldwork, queue); magma_dgemm( MagmaNoTrans, MagmaConjTrans, m, n, k, c_neg_one, dwork(dwork_offset), ldwork, dV(dV_offset), ldv, c_one, dC(dC_offset), ldc, queue); } else { magma_dgemm( MagmaNoTrans, MagmaConjTrans, m, k, n, c_one, dC(dC_offset), ldc, dV(dV_offset), ldv, c_zero, dwork(dwork_offset), ldwork, queue); magma_dtrmm( MagmaRight, MagmaUpper, transt, MagmaNonUnit, m, k, c_one, dT(dT_offset), ldt, dwork(dwork_offset), ldwork, queue); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, k, c_neg_one, dwork(dwork_offset), ldwork, dV(dV_offset), ldv, c_one, dC(dC_offset), ldc, queue); } } return MAGMA_SUCCESS; } /* magma_dlarfb */
/** Purpose ------- DGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf_nopiv_gpu(magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i,j) (dA + (i)*nb + (j)*nb*ldda) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddwork; double *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_dgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_dmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( m, n, dA, ldda, work, m ); magma_dgetrf_nopiv( m, n, work, m, info); magma_dsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddwork = maxm; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, maxm*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = current_stream; } for( i=0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; magma_queue_sync( stream[1] ); magma_dgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] ); if ( i > 0 ) { magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (i+1)*nb, c_one, dA(i-1,i-1), ldda, dA(i-1,i+1), ldda ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i*nb, n-(i+1)*nb, nb, c_neg_one, dA(i, i-1), ldda, dA(i-1,i+1), ldda, c_one, dA(i, i+1), ldda ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); magma_dgetrf_nopiv( rows, nb, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_dsetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] ); magma_queue_sync( stream[0] ); // do the small non-parallel computations if ( s > (i+1) ) { magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } else { magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, n-(i+1)*nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magma_dgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part magma_dgetrf_nopiv( rows, nb0, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; // upload i-th panel magma_dsetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda ); magma_dtrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* magma_dgetrf_nopiv_gpu */
extern "C" magma_err_t magma_dgetrf2_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaDouble_ptr *d_lAT, size_t dlAT_offset, magma_int_t lddat, magma_int_t *ipiv, magmaDouble_ptr *d_lAP, size_t dlAP_offset, double *w, magma_int_t ldw, magma_int_t *info, magma_queue_t *queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -7, internal GPU memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) d_lAT[(id)], (((offset)+(i)*nb)*lddat + (j)*nb) #define inAT_offset(i, j) (((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w+((j)%num_gpus)*nb*ldw) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t block_size = 32; magma_int_t iinfo, n_local[4]; magma_int_t maxm, mindim; magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4]; magma_int_t id, i_local, i_local2, nb0, nb1; magmaDouble_ptr d_panel[4], panel_local[4]; size_t d_panel_offset[4]; size_t panel_local_offset[4]; //cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (num_gpus*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); //nb = magma_get_dgetrf_nb(m); if( num_gpus > ceil((double)n/nb) ) { *info = -1; return *info; } else{ printf("dgetrf2_mgpu num_gpu: %d\n", num_gpus); /* Use hybrid blocked code. */ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for(i=0; i<num_gpus; i++){ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ //d_panel[i] = &(d_lAP[i][nb*maxm]); /* temporary panel storage */ d_panel[i] = d_lAP[i]; d_panel_offset[i] = nb*maxm; } /* start sending the panel to cpu */ nb0 = min(mindim, nb); if( nb0 == nb ) magma_dtranspose( d_lAP[0], 0, maxm, inAT(0,0,0), lddat, nb0, maxm, queues[2*0+1] ); else magma_dtranspose2( d_lAP[0], 0, maxm, inAT(0,0,0), lddat, nb0, maxm, queues[2*0+1] ); magma_dgetmatrix_async( m, nb0, d_lAP[0], 0, maxm, W(0), 0, ldw, queues[2*0+1], NULL ); clFlush(queues[2*0+1]); /* ------------------------------------------------------------------------------------- */ s = mindim / nb; for( i=0; i<s; i++ ) { /* Set the GPU number that holds the current panel */ id = i%num_gpus; /* Set the local index where the current panel is */ i_local = i/num_gpus; // cols for gpu panel cols = maxm - i*nb; // rows for cpu panel rows = m - i*nb; /* synchrnoize i-th panel from id-th gpu into work */ magma_queue_sync( queues[2*id+1] ); /* i-th panel factorization */ lapackf77_dgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) { *info = iinfo + i*nb; //break; } /* start sending the panel to all the gpus */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_dsetmatrix_async( rows, nb, W(i), 0, ldw, d_lAP[d], dlAP_offset, cols, queues[2*d+1], NULL ); d = (d+1)%num_gpus; } /* apply the pivoting */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if(dd==0){ // row offset will be added to ipiv in long2 magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] ); }else{ // ipiv is already added by row offset, calling long3 //magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] ); magma_dpermute_long3( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb, queues[2*d] ); } d = (d+1)%num_gpus; } /* update the trailing-matrix/look-ahead */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* storage for panel */ if( d == id ) { /* the panel belond to this gpu */ //panel_local[d] = inAT(d,i,i_local); panel_local[d] = d_lAT[d]; panel_local_offset[d] = inAT_offset(i, i_local); ldpan[d] = lddat; /* next column */ i_local2 = i_local+1; } else { /* the panel belong to another gpu */ //panel_local[d] = &d_panel[d][(i%2)*nb*maxm]; panel_local[d] = d_panel[d]; panel_local_offset[d] = d_panel_offset[d] + (i%2)*nb*maxm; //panel_local[d] = d_panel[d]; ldpan[d] = nb; /* next column */ i_local2 = i_local; if( d < id ) i_local2 ++; } /* the size of the next column */ if ( s > (i+1) ) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/num_gpus); if( d < s%num_gpus ) nb0 -= nb; } if( d == (i+1)%num_gpus) { /* owns the next column, look-ahead the column */ nb1 = nb0; /* make sure all the pivoting has been applied */ //magma_queue_sync(queues[2*d]); } else { /* update the entire trailing matrix */ nb1 = n_local[d] - i_local2*nb; /* synchronization to make sure panel arrived on gpu */ //magma_queue_sync(queues[2*d+1]); } /* magma_queue_sync(queues[2*d]); magma_queue_sync(queues[2*d+1]); */ //magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d]); /* gpu updating the trailing matrix */ if(d == (i+1)%num_gpus){ magma_queue_sync(queues[2*d]); magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d+1]); magma_queue_sync(queues[2*d+1]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], panel_local_offset[d], ldpan[d], inAT(d, i, i_local2), lddat, queues[2*d+1]); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, inAT(d, i+1, i_local2), lddat, queues[2*d+1]); }else{ magma_queue_sync(queues[2*d+1]); magma_dtranspose(panel_local[d], panel_local_offset[d], ldpan[d], d_lAP[d], 0, cols, cols, nb, queues[2*d]); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], panel_local_offset[d], ldpan[d], inAT(d, i, i_local2), lddat, queues[2*d]); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, inAT(d, i+1, i_local2), lddat, queues[2*d]); } if( d == (i+1)%num_gpus ) { /* Set the local index where the current panel is */ int loff = i+1; int i_local = (i+1)/num_gpus; int ldda = maxm - (i+1)*nb; int cols = m - (i+1)*nb; nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */ if( nb0 > 0 ) { /* transpose the panel for sending it to cpu */ if( i+1 < s ) magma_dtranspose( d_lAP[d], 0, ldda, inAT(d,loff,i_local), lddat, nb0, ldda, queues[2*d+1] ); else magma_dtranspose2( d_lAP[d], 0, ldda, inAT(d,loff,i_local), lddat, nb0, ldda, queues[2*d+1] ); //clFinish(queues[2*d+1]); /* send the panel to cpu */ magma_dgetmatrix_async( cols, nb0, d_lAP[d], 0, ldda, W(i+1), 0, ldw, queues[2*d+1], NULL ); } } else { //trace_gpu_end( d, 0 ); } d = (d+1)%num_gpus; } /* update the remaining matrix by gpu owning the next panel */ if( (i+1) < s ) { int i_local = (i+1)/num_gpus; int rows = m - (i+1)*nb; d = (i+1)%num_gpus; magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (i_local+1)*nb, nb, c_one, panel_local[d], panel_local_offset[d], ldpan[d], inAT(d,i,i_local+1), lddat, queues[2*d] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(i_local+1)*nb, rows, nb, c_neg_one, inAT(d,i,i_local+1), lddat, panel_local[d], panel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, inAT(d,i+1, i_local+1), lddat, queues[2*d] ); } } /* end of for i=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%num_gpus; /* Set the local index where the last panel is */ i_local = s/num_gpus; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if( nb0 > 0 ) { /* wait for the last panel on cpu */ magma_queue_sync( queues[2*id+1]); /* factor on cpu */ lapackf77_dgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for( d=0; d<num_gpus; d++ ) { i_local2 = i_local; if( d < id ) i_local2 ++; if( d == id || n_local[d] > i_local2*nb ) { magma_dsetmatrix_async( rows, nb0, W(s), 0, ldw, d_lAP[d], 0, cols, queues[2*d+1], NULL ); } } for( d=0; d<num_gpus; d++ ) { if(d==0){ magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] ); }else{ //magma_dpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] ); magma_dpermute_long3( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb, queues[2*d] ); } } for( d=0; d<num_gpus; d++ ) { //magma_queue_sync( queues[2*d+1] ); /* wait for the pivoting to be done */ magma_queue_sync( queues[2*d] ); i_local2 = i_local; if( d < id ) i_local2++; if( d == id ) { /* the panel belond to this gpu */ //panel_local[d] = inAT(d,s,i_local); panel_local[d] = d_lAT[d]; panel_local_offset[d] = inAT_offset(s, i_local); /* next column */ nb1 = n_local[d] - i_local*nb-nb0; magma_dtranspose2( panel_local[d], panel_local_offset[d], lddat, d_lAP[d], 0, cols, rows, nb0, queues[2*d+1]); if( nb1 > 0 ){ magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], panel_local_offset[d], lddat, d_lAT[d], inAT_offset(s,i_local)+nb0, lddat, queues[2*d+1]); } } else if( n_local[d] > i_local2*nb ) { /* the panel belong to another gpu */ //panel_local[d] = &d_panel[d][(s%2)*nb*maxm]; panel_local[d] = d_panel[d]; panel_local_offset[d] = d_panel_offset[d] + (s%2)*nb*maxm; //panel_local[d] = d_panel[d]; /* next column */ nb1 = n_local[d] - i_local2*nb; magma_dtranspose2( panel_local[d], panel_local_offset[d], nb, d_lAP[d], 0, cols, rows, nb0, queues[2*d+1]); //cublasDtrsm magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], panel_local_offset[d], nb, inAT(d,s,i_local2), lddat, queues[2*d+1]); } } } /* if( nb0 > 0 ) */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); //magma_queue_destroy(streaml[d][0]); //magma_queue_destroy(streaml[d][1]); } } return *info; /* End of MAGMA_DGETRF2_MGPU */ }
void magmablas_dsymm_mgpu_com( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, double alpha, magmaDouble_ptr dA[], magma_int_t ldda, magma_int_t offset, magmaDouble_ptr dB[], magma_int_t lddb, double beta, magmaDouble_ptr dC[], magma_int_t lddc, magmaDouble_ptr dwork[], magma_int_t dworksiz, double *C, magma_int_t ldc, double *work[], magma_int_t worksiz, // TODO unused magma_int_t ngpu, magma_int_t nb, magma_queue_t queues[][20], magma_int_t nqueue, magma_event_t redevents[][MagmaMaxGPUs*MagmaMaxGPUs+10], magma_int_t nbevents, magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2], magma_int_t nbcmplx ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*ldda) #define dB(dev, i, j) (dB[dev] + (i) + (j)*lddb) #define dC(dev, i, j) (dC[dev] + (i) + (j)*lddc) #define dwork(dev, i, j) (dwork[dev] + (i) + (j)*lddwork) #define C(i, j) (C + (i) + (j)*ldc) //printf("####################################################\n"); //printf(" start dsymm \n"); //printf("####################################################\n"); if ( side != MagmaLeft || uplo != MagmaLower ) { fprintf( stderr, "%s: only Left Lower implemented\n", __func__ ); } assert( ldda >= m ); assert( lddb >= m ); assert( lddc >= m ); assert( nqueue >= ngpu ); assert( nbevents >= ngpu*ngpu ); double c_one = MAGMA_D_ONE; magmaDouble_ptr dwork2[MagmaMaxGPUs]; magma_int_t maxgsize = n*m; magma_int_t lddwork = lddc; magma_int_t ldwork = m; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dwork2[dev] = dwork[dev]+n*lddwork; // size of dwork2 is maxgsize*ngpu } assert( dworksiz >= (n*lddwork+maxgsize*ngpu) ); assert( worksiz >= (n*ldwork) ); magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t dev, devperm, myblk, mycolsize, myblkoffst; magma_int_t gmaster; magma_int_t masterdev, lcdev, lccolsize, myngpu; magma_int_t stdev = (offset/nb)%ngpu; magma_int_t blockoffset = offset % nb; magma_int_t fstblksiz = 0; if(blockoffset>0){ fstblksiz = min(m, (nb - blockoffset)); } //magma_int_t nbblk = magma_ceildiv(m, nb); magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t remm = m- fstblksiz; magma_int_t nbblkoffst = offset/nb; magma_int_t nblstblks = -1; magma_int_t devlstblk = -1; magma_int_t lstblksiz = remm%nb; if(lstblksiz>0){ nblstblks = nbblk%ngpu; devlstblk = (nblstblks-1+ngpu)%ngpu; } magma_int_t nbcmplxactive = 0; magma_int_t cmplxisactive[MagmaMaxGPUs]; magma_int_t gpuisactive[MagmaMaxGPUs]; memset(gpuisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); memset(cmplxisactive, 0, MagmaMaxGPUs*sizeof(magma_int_t)); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); cudaMemset(dwork(dev,0,0), 0, (lddwork)*(n)*sizeof(double) ); // put all dC on all dev to 0 except the one which // hold i==0 because this one has to multiply by beta. if(dev!=stdev){ cudaMemset(dC(dev,0,0), 0, (lddc)*(n)*sizeof(double) ); } } magma_int_t newoffset = offset; // 1. symmetrize if(blockoffset>0){ newoffset = offset+fstblksiz; // newoffset is adjusted over nb magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > stdev?1:0); //printf("STDEV %d voici offset %d remm %d myblockoffset %d siz %d \n", stdev, offset, remm, myblkoffst, fstblksiz); magma_setdevice( stdev ); magmablasSetKernelStream( queues[ stdev ][ 0 ] ); magmablas_dsymmetrize_tiles( MagmaLower, fstblksiz, dA(stdev, offset, myblkoffst*nb+blockoffset), ldda, 1, ngpu*nb, nb ); } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t newstdev = (newoffset/nb)%ngpu; magma_int_t nbblk = remm/nb; // number of block of size nb. if m%nb>0 then a last block exist and is of size ib=m%nb magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-newstdev+ngpu)%ngpu) ? 1:0 ); magma_int_t devperm = (dev-newstdev+ngpu)%ngpu; magma_int_t nbblkoffst = newoffset/nb; magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); //printf("dev %d devperm %d newoffset %d rowoff %d coloff %d myblk %d \n", dev, devperm, newoffset, newoffset+devperm*nb, myblkoffst*nb, myblk); magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); magmablas_dsymmetrize_tiles( MagmaLower, nb, dA(dev, newoffset+devperm*nb, myblkoffst*nb), ldda, myblk, ngpu*nb, nb ); if(remm%nb>0){ magma_int_t nblstblks = (nbblk+1)%ngpu; magma_int_t devlstblk = (nblstblks-1+ngpu)%ngpu; //printf("==> siz %d devperm %d, devlstblk %d, newoffset+nbblk*nb %d, myblkoffst*nb+ myblk*nb %d\n", remm % nb, devperm, devlstblk, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb); if(devperm==devlstblk) magmablas_dsymmetrize( MagmaLower, remm % nb, dA(dev, newoffset+nbblk*nb, myblkoffst*nb+ myblk*nb), ldda ); // last partial tile } } /* magma_int_t siz = m+offset; double *R; magma_dmalloc_cpu( &R, siz*siz ); // collecte back A magmablas_dgetmatrix_1D_bcyclic( siz, siz, dA, ldda, R, siz, ngpu, nb ); magma_setdevice( 0 ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); //magma_dgetmatrix( siz, siz, dA[0], ldda, R, siz ); FILE *trace_file; trace_file = fopen("AJETE/Aafter", "w"); for (int j = 0; j < siz ; j++) for (int i = 0; i < siz ; i++) fprintf(trace_file, "%10d%10d%40.30e\n", i+1, j+1, R[j*siz+i]); fclose(trace_file); return; */ // ROW GEMM transpose a row and make a gemm with a block // if only 1 GPU used the ROW GEMM is integrated with the // COL GEMM (better accuracy observed) and better perf if(ngpu>1){ for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix //magma_int_t dev = (ioff / nb) % ngpu; magma_int_t nbblkoffst = offset/nb; magma_int_t nbblk = magma_ceildiv(i, nb); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t myblk = (nbblk/ngpu) + (nbblk%ngpu > ((dev-stdev+ngpu)%ngpu) ? 1:0 ); magma_int_t myblkoffst = (nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0); magma_int_t myrowsize = myblk * nb; magma_int_t coloffset = myblkoffst*nb; if(dev==stdev) { myrowsize = myrowsize -blockoffset; coloffset = myblkoffst*nb+blockoffset; } //printf("ROW GEMM: voici i %d ib %d ioff %d nbblkoffst %d stdev %d dev %d myblk %d myblkoffset %d coloffset %d rowsize %d\n", i, ib, ioff, nbblkoffst, stdev, dev, myblk, myblkoffst, coloffset, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 1 ] ); magma_dgemm( MagmaConjTrans, MagmaNoTrans, myrowsize, n, ib, alpha, dA(dev,ioff,coloffset), ldda, dB(dev,i,0), lddb, c_one, dwork(dev,0,0), lddwork ); } } } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_event_record(redevents[dev][1], queues[dev][1]); } } // COL GEMM // blockoffset is offset within first block; for subsequent blocks it is 0 if(blockoffset>0){ magma_int_t ib = min( nb-blockoffset, m ); // block size magma_int_t iblock = (offset / nb) / ngpu; // local block id magma_int_t di = iblock*nb+blockoffset; // local index in parent matrix magma_setdevice( stdev ); magmablasSetKernelStream( queues[ stdev ][ 0 ] ); //printf("DEV %d COL GEMM first ioff %d di %d m %d n %d ib %d \n", stdev, offset, di, m, n, ib); magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, ib, alpha, dA(stdev,offset,di), ldda, dB(stdev,0,0), lddb, beta, dC(stdev,0,0), lddc ); } // COL GEMM for( magma_int_t i = fstblksiz; i < m; i += nb ) { magma_int_t ib = min( nb, m-i ); // block size magma_int_t ioff = i + offset; // start global index in parent matrix magma_int_t iblock = (ioff / nb) / ngpu; // local block id magma_int_t dev = (ioff / nb) % ngpu; magma_int_t di = iblock*nb; // local index in parent matrix //printf("DEV %d COL GEMM i %d ioff %d di %d m-i %d n %d ib %d \n", dev, i, ioff, di, m-i, n, ib); magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); if(i==0){ magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, beta, dC(dev,i,0), lddc ); }else{ magma_dgemm( MagmaNoTrans, MagmaNoTrans, m-i, n, ib, alpha, dA(dev,ioff,di), ldda, dB(dev,i,0), lddb, c_one, dC(dev,i,0), lddc ); } magma_event_record(redevents[dev][0], queues[dev][0]); // if only 1 GPU is used, do the ROW GEMM if(ngpu==1){ // NOTE THAT because the COL gemm write dC below the diagonal (i) // and the ROW GEMM write dC from 0 to diag-1, so they could // run in parallel on different queues. // // NO NO NO because // it might happen that col finished i and strated i+1 while row still at i // magmablasSetKernelStream( queues[ dev ][ 0 ] ); magma_dgemm( MagmaConjTrans, MagmaNoTrans, i, n, ib, alpha, dA(dev,ioff,offset), ldda, dB(dev,i,0), lddb, c_one, dC(dev,0,0), lddc ); } } if(ngpu>1){ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t nbblk = magma_ceildiv((m+blockoffset), nb); magma_int_t nbblkrow = nbblk-1; magma_int_t devperm = (dev-stdev+ngpu)%ngpu; magma_int_t myblk = (nbblkrow/ngpu) + (nbblkrow%ngpu > devperm ? 1:0 ); magma_int_t myrowsize = myblk * nb; if(dev==stdev) { myrowsize = myrowsize - blockoffset; } //printf("blockoffset %d nbblkrow %d devperm %d DEV %d RECEIVING myblk %d myrowsize %d\n", blockoffset, nbblkrow, devperm, dev, myblk, myrowsize); if(myrowsize>0){ magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); magma_queue_wait_event(queues[ dev ][ 0 ], redevents[dev][1]); //magma_queue_sync( queues[ dev ][ 1 ] ); // for each dev add the computed ROW block each on its placment with dC for( magma_int_t blki = 0; blki < myblk; ++blki){ magma_int_t gbblki = (blki*ngpu + devperm)*nb - blockoffset; magma_int_t lcblki = blki*nb; magma_int_t ib = nb;// min(nb, m-gbblki); if(dev==stdev){ lcblki = blki*nb-blockoffset; if(blki==0){ gbblki = 0; lcblki = 0; ib = nb-blockoffset; } } magmablas_dgeadd(ib, n, c_one, &dwork[dev][lcblki], lddwork, &dC[dev][gbblki] , lddc ); } magma_event_record(redevents[dev][0], queues[dev][0]); } } } // =========================================================== // COMMUNICATION ALL_REDUCE_SUM // =========================================================== if(ngpu==1){ return; } // INITIALIZE COMM for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { masterdev = -1; gnode[cmplxid][MagmaMaxGPUs+1] = -1; myngpu = gnode[cmplxid][MagmaMaxGPUs]; for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; devperm = (dev-stdev+ngpu)%ngpu; myblk = (nbblk/ngpu) + (nbblk%ngpu > devperm ? 1:0 ); mycolsize = myblk*nb; myblkoffst = nb*((nbblkoffst/ngpu)+(nbblkoffst%ngpu > dev?1:0)); if(dev==stdev){ mycolsize -= blockoffset; myblkoffst += blockoffset; // local index in parent matrix } if((devperm==devlstblk)&&(lstblksiz>0)){ mycolsize -= (nb-(remm%nb)); } mycolsize = min(mycolsize, m); if(mycolsize>0){ gpuisactive[dev] = mycolsize; if(masterdev==-1) { masterdev = dev; nbcmplxactive = nbcmplxactive +1; cmplxisactive[cmplxid] = 1; gnode[cmplxid][MagmaMaxGPUs+1] = masterdev; } } } } /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //******************************* // each GPU send its result // to its master. The master make // the addition and then send to // to the masters of other real // and receive from the masters of // other real make the addition // and broadcast locally the final // result. //******************************* //printf("=======================================================================\n"); //printf(" sending to my master \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ for( magma_int_t idev = 0; idev < myngpu; ++idev ) { dev = gnode[cmplxid][idev]; mycolsize = gpuisactive[dev]; if(mycolsize>0){ // I am an active GPU. if I am not the master, then send my result to my master. // store result on dwork[masterdev][dev*maxgsize] if(dev!=masterdev){ magma_setdevice( dev ); //printf(" GPU %d sending to my master %d\n", dev, masterdev); // wait the geadd of my ROW and COL GEMM is done magma_queue_wait_event(queues[ dev ][ 0 ], redevents[dev][0]); // sending to the master of my real magma_dcopymatrix_async( m, n, &dC[dev][0], lddc, &dwork2[masterdev][maxgsize*dev], m, queues[dev][0] ); magma_event_record(redevents[dev][masterdev], queues[dev][0]); } // end I am not the masterdev }// end if mycolsize>0 }// for idev }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master do addition of local result and broadcast to other masters \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( queues[ masterdev ][ 0 ] ); // wait the geadd of my ROW and COL GEMM is done magma_queue_wait_event(queues[ masterdev ][ 0 ], redevents[masterdev][0]); // ======================================== // local addition // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ //printf(" master %d receiving from %d and adding \n", masterdev, lcdev); // this is an active GPU of my real. // wait I received what he send it to me and then do addition. magma_queue_wait_event(queues[ masterdev ][ 0 ], redevents[lcdev][masterdev]); magmablas_dgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*lcdev], m, &dC[masterdev][0] , lddc ); } }// for l=1:myngpu // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], queues[masterdev][0]); // ======================================== // // ======================================== // send to other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until finish the local addition then send using gmaster stream. //use stream 0 to make it sequential or stream gmaster to make it parallel. //Now both re the same. //printf(" master %d from cmplx %d sending to other master %d on cmplx %d \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(queues[ masterdev ][ gmaster ], redevents[masterdev][masterdev]); magma_dcopymatrix_async( m, n, &dC[masterdev][0], lddc, &dwork2[gmaster][maxgsize*masterdev], m, queues[masterdev][gmaster] ); magma_event_record(redevents[masterdev][gmaster], queues[masterdev][gmaster]); magma_event_record(redevents[masterdev][masterdev], queues[masterdev][gmaster]); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // ======================================== }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ //printf("=======================================================================\n"); //printf(" each master wait receiving other masters results, do the addition and broadcast locally \n"); //printf("=======================================================================\n"); for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ magma_setdevice( masterdev ); // addition is done on stream 0 sequentially magmablasSetKernelStream( queues[ masterdev ][ 0 ] ); // master has to wait until finishing all the send to other masters. magma_queue_wait_event(queues[ masterdev ][ 0 ], redevents[masterdev][masterdev]); // ======================================== // addition of results from other masters // ======================================== for( magma_int_t k = 0; k < nbcmplx; ++k ) { if(k!=cmplxid){ gmaster = gnode[k][MagmaMaxGPUs+1]; if(gmaster!=-1){ //real is active //Master has to wait until receiving from gmaster, then do addition using stream 0 //printf(" master %d from cmplx %d receiving from other master %d on cmplx %d and adding \n", masterdev, cmplxid, gmaster, k); magma_queue_wait_event(queues[ masterdev ][ 0 ], redevents[gmaster][masterdev]); magmablas_dgeadd(m, n, c_one, &dwork2[masterdev][maxgsize*gmaster], m, &dC[masterdev][0] , lddc ); } // end of gmaster!=-1 } // end of k!=cmplxid }// for k = 0: nbcmplx // because addition is done sequentially on stream 0, // I have to record this to be able to synch using it magma_event_record(redevents[masterdev][masterdev], queues[masterdev][0]); // ======================================== // ======================================== // local broadcast of final results // ======================================== for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if((lcdev!=masterdev)&&(lccolsize>0)){ // this is an active GPU of my real. // wait the previous addition is done maening stream 0 is finished and broadcast sequentially for now. // to make it parallel put stream lcdev instead of stream 0 //printf(" master %d broadcasting local to %d \n", masterdev, lcdev); magma_queue_wait_event(queues[ masterdev ][ 0 ], redevents[masterdev][masterdev]); magma_dcopymatrix_async( m, n, &dC[masterdev][0], lddc, &dC[lcdev][0], lddc, queues[masterdev][0] ); magma_event_record(redevents[masterdev][lcdev], queues[masterdev][0]); } }// for l=1:myngpu // ======================================== }// end of if masterdev!=-1 maening real is active }// for cmplxid /* for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_device_sync(); } */ for( magma_int_t cmplxid = 0; cmplxid < nbcmplx; ++cmplxid ) { myngpu = gnode[cmplxid][MagmaMaxGPUs]; masterdev = gnode[cmplxid][MagmaMaxGPUs+1]; //check if real is active if(masterdev!=-1){ for( magma_int_t l = 0; l < myngpu; ++l ) { lcdev = gnode[cmplxid][l]; lccolsize = gpuisactive[lcdev]; if(lccolsize>0){ magma_setdevice( lcdev ); magma_queue_wait_event(queues[ lcdev ][ 0 ], redevents[lcdev][0]); magma_queue_wait_event(queues[ lcdev ][ 0 ], redevents[masterdev][lcdev]); } }// for l=1:myngpu }// end of if masterdev!=-1 maening real is active }// for cmplxid //printf("****************************************************\n"); //printf(" finish dsymm \n"); //printf("****************************************************\n"); magma_setdevice( cdev ); magmablasSetKernelStream( cstream ); }
/** Purpose ------- DGESSM applies the factors L computed by DGETRF_INCPIV to a real M-by-N tile A. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] k INTEGER The number of columns of the matrix L. K >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by DGETRF_INCPIV. @param[in] dL1 DOUBLE_PRECISION array, dimension(LDDL1, N) The IB-by-K matrix in which is stored L^(-1) as returned by GETRF_INCPIV @param[in] lddl1 INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). @param[in] dL DOUBLE_PRECISION array, dimension(LDDL, N) The M-by-K lower triangular tile on the gpu. @param[in] lddl INTEGER The leading dimension of the array L. LDDL >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension (LDDA, N) On entry, the M-by-N tile A on the gpu. On exit, updated by the application of L on the gpu. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @ingroup magma_dgesv_tile ********************************************************************/ extern "C" magma_int_t magma_dgessm_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, magmaDouble_ptr dL1, magma_int_t lddl1, magmaDouble_ptr dL, magma_int_t lddl, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ldda + (j) ) #define L(i,j) (dL + (i) + (j)*lddl ) #define dL1(j) (dL1 + (j)*lddl1) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; int i, sb; magmaDouble_ptr dAT; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; if ( order == MagmaColMajor ) { magmablas_dgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for (i = 0; i < k; i += ib) { sb = min(ib, k-i); magmablas_dlaswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, dL1(i), lddl1, AT(i, 0), ldda); #else magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, L( i, i), lddl, AT(i, 0), ldda); #endif if ( (i+sb) < m) { magma_dgemm( MagmaNoTrans, MagmaTrans, n, m-(i+sb), sb, c_neg_one, AT(i, 0), ldda, L( i+sb, i), lddl, c_one, AT(i+sb, 0), ldda ); } } if ( order == MagmaColMajor ) { magmablas_dgetmo_in( dA, dAT, ldda, m, n ); } return *info; } /* magma_dgessm_gpu */
/** Purpose ------- DGEBRD reduces a general real M-by-N matrix A to upper or lower bidiagonal form B by an orthogonal transformation: Q**H * A * P = B. If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. M >= 0. @param[in] n INTEGER The number of columns in the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N general matrix to be reduced. On exit, if m >= n, the diagonal and the first superdiagonal are overwritten with the upper bidiagonal matrix B; the elements below the diagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the first superdiagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors; \n if m < n, the diagonal and the first subdiagonal are overwritten with the lower bidiagonal matrix B; the elements below the first subdiagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the diagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] d double precision array, dimension (min(M,N)) The diagonal elements of the bidiagonal matrix B: D(i) = A(i,i). @param[out] e double precision array, dimension (min(M,N)-1) The off-diagonal elements of the bidiagonal matrix B: if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. @param[out] tauq DOUBLE_PRECISION array dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (M+N)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: If m >= n, Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The contents of A on exit are illustrated by the following examples: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) ( v1 v2 v3 v4 v5 ) @endverbatim where d and e denote diagonal and off-diagonal elements of B, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_dgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_dgebrd(magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *d, double *e, double *tauq, double *taup, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double *dA, *dwork; magma_int_t ncol, nrow, jmax, nb, ldda; magma_int_t i, j, nx; magma_int_t iinfo; magma_int_t minmn; magma_int_t ldwrkx, ldwrky, lwkopt; magma_int_t lquery; nb = magma_get_dgebrd_nb(n); ldda = m; lwkopt = (m + n) * nb; work[0] = MAGMA_D_MAKE( lwkopt, 0. ); lquery = (lwork == -1); /* Check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < lwkopt && (! lquery) ) { *info = -10; } if (*info < 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ minmn = min(m,n); if (minmn == 0) { work[0] = c_one; return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + (m + n)*nb )) { fprintf (stderr, "!!!! device memory allocation error in dgebrd\n" ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + n*ldda; ldwrkx = m; ldwrky = n; /* Set the block/unblock crossover point NX. */ nx = 128; /* Copy the matrix to the GPU */ if (minmn - nx >= 1) { magma_dsetmatrix( m, n, A, lda, dA, ldda ); } for (i=0; i < (minmn - nx); i += nb) { /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return the matrices X and Y which are needed to update the unreduced part of the matrix */ nrow = m - i; ncol = n - i; /* Get the current panel (no need for the 1st iteration) */ if ( i > 0 ) { magma_dgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda ); magma_dgetmatrix( nb, ncol - nb, dA(i, i+nb), ldda, A( i, i+nb), lda ); } magma_dlabrd_gpu(nrow, ncol, nb, A(i, i), lda, dA(i, i), ldda, d+i, e+i, tauq+i, taup+i, work, ldwrkx, dwork, ldwrkx, // x, dx work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky); // y, dy /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update of the form A := A - V*Y' - X*U' */ nrow = m - i - nb; ncol = n - i - nb; // Send Y back to the GPU magma_dsetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx ); magma_dsetmatrix( ncol, nb, work + (ldwrkx+1)*nb, ldwrky, dwork + (ldwrkx+1)*nb, ldwrky ); magma_dgemm( MagmaNoTrans, MagmaConjTrans, nrow, ncol, nb, c_neg_one, dA(i+nb, i ), ldda, dwork+(ldwrkx+1)*nb, ldwrky, c_one, dA(i+nb, i+nb), ldda); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nrow, ncol, nb, c_neg_one, dwork+nb, ldwrkx, dA( i, i+nb ), ldda, c_one, dA( i+nb, i+nb ), ldda); /* Copy diagonal and off-diagonal elements of B back into A */ if (m >= n) { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_D_MAKE( d[j], 0. ); *A(j, j+1) = MAGMA_D_MAKE( e[j], 0. ); } } else { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_D_MAKE( d[j], 0. ); *A(j+1, j ) = MAGMA_D_MAKE( e[j], 0. ); } } } /* Use unblocked code to reduce the remainder of the matrix */ nrow = m - i; ncol = n - i; if ( 0 < minmn - nx ) { magma_dgetmatrix( nrow, ncol, dA(i, i), ldda, A(i, i), lda ); } lapackf77_dgebrd( &nrow, &ncol, A(i, i), &lda, d+i, e+i, tauq+i, taup+i, work, &lwork, &iinfo); work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dA ); return *info; } /* magma_dgebrd */
extern "C" magma_int_t magma_dgetri_gpu( magma_int_t n, double *dA, magma_int_t lda, magma_int_t *ipiv, double *dwork, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= DGETRI computes the inverse of a matrix using the LU factorization computed by DGETRF. This method inverts U and then computes inv(A) by solving the system inv(A)*L = inv(U) for inv(A). Note that it is generally both faster and more accurate to use DGESV, or DGETRF and DGETRS, to solve the system AX = B, rather than inverting the matrix and multiplying to form X = inv(A)*B. Only in special instances should an explicit inverse be computed with this routine. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDA,N) On entry, the factors L and U from the factorization A = P*L*U as computed by DGETRF_GPU. On exit, if INFO = 0, the inverse of the original matrix A. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). IPIV (input) INTEGER array, dimension (N) The pivot indices from DGETRF; for 1<=i<=N, row i of the matrix was interchanged with row IPIV(i). DWORK (workspace/output) COMPLEX*16 array on the GPU, dimension (MAX(1,LWORK)) LWORK (input) INTEGER The dimension of the array DWORK. LWORK >= N*NB, where NB is the optimal blocksize returned by magma_get_dgetri_nb(n). Unlike LAPACK, this version does not currently support a workspace query, because the workspace is on the GPU. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, U(i,i) is exactly zero; the matrix is singular and its cannot be computed. ===================================================================== */ /* Local variables */ double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *dL = dwork; magma_int_t ldl = n; magma_int_t nb = magma_get_dgetri_nb(n); magma_int_t j, jmax, jb, jp; *info = 0; if (n < 0) *info = -1; else if (lda < max(1,n)) *info = -3; else if ( lwork < n*nb ) *info = -6; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( n == 0 ) return *info; /* Invert the triangular factor U */ magma_dtrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, lda, info ); if ( *info != 0 ) return *info; jmax = ((n-1) / nb)*nb; for( j = jmax; j >= 0; j -= nb ) { jb = min( nb, n-j ); // copy current block column of L to work space, // then replace with zeros in A. magmablas_dlacpy( MagmaUpperLower, n-j, jb, &dA[j + j*lda], lda, &dL[j ], ldl ); magmablas_dlaset( MagmaLower, n-j, jb, &dA[j + j*lda], lda ); // compute current block column of Ainv // Ainv(:, j:j+jb-1) // = ( U(:, j:j+jb-1) - Ainv(:, j+jb:n) L(j+jb:n, j:j+jb-1) ) // * L(j:j+jb-1, j:j+jb-1)^{-1} // where L(:, j:j+jb-1) is stored in dL. if ( j+jb < n ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, n, jb, n-j-jb, c_neg_one, &dA[(j+jb)*lda], lda, &dL[ j+jb ], ldl, c_one, &dA[ j*lda], lda ); } magma_dtrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaUnit, n, jb, c_one, &dL[j ], ldl, &dA[j*lda], lda ); } // Apply column interchanges for( j = n-2; j >= 0; --j ) { jp = ipiv[j] - 1; if ( jp != j ) { magmablas_dswap( n, &dA[ j*lda ], 1, &dA[ jp*lda ], 1 ); } } return *info; }
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb_mgpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magmaDouble_ptr dAmgpu[], magma_int_t ldda, magmaDouble_ptr dTmgpu[], magma_int_t lddt, magma_int_t ngpu, magma_int_t distblk, magma_queue_t queues[][20], magma_int_t nqueue, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt) + (a_1)-1) #define dA(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1) double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, flipV=-1; magma_int_t iblock, idev, di; int i; int lwkopt; int lquery; assert (nqueue >= 3); assert (nqueue >= (ngpu+1)); *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx=0; magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu); #ifdef ENABLE_DEBUG printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n", nbcmplx); #endif double *dspace[MagmaMaxGPUs]; double *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs]; double *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs]; double *workngpu[MagmaMaxGPUs+1]; magma_event_t redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs; magma_int_t lddv = ldda; magma_int_t lddw = lddv; magma_int_t dwrk2siz = ldda*nb*(ngpu+1); magma_int_t worksiz = n*nb; magma_int_t devworksiz = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis // local allocation and stream creation // TODO check malloc for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_dmalloc( &dspace[dev], devworksiz ); magma_dmalloc_pinned ( &workngpu[dev], worksiz); dvall[dev] = dspace[dev]; dw[dev] = dvall[dev] + 2*nb*lddv; dwork[dev] = dw[dev] + nb*lddw; dworkbis[dev] = dwork[dev] + nb*ldda; magmablasSetKernelStream( queues[ dev ][ 0 ] ); for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming); } } magma_dmalloc_pinned ( &workngpu[ngpu], worksiz); double *worktest = NULL; //magma_dmalloc_cpu( &worktest, n*nb ); // not used // ====================== double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); // find the device who own the panel then send it to the CPU. // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((i-1) / distblk) / ngpu; // local block id di = iblock*distblk + (i-1)%distblk; // local index in parent matrix idev = ((i-1) / distblk) % ngpu; // device with this block //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); magma_setdevice( idev ); //magma_device_sync(); magma_dgetmatrix_async( (pm+pn), pn, dA(idev, i, di+1), ldda, A( i, i), lda, queues[ idev ][ nqueue-1 ] ); //magma_setdevice( 0 ); //printf("updating dsyr2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); // compute DSYR2K_MGPU magmablas_dsyr2k_mgpu2( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dv, pm_old, pn_old, dw, pm_old, pn_old, d_one, dAmgpu, ldda, indi_old+pn_old-1, ngpu, distblk, queues, 2 ); //magma_setdevice( 0 ); magma_setdevice( idev ); magma_queue_sync( queues[idev][ nqueue-1 ] ); //magma_setdevice( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); /* Send V and T from the CPU to the GPU */ // To be able to overlap the GET with the DSYR2K // it should be done on last stream. // TO Avoid a BUG that is overwriting the old_V // used atthis moment by dsyr2k with the new_V // send it now, we decide to have a flipflop // vector of Vs. if step%2=0 use V[0] else use V[nb*n] flipV = ((i-1)/nb)%2; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dv[dev] = dvall[dev] + flipV*nb*lddv; } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); // send V magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dv[dev], pm, queues[dev][nqueue-1] ); // Send the triangular factor T to the GPU magma_dsetmatrix_async( pk, pk, hT, nb, dT(dev, 1, i), lddt, queues[dev][nqueue-1] ); } /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // dwork = V T magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ nqueue-1 ] ); magma_queue_sync( queues[dev][nqueue-1] ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dv[dev], pm, dT(dev, 1, i), lddt, c_zero, dwork[dev], pm); } // =============================================== // SYNC TO BE SURE THAT BOTH V AND T WERE // RECEIVED AND VT IS COMPUTED and SYR2K is done // =============================================== for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t s = 0; s < nqueue; ++s ) magma_queue_sync( queues[dev][s] ); } // compute DSYMM_MGPU // The broadcast of the result done inside this function // should be done in stream [0] because i am assuming this // for the GEMMs below otherwise I have to SYNC over the // Broadcasting stream. if (ngpu == 1) { magmablasSetKernelStream( queues[ 0 ][ 0 ] ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda, dwork[0], pm, c_zero, dw[0], pm); } else { magmablas_dsymm_mgpu_com( MagmaLeft, uplo, pm, pk, c_one, dAmgpu, ldda, indi-1, dwork, pm, c_zero, dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz, ngpu, distblk, queues, nqueue-1, redevents, nbevents, gnode, nbcmplx); } /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // Here we have to wait until the broadcast of DSYMM has been done. // Note that the broadcast should be done on stream[0] so in a way // we can continue here on the same stream and avoid a sync magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); // magma_queue_sync( queues[dev][0] ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork[dev], pm, dw[dev], pm, c_zero, dworkbis[dev], nb); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dv[dev], pm, dworkbis[dev], nb, c_one, dw[dev], pm); } /* restore the panel it is put here to overlap with the previous GEMM*/ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); // =============================================== // SYNC TO BE SURE THAT BOTH V AND W ARE DONE // =============================================== // Synchronise to be sure that W has been computed // because next DSYR2K use streaming and may happen // that lunch a gemm on stream 2 while stream 0 // which compute those 2 GEMM above has not been // computed and also used for the same reason in // the panel update below and also for the last HER2K for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ nqueue-1 ] ); //magma_queue_sync( queues[idev][0] ); removed because the sync has been done in the loop above magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dv[idev], pm, dw[idev], pm, c_one, dA(idev, indi, di+1), ldda); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dw[idev], pm, dv[idev], pm, c_one, dA(idev, indi, di+1), ldda); //printf("updating next panel distblk %d idev %d on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); } else { /* no look-ahead as this is last iteration */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ 0 ] ); //printf("LAST DSYR2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dv[idev], pm, dw[idev], pm, d_one, dA(idev, indi, di+1), ldda); /* Send the last block to the CPU */ dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); magma_dgetmatrix( pk, pk, dA(idev, indi, di+1), ldda, A(n-pk+1, n-pk+1), lda ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } indi_old = indi; //indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) }// end of LOWER //magma_setdevice( 0 ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( dspace[dev]); magma_free_pinned(workngpu[dev]); for( magma_int_t e = 0; e < nbevents; ++e ) { magma_event_destroy( redevents[dev][e] ); } } magma_free_pinned(workngpu[ngpu]); magma_free_cpu(worktest); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* magma_dsytrd_sy2sb_mgpu */
extern "C" magma_int_t magma_dsgesv_gpu(char trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSGESV computes the solution to a real system of linear equations A * X = B or A' * X = B where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. ldda (input) INTEGER The leading dimension of the array dA. ldda >= max(1,N). IPIV (output) INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). dIPIV (output) INTEGER array on the GPU, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was moved to row IPIV(i). dB (input) DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. lddb (input) INTEGER The leading dimension of the array dB. lddb >= max(1,N). dX (output) DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. lddx (input) INTEGER The leading dimension of the array dX. lddx >= max(1,N). dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. iter (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange('I', n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
/** Purpose ------- DPOTRF_OOC computes the Cholesky factorization of a real symmetric positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may exceed the GPU memory. The factorization has the form A = U**H * U, if UPLO = MagmaUpper, or A = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t rr_dpotrf_m( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A + (j)*lda + (i)) #define dA(d, i, j) (dwork[(d)] + (j)*lddla + (i)) #define dT(d, i, j) ( dt[(d)] + (j)*ldda + (i)) #define dAup(d, i, j) (dwork[(d)] + (j)*NB + (i)) #define dTup(d, i, j) ( dt[(d)] + (j)*nb + (i)) /* Local variables */ double d_one = 1.0; double d_neg_one = -1.0; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; const char* uplo_ = lapack_uplo_const( uplo ); int upper = (uplo == MagmaUpper); double *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, ngpu0 = ngpu; magma_int_t j, jj, jb, J, JB, NB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; magma_timer_t time_total=0, time_sum=0, time=0; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_dpotrf_nb(n); if ( ngpu0 > n/nb ) { ngpu = n/nb; if ( n%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*ngpu-1)/(nb*ngpu))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(double); //MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem - max(2,ngpu)*nb*ldda - (n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if ( NB >= n ) { #ifdef CHECK_DPOTRF_OOC printf( " * still fits in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_DPOTRF_OOC printf( " * doesn't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_DPOTRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n", n, NB, nb, lddla, (double)freeMem ); fflush(stdout); #endif for (d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dt[d], NB*lddla + max(2,ngpu)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,ngpu)*nb*ldda]; for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); timer_start( time_total ); if (nb <= 1 || nb >= n) { lapackf77_dpotrf(uplo_, &n, A, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. * * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( ngpu0 > (n-J)/nb ) { ngpu = (n-J)/nb; if ( (n-J)%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } /* load the new big-panel by block-rows */ magma_dhtodpo( ngpu, uplo, JB, n, J, J, nb, A, lda, dwork, NB, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_dsetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%ngpu; magma_setdevice(d); jb = min(nb, n-jj); magma_dsetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj < JB; jj += nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%ngpu; J2 = jj/(nb*ngpu); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_dgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if ( n > J+JB ) { for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*ngpu))*nb; if (d < ((n-J)/nb)%ngpu) n_local[d] += nb; else if (d == ((n-J)/nb)%ngpu) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*ngpu)); if ( d < (JB/nb)%ngpu ) J2 += nb; n_local[d] -= J2; magma_dgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using three streams rr_dpotrf3_mgpu(ngpu, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal (and diagonal!!!) big panel */ magma_ddtohpo(ngpu, uplo, JB, n, J, J, nb, NB, A, lda, dwork, NB, stream, &iinfo); //magma_ddtohpo(ngpu, uplo, JB, n, J, J, nb, 0, A, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J < n; J += NB ) { JB = min(NB,n-J); if ( ngpu0 > (n-J)/nb ) { ngpu = (n-J)/nb; if ( (n-J)%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } /* load the new big-panel by block-columns */ magma_dhtodpo( ngpu, uplo, n, JB, J, J, nb, A, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ timer_start( time ); for( j=0; j < J; j += nb ) { /* upload the diagonal of big panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_dsetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj < n; jj += nb ) { d = ((jj-J)/nb)%ngpu; magma_setdevice(d); jb = min(nb, n-jj); magma_dsetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj < JB; jj += nb ) { /* diagonal */ d = (jj/nb)%ngpu; J2 = jj/(nb*ngpu); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_dgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if ( n > J+JB ) { /* off-diagonal */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/ngpu)*nb; if (d < ((n-J)/nb)%ngpu) n_local[d] += nb; else if (d == ((n-J)/nb)%ngpu) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*ngpu)); if ( d < (JB/nb)%ngpu ) J2 += nb; n_local[d] -= J2; magma_dgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( jj=0; jj < 3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using three streams magma_dpotrf3_mgpu(ngpu, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, A, lda, h, stream, event, &iinfo); if ( iinfo != 0 ) { *info = J+iinfo; break; } time_sum += timer_stop( time ); /* upload the off-diagonal big panel */ magma_ddtohpo( ngpu, uplo, n, JB, J, J, nb, JB, A, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ timer_stop( time_total ); if ( ngpu0 > n/nb ) { ngpu = n/nb; if ( n%nb != 0 ) ngpu ++; } else { ngpu = ngpu0; } for (d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j < 5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); timer_printf( "\n n=%d NB=%d nb=%d\n", (int) n, (int) NB, (int) nb ); timer_printf( " Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_DPOTRF(n) / 1e9, time_total, FLOPS_DPOTRF(n) / 1e9 / time_total ); timer_printf( " Performance %f / %f = %f GFlop/s\n", FLOPS_DPOTRF(n) / 1e9, time_sum, FLOPS_DPOTRF(n) / 1e9 / time_sum ); return *info; } /* magma_dpotrf_ooc */