magma_int_t
magma_dnan_inf_gpu(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaDouble_const_ptr dA, magma_int_t ldda,
    magma_int_t *cnt_nan, magma_int_t *cnt_inf )
{
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper && uplo != MagmaFull )
        info = -1;
    else if ( m < 0 )
        info = -2;
    else if ( n < 0 )
        info = -3;
    else if ( ldda < max(1,m) )
        info = -5;

    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magma_int_t lda = m;
    double* A;
    magma_dmalloc_cpu( &A, lda*n );
    magma_dgetmatrix( m, n, dA, ldda, A, lda );

    magma_int_t cnt = magma_dnan_inf( uplo, m, n, A, lda, cnt_nan, cnt_inf );

    magma_free_cpu( A );
    return cnt;
}
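// A minimal usage sketch for magma_dnan_inf_gpu above -- assuming the
// queue-less MAGMA 1.x interface used in this file; the names hA, dA, n
// are local to the sketch, not part of the routine's API.
#include <math.h>
#include <stdio.h>
#include "magma.h"

int main( void )
{
    magma_init();
    magma_int_t n = 100, ldda = ((n+31)/32)*32;
    magma_int_t cnt_nan, cnt_inf;
    double *hA;
    magmaDouble_ptr dA;

    magma_dmalloc_cpu( &hA, n*n );
    magma_dmalloc( &dA, ldda*n );
    for( magma_int_t i = 0; i < n*n; ++i )
        hA[i] = 1.0;
    hA[0] = nan("");                           // inject a NaN to detect
    magma_dsetmatrix( n, n, hA, n, dA, ldda );

    magma_int_t cnt = magma_dnan_inf_gpu( MagmaFull, n, n, dA, ldda,
                                          &cnt_nan, &cnt_inf );
    printf( "%lld bad entries (%lld NaN, %lld Inf)\n",
            (long long) cnt, (long long) cnt_nan, (long long) cnt_inf );

    magma_free( dA );
    magma_free_cpu( hA );
    magma_finalize();
    return 0;
}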
// Solve dA * dX = dB, where dA and dX are stored in GPU device memory.
// Internally, MAGMA uses a hybrid CPU + GPU algorithm.
void GpuSolver::solve( magmaDouble_ptr X )
{
    real_Double_t gpu_time;
    magmaDouble_ptr dX=NULL, dWORKD=NULL;
    float *dWORKS=NULL;
    magma_int_t qrsv_iters;
    magma_int_t info = 0;

    magma_dmalloc( &dX,     n*nrhs );
    magma_dmalloc( &dWORKD, n*nrhs );
    magma_smalloc( &dWORKS, n*(n+nrhs) );
    if ( dX == NULL || dWORKD == NULL || dWORKS == NULL ) {
        fprintf( stderr, "malloc failed - not enough GPU memory?\n" );
        goto cleanup;
    }

    gpu_time = magma_wtime();
    magma_dsposv_gpu( MagmaUpper, n, nrhs, dA, n, dB, n, dX, n,
                      dWORKD, dWORKS, &qrsv_iters, &info );
    gpu_time = magma_wtime() - gpu_time;
    fprintf( stdout, "DSPOSV GPU solution time = %fs\n", gpu_time );

    if ( qrsv_iters == -3 ) {
        fprintf( stderr, "cannot factor input matrix in single precision, bad initialization?\n" );
    }
    if ( info != 0 ) {
        fprintf( stderr, "magma_dsposv_gpu failed with info=%d\n", (int) info );
    }

    magma_dgetmatrix( n, nrhs, dX, n, X, n );

cleanup:
    // free all device workspaces
    magma_free( dX );
    magma_free( dWORKD );
    magma_free( dWORKS );
}
void magma_dprint_gpu(
    int m, int n,
    magmaDouble_ptr dA, size_t dA_offset, int ldda,
    magma_queue_t queue )
{
    int lda = m;
    double* A = (double*) malloc( lda*n*sizeof(double) );
    magma_dgetmatrix( m, n, dA, dA_offset, ldda, A, 0, lda, queue );
    magma_dprint( m, n, A, lda );
    free( A );
}
SEXP magQR(SEXP a) { SEXP gpu = GET_SLOT(a, install("gpu")), b = PROTECT(NEW_OBJECT(MAKE_CLASS("magmaQR"))); int *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1], MIN_MN = (M < N ? M : N), NB = magma_get_dgeqrf_nb(M), *pivot, info; double *A, *tau; A = REAL(SET_VECTOR_ELT(b, 0, AS_NUMERIC(duplicate(a)))); SET_VECTOR_ELT(b, 1, ScalarInteger(MIN_MN)); tau = REAL(SET_VECTOR_ELT(b, 2, NEW_NUMERIC(MIN_MN))); pivot = INTEGER(SET_VECTOR_ELT(b, 3, NEW_INTEGER(N))); int i; for(i = 1; i <= N; i++) *pivot++ = i; if(LOGICAL_VALUE(gpu)) { int LENT = (2*MIN_MN + (N+31)/32*32)*NB; double *dA, *dT, *work; SET_SLOT(b, install("work"), NEW_NUMERIC(LENT)); work = REAL(GET_SLOT(b, install("work"))); magma_malloc((void**)&dA, (M*N)*sizeof(double)); magma_malloc((void**)&dT, LENT*sizeof(double)); magma_dsetmatrix(M, N, A, M, dA, M); magma_dgeqrf3_gpu(M, N, dA, M, tau, dT, &info); magma_dgetmatrix(M, N, dA, M, A, M); magma_dgetvector(LENT, dT, 1, work, 1); magma_free(dA); magma_free(dT); } else { int LWORK = N * NB; double *hA, *hwork; magma_malloc_pinned((void**)&hA, (M*N)*sizeof(double)); magma_malloc_pinned((void**)&hwork, LWORK*sizeof(double)); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, A, &M, hA, &M); magma_dgeqrf_ooc(M, N, hA, M, tau, hwork, LWORK, &info); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, hA, &M, A, &M); magma_free_pinned(hA); magma_free_pinned(hwork); } if(info < 0) error("illegal argument %d in 'magQR'", -1 * info); UNPROTECT(1); return b; }
void magma_task_dgetmatrix( Schedule* sched_obj )
{
    magma_int_t m;
    magma_int_t nb;
    double *dA_src;
    magma_int_t dA_LD;
    double *A_dst;
    magma_int_t LDA;

    schedule_unpack_args_6( sched_obj, m, nb, dA_src, dA_LD, A_dst, LDA );
    magma_dgetmatrix( m, nb, dA_src, dA_LD, A_dst, LDA );
    // task_getpanel(gpu_nrows, ncols, dAT(K,K+A_N), dAT_LD, A(K,K+A_N), A_LD);
}
void magma_task_dev_dgetmatrix_transpose( Schedule* sched_obj )
{
    magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *dA_src;
    magma_int_t dA_LD;
    double *A_dst;
    magma_int_t LDA;
    double *dwork;
    magma_int_t dwork_LD;
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_10( sched_obj, deviceID, m, nb, dA_src, dA_LD,
                             A_dst, LDA, dwork, dwork_LD, dep_ptr );
    magma_setdevice( deviceID );

    //magma_dgetmatrix_transpose( m, nb, dA_src, dA_LD, A_dst, LDA);
    // task_getpanel(gpu_nrows, ncols, dAT(K,K+A_N), dAT_LD, A(K,K+A_N), A_LD);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu( nb, m, dA_src, dA_LD, "dA before getMatrix" );
#endif

    // pthread_mutex_lock(&mutex_dAP);

    /* 1. transpose dA to dwork */
    magmablas_dtranspose2( dwork, dwork_LD, dA_src, dA_LD, nb, m );

#if (dbglevel==10)
    // ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after dA transpose");
#endif

    /* 2. copy dwork to A: bring the panel back to the CPU */
    magma_dgetmatrix( m, nb, dwork, dwork_LD, A_dst, LDA );

    // pthread_mutex_unlock(&mutex_dAP);

#if (dbglevel==10)
    ca_dbg_printMat( m, nb, A_dst, LDA, "A after getMatrix" );
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('G');
    ca_trace_end_cpu('C');
#endif
}
void magma_dprint_gpu( magma_int_t m, magma_int_t n, const double *dA, magma_int_t ldda )
{
    if ( magma_is_devptr( dA ) == 0 ) {
        fprintf( stderr, "ERROR: dprint_gpu called with host pointer.\n" );
        exit(1);
    }

    magma_int_t lda = m;
    double* A;
    magma_dmalloc_cpu( &A, lda*n );
    magma_dgetmatrix( m, n, dA, ldda, A, lda );
    magma_dprint( m, n, A, lda );
    magma_free_cpu( A );
}
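// Quick usage sketch: dump the top-left 4x4 corner of a device matrix dA
// (assumes m, n >= 4 and the CUDA MAGMA 1.x interface used above).
magma_dprint_gpu( 4, 4, dA, ldda );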
SEXP magChol(SEXP a)
{
    SEXP gpu = GET_SLOT(a, install("gpu")),
         b = PROTECT(NEW_OBJECT(MAKE_CLASS("magma")));
    int *DIMA = INTEGER(GET_DIM(a)), N = DIMA[0], N2 = N * N, LDB = N, info;
    double *B;

    if(DIMA[1] != N) error("non-square matrix");

    b = SET_SLOT(b, install(".Data"), AS_NUMERIC(a));
    SET_SLOT(b, install("gpu"), duplicate(gpu));
    B = REAL(b);

    if(LOGICAL_VALUE(gpu)) {
        double *dB;
        magma_malloc((void**)&dB, N2*sizeof(double));
        magma_dsetmatrix(N, N, B, LDB, dB, LDB);
        magma_dpotrf_gpu(magma_uplo_const('U'), N, dB, LDB, &info);
        magma_dgetmatrix(N, N, dB, LDB, B, LDB);
        magma_free(dB);
    } else {
        double *hB;
        magma_malloc_pinned((void**)&hB, N2*sizeof(double));
        lapackf77_dlacpy(MagmaUpperStr, &N, &N, B, &LDB, hB, &LDB);
        magma_dpotrf(magma_uplo_const('U'), N, hB, N, &info);
        lapackf77_dlacpy(MagmaUpperStr, &N, &N, hB, &LDB, B, &LDB);
        magma_free_pinned(hB);
    }

    if(info < 0) error("illegal argument %d in 'magChol'", -1 * info);
    else if(info > 0) error("leading minor of order %d is not positive definite", info);

    // zero out the strictly lower triangle, leaving the U factor
    int i, j;
    for(j = 0; j < N; j++) {
        for(i = j + 1; i < N; i++) {
            B[i + j * N] = 0.0;
        }
    }

    UNPROTECT(1);
    return b;
}
//===========================================================================
// Get a matrix with 1D block cyclic distribution on multiGPUs to the CPU.
// The dA arrays are pointers to the matrix data for the corresponding GPUs.
//===========================================================================
extern "C" void
magmablas_dgetmatrix_1D_bcyclic(
    magma_int_t m, magma_int_t n,
    magmaDouble_ptr *dA, magma_int_t ldda,
    double *hA, magma_int_t lda,
    magma_int_t num_gpus, magma_int_t nb,
    magma_queue_t* trans_queues )
{
    magma_int_t i, d, nk;
    for( i = 0; i < n; i += nb ) {
        d  = (i/nb) % num_gpus;
        nk = min(nb, n-i);
        magma_dgetmatrix( m, nk,
                          dA[d], i/(nb*num_gpus)*nb*ldda, ldda,
                          &hA[i*lda], 0, lda, trans_queues[d] );
    }
}
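// Sizing sketch for the 1-D block-cyclic gather above: GPU d owns column
// blocks d, d+num_gpus, d+2*num_gpus, ..., so each dA[d] must hold at least
// ldda * nlocal elements, where
//
//     nbl    = (n + nb - 1) / nb;                      // number of column blocks
//     nlocal = ((nbl + num_gpus - 1) / num_gpus) * nb; // max columns per GPU
//
// After the device buffers are filled (allocation and per-device queue setup
// omitted here), the whole matrix is gathered with a single call:
magmablas_dgetmatrix_1D_bcyclic( m, n, dA, ldda, hA, lda,
                                 num_gpus, nb, trans_queues );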
double *cholesky_gpu(double *ml, int m)
{
    magma_int_t mm = m*m;
    magma_int_t info;
    double *a;
    double *d_a;
    magma_err_t err;

    err = magma_dmalloc_cpu( &a, mm );     // host result
    if (err != MAGMA_SUCCESS) return NULL;
    err = magma_dmalloc( &d_a, mm );       // device copy
    if (err != MAGMA_SUCCESS) { magma_free_cpu(a); return NULL; }

    magma_dsetmatrix( m, m, ml, m, d_a, m );

    // Cholesky factorization, then the inverse from the factor;
    // only the lower triangle is referenced and filled.
    magma_dpotrf_gpu( 'L', m, d_a, m, &info );
    magma_dpotri_gpu( 'L', m, d_a, m, &info );

    magma_dgetmatrix( m, m, d_a, m, a, m );
    magma_free( d_a );
    return a;
}
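// Usage sketch for cholesky_gpu above (hypothetical driver; magma_init()
// must already have been called). Only the lower triangle of the returned
// inverse is valid, since dpotrf/dpotri were called with uplo = 'L'.
double spd[4] = { 4.0, 1.0,
                  1.0, 3.0 };               // 2x2 SPD matrix, column-major
double *inv = cholesky_gpu( spd, 2 );
if ( inv != NULL ) {
    printf( "inv lower triangle: %g %g %g\n", inv[0], inv[1], inv[3] );
    magma_free_cpu( inv );                  // allocated with magma_dmalloc_cpu
}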
SEXP magLU(SEXP a) { SEXP gpu = GET_SLOT(a, install("gpu")), b = PROTECT(NEW_OBJECT(MAKE_CLASS("magmaLU"))); int *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1], LDA = M, MIN_MN = M < N ? M : N, *ipiv, info; double *A = REAL(PROTECT(AS_NUMERIC(a))); b = SET_SLOT(b, install(".Data"), AS_NUMERIC(a)); SET_SLOT(b, install("pivot"), NEW_INTEGER(MIN_MN)); ipiv = INTEGER(GET_SLOT(b, install("pivot"))); SET_SLOT(b, install("gpu"), duplicate(gpu)); if(LOGICAL_VALUE(gpu)) { double *dA; magma_malloc((void**)&dA, (M*N)*sizeof(double)); magma_dsetmatrix(M, N, A, LDA, dA, LDA); magma_dgetrf_gpu(M, N, dA, LDA, ipiv, &info); magma_dgetmatrix(M, N, dA, LDA, REAL(b), LDA); magma_free(dA); } else { double *hA; magma_malloc_pinned((void**)&hA, (M*N)*sizeof(double)); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, A, &LDA, hA, &LDA); magma_dgetrf(M, N, hA, LDA, ipiv, &info); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, hA, &LDA, REAL(b), &LDA); magma_free_pinned(hA); } if(info < 0) error("illegal argument %d in 'magLU'", -1 * info); else if(info > 0) error("factor U is singular"); UNPROTECT(2); return b; }
void printMatrix(const double* v, int m, int n, int ldda, magma_queue_t queues)
{
    printf("[\n");
    double* bufv = (double*)malloc(m*n*sizeof(double));
    if(magma_is_devptr(v) == 1){
        //cudaMemcpy(bufv, v, m*n*sizeof(double), cudaMemcpyDeviceToHost);
        printf("-- On GPU --\n");
        // v is assumed row-major m x n with row stride ldda, i.e.
        // column-major n x m with leading dimension ldda, so fetch it
        // as an n x m matrix into a packed buffer.
        magma_dgetmatrix(n, m, v, ldda, bufv, n, queues);
    }else if(magma_is_devptr(v) == 0){
        printf("-- On CPU --\n");
        memcpy(bufv, v, m*n*sizeof(double));
    }
    for(int i = 0; i < m; i++){
        for(int j = 0; j < n; j++)
            printf(" %8.4f ", bufv[i*n+j]);
        printf("\n");
    }
    printf("];\n");
    free(bufv);
}
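// Quick usage sketch for printMatrix above with a host matrix; the helper's
// indexing assumes row-major storage, and the queue argument is only used on
// the GPU path ('queue' is assumed created earlier).
double hV[6] = { 1, 2,
                 3, 4,
                 5, 6 };                    // 3x2, row-major
printMatrix( hV, 3, 2, /*ldda=*/3, queue );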
SEXP magMultmm(SEXP a, SEXP transa, SEXP b, SEXP transb) { SEXP gpu = magGetGPU(a, b), c = PROTECT(NEW_OBJECT(MAKE_CLASS("magma"))); int TA = LOGICAL_VALUE(transa), TB = LOGICAL_VALUE(transb), *DIMA = INTEGER(GET_DIM(a)), *DIMB = INTEGER(GET_DIM(b)), M = DIMA[TA], N = DIMB[!TB], K = DIMA[!TA], LDA = DIMA[0], LDB = DIMB[0], LDC = M; char TRANSA = (TA ? 'T' : 'N'), TRANSB = (TB ? 'T' : 'N'); double *A = REAL(PROTECT(AS_NUMERIC(a))), *B = REAL(PROTECT(AS_NUMERIC(b))), *dA, *dB, *dC; if(DIMB[TB] != K) error("non-conformable matrices"); c = SET_SLOT(c, install(".Data"), allocMatrix(REALSXP, M, N)); SET_SLOT(c, install("gpu"), duplicate(gpu)); magma_malloc((void**)&dA, (M*K)*sizeof(double)); magma_malloc((void**)&dB, (K*N)*sizeof(double)); magma_malloc((void**)&dC, (M*N)*sizeof(double)); magma_dsetmatrix(DIMA[0], DIMA[1], A, LDA, dA, LDA); magma_dsetmatrix(DIMB[0], DIMB[1], B, LDB, dB, LDB); if(LOGICAL_VALUE(gpu)) magmablas_dgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC); else cublasDgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC); magma_dgetmatrix(M, N, dC, LDC, REAL(c), LDC); magma_free(dA); magma_free(dB); magma_free(dC); UNPROTECT(3); return c; }
void gmm_magma(const Tensor_core<double,2>& A, const Tensor_core<double,2>& B, Tensor_core<double,2>& C,
               char TRANSA, char TRANSB, double alpha, double beta)
{
    int AL0 = A.rank(0); int AL1 = A.rank(1);
    int BL0 = B.rank(0); int BL1 = B.rank(1);
    int CL0 = C.rank(0); int CL1 = C.rank(1);
    magma_int_t M, N, K, LDA, LDB, LDC;
    magma_trans_t transA = magma_trans_const(TRANSA), transB = magma_trans_const(TRANSB);
    magmaDouble_ptr d_A, d_B, d_C;

    // Set LDA, LDB, and LDC, rounded up to a multiple of 32 for best GPU performance
    LDA = ((AL0+31)/32)*32;
    LDB = ((BL0+31)/32)*32;
    LDC = ((CL0+31)/32)*32;

    // Allocate memory for the matrices on GPU
    magma_dmalloc(&d_A, LDA*AL1);
    magma_dmalloc(&d_B, LDB*BL1);
    magma_dmalloc(&d_C, LDC*CL1);

    // Copy data from host (CPU) to device (GPU);
    // C only needs to be copied when beta is nonzero.
    magma_dsetmatrix( AL0, AL1, A.data(), AL0, d_A, LDA );
    magma_dsetmatrix( BL0, BL1, B.data(), BL0, d_B, LDB );
    if( fabs(beta) > 1e-32 )
        magma_dsetmatrix( CL0, CL1, C.data(), CL0, d_C, LDC );

    // Call magma_dgemm
    M = ( TRANSA=='N' || TRANSA=='n' ) ? AL0 : AL1;
    K = ( TRANSA=='N' || TRANSA=='n' ) ? AL1 : AL0;
    N = ( TRANSB=='N' || TRANSB=='n' ) ? BL1 : BL0;
    magma_dgemm(transA, transB, M, N, K, alpha, d_A, LDA, d_B, LDB, beta, d_C, LDC);

    // Copy solution from device (GPU) to host (CPU)
    magma_dgetmatrix(CL0, CL1, d_C, LDC, C.data(), CL0);

    // Free memory on GPU
    magma_free(d_A);
    magma_free(d_B);
    magma_free(d_C);
}
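// For reference, a standalone sketch of the same call sequence without the
// Tensor_core wrapper: C = alpha*A*B + beta*C for plain column-major arrays,
// using the same MAGMA 1.x interface (error checks omitted for brevity).
void gmm_plain( const double *A, const double *B, double *C,
                int m, int n, int k, double alpha, double beta )
{
    magmaDouble_ptr d_A, d_B, d_C;
    magma_dmalloc( &d_A, m*k );
    magma_dmalloc( &d_B, k*n );
    magma_dmalloc( &d_C, m*n );

    magma_dsetmatrix( m, k, A, m, d_A, m );
    magma_dsetmatrix( k, n, B, k, d_B, k );
    if ( beta != 0.0 )
        magma_dsetmatrix( m, n, C, m, d_C, m );

    magma_dgemm( MagmaNoTrans, MagmaNoTrans, m, n, k,
                 alpha, d_A, m, d_B, k, beta, d_C, m );

    magma_dgetmatrix( m, n, d_C, m, C, m );
    magma_free( d_A );
    magma_free( d_B );
    magma_free( d_C );
}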
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgesv_gpu */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double error, Rnorm, Anorm, Xnorm, *work; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_B, *h_X; magmaDouble_ptr d_A, d_B; magma_int_t *ipiv; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default lddb = ldda; gflops = ( FLOPS_DGETRF( N, N ) + FLOPS_DGETRS( N, nrhs ) ) / 1e9; TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( work, double, N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ sizeA = lda*N; sizeB = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); magma_dsetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_dsetmatrix( N, nrhs, h_B, ldb, d_B, lddb, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgesv_gpu( N, nrhs, d_A, ldda, ipiv, d_B, lddb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgesv_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } //===================================================================== // Residual //===================================================================== magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb, opts.queue ); Anorm = lapackf77_dlange("I", &N, &N, h_A, &lda, work); Xnorm = lapackf77_dlange("I", &N, &nrhs, h_X, &ldb, work); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A, &lda, h_X, &ldb, &c_neg_one, h_B, &ldb); Rnorm = lapackf77_dlange("I", &N, &nrhs, h_B, &ldb, work); error = Rnorm/(N*Anorm*Xnorm); status += ! (error < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgesv( &N, &nrhs, h_A, &lda, ipiv, h_B, &ldb, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgesv returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); } else { printf( "%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) nrhs, gpu_perf, gpu_time, error, (error < tol ? 
"ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dgeqp3( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *jpvt, double *tau, double *work, magma_int_t lwork, #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork, #endif magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEQP3 computes a QR factorization with column pivoting of a matrix A: A*P = Q*R using Level 3 BLAS. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the upper triangle of the array contains the min(M,N)-by-N upper trapezoidal matrix R; the elements below the diagonal, together with the array TAU, represent the unitary matrix Q as a product of min(M,N) elementary reflectors. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) On entry, if JPVT(J).ne.0, the J-th column of A is permuted to the front of A*P (a leading column); if JPVT(J)=0, the J-th column of A is a free column. On exit, if JPVT(J)=K, then the J-th column of A*P was the the K-th column of A. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors. WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO=0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. For [sd]geqp3, LWORK >= (N+1)*NB + 2*N; for [cz]geqp3, LWORK >= (N+1)*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. For [cz]geqp3 only: RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dwork + (i) + (j)*(ldda)) double *dwork, *df; magma_int_t ione = 1; magma_int_t n_j, ldda, ldwork; magma_int_t j, jb, na, nb, sm, sn, fjb, nfxd, minmn; magma_int_t topbmn, sminmn, lwkopt, lquery; *info = 0; lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } nb = magma_get_dgeqp3_nb(min(m, n)); if (*info == 0) { minmn = min(m,n); if (minmn == 0) { lwkopt = 1; } else { lwkopt = (n + 1)*nb; #if defined(PRECISION_d) || defined(PRECISION_s) lwkopt += 2*n; #endif } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); if (lwork < lwkopt && ! 
lquery) { *info = -8; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } if (minmn == 0) return *info; #if defined(PRECISION_d) || defined(PRECISION_s) double *rwork = work + (n + 1)*nb; #endif ldda = ((m+31)/32)*32; ldwork = n*ldda + (n+1)*nb; if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } df = dwork + n*ldda; // dwork used for dA magma_queue_t stream; magma_queue_create( &stream ); /* Move initial columns up front. * Note jpvt uses 1-based indices for historical compatibility. */ nfxd = 0; for (j = 0; j < n; ++j) { if (jpvt[j] != 0) { if (j != nfxd) { blasf77_dswap(&m, A(0, j), &ione, A(0, nfxd), &ione); jpvt[j] = jpvt[nfxd]; jpvt[nfxd] = j + 1; } else { jpvt[j] = j + 1; } ++nfxd; } else { jpvt[j] = j + 1; } } /* Factorize fixed columns ======================= Compute the QR factorization of fixed columns and update remaining columns. */ if (nfxd > 0) { na = min(m,nfxd); lapackf77_dgeqrf(&m, &na, A, &lda, tau, work, &lwork, info); if (na < n) { n_j = n - na; lapackf77_dormqr( MagmaLeftStr, MagmaTransStr, &m, &n_j, &na, A, &lda, tau, A(0, na), &lda, work, &lwork, info ); } } /* Factorize free columns */ if (nfxd < minmn) { sm = m - nfxd; sn = n - nfxd; sminmn = minmn - nfxd; if (nb < sminmn) { j = nfxd; // Set the original matrix to the GPU magma_dsetmatrix_async( m, sn, A (0,j), lda, dA(0,j), ldda, stream ); } /* Initialize partial column norms. */ for (j = nfxd; j < n; ++j) { rwork[j] = cblas_dnrm2(sm, A(nfxd, j), ione); rwork[n + j] = rwork[j]; } j = nfxd; if (nb < sminmn) { /* Use blocked code initially. */ magma_queue_sync( stream ); /* Compute factorization: while loop. */ topbmn = minmn - nb; while(j < topbmn) { jb = min(nb, topbmn - j); /* Factorize JB columns among columns J:N. */ n_j = n - j; if (j>nfxd) { // Get panel to the CPU magma_dgetmatrix( m-j, jb, dA(j,j), ldda, A (j,j), lda ); // Get the rows magma_dgetmatrix( jb, n_j - jb, dA(j,j + jb), ldda, A (j,j + jb), lda ); } magma_dlaqps( m, n_j, j, jb, &fjb, A (0, j), lda, dA(0, j), ldda, &jpvt[j], &tau[j], &rwork[j], &rwork[n + j], work, &work[jb], n_j, &df[jb], n_j ); j += fjb; /* fjb is actual number of columns factored */ } } /* Use unblocked code to factor the last or only block. */ if (j < minmn) { n_j = n - j; if (j > nfxd) { magma_dgetmatrix( m-j, n_j, dA(j,j), ldda, A (j,j), lda ); } lapackf77_dlaqp2(&m, &n_j, &j, A(0, j), &lda, &jpvt[j], &tau[j], &rwork[j], &rwork[n+j], work ); } } work[0] = MAGMA_D_MAKE( lwkopt, 0. ); magma_free( dwork ); magma_queue_destroy( stream ); return *info; } /* dgeqp3 */
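// Usage sketch for magma_dgeqp3 above (real precision, so no rwork argument):
// mark all columns free, query the optimal workspace, then factor A*P = Q*R.
// m, n, A, lda, jpvt, tau are assumed allocated by the caller.
magma_int_t info, lwork;
double query;
for( magma_int_t j = 0; j < n; ++j )
    jpvt[j] = 0;                   // 0 marks column j as a free column
magma_dgeqp3( m, n, A, lda, jpvt, tau, &query, -1, &info );   // workspace query
lwork = (magma_int_t) query;
double *work;
magma_dmalloc_cpu( &work, lwork );
magma_dgeqp3( m, n, A, lda, jpvt, tau, work, lwork, &info );
magma_free_cpu( work );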
extern "C" magma_int_t magma_dsgesv_gpu(char trans, magma_int_t n, magma_int_t nrhs, double *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, double *dB, magma_int_t lddb, double *dX, magma_int_t lddx, double *dworkd, float *dworks, magma_int_t *iter, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DSGESV computes the solution to a real system of linear equations A * X = B or A' * X = B where A is an N-by-N matrix and X and B are N-by-NRHS matrices. DSGESV first attempts to factorize the matrix in real SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with real DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a real DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio real SINGLE PRECISION performance over real DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input or input/output) DOUBLE PRECISION array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. ldda (input) INTEGER The leading dimension of the array dA. ldda >= max(1,N). IPIV (output) INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). dIPIV (output) INTEGER array on the GPU, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was moved to row IPIV(i). dB (input) DOUBLE PRECISION array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. lddb (input) INTEGER The leading dimension of the array dB. lddb >= max(1,N). dX (output) DOUBLE PRECISION array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. lddx (input) INTEGER The leading dimension of the array dX. 
lddx >= max(1,N). dworkd (workspace) DOUBLE PRECISION array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. dworks (workspace) SINGLE PRECISION array on the GPU, dimension (N*(N+NRHS)) This array is used to store the real single precision matrix and the right-hand sides or solutions in single precision. iter (output) INTEGER < 0: iterative refinement has failed, double precision factorization has been performed -1 : the routine fell back to full precision for implementation- or machine-specific reasons -2 : narrowing the precision induced an overflow, the routine fell back to full precision -3 : failure of SGETRF -31: stop the iterative refinement after the 30th iteration > 0: iterative refinement has been successfully used. Returns the number of iterations info (output) INTEGER = 0: successful exit < 0: if info = -i, the i-th argument had an illegal value > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. ===================================================================== */ #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t ione = 1; double *dR; float *dSA, *dSX; double Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_dlange('I', n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_dlag2s( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside dsgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_dlag2s( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_sgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_D_ABS( dX(i,j) ) instead of dlange? 
for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_dsgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_daxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_dgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_dgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER>0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_idamax( n, dX(0,j), 1) - 1; magma_dgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_dlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_idamax ( n, dR(0,j), 1 ) - 1; magma_dgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_dlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_dgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_dlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_dgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
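// Driver sketch for magma_dsgesv_gpu above. dA (n x n, ldda) and dB
// (n x nrhs, lddb) are assumed already set on the device, and lddb is
// reused as lddx. Workspace sizes follow the documentation: dworkd holds
// n*nrhs doubles and dworks holds n*(n+nrhs) floats.
magma_int_t *ipiv, *dipiv, iter, info;
double *dX, *dworkd;
float  *dworks;
magma_imalloc_cpu( &ipiv, n );
magma_imalloc( &dipiv, n );
magma_dmalloc( &dX, lddb*nrhs );
magma_dmalloc( &dworkd, n*nrhs );
magma_smalloc( &dworks, n*(n+nrhs) );

magma_dsgesv_gpu( 'N', n, nrhs, dA, ldda, ipiv, dipiv,
                  dB, lddb, dX, lddb, dworkd, dworks, &iter, &info );
if ( iter < 0 )
    fprintf( stderr, "refinement failed (iter=%d); double precision was used\n",
             (int) iter );

magma_free( dipiv );  magma_free( dX );
magma_free( dworkd ); magma_free( dworks );
magma_free_cpu( ipiv );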
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; magmaDouble_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_DGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); magma_dgetrf_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_dgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dorgqr( magma_int_t m, magma_int_t n, magma_int_t k, double *a, magma_int_t lda, double *tau, magmaDouble_ptr dT, size_t dT_offset, magma_int_t nb, magma_queue_t queue, magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) DOUBLE_PRECISION array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF_GPU. DT (input) DOUBLE_PRECISION array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_dgeqrf_gpu. NB (input) INTEGER This is the block size used in DGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define a_ref(i,j) ( a + (j)*lda + (i)) #define da_ref(i,j) da, (da_offset + (j)*ldda + (i)) #define t_ref(a_1) dT, (dT_offset + (a_1)*nb) double c_zero = MAGMA_D_ZERO; magma_int_t i__1, i__2, i__3; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork = min(m, n); double *work; magmaDouble_ptr da, dwork; size_t da_offset, dwork_offset; magma_event_t event = NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) return *info; /* Allocate GPU work space */ ldda = ((m+31)/32)*32; lddwork = ((lddwork+31)/32)*32; if (MAGMA_SUCCESS != magma_dmalloc( &da, ((n)*ldda + nb*lddwork ) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } da_offset = 0; dwork = da; dwork_offset = da_offset + (n)*ldda; /* Allocate CPU work space */ lwork = n * nb; magma_dmalloc_cpu( &work, lwork ); if( work == NULL ) { magma_free( da ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } if ( (nb > 1) && (nb < k) ) { /* Use blocked code after the last block. The first kk columns are handled by the block method. */ ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); /* Set A(1:kk,kk+1:n) to zero. */ magmablas_dlaset(MagmaFull, kk, n-kk, c_zero, c_zero, da_ref(0,kk), ldda, queue); } else kk = 0; /* Use unblocked code for the last or only block. 
*/ if (kk < n) { i__1 = m - kk; i__2 = n - kk; i__3 = k - kk; lapackf77_dorgqr(&i__1, &i__2, &i__3, a_ref(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo); magma_dsetmatrix(i__1, i__2, a_ref(kk, kk), lda, da_ref(kk, kk), ldda, queue); } if (kk > 0) { /* Use blocked code */ for (i = ki; i >= 0; i-=nb) { ib = min(nb, k - i); /* Send the current panel to the GPU */ i__2 = m - i; dpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work); magma_dsetmatrix(i__2, ib, a_ref(i, i), lda, da_ref(i, i), ldda, queue); if (i + ib < n) { /* Apply H to A(i:m,i+ib:n) from the left */ i__3 = n - i - ib; magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, i__2, i__3, ib, da_ref(i, i ), ldda, t_ref(i), nb, da_ref(i, i+ib), ldda, dwork, dwork_offset, lddwork, queue); } /* Apply H to rows i:m of current block on the CPU */ lapackf77_dorgqr(&i__2, &ib, &ib, a_ref(i, i), &lda, &tau[i], work, &lwork, &iinfo); magma_dsetmatrix_async( i__2, ib, a_ref(i,i), lda, da_ref(i,i), ldda, queue, &event ); /* Set rows 1:i-1 of current block to zero */ i__2 = i + ib; magmablas_dlaset(MagmaFull, i, i__2 - i, c_zero, c_zero, da_ref(0,i), ldda, queue); } } magma_dgetmatrix(m, n, da_ref(0, 0), ldda, a_ref(0, 0), lda, queue); //cudaStreamDestroy(stream); magma_free( da ); magma_free_cpu(work); return *info; } /* magma_dorgqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_T, *ddA, *dtau; double *d_A2, *d_T2, *ddA2, *dtau2; double *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 10. * opts.tolerance * lapackf77_dlamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because dgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because dgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); TESTING_MALLOC_DEV( d_T2, double, N*N ); TESTING_MALLOC_DEV( ddA2, double, N*N ); TESTING_MALLOC_DEV( dtau2, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_dlaset cudaMemset(ddA, 0, N*N*sizeof(double)); cudaMemset(d_T, 0, N*N*sizeof(double)); cudaMemset(ddA2, 0, N*N*sizeof(double)); cudaMemset(d_T2, 0, N*N*sizeof(double)); lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_dgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_dgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_dgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_dgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? 
*/ magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_dgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); magma_dgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_dgetmatrix( N, N, d_T, N, h_T, N ); double terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
/** Purpose ------- DORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**H * C C * Q**H @endverbatim where Q is a real unitary matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by DGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A DOUBLE_PRECISION array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[in,out] C DOUBLE_PRECISION array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dormqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, double *C, magma_int_t ldc, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); // TODO fix memory leak (alloc after argument checks) magma_int_t nb = 128; double *T; magma_dmalloc_pinned(&T, nb*nb); //printf("calling dormqr_m with nb=%d\n", (int) nb); double* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_device_t igpu; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_dormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l; ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_dmalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dsetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_dsetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_dlaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_dlarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_dsetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_dgetmatrix( m, kb, dC(igpu, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_dgetmatrix_async( m, kb, // dC(igpu, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_dlarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_dsetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_dlaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_dsetmatrix( ib, ib, T, ib, dT, ib ); magma_dlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_D_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dormqr */
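// Usage sketch for magma_dormqr_m above: apply Q**T (from a dgeqrf
// factorization held in A and tau) to C from the left on 2 GPUs, with the
// standard two-phase workspace query. m, n, k, A, lda, tau, C, ldc are
// assumed set up by the caller.
magma_int_t info;
double wkopt;
magma_dormqr_m( 2, MagmaLeft, MagmaTrans, m, n, k,
                A, lda, tau, C, ldc, &wkopt, -1, &info );     // query
magma_int_t lwork = (magma_int_t) wkopt;
double *work;
magma_dmalloc_cpu( &work, lwork );
magma_dormqr_m( 2, MagmaLeft, MagmaTrans, m, n, k,
                A, lda, tau, C, ldc, work, lwork, &info );    // C := Q**T * C
magma_free_cpu( work );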
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_T, ddA, dtau; magmaDouble_ptr dwork; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lwork; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t nsize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t i, info, min_mn; magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; // process command line arguments printf( "\nUsage: %s -N <m,n> -c\n", argv[0] ); printf( " -N can be repeated up to %d times. If only m is given, then m=n.\n", MAXTESTS ); printf( " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n" ); int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); int m, n; info = sscanf( argv[++i], "%d,%d", &m, &n ); if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0.\n", argv[i] ); exit(1); } M = max( M, msize[ ntest ] ); N = max( N, nsize[ ntest ] ); ntest++; } else if ( strcmp("-M", argv[i]) == 0 ) { printf( "-M has been replaced in favor of -N m,n to allow -N to be repeated.\n\n" ); exit(1); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; M = msize[ntest-1]; N = nsize[ntest-1]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (32*2+2)*min_mn) ); double *h1 = (double*)malloc(sizeof(double)*N*N); memset(h1, 0, N*N*sizeof(double)); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_PIN( h_work, double, lwork ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( i = 0; i < ntest; ++i ) { M = msize[i]; N = nsize[i]; 
min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N)) / 1e9; /* Initialize the matrix */ magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up // magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); /* magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); */ gpu_time = magma_wtime(); magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqr2x3_gpu returned error %d.\n", (int) info); if ( checkres ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d.\n", (int) info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue ); magma_dgetmatrix( N, N, ddA, 0, N, h_T, 0, N, queue ); // Restore the upper triangular part of A before the check for(int col=0; col<N; col++){ for(int row=0; row<=col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / error; // Check if T is the same double terr = 0.; magma_dgetmatrix( N, N, d_T, 0, N, h_T, 0, N, queue ); for(int col=0; col<N; col++) for(int row=0; row<=col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_T ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); free(h1); magma_queue_destroy( queue ); magma_finalize(); }
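/* Added usage sketch (not part of the original tester): the essential call
   sequence above, distilled into one function. Buffer sizes and the clMAGMA
   (pointer, offset) calling style follow the tester; the function name and
   the fixed problem sizes are hypothetical. Assumes an initialized queue. */
static void example_dgeqr2x3( magma_queue_t queue )
{
    magma_int_t M = 1024, N = 512, min_mn = 512, info;
    magma_int_t lda = M, ldda = ((M+31)/32)*32, ione = 1;
    magma_int_t ISEED[4] = {0,0,0,1}, n2 = lda*N;
    double *h_R;
    magmaDouble_ptr d_A, d_T, ddA, dtau, dwork;

    TESTING_MALLOC_PIN( h_R,   double, lda*N );
    TESTING_MALLOC_DEV( d_A,   double, ldda*N );
    TESTING_MALLOC_DEV( d_T,   double, N*N );
    TESTING_MALLOC_DEV( ddA,   double, N*N );
    TESTING_MALLOC_DEV( dtau,  double, min_mn );
    TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (32*2+2)*min_mn) );

    lapackf77_dlarnv( &ione, ISEED, &n2, h_R );                 // random input
    magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); // host -> GPU
    magma_dgeqr2x3_gpu( &M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0,
                        ddA, 0, dwork, 0, &info, queue );       // QR + T factors
    magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, lda, queue ); // GPU -> host

    TESTING_FREE_PIN( h_R );
    TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T );
    TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork );
}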
/** Purpose ------- DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by DGEQRF. This version recomputes the T matrices on the CPU and sends them to the GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] A DOUBLE_PRECISION array A, dimension (LDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] lda INTEGER The first dimension of the array A. LDA >= max(1,M). @param[in] tau DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dorgqr2(magma_int_t m, magma_int_t n, magma_int_t k, double *A, magma_int_t lda, double *tau, magma_int_t *info) { #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; magma_int_t nb = magma_get_dgeqrf_nb(min(m, n)); magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; double *dA, *dV, *dW, *dT, *T; double *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; dT = dA + ldda*n + ldda*nb + lddwork*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_dmalloc_cpu( &work, lwork ); T = work; if (work == NULL) { magma_free( dA ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } double *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block.
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* lapackf77_dorgqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_dlacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_dlaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_dlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_dsetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_dlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_dlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_dsetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &mi, &ib, A(i,i), &lda, &tau[i], T, &nb); magma_dsetmatrix_async( ib, ib, T, nb, dT, nb, stream ); // set panel to identity magmablas_dlaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_dlaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); magma_queue_sync( stream ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT, nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_dgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_dorgqr */
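/* Added usage sketch: form the explicit Q factor on the CPU after a host
   dgeqrf, using magma_dorgqr2 defined above. The function name and problem
   sizes are hypothetical; error handling is elided to keep the sketch short. */
static void example_dorgqr2( void )
{
    magma_int_t m = 1000, n = 500, k = n, lda = m, info, lwork;
    magma_int_t ione = 1, ISEED[4] = {0,0,0,1}, n2 = lda*n;
    double *A, *tau, *work, tmp[1];

    magma_dmalloc_pinned( &A, lda*n );
    magma_dmalloc_cpu( &tau, k );
    lapackf77_dlarnv( &ione, ISEED, &n2, A );

    lwork = -1;                                          // workspace query
    lapackf77_dgeqrf( &m, &n, A, &lda, tau, tmp, &lwork, &info );
    lwork = (magma_int_t) MAGMA_D_REAL( tmp[0] );
    magma_dmalloc_cpu( &work, lwork );
    lapackf77_dgeqrf( &m, &n, A, &lda, tau, work, &lwork, &info );

    magma_dorgqr2( m, n, k, A, lda, tau, &info );        // A now holds Q (m-by-n)

    magma_free_cpu( work ); magma_free_cpu( tau );
    magma_free_pinned( A );
}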
extern "C" magma_int_t magma_dlobpcg( magma_d_sparse_matrix A, magma_d_solver_par *solver_par ){ #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_d_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_dtranspose( m, n, X, m, blockW, n ); \ magma_d_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_d_spmv(alpha, A, x, beta, ax ); \ magmablas_dtranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); double *blockX = solver_par->eigenvectors; double *evalues = solver_par->eigenvalues; double *dwork, *hwork; double *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; double *gramA, *gramB, *gramM; double *gevectors, *h_gramB; double *pointer, *origX = blockX; double *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_dmalloc_pinned( &hwork , lwork ); magma_dmalloc( &blockAX , m*n ); magma_dmalloc( &blockAR , m*n ); magma_dmalloc( &blockAP , m*n ); magma_dmalloc( &blockR , m*n ); magma_dmalloc( &blockP , m*n ); magma_dmalloc( &blockW , m*n ); magma_dmalloc( &dwork , m*n ); magma_dmalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === double residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO; double *residualNorms, *condestGhistory, condestG; double *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_dmalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_dmalloc_cpu(&condestGhistory, maxIterations+1); magma_dmalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); double *hW; magma_dmalloc_pinned(&hW, n*n); magma_dmalloc_pinned(&gevectors, 9*n*n); magma_dmalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_dmalloc(&gramM, n * n); magma_dmalloc(&gramA, 9 * n * n); magma_dmalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_dmalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial 
vectors orthonormal === magma_dgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info ); //magma_dorthomgs( m, n, blockX ); magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); // === Update X = X * evectors === magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockX); // === Update AX = AX * evectors === magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockAX); condestGhistory[1] = 7.82; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); /* for(int i=0; i<n; i++){ magma_daxpy(m, MAGMA_D_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1); } */ #if defined(PRECISION_z) || defined(PRECISION_d) magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #else magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #endif magma_dlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu); magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === remove the residuals corresponding to already converged evectors magma_dcompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residuals: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) // magmablas_dlacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m); /* // === make the preconditioned residuals orthogonal to X magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m); */ // === make the active preconditioned residuals orthonormal magma_dgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info ); //magma_dorthomgs( m, cBlockSize, blockR ); // === compute AR magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR ); if (!restart) { // === compact P & AP as well magma_dcompactActive(m, n, blockP, m, activeMask); magma_dcompactActive(m, n, blockAP, m, activeMask); /* // === make P orthogonal to X ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m); // === make P orthogonal to R ?
magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_dgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info ); //magma_dorthomgs( m, cBlockSize, blockP ); //magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP ); magma_dsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize); // magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, // m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #else magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #endif } iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize))); double condestGmean = 0.; for(int i = 0; i<iterationNumber-iter+1; i++) condestGmean += condestGhistory[i]; condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize;
/* --- The Rayleigh-Ritz method for [X R P] -----------------------
   [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e.,

              GramA                               GramB
       / X'AX  X'AR  X'AP \                 / X'X  X'R  X'P \
      |  R'AX  R'AR  R'AP  | y = evalues   |  R'X  R'R  R'P  |
       \ P'AX  P'AR  P'AP /                 \ P'X  P'R  P'P /
   ----------------------------------------------------------------- */
// === assemble GramB; first, set it to I magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram); // identity if (!restart) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram); } magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); lapackf77_dsyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, #endif info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", (int) iterationNumber); } // === assemble GramA; first, set it to I magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram); // identity magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram); if (!restart) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP,
m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_D_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_dsygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_dsetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_dgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_daxpy(m*n, c_one, blockP, 1, blockX, 1); magma_dlobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_daxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_dlobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // double res; // magma_dgetmatrix(1, 1, // (double*)residualNorms(0, iterationNumber), 1, // (double*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; 
}else if( solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_daxpy(m, MAGMA_D_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_dlacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== double *hresidualNorms; magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_dgetmatrix(n, iterationNumber, (double*)residualNorms, n, (double*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
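/* Added usage sketch: driving magma_dlobpcg. Field names follow the uses in
   the routine above; the 3*n length of the host eigenvalue array matches what
   the routine reads when staging evalues to the GPU. The matrix A is assumed
   to be a magma_d_sparse_matrix already resident on the device; initialization
   of the random starting block is elided. Names here are hypothetical. */
static magma_int_t example_dlobpcg( magma_d_sparse_matrix A )
{
    magma_d_solver_par solver_par;
    magma_int_t n = 32;                        // number of wanted eigenpairs
    solver_par.num_eigenvalues = n;
    solver_par.epsilon = 1e-8;                 // residual tolerance
    solver_par.maxiter = 1000;
    magma_dmalloc( &solver_par.eigenvectors, A.num_rows * n );   // device block X
    magma_dmalloc_cpu( &solver_par.eigenvalues, 3*n );           // host evalues
    // ... fill solver_par.eigenvectors with a random initial guess ...
    magma_dlobpcg( A, &solver_par );
    printf( "info %d after %d iterations\n",
            (int) solver_par.info, (int) solver_par.numiter );
    magma_free( solver_par.eigenvectors );
    magma_free_cpu( solver_par.eigenvalues );
    return solver_par.info;
}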
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgels */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, error, Anorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf("%% M N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) CPU GPU \n"); printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default lddb = magma_roundup( max_mn, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_A2, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ size = lda*N; lapackf77_dlarnv( &ione, ISEED, &size, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &size, h_B ); lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_dlarnv( &ione, ISEED, &size, h_X ); //blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgels_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // compute the residual magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb ); 
Anorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgels returned error %d: %s.\n", (int) info, magma_strerror( info )); } blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_dlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_daxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); bool okay; if ( M == N ) { okay = (gpu_error < tol && error < tol); } else { okay = (error < tol); } status += ! okay; printf( " %s\n", (okay ? "ok" : "failed")); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
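/* Added usage sketch: the minimal magma_dgels_gpu call sequence from the
   tester above, for an overdetermined system (M >= N). Names mirror the
   tester; d_A and d_B are assumed to be already set on the GPU with leading
   dimensions ldda and lddb, and the function wrapper is hypothetical. */
static void example_dgels( magma_int_t M, magma_int_t N, magma_int_t nrhs,
                           magmaDouble_ptr d_A, magma_int_t ldda,
                           magmaDouble_ptr d_B, magma_int_t lddb )
{
    magma_int_t info;
    magma_int_t nb = magma_get_dgeqrf_nb( M, N );
    magma_int_t lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb;
    double *hwork;
    magma_dmalloc_cpu( &hwork, lworkgpu );
    magma_dgels_gpu( MagmaNoTrans, M, N, nrhs,
                     d_A, ldda, d_B, lddb, hwork, lworkgpu, &info );
    if (info != 0)
        printf( "magma_dgels_gpu returned error %d: %s.\n",
                (int) info, magma_strerror( info ));
    // on success the first N rows of d_B hold the least-squares solution X
    magma_free_cpu( hwork );
}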
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; const char trans[] = { 'N', 'C', 'T' }; const char uplo[] = { 'L', 'U' }; const char diag[] = { 'U', 'N' }; const char side[] = { 'L', 'R' }; double *A, *B, *C, *C2, *LU; double *dA, *dB, *dC1, *dC2; double alpha = MAGMA_D_MAKE( 0.5, 0.1 ); double beta = MAGMA_D_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_err_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int i = 0; i < opts.ntest; ++i ) { m = opts.msize[i]; n = opts.nsize[i]; k = opts.ksize[i]; printf("=========================================================================\n"); printf( "M %d, N %d, K %d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = maxn; size = maxn*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_dmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_dmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_dmalloc( &dA, size ); assert( err == 0 ); err = magma_dmalloc( &dB, size ); assert( err == 0 ); err = magma_dmalloc( &dC1, size ); assert( err == 0 ); err = magma_dmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test DSWAP // swap 2nd and 3rd columns of dA, then copy to C2 and compare with A assert( n >= 4 ); magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetmatrix( m, n, A, ld, dB, ld ); magma_dswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_dswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasDaxpy( ld*n, c_neg_one, dA, 1, dB, 1 ); magma_dgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "dswap diff %.2g\n", error ); // ----- test IDAMAX // get argmax of column of A magma_dsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_idamax( m, dA(0,j), 1 ); magma_int_t i2 = cublasIdamax( m, dA(0,j), 1 ); assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "idamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test DGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_dsetmatrix( m, n, A, ld, dA, ld ); magma_dsetvector( maxn, B, 1, dB, 1 ); magma_dsetvector( maxn, C, 1, dC1, 1 ); magma_dsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, 
storing diff between magma and cuda call in C2 size = (trans[ia] == 'N' ? m : n); cublasDaxpy( size, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMV( m, n ) / 1e9; printf( "dgemv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DSYMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetvector( m, B, 1, dB, 1 ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMV( m ) / 1e9; printf( "dsymv( %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test DTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. lapackf77_dlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_dgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_dsetmatrix( m, m, LU, ld, dA, ld ); magma_dsetvector( m, C, 1, dC1, 1 ); magma_dsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_dtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( m, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_dlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "dtrsv( %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test DGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == 'N'); bool ntb = (trans[ib] == 'N'); magma_dsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_dsetmatrix( (ntb ? k : n), (ntb ? 
n : k), B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DGEMM( m, n, k ) / 1e9; printf( "dgemm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", trans[ia], trans[ib], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_dsetmatrix( m, m, A, ld, dA, ld ); magma_dsetmatrix( m, n, B, ld, dB, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsymm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYMM( side[is], m, n ) / 1e9; printf( "dsymm( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYRK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_dsetmatrix( n, k, A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyrk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYRK( k, n ) / 1e9; printf( "dsyrk( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DSYR2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == 'N'); magma_dsetmatrix( (nt ? n : k), (nt ? 
n : k), A, ld, dA, ld ); magma_dsetmatrix( n, n, C, ld, dC1, ld ); magma_dsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDsyr2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DSYR2K( k, n ) / 1e9; printf( "dsyr2k( %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", uplo[iu], trans[it], error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test DTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2; take the norm over the full m-by-n C cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRMM( side[is], m, n ) / 1e9; printf( "dtrmm( %c, %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test DTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == 'L'); magma_dsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_dsetmatrix( m, n, C, ld, dC1, ld ); magma_dsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_dtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasDtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2; take the norm over the full m-by-n C cublasDaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 ); magma_dgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_dlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_DTRSM( side[is], m, n ) / 1e9; printf( "dtrsm( %c, %c, %c, %c ) diff %.2g, Gflop/s %6.2f, %6.2f\n", side[is], uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0.
) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); return 0; }
/** Purpose ------- DGEQLF computes a QL factorization of a DOUBLE_PRECISION M-by-N matrix A: A = Q * L. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, if m >= n, the lower triangle of the subarray A(m-n+1:m,1:n) contains the N-by-N lower triangular matrix L; if m <= n, the elements on and below the (n-m)-th superdiagonal contain the M-by-N lower trapezoidal matrix L; the remaining elements, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max(1,N,2*NB^2). For optimum performance LWORK >= max(N*NB, 2*NB^2) where NB can be obtained through magma_get_dgeqlf_nb(M). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(m-k+i+1:m) = 0 and v(m-k+i) = 1; v(1:m-k+i-1) is stored on exit in A(1:m-k+i-1,n-k+i), and tau in TAU(i). @ingroup magma_dgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqlf( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) magmaDouble_ptr dA, dwork; double c_one = MAGMA_D_ONE; magma_int_t i, k, lddwork, old_i, old_ib, nb; magma_int_t rows, cols; magma_int_t ib, ki, kk, mu, nu, iinfo, ldda; int lquery; nb = magma_get_dgeqlf_nb(m); *info = 0; lquery = (lwork == -1); // silence "uninitialized" warnings old_ib = nb; old_i = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } k = min(m,n); if (*info == 0) { if (k == 0) work[0] = c_one; else { work[0] = MAGMA_D_MAKE( max(n*nb, 2*nb*nb), 0 ); } if (lwork < max(max(1,n), 2*nb*nb) && !
lquery) *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (k == 0) return *info; lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n)*ldda + nb*lddwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + ldda*n; magma_queue_t queues[2]; magma_queue_create( &queues[0] ); magma_queue_create( &queues[1] ); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially. The last kk columns are handled by the block method. First, copy the matrix on the GPU except the last kk columns */ magma_dsetmatrix_async( m, n-nb, A(0, 0), lda, dA(0, 0), ldda, queues[0] ); ki = ((k - nb - 1) / nb) * nb; kk = min(k, ki + nb); for (i = k - kk + ki; i >= k -kk; i -= nb) { ib = min(k-i,nb); if (i < k - kk + ki) { /* 1. Copy asynchronously the current panel to the CPU. 2. Copy asynchronously the submatrix below the panel to the CPU) */ rows = m - k + i + ib; magma_dgetmatrix_async( rows, ib, dA(0, n-k+i), ldda, A(0, n-k+i), lda, queues[1] ); magma_dgetmatrix_async( m-rows, ib, dA(rows, n-k+i), ldda, A(rows, n-k+i), lda, queues[0] ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the main update from the lookahead techniques. */ rows = m - k + old_i + old_ib; cols = n - k + old_i - old_ib; magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, old_ib, dA(0, cols+old_ib), ldda, dwork(0), lddwork, dA(0, 0 ), ldda, dwork(old_ib), lddwork); } magma_queue_sync( queues[1] ); /* Compute the QL factorization of the current block A(1:m-k+i+ib-1,n-k+i:n-k+i+ib-1) */ rows = m - k + i + ib; cols = n - k + i; lapackf77_dgeqlf( &rows, &ib, A(0,cols), &lda, tau+i, work, &lwork, &iinfo ); if (cols > 0) { /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ lapackf77_dlarft( MagmaBackwardStr, MagmaColumnwiseStr, &rows, &ib, A(0, cols), &lda, tau + i, work, &ib); dpanel_to_q( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib); magma_dsetmatrix( rows, ib, A(0,cols), lda, dA(0,cols), ldda ); dq_to_panel( MagmaLower, ib, A(rows-ib,cols), lda, work+ib*ib); // Send the triangular part on the GPU magma_dsetmatrix( ib, ib, work, ib, dwork(0), lddwork ); /* Apply H' to A(1:m-k+i+ib-1,1:n-k+i-1) from the left in two steps - implementing the lookahead techniques. This is the update of first ib columns. */ if (i-ib >= k -kk) magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, ib, ib, dA(0, cols), ldda, dwork(0), lddwork, dA(0,cols-ib), ldda, dwork(ib), lddwork); else { magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaBackward, MagmaColumnwise, rows, cols, ib, dA(0, cols), ldda, dwork(0), lddwork, dA(0, 0 ), ldda, dwork(ib), lddwork); } old_i = i; old_ib = ib; } } mu = m - k + i + nb; nu = n - k + i + nb; magma_dgetmatrix( m, nu, dA(0,0), ldda, A(0,0), lda ); } else { mu = m; nu = n; } /* Use unblocked code to factor the last or only block */ if (mu > 0 && nu > 0) lapackf77_dgeqlf(&mu, &nu, A(0,0), &lda, tau, work, &lwork, &iinfo); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_dgeqlf */
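/* Added usage sketch: the LWORK = -1 workspace query described above,
   followed by the actual QL factorization. Sizes and the function name are
   hypothetical; pinned memory is used for A and WORK as the documentation
   recommends. */
static void example_dgeqlf( void )
{
    magma_int_t m = 1024, n = 512, lda = m, info, lwork;
    magma_int_t ione = 1, ISEED[4] = {0,0,0,1}, n2 = lda*n;
    double *A, *tau, *work, tmp[1];

    magma_dmalloc_pinned( &A, lda*n );
    magma_dmalloc_cpu( &tau, min(m,n) );
    lapackf77_dlarnv( &ione, ISEED, &n2, A );

    magma_dgeqlf( m, n, A, lda, tau, tmp, -1, &info );     // workspace query
    lwork = (magma_int_t) MAGMA_D_REAL( tmp[0] );
    magma_dmalloc_pinned( &work, lwork );
    magma_dgeqlf( m, n, A, lda, tau, work, lwork, &info ); // A = Q * L

    magma_free_pinned( work );
    magma_free_cpu( tau );
    magma_free_pinned( A );
}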
extern "C" magma_int_t magma_dgelqf( magma_int_t m, magma_int_t n, double *a, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGELQF computes an LQ factorization of a DOUBLE_PRECISION M-by-N matrix A: A = L * Q. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -10 internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) double *dA, *dAT; double c_one = MAGMA_D_ONE; magma_int_t maxm, maxn, maxdim, nb; magma_int_t iinfo, ldda; int lquery; /* Function Body */ *info = 0; nb = magma_get_dgelqf_nb(m); work[0] = MAGMA_D_MAKE( (double)(m*nb), 0 ); lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min(m, n) == 0) { work[0] = c_one; return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); if (maxdim*maxdim < 2*maxm*maxn) { ldda = maxdim; if (MAGMA_SUCCESS != magma_dmalloc( &dA, maxdim*maxdim )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( m, n, a, lda, dA, ldda ); dAT = dA; magmablas_dtranspose_inplace( ldda, dAT, ldda ); } else { ldda = maxn; if (MAGMA_SUCCESS != magma_dmalloc( &dA, 2*maxn*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( m, n, a, lda, dA, maxm ); dAT = dA + maxn * maxm; magmablas_dtranspose2( dAT, ldda, dA, maxm, m, n ); } magma_dgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo); if (maxdim*maxdim < 2*maxm*maxn) { magmablas_dtranspose_inplace( ldda, dAT, ldda ); magma_dgetmatrix( m, n, dA, ldda, a, lda ); } else { magmablas_dtranspose2( dA, maxm, dAT, ldda, n, m ); magma_dgetmatrix( m, n, dA, maxm, a, lda ); } magma_free( dA ); return *info; } /* magma_dgelqf */
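/* Added note: the maxdim*maxdim < 2*maxm*maxn test above picks whichever
   transpose workspace is smaller: one square maxdim-by-maxdim buffer that can
   be transposed in place, or two maxm-by-maxn buffers for the out-of-place
   transpose. Two illustrative cases (sizes rounded up to multiples of 32):

     m = n = 1000:       maxm = maxn = maxdim = 1024
                         in-place   maxdim*maxdim = 1,048,576 doubles
                         two-buffer 2*maxm*maxn   = 2,097,152 doubles -> in place

     m = 1000, n = 100:  maxm = 1024, maxn = 128, maxdim = 1024
                         in-place   1,048,576 doubles
                         two-buffer   262,144 doubles                 -> two buffers */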
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); double *A, *B, *C; double *dA, *dB, *dC; double error, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n = 10; magma_int_t lda = n; magma_int_t ldda = ((n+31)/32)*32; magma_int_t size = lda*n; magma_int_t info; magma_dmalloc_cpu( &A, lda*n ); magma_dmalloc_cpu( &B, lda*n ); magma_dmalloc_cpu( &C, lda*n ); magma_dmalloc( &dA, ldda*n ); magma_dmalloc( &dB, ldda*n ); magma_dmalloc( &dC, ldda*n ); // initialize matrices lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_D_ADD( C[i+i*lda], MAGMA_D_MAKE( n*n, 0 )); } magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_dsetmatrix( n, n, B, lda, dB, ldda ); magma_dsetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_dpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_dpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference magma_dgetmatrix( n, n, dC, ldda, A, lda ); blasf77_daxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_dlange( "F", &n, &n, A, &lda, work ); printf( "n %d, error %8.2e\n", n, error ); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); cublasDestroy( handle ); magma_finalize(); return 0; }
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA DOUBLE_PRECISION array of pointers on the GPU, dimension (ngpu) On entry, the symmetric matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); double *work; int upper = (uplo == MagmaUpper); double *dwork[MagmaMaxGPUs]; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_dpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. 
*/ magma_setdevice(0); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], ldda, work, n ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix( n, n, work, n, d_lA[0], ldda ); magma_free_pinned( work ); } else { lddp = nb*((n+nb-1)/nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j < 5; j++ ) magma_event_create( &event[d][j] ); } magma_setdevice(0); h = 1; //ngpu; //(n+nb-1)/nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three streams */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, stream, event, info); } else { /* with three streams */ magma_dpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, stream, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( stream[d][j] ); magma_queue_destroy( stream[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_dpotrf_mgpu */
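/* Added usage sketch: the degenerate single-GPU case, where d_lA[] has one
   entry holding the whole matrix, so no block-cyclic distribution is needed.
   The matrix is assumed to be symmetric positive definite and already set on
   the device with ldda >= n (ideally divisible by 16, per the documentation
   above); the wrapper name is hypothetical. */
static void example_dpotrf_mgpu( magma_int_t n, magmaDouble_ptr dA, magma_int_t ldda )
{
    magma_int_t info;
    magmaDouble_ptr d_lA[1];
    d_lA[0] = dA;                       // one local panel = the whole matrix
    magma_dpotrf_mgpu( 1, MagmaLower, n, d_lA, ldda, &info );
    if (info != 0)
        printf( "magma_dpotrf_mgpu returned error %d\n", (int) info );
    // on success the lower triangle of dA holds L with dA = L * L**H
}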