extern "C" magma_int_t magmablas_dsymv_sync( magma_int_t num_gpus, magma_int_t k, magma_int_t n, double *work, double *W, magma_queue_t stream[][10] ) { double c_one = MAGMA_D_ONE; magma_int_t ione = 1; magma_int_t id, kk; /* reduce on CPU */ magma_setdevice(0); magma_queue_sync(stream[0][0]); for( kk=1; kk < k; kk++ ) { magma_queue_sync(stream[0][kk]); blasf77_daxpy( &n, &c_one, &work[kk*n], &ione, W, &ione ); } for( id=1; id < num_gpus; id++ ) { magma_setdevice(id); for( kk=0; kk < k; kk++ ) { magma_queue_sync(stream[id][kk]); blasf77_daxpy( &n, &c_one, &work[id*k*n + kk*n], &ione, W, &ione ); } } return 0; }
extern "C" void magma_dlarfxsym( magma_int_t N, double *A, magma_int_t LDA, double *V, double *TAU) { magma_int_t IONE=1; double dtmp; double Z_ZERO = MAGMA_D_ZERO; //double Z_ONE = MAGMA_D_ONE; double Z_MONE = MAGMA_D_NEG_ONE; double Z_HALF = MAGMA_D_HALF; //double WORK[N]; double *WORK; magma_dmalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_dlarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_dsymv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_ddot(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_daxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_dsyr2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
/* //////////////////////////////////////////////////////////////////////////// -- like axpy for matrices: B += alpha*A. */ void dgeadd( magma_int_t m, magma_int_t n, double alpha, const double *A, magma_int_t lda, double *B, magma_int_t ldb ) { #define A(i_, j_) (A + (i_) + (j_)*lda) #define B(i_, j_) (B + (i_) + (j_)*ldb) const magma_int_t ione = 1; for( int j=0; j < n; ++j ) { blasf77_daxpy( &m, &alpha, A(0,j), &ione, B(0,j), &ione ); } }
inline static void magma_dlarfxsym_v2(magma_int_t n, double *A, magma_int_t lda, double *V, double *TAU, double *work) { /* WORK (workspace) double real array, dimension N */ magma_int_t ione = 1; double dtmp; double c_zero = MAGMA_D_ZERO; double c_neg_one= MAGMA_D_NEG_ONE; double c_half = MAGMA_D_HALF; /* X = AVtau */ blasf77_dsymv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ #if defined(PRECISION_z) || defined(PRECISION_c) dtmp = c_zero; for (magma_int_t j = 0; j < n; j++) dtmp = dtmp + MAGMA_D_CNJG(work[j]) * V[j]; //cblas_ddot_sub(n, work, ione, V, ione, &dtmp); #else dtmp = cblas_ddot(n, work, ione, V, ione); #endif /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_daxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_dsyr2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlarfb_gpu */ int main( int argc, char** argv ) { TESTING_CUDA_INIT(); double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; printf( "\nUsage: %s -M m -N n -K k\n\n", argv[0] ); magma_int_t m = 500; magma_int_t n = 300; magma_int_t k = 32; for( int i = 1; i < argc; i++ ) { if (strcmp("-M", argv[i]) == 0 && i+1 < argc) { m = atoi( argv[++i] ); } else if (strcmp("-N", argv[i]) == 0 && i+1 < argc) { n = atoi( argv[++i] ); } else if (strcmp("-K", argv[i]) == 0 && i+1 < argc) { k = atoi( argv[++i] ); } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( k <= 0 || k > m || k > n ) { printf( "requires 0 < k <= min(m,n)\n" ); exit(1); } magma_int_t ldc = m; magma_int_t ldv = max(m,n); magma_int_t ldt = k; magma_int_t ldw = max(m,n); magma_int_t nv; ldc = ((ldc+31)/32)*32; ldv = ((ldv+31)/32)*32; ldt = ((ldt+31)/32)*32; ldw = ((ldw+31)/32)*32; // Allocate memory for matrices double *C, *R, *V, *T, *W; TESTING_MALLOC( C, double, ldc*n ); TESTING_MALLOC( R, double, ldc*n ); TESTING_MALLOC( V, double, ldv*k ); TESTING_MALLOC( T, double, ldt*k ); TESTING_MALLOC( W, double, ldw*k ); double *dC, *dV, *dT, *dW; TESTING_DEVALLOC( dC, double, ldc*n ); TESTING_DEVALLOC( dV, double, ldv*k ); TESTING_DEVALLOC( dT, double, ldt*k ); TESTING_DEVALLOC( dW, double, ldw*k ); magma_int_t size; magma_int_t iseed[4] = { 1, 2, 3, 4 }; double error, work[1]; // test all combinations of input parameters const char* side[] = { MagmaLeftStr, MagmaRightStr }; const char* trans[] = { MagmaTransStr, MagmaNoTransStr }; const char* direct[] = { MagmaForwardStr, MagmaBackwardStr }; const char* storev[] = { MagmaColumnwiseStr, MagmaRowwiseStr }; printf(" M N K storev side direct trans ||R||_F / ||HC||_F\n"); printf("==================================================================================\n"); for( int istor = 0; istor < 2; ++istor ) { for( int iside = 0; iside < 2; ++iside ) { for( int idir = 0; idir < 2; ++idir ) { for( int itran = 0; itran < 2; ++itran ) { //printf( "# ----------\n" ); //printf( "# %-10s %-10s %-10s %-10s\n", storev[istor], side[iside], direct[idir], trans[itran] ); // C is full size = ldc*n; lapackf77_dlarnv( &ione, iseed, &size, C ); //printf( "C=" ); magma_dprint( m, n, C, ldc ); // V is ldv x nv. See larfb docs for description. ldv = (*side[iside] == 'L' ? m : n); nv = k; size = ldv*nv; lapackf77_dlarnv( &ione, iseed, &size, V ); if ( *storev[istor] == MagmaColumnwise ) { if ( *direct[idir] == MagmaForward ) { lapackf77_dlaset( MagmaUpperStr, &k, &k, &c_zero, &c_one, V, &ldv ); } else { lapackf77_dlaset( MagmaLowerStr, &k, &k, &c_zero, &c_one, &V[(ldv-k)], &ldv ); } } else { // rowwise, swap V's dimensions std::swap( ldv, nv ); if ( *direct[idir] == MagmaForward ) { lapackf77_dlaset( MagmaLowerStr, &k, &k, &c_zero, &c_one, V, &ldv ); } else { lapackf77_dlaset( MagmaUpperStr, &k, &k, &c_zero, &c_one, &V[(nv-k)*ldv], &ldv ); } } //printf( "# ldv %d, nv %d\n", ldv, nv ); //printf( "V=" ); magma_dprint( ldv, nv, V, ldv ); // T is upper triangular for forward, and lower triangular for backward magma_int_t k1 = k-1; size = ldt*k; lapackf77_dlarnv( &ione, iseed, &size, T ); if ( *direct[idir] == MagmaForward ) { lapackf77_dlaset( MagmaLowerStr, &k1, &k1, &c_zero, &c_zero, &T[1], &ldt ); } else { lapackf77_dlaset( MagmaUpperStr, &k1, &k1, &c_zero, &c_zero, &T[1*ldt], &ldt ); } //printf( "T=" ); magma_dprint( k, k, T, ldt ); magma_dsetmatrix( m, n, C, ldc, dC, ldc ); magma_dsetmatrix( ldv, nv, V, ldv, dV, ldv ); magma_dsetmatrix( k, k, T, ldt, dT, ldt ); lapackf77_dlarfb( side[iside], trans[itran], direct[idir], storev[istor], &m, &n, &k, V, &ldv, T, &ldt, C, &ldc, W, &ldw ); //printf( "HC=" ); magma_dprint( m, n, C, ldc ); magma_dlarfb_gpu( *side[iside], *trans[itran], *direct[idir], *storev[istor], m, n, k, dV, ldv, dT, ldt, dC, ldc, dW, ldw ); magma_dgetmatrix( m, n, dC, ldc, R, ldc ); //printf( "dHC=" ); magma_dprint( m, n, R, ldc ); // compute relative error |HC_magma - HC_lapack| / |HC_lapack| error = lapackf77_dlange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %-10s %-10s %-10s %-10s %8.2e\n", (int) m, (int) n, (int) k, storev[istor], side[iside], direct[idir], trans[itran], error ); }}}} // Memory clean up TESTING_FREE( C ); TESTING_FREE( R ); TESTING_FREE( V ); TESTING_FREE( T ); TESTING_FREE( W ); TESTING_DEVFREE( dC ); TESTING_DEVFREE( dV ); TESTING_DEVFREE( dT ); TESTING_DEVFREE( dW ); // Shutdown TESTING_CUDA_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlag2s and slag2d */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; float serror, swork[1]; double c_neg_one = MAGMA_D_NEG_ONE; float s_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, lda, ldda, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *SA, *SR; double *A, *R; magmaFloat_ptr dSA; magmaDouble_ptr dA; magma_opts opts; opts.parse_opts( argc, argv ); printf("%% func M N CPU GB/s (ms) GPU GB/s (ms) ||R||_F\n"); printf("%%====================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; lda = m; ldda = magma_roundup( m, opts.align ); // multiple of 32 by default // m*n double-real loads and m*n single-real stores (and vice-versa for slag2d) gbytes = (real_Double_t) m*n * (sizeof(double) + sizeof(float)) / 1e9; size = ldda*n; // ldda >= lda TESTING_MALLOC_CPU( SA, float, size ); TESTING_MALLOC_CPU( A, double, size ); TESTING_MALLOC_CPU( SR, float, size ); TESTING_MALLOC_CPU( R, double, size ); TESTING_MALLOC_DEV( dSA, float, size ); TESTING_MALLOC_DEV( dA, double, size ); lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( m, n, A, lda, dA, ldda, opts.queue ); magma_ssetmatrix( m, n, SA, lda, dSA, ldda, opts.queue ); /* ===================================================================== Performs operation using LAPACK dlag2s =================================================================== */ cpu_time = magma_wtime(); lapackf77_dlag2s( &m, &n, A, &lda, SA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) { printf("lapackf77_dlag2s returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA dlag2s =================================================================== */ gpu_time = magma_sync_wtime( opts.queue ); magmablas_dlag2s( m, n, dA, ldda, dSA, ldda, opts.queue, &info ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) { printf("magmablas_dlag2s returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_sgetmatrix( m, n, dSA, ldda, SR, lda, opts.queue ); /* ===================================================================== compute error |SA_magma - SA_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_saxpy( &size, &s_neg_one, SA, &ione, SR, &ione ); serror = lapackf77_slange( "Fro", &m, &n, SR, &lda, swork ); printf( "dlag2s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., serror, (serror == 0 ? "ok" : "failed") ); status += ! (serror == 0); /* ===================================================================== Reset matrices =================================================================== */ lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( m, n, A, lda, dA, ldda, opts.queue ); magma_ssetmatrix( m, n, SA, lda, dSA, ldda, opts.queue ); /* ===================================================================== Performs operation using LAPACK slag2d =================================================================== */ cpu_time = magma_wtime(); lapackf77_slag2d( &m, &n, SA, &lda, A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) { printf("lapackf77_slag2d returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA slag2d =================================================================== */ magma_ssetmatrix( m, n, SA, lda, dSA, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_slag2d( m, n, dSA, ldda, dA, ldda, opts.queue, &info ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) { printf("magmablas_slag2d returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix( m, n, dA, ldda, R, lda, opts.queue ); /* ===================================================================== compute error |A_magma - A_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_daxpy( &size, &c_neg_one, A, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &m, &n, R, &lda, work ); printf( "slag2d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error == 0 ? "ok" : "failed") ); status += ! (error == 0); TESTING_FREE_CPU( SA ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( SR ); TESTING_FREE_CPU( R ); TESTING_FREE_DEV( dSA ); TESTING_FREE_DEV( dA ); printf( "\n" ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by DGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T DOUBLE_PRECISION array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in,out] data Structure with pointers to dA, dT, dV, dW, dY which are distributed across multiple GPUs. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_dgeev_aux ********************************************************************/ extern "C" magma_int_t magma_dlahr2_m( magma_int_t n, magma_int_t k, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *T, magma_int_t ldt, double *Y, magma_int_t ldy, struct dgehrd_data* data ) { #define A( i, j ) ( A + (i) + (j)*lda) #define Y( i, j ) ( Y + (i) + (j)*ldy) #define T( i, j ) ( T + (i) + (j)*ldt) #define dA( d, i, j ) (data->A [d] + (i) + (j)*ldda) #define dTi( d ) (data->Ti[d]) #define dV( d, i, j ) (data->V [d] + (i) + (j)*ldv ) #define dVd( d, i, j ) (data->Vd[d] + (i) + (j)*ldvd) #define dY( d, i, j ) (data->Y [d] + (i) + (j)*ldda) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double tmp; magma_int_t ngpu = data->ngpu; magma_int_t ldda = data->ldda; magma_int_t ldv = data->ldv; magma_int_t ldvd = data->ldvd; magma_int_t ione = 1; magma_int_t d, dki1, dn, nblocks, gblock, lblock, lgid; magma_int_t n_k_i_1, n_k; double scale; magma_int_t i; double ei = MAGMA_D_ZERO; magma_int_t info_data = 0; magma_int_t *info = &info_data; if (n < 0) { *info = -1; } else if (k < 0 || k >= n) { *info = -2; } else if (nb < 1 || nb > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldt < nb) { *info = -8; } else if (ldy < max(1,n)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // adjust from 1-based indexing k -= 1; // Function Body if (n <= 1) return 0; // zero out current top block of V on all GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(d,k,0), ldv ); } // set all Y=0 lapackf77_dlaset( "Full", &n, &nb, &c_zero, &c_zero, Y, &ldy ); for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Finish applying I - V * T * V' on right tmp = MAGMA_D_NEGATE( tau[i-1] ); blasf77_daxpy( &n_k, &tmp, Y(k,i-1), &ione, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_dcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_dtrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_dgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_dgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_dtrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_daxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_dlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // compute yi = A vi = sum_g A{d} vi{d} nblocks = (n-1) / nb / ngpu + 1; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // dV(k+i+1:n-1, i) = VA(k+i:n, i) magma_dsetvector_async( n_k_i_1, A(k+i+1,i), 1, dV(d, k+i+1, i), 1, data->streams[d] ); // copy column of dV -> dVd, using block cyclic distribution. // This assumes V and Vd have been padded so that // a 2D matrix copy doesn't access them out-of-bounds gblock = k / nb; lblock = gblock / ngpu; lgid = gblock % ngpu; if ( d < lgid ) { lblock += 1; } // treat V as (nb*ngpu) x nblock matrix, and Vd as nb x nblock matrix magmablas_dlacpy( MagmaFull, nb, nblocks-lblock, dV (d, d*nb + lblock*nb*ngpu, i), nb*ngpu, dVd(d, 0 + lblock*nb, i), nb ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); // dY(k:n, i) = dA(k:n, k+i+1:n) * dV(k+i+1:n, i) // skip if matrix is empty // each GPU copies to different temporary vector in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemv( MagmaNoTrans, n-k, dn-dki1, c_one, dA (d, k, dki1), ldda, dVd(d, dki1, i), 1, c_zero, dY (d, k, i), 1 ); // copy vector to host, storing in column nb+d of Y // as temporary space (Y has >= nb+ngpu columns) magma_dgetvector_async( n-k, dY(d, k, i), 1, Y(k, nb+d), 1, data->streams[d] ); } } // while GPU is doing above Ag*v... // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_D_NEGATE( tau[i] ); blasf77_dgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // apply reflectors to next column, A(i+1), on right only. // one axpy will be required to finish this, in the next iteration above if ( i > 0 && i+1 < nb ) { // Update next column, A(k:n,i+1), applying Q on right. // One axpy will be required to finish this, in the next iteration // above, after yi is computed. // This updates one more row than LAPACK does (row k), // making block above panel an even multiple of nb. // Use last column of T as workspace, w. magma_int_t i1 = i+1; // If real, conjugate row of V, and undo afterwards #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // w = T(0:i, 0:i+1) * VA(k+i+1, 0:i+1)' // T is now rectangular, so we use gemv instead of trmv as in lapack. blasf77_dgemv( "No trans", &i, &i1, &c_one, T(0,0), &ldt, A(k+i1,0), &lda, &c_zero, T(0,nb-1), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i1, A(k+i1,0), &lda ); #endif // A(k:n, i+1) -= Y(k:n, 0:i) * w blasf77_dgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i1), &ione ); } // yi = sum_g yi{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( data->streams[d] ); magma_indices_1D_bcyclic( nb, ngpu, d, k+i+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // yi = yi + yi{d} blasf77_daxpy( &n_k, &c_one, Y(k,nb+d), &ione, Y(k,i), &ione ); } } } // Restore diagonal element *A(k+nb,nb-1) = ei; // compute Y = Am V = sum_g Am{d} V{d} --- top part, Y(0:k-1,:) for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magmablasSetKernelStream( data->streams[d] ); // convert global indices (k) to local indices (dk) magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); // dY(0:k, :) = dA(0:k, k+i+1:n-1) * dV(k+i+1:n-1, :) // skip if matrix is empty // each GPU copies to different temporary block in Y, // which are summed in separate loop below if ( dn-dki1 > 0 ) { magma_dgemm( MagmaNoTrans, MagmaNoTrans, k, nb, dn-dki1, c_one, dA (d, 0, dki1), ldda, dVd(d, dki1, 0), ldvd, c_zero, dY (d, 0, 0), ldda ); // copy result to host, storing in columns [nb + nb*d : nb + nb*(d+1)] of Y // as temporary space (Y has nb + nb*ngpu columns) magma_dgetmatrix_async( k, nb, dY(d, 0, 0), ldda, Y(0,nb+nb*d), ldy, data->streams[d] ); } } // Y = sum_g Y{d} for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_queue_sync( 0 ); magma_indices_1D_bcyclic( nb, ngpu, d, k+1, n, &dki1, &dn ); if ( dn-dki1 > 0 ) { // Y = Y + Am V for( i = 0; i < nb; ++i ) { blasf77_daxpy( &k, &c_one, Y(0,nb+nb*d+i), &ione, Y(0,i), &ione ); } } } // copy Y and T matrices to GPUs for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_dsetmatrix_async( n, nb, Y, ldy, dY(d, 0, 0), ldda, data->streams[d] ); magma_dsetmatrix_async( nb, nb, T, nb, dTi(d), nb, data->streams[d] ); } return 0; } /* magma_dlahr2 */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormqr_gpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max, dt_size; double *C, *R, *A, *hwork, *tau; magmaDouble_ptr dC, dA, dT; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; ldc = magma_roundup( m, opts.align ); // multiple of 32 by default // A is m x k (left) or n x k (right) mm = (side[iside] == MagmaLeft ? m : n); nb = magma_get_dgeqrf_nb( mm, k ); lda = magma_roundup( mm, opts.align ); // multiple of 32 by default gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaLeft ) { // side = left lwork_max = (m - k + nb)*(n + nb) + n*nb; dt_size = ( 2*min(m,k) + magma_roundup( max(m,n), 32) )*nb; } else { // side = right lwork_max = (n - k + nb)*(m + nb) + m*nb; dt_size = ( 2*min(n,k) + magma_roundup( max(m,n), 32 ) )*nb; } // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*k ); TESTING_MALLOC_CPU( hwork, double, lwork_max ); TESTING_MALLOC_CPU( tau, double, k ); TESTING_MALLOC_DEV( dC, double, ldc*n ); TESTING_MALLOC_DEV( dA, double, lda*k ); TESTING_MALLOC_DEV( dT, double, dt_size ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); magma_dsetmatrix( m, n, C, ldc, dC, ldc ); // A is m x k (left) or n x k (right) size = lda*k; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute QR factorization to get Householder vectors in dA, tau, dT magma_dsetmatrix( mm, k, A, lda, dA, lda ); magma_dgeqrf_gpu( mm, k, dA, lda, tau, dT, &info ); magma_dgetmatrix( mm, k, dA, lda, A, lda ); if (info != 0) { printf("magma_dgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormqr( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, hwork, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormqr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); if (info != 0) { printf("magma_dormqr_gpu (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( hwork[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } // dormqr2 takes a copy of dA in CPU memory if ( opts.version == 2 ) { magma_dgetmatrix( mm, k, dA, lda, A, lda ); } magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); // sync needed for L,N and R,T cases if ( opts.version == 1 ) { magma_dormqr_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, hwork, lwork, dT, nb, &info ); } else if ( opts.version == 2 ) { magma_dormqr2_gpu( side[iside], trans[itran], m, n, k, dA, lda, tau, dC, ldc, A, lda, &info ); } gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix( m, n, dC, ldc, R, ldc ); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, work ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, work ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dC ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; double magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 1.5, -2.3 ); double beta = MAGMA_D_MAKE( -0.6, 0.8 ); double *A, *X, *Y, *Ycublas, *Ymagma; double *dA, *dX, *dY; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_DGEMV( M, N ) / 1e9; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N; sizeX = incx*Xm; sizeY = incy*Ym; TESTING_MALLOC_CPU( A, double, sizeA ); TESTING_MALLOC_CPU( X, double, sizeX ); TESTING_MALLOC_CPU( Y, double, sizeY ); TESTING_MALLOC_CPU( Ycublas, double, sizeY ); TESTING_MALLOC_CPU( Ymagma, double, sizeY ); TESTING_MALLOC_DEV( dA, double, sizeA ); TESTING_MALLOC_DEV( dX, double, sizeX ); TESTING_MALLOC_DEV( dY, double, sizeY ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &sizeA, A ); lapackf77_dlarnv( &ione, ISEED, &sizeX, X ); lapackf77_dlarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( M, N, A, lda, dA, lda ); magma_dsetvector( Xm, X, incx, dX, incx ); magma_dsetvector( Ym, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasDgemv( handle, cublas_trans_const(opts.transA), M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetvector( Ym, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_dsetvector( Ym, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); magmablas_dgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy ); magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_dgetvector( Ym, dY, incx, Ymagma, incx ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_dgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_dlange( "M", &Ym, &ione, Ymagma, &Ym, work ) / Ym; blasf77_daxpy( &Ym, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_dlange( "M", &Ym, &ione, Ycublas, &Ym, work ) / Ym; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol ? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( X ); TESTING_FREE_CPU( Y ); TESTING_FREE_CPU( Ycublas ); TESTING_FREE_CPU( Ymagma ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dX ); TESTING_FREE_DEV( dY ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; opts.parse_opts( argc, argv ); opts.ngpu = abs( opts.ngpu ); // always uses multi-GPU code opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double eps = lapackf77_dlamch("E"); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("%%===============================================================================================\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("%%=========================================================================\n"); } for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, magma_ceildiv(N,nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, NULL, &M, NULL, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_PIN( h_R, double, n2 ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for( int j=0; j < 4; j++ ) ISEED2[j] = ISEED[j]; // save seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau2; TESTING_MALLOC_CPU( tau2, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &lda, tau2, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } TESTING_FREE_CPU( tau2 ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); } magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); if ( opts.check == 1 && M >= N ) { /* ===================================================================== Check the result -- dqrt02 requires M >= N =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC_CPU( h_W1, double, n2 ); // Q TESTING_MALLOC_CPU( h_W2, double, n2 ); // R TESTING_MALLOC_CPU( h_W3, double, lwork ); // WORK TESTING_MALLOC_CPU( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0], results[1] ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0], results[1] ); } // todo also check results[1] < tol? printf(" %s\n", (results[0] < tol ? "ok" : "failed")); status += ! (results[0] < tol); TESTING_FREE_CPU( h_W1 ); TESTING_FREE_CPU( h_W2 ); TESTING_FREE_CPU( h_W3 ); TESTING_FREE_CPU( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) ---", (int) M, (int) N, gpu_perf, gpu_time); } printf("%s\n", (opts.check != 0 ? " (error check only for M >= N)" : "")); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; double *d_A, *d_T, *ddA, *dtau; double *d_A2, *d_T2, *ddA2, *dtau2; double *dwork, *dwork2; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; #define BLOCK_SIZE 64 magma_opts opts; parse_opts( argc, argv, &opts ); double tol = 10. * opts.tolerance * lapackf77_dlamch("E"); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); printf("version %d\n", (int) opts.version ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if (N > 128) { printf("%5d %5d skipping because dgeqr2x requires N <= 128\n", (int) M, (int) N); continue; } if (M < N) { printf("%5d %5d skipping because dgeqr2x requires M >= N\n", (int) M, (int) N); continue; } min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N )) / 1e9; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); TESTING_MALLOC_DEV( d_T2, double, N*N ); TESTING_MALLOC_DEV( ddA2, double, N*N ); TESTING_MALLOC_DEV( dtau2, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); TESTING_MALLOC_DEV( dwork2, double, max(5*min_mn, (BLOCK_SIZE*2+2)*min_mn) ); // todo replace with magma_dlaset cudaMemset(ddA, 0, N*N*sizeof(double)); cudaMemset(d_T, 0, N*N*sizeof(double)); cudaMemset(ddA2, 0, N*N*sizeof(double)); cudaMemset(d_T2, 0, N*N*sizeof(double)); lwork = -1; lapackf77_dgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, double, lwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_R, lda, d_A2, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime(0); if (opts.version == 1) magma_dgeqr2x_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 2) magma_dgeqr2x2_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else if (opts.version == 3) magma_dgeqr2x3_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info); else { printf( "call magma_dgeqr2x4_gpu\n" ); /* Going through NULL stream is faster Going through any stream is slower Doing two streams in parallel is slower than doing them sequentially Queuing happens on the NULL stream - user defined buffers are smaller? */ magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, NULL); //magma_dgeqr2x4_gpu(M, N, d_A, ldda, dtau, d_T, ddA, dwork, &info, stream[1]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, stream[0]); //magma_dgeqr2x4_gpu(M, N, d_A2, ldda, dtau2, d_T2, ddA2, dwork2, &info, NULL); //gflops *= 2; } gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgeqr2x_gpu version %d returned error %d: %s.\n", (int) opts.version, (int) info, magma_strerror( info )); } else { if ( opts.check ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); //magma_dgeqr2(&M, &N, h_A, &lda, tau, h_work, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, ldda, h_R, M ); magma_dgetmatrix( N, N, ddA, N, h_T, N ); // Restore the upper triangular part of A before the check for(int col=0; col < N; col++){ for(int row=0; row <= col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / (N * error); // Check if T is the same magma_dgetmatrix( N, N, d_T, N, h_T, N ); double terr = 0.; for(int col=0; col < N; col++) for(int row=0; row <= col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_T ); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); TESTING_FREE_DEV( dwork ); TESTING_FREE_DEV( d_A2 ); TESTING_FREE_DEV( d_T2 ); TESTING_FREE_DEV( ddA2 ); TESTING_FREE_DEV( dtau2 ); TESTING_FREE_DEV( dwork2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *VL, *VR, *h_work, *w1, *w2; double *w1i, *w2i; double c_neg_one = MAGMA_D_NEG_ONE; double matnorm, tnrm, result[8]; /* Matrix size */ magma_int_t N=0, n2, lda, nb, lwork; magma_int_t size[8] = {1024,2048,3072,4032,5184,6016,7040,8064}; magma_int_t i, j, info, checkres, once = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_vec_t jobl = MagmaVec; magma_vec_t jobr = MagmaVec; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = 1; } else if (strcmp("-LN", argv[i])==0) jobl = MagmaNoVec; else if (strcmp("-LV", argv[i])==0) jobl = MagmaVec; else if (strcmp("-RN", argv[i])==0) jobr = MagmaNoVec; else if (strcmp("-RV", argv[i])==0) jobr = MagmaVec; } if ( N > 0 ) printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", (int) N); else { printf("\nUsage: \n"); printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeev -L[N|V] -R[N|V] -N %d\n\n", 1024); N = size[7]; } checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; lda = N; n2 = lda * N; nb = magma_get_dgehrd_nb(N); lwork = N*(2+nb); // generous workspace - required by dget22 lwork = max(lwork, N * ( 5 + 2*N)); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( w1i, double, N ); TESTING_MALLOC_CPU( w2i, double, N ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_PIN( VL, double, n2 ); TESTING_MALLOC_PIN( VR, double, n2 ); TESTING_MALLOC_PIN( h_work, double, lwork ); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf(" N CPU Time(s) GPU Time(s) ||R||_F / ||A||_F\n"); printf("==========================================================\n"); for(i=0; i<8; i++){ if ( argc == 1 ){ N = size[i]; } lda = N; n2 = lda*N; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up magma_dgeev(jobl, jobr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); gpu_time = magma_wtime(); magma_dgeev(jobl, jobr, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgeev had an illegal value.\n", (int) -info); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeev(lapack_const(jobl), lapack_const(jobr), &N, h_A, &lda, w2, w2i, VL, &lda, VR, &lda, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of dgeev had an illegal value.\n", (int) -info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ if ( checkres ) { /* =================================================================== Check the result following LAPACK's [zcds]drvev routine. The following 7 tests are performed: * (1) | A * VR - VR * W | / ( n |A| ) * * Here VR is the matrix of unit right eigenvectors. * W is a diagonal matrix with diagonal entries W(j). * * (2) | A**T * VL - VL * W**T | / ( n |A| ) * * Here VL is the matrix of unit left eigenvectors, A**T is the * ugate-transpose of A, and W is as above. * * (3) | |VR(i)| - 1 | and whether largest component real * * VR(i) denotes the i-th column of VR. * * (4) | |VL(i)| - 1 | and whether largest component real * * VL(i) denotes the i-th column of VL. * * (5) W(full) = W(partial) * * W(full) denotes the eigenvalues computed when both VR and VL * are also computed, and W(partial) denotes the eigenvalues * computed when only W, only W and VR, or only W and VL are * computed. * * (6) VR(full) = VR(partial) * * VR(full) denotes the right eigenvectors computed when both VR * and VL are computed, and VR(partial) denotes the result * when only VR is computed. * * (7) VL(full) = VL(partial) * * VL(full) denotes the left eigenvectors computed when both VR * and VL are also computed, and VL(partial) denotes the result * when only VL is computed. ================================================================= */ int jj; double ulp, ulpinv, vmx, vrmx, vtst, res[2]; double *LRE, DUM; TESTING_MALLOC_PIN( LRE, double, n2 ); ulp = lapackf77_dlamch( "P" ); ulpinv = 1./ulp; // Initialize RESULT for (j = 0; j < 8; j++) result[j] = -1.; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaVec, MagmaVec, N, h_R, lda, w1, w1i, VL, lda, VR, lda, h_work, lwork, &info, queue); // Do test 1 lapackf77_dget22("N", "N", "N", &N, h_A, &lda, VR, &lda, w1, w1i, h_work, res); result[0] = res[0]; result[0] *= ulp; // Do test 2 lapackf77_dget22("T", "N", "T", &N, h_A, &lda, VL, &lda, w1, w1i, h_work, &result[1]); result[1] *= ulp; // Do test 3 result[2] = -1.; for (j = 0; j < N; ++j) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_dnrm2(N, &VR[j * lda], ione); else if (w1i[j] > 0.) tnrm = magma_dlapy2( cblas_dnrm2(N, &VR[j * lda], ione), cblas_dnrm2(N, &VR[(j+1)* lda], ione) ); result[2] = fmax(result[2], fmin(ulpinv, magma_abs(tnrm-1.)/ulp)); if (w1i[j] > 0.) { vmx = vrmx = 0.; for (jj = 0; jj <N; ++jj) { vtst = magma_dlapy2( VR[jj+j*lda], VR[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VR[jj + (j+1)*lda])==0. && magma_abs( VR[jj+j*lda] ) > vrmx) vrmx = magma_abs( VR[jj+j*lda] ); } if (vrmx / vmx < 1. - ulp * 2.) result[2] = ulpinv; } } result[2] *= ulp; // Do test 4 result[3] = -1.; for (j = 0; j < N; ++j) { tnrm = 1.; if (w1i[j] == 0.) tnrm = cblas_dnrm2(N, &VL[j * lda], ione); else if (w1i[j] > 0.) tnrm = magma_dlapy2( cblas_dnrm2(N, &VL[j * lda], ione), cblas_dnrm2(N, &VL[(j+1)* lda], ione) ); result[3] = fmax(result[3], fmin(ulpinv, magma_abs(tnrm-1.)/ulp)); if (w1i[j] > 0.) { vmx = vrmx = 0.; for (jj = 0; jj <N; ++jj) { vtst = magma_dlapy2( VL[jj+j*lda], VL[jj+(j+1)*lda]); if (vtst > vmx) vmx = vtst; if ( (VL[jj + (j+1)*lda])==0. && magma_abs( VL[jj+j*lda]) > vrmx) vrmx = magma_abs( VL[jj+j*lda] ); } if (vrmx / vmx < 1. - ulp * 2.) result[3] = ulpinv; } } result[3] *= ulp; // Compute eigenvalues only, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaNoVec, MagmaNoVec, N, h_R, lda, w2, w2i, &DUM, 1, &DUM, 1, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case N, N\n", (int) info); } // Do test 5 result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with N N\n"); // Compute eigenvalues and right eigenvectors, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaNoVec, MagmaVec, N, h_R, lda, w2, w2i, &DUM, 1, LRE, lda, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case N, V\n", (int) info); } // Do test 5 again result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with N V\n"); // Do test 6 result[5] = 1; for (j = 0; j < N; ++j) for (jj = 0; jj < N; ++jj) if ( VR[j+jj*lda] != LRE[j+jj*lda] ) result[5] = 0; // Compute eigenvalues and left eigenvectors, and test them lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dgeev(MagmaVec, MagmaNoVec, N, h_R, lda, w2, w2i, LRE, lda, &DUM, 1, h_work, lwork, &info, queue); if (info != 0) { result[0] = ulpinv; info = abs(info); printf("Info = %d fo case V, N\n", (int) info); } // Do test 5 again result[4] = 1; for (j = 0; j < N; ++j) if ( w1[j] != w2[j] || w1i[j] != w2i[j] ) result[4] = 0; //if (result[4] == 0) printf("test 5 failed with V N\n"); // Do test 7 result[6] = 1; for (j = 0; j < N; ++j) for (jj = 0; jj < N; ++jj) if ( VL[j+jj*lda] != LRE[j+jj*lda] ) result[6] = 0; printf("Test 1: | A * VR - VR * W | / ( n |A| ) = %e\n", result[0]); printf("Test 2: | A'* VL - VL * W'| / ( n |A| ) = %e\n", result[1]); printf("Test 3: | |VR(i)| - 1 | = %e\n", result[2]); printf("Test 4: | |VL(i)| - 1 | = %e\n", result[3]); printf("Test 5: W (full) == W (partial) = %f\n", result[4]); printf("Test 6: VR (full) == VR (partial) = %f\n", result[5]); printf("Test 7: VL (full) == VL (partial) = %f\n", result[6]); //==================================================================== matnorm = lapackf77_dlange("f", &N, &ione, w1, &N, h_work); blasf77_daxpy(&N, &c_neg_one, w1, &ione, w2, &ione); result[7] = lapackf77_dlange("f", &N, &ione, w2, &N, h_work) / matnorm; printf("%5d %6.2f %6.2f %e\n", (int) N, cpu_time, gpu_time, result[7]); TESTING_FREE_PIN( LRE ); } else { printf("%5d %6.2f %6.2f\n", (int) N, cpu_time, gpu_time); } if (argc != 1) break; } /* Memory clean up */ TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( w1i ); TESTING_FREE_CPU( w2i ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( VL ); TESTING_FREE_PIN( VR ); TESTING_FREE_PIN( h_work ); /* Shutdown */ magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_T, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_T, ddA, dtau; magmaDouble_ptr dwork; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lwork; const int MAXTESTS = 10; magma_int_t msize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t nsize[MAXTESTS] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8100, 8192 }; magma_int_t i, info, min_mn; magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t checkres; checkres = getenv("MAGMA_TESTINGS_CHECK") != NULL; // process command line arguments printf( "\nUsage: %s -N <m,n> -c\n", argv[0] ); printf( " -N can be repeated up to %d times. If only m is given, then m=n.\n", MAXTESTS ); printf( " -c or setting $MAGMA_TESTINGS_CHECK runs LAPACK and checks result.\n\n" ); int ntest = 0; for( int i = 1; i < argc; ++i ) { if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ) { magma_assert( ntest < MAXTESTS, "error: -N repeated more than maximum %d tests\n", MAXTESTS ); int m, n; info = sscanf( argv[++i], "%d,%d", &m, &n ); if ( info == 2 && m > 0 && n > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = n; } else if ( info == 1 && m > 0 ) { msize[ ntest ] = m; nsize[ ntest ] = m; // implicitly } else { printf( "error: -N %s is invalid; ensure m > 0, n > 0.\n", argv[i] ); exit(1); } M = max( M, msize[ ntest ] ); N = max( N, nsize[ ntest ] ); ntest++; } else if ( strcmp("-M", argv[i]) == 0 ) { printf( "-M has been replaced in favor of -N m,n to allow -N to be repeated.\n\n" ); exit(1); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = true; } else { printf( "invalid argument: %s\n", argv[i] ); exit(1); } } if ( ntest == 0 ) { ntest = MAXTESTS; M = msize[ntest-1]; N = nsize[ntest-1]; } ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } /* Allocate memory for the matrix */ TESTING_MALLOC_PIN( tau, double, min_mn ); TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_T, double, N*N ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_T, double, N*N ); TESTING_MALLOC_DEV( ddA, double, N*N ); TESTING_MALLOC_DEV( dtau, double, min_mn ); TESTING_MALLOC_DEV( dwork, double, max(5*min_mn, (32*2+2)*min_mn) ); double *h1 = (double*)malloc(sizeof(double)*N*N); memset(h1, 0, N*N*sizeof(double)); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_PIN( h_work, double, lwork ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R||_F/||A||_F ||R_T||\n"); printf("=============================================================================\n"); for( i = 0; i < ntest; ++i ) { M = msize[i]; N = nsize[i]; min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRT( M, N)) / 1e9; /* Initialize the matrix */ magma_int_t ISEED[4] = {0,0,0,1}; lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // warm-up // magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); /* magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue ); clEnqueueWriteBuffer(queue, ddA, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); clEnqueueWriteBuffer(queue, d_T, CL_TRUE, 0, sizeof(double)*N*N, h1, 0, NULL, NULL); */ gpu_time = magma_wtime(); magma_dgeqr2x3_gpu(&M, &N, d_A, 0, &ldda, dtau, 0, d_T, 0, ddA, 0, dwork, 0, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d.\n", (int) info); if ( checkres ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &M, &N, h_A, &lda, tau, h_work, &N); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d.\n", (int) info); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue ); magma_dgetmatrix( N, N, ddA, 0, N, h_T, 0, N, queue ); // Restore the upper triangular part of A before the check for(int col=0; col<N; col++){ for(int row=0; row<=col; row++) h_R[row + col*M] = h_T[row + col*N]; } error = lapackf77_dlange("M", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("M", &M, &N, h_R, &lda, work) / error; // Check if T is the same double terr = 0.; magma_dgetmatrix( N, N, d_T, 0, N, h_T, 0, N, queue ); for(int col=0; col<N; col++) for(int row=0; row<=col; row++) terr += ( MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N])* MAGMA_D_ABS(h_work[row + col*N] - h_T[row + col*N]) ); terr = magma_dsqrt(terr); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) M, (int) N, cpu_perf, 1000.*cpu_time, gpu_perf, 1000.*gpu_time, error, terr); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, 1000.*gpu_time); } } /* Memory clean up */ TESTING_FREE_PIN( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_T ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_T ); TESTING_FREE_DEV( ddA ); TESTING_FREE_DEV( dtau ); free(h1); magma_queue_destroy( queue ); magma_finalize(); }
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double *h_R = NULL, *h_P = NULL; magmaDouble_ptr d_lA[MagmaMaxSubs * MagmaMaxGPUs]; magma_int_t N = 0, n2, lda, ldda; magma_int_t size[10] = { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 }; magma_int_t i, j, k, check = 0, info; double mz_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t num_gpus0 = 1, num_gpus, num_subs0 = 1, num_subs, tot_subs, flag = 0; magma_int_t nb, n_local, nk; magma_uplo_t uplo = MagmaLower; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i]) == 0){ N = atoi(argv[++i]); if (N > 0) { size[0] = size[9] = N; flag = 1; } } if(strcmp("-NGPU", argv[i]) == 0) num_gpus0 = atoi(argv[++i]); if(strcmp("-NSUB", argv[i]) == 0) num_subs0 = atoi(argv[++i]); if(strcmp("-UPLO", argv[i]) == 0) uplo = (strcmp("L", argv[++i]) == 0 ? MagmaLower : MagmaUpper); if(strcmp("-check", argv[i]) == 0) check = 1; } } /* Initialize */ magma_queue_t queues[2*MagmaMaxGPUs]; magma_device_t devices[ MagmaMaxGPUs ]; magma_int_t num = 0; magma_int_t err; magma_init(); err = magma_getdevices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_getdevices failed: %d\n", (int) err ); exit(-1); } for(i=0;i<num_gpus0;i++){ err = magma_queue_create( devices[i], &queues[2*i] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } err = magma_queue_create( devices[i], &queues[2*i+1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", (int) err ); exit(-1); } } printf("\nUsing %d GPUs:\n", num_gpus0); printf(" testing_dpotrf_msub -N %d -NGPU %d -NSUB %d -UPLO %c %s\n\n", size[0], num_gpus0,num_subs0, (uplo == MagmaLower ? 'L' : 'U'),(check == 1 ? "-check" : " ")); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma-R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================================================\n"); for(i=0; i<10; i++){ N = size[i]; lda = N; n2 = lda*N; gflops = FLOPS_DPOTRF( N ) / 1e9;; nb = magma_get_dpotrf_nb(N); if (num_subs0*num_gpus0 > N/nb) { num_gpus = N/nb; num_subs = 1; if(N%nb != 0) num_gpus ++; printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus); } else { num_gpus = num_gpus0; num_subs = num_subs0; } tot_subs = num_subs * num_gpus; /* Allocate host memory for the matrix */ #ifdef USE_PINNED_CLMEMORY cl_mem buffer1 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, n2*sizeof(double), NULL, NULL); cl_mem buffer2 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, lda*nb*sizeof(double), NULL, NULL); for (k=0; k<num_gpus; k++) { h_R = (double*)clEnqueueMapBuffer(queues[2*k], buffer1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, n2*sizeof(double), 0, NULL, NULL, NULL); h_P = (double*)clEnqueueMapBuffer(queues[2*k], buffer2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lda*nb*sizeof(double), 0, NULL, NULL, NULL); } #else TESTING_MALLOC_PIN( h_P, double, lda*nb ); TESTING_MALLOC_PIN( h_R, double, n2 ); #endif /* Initialize the matrix */ init_matrix( N, h_R, lda ); /* Allocate GPU memory */ if (uplo == MagmaUpper) { ldda = ((N+nb-1)/nb)*nb; n_local = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; } else { ldda = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb; n_local = ((N+nb-1)/nb)*nb; } for (j=0; j<tot_subs; j++) { TESTING_MALLOC_DEV( d_lA[j], double, n_local*ldda ); } /* Warm up to measure the performance */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_dsetmatrix( j+nk, nk, &h_R[j*lda], lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (magma_int_t kk = 0; kk<tot_subs; kk++) { magma_int_t mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { magma_int_t mii = min(nb, N-ii); lapackf77_dlacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_dsetmatrix( mk, nk, h_P, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_dsetmatrix( nk, j+nk, &h_R[j], lda, d_lA[k], j/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); }*/ } magma_dpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* distribute matrix to gpus */ if (uplo == MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_dsetmatrix( j+nk, nk, &h_R[j*lda], lda, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (magma_int_t kk = 0; kk<tot_subs; kk++) { magma_int_t mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { magma_int_t mii = min(nb, N-ii); lapackf77_dlacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda ); mk += mii; } k = ((j+kk*nb)/nb)%tot_subs; if (mk > 0 && nk > 0) { magma_dsetmatrix( mk, nk, h_P, lda, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, queues[2*(k%num_gpus)]); } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_dsetmatrix( nk, j+nk, &h_R[j], lda, d_lA[k], (j/(nb*tot_subs)*nb), ldda, queues[2*(k%num_gpus)]); }*/ } gpu_time = magma_wtime(); magma_dpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf( "magma_dpotrf had error %d.\n", info ); /* gather matrix from gpus */ if (uplo==MagmaUpper) { for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_dgetmatrix( j+nk, nk, d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, &h_R[j*lda], lda, queues[2*(k%num_gpus)]); } } else { for (j=0; j<N; j+=nb) { nk = min(nb, N-j); for (magma_int_t kk = 0; kk<tot_subs; kk++) { k = ((j+kk*nb)/nb)%tot_subs; magma_int_t mk = 0; mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { mk += min(nb, N-ii); } if (mk > 0 && nk > 0) { magma_dgetmatrix( mk, nk, d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, h_P, lda, queues[2*(k%num_gpus)]); } mk = 0; for (magma_int_t ii=j+kk*nb; ii<N; ii+=nb*tot_subs) { magma_int_t mii = min(nb, N-ii); lapackf77_dlacpy( MagmaFullStr, &mii, &nk, &h_P[mk], &lda, &h_R[ii+j*lda], &lda ); mk += mii; } } } /*for (j=0; j<N; j+=nb) { k = (j/nb)%tot_subs; nk = min(nb, N-j); magma_dgetmatrix( nk, j+nk, d_lA[k], (j/(nb*tot_subs)*nb), ldda, &h_R[j], lda, queues[2*(k%num_gpus)] ); }*/ } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if (check == 1) { double work[1], matnorm, diffnorm; double *h_A; TESTING_MALLOC_PIN( h_A, double, n2 ); init_matrix( N, h_A, lda ); cpu_time = magma_wtime(); if (uplo == MagmaLower) { lapackf77_dpotrf( MagmaLowerStr, &N, h_A, &lda, &info ); } else { lapackf77_dpotrf( MagmaUpperStr, &N, h_A, &lda, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf( "lapackf77_dpotrf had error %d.\n", info ); /* ===================================================================== Check the result compared to LAPACK |R_magma - R_lapack| / |R_lapack| =================================================================== */ matnorm = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &mz_one, h_A, &ione, h_R, &ione); diffnorm = lapackf77_dlange("f", &N, &N, h_R, &lda, work); printf( "%5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm ); TESTING_FREE_PIN( h_A ); } else { printf( "%5d - - (- -) %6.2f (%6.2f) - -\n", N, gpu_perf, gpu_time ); } // free memory #ifdef USE_PINNED_CLMEMORY for (k=0; k<num_gpus; k++) { clEnqueueUnmapMemObject(queues[2*k], buffer1, h_R, 0, NULL, NULL); clEnqueueUnmapMemObject(queues[2*k], buffer2, h_P, 0, NULL, NULL); } clReleaseMemObject(buffer1); clReleaseMemObject(buffer2); #else TESTING_FREE_PIN( h_P ); TESTING_FREE_PIN( h_R ); #endif for (j=0; j<tot_subs; j++) { TESTING_FREE_DEV( d_lA[j] ); } if (flag != 0) break; } /* clean up */ for (i=0; i<num_gpus; i++) { magma_queue_destroy( queues[2*i] ); magma_queue_destroy( queues[2*i+1] ); } magma_finalize(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormlq */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t mm, m, n, k, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; double *C, *R, *A, *W, *tau; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf(" M N K side trans CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||QC||_F\n"); printf("===============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_dgelqf_nb( min( m, n )); ldc = m; // A is k x m (left) or k x n (right) mm = (side[iside] == MagmaLeft ? m : n); lda = k; gflops = FLOPS_DORMLQ( m, n, k, side[iside] ) / 1e9; if ( side[iside] == MagmaLeft && m < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=left and m < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } if ( side[iside] == MagmaRight && n < k ) { printf( "%5d %5d %5d %4c %5c skipping because side=right and n < k\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ) ); continue; } // need at least 2*nb*nb for gelqf lwork_max = max( max( m*nb, n*nb ), 2*nb*nb ); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*mm ); TESTING_MALLOC_CPU( W, double, lwork_max ); TESTING_MALLOC_CPU( tau, double, k ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); lapackf77_dlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*mm; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute LQ factorization to get Householder vectors in A, tau magma_dgelqf( k, mm, A, lda, tau, W, lwork_max, &info ); if (info != 0) printf("magma_dgelqf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormlq( lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, W, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dormlq returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); if (info != 0) printf("magma_dormlq (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = (magma_int_t) MAGMA_D_REAL( W[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("optimal lwork %d > lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_dormlq( side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, W, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dormlq returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ error = lapackf77_dlange( "Fro", &m, &n, C, &ldc, work ); size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, work ) / error; printf( "%5d %5d %5d %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( W ); TESTING_FREE_CPU( tau ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iside, itran printf( "\n" ); } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dlatrd2(char uplo, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *e, double *tau, double *w, magma_int_t ldw, double *da, magma_int_t ldda, double *dw, magma_int_t lddw, double *dwork, magma_int_t ldwork) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= DLATRD2 reduces NB rows and columns of a real symmetric matrix A to symmetric tridiagonal form by an orthogonal similarity transformation Q' * A * Q, and returns the matrices V and W which are needed to apply the transformation to the unreduced part of A. If UPLO = 'U', DLATRD reduces the last NB rows and columns of a matrix, of which the upper triangle is supplied; if UPLO = 'L', DLATRD reduces the first NB rows and columns of a matrix, of which the lower triangle is supplied. This is an auxiliary routine called by DSYTRD2_GPU. It uses an accelerated HEMV that needs extra memory. Arguments ========= UPLO (input) CHARACTER*1 Specifies whether the upper or lower triangular part of the symmetric matrix A is stored: = 'U': Upper triangular = 'L': Lower triangular N (input) INTEGER The order of the matrix A. NB (input) INTEGER The number of rows and columns to be reduced. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading n-by-n upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading n-by-n lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit: if UPLO = 'U', the last NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements above the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the first NB columns have been reduced to tridiagonal form, with the diagonal elements overwriting the diagonal elements of A; the elements below the diagonal with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= (1,N). E (output) DOUBLE_PRECISION array, dimension (N-1) If UPLO = 'U', E(n-nb:n-1) contains the superdiagonal elements of the last NB columns of the reduced matrix; if UPLO = 'L', E(1:nb) contains the subdiagonal elements of the first NB columns of the reduced matrix. TAU (output) DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors, stored in TAU(n-nb:n-1) if UPLO = 'U', and in TAU(1:nb) if UPLO = 'L'. See Further Details. W (output) DOUBLE_PRECISION array, dimension (LDW,NB) The n-by-nb matrix W required to update the unreduced part of A. LDW (input) INTEGER The leading dimension of the array W. LDW >= max(1,N). Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n) H(n-1) . . . H(n-nb+1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i:n) = 0 and v(i-1) = 1; v(1:i-1) is stored on exit in A(1:i-1,i), and tau in TAU(i-1). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+1:n) is stored on exit in A(i+1:n,i), and tau in TAU(i). The elements of the vectors v together form the n-by-nb matrix V which is needed, with W, to apply the transformation to the unreduced part of the matrix, using a symmetric rank-2k update of the form: A := A - V*W' - W*V'. The contents of A on exit are illustrated by the following examples with n = 5 and nb = 2: if UPLO = 'U': if UPLO = 'L': ( a a a v4 v5 ) ( d ) ( a a v4 v5 ) ( 1 d ) ( a 1 v5 ) ( v1 1 a ) ( d 1 ) ( v1 v2 a a ) ( d ) ( v1 v2 a a a ) where d denotes a diagonal element of the reduced matrix, a denotes an element of the original matrix that is unchanged, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t i; double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double value = MAGMA_D_ZERO; magma_int_t ione = 1; magma_int_t i_n, i_1, iw; double alpha; double *f; if (n <= 0) { return 0; } magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, n ); assert( f != NULL ); // TODO return error, or allocate outside dlatrd if (lapackf77_lsame(uplo_, "U")) { /* Reduce last NB columns of upper triangle */ for (i = n-1; i >= n - nb ; --i) { i_1 = i + 1; i_n = n - i - 1; iw = i - n + nb; if (i < n-1) { /* Update A(1:i,i) */ #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw); #endif blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, A(0, i+1), &lda, W(i, iw+1), &ldw, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, W(i, iw+1), &ldw); lapackf77_dlacgv(&i_n, A(i, i+1), &ldw); #endif blasf77_dgemv("No transpose", &i_1, &i_n, &c_neg_one, W(0, iw+1), &ldw, A(i, i+1), &lda, &c_one, A(0, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i_n, A(i, i+1), &ldw); #endif } if (i > 0) { /* Generate elementary reflector H(i) to annihilate A(1:i-2,i) */ alpha = *A(i-1, i); lapackf77_dlarfg(&i, &alpha, A(0, i), &ione, &tau[i - 1]); e[i-1] = MAGMA_D_REAL( alpha ); *A(i-1,i) = MAGMA_D_MAKE( 1, 0 ); /* Compute W(1:i-1,i) */ // 1. Send the block reflector A(0:n-i-1,i) to the GPU magma_dsetvector( i, A(0, i), 1, dA(0, i), 1 ); //#if (GPUSHMEM < 200) //magma_dsymv(MagmaUpper, i, c_one, dA(0, 0), ldda, // dA(0, i), ione, c_zero, dW(0, iw), ione); //#else magmablas_dsymv_work(MagmaUpper, i, c_one, dA(0, 0), ldda, dA(0, i), ione, c_zero, dW(0, iw), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_dgetmatrix_async( i, 1, dW(0, iw), lddw, W(0, iw) /*test*/, ldw, stream ); if (i < n-1) { blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, W(0, iw+1), &ldw, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); } // 3. Here is where we need it // TODO find the right place magma_queue_sync( stream ); if (i < n-1) { blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, A(0, i+1), &lda, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); blasf77_dgemv(MagmaTransStr, &i, &i_n, &c_one, A(0, i+1), &lda, A(0, i), &ione, &c_zero, W(i+1, iw), &ione); blasf77_dgemv("No transpose", &i, &i_n, &c_neg_one, W(0, iw+1), &ldw, W(i+1, iw), &ione, &c_one, W(0, iw), &ione); } blasf77_dscal(&i, &tau[i - 1], W(0, iw), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i, W(0,iw), ione, A(0,i), ione, &value ); #else value = cblas_ddot( i, W(0,iw), ione, A(0,i), ione ); #endif alpha = tau[i - 1] * -0.5f * value; blasf77_daxpy(&i, &alpha, A(0, i), &ione, W(0, iw), &ione); } } } else { /* Reduce first NB columns of lower triangle */ for (i = 0; i < nb; ++i) { /* Update A(i:n,i) */ i_n = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(i, 0), &ldw); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i, 0), &lda, W(i, 0), &ldw, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, W(i, 0), &ldw); lapackf77_dlacgv(&i, A(i ,0), &lda); #endif blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i, 0), &ldw, A(i, 0), &lda, &c_one, A(i, i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i, A(i, 0), &lda); #endif if (i < n-1) { /* Generate elementary reflector H(i) to annihilate A(i+2:n,i) */ i_n = n - i - 1; alpha = *A(i+1, i); lapackf77_dlarfg(&i_n, &alpha, A(min(i+2,n-1), i), &ione, &tau[i]); e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = MAGMA_D_MAKE( 1, 0 ); /* Compute W(i+1:n,i) */ // 1. Send the block reflector A(i+1:n,i) to the GPU magma_dsetvector( i_n, A(i+1, i), 1, dA(i+1, i), 1 ); //#if (GPUSHMEM < 200) //magma_dsymv(MagmaLower, i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, // dW(i+1, i), ione); //#else magmablas_dsymv_work('L', i_n, c_one, dA(i+1, i+1), ldda, dA(i+1, i), ione, c_zero, dW(i+1, i), ione, dwork, ldwork); //#endif // 2. Start putting the result back (asynchronously) magma_dgetmatrix_async( i_n, 1, dW(i+1, i), lddw, W(i+1, i), ldw, stream ); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, W(i+1, 0), &ldw, A(i+1, i), &ione, &c_zero, W(0, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, A(i+1, 0), &lda, W(0, i), &ione, &c_zero, f, &ione); blasf77_dgemv(MagmaTransStr, &i_n, &i, &c_one, A(i+1, 0), &lda, A(i+1, i), &ione, &c_zero, W(0, i), &ione); // 3. Here is where we need it magma_queue_sync( stream ); if (i!=0) blasf77_daxpy(&i_n, &c_one, f, &ione, W(i+1, i), &ione); blasf77_dgemv("No transpose", &i_n, &i, &c_neg_one, W(i+1, 0), &ldw, W(0, i), &ione, &c_one, W(i+1, i), &ione); blasf77_dscal(&i_n, &tau[i], W(i+1,i), &ione); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i_n, W(i+1,i), ione, A(i+1,i), ione, &value ); #else value = cblas_ddot( i_n, W(i+1,i), ione, A(i+1,i), ione ); #endif alpha = tau[i] * -0.5f * value; blasf77_daxpy(&i_n, &alpha, A(i+1, i), &ione, W(i+1,i), &ione); } } } magma_free_cpu(f); magma_queue_destroy( stream ); return 0; } /* dlatrd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotf2_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; magmaDouble_ptr d_A; magma_int_t N, n2, lda, ldda, info; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf(" N CPU GFlop/s (ms) GPU GFlop/s (ms) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_DPOTRF( N ) / 1e9; if ( N > 512 ) { printf( "%5d skipping because dpotf2 does not support N > 512\n", (int) N ); continue; } TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dpotf2_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time*1000. ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; magma_int_t M, N, n2, lda, lwork, info, min_mn, nb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; opts.lapack |= ( opts.check == 2 ); // check (-c2) implies lapack (-l) printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("===============================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=======================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; nb = magma_get_dgeqrf_nb(M); gflops = FLOPS_DGEQRF( M, N ) / 1e9; lwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); lwork = max( lwork, max( N*nb, 2*nb*nb )); TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); TESTING_MALLOC( h_work, double, lwork ); /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ double *tau; TESTING_MALLOC( tau, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, double, n2 ); // Q TESTING_MALLOC( h_W2, double, n2 ); // R TESTING_MALLOC( h_W3, double, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work) / error; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e", (int) M, (int) N, gpu_perf, gpu_time, error ); } printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R; double *d_A; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t N, n2, lda, ldda, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("=================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_DPOTRI( N ) / 1e9; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dmake_hpd( N, h_A, lda ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* factorize matrix */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); magma_dpotrf_gpu( opts.uplo, N, d_A, ldda, &info ); // check for exact singularity //magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); //h_R[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_R, lda, d_A, ldda ); gpu_time = magma_wtime(); magma_dpotri_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dpotri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lapackf77_dpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime(); lapackf77_dpotri( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dpotri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( N, N, d_A, ldda, h_R, lda ); error = lapackf77_dlange("f", &N, &N, h_A, &lda, work); blasf77_daxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_dlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dormbr */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Cnorm, error, dwork[1]; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, k, mi, ni, mm, nn, nq, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t nb, ldc, lda, lwork, lwork_max; double *C, *R, *A, *work, *tau, *tauq, *taup; double *d, *e; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); // need slightly looser bound (60*eps instead of 30*eps) for some tests opts.tolerance = max( 60., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); // test all combinations of input parameters magma_vect_t vect [] = { MagmaQ, MagmaP }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magma_trans_t trans[] = { MagmaTrans, MagmaNoTrans }; printf("%% M N K vect side trans CPU Gflop/s (sec) GPU Gflop/s (sec) ||R||_F / ||QC||_F\n"); printf("%%==============================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int ivect = 0; ivect < 2; ++ivect ) { for( int iside = 0; iside < 2; ++iside ) { for( int itran = 0; itran < 2; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; nb = magma_get_dgebrd_nb( m, n ); ldc = m; // A is nq x k (vect=Q) or k x nq (vect=P) // where nq=m (left) or nq=n (right) nq = (side[iside] == MagmaLeft ? m : n ); mm = (vect[ivect] == MagmaQ ? nq : k ); nn = (vect[ivect] == MagmaQ ? k : nq); lda = mm; // MBR calls either MQR or MLQ in various ways if ( vect[ivect] == MagmaQ ) { if ( nq >= k ) { gflops = FLOPS_DORMQR( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_DORMQR( mi, ni, nq-1, side[iside] ) / 1e9; } } else { if ( nq > k ) { gflops = FLOPS_DORMLQ( m, n, k, side[iside] ) / 1e9; } else { if ( side[iside] == MagmaLeft ) { mi = m - 1; ni = n; } else { mi = m; ni = n - 1; } gflops = FLOPS_DORMLQ( mi, ni, nq-1, side[iside] ) / 1e9; } } // workspace for gebrd is (mm + nn)*nb // workspace for unmbr is m*nb or n*nb, depending on side lwork_max = max( (mm + nn)*nb, max( m*nb, n*nb )); // this rounds it up slightly if needed to agree with lwork query below lwork_max = int( real( magma_dmake_lwork( lwork_max ))); TESTING_MALLOC_CPU( C, double, ldc*n ); TESTING_MALLOC_CPU( R, double, ldc*n ); TESTING_MALLOC_CPU( A, double, lda*nn ); TESTING_MALLOC_CPU( work, double, lwork_max ); TESTING_MALLOC_CPU( d, double, min(mm,nn) ); TESTING_MALLOC_CPU( e, double, min(mm,nn) ); TESTING_MALLOC_CPU( tauq, double, min(mm,nn) ); TESTING_MALLOC_CPU( taup, double, min(mm,nn) ); // C is full, m x n size = ldc*n; lapackf77_dlarnv( &ione, ISEED, &size, C ); lapackf77_dlacpy( "Full", &m, &n, C, &ldc, R, &ldc ); size = lda*nn; lapackf77_dlarnv( &ione, ISEED, &size, A ); // compute BRD factorization to get Householder vectors in A, tauq, taup //lapackf77_dgebrd( &mm, &nn, A, &lda, d, e, tauq, taup, work, &lwork_max, &info ); magma_dgebrd( mm, nn, A, lda, d, e, tauq, taup, work, lwork_max, &info ); if (info != 0) { printf("magma_dgebrd returned error %d: %s.\n", (int) info, magma_strerror( info )); } if ( vect[ivect] == MagmaQ ) { tau = tauq; } else { tau = taup; } /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dormbr( lapack_vect_const( vect[ivect] ), lapack_side_const( side[iside] ), lapack_trans_const( trans[itran] ), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork_max, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ // query for workspace size lwork = -1; magma_dormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); if (info != 0) { printf("magma_dormbr (lwork query) returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = (magma_int_t) MAGMA_D_REAL( work[0] ); if ( lwork < 0 || lwork > lwork_max ) { printf("Warning: optimal lwork %d > allocated lwork_max %d\n", (int) lwork, (int) lwork_max ); lwork = lwork_max; } gpu_time = magma_wtime(); magma_dormbr( vect[ivect], side[iside], trans[itran], m, n, k, A, lda, tau, R, ldc, work, lwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dormbr returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== compute relative error |QC_magma - QC_lapack| / |QC_lapack| =================================================================== */ size = ldc*n; blasf77_daxpy( &size, &c_neg_one, C, &ione, R, &ione ); Cnorm = lapackf77_dlange( "Fro", &m, &n, C, &ldc, dwork ); error = lapackf77_dlange( "Fro", &m, &n, R, &ldc, dwork ) / (magma_dsqrt(m*n) * Cnorm); printf( "%5d %5d %5d %c %4c %5c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, lapacke_vect_const( vect[ivect] ), lapacke_side_const( side[iside] ), lapacke_trans_const( trans[itran] ), cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( C ); TESTING_FREE_CPU( R ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( d ); TESTING_FREE_CPU( e ); TESTING_FREE_CPU( taup ); TESTING_FREE_CPU( tauq ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } }}} // end ivect, iside, itran printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- DLAHR2 reduces the first NB columns of a real general n-BY-(n-k+1) matrix A so that elements below the k-th subdiagonal are zero. The reduction is performed by an orthogonal similarity transformation Q' * A * Q. The routine returns the matrices V and T which determine Q as a block reflector I - V*T*V', and also the matrix Y = A * V. (Note this is different than LAPACK, which computes Y = A * V * T.) This is an auxiliary routine called by DGEHRD. Arguments --------- @param[in] n INTEGER The order of the matrix A. @param[in] k INTEGER The offset for the reduction. Elements below the k-th subdiagonal in the first NB columns are reduced to zero. K < N. @param[in] nb INTEGER The number of columns to be reduced. @param[in,out] dA DOUBLE PRECISION array on the GPU, dimension (LDDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements in rows K:N of the first NB columns are overwritten with the matrix Y. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[out] dV DOUBLE PRECISION array on the GPU, dimension (LDDV, NB) On exit this n-by-nb array contains the Householder vectors of the transformation. @param[in] lddv INTEGER The leading dimension of the array dV. LDDV >= max(1,N). @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N-K+1) On entry, the n-by-(n-k+1) general matrix A. On exit, the elements on and above the k-th subdiagonal in the first NB columns are overwritten with the corresponding elements of the reduced matrix; the elements below the k-th subdiagonal, with the array TAU, represent the matrix Q as a product of elementary reflectors. The other columns of A are unchanged. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE PRECISION array, dimension (NB) The scalar factors of the elementary reflectors. See Further Details. @param[out] T DOUBLE PRECISION array, dimension (LDT,NB) The upper triangular matrix T. @param[in] ldt INTEGER The leading dimension of the array T. LDT >= NB. @param[out] Y DOUBLE PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- The matrix Q is represented as a product of nb elementary reflectors Q = H(1) H(2) . . . H(nb). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i+k-1) = 0, v(i+k) = 1; v(i+k+1:n) is stored on exit in A(i+k+1:n,i), and tau in TAU(i). The elements of the vectors v together form the (n-k+1)-by-nb matrix V which is needed, with T and Y, to apply the transformation to the unreduced part of the matrix, using an update of the form: A := (I - V*T*V') * (A - Y*T*V'). The contents of A on exit are illustrated by the following example with n = 7, k = 3 and nb = 2: @verbatim ( a a a a a ) ( a a a a a ) ( a a a a a ) ( h h a a a ) ( v1 h a a a ) ( v1 v2 a a a ) ( v1 v2 a a a ) @endverbatim where "a" denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. @ingroup magma_dgeev_aux ********************************************************************/ extern "C" magma_int_t magma_dlahr2( magma_int_t n, magma_int_t k, magma_int_t nb, magmaDouble_ptr dA, magma_int_t ldda, magmaDouble_ptr dV, magma_int_t lddv, double *A, magma_int_t lda, double *tau, double *T, magma_int_t ldt, double *Y, magma_int_t ldy, magma_queue_t queue ) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define Y(i_,j_) ( Y + (i_) + (j_)*ldy) #define T(i_,j_) ( T + (i_) + (j_)*ldt) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dV(i_,j_) (dV + (i_) + (j_)*lddv) double c_zero = MAGMA_D_ZERO; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t n_k_i_1, n_k; double scale; magma_int_t i; double ei = MAGMA_D_ZERO; magma_int_t info = 0; if (n < 0) { info = -1; } else if (k < 0 || k > n) { info = -2; } else if (nb < 1 || nb > n) { info = -3; } else if (ldda < max(1,n)) { info = -5; } else if (lddv < max(1,n)) { info = -7; } else if (lda < max(1,n)) { info = -9; } else if (ldt < max(1,nb)) { info = -12; } else if (ldy < max(1,n)) { info = -13; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } // adjust from 1-based indexing k -= 1; if (n <= 1) return info; for (i = 0; i < nb; ++i) { n_k_i_1 = n - k - i - 1; n_k = n - k; if (i > 0) { // Update A(k:n-1,i); Update i-th column of A - Y * T * V' // This updates one more row than LAPACK does (row k), // making the block above the panel an even multiple of nb. // Use last column of T as workspace, w. // w(0:i-1, nb-1) = VA(k+i, 0:i-1)' blasf77_dcopy( &i, A(k+i,0), &lda, T(0,nb-1), &ione ); #ifdef COMPLEX // If real, conjugate row of V. lapackf77_dlacgv(&i, T(0,nb-1), &ione); #endif // w = T(0:i-1, 0:i-1) * w blasf77_dtrmv( "Upper", "No trans", "No trans", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // A(k:n-1, i) -= Y(k:n-1, 0:i-1) * w blasf77_dgemv( "No trans", &n_k, &i, &c_neg_one, Y(k,0), &ldy, T(0,nb-1), &ione, &c_one, A(k,i), &ione ); // Apply I - V * T' * V' to this column (call it b) from the // left, using the last column of T as workspace, w. // // Let V = ( V1 ) and b = ( b1 ) (first i-1 rows) // ( V2 ) ( b2 ) // where V1 is unit lower triangular // w := b1 = A(k+1:k+i, i) blasf77_dcopy( &i, A(k+1,i), &ione, T(0,nb-1), &ione ); // w := V1' * b1 = VA(k+1:k+i, 0:i-1)' * w blasf77_dtrmv( "Lower", "Conj", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // w := w + V2'*b2 = w + VA(k+i+1:n-1, 0:i-1)' * A(k+i+1:n-1, i) blasf77_dgemv( "Conj", &n_k_i_1, &i, &c_one, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_one, T(0,nb-1), &ione ); // w := T'*w = T(0:i-1, 0:i-1)' * w blasf77_dtrmv( "Upper", "Conj", "Non-unit", &i, T(0,0), &ldt, T(0,nb-1), &ione ); // b2 := b2 - V2*w = A(k+i+1:n-1, i) - VA(k+i+1:n-1, 0:i-1) * w blasf77_dgemv( "No trans", &n_k_i_1, &i, &c_neg_one, A(k+i+1,0), &lda, T(0,nb-1), &ione, &c_one, A(k+i+1,i), &ione ); // w := V1*w = VA(k+1:k+i, 0:i-1) * w blasf77_dtrmv( "Lower", "No trans", "Unit", &i, A(k+1,0), &lda, T(0,nb-1), &ione ); // b1 := b1 - w = A(k+1:k+i-1, i) - w blasf77_daxpy( &i, &c_neg_one, T(0,nb-1), &ione, A(k+1,i), &ione ); // Restore diagonal element, saved below during previous iteration *A(k+i,i-1) = ei; } // Generate the elementary reflector H(i) to annihilate A(k+i+1:n-1,i) lapackf77_dlarfg( &n_k_i_1, A(k+i+1,i), A(k+i+2,i), &ione, &tau[i] ); // Save diagonal element and set to one, to simplify multiplying by V ei = *A(k+i+1,i); *A(k+i+1,i) = c_one; // dV(i+1:n-k-1, i) = VA(k+i+1:n-1, i) magma_dsetvector( n_k_i_1, A(k+i+1,i), 1, dV(i+1,i), 1, queue ); // Compute Y(k+1:n,i) = A vi // dA(k:n-1, i) = dA(k:n-1, i+1:n-k-1) * dV(i+1:n-k-1, i) magma_dgemv( MagmaNoTrans, n_k, n_k_i_1, c_one, dA(k,i+1), ldda, dV(i+1,i), ione, c_zero, dA(k,i), ione, queue ); // Compute T(0:i,i) = [ -tau T V' vi ] // [ tau ] // T(0:i-1, i) = -tau VA(k+i+1:n-1, 0:i-1)' VA(k+i+1:n-1, i) scale = MAGMA_D_NEGATE( tau[i]); blasf77_dgemv( "Conj", &n_k_i_1, &i, &scale, A(k+i+1,0), &lda, A(k+i+1,i), &ione, &c_zero, T(0,i), &ione ); // T(0:i-1, i) = T(0:i-1, 0:i-1) * T(0:i-1, i) blasf77_dtrmv( "Upper", "No trans", "Non-unit", &i, T(0,0), &ldt, T(0,i), &ione ); *T(i,i) = tau[i]; // Y(k:n-1, i) = dA(k:n-1, i) magma_dgetvector( n-k, dA(k,i), 1, Y(k,i), 1, queue ); } // Restore diagonal element *A(k+nb,nb-1) = ei; return info; } /* magma_dlahr2 */
int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time; double matnorm, work[1]; double mzone = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *hwork, tmp[1]; magmaDouble_ptr d_A; /* Matrix size */ magma_int_t M = 0, N = 0, n2, lda, ldda, lhwork; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10176}; magma_int_t i, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); else if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if ( M == 0 ) { M = N; } if ( N == 0 ) { N = M; } if (M>0 && N>0) printf(" testing_dgeqrf_gpu -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_dgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_dgeqrf_gpu -M %d -N %d\n\n", 1024, 1024); M = N = size[7]; } /* Initialize */ magma_queue_t queue1, queue2; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue1 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue2 ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magma_queue_t queues[2] = {queue1, queue2}; ldda = ((M+31)/32)*32; n2 = M * N; min_mn = min(M, N); /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); lhwork = -1; lapackf77_dgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info); lhwork = (magma_int_t)MAGMA_D_REAL( tmp[0] ); TESTING_MALLOC_CPU( hwork, double, lhwork ); printf("\n\n"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / ||A||_F\n"); printf("======================================================================\n"); for(i=0; i<8; i++){ if (argc == 1){ M = N = size[i]; } min_mn= min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS( (double)M, (double)N ) * 1e-9; /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_dgeqrf(&M, &N, h_A, &M, tau, hwork, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; if (info < 0) printf("Argument %d of lapack_dgeqrf had an illegal value.\n", -info); cpu_perf = gflops / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); magma_dgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); magma_dsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue1 ); clFinish(queue1); clFinish(queue2); gpu_time = magma_wtime(); magma_dgeqrf2_gpu( M, N, d_A, 0, ldda, tau, &info, queues); gpu_time = magma_wtime() - gpu_time; if (info < 0) printf("Argument %d of magma_dgeqrf2 had an illegal value.\n", -info); gpu_perf = gflops / gpu_time; /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_dgetmatrix( M, N, d_A, 0, ldda, h_R, 0, M, queue1 ); matnorm = lapackf77_dlange("f", &M, &N, h_A, &M, work); blasf77_daxpy(&n2, &mzone, h_A, &ione, h_R, &ione); printf("%5d %5d %6.2f (%6.2f) %6.2f (%6.2f) %e\n", M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, lapackf77_dlange("f", &M, &N, h_R, &M, work) / matnorm); if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( hwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); magma_queue_destroy( queue1 ); magma_queue_destroy( queue2 ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsyr2k */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An, Bk, Bn; magma_int_t sizeA, sizeB, sizeC; magma_int_t lda, ldb, ldc, ldda, lddb, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_C, *h_Ccublas; magmaDouble_ptr d_A, d_B, d_C; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); double beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); #ifdef COMPLEX if (opts.transA == MagmaTrans) { opts.transA = MagmaConjTrans; printf("%% WARNING: transA = MagmaTrans changed to MagmaConjTrans\n"); } #endif printf("%% If running lapack (option --lapack), CUBLAS error is computed\n" "%% relative to CPU BLAS result.\n\n"); printf("%% uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf("%% N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("%%=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.msize[itest]; K = opts.ksize[itest]; gflops = FLOPS_DSYR2K(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; ldb = Bn = N; Bk = K; } else { lda = An = K; Ak = N; ldb = Bn = K; Bk = N; } ldc = N; ldda = magma_roundup( lda, opts.align ); // multiple of 32 by default lddb = magma_roundup( ldb, opts.align ); // multiple of 32 by default lddc = magma_roundup( ldc, opts.align ); // multiple of 32 by default sizeA = lda*Ak; sizeB = ldb*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, double, lda*Ak ); TESTING_MALLOC_CPU( h_B, double, ldb*Bk ); TESTING_MALLOC_CPU( h_C, double, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, double, ldc*N ); TESTING_MALLOC_DEV( d_A, double, ldda*Ak ); TESTING_MALLOC_DEV( d_B, double, lddb*Bk ); TESTING_MALLOC_DEV( d_C, double, lddc*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_dlarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( Bn, Bk, h_B, ldb, d_B, lddb ); magma_dsetmatrix( N, N, h_C, ldc, d_C, lddc ); magmablasSetKernelStream( opts.queue ); // opts.handle also uses opts.queue cublas_time = magma_sync_wtime( opts.queue ); #ifdef HAVE_CUBLAS cublasDsyr2k( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, d_B, lddb, &beta, d_C, lddc ); #else magma_dsyr2k( opts.uplo, opts.transA, N, K, alpha, d_A, 0, ldda, d_B, 0, lddb, beta, d_C, 0, lddc, opts.queue ); #endif cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dsyr2k( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, h_B, &ldb, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &N, &N, h_C, &ldc, work ); blasf77_daxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_dlange( "M", &N, &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dtrmm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; double cublas_error, Cnorm, work[1]; magma_int_t M, N; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double *h_A, *h_B, *h_Bcublas; double *d_A, *d_B; double c_neg_one = MAGMA_D_NEG_ONE; double alpha = MAGMA_D_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_DTRMM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, double, lda*Ak ); TESTING_MALLOC_CPU( h_B, double, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, double, ldb*N ); TESTING_MALLOC_DEV( d_A, double, ldda*Ak ); TESTING_MALLOC_DEV( d_B, double, lddb*N ); /* Initialize the matrices */ lapackf77_dlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_dsetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, N, h_B, ldb, d_B, lddb ); // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. cublas_time = magma_sync_wtime( NULL ); cublasDtrmm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_dgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_dtrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_dlange( "M", &M, &N, h_B, &ldb, work ); blasf77_daxpy( &sizeB, &c_neg_one, h_B, &ione, h_Bcublas, &ione ); cublas_error = lapackf77_dlange( "M", &M, &N, h_Bcublas, &ldb, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) M, (int) N, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgels */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double gpu_error, cpu_error, error, Anorm, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaDouble_ptr d_A, d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t status = 0; double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; printf("%% ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf("%% M N NRHS CPU Gflop/s (sec) GPU Gflop/s (sec) CPU GPU \n"); printf("%%==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default lddb = magma_roundup( max_mn, opts.align ); // multiple of 32 by default nb = magma_get_dgeqrf_nb( M, N ); gflops = (FLOPS_DGEQRF( M, N ) + FLOPS_DGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, NULL, &lda, NULL, &ldb, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); lhwork = max( lhwork, lworkgpu ); TESTING_MALLOC_CPU( tau, double, min_mn ); TESTING_MALLOC_CPU( h_A, double, lda*N ); TESTING_MALLOC_CPU( h_A2, double, lda*N ); TESTING_MALLOC_CPU( h_B, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, double, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, double, lhwork ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_B, double, lddb*nrhs ); /* Initialize the matrices */ size = lda*N; lapackf77_dlarnv( &ione, ISEED, &size, h_A ); lapackf77_dlacpy( MagmaFullStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = ldb*nrhs; lapackf77_dlarnv( &ione, ISEED, &size, h_B ); lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_dlarnv( &ione, ISEED, &size, h_X ); //blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix( M, N, h_A, lda, d_A, ldda ); magma_dsetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_dgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgels_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // compute the residual magma_dgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb ); Anorm = lapackf77_dlange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_dlacpy( MagmaFullStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgels returned error %d: %s.\n", (int) info, magma_strerror( info )); } blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb ); cpu_error = lapackf77_dlange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_daxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); bool okay; if ( M == N ) { okay = (gpu_error < tol && error < tol); } else { okay = (error < tol); } status += ! okay; printf( " %s\n", (okay ? "ok" : "failed")); TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *da, magma_int_t ldda, double *d, double *e, double *tauq, double *taup, double *x, magma_int_t ldx, double *dx, magma_int_t lddx, double *y, magma_int_t ldy, double *dy, magma_int_t lddy) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DLABRD reduces the first NB rows and columns of a real general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by SGEBRD Arguments ========= M (input) INTEGER The number of rows in the matrix A. N (input) INTEGER The number of columns in the matrix A. NB (input) INTEGER The number of leading rows and columns of A to be reduced. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). D (output) DOUBLE_PRECISION array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). E (output) DOUBLE_PRECISION array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. TAUQ (output) DOUBLE_PRECISION array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. TAUP (output) DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. X (output) DOUBLE_PRECISION array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. LDX (input) INTEGER The leading dimension of the array X. LDX >= M. Y (output) DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). ===================================================================== */ /* Table of constant values */ double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t c__1 = 1; /* System generated locals */ magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3; /* Local variables */ magma_int_t i__; double alpha; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; --d; --e; --tauq; --taup; x_dim1 = ldx; x_offset = 1 + x_dim1; x -= x_offset; dx-= 1 + lddx; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; dy-= 1 + lddy; /* Function Body */ if (m <= 0 || n <= 0) { return 0; } double *f; magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, max(n,m) ); assert( f != NULL ); // TODO return error, or allocate outside dlatrd if (m >= n) { /* Reduce to upper bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i:m,i) */ i__2 = m - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + a_dim1], &lda, &y[i__+y_dim1], &ldy, &c_one, &a[i__ + i__ * a_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + x_dim1], &ldx, &a[i__*a_dim1+1], &c__1, &c_one, &a[i__+i__*a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = a[i__ + i__ * a_dim1]; i__2 = m - i__ + 1; i__3 = i__ + 1; lapackf77_dlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); d[i__] = MAGMA_D_REAL( alpha ); if (i__ < n) { a[i__ + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__ + 1; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, a + i__ + i__ * a_dim1, 1, da+(i__-1)+(i__-1)* (ldda), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv(MagmaTrans, i__2, i__3, c_one, da + (i__-1) + ((i__-1) + 1) * (ldda), ldda, da + (i__-1) + (i__-1) * (ldda), c__1, c_zero, dy + i__ + 1 + i__ * y_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dy+i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, stream ); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + a_dim1], &lda, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_dgemv("N", &i__2, &i__3, &c_neg_one, &y[i__ + 1 +y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &x[i__ + x_dim1], &ldx, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3!=0){ i__2 = n - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = i__ - 1; i__3 = n - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); /* Update A(i,i+1:n) */ i__2 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); lapackf77_dlacgv( &i__, &a[i__+a_dim1], &lda ); #endif blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + ( i__ + 1) * a_dim1], &lda); i__2 = i__ - 1; i__3 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__, &a[i__+a_dim1], &lda ); lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[ i__ + (i__ + 1) * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i__; /* Computing MIN */ i__3 = i__ + 2; alpha = a[i__ + (i__ + 1) * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min( i__3,n) * a_dim1], &lda, &taup[i__]); e[i__] = MAGMA_D_REAL( alpha ); a[i__ + (i__ + 1) * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__3, a + i__ + (i__ +1)* a_dim1, lda, da+(i__-1)+((i__-1)+1)*(ldda), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_dgemv(MagmaNoTrans, i__2, i__3, c_one, da + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda, da + (i__-1) + ((i__-1)+1) * (ldda), ldda, //dy + 1 + lddy, 1, c_zero, dx + i__ + 1 + i__ * x_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dx+i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, stream ); i__2 = n - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[ i__ * x_dim1 + 1], &c__1); i__2 = m - i__; blasf77_dgemv("N", &i__2, &i__, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__; blasf77_dgemv("N", &i__2, &i__3, &c_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__!=0){ i__2 = m - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[ i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i__; lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after DLACGV() magma_dsetvector( i__2, a + i__ + (i__ +1)* a_dim1, lda, da+(i__-1)+((i__-1)+1)*(ldda), ldda ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i,i:n) */ i__2 = n - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + i__ * a_dim1], &lda); i__2 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda); lapackf77_dlacgv(&i__3, &x[i__ + x_dim1], &ldx); #endif i__3 = n - i__ + 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[i__ * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[i__ + i__ * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &x[i__ + x_dim1], &ldx); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i__ + 1; /* Computing MIN */ i__3 = i__ + 1; alpha = a[i__ + i__ * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min(i__3,n) * a_dim1], &lda, &taup[i__]); d[i__] = MAGMA_D_REAL( alpha ); if (i__ < m) { a[i__ + i__ * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__ + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_dsetvector( i__3, a + i__ + i__ * a_dim1, lda, da+(i__-1)+(i__-1)* (ldda), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_dgemv(MagmaNoTrans, i__2, i__3, c_one, da + (i__-1)+1 + (i__-1) * ldda, ldda, da + (i__-1) + (i__-1) * ldda, ldda, // dy + 1 + lddy, 1, c_zero, dx + i__ + 1 + i__ * x_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dx+i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, stream ); i__2 = n - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &y[i__ + y_dim1], &ldy, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__ + 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_one, &a[i__ * a_dim1 + 1], &lda, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__2!=0){ i__3 = m - i__; blasf77_daxpy(&i__3, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = n - i__ + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_dsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da+(i__-1)+ (i__-1)*(ldda), ldda ); #endif /* Update A(i+1:m,i) */ i__2 = m - i__; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &y[i__ + y_dim1], &ldy, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); i__2 = m - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ * a_dim1 + 1], &c__1, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i__; i__3 = i__ + 2; alpha = a[i__ + 1 + i__ * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); e[i__] = MAGMA_D_REAL( alpha ); a[i__ + 1 + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, a + i__ +1+ i__ * a_dim1, 1, da+(i__-1)+1+ (i__-1)*(ldda), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv(MagmaTrans, i__2, i__3, c_one, da + (i__-1)+1+ ((i__-1)+1) * ldda, ldda, da + (i__-1)+1+ (i__-1) * ldda, c__1, c_zero, dy + i__ + 1 + i__ * y_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dy+i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, stream ); i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + 1 + a_dim1], &lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[ i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3!=0){ i__2 = n - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = n - i__; blasf77_dgemv(MagmaTransStr, &i__, &i__2, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i__ + 1; lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_dsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da+(i__-1)+ (i__-1)*(ldda), ldda ); } #endif } } magma_queue_destroy( stream ); magma_free_cpu(f); return MAGMA_SUCCESS; } /* dlabrd */
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgeqrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error, work[1]; double c_neg_one = MAGMA_D_NEG_ONE; double *h_A, *h_R, *tau, *h_work, tmp[1]; double *d_lA[ MagmaMaxGPUs ]; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, lhwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4]; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= (opts.check == 2); // check (-c2) implies lapack (-l) magma_int_t status = 0; double tol, eps = lapackf77_dlamch("E"); tol = opts.tolerance * eps; printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 1 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n"); printf("================================================================================================\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F /(M*||A||_F)\n"); printf("==========================================================================\n"); } for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgeqrf_nb( M ); gflops = FLOPS_DGEQRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // query for workspace size lhwork = -1; lapackf77_dgeqrf( &M, &N, h_A, &M, tau, tmp, &lhwork, &info ); lhwork = (magma_int_t) MAGMA_D_REAL( tmp[0] ); // Allocate host memory for the matrix TESTING_MALLOC( tau, double, min_mn ); TESTING_MALLOC( h_A, double, n2 ); TESTING_HOSTALLOC( h_R, double, n2 ); TESTING_MALLOC( h_work, double, lhwork ); // Allocate device memory for( int dev = 0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; magma_setdevice( dev ); TESTING_DEVALLOC( d_lA[dev], double, ldda*n_local ); } /* Initialize the matrix */ for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); lapackf77_dlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { double *tau; TESTING_MALLOC( tau, double, min_mn ); cpu_time = magma_wtime(); lapackf77_dgeqrf( &M, &N, h_A, &M, tau, h_work, &lhwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapack_dgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); TESTING_FREE( tau ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_dsetmatrix_1D_col_bcyclic( M, N, h_R, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgeqrf2_mgpu( ngpu, M, N, d_lA, ldda, tau, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgeqrf2 returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_R, lda, ngpu, nb ); magma_queue_sync( NULL ); if ( opts.check == 1 ) { /* ===================================================================== Check the result =================================================================== */ magma_int_t lwork = n2+N; double *h_W1, *h_W2, *h_W3; double *h_RW, results[2]; TESTING_MALLOC( h_W1, double, n2 ); // Q TESTING_MALLOC( h_W2, double, n2 ); // R TESTING_MALLOC( h_W3, double, lwork ); // WORK TESTING_MALLOC( h_RW, double, M ); // RWORK lapackf77_dlarnv( &ione, ISEED2, &n2, h_A ); lapackf77_dqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork, h_RW, results ); results[0] *= eps; results[1] *= eps; if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %8.2e", (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] ); printf("%s\n", (results[0] < tol ? "" : " failed")); } status |= ! (results[0] < tol); TESTING_FREE( h_W1 ); TESTING_FREE( h_W2 ); TESTING_FREE( h_W3 ); TESTING_FREE( h_RW ); } else if ( opts.check == 2 ) { /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = lapackf77_dlange("f", &M, &N, h_A, &lda, work ); blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange("f", &M, &N, h_R, &lda, work ) / (min_mn*error); printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error ); printf("%s\n", (error < tol ? "" : " failed")); status |= ! (error < tol); } else { if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) ---\n", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) M, (int) N, gpu_perf, gpu_time); } } TESTING_FREE( tau ); TESTING_FREE( h_A ); TESTING_FREE( h_work ); TESTING_HOSTFREE( h_R ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_DEVFREE( d_lA[dev] ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DLABRD reduces the first NB rows and columns of a real general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by DGEBRD. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. @param[in] n INTEGER The number of columns in the matrix A. @param[in] nb INTEGER The number of leading rows and columns of A to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. \n If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension (LDDA,N) Copy of A on GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] d DOUBLE_PRECISION array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). @param[out] e DOUBLE_PRECISION array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. @param[out] tauq DOUBLE_PRECISION array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] X DOUBLE_PRECISION array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. @param[in] ldx INTEGER The leading dimension of the array X. LDX >= M. @param[out] dX DOUBLE_PRECISION array, dimension (LDDX,NB) Copy of X on GPU. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= M. @param[out] Y DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[out] dY DOUBLE_PRECISION array, dimension (LDDY,NB) Copy of Y on GPU. @param[in] lddy INTEGER The leading dimension of the array dY. LDDY >= N. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) @endverbatim where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_dgesvd_aux ********************************************************************/ extern "C" magma_int_t magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *dA, magma_int_t ldda, double *d, double *e, double *tauq, double *taup, double *X, magma_int_t ldx, double *dX, magma_int_t lddx, double *Y, magma_int_t ldy, double *dY, magma_int_t lddy) { #define A(i_,j_) (A + (i_) + (j_)*lda) #define X(i_,j_) (X + (i_) + (j_)*ldx) #define Y(i_,j_) (Y + (i_) + (j_)*ldy) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dY(i_,j_) (dY + (i_) + (j_)*lddy) #define dX(i_,j_) (dX + (i_) + (j_)*lddx) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t ione = 1; magma_int_t i__2, i__3; magma_int_t i; double alpha; A -= 1 + lda; X -= 1 + ldx; dX -= 1 + lddx; Y -= 1 + ldy; dY -= 1 + lddy; --d; --e; --tauq; --taup; /* Quick return if possible */ magma_int_t info = 0; if (m <= 0 || n <= 0) { return info; } double *f; magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, max(n,m) ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (m >= n) { /* Reduce to upper bidiagonal form */ for (i = 1; i <= nb; ++i) { /* Update A(i:m,i) */ i__2 = m - i + 1; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i,1), &lda, Y(i,1), &ldy, &c_one, A(i,i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i,1), &ldx, A(1,i), &ione, &c_one, A(i,i), &ione ); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = *A(i,i); i__2 = m - i + 1; i__3 = i + 1; lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] ); d[i] = MAGMA_D_REAL( alpha ); if (i < n) { *A(i,i) = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i + 1; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, A(i,i), 1, dA(i-1,i-1), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv( MagmaConjTrans, i__2, i__3, c_one, dA(i-1,i), ldda, dA(i-1,i-1), ione, c_zero, dY(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream ); i__2 = m - i + 1; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, A(i,1), &lda, A(i,i), &ione, &c_zero, Y(1,i), &ione ); i__2 = n - i; i__3 = i - 1; blasf77_dgemv( "N", &i__2, &i__3, &c_neg_one, Y(i+1,1), &ldy, Y(1,i), &ione, &c_zero, f, &ione ); i__2 = m - i + 1; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, X(i,1), &ldx, A(i,i), &ione, &c_zero, Y(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3 != 0) { i__2 = n - i; blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione ); } i__2 = i - 1; i__3 = n - i; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i+1), &lda, Y(1,i), &ione, &c_one, Y(i+1,i), &ione ); i__2 = n - i; blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione ); /* Update A(i,i+1:n) */ i__2 = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, A(i,i+1), &lda ); lapackf77_dlacgv( &i, A(i,1), &lda ); #endif blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one, Y(i+1,1), &ldy, A(i,1), &lda, &c_one, A(i,i+1), &lda ); i__2 = i - 1; i__3 = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i, A(i,1), &lda ); lapackf77_dlacgv( &i__2, X(i,1), &ldx ); #endif blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i+1), &lda, X(i,1), &ldx, &c_one, A(i,i+1), &lda ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, X(i,1), &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i; i__3 = i + 2; alpha = *A(i,i+1); lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] ); e[i] = MAGMA_D_REAL( alpha ); *A(i,i+1) = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__3, A(i,i+1), lda, dA(i-1,i), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 ); magma_dgemv( MagmaNoTrans, i__2, i__3, c_one, dA(i,i), ldda, dA(i-1,i), ldda, //dY(1,1), 1, c_zero, dX(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream ); i__2 = n - i; blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one, Y(i+1,1), &ldy, A(i,i+1), &lda, &c_zero, X(1,i), &ione ); i__2 = m - i; blasf77_dgemv( "N", &i__2, &i, &c_neg_one, A(i+1,1), &lda, X(1,i), &ione, &c_zero, f, &ione ); i__2 = i - 1; i__3 = n - i; blasf77_dgemv( "N", &i__2, &i__3, &c_one, A(1,i+1), &lda, A(i,i+1), &lda, &c_zero, X(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i != 0) { i__2 = m - i; blasf77_daxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione ); } i__2 = m - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i+1,1), &ldx, X(1,i), &ione, &c_one, X(i+1,i), &ione ); i__2 = m - i; blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i; lapackf77_dlacgv( &i__2, A(i,i+1), &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after DLACGV() magma_dsetvector( i__2, A(i,i+1), lda, dA(i-1,i), ldda ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i = 1; i <= nb; ++i) { /* Update A(i,i:n) */ i__2 = n - i + 1; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, A(i,i), &lda ); lapackf77_dlacgv( &i__3, A(i,1), &lda ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, Y(i,1), &ldy, A(i,1), &lda, &c_one, A(i,i), &lda ); i__2 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, A(i,1), &lda ); lapackf77_dlacgv( &i__3, X(i,1), &ldx ); #endif i__3 = n - i + 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i), &lda, X(i,1), &ldx, &c_one, A(i,i), &lda ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, X(i,1), &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i + 1; i__3 = i + 1; alpha = *A(i,i); lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] ); d[i] = MAGMA_D_REAL( alpha ); if (i < m) { *A(i,i) = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i; i__3 = n - i + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_dsetvector( i__3, A(i,i), lda, dA(i-1,i-1), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 ); magma_dgemv( MagmaNoTrans, i__2, i__3, c_one, dA(i,i-1), ldda, dA(i-1,i-1), ldda, //dY(1,1), 1, c_zero, dX(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream ); i__2 = n - i + 1; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, Y(i,1), &ldy, A(i,i), &lda, &c_zero, X(1,i), &ione ); i__2 = m - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i+1,1), &lda, X(1,i), &ione, &c_zero, f, &ione ); i__2 = i - 1; i__3 = n - i + 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_one, A(1,i), &lda, A(i,i), &lda, &c_zero, X(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__2 != 0) { i__3 = m - i; blasf77_daxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione ); } i__2 = m - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i+1,1), &ldx, X(1,i), &ione, &c_one, X(i+1,i), &ione ); i__2 = m - i; blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione ); i__2 = n - i + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, A(i,i), &lda ); magma_dsetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda ); #endif /* Update A(i+1:m,i) */ i__2 = m - i; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i+1,1), &lda, Y(i,1), &ldy, &c_one, A(i+1,i), &ione ); i__2 = m - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one, X(i+1,1), &ldx, A(1,i), &ione, &c_one, A(i+1,i), &ione ); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i; i__3 = i + 2; alpha = *A(i+1,i); lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] ); e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, A(i+1,i), 1, dA(i,i-1), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv( MagmaConjTrans, i__2, i__3, c_one, dA(i,i), ldda, dA(i,i-1), ione, c_zero, dY(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream ); i__2 = m - i; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, A(i+1,1), &lda, A(i+1,i), &ione, &c_zero, Y(1,i), &ione ); i__2 = n - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, Y(i+1,1), &ldy, Y(1,i), &ione, &c_zero, f, &ione ); i__2 = m - i; blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one, X(i+1,1), &ldx, A(i+1,i), &ione, &c_zero, Y(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3 != 0) { i__2 = n - i; blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione ); } i__2 = n - i; blasf77_dgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one, A(1,i+1), &lda, Y(1,i), &ione, &c_one, Y(i+1,i), &ione ); i__2 = n - i; blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione ); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i + 1; lapackf77_dlacgv( &i__2, A(i,i), &lda ); magma_dsetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda ); } #endif } } magma_queue_destroy( stream ); magma_free_cpu( f ); return info; } /* magma_dlabrd_gpu */
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); double *A, *B, *C; double *dA, *dB, *dC; double error, work[1]; double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n = 10; magma_int_t lda = n; magma_int_t ldda = ((n+31)/32)*32; magma_int_t size = lda*n; magma_int_t info; magma_dmalloc_cpu( &A, lda*n ); magma_dmalloc_cpu( &B, lda*n ); magma_dmalloc_cpu( &C, lda*n ); magma_dmalloc( &dA, ldda*n ); magma_dmalloc( &dB, ldda*n ); magma_dmalloc( &dC, ldda*n ); // initialize matrices lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_dlarnv( &ione, ISEED, &size, B ); lapackf77_dlarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_D_ADD( C[i+i*lda], MAGMA_D_MAKE( n*n, 0 )); } magma_dsetmatrix( n, n, A, lda, dA, ldda ); magma_dsetmatrix( n, n, B, lda, dB, ldda ); magma_dsetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_dpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_dpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_dpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference magma_dgetmatrix( n, n, dC, ldda, A, lda ); blasf77_daxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_dlange( "F", &n, &n, A, &lda, work ); printf( "n %d, error %8.2e\n", n, error ); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); cublasDestroy( handle ); magma_finalize(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R, *work; magmaDouble_ptr d_A, dwork; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_dgetri_nb( N ); gflops = FLOPS_DGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_D_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, double, lwork ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dwork, double, ldwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_dlange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_dsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); magma_dgetrf_gpu( N, N, d_A, 0, ldda, ipiv, opts.queue, &info ); magma_dgetmatrix( N, N, d_A, 0, ldda, h_A, lda, opts.queue ); if ( info != 0 ) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, opts.queues2, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }