/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetri_batched
      Factors a batch of N-by-N random complex matrices on the GPU
      (zgetrf_batched), inverts them out-of-place (zgetri_outofplace_batched),
      optionally times the equivalent LAPACK path on the CPU, and checks
      ||I - A*A^{-1}||_1 / (N*cond(A)) against the tolerance.
      Returns the number of failed tests as the process exit status.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    // constants
    const magmaDoubleComplex c_zero    = MAGMA_Z_ZERO;
    const magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work;   // host: input, inverse, residual, LAPACK workspace
    magmaDoubleComplex_ptr d_A, d_invA;              // device: batched input and inverse storage
    magmaDoubleComplex_ptr *dA_array;                // device arrays of per-matrix pointers
    magmaDoubleComplex_ptr *dinvA_array;
    magma_int_t **dipiv_array;
    magma_int_t *dinfo_array;
    magma_int_t *ipiv, *cpu_info;
    magma_int_t *d_ipiv, *d_info;
    magma_int_t N, n2, lda, ldda, info, info1, info2, lwork;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magmaDoubleComplex tmp;                          // receives the zgetri workspace-size query result
    double error, rwork[1];
    magma_int_t columns;
    magma_int_t status = 0;                          // accumulates the number of failed tests

    magma_opts opts( MagmaOptsBatched );
    opts.parse_opts( argc, argv );
    magma_int_t batchCount = opts.batchcount;
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("%% batchCount N CPU Gflop/s (ms) GPU Gflop/s (ms) ||I - A*A^{-1}||_1 / (N*cond(A))\n");
    printf("%%===============================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda = N;
            n2 = lda*N * batchCount;                       // total host elements across the batch
            ldda = magma_roundup( N, opts.align );         // multiple of 32 by default

            // This is the correct flops but since this getri_batched is based on
            // 2 trsm = getrs and to know the real flops I am using the getrs one
            //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount;
            gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount;

            // query for workspace size
            lwork = -1;
            lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info );
            if (info != 0) {
                printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info ));
            }
            lwork = magma_int_t( MAGMA_Z_REAL( tmp ));

            TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount );
            TESTING_MALLOC_CPU( ipiv, magma_int_t, N * batchCount );
            TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork*batchCount );
            TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 );
            TESTING_MALLOC_CPU( h_Ainv, magmaDoubleComplex, n2 );
            TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 );

            TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount );
            TESTING_MALLOC_DEV( d_invA, magmaDoubleComplex, ldda*N * batchCount );
            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, N * batchCount );
            TESTING_MALLOC_DEV( d_info, magma_int_t, batchCount );

            TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount );
            TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount );
            TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount );
            TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount );

            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
            columns = N * batchCount;
            // Batch stored as one N x (N*batchCount) host matrix; keep two copies:
            // h_R is sent to the GPU, h_Ainv is inverted by LAPACK for timing.
            lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R, &lda );
            lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda );
            magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // Build the device pointer arrays that index each matrix within the
            // contiguous batch storage (stride ldda*N for matrices, N for pivots).
            magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue );
            magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue );
            magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue );

            gpu_time = magma_sync_wtime( opts.queue );
            info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue);
            info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue);
            gpu_time = magma_sync_wtime( opts.queue ) - gpu_time;
            gpu_perf = gflops / gpu_time;

            // check correctness of results through "dinfo_magma" and correctness of argument through "info"
            magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue );
            for (magma_int_t i=0; i < batchCount; i++) {
                if (cpu_info[i] != 0 ) {
                    printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] );
                }
            }
            if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 ));
            if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 ));

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                // Run one single-threaded LAPACK factor+invert per matrix,
                // parallelized over the batch with OpenMP when enabled.
                #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP)
                magma_int_t nthreads = magma_get_lapack_numthreads();
                magma_set_lapack_numthreads(1);
                magma_set_omp_numthreads(nthreads);
                #pragma omp parallel for schedule(dynamic)
                #endif
                for (int i=0; i < batchCount; i++) {
                    magma_int_t locinfo;
                    lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo);
                    if (locinfo != 0) {
                        printf("lapackf77_zgetrf returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo ));
                    }
                    lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo );
                    if (locinfo != 0) {
                        printf("lapackf77_zgetri returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo ));
                    }
                }
                #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP)
                magma_set_lapack_numthreads(nthreads);
                #endif
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                printf("%10d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. );
            }
            else {
                printf("%10d %5d --- ( --- ) %7.2f (%7.2f)", (int) batchCount, (int) N, gpu_perf, gpu_time*1000. );
            }

            /* =====================================================================
               Check the result
               =================================================================== */
            if ( opts.check ) {
                // Overwrites the LAPACK pivots in ipiv with the GPU pivots;
                // safe because the LAPACK timing pass above is already done.
                magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue );
                magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue );
                error = 0;
                for (magma_int_t i=0; i < batchCount; i++) {
                    // sanity check: every pivot index must be in [1, N]
                    for (magma_int_t k=0; k < N; k++) {
                        if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N ) {
                            printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]);
                            error = -1;
                        }
                    }
                    if (error == -1) {
                        break;
                    }

                    // compute 1-norm condition number estimate, following LAPACK's zget03
                    double normA, normAinv, rcond, err;
                    normA = lapackf77_zlange( "1", &N, &N, h_A + i*lda*N, &lda, rwork );
                    normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork );
                    if ( normA <= 0 || normAinv <= 0 ) {
                        rcond = 0;
                        err = 1 / (tol/opts.tolerance); // == 1/eps
                    }
                    else {
                        rcond = (1 / normA) / normAinv;
                        // R = I
                        // R -= A*A^{-1}
                        // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm
                        lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda );
                        blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A + i*lda*N, &lda, h_Ainv + i*lda*N, &lda, &c_one, h_R + i*lda*N, &lda );
                        err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork );
                        err = err * rcond / N;
                    }
                    if ( isnan(err) || isinf(err) ) {
                        error = err;
                        break;
                    }
                    error = max( err, error );  // report the worst matrix in the batch
                }
                bool okay = (error < tol);
                status += ! okay;
                printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") );
            }
            else {
                printf("\n");
            }

            TESTING_FREE_CPU( cpu_info );
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( work );
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_Ainv );
            TESTING_FREE_CPU( h_R );
            TESTING_FREE_DEV( d_A );
            TESTING_FREE_DEV( d_invA );
            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_info );
            TESTING_FREE_DEV( dA_array );
            TESTING_FREE_DEV( dinvA_array );
            TESTING_FREE_DEV( dinfo_array );
            TESTING_FREE_DEV( dipiv_array );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    opts.cleanup();
    TESTING_FINALIZE();
    return status;
}
/**
    Purpose
    -------
    Reduces a symmetric band matrix to real symmetric tridiagonal form by a
    multi-threaded bulge-chasing stage (stage 2 of the two-stage tridiagonal
    reduction), storing the Householder reflectors in V/TAU and, when
    requested, the T factors needed to apply Q2 later.
    NOTE(review): parts of this doc block appear copied from a related routine
    and may not match this implementation exactly — verify against callers.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A is stored;
      -     = MagmaLower:  Lower triangles of A is stored.

    @param[in]
    n       INTEGER
            The order of the matrix A.  n >= 0.

    @param[in]
    nb      INTEGER
            The order of the band matrix A.  n >= nb >= 0.

    @param[in]
    Vblksiz INTEGER
            The size of the block of householder vectors applied at once.

    @param[in]
    A       (workspace) DOUBLE PRECISION array, dimension (lda, n)
            On entry the band matrix stored in the following way:

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  lda >= 2*nb.

    @param[out]
    d       DOUBLE array, dimension (n)
            The diagonal elements of the tridiagonal matrix T:
            D(i) = A(i,i).

    @param[out]
    e       DOUBLE array, dimension (n-1)
            The off-diagonal elements of the tridiagonal matrix T:
            E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower.

    @param[out]
    V       DOUBLE PRECISION array, dimension (BLKCNT, LDV, VBLKSIZ)
            On exit it contains the blocks of householder reflectors
            BLKCNT is the number of block and it is returned by the function
            MAGMA_BULGE_GET_BLKCNT.

    @param[in]
    ldv     INTEGER
            The leading dimension of V.
            LDV > nb + VBLKSIZ + 1

    @param[out]
    TAU     DOUBLE PRECISION dimension(BLKCNT, VBLKSIZ)
            NOTE(review): the Householder scalars for the reflectors in V —
            original doc left this as "???"; confirm layout against
            magma_dbulge_getstg2size.

    @param[in]
    wantz   INTEGER
            if COMPT = 0 T is not computed
            if COMPT = 1 T is computed

    @param[out]
    T       DOUBLE PRECISION dimension(LDT *)
            if COMPT = 1 on exit contains the matrices T needed for Q2
            if COMPT = 0 T is not referenced

    @param[in]
    ldt     INTEGER
            The leading dimension of T.
            LDT > Vblksiz

    @ingroup magma_dsyev_2stage
    ********************************************************************/
extern "C" magma_int_t
magma_dsytrd_sb2st(
    magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
    double *A, magma_int_t lda, double *d, double *e,
    double *V, magma_int_t ldv, double *TAU,
    magma_int_t wantz, double *T, magma_int_t ldt)
{
    #ifdef ENABLE_TIMER
    real_Double_t timeblg=0.0;
    #endif

    magma_int_t parallel_threads = magma_get_parallel_numthreads();
    magma_int_t mklth = magma_get_lapack_numthreads();  // saved to restore on exit
    magma_int_t ompth = magma_get_omp_numthreads();     // saved to restore on exit
    //magma_set_omp_numthreads(1);
    //magma_set_lapack_numthreads(1);

    // Query workspace sizes for stage 2, then zero the output work areas
    // before the threads start filling them.
    magma_int_t blkcnt, sizTAU2, sizT2, sizV2;
    magma_dbulge_getstg2size(n, nb, wantz, Vblksiz, ldv, ldt, &blkcnt, &sizTAU2, &sizT2, &sizV2);
    memset(T, 0, sizT2*sizeof(double));
    memset(TAU, 0, sizTAU2*sizeof(double));
    memset(V, 0, sizV2*sizeof(double));

    magma_int_t INgrsiz=1;
    magma_int_t nbtiles = magma_ceildiv(n, nb);
    // Progress table shared (and polled) by the worker threads; volatile
    // because workers spin on entries updated by other threads.
    volatile magma_int_t* prog;
    magma_malloc_cpu((void**) &prog, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t));
    memset((void *) prog, 0, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t));

    magma_dbulge_id_data* arg;
    magma_malloc_cpu((void**) &arg, parallel_threads*sizeof(magma_dbulge_id_data));

    pthread_t* thread_id;
    magma_malloc_cpu((void**) &thread_id, parallel_threads*sizeof(pthread_t));
    pthread_attr_t thread_attr;

    magma_dbulge_data data_bulge;
    magma_dbulge_data_init(&data_bulge, parallel_threads, n, nb, nbtiles, INgrsiz, Vblksiz, wantz, A, lda, V, ldv, TAU, T, ldt, prog);

    // Set one thread per core
    pthread_attr_init(&thread_attr);
    pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
    pthread_setconcurrency(parallel_threads);

    //timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime();
    #endif

    // Launch threads 1..parallel_threads-1; the calling thread acts as worker 0.
    for (magma_int_t thread = 1; thread < parallel_threads; thread++) {
        magma_dbulge_id_data_init(&(arg[thread]), thread, &data_bulge);
        pthread_create(&thread_id[thread], &thread_attr, magma_dsytrd_sb2st_parallel_section, &arg[thread]);
    }
    magma_dbulge_id_data_init(&(arg[0]), 0, &data_bulge);
    magma_dsytrd_sb2st_parallel_section(&arg[0]);

    // Wait for completion
    for (magma_int_t thread = 1; thread < parallel_threads; thread++) {
        void *exitcodep;
        pthread_join(thread_id[thread], &exitcodep);
    }

    // timing
    #ifdef ENABLE_TIMER
    timeblg = magma_wtime()-timeblg;
    printf(" time BULGE+T = %f\n", timeblg);
    #endif

    magma_free_cpu(thread_id);
    magma_free_cpu(arg);
    magma_free_cpu((void *) prog);
    magma_dbulge_data_destroy(&data_bulge);

    // restore the thread counts saved on entry
    magma_set_omp_numthreads(ompth);
    magma_set_lapack_numthreads(mklth);

    /*================================================
     *  store resulting diag and lower diag d and e
     *  note that d and e are always real
     *================================================*/
    /* Make diagonal and superdiagonal elements real,
     * storing them in d and e */
    /* In real case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to e.
     * When using HouseHolder elimination,
     * the DLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     * */
#ifdef COMPLEX
    if (uplo == MagmaLower) {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = MAGMA_D_REAL( A[i*lda ] );
            e[i] = MAGMA_D_REAL( A[i*lda+1] );
        }
        d[n-1] = MAGMA_D_REAL(A[(n-1)*lda]);
    }
    else { /* MagmaUpper not tested yet */
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = MAGMA_D_REAL( A[i*lda+nb] );
            e[i] = MAGMA_D_REAL( A[i*lda+nb-1] );
        }
        d[n-1] = MAGMA_D_REAL(A[(n-1)*lda+nb]);
    } /* end MagmaUpper */
#else
    if ( uplo == MagmaLower ) {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = A[i*lda];    // diag
            e[i] = A[i*lda+1];  // lower diag
        }
        d[n-1] = A[(n-1)*lda];
    }
    else {
        for (magma_int_t i=0; i < n-1; i++) {
            d[i] = A[i*lda+nb];    // diag
            e[i] = A[i*lda+nb-1];  // lower diag
        }
        d[n-1] = A[(n-1)*lda+nb];
    }
#endif

    return MAGMA_SUCCESS;
}
/* Worker body run by every bulge-chasing thread (including the caller as
 * thread 0).  Unpacks the shared magma_dbulge_data, optionally pins the
 * thread to a core, runs the bulge chasing, and — if wantz > 0 — the
 * computation of the T factors, synchronizing with the other workers via
 * the shared pthread barrier. Always returns 0 (as void*). */
static void *magma_dsytrd_sb2st_parallel_section(void *arg)
{
    magma_int_t my_core_id = ((magma_dbulge_id_data*)arg) -> id;
    magma_dbulge_data* data = ((magma_dbulge_id_data*)arg) -> data;

    // Unpack the shared problem description.
    magma_int_t allcores_num = data -> threads_num;
    magma_int_t n = data -> n;
    magma_int_t nb = data -> nb;
    magma_int_t nbtiles = data -> nbtiles;
    magma_int_t grsiz = data -> grsiz;
    magma_int_t Vblksiz = data -> Vblksiz;
    magma_int_t wantz = data -> wantz;
    double *A = data -> A;
    magma_int_t lda = data -> lda;
    double *V = data -> V;
    magma_int_t ldv = data -> ldv;
    double *TAU = data -> TAU;
    double *T = data -> T;
    magma_int_t ldt = data -> ldt;
    volatile magma_int_t* prog = data -> prog;  // shared progress table
    pthread_barrier_t* myptbarrier = &(data -> myptbarrier);
    //magma_int_t sys_corenbr = 1;

    #ifdef ENABLE_TIMER
    real_Double_t timeB=0.0, timeT=0.0;
    #endif

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    //magma_set_omp_numthreads(1);
    magma_set_lapack_numthreads(1);
    magma_set_omp_numthreads(1);

    /*
    #ifndef MAGMA_NOAFFINITY
    // bind threads
    cpu_set_t set;
    // bind threads
    CPU_ZERO( &set );
    CPU_SET( my_core_id, &set );
    sched_setaffinity( 0, sizeof(set), &set);
    #endif
    magma_set_lapack_numthreads(1);
    magma_set_omp_numthreads(1);
    */

#ifndef MAGMA_NOAFFINITY
//#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    // Save the original affinity mask so it can be restored on exit, then
    // bind this thread to its own core.
    affinity_set original_set;
    affinity_set new_set(my_core_id);
    magma_int_t check = 0;
    magma_int_t check2 = 0;
    // bind threads
    check = original_set.get_affinity();
    if (check == 0) {
        check2 = new_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (single cpu)\n");
    }
    else {
        printf("Error in sched_getaffinity\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    /* compute the Q1 overlapped with the bulge chasing+T.
     * if all_cores_num=1 it call Q1 on GPU and then bulgechasing.
     * otherwise the first thread run Q1 on GPU and
     * the other threads run the bulgechasing.
     * */
    //=========================
    //    bulge chasing
    //=========================
    #ifdef ENABLE_TIMER
    if (my_core_id == 0)
        timeB = magma_wtime();
    #endif

    magma_dtile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, wantz, prog, myptbarrier);
    if (allcores_num > 1) pthread_barrier_wait(myptbarrier);

    #ifdef ENABLE_TIMER
    if (my_core_id == 0) {
        timeB = magma_wtime()-timeB;
        printf(" Finish BULGE timing= %f\n", timeB);
    }
    #endif

    //=========================
    // compute the T's to be used when applying Q2
    //=========================
    if ( wantz > 0 ) {
        #ifdef ENABLE_TIMER
        if (my_core_id == 0)
            timeT = magma_wtime();
        #endif

        magma_dtile_bulge_computeT_parallel(my_core_id, allcores_num, V, ldv, TAU, T, ldt, n, nb, Vblksiz);
        if (allcores_num > 1) pthread_barrier_wait(myptbarrier);

        #ifdef ENABLE_TIMER
        if (my_core_id == 0) {
            timeT = magma_wtime()-timeT;
            printf(" Finish T's timing= %f\n", timeT);
        }
        #endif
    }

#ifndef MAGMA_NOAFFINITY
    // unbind threads: restore the affinity mask saved above
    if (check == 0) {
        check2 = original_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (restore cpu list)\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf=0, cublas_time=0, cpu_perf, cpu_time; double magma_error, cublas_error, magma_error2, cublas_error2; magmaDoubleComplex *h_A, *h_R, *h_Amagma, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *dtau_magma, *dtau_cublas; magmaDoubleComplex **dA_array = NULL; magmaDoubleComplex **dtau_array = NULL; magma_int_t *dinfo_magma, *dinfo_cublas; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magma_int_t column; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% BatchCount M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) |R - Q^H*A|_mag |I - Q^H*Q|_mag |R - Q^H*A|_cub |I - Q^H*Q|_cub\n"); printf("%%============================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N * batchCount; ldda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRT( M, N )) / 1e9 * batchCount; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn * batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Amagma, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( dtau_magma, magmaDoubleComplex, min_mn * batchCount); TESTING_MALLOC_DEV( dtau_cublas, 
magmaDoubleComplex, min_mn * batchCount); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount); TESTING_MALLOC_DEV( dinfo_cublas, magma_int_t, batchCount); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dtau_array, magmaDoubleComplex*, batchCount ); // to determine the size of lwork lwork = -1; lapackf77_zgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork * batchCount); column = N * batchCount; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaFullStr, &M, &column, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, column, h_R, lda, d_A, ldda ); magma_zset_pointer( dA_array, d_A, 1, 0, 0, ldda*N, batchCount, opts.queue ); magma_zset_pointer( dtau_array, dtau_magma, 1, 0, 0, min_mn, batchCount, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); info = magma_zgeqrf_batched(M, N, dA_array, ldda, dtau_array, dinfo_magma, batchCount, opts.queue); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetmatrix( M, column, d_A, ldda, h_Amagma, lda); if (info != 0) { printf("magma_zgeqrf_batched returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using CUBLAS =================================================================== */ /* cublasZgeqrfBatched is only available from CUBLAS v6.5 */ #if CUDA_VERSION >= 6050 magma_zsetmatrix( M, column, h_R, lda, d_A, ldda ); magma_zset_pointer( dA_array, d_A, 1, 0, 0, ldda*N, batchCount, opts.queue ); magma_zset_pointer( dtau_array, dtau_cublas, 1, 0, 0, min_mn, batchCount, opts.queue ); cublas_time = 
magma_sync_wtime( opts.queue ); int cublas_info; // not magma_int_t cublasZgeqrfBatched( opts.handle, M, N, dA_array, ldda, dtau_array, &cublas_info, batchCount); cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; if (cublas_info != 0) { printf("cublasZgeqrfBatched returned error %d: %s.\n", (int) cublas_info, magma_strerror( cublas_info )); } #endif /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { cpu_time = magma_wtime(); // #define BATCHED_DISABLE_PARCPU #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (magma_int_t s=0; s < batchCount; s++) { magma_int_t locinfo; lapackf77_zgeqrf(&M, &N, h_A + s * lda * N, &lda, tau + s * min_mn, h_work + s * lwork, &lwork, &locinfo); if (locinfo != 0) { printf("lapackf77_zgeqrf matrix %d returned error %d: %s.\n", (int) s, (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the MAGMA CUBLAS result compared to LAPACK =================================================================== */ magma_int_t ldq = M; magma_int_t ldr = min_mn; magmaDoubleComplex *Q, *R; double *work; TESTING_MALLOC_CPU( Q, magmaDoubleComplex, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, magmaDoubleComplex, ldr*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); /* check magma result */ magma_error = 0; magma_error2 = 0; 
magma_zgetvector(min_mn*batchCount, dtau_magma, 1, tau, 1); for (int i=0; i < batchCount; i++) { double err, err2; get_QR_error(M, N, min_mn, h_Amagma + i*lda*N, h_R + i*lda*N, lda, tau + i*min_mn, Q, ldq, R, ldr, h_work, lwork, work, &err, &err2); if ( isnan(err) || isinf(err) ) { magma_error = err; break; } magma_error = max( err, magma_error ); magma_error2 = max( err2, magma_error2 ); } /* check cublas result */ cublas_error = 0; cublas_error2 = 0; #if CUDA_VERSION >= 6050 magma_zgetvector(min_mn*batchCount, dtau_magma, 1, tau, 1); magma_zgetmatrix( M, column, d_A, ldda, h_A, lda); for (int i=0; i < batchCount; i++) { double err, err2; get_QR_error(M, N, min_mn, h_A + i*lda*N, h_R + i*lda*N, lda, tau + i*min_mn, Q, ldq, R, ldr, h_work, lwork, work, &err, &err2); if ( isnan(err) || isinf(err) ) { cublas_error = err; break; } cublas_error = max( err, cublas_error ); cublas_error2 = max( err2, cublas_error2 ); } #endif TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; bool okay = (magma_error < tol && magma_error2 < tol); //bool okay_cublas = (cublas_error < tol && cublas_error2 < tol); status += ! okay; printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %15.2e %15.2e %15.2e %15.2e %s\n", (int)batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, magma_error2, cublas_error, cublas_error2, (okay ? 
"ok" : "failed") ); } else { printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) ---\n", (int)batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Amagma); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau_magma ); TESTING_FREE_DEV( dtau_cublas ); TESTING_FREE_DEV( dinfo_magma ); TESTING_FREE_DEV( dinfo_cublas ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dtau_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }