/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR}, dA={Magma_CSR}, dA_SELLP={Magma_CSR}; magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR}, dy={Magma_CSR}, hrefvec={Magma_CSR}, hcheck={Magma_CSR}; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; #ifdef MAGMA_WITH_MKL magma_int_t *pntre=NULL; #endif cusparseHandle_t cusparseHandle = NULL; cusparseMatDescr_t descr = NULL; float c_one = MAGMA_S_MAKE(1.0, 0.0); float c_zero = MAGMA_S_MAKE(0.0, 0.0); float accuracy = 1e-10; #define PRECISION_s #if defined(PRECISION_c) accuracy = 1e-4; #endif #if defined(PRECISION_s) accuracy = 1e-4; #endif magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf("\n# usage: ./run_sspmm" " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n", (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue )); } else { // file-matrix test TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; // m - number of rows for the sparse matrix // n - number of vectors to be multiplied in the SpMM product magma_int_t m, n; m = hA.num_rows; n = 48; // init CPU vectors TESTING_CHECK( magma_svinit( &hx, Magma_CPU, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue )); // init DEV vectors TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue )); // calling MKL with CSR #ifdef MAGMA_WITH_MKL TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) ); pntre[0] = 0; for (j=0; j < m; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT num_vecs = n; MKL_INT *col; TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } // === Call MKL with consecutive SpMVs, using mkl_scsrmv === // warmp up mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); // === Call MKL with blocked SpMVs, using mkl_scsrmm === char transa = 'n'; MKL_INT ldb = n, ldc=n; char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'}; // warm up mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); } end = magma_wtime(); printf( "\n > MKL SpMM : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); magma_free_cpu( row ); magma_free_cpu( col ); row = NULL; col = NULL; #endif // MAGMA_WITH_MKL // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue )); // SpMV on GPU (CSR) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue )); magma_smfree(&dA, queue ); // convert to SELLP and copy to GPU TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue )); TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue )); magma_smfree(&hA_SELLP, queue ); magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res=res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm SELL-P: ok\n"); else printf("%% tester spmm SELL-P: failed\n"); magma_smfree( &hcheck, queue ); magma_smfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); //#ifdef PRECISION_d start = magma_sync_wtime( queue ); TESTING_CHECK( cusparseCreate( &cusparseHandle )); TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) )); TESTING_CHECK( cusparseCreateMatDescr( &descr )); TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); float alpha = c_one; float beta = c_zero; // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) ); for (j=0; j < 10; j++) { cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta, dy.dval, dA.num_cols); } end = magma_sync_wtime( queue ); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm cuSPARSE: ok\n"); else printf("%% tester spmm cuSPARSE: failed\n"); magma_smfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); descr = NULL; cusparseHandle = NULL; //#endif printf("\n\n"); // free CPU memory magma_smfree( &hA, queue ); magma_smfree( &hx, queue ); magma_smfree( &hy, queue ); magma_smfree( &hrefvec, queue ); // free GPU memory magma_smfree( &dx, queue ); magma_smfree( &dy, queue ); magma_smfree( &dA, queue); #ifdef MAGMA_WITH_MKL magma_free_cpu( pntre ); #endif i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_R, *work; magmaDouble_ptr d_A, dwork; double c_neg_one = MAGMA_D_NEG_ONE; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) // need looser bound (3000*eps instead of 30*eps) for tests // TODO: should compute ||I - A*A^{-1}|| / (n*||A||*||A^{-1}||) opts.tolerance = max( 3000., opts.tolerance ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R||_F / (N*||A||_F)\n"); printf("=================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; ldwork = N * magma_get_dgetri_nb( N ); gflops = FLOPS_DGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); lwork = int( MAGMA_D_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, double, lwork ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_PIN( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dwork, double, ldwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); error = lapackf77_dlange( "f", &N, &N, h_A, &lda, rwork ); // norm(A) /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_dsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); magma_dgetrf_gpu( N, N, d_A, 0, ldda, ipiv, opts.queue, &info ); magma_dgetmatrix( N, N, d_A, 0, ldda, h_A, lda, opts.queue ); if ( info != 0 ) printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // check for exact singularity //h_A[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgetri_gpu( N, d_A, 0, ldda, ipiv, dwork, 0, ldwork, opts.queues2, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgetri( &N, h_A, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione ); error = lapackf77_dlange( "f", &N, &N, h_R, &lda, rwork ) / (N*error); printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f) ---\n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
//################################################################################################## static void *magma_capplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_capplyQ_id_data*)arg) -> id; magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; cuFloatComplex *E = data -> E; magma_int_t lde = data -> lde; cuFloatComplex *V = data -> V; magma_int_t ldv = data -> ldv; cuFloatComplex *TAU = data -> TAU; cuFloatComplex *T = data -> T; magma_int_t ldt = data -> ldt; cuFloatComplex *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; real_Double_t timeQcpu=0.0, timeQgpu=0.0; magma_int_t n_cpu = ne - n_gpu; #if defined(SETAFFINITY) cpu_set_t set; CPU_ZERO( &set ); CPU_SET( my_core_id, &set ); sched_setaffinity( 0, sizeof(set), &set) ; #endif if(my_core_id==0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= timeQgpu = magma_wtime(); magma_csetmatrix(n, n_gpu, E, lde, dE, ldde); magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f \n" ,timeQgpu); }else{ //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= if(my_core_id == 1) timeQcpu = magma_wtime(); magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); cuFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ctile_bulge_applyQ('L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); if(my_core_id == 1){ timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f \n" ,timeQcpu); } } // END if my_core_id #if defined(SETAFFINITY) // unbind threads magma_int_t sys_corenbr = 1; sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN); CPU_ZERO( &set ); for(magma_int_t i=0; i<sys_corenbr; i++) CPU_SET( i, &set ); sched_setaffinity( 0, sizeof(set), &set) ; #endif return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing sgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; float *h_A; magmaFloat_ptr d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("%% version %d\n", (int) opts.version ); if ( opts.check == 2 ) { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf("%% M N CPU Gflop/s (sec) GPU Gflop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("%%========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = FLOPS_SGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( opts, M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_sgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_sgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( opts, M, N, h_A, lda ); if ( opts.version == 2 ) { // no pivoting versions, so set ipiv to identity for (magma_int_t i=0; i < min_mn; ++i ) { ipiv[i] = i+1; } } magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_wtime(); if ( opts.version == 1 ) { magma_sgetrf_gpu( M, N, d_A, ldda, ipiv, &info); } else if ( opts.version == 2 ) { magma_sgetrf_nopiv_gpu( M, N, d_A, ldda, &info); } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_sgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_residual( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { magma_sgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( opts, M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zpotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_R; magmaDoubleComplex *d_A; magma_int_t N, n2, lda, ldda, info; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_ZPOTRF( N ) / 1e9; TESTING_MALLOC( h_A, magmaDoubleComplex, n2 ); TESTING_HOSTALLOC( h_R, magmaDoubleComplex, n2 ); TESTING_DEVALLOC( d_A, magmaDoubleComplex, ldda*N ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hpd( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_zpotrf_gpu( opts.uplo, N, d_A, ldda, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zpotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_zpotrf( &opts.uplo, &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_zgetmatrix( N, N, d_A, ldda, h_R, lda ); error = lapackf77_zlange("f", &N, &N, h_A, &lda, work); blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_zlange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e%s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "" : " failed") ); status |= ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE( h_A ); TESTING_HOSTFREE( h_R ); TESTING_DEVFREE( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("%% ngpu %d, distblk %d, NB %d, nstream %d\n", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default n2 = lda*N; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex_ptr dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { // see src/obsolete and magmablas/obsolete printf( "magma_zhetrd_he2hb_mgpu_spec not compiled\n" ); //magma_zhetrd_he2hb_mgpu_spec( // opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, // da, ldda, dT1, opts.nb, opts.ngpu, distblk, // streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); magma_device_sync(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); magma_set_lapack_numthreads(nt); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads(1); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("%%===========================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; magmaDoubleComplex *h_A; magmaDoubleComplex_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_zgetrf_nb( M ); gflops = FLOPS_ZGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], magmaDoubleComplex, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_zgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_zsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_zgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing spotrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A, *h_R; magmaFloat_ptr d_A; magma_int_t N, n2, lda, ldda, info; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; float work[1], error; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("uplo = %s, version = %d\n", lapack_uplo_const(opts.uplo), opts.version ); printf(" N CPU GFlop/s (sec) GPU GFlop/s (sec) ||R_magma - R_lapack||_F / ||R_lapack||_F\n"); printf("========================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = ((N+31)/32)*32; gflops = FLOPS_SPOTRF( N ) / 1e9; TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_PIN( h_R, float, n2 ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ lapackf77_slarnv( &ione, ISEED, &n2, h_A ); magma_smake_hpd( N, h_A, lda ); lapackf77_slacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); magma_ssetmatrix( N, N, h_A, lda, d_A, 0, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); if ( opts.version == 1 ) { magma_spotrf_gpu( opts.uplo, N, d_A, 0, ldda, opts.queue, &info ); } else if ( opts.version == 2 ) { magma_spotrf2_gpu( opts.uplo, N, d_A, 0, ldda, opts.queues2, &info ); } else { printf( "Unknown version %d\n", opts.version ); exit(1); } gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_spotrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.lapack ) { /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); lapackf77_spotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_spotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ magma_sgetmatrix( N, N, d_A, 0, ldda, h_R, lda, opts.queue ); error = lapackf77_slange("f", &N, &N, h_A, &lda, work); blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &N, &N, h_R, &lda, work) / error; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); } else { printf("%5d --- ( --- ) %7.2f (%7.2f) --- \n", (int) N, gpu_perf, gpu_time ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
magma_int_t magma_dpgmres( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par ){ // prepare solver feedback solver_par->solver = Magma_PGMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE, c_mone = MAGMA_D_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; double nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); double *H, *HH, *y, *h1; magma_dmalloc_pinned( &H, (ldh+1)*ldh ); magma_dmalloc_pinned( &y, ldh ); magma_dmalloc_pinned( &HH, ldh*ldh ); magma_dmalloc_pinned( &h1, ldh ); // GPU workspace magma_d_vector r, q, q_t, z, z_t, t; magma_d_vinit( &t, Magma_DEV, dofs, c_zero ); magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z_t, Magma_DEV, dofs, c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; double *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_dmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_dmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_dscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_D_MAKE( nom0, 0. ); magma_dsetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_dcopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/H(1,0) r magma_dscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); // preconditioner // z[k] = M^(-1) q(k) magma_d_applyprecond_left( A, q_t, &t, precond_par ); magma_d_applyprecond_right( A, t, &z_t, precond_par ); magma_dcopy(dofs, z_t.val, 1, z(k-1), 1); // r = A q[k] magma_d_spmv( c_one, A, z_t, c_zero, r ); if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_ddot(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_daxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing dgemv with dnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . r magmablas_dgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_dcopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_dgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef DNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef DNRM2SCALE magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_dnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_ddot(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_D_REAL(H(k+1,k))); } magma_dsetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_dgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1, c_one, x->val, 1); magma_d_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_daxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // RNorm = H[1][0] = || r || RNorm = MAGMA_D_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_d_vfree(&t); magma_d_vfree(&r); magma_d_vfree(&q); magma_d_vfree(&z); magma_d_vfree(&z_t); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_dgmres */
extern "C" magma_int_t magma_sbulge_back_m(magma_int_t nrgpu, magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, float *Z, magma_int_t ldz, float *V, magma_int_t ldv, float *TAU, float *T, magma_int_t ldt, magma_int_t* info) { magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); real_Double_t timeaplQ2=0.0; float f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) // float gpu_cpu_perf = 32; //gpu over cpu performance //#else // float gpu_cpu_perf = 32; // gpu over cpu performance //#endif float perf_temp= .85; float perf_temp2= perf_temp; for (magma_int_t itmp=1; itmp < nrgpu; ++itmp) perf_temp2 *= perf_temp; magma_int_t gpu_cpu_perf = magma_get_sbulge_gcperf(); if (threads > 1) { f = 1. / (1. + (float)(threads-1)/ ((float)gpu_cpu_perf*(1.-perf_temp2)/(1.-perf_temp))); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ //n_gpu = ne; if (n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_sapplyQ_m_data data_applyQ(nrgpu, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt); magma_sapplyQ_m_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_sapplyQ_m_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_sapplyQ_m_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_sapplyQ_m_parallel_section, &arg[thread]); } arg[0] = magma_sapplyQ_m_id_data(0, &data_applyQ); magma_sapplyQ_m_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); /*============================ * use only GPU *==========================*/ } else { magma_sbulge_applyQ_v2_m(nrgpu, MagmaLeft, ne, n, nb, Vblksiz, Z, ldz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_set_lapack_numthreads(mklth); return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing claset Code is very similar to testing_clacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magmaFloatComplex offdiag = MAGMA_C_MAKE( 1.2000, 6.7000 ); magmaFloatComplex diag = MAGMA_C_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, size, lda, ldb, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("==================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldb = lda; ldda = ((M+31)/32)*32; size = lda*N; if ( uplo[iuplo] == MagmaLower || uplo[iuplo] == MagmaUpper ) { // save triangle (with diagonal) // TODO wrong for trapezoid gbytes = sizeof(magmaFloatComplex) * 0.5*N*(N+1) / 1e9; } else { // save entire matrix gbytes = sizeof(magmaFloatComplex) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, magmaFloatComplex, size ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, size ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ); //magmablas_claset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_claset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_claset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_claset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, lda ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work); printf("%4c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsyevd */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, cpu_time; double *h_A, *h_R, *d_R, *h_work; double *w1, *w2; magma_int_t *iwork; magma_int_t N, n2, info, lwork, liwork, lda, ldda, aux_iwork[1]; magma_int_t izero = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double result[3], eps, aux_work[1]; eps = lapackf77_dlamch( "E" ); magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tolulp = opts.tolerance * lapackf77_dlamch("P"); if ( opts.check && opts.jobz == MagmaNoVec ) { fprintf( stderr, "checking results requires vectors; setting jobz=V (option -JV)\n" ); opts.jobz = MagmaVec; } printf(" N CPU Time (sec) GPU Time (sec)\n"); printf("=======================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; n2 = N*N; lda = N; ldda = ((N + 31)/32)*32; // query for workspace sizes magma_dsyevd_gpu( opts.jobz, opts.uplo, N, NULL, ldda, NULL, NULL, lda, aux_work, -1, aux_iwork, -1, &info ); lwork = (magma_int_t) aux_work[0]; liwork = aux_iwork[0]; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, double, N*lda ); TESTING_MALLOC_CPU( w1, double, N ); TESTING_MALLOC_CPU( w2, double, N ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); TESTING_MALLOC_PIN( h_R, double, N*lda ); TESTING_MALLOC_PIN( h_work, double, lwork ); TESTING_MALLOC_DEV( d_R, double, N*ldda ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); magma_dsetmatrix( N, N, h_A, lda, d_R, ldda ); /* warm up run */ if ( opts.warmup ) { magma_dsyevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsyevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dsetmatrix( N, N, h_A, lda, d_R, ldda ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dsyevd_gpu( opts.jobz, opts.uplo, N, d_R, ldda, w1, h_R, lda, h_work, lwork, iwork, liwork, &info ); gpu_time = magma_wtime() - gpu_time; if (info != 0) printf("magma_dsyevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); if ( opts.check ) { /* ===================================================================== Check the results following the LAPACK's [zcds]drvst routine. A is factored as A = U S U' and the following 3 tests computed: (1) | A - U S U' | / ( |A| N ) (2) | I - U'U | / ( N ) (3) | S(with U) - S(w/o U) | / | S | =================================================================== */ double temp1, temp2; // tau=NULL is unused since itype=1 magma_dgetmatrix( N, N, d_R, ldda, h_R, lda ); lapackf77_dsyt21( &ione, &opts.uplo, &N, &izero, h_A, &lda, w1, h_work, h_R, &lda, h_R, &lda, NULL, h_work, &result[0] ); magma_dsetmatrix( N, N, h_A, lda, d_R, ldda ); magma_dsyevd_gpu( MagmaNoVec, opts.uplo, N, d_R, ldda, w2, h_R, lda, h_work, lwork, iwork, liwork, &info ); if (info != 0) printf("magma_dsyevd_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); temp1 = temp2 = 0; for( int j=0; j<N; j++ ) { temp1 = max(temp1, absv(w1[j])); temp1 = max(temp1, absv(w2[j])); temp2 = max(temp2, absv(w1[j]-w2[j])); } result[2] = temp2 / (((double)N)*temp1); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dsyevd( &opts.jobz, &opts.uplo, &N, h_A, &lda, w2, h_work, &lwork, iwork, &liwork, &info ); cpu_time = magma_wtime() - cpu_time; if (info != 0) printf("lapackf77_dsyevd returned error %d: %s.\n", (int) info, magma_strerror( info )); printf("%5d %7.2f %7.2f\n", (int) N, cpu_time, gpu_time); } else { printf("%5d --- %7.2f\n", (int) N, gpu_time); } /* ===================================================================== Print execution time =================================================================== */ if ( opts.check ) { printf("Testing the factorization A = U S U' for correctness:\n"); printf("(1) | A - U S U' | / (|A| N) = %8.2e%s\n", result[0]*eps, (result[0]*eps < tol ? "" : " failed") ); printf("(2) | I - U'U | / N = %8.2e%s\n", result[1]*eps, (result[1]*eps < tol ? "" : " failed") ); printf("(3) | S(w/ U) - S(w/o U) | / |S| = %8.2e%s\n\n", result[2] , (result[2] < tolulp ? "" : " failed") ); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( w1 ); TESTING_FREE_CPU( w2 ); TESTING_FREE_CPU( iwork ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_DEV( d_R ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { TESTING_INIT(); magma_queue_t queue; magma_queue_create( /*devices[ opts->device ],*/ &queue ); magma_c_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; magma_int_t *pntre; magmaFloatComplex c_one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex c_zero = MAGMA_C_MAKE(0.0, 0.0); magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf( "\n# usage: ./run_cspmv" " [ --blocksize %d --alignment %d (for SELLP) ]" " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); magma_cm_5stencil( laplace_size, &hA, queue ); } else { // file-matrix test magma_c_csr_mtx( &hA, argv[i], queue ); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) hA.num_rows,(int) hA.num_cols,(int) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; magma_c_vector hx, hy, dx, dy, hrefvec, hcheck; // init CPU vectors magma_c_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue ); magma_c_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue ); // init DEV vectors magma_c_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); #ifdef MAGMA_WITH_MKL // calling MKL with CSR pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) ); pntre[0] = 0; for (j=0; j<hA.num_rows; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT *col; TESTING_MALLOC_CPU( col, MKL_INT, nnz ); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_MALLOC_CPU( row, MKL_INT, num_rows ); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } start = magma_wtime(); for (j=0; j<10; j++ ) { mkl_ccsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); TESTING_FREE_CPU( row ); TESTING_FREE_CPU( col ); free(pntre); #endif // MAGMA_WITH_MKL // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); // SpMV on GPU (CSR) -- this is the reference! start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA, queue ); magma_c_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue ); // convert to ELL and copy to GPU magma_c_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue ); magma_c_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_ELL, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (ELL) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_ELL, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard ELL).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA_ELL, queue ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); if ( res < .000001 ) printf("# tester spmv ELL: ok\n"); else printf("# tester spmv ELL: failed\n"); magma_c_vfree( &hcheck, queue ); // convert to SELLP and copy to GPU magma_c_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ); magma_c_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_SELLP, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv SELL-P: ok\n"); else printf("# tester spmv SELL-P: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_mfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); magmaFloatComplex alpha = c_one; magmaFloatComplex beta = c_zero; magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseCcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE CSR\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); cusparseMatDescr_t descrA; cusparseStatus = cusparseCreateMatDescr(&descrA); if (cusparseStatus != 0) printf("error\n"); cusparseHybMat_t hybA; cusparseStatus = cusparseCreateHybMat( &hybA ); if (cusparseStatus != 0) printf("error\n"); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE CSR: ok\n"); else printf("# tester spmv cuSPARSE CSR: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); cusparseCcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols, descrA, dA.dval, dA.drow, dA.dcol, hybA, 0, CUSPARSE_HYB_PARTITION_AUTO); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseChybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA, hybA, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE HYB\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (HYB).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE HYB: ok\n"); else printf("# tester spmv cuSPARSE HYB: failed\n"); magma_c_vfree( &hcheck, queue ); cusparseDestroyMatDescr( descrA ); cusparseDestroyHybMat( hybA ); cusparseDestroy( cusparseHandle ); magma_c_mfree(&dA, queue ); printf("\n\n"); // free CPU memory magma_c_mfree(&hA, queue ); magma_c_vfree(&hx, queue ); magma_c_vfree(&hy, queue ); magma_c_vfree(&hrefvec, queue ); // free GPU memory magma_c_vfree(&dx, queue ); magma_c_vfree(&dy, queue ); i++; } magma_queue_destroy( queue ); TESTING_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slaset_band Code is very similar to testing_slacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); #define h_A(i_,j_) (h_A + (i_) + (j_)*lda) #define d_A(i_,j_) (d_A + (i_) + (j_)*ldda) real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; float *d_A; float offdiag = MAGMA_S_MAKE( 1.2000, 6.7000 ); float diag = MAGMA_S_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, nb, cnt, size, lda, ldb, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); nb = (opts.nb == 0 ? 32 : opts.nb); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("K = nb = %d\n", (int) nb ); printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("==================================================================\n"); for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { int inset = 0; M = opts.msize[itest] + 2*inset; N = opts.nsize[itest] + 2*inset; lda = M; ldb = lda; ldda = ((M+31)/32)*32; size = lda*N; TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); /* ===================================================================== Performs operation on CPU Also count number of elements touched. =================================================================== */ cpu_time = magma_wtime(); cnt = 0; for( int j=inset; j < N-inset; ++j ) { for( int k=0; k < nb; ++k ) { // set k-th sub- or super-diagonal if ( k == 0 && j < M-inset ) { *h_A(j,j) = diag; cnt += 1; } else if ( uplo[iuplo] == MagmaLower && j+k < M-inset ) { *h_A(j+k,j) = offdiag; cnt += 1; } else if ( uplo[iuplo] == MagmaUpper && j-k >= inset && j-k < M-inset ) { *h_A(j-k,j) = offdiag; cnt += 1; } } } gbytes = cnt / 1e9; cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); int mm = M - 2*inset; int nn = N - 2*inset; magmablas_slaset_band( uplo[iuplo], mm, nn, nb, offdiag, diag, d_A(inset,inset), ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda ); //printf( "h_R=" ); magma_sprint( M, N, h_R, lda ); //printf( "h_A=" ); magma_sprint( M, N, h_A, lda ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work); printf("%4c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cbulge_back(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, magmaFloatComplex *Z, magma_int_t ldz, magmaFloatComplex *dZ, magma_int_t lddz, magmaFloatComplex *V, magma_int_t ldv, magmaFloatComplex *TAU, magmaFloatComplex *T, magma_int_t ldt, magma_int_t* info) { magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); real_Double_t timeaplQ2=0.0; float f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) //float gpu_cpu_perf = 50; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) //float gpu_cpu_perf = 16; // gpu over cpu performance //100% ev // SandyB. - Fermi (M2090) //#else // float gpu_cpu_perf = 27.5; // gpu over cpu performance //100% ev // Westmere - Fermi (M2090) //float gpu_cpu_perf = 37; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) // float gpu_cpu_perf = 130; // gpu over cpu performance //100% ev // Bulldozer - Kepler (K20X) //#endif magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf(); if (threads > 1) { f = 1. / (1. + (float)(threads-1)/ ((float)gpu_cpu_perf) ); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //n_gpu=ne; //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if (n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_capplyQ_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]); } arg[0] = magma_capplyQ_id_data(0, &data_applyQ); magma_capplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ } else { magma_csetmatrix(n, ne, Z, ldz, dZ, lddz); magma_cbulge_applyQ_v2(MagmaLeft, ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_set_lapack_numthreads(mklth); return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slacpy_batched Code is very similar to testing_sgeadd_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_B; magmaFloat_ptr d_A, d_B; float **hAarray, **hBarray, **dAarray, **dBarray; magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("%% mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf("%% M N ntile CPU Gflop/s (ms) GPU Gflop/s (ms) check\n"); printf("%%================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gbytes = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, float, lda *N ); TESTING_MALLOC_CPU( h_B, float, lda *N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_B, float, ldda*N ); TESTING_MALLOC_CPU( hAarray, float*, ntile ); TESTING_MALLOC_CPU( hBarray, float*, ntile ); TESTING_MALLOC_DEV( dAarray, float*, ntile ); TESTING_MALLOC_DEV( dBarray, float*, ntile ); lapackf77_slarnv( &ione, ISEED, &size, h_A ); lapackf77_slarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); magma_ssetmatrix( M, N, h_B, lda, d_B, ldda ); // setup pointers for( magma_int_t tile = 0; tile < ntile; ++tile ) { magma_int_t offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(float*), hAarray, 1, dAarray, 1 ); magma_setvector( ntile, sizeof(float*), hBarray, 1, dBarray, 1 ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_slacpy_batched( MagmaFull, mb, nb, dAarray, ldda, dBarray, ldda, ntile, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( magma_int_t tile = 0; tile < ntile; ++tile ) { magma_int_t offset = tile*mstride + tile*nstride*lda; lapackf77_slacpy( MagmaFullStr, &mb, &nb, &h_A[offset], &lda, &h_B[offset], &lda ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_B, ldda, h_A, lda ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_slange("f", &M, &N, h_B, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_CPU( hAarray ); TESTING_FREE_CPU( hBarray ); TESTING_FREE_DEV( dAarray ); TESTING_FREE_DEV( dBarray ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
//################################################################################################## static void *magma_capplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_capplyQ_id_data*)arg) -> id; magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; magmaFloatComplex *E = data -> E; magma_int_t lde = data -> lde; magmaFloatComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaFloatComplex *TAU = data -> TAU; magmaFloatComplex *T = data -> T; magma_int_t ldt = data -> ldt; magmaFloatComplex *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_csetmatrix(n, n_gpu, E, lde, dE, ldde); magma_cbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ctile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slaset Code is very similar to testing_slacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; magmaFloat_ptr d_A; float offdiag, diag; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("%% uplo M N offdiag diag CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("%%===================================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { for( int ival = 0; ival < 4; ++ival ) { // test combinations of zero & non-zero: // ival offdiag diag // 0 0 0 // 1 0 3.14 // 2 1.23 0 // 3 1.23 3.14 offdiag = MAGMA_S_MAKE( 1.2345, 6.7890 ) * (ival / 2); diag = MAGMA_S_MAKE( 3.1415, 2.7183 ) * (ival % 2); M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = magma_roundup( M, opts.align ); size = lda*N; if ( uplo[iuplo] == MagmaLower ) { // save lower trapezoid (with diagonal) if ( M > N ) { gbytes = sizeof(float) * (1.*M*N - 0.5*N*(N-1)) / 1e9; } else { gbytes = sizeof(float) * 0.5*M*(M+1) / 1e9; } } else if ( uplo[iuplo] == MagmaUpper ) { // save upper trapezoid (with diagonal) if ( N > M ) { gbytes = sizeof(float) * (1.*M*N - 0.5*M*(M-1)) / 1e9; } else { gbytes = sizeof(float) * 0.5*N*(N+1) / 1e9; } } else { // save entire matrix gbytes = sizeof(float) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_ssetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); //magmablas_slaset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda, opts.queue ); // inset by 1 row & col magmablas_slaset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_slaset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_slaset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_sprint( M, N, h_A, lda ); printf( "dA=" ); magma_sprint_gpu( M, N, d_A, ldda ); } /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda, opts.queue ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5s %5d %5d %9.4f %6.4f %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, real(offdiag), real(diag), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zungqr_gpu */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double Anorm, error, work[1]; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *hA, *hR, *tau, *h_work; magmaDoubleComplex_ptr dA, dT; magma_int_t m, n, k; magma_int_t n2, lda, ldda, lwork, min_mn, nb, info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); opts.lapack |= opts.check; // check (-c) implies lapack (-l) printf(" m n k CPU GFlop/s (sec) GPU GFlop/s (sec) ||R|| / ||A||\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; if ( m < n || n < k ) { printf( "%5d %5d %5d skipping because m < n or n < k\n", (int) m, (int) n, (int) k ); continue; } lda = m; ldda = ((m + 31)/32)*32; n2 = lda*n; min_mn = min(m, n); nb = magma_get_zgeqrf_nb( m ); lwork = (m + 2*n+nb)*nb; gflops = FLOPS_ZUNGQR( m, n, k ) / 1e9; TESTING_MALLOC_PIN( hA, magmaDoubleComplex, lda*n ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( hR, magmaDoubleComplex, lda*n ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn ); TESTING_MALLOC_DEV( dA, magmaDoubleComplex, ldda*n ); TESTING_MALLOC_DEV( dT, magmaDoubleComplex, ( 2*min_mn + ((n + 31)/32)*32 )*nb ); lapackf77_zlarnv( &ione, ISEED, &n2, hA ); lapackf77_zlacpy( MagmaFullStr, &m, &n, hA, &lda, hR, &lda ); Anorm = lapackf77_zlange("f", &m, &n, hA, &lda, work ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ // first, get QR factors in both hA and dA // okay that magma_zgeqrf_gpu has special structure for R; R isn't used here. magma_zsetmatrix( m, n, hA, lda, dA, ldda ); magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info ); if (info != 0) printf("magma_zgeqrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_zgetmatrix( m, n, dA, ldda, hA, lda ); gpu_time = magma_wtime(); magma_zungqr_gpu( m, n, k, dA, ldda, tau, dT, nb, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zungqr_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get dA back to the CPU to compare with the CPU result. magma_zgetmatrix( m, n, dA, ldda, hR, lda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_zungqr( &m, &n, &k, hA, &lda, tau, h_work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zungqr returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute relative error |R|/|A| := |Q_magma - Q_lapack|/|A| blasf77_zaxpy( &n2, &c_neg_one, hA, &ione, hR, &ione ); error = lapackf77_zlange("f", &m, &n, hR, &lda, work) / Anorm; bool okay = (error < tol); status += ! okay; printf("%5d %5d %5d %7.1f (%7.2f) %7.1f (%7.2f) %8.2e %s\n", (int) m, (int) n, (int) k, cpu_perf, cpu_time, gpu_perf, gpu_time, error, (okay ? "ok" : "failed")); } else { printf("%5d %5d %5d --- ( --- ) %7.1f (%7.2f) --- \n", (int) m, (int) n, (int) k, gpu_perf, gpu_time ); } TESTING_FREE_PIN( hA ); TESTING_FREE_PIN( h_work ); TESTING_FREE_CPU( hR ); TESTING_FREE_CPU( tau ); TESTING_FREE_DEV( dA ); TESTING_FREE_DEV( dT ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_ssytrd_sb2st(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, float *A, magma_int_t lda, float *D, float *E, float *V, magma_int_t ldv, float *TAU, magma_int_t compT, float *T, magma_int_t ldt) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= Arguments ========= THREADS (input) INTEGER Specifies the number of pthreads used. THREADS > 0 UPLO (input) CHARACTER*1 = 'U': Upper triangles of A is stored; = 'L': Lower triangles of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NB (input) INTEGER The order of the band matrix A. N >= NB >= 0. VBLKSIZ (input) INTEGER The size of the block of householder vectors applied at once. A (input/workspace) REAL array, dimension (LDA, N) On entry the band matrix stored in the following way: LDA (input) INTEGER The leading dimension of the array A. LDA >= 2*NB. D (output) DOUBLE array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) DOUBLE array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. V (output) REAL array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. LDV (input) INTEGER The leading dimension of V. LDV > NB + VBLKSIZ + 1 TAU (output) REAL dimension(BLKCNT, VBLKSIZ) ??? COMPT (input) INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed T (output) REAL dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced LDT (input) INTEGER The leading dimension of T. LDT > Vblksiz INFO (output) INTEGER ???????????????????????????????????????????????????????????????????????????????????? = 0: successful exit ===================================================================== */ #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif //char uplo_[2] = {uplo, 0}; magma_int_t mklth = threads; magma_int_t INgrsiz=1; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t nbtiles = magma_ceildiv(n, nb); memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(float)); memset(TAU, 0, blkcnt*Vblksiz*sizeof(float)); memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(float)); magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t)); memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t)); magma_sbulge_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_sbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_setlapack_numthreads(1); magma_sbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_sbulge_id_data(thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_ssytrd_sb2st_parallel_section, &arg[thread]); } arg[0] = magma_sbulge_id_data(0, &data_bulge); magma_ssytrd_sb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f \n" ,timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu(prog); magma_setlapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag D and E * note that D and E are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in D and E */ /* In real case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to E. * When using HouseHolder elimination, * the SLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #if defined(PRECISION_z) || defined(PRECISION_c) if(uplo==MagmaLower){ for (magma_int_t i=0; i < n-1 ; i++) { D[i] = MAGMA_S_REAL(A[i*lda ]); E[i] = MAGMA_S_REAL(A[i*lda+1]); } D[n-1] = MAGMA_S_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i<n-1; i++) { D[i] = MAGMA_S_REAL(A[i*lda+nb]); E[i] = MAGMA_S_REAL(A[i*lda+nb-1]); } D[n-1] = MAGMA_S_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if( uplo == MagmaLower ){ for (magma_int_t i=0; i < n-1; i++) { D[i] = A[i*lda]; // diag E[i] = A[i*lda+1]; //lower diag } D[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { D[i] = A[i*lda+nb]; // diag E[i] = A[i*lda+nb-1]; //lower diag } D[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlange */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex *d_A; double *d_work; magma_int_t M, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); // Only one norm supported for now, but leave this here for future support // of different norms. See similar code in testing_zlanhe.cpp. magma_norm_t norm[] = { MagmaInfNorm }; printf(" M N norm CPU GByte/s (ms) GPU GByte/s (ms) error \n"); printf("=====================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 1; ++inorm ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; n2 = lda*N; ldda = roundup( M, opts.pad ); // read whole matrix gbytes = M*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, M ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, M ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( M, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlange( norm[inorm], M, N, d_A, ldda, d_work ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma < 0) printf("magmablas_zlange returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); norm_lapack = lapackf77_zlange( lapack_norm_const(norm[inorm]), &M, &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) printf("lapackf77_zlange returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); /* ===================================================================== Check the result compared to LAPACK =================================================================== */ if ( norm[inorm] == MagmaMaxNorm ) error = fabs( norm_magma - norm_lapack ); else error = fabs( norm_magma - norm_lapack ) / norm_lapack; printf("%5d %5d %4c %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) M, (int) N, lapacke_norm_const(norm[inorm]), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error < tol ? "ok" : "failed") ); status += ! (error < tol); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); }} // end inorm, iter if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
static void *magma_ssytrd_sb2st_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_sbulge_id_data*)arg) -> id; magma_sbulge_data* data = ((magma_sbulge_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t nb = data -> nb; magma_int_t nbtiles = data -> nbtiles; magma_int_t grsiz = data -> grsiz; magma_int_t Vblksiz = data -> Vblksiz; magma_int_t compT = data -> compT; float *A = data -> A; magma_int_t lda = data -> lda; float *V = data -> V; magma_int_t ldv = data -> ldv; float *TAU = data -> TAU; float *T = data -> T; magma_int_t ldt = data -> ldt; volatile magma_int_t* prog = data -> prog; pthread_barrier_t* barrier = &(data -> barrier); //magma_int_t sys_corenbr = 1; #ifdef ENABLE_TIMER real_Double_t timeB=0.0, timeT=0.0; #endif // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_setlapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if(compT==1) { /* compute the Q1 overlapped with the bulge chasing+T. * if all_cores_num=1 it call Q1 on GPU and then bulgechasing. * otherwise the first thread run Q1 on GPU and * the other threads run the bulgechasing. * */ if(allcores_num==1) { //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER timeB = magma_wtime(); #endif magma_stile_bulge_parallel(0, 1, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); #ifdef ENABLE_TIMER timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f \n" ,timeB); #endif //========================= // compute the T's to be used when applying Q2 //========================= #ifdef ENABLE_TIMER timeT = magma_wtime(); #endif magma_stile_bulge_computeT_parallel(0, 1, V, ldv, TAU, T, ldt, n, nb, Vblksiz); #ifdef ENABLE_TIMER timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f \n" ,timeT); #endif }else{ // allcore_num > 1 magma_int_t id = my_core_id; magma_int_t tot = allcores_num; //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if(id == 0) timeB = magma_wtime(); #endif magma_stile_bulge_parallel(id, tot, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(id == 0){ timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f \n" ,timeB); } #endif //========================= // compute the T's to be used when applying Q2 //========================= #ifdef ENABLE_TIMER if(id == 0) timeT = magma_wtime(); #endif magma_stile_bulge_computeT_parallel(id, tot, V, ldv, TAU, T, ldt, n, nb, Vblksiz); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (id == 0){ timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f \n" ,timeT); } #endif } // allcore == 1 }else{ // WANTZ = 0 //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if(my_core_id == 0) timeB = magma_wtime(); #endif magma_stile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(my_core_id == 0){ timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f \n" ,timeB); } #endif } // WANTZ > 0 #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0){ check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetri */ int main( int argc, char** argv ) { TESTING_INIT(); // constants const double c_zero = MAGMA_D_ZERO; const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A, *h_Ainv, *h_R, *work; magmaDouble_ptr d_A, dwork; magma_int_t N, n2, lda, ldda, info, lwork, ldwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; double tmp; double error, rwork[1]; magma_int_t *ipiv; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% N CPU Gflop/s (sec) GPU Gflop/s (sec) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default ldwork = N * magma_get_dgetri_nb( N ); gflops = FLOPS_DGETRI( N ) / 1e9; // query for workspace size lwork = -1; lapackf77_dgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_D_REAL( tmp )); TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_CPU( work, double, lwork ); TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_Ainv, double, n2 ); TESTING_MALLOC_CPU( h_R, double, n2 ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( dwork, double, ldwork ); /* Initialize the matrix */ lapackf77_dlarnv( &ione, ISEED, &n2, h_A ); /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */ magma_dsetmatrix( N, N, h_A, lda, d_A, ldda, opts.queue ); magma_dgetrf_gpu( N, N, d_A, ldda, ipiv, &info ); magma_dgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); if (info != 0) { printf("magma_dgetrf_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } // check for exact singularity //h_Ainv[ 10 + 10*lda ] = MAGMA_D_MAKE( 0.0, 0.0 ); //magma_dsetmatrix( N, N, h_Ainv, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_dgetri_gpu( N, d_A, ldda, ipiv, dwork, ldwork, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) { printf("magma_dgetri_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_dgetri( &N, h_Ainv, &lda, ipiv, work, &lwork, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_dgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } printf( "%5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf( "%5d --- ( --- ) %7.2f (%7.2f)", (int) N, gpu_perf, gpu_time ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_dgetmatrix( N, N, d_A, ldda, h_Ainv, lda, opts.queue ); // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond; normA = lapackf77_dlange( "1", &N, &N, h_A, &lda, rwork ); normAinv = lapackf77_dlange( "1", &N, &N, h_Ainv, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; error = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_dlaset( "full", &N, &N, &c_zero, &c_one, h_R, &lda ); blasf77_dgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A, &lda, h_Ainv, &lda, &c_one, h_R, &lda ); error = lapackf77_dlange( "1", &N, &N, h_R, &lda, rwork ); error = error * rcond / N; } bool okay = (error < tol); status += ! okay; printf( " %8.2e %s\n", error, (okay ? "ok" : "failed")); } else { printf( "\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dwork ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgels */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; float gpu_error, cpu_error, error, Anorm, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *h_work, tmp[1]; magmaFloatComplex *d_A, *d_B; magma_int_t M, N, size, nrhs, lda, ldb, ldda, lddb, min_mn, max_mn, nb, info; magma_int_t lworkgpu, lhwork, lhwork2; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t status = 0; float tol = opts.tolerance * lapackf77_slamch("E"); nrhs = opts.nrhs; printf(" ||b-Ax|| / (N||A||) ||dx-x||/(N||A||)\n"); printf(" M N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) CPU GPU \n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; if ( M < N ) { printf( "%5d %5d %5d skipping because M < N is not yet supported.\n", (int) M, (int) N, (int) nrhs ); continue; } min_mn = min(M, N); max_mn = max(M, N); lda = M; ldb = max_mn; size = lda*N; ldda = ((M+31)/32)*32; lddb = ((max_mn+31)/32)*32; nb = magma_get_cgeqrf_nb(M); gflops = (FLOPS_CGEQRF( M, N ) + FLOPS_CGEQRS( M, N, nrhs )) / 1e9; lworkgpu = (M - N + nb)*(nrhs + nb) + nrhs*nb; // query for workspace size lhwork = -1; lapackf77_cgeqrf(&M, &N, NULL, &M, NULL, tmp, &lhwork, &info); lhwork2 = (magma_int_t) MAGMA_C_REAL( tmp[0] ); lhwork = -1; lapackf77_cunmqr( MagmaLeftStr, MagmaConjTransStr, &M, &nrhs, &min_mn, NULL, &lda, NULL, NULL, &ldb, tmp, &lhwork, &info); lhwork = (magma_int_t) MAGMA_C_REAL( tmp[0] ); lhwork = max( max( lhwork, lhwork2 ), lworkgpu ); TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, lda*N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*nrhs ); TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*nrhs ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda ); // make random RHS size = M*nrhs; lapackf77_clarnv( &ione, ISEED, &size, h_B ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); // make consistent RHS //size = N*nrhs; //lapackf77_clarnv( &ione, ISEED, &size, h_X ); //blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, // &c_one, h_A, &lda, // h_X, &ldb, // &c_zero, h_B, &ldb ); //lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, nrhs, h_B, ldb, d_B, lddb ); gpu_time = magma_wtime(); magma_cgels3_gpu( MagmaNoTrans, M, N, nrhs, d_A, ldda, d_B, lddb, h_work, lworkgpu, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgels3_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); // Get the solution in h_X magma_cgetmatrix( N, nrhs, d_B, lddb, h_X, ldb ); // compute the residual blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A, &lda, h_X, &ldb, &c_one, h_R, &ldb); Anorm = lapackf77_clange("f", &M, &N, h_A, &lda, work); /* ===================================================================== Performs operation using LAPACK =================================================================== */ lapackf77_clacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_cgels( MagmaNoTransStr, &M, &N, &nrhs, h_A, &lda, h_X, &ldb, h_work, &lhwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgels returned error %d: %s.\n", (int) info, magma_strerror( info )); blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N, &c_neg_one, h_A2, &lda, h_X, &ldb, &c_one, h_B, &ldb); cpu_error = lapackf77_clange("f", &M, &nrhs, h_B, &ldb, work) / (min_mn*Anorm); gpu_error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); // error relative to LAPACK size = M*nrhs; blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_clange("f", &M, &nrhs, h_R, &ldb, work) / (min_mn*Anorm); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %8.2e", (int) M, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, cpu_error, gpu_error, error ); if ( M == N ) { printf( " %s\n", (gpu_error < tol && error < tol ? "ok" : "failed")); status += ! (gpu_error < tol && error < tol); } else { printf( " %s\n", (error < tol ? "ok" : "failed")); status += ! (error < tol); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_R ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; gflops = FLOPS_CGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_cgetrf( M, N, h_A, lda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlag2s and slag2d */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double error, work[1]; float serror, swork[1]; double c_neg_one = MAGMA_D_NEG_ONE; float s_neg_one = MAGMA_S_NEG_ONE; magma_int_t ione = 1; magma_int_t m, n, lda, ldda, size, info; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; float *SA, *SR; double *A, *R; magmaFloat_ptr dSA; magmaDouble_ptr dA; magma_opts opts; parse_opts( argc, argv, &opts ); printf("func M N CPU GB/s (ms) GPU GB/s (ms) ||R||_F\n"); printf("=====================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { m = opts.msize[itest]; n = opts.nsize[itest]; lda = m; ldda = ((m+31)/32)*32; // m*n double-real loads and m*n single-real stores (and vice-versa for slag2d) gbytes = (real_Double_t) m*n * (sizeof(double) + sizeof(float)) / 1e9; size = ldda*n; // ldda >= lda TESTING_MALLOC_CPU( SA, float, size ); TESTING_MALLOC_CPU( A, double, size ); TESTING_MALLOC_CPU( SR, float, size ); TESTING_MALLOC_CPU( R, double, size ); TESTING_MALLOC_DEV( dSA, float, size ); TESTING_MALLOC_DEV( dA, double, size ); lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( m, n, A, lda, dA, 0, ldda, opts.queue ); magma_ssetmatrix( m, n, SA, lda, dSA, 0, ldda, opts.queue ); /* ===================================================================== Performs operation using LAPACK dlag2s =================================================================== */ cpu_time = magma_wtime(); lapackf77_dlag2s( &m, &n, A, &lda, SA, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_dlag2s returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA dlag2s =================================================================== */ gpu_time = magma_sync_wtime(0); magmablas_dlag2s( m, n, dA, 0, ldda, dSA, 0, ldda, opts.queue, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_dlag2s returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_sgetmatrix( m, n, dSA, 0, ldda, SR, lda, opts.queue ); /* ===================================================================== compute error |SA_magma - SA_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_saxpy( &size, &s_neg_one, SA, &ione, SR, &ione ); serror = lapackf77_slange( "Fro", &m, &n, SR, &lda, swork ); printf( "dlag2s %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., serror, (serror == 0 ? "ok" : "failed") ); status += ! (serror == 0); /* ===================================================================== Reset matrices =================================================================== */ lapackf77_dlarnv( &ione, ISEED, &size, A ); lapackf77_slarnv( &ione, ISEED, &size, SA ); magma_dsetmatrix( m, n, A, lda, dA, 0, ldda, opts.queue ); magma_ssetmatrix( m, n, SA, lda, dSA, 0, ldda, opts.queue ); /* ===================================================================== Performs operation using LAPACK slag2d =================================================================== */ cpu_time = magma_wtime(); lapackf77_slag2d( &m, &n, SA, &lda, A, &lda, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (info != 0) printf("lapackf77_slag2d returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ==================================================================== Performs operation using MAGMA slag2d =================================================================== */ magma_ssetmatrix( m, n, SA, lda, dSA, 0, ldda, opts.queue ); gpu_time = magma_sync_wtime(0); magmablas_slag2d( m, n, dSA, 0, ldda, dA, 0, ldda, opts.queue, &info ); gpu_time = magma_sync_wtime(0) - gpu_time; gpu_perf = gbytes / gpu_time; if (info != 0) printf("magmablas_slag2d returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix( m, n, dA, 0, ldda, R, lda, opts.queue ); /* ===================================================================== compute error |A_magma - A_lapack| should be zero if both are IEEE compliant =================================================================== */ blasf77_daxpy( &size, &c_neg_one, A, &ione, R, &ione ); error = lapackf77_dlange( "Fro", &m, &n, R, &lda, work ); printf( "slag2d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) m, (int) n, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error == 0 ? "ok" : "failed") ); status += ! (error == 0); TESTING_FREE_CPU( SA ); TESTING_FREE_CPU( A ); TESTING_FREE_CPU( SR ); TESTING_FREE_CPU( R ); TESTING_FREE_DEV( dSA ); TESTING_FREE_DEV( dA ); printf( "\n" ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, cuFloatComplex *Z, magma_int_t ldz, cuFloatComplex *dZ, magma_int_t lddz, cuFloatComplex *V, magma_int_t ldv, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t ldt, magma_int_t* info) { magma_int_t mklth = threads; float timeaplQ2=0.0; #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif float f= 1.; magma_int_t n_gpu = ne; if(threads>40){ f = 0.5; n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>10){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.68; #else f = 0.72; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>5){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.82; #else f = 0.86; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>1){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.96; #else f = 0.96; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if(n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_capplyQ_id_data* arg = new magma_capplyQ_id_data[threads]; pthread_t* thread_id = new pthread_t[threads]; pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]); } arg[0] = magma_capplyQ_id_data(0, &data_applyQ); magma_capplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } delete[] thread_id; delete[] arg; magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ }else{ magma_csetmatrix(n, ne, Z, ldz, dZ, lddz); magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; #if defined(USEMKL) mkl_set_num_threads(mklth); #endif #if defined(USEACML) omp_set_num_threads(mklth); #endif return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctrsm */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time=0, cublas_perf, cublas_time, cpu_perf=0, cpu_time=0; float magma_error, cublas_error, work[1]; magma_int_t M, N, info; magma_int_t Ak; magma_int_t sizeA, sizeB; magma_int_t lda, ldb, ldda, lddb; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t *ipiv; magmaFloatComplex *h_A, *h_B, *h_Bcublas, *h_Bmagma, *h_B1, *h_X1, *h_X2; magmaFloatComplex *d_A, *d_B; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("side = %s, uplo = %s, transA = %s, diag = %s \n", lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag) ); printf(" M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("==================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; gflops = FLOPS_CTRSM(opts.side, M, N) / 1e9; if ( opts.side == MagmaLeft ) { lda = M; Ak = M; } else { lda = N; Ak = N; } ldb = M; ldda = ((lda+31)/32)*32; lddb = ((ldb+31)/32)*32; sizeA = lda*Ak; sizeB = ldb*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_B1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X1, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_X2, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bcublas, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( h_Bmagma, magmaFloatComplex, ldb*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, Ak ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*N ); /* Initialize the matrices */ /* Factor A into LU to get well-conditioned triangular matrix. * Copy L to U, since L seems okay when used with non-unit diagonal * (i.e., from U), while U fails when used with unit diagonal. */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_cgetrf( &Ak, &Ak, h_A, &lda, ipiv, &info ); for( int j = 0; j < Ak; ++j ) { for( int i = 0; i < j; ++i ) { *h_A(i,j) = *h_A(j,i); } } lapackf77_clarnv( &ione, ISEED, &sizeB, h_B ); memcpy(h_B1, h_B, sizeB*sizeof(magmaFloatComplex)); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( Ak, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); magma_time = magma_sync_wtime( NULL ); magmablas_ctrsm( opts.side, opts.uplo, opts.transA, opts.diag, M, N, alpha, d_A, ldda, d_B, lddb ); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bmagma, ldb ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( M, N, h_B, ldb, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ); cublasCtrsm( handle, cublas_side_const(opts.side), cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), cublas_diag_const(opts.diag), M, N, &alpha, d_A, ldda, d_B, lddb ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( M, N, d_B, lddb, h_Bcublas, ldb ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_ctrsm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha, h_A, &lda, h_B, &ldb ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ // ||b - Ax|| / (||A||*||x||) memcpy(h_X1, h_Bmagma, sizeB*sizeof(magmaFloatComplex)); magmaFloatComplex alpha2 = MAGMA_C_DIV( c_one, alpha ); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X1, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X1, &ione ); float norm1 = lapackf77_clange( "M", &M, &N, h_X1, &ldb, work ); float normx = lapackf77_clange( "M", &M, &N, h_Bmagma, &ldb, work ); float normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); magma_error = norm1/(normx*normA); memcpy(h_X2, h_Bcublas, sizeB*sizeof(magmaFloatComplex)); blasf77_ctrmm( lapack_side_const(opts.side), lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), lapack_diag_const(opts.diag), &M, &N, &alpha2, h_A, &lda, h_X2, &ldb ); blasf77_caxpy( &sizeB, &c_neg_one, h_B1, &ione, h_X2, &ione ); norm1 = lapackf77_clange( "M", &M, &N, h_X2, &ldb, work ); normx = lapackf77_clange( "M", &M, &N, h_Bcublas, &ldb, work ); normA = lapackf77_clange( "M", &Ak, &Ak, h_A, &lda, work ); cublas_error = norm1/(normx*normA); if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } else { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) %8.2e %8.2e %s\n", (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, magma_error, cublas_error, (magma_error < tol && cublas_error < tol? "ok" : "failed")); status += ! (magma_error < tol && cublas_error < tol); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_B1 ); TESTING_FREE_CPU( h_X1 ); TESTING_FREE_CPU( h_X2 ); TESTING_FREE_CPU( h_Bcublas ); TESTING_FREE_CPU( h_Bmagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
double magma_sync_wtime( magma_queue_t queue ) { magma_queue_sync( queue ); return magma_wtime(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A, *h_R; magmaFloatComplex *d_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" M N CPU GFlop/s (ms) GPU GFlop/s (ms) ||PA-LU||/(||A||*N)\n"); printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; if ( N > 512 ) { fprintf( stderr, "cgetf2 does not support N > 512; skipping N=%d.\n", (int) N ); continue; } TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda ); magma_csetmatrix( M, N, h_R, lda, d_A, ldda ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); magma_cgetf2_gpu( M, N, d_A, ldda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetf2_gpu returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time*1000. ); } if ( opts.check ) { magma_cgetmatrix( M, N, d_A, ldda, h_A, lda ); error = get_LU_error( M, N, h_R, lda, h_A, ipiv ); printf(" %8.2e\n", error ); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }