// -------------------- // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm. // See testing_zlanhe.cpp double safe_lapackf77_zlanhe( const char *norm, const char *uplo, const magma_int_t *n, const magmaDoubleComplex *A, const magma_int_t *lda, double *work ) { #ifdef MAGMA_WITH_MKL // work around MKL bug in multi-threaded zlanhe magma_int_t la_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); #endif double result = lapackf77_zlanhe( norm, uplo, n, A, lda, work ); #ifdef MAGMA_WITH_MKL // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); #endif return result; }
// -------------------- // MKL 11.1 has bug in multi-threaded slansy; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm. // See testing_slansy.cpp float safe_lapackf77_slansy( const char *norm, const char *uplo, const magma_int_t *n, const float *A, const magma_int_t *lda, float *work ) { #ifdef MAGMA_WITH_MKL // work around MKL bug in multi-threaded slansy magma_int_t la_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); #endif float result = lapackf77_slansy( norm, uplo, n, A, lda, work ); #ifdef MAGMA_WITH_MKL // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); #endif return result; }
extern "C" magma_int_t magma_zbulge_back_m( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, magmaDoubleComplex *Z, magma_int_t ldz, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t* info) { magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); real_Double_t timeaplQ2=0.0; double f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) // double gpu_cpu_perf = 32; //gpu over cpu performance //#else // double gpu_cpu_perf = 32; // gpu over cpu performance //#endif double perf_temp= .85; double perf_temp2= perf_temp; for (magma_int_t itmp=1; itmp < ngpu; ++itmp) perf_temp2 *= perf_temp; magma_int_t gpu_cpu_perf = magma_get_zbulge_gcperf(); if (threads > 1) { f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf*(1.-perf_temp2)/(1.-perf_temp))); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ //n_gpu = ne; if (n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_zapplyQ_m_data data_applyQ(ngpu, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt); magma_zapplyQ_m_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zapplyQ_m_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_zapplyQ_m_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_zapplyQ_m_parallel_section, &arg[thread]); } arg[0] = magma_zapplyQ_m_id_data(0, &data_applyQ); magma_zapplyQ_m_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); /*============================ * use only GPU *==========================*/ } else { magma_zbulge_applyQ_v2_m(ngpu, MagmaLeft, ne, n, nb, Vblksiz, Z, ldz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_set_lapack_numthreads(mklth); return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slansy */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float *h_A; float *h_work; magmaFloat_ptr d_A; magmaFloat_ptr d_work; magma_int_t N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; float error, norm_magma, norm_lapack; magma_int_t status = 0; bool mkl_warning = false; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_slansy( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_slansy( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL ); if ( norm_magma != -1 ) { printf( "expected magmablas_slansy to return -1 error, but got %f\n", norm_magma ); status = 1; } } } printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL printf( "\nNote: using single thread to work around MKL slansy bug.\n\n" ); #endif printf(" N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error \n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = roundup( N, opts.roundup ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(float) / 1e9; TESTING_MALLOC_CPU( h_A, float, n2 ); TESTING_MALLOC_CPU( h_work, float, N ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); TESTING_MALLOC_DEV( d_work, float, N ); /* Initialize the matrix */ lapackf77_slarnv( &idist, ISEED, &n2, h_A ); magma_ssetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_slansy( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because it isn't supported on this GPU\n", (int) N, lapacke_norm_const( norm[inorm] )); continue; } if (norm_magma < 0) printf("magmablas_slansy returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL // MKL (11.1.2) has bug in multi-threaded slansy; use single thread to work around int threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_slansy( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) printf("lapackf77_slansy returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); #ifdef MAGMA_WITH_MKL // end single thread to work around MKL bug magma_set_lapack_numthreads( threads ); #endif /* ===================================================================== Check the result compared to LAPACK Note: MKL (11.1.0) has bug for uplo=Lower with multiple threads. Try with $MKL_NUM_THREADS = 1. =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; float tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in fabsf. #if defined(PRECISION_s) || defined(PRECISION_d) tol2 = 0; #endif } bool okay = (error <= tol2); printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed") ); status += ! okay; if ( ! okay ) { mkl_warning = true; } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } } // end iuplo, inorm, iter printf( "\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1.0) has a bug in slansy with multiple threads.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("%% ngpu %d, distblk %d, NB %d, nstream %d\n", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default n2 = lda*N; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex_ptr dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { // see src/obsolete and magmablas/obsolete printf( "magma_zhetrd_he2hb_mgpu_spec not compiled\n" ); //magma_zhetrd_he2hb_mgpu_spec( // opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, // da, ldda, dT1, opts.nb, opts.ngpu, distblk, // streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); magma_device_sync(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); magma_set_lapack_numthreads(nt); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads(1); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("%%===========================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gpu_time, gpu_perf, gflops; magmaDoubleComplex *h_A, *h_R, *h_work, *dT1; magmaDoubleComplex *tau; double *D, *E; /* Matrix size */ magma_int_t N, n2, lda, lwork, lwork0; //ldt magma_int_t info; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) magma_int_t WANTZ=0; magma_int_t THREADS=1; #endif #endif magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t ngpu = 1; magma_opts opts; opts.parse_opts( argc, argv ); NB = opts.nb; if (NB < 1) NB = 64; //64; //magma_get_zhetrd_he2hb_nb(N); // what is NE ? if (NE < 1) NE = 64; //N; //magma_get_zhetrd_he2hb_nb(N); // N not yet initialized printf("%% N GPU GFlop/s \n"); printf("%%====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; //ldt = N; n2 = lda*N; gflops = FLOPS_ZHETRD( N ) / 1e9; /* We suppose the magma NB is bigger than lapack NB */ lwork0 = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork0 ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); //TESTING_MALLOC_DEV( dT1, magmaDoubleComplex, (2*min(N,N) + roundup( N, 32 ))*NB ); TESTING_MALLOC_DEV( dT1, magmaDoubleComplex, (N*NB) ); // if (WANTZ) gflops = 2.0*gflops; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_device_t cdev; magma_getdevice( &cdev ); gpu_time = magma_wtime(); /* magma_zhetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork0, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_zhetrd_bhe2trc_v5(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ /* magma_zhetrd_he2hb( opts.uplo, N, NB, h_R, lda, tau, h_work, lwork, dT1, THREADS, &info); tband = magma_wtime - gpu_time(); printf(" Finish BAND N %d NB %d ngpu %d timing= %f\n", N, NB, ngpu, tband); magma_zhetrd_bhe2trc(THREADS, WANTZ, opts.uplo, NE, N, NB, h_R, lda, D, E, dT1, ldt); */ magma_range_t range = MagmaRangeAll; magma_int_t m1 = 0; double vl = 0; double vu = 0; magma_int_t il = 0; magma_int_t iu = 0; if (opts.fraction == 0) { il = max( 1, magma_int_t(0.1*N) ); iu = max( 1, magma_int_t(0.3*N) ); } else { il = 1; iu = max( 1, magma_int_t(opts.fraction*N) ); } magmaDoubleComplex *hh_work; magma_int_t *iwork; magma_int_t /*nb,*/ /*lwork,*/ liwork; magma_int_t threads = magma_get_parallel_numthreads(); #ifdef COMPLEX double *rwork; magma_int_t lrwork; #endif magma_zheevdx_getworksize(N, threads, (opts.jobz == MagmaVec), &lwork, #ifdef COMPLEX &lrwork, #endif &liwork); TESTING_MALLOC_PIN( hh_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_CPU( iwork, magma_int_t, liwork ); #ifdef COMPLEX TESTING_MALLOC_PIN( rwork, double, lrwork ); #endif if (ngpu == 1) { printf("calling zheevdx_2stage 1 GPU\n"); magma_zheevdx_2stage( opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); } else { printf("calling zheevdx_2stage_m %d GPU\n", (int) ngpu); magma_zheevdx_2stage_m(ngpu, opts.jobz, range, opts.uplo, N, h_R, lda, vl, vu, il, iu, &m1, D, hh_work, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info); } magma_setdevice( cdev ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; /* ===================================================================== Check the factorization =================================================================== */ /* if ( opts.check ) { FILE *fp; printf("Writing input matrix in matlab_i_mat.txt ...\n"); fp = fopen ("matlab_i_mat.txt", "w"); if ( fp == NULL ) { printf("Couldn't open output file\n"); return -1; } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #ifdef COMPLEX fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_A[k+j*lda].x, h_A[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_A[k+j*lda]); #endif } } fclose( fp ); printf("Writing output matrix in matlab_o_mat.txt ...\n"); fp = fopen ("matlab_o_mat.txt", "w"); if ( fp == NULL ) { printf("Couldn't open output file\n"); return -1; } for (j=0; j < N; j++) { for (k=0; k < N; k++) { #ifdef COMPLEX fprintf(fp, "%5d %5d %11.8f %11.8f\n", k+1, j+1, h_R[k+j*lda].x, h_R[k+j*lda].y); #else fprintf(fp, "%5d %5d %11.8f\n", k+1, j+1, h_R[k+j*lda]); #endif } } fclose( fp ); } */ /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); const char* JOBZ; if (WANTZ == 0) JOBZ = MagmaNoVecStr; else JOBZ = MagmaVecStr; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ cpu_time = magma_wtime(); i= min(12, THREADS); magma_set_lapack_numthreads( i ); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); //// cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, i); /* for (i=0; i < 10; i++) printf(" voici lpk D[%d] %8.2e\n", i, D2[i]); */ //magmaDoubleComplex mydz=0.0, mydo=1.0; //magmaDoubleComplex *Z; // magma_zmalloc_cpu( &Z, N*lda ); // dgemm_("N", "N", &N, &N, &N, &mydo, h_R, &lda, h_A, &lda, &mydz, Z, &lda); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads( 1 ); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d gflops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); printf("%%===========================================================================\n\n\n"); /* Memory clean up */ TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); TESTING_FREE_DEV( dT1 ); /* TODO - not all memory has been freed inside loop */ fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return EXIT_SUCCESS; }
//################################################################################################## static void *magma_dapplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_dapplyQ_id_data*)arg) -> id; magma_dapplyQ_data* data = ((magma_dapplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; double *E = data -> E; magma_int_t lde = data -> lde; double *V = data -> V; magma_int_t ldv = data -> ldv; double *TAU = data -> TAU; double *T = data -> T; magma_int_t ldt = data -> ldt; double *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif cpu_set_t old_set, new_set; //store current affinity CPU_ZERO(&old_set); sched_getaffinity( 0, sizeof(old_set), &old_set); //set new affinity // bind threads CPU_ZERO(&new_set); CPU_SET(my_core_id, &new_set); sched_setaffinity( 0, sizeof(new_set), &new_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_dsetmatrix(n, n_gpu, E, lde, dE, ldde); magma_dbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); double* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_dtile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY //restore old affinity sched_setaffinity(0, sizeof(old_set), &old_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
magma_int_t magma_ztrevc3_mt( magma_side_t side, magma_vec_t howmany, magma_int_t *select, // logical in Fortran magma_int_t n, magmaDoubleComplex *T, magma_int_t ldt, magmaDoubleComplex *VL, magma_int_t ldvl, magmaDoubleComplex *VR, magma_int_t ldvr, magma_int_t mm, magma_int_t *mout, magmaDoubleComplex *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, #endif magma_int_t *info ) { #define T(i,j) ( T + (i) + (j)*ldt ) #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) #define work(i,j) (work + (i) + (j)*n) // .. Parameters .. const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magma_int_t nbmin = 16, nbmax = 128; const magma_int_t ione = 1; // .. Local Scalars .. magma_int_t allv, bothv, leftv, over, rightv, somev; magma_int_t i, ii, is, j, k, ki, iv, n2, nb, nb2, version; double ovfl, remax, unfl; //smlnum, smin, ulp // Decode and test the input parameters bothv = (side == MagmaBothSides); rightv = (side == MagmaRight) || bothv; leftv = (side == MagmaLeft ) || bothv; allv = (howmany == MagmaAllVec); over = (howmany == MagmaBacktransVec); somev = (howmany == MagmaSomeVec); // Set mout to the number of columns required to store the selected // eigenvectors. if ( somev ) { *mout = 0; for( j=0; j < n; ++j ) { if ( select[j] ) { *mout += 1; } } } else { *mout = n; } *info = 0; if ( ! rightv && ! leftv ) *info = -1; else if ( ! allv && ! over && ! somev ) *info = -2; else if ( n < 0 ) *info = -4; else if ( ldt < max( 1, n ) ) *info = -6; else if ( ldvl < 1 || ( leftv && ldvl < n ) ) *info = -8; else if ( ldvr < 1 || ( rightv && ldvr < n ) ) *info = -10; else if ( mm < *mout ) *info = -11; else if ( lwork < max( 1, 2*n ) ) *info = -14; if ( *info != 0 ) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if possible. if ( n == 0 ) { return *info; } // Use blocked version (2) if sufficient workspace. // Requires 1 vector to save diagonal elements, and 2*nb vectors for x and Q*x. // (Compared to dtrevc3, rwork stores 1-norms.) // Zero-out the workspace to avoid potential NaN propagation. nb = 2; if ( lwork >= n + 2*n*nbmin ) { version = 2; nb = (lwork - n) / (2*n); nb = min( nb, nbmax ); nb2 = 1 + 2*nb; lapackf77_zlaset( "F", &n, &nb2, &c_zero, &c_zero, work, &n ); } else { version = 1; } // Set the constants to control overflow. unfl = lapackf77_dlamch( "Safe minimum" ); ovfl = 1. / unfl; lapackf77_dlabad( &unfl, &ovfl ); //ulp = lapackf77_dlamch( "Precision" ); //smlnum = unfl*( n / ulp ); // Store the diagonal elements of T in working array work. for( i=0; i < n; ++i ) { *work(i,0) = *T(i,i); } // Compute 1-norm of each column of strictly upper triangular // part of T to control overflow in triangular solver. rwork[0] = 0.; for( j=1; j < n; ++j ) { rwork[j] = magma_cblas_dzasum( j, T(0,j), ione ); } // launch threads -- each single-threaded MKL magma_int_t nthread = magma_get_parallel_numthreads(); magma_int_t lapack_nthread = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); magma_thread_queue queue; queue.launch( nthread ); //printf( "nthread %d, %d\n", nthread, lapack_nthread ); // gemm_nb = N/thread, rounded up to multiple of 16, // but avoid multiples of page size, e.g., 512*8 bytes = 4096. magma_int_t gemm_nb = magma_int_t( ceil( ceil( ((double)n) / nthread ) / 16. ) * 16. ); if ( gemm_nb % 512 == 0 ) { gemm_nb += 32; } magma_timer_t time_total=0, time_trsv=0, time_gemm=0, time_gemv=0, time_trsv_sum=0, time_gemm_sum=0, time_gemv_sum=0; timer_start( time_total ); if ( rightv ) { // ============================================================ // Compute right eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=nb, goes down to 1. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; if ( version == 2 ) { iv = nb; } timer_start( time_trsv ); is = *mout - 1; for( ki=n-1; ki >= 0; --ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } //smin = max( ulp*MAGMA_Z_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex right eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k=0; k < ki; ++k ) { *work(k,iv) = -(*T(k,ki)); } // Solve upper triangular system: // [ T(1:ki-1,1:ki-1) - T(ki,ki) ]*X = scale*work. if ( ki > 0 ) { queue.push_task( new magma_zlatrsd_task( MagmaUpper, MagmaNoTrans, MagmaNonUnit, MagmaTrue, ki, T, ldt, *T(ki,ki), work(0,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VR and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VR and normalize queue.sync(); n2 = ki+1; blasf77_zcopy( &n2, work(0,iv), &ione, VR(0,is), &ione ); ii = blasf77_izamax( &n2, VR(0,is), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VR(ii,is) ); blasf77_zdscal( &n2, &remax, VR(0,is), &ione ); for( k=ki+1; k < n; ++k ) { *VR(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemv ); if ( ki > 0 ) { blasf77_zgemv( "n", &n, &ki, &c_one, VR, &ldvr, work(0, iv), &ione, work(ki,iv), VR(0,ki), &ione ); } time_gemv_sum += timer_stop( time_gemv ); ii = blasf77_izamax( &n, VR(0,ki), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VR(ii,ki) ); blasf77_zdscal( &n, &remax, VR(0,ki), &ione ); timer_start( time_trsv ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out below vector for( k=ki+1; k < n; ++k ) { *work(k,iv) = c_zero; } // Columns iv:nb of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == 1) || (ki == 0) ) { queue.sync(); time_trsv_sum += timer_stop( time_trsv ); timer_start( time_gemm ); nb2 = nb-iv+1; n2 = ki+nb-iv+1; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += gemm_nb ) { magma_int_t ib = min( gemm_nb, n-i ); queue.push_task( new zgemm_task( MagmaNoTrans, MagmaNoTrans, ib, nb2, n2, c_one, VR(i,0), ldvr, work(0,iv ), n, c_zero, work(i,nb+iv), n )); } queue.sync(); time_gemm_sum += timer_stop( time_gemm ); // normalize vectors // TODO if somev, should copy vectors individually to correct location. for( k = iv; k <= nb; ++k ) { ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) ); blasf77_zdscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_zlacpy( "F", &n, &nb2, work(0,nb+iv), &n, VR(0,ki), &ldvr ); iv = nb; timer_start( time_trsv ); } else { iv -= 1; } } // blocked back-transform is -= 1; } } timer_stop( time_trsv ); timer_stop( time_total ); timer_printf( "trevc trsv %.4f, gemm %.4f, gemv %.4f, total %.4f\n", time_trsv_sum, time_gemm_sum, time_gemv_sum, time_total ); if ( leftv ) { // ============================================================ // Compute left eigenvectors. // iv is index of column in current block. // Non-blocked version always uses iv=1; // blocked version starts with iv=1, goes up to nb. // (Note the "0-th" column is used to store the original diagonal.) iv = 1; is = 0; for( ki=0; ki < n; ++ki ) { if ( somev ) { if ( ! select[ki] ) { continue; } } //smin = max( ulp*MAGMA_Z_ABS1( *T(ki,ki) ), smlnum ); // -------------------------------------------------------- // Complex left eigenvector *work(ki,iv) = c_one; // Form right-hand side. for( k = ki + 1; k < n; ++k ) { *work(k,iv) = -MAGMA_Z_CONJ( *T(ki,k) ); } // Solve conjugate-transposed triangular system: // [ T(ki+1:n,ki+1:n) - T(ki,ki) ]**H * X = scale*work. // TODO what happens with T(k,k) - lambda is small? Used to have < smin test. if ( ki < n-1 ) { n2 = n-ki-1; queue.push_task( new magma_zlatrsd_task( MagmaUpper, MagmaConjTrans, MagmaNonUnit, MagmaTrue, n2, T(ki+1,ki+1), ldt, *T(ki,ki), work(ki+1,iv), work(ki,iv), rwork )); } // Copy the vector x or Q*x to VL and normalize. if ( ! over ) { // ------------------------------ // no back-transform: copy x to VL and normalize queue.sync(); n2 = n-ki; blasf77_zcopy( &n2, work(ki,iv), &ione, VL(ki,is), &ione ); ii = blasf77_izamax( &n2, VL(ki,is), &ione ) + ki - 1; remax = 1. / MAGMA_Z_ABS1( *VL(ii,is) ); blasf77_zdscal( &n2, &remax, VL(ki,is), &ione ); for( k=0; k < ki; ++k ) { *VL(k,is) = c_zero; } } else if ( version == 1 ) { // ------------------------------ // version 1: back-transform each vector with GEMV, Q*x. queue.sync(); if ( ki < n-1 ) { n2 = n-ki-1; blasf77_zgemv( "n", &n, &n2, &c_one, VL(0,ki+1), &ldvl, work(ki+1,iv), &ione, work(ki, iv), VL(0,ki), &ione ); } ii = blasf77_izamax( &n, VL(0,ki), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *VL(ii,ki) ); blasf77_zdscal( &n, &remax, VL(0,ki), &ione ); } else if ( version == 2 ) { // ------------------------------ // version 2: back-transform block of vectors with GEMM // zero out above vector // could go from (ki+1)-NV+1 to ki for( k=0; k < ki; ++k ) { *work(k,iv) = c_zero; } // Columns 1:iv of work are valid vectors. // When the number of vectors stored reaches nb, // or if this was last vector, do the GEMM if ( (iv == nb) || (ki == n-1) ) { queue.sync(); n2 = n-(ki+1)+iv; // split gemm into multiple tasks, each doing one block row for( i=0; i < n; i += gemm_nb ) { magma_int_t ib = min( gemm_nb, n-i ); queue.push_task( new zgemm_task( MagmaNoTrans, MagmaNoTrans, ib, iv, n2, c_one, VL(i,ki-iv+1), ldvl, work(ki-iv+1,1), n, c_zero, work(i,nb+1), n )); } queue.sync(); // normalize vectors for( k=1; k <= iv; ++k ) { ii = blasf77_izamax( &n, work(0,nb+k), &ione ) - 1; remax = 1. / MAGMA_Z_ABS1( *work(ii,nb+k) ); blasf77_zdscal( &n, &remax, work(0,nb+k), &ione ); } lapackf77_zlacpy( "F", &n, &iv, work(0,nb+1), &n, VL(0,ki-iv+1), &ldvl ); iv = 1; } else { iv += 1; } } // blocked back-transform is += 1; } } // close down threads queue.quit(); magma_set_lapack_numthreads( lapack_nthread ); return *info; } // End of ZTREVC
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, double *dT, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define dA(a_1,a_2) (dA + ((a_2)-1)*(ldda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_1) (dT + ((a_1)-1)*(lddt)) int ldda = ((n+31)/32)*32; int lddt = nb; double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, indj_old=0; int i; int lwkopt; int lquery; *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); double *dA; if (MAGMA_SUCCESS != magma_dmalloc( &dA, (n + 2*nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); /* Use the first panel of dA as work space */ double *dwork = dA + n*ldda; double *dW = dwork + nb*ldda; #ifdef TRACING char buf[80]; #endif magma_queue_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); stream[2] = 0; // default stream trace_init( 1, 1, 3, stream ); double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); magmablasSetKernelStream( stream[0] ); magma_event_t Pupdate_event; cudaEventCreateWithFlags(&Pupdate_event,cudaEventDisableTiming); //magma_event_create(&Pupdate_event); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Copy the matrix to the GPU */ if (1 <= n-nb) { trace_gpu_start( 0, 0, "set", "set A" ); magma_dsetmatrix_async( (n-nb), (n-nb), A(nb+1, nb+1), lda, dA(nb+1, nb+1), ldda, stream[0] ); trace_gpu_end( 0, 0 ); } /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); trace_gpu_start( 0, 1, "get", "get panel" ); //magma_queue_sync( stream[0] ); magma_queue_wait_event(stream[1], Pupdate_event); //, 0); magma_dgetmatrix_async( (pm+pn), pn, dA( i, i), ldda, A ( i, i), lda, stream[1] ); trace_gpu_end( 0, 1 ); trace_gpu_start( 0, 2, "her2k", "her2k" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dA(indi_old+pn_old, indj_old), ldda, dW + pn_old, pm_old, d_one, dA(indi_old+pn_old, indi_old+pn_old), ldda); trace_gpu_end( 0, 2 ); trace_cpu_start( 0, "sync", "sync on 1" ); magma_queue_sync( stream[1] ); trace_cpu_end( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ #ifdef TRACING snprintf( buf, sizeof(buf), "panel %d", i ); #endif trace_cpu_start( 0, "geqrf", buf ); lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); trace_cpu_end( 0 ); /* Send V from the CPU to the GPU */ trace_gpu_start( 0, 0, "set", "set V and T" ); magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dA(indi, indj), ldda, stream[0] ); /* Send the triangular factor T to the GPU */ magma_dsetmatrix_async( pk, pk, hT, nb, dT(i), lddt, stream[0] ); trace_gpu_end( 0, 0 ); /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ /* dwork = V T */ trace_cpu_start( 0, "sync", "sync on 0" ); // this sync is done here to be sure that the copy has been finished // because below we made a restore dq_to_panel and this restore need // to ensure that the copy has been finished. we did it here to allow // overlapp of restore with next gemm and symm. magma_queue_sync( stream[0] ); trace_cpu_end( 0 ); trace_gpu_start( 0, 2, "gemm", "work = V*T" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dA(indi, indj), ldda, dT(i), lddt, c_zero, dwork, pm); trace_gpu_end( 0, 2 ); /* dW = X = A*V*T. dW = A*dwork */ trace_gpu_start( 0, 2, "hemm", "X = A*work" ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dA(indi, indi), ldda, dwork, pm, c_zero, dW, pm); trace_gpu_end( 0, 2 ); /* restore the panel */ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ trace_gpu_start( 0, 2, "gemm", "work = T'*V'*X" ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork, pm, dW, pm, c_zero, dwork + pm*nb, nb); trace_gpu_end( 0, 2 ); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ trace_gpu_start( 0, 2, "gemm", "W = X - 0.5*V*(T'*V'*X)" ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dA(indi, indj), ldda, dwork + pm*nb, nb, c_one, dW, pm); trace_gpu_end( 0, 2 ); /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ trace_gpu_start( 0, 2, "gemm", "gemm 4 next panel left" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dA(indi, indj), ldda, dW, pm, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); trace_gpu_start( 0, 2, "gemm", "gemm 5 next panel right" ); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dW, pm, dA(indi, indj), ldda, c_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); magma_event_record(Pupdate_event, stream[0]); } else { /* no look-ahead as this is last iteration */ trace_gpu_start( 0, 2, "her2k", "her2k last iteration" ); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dA(indi, indj), ldda, dW, pm, d_one, dA(indi, indi), ldda); trace_gpu_end( 0, 2 ); } indi_old = indi; indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) /* Send the last block to the CPU */ pk = min(pm,pn); if (1 <= n-nb) { dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); trace_gpu_start( 0, 2, "get", "get last block" ); magma_dgetmatrix( pk, pk, dA(n-pk+1, n-pk+1), ldda, A(n-pk+1, n-pk+1), lda ); trace_gpu_end( 0, 2 ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } }// end of LOWER trace_finalize( "dsytrd_sy2sb.svg", "trace.css" ); magma_event_destroy( Pupdate_event ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); return *info; } /* magma_dsytrd_sy2sb */
static void *magma_dsytrd_sb2st_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_dbulge_id_data*)arg) -> id; magma_dbulge_data* data = ((magma_dbulge_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t nb = data -> nb; magma_int_t nbtiles = data -> nbtiles; magma_int_t grsiz = data -> grsiz; magma_int_t Vblksiz = data -> Vblksiz; magma_int_t wantz = data -> wantz; double *A = data -> A; magma_int_t lda = data -> lda; double *V = data -> V; magma_int_t ldv = data -> ldv; double *TAU = data -> TAU; double *T = data -> T; magma_int_t ldt = data -> ldt; volatile magma_int_t* prog = data -> prog; pthread_barrier_t* myptbarrier = &(data -> myptbarrier); //magma_int_t sys_corenbr = 1; #ifdef ENABLE_TIMER real_Double_t timeB=0.0, timeT=0.0; #endif // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. //magma_set_omp_numthreads(1); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(1); /* #ifndef MAGMA_NOAFFINITY // bind threads cpu_set_t set; // bind threads CPU_ZERO( &set ); CPU_SET( my_core_id, &set ); sched_setaffinity( 0, sizeof(set), &set); #endif magma_set_lapack_numthreads(1); magma_set_omp_numthreads(1); */ #ifndef MAGMA_NOAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); magma_int_t check = 0; magma_int_t check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif /* compute the Q1 overlapped with the bulge chasing+T. * if all_cores_num=1 it call Q1 on GPU and then bulgechasing. * otherwise the first thread run Q1 on GPU and * the other threads run the bulgechasing. * */ //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if (my_core_id == 0) timeB = magma_wtime(); #endif magma_dtile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, wantz, prog, myptbarrier); if (allcores_num > 1) pthread_barrier_wait(myptbarrier); #ifdef ENABLE_TIMER if (my_core_id == 0) { timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f\n", timeB); } #endif //========================= // compute the T's to be used when applying Q2 //========================= if ( wantz > 0 ) { #ifdef ENABLE_TIMER if (my_core_id == 0) timeT = magma_wtime(); #endif magma_dtile_bulge_computeT_parallel(my_core_id, allcores_num, V, ldv, TAU, T, ldt, n, nb, Vblksiz); if (allcores_num > 1) pthread_barrier_wait(myptbarrier); #ifdef ENABLE_TIMER if (my_core_id == 0) { timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f\n", timeT); } #endif } #ifndef MAGMA_NOAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
/** Purpose ------- Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A is stored; - = MagmaLower: Lower triangles of A is stored. @param[in] n INTEGER The order of the matrix A. n >= 0. @param[in] nb INTEGER The order of the band matrix A. n >= nb >= 0. @param[in] Vblksiz INTEGER The size of the block of householder vectors applied at once. @param[in] A (workspace) DOUBLE PRECISION array, dimension (lda, n) On entry the band matrix stored in the following way: @param[in] lda INTEGER The leading dimension of the array A. lda >= 2*nb. @param[out] d DOUBLE array, dimension (n) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e DOUBLE array, dimension (n-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] V DOUBLE PRECISION array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. @param[in] ldv INTEGER The leading dimension of V. LDV > nb + VBLKSIZ + 1 @param[out] TAU DOUBLE PRECISION dimension(BLKCNT, VBLKSIZ) ??? @param[in] wantz INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed @param[out] T DOUBLE PRECISION dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced @param[in] ldt INTEGER The leading dimension of T. LDT > Vblksiz @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sb2st( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, double *A, magma_int_t lda, double *d, double *e, double *V, magma_int_t ldv, double *TAU, magma_int_t wantz, double *T, magma_int_t ldt) { #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif magma_int_t parallel_threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_int_t ompth = magma_get_omp_numthreads(); //magma_set_omp_numthreads(1); //magma_set_lapack_numthreads(1); magma_int_t blkcnt, sizTAU2, sizT2, sizV2; magma_dbulge_getstg2size(n, nb, wantz, Vblksiz, ldv, ldt, &blkcnt, &sizTAU2, &sizT2, &sizV2); memset(T, 0, sizT2*sizeof(double)); memset(TAU, 0, sizTAU2*sizeof(double)); memset(V, 0, sizV2*sizeof(double)); magma_int_t INgrsiz=1; magma_int_t nbtiles = magma_ceildiv(n, nb); volatile magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t)); memset((void *) prog, 0, (2*nbtiles+parallel_threads+10)*sizeof(magma_int_t)); magma_dbulge_id_data* arg; magma_malloc_cpu((void**) &arg, parallel_threads*sizeof(magma_dbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, parallel_threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_dbulge_data data_bulge; magma_dbulge_data_init(&data_bulge, parallel_threads, n, nb, nbtiles, INgrsiz, Vblksiz, wantz, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(parallel_threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < parallel_threads; thread++) { magma_dbulge_id_data_init(&(arg[thread]), thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_dsytrd_sb2st_parallel_section, &arg[thread]); } magma_dbulge_id_data_init(&(arg[0]), 0, &data_bulge); magma_dsytrd_sb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < parallel_threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f\n", timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu((void *) prog); magma_dbulge_data_destroy(&data_bulge); magma_set_omp_numthreads(ompth); magma_set_lapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag d and e * note that d and e are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in d and e */ /* In real case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to e. * When using HouseHolder elimination, * the DLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #ifdef COMPLEX if (uplo == MagmaLower) { for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_D_REAL( A[i*lda ] ); e[i] = MAGMA_D_REAL( A[i*lda+1] ); } d[n-1] = MAGMA_D_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_D_REAL( A[i*lda+nb] ); e[i] = MAGMA_D_REAL( A[i*lda+nb-1] ); } d[n-1] = MAGMA_D_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if ( uplo == MagmaLower ) { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda]; // diag e[i] = A[i*lda+1]; // lower diag } d[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda+nb]; // diag e[i] = A[i*lda+nb-1]; // lower diag } d[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
void test_num_threads() { printf( "%%=====================================================================\n%s\n", __func__ ); // test that getting & setting numthreads works magma_int_t p_nthread_orig = magma_get_parallel_numthreads(); magma_int_t l_nthread_orig = magma_get_lapack_numthreads(); printf( "get; parallel_numthread=%2d, lapack_numthread=%2d\n", p_nthread_orig, l_nthread_orig ); magma_set_lapack_numthreads( 4 ); magma_int_t p_nthread = magma_get_parallel_numthreads(); magma_int_t l_nthread = magma_get_lapack_numthreads(); printf( "set( 4); parallel_numthread=%2d, lapack_numthread=%2d (expect 4)\n", p_nthread, l_nthread ); warn( p_nthread == p_nthread_orig ); warn( l_nthread == 4 ); magma_set_lapack_numthreads( 1 ); p_nthread = magma_get_parallel_numthreads(); l_nthread = magma_get_lapack_numthreads(); printf( "set( 1); parallel_numthread=%2d, lapack_numthread=%2d (expect 1)\n", p_nthread, l_nthread ); warn( p_nthread == p_nthread_orig ); warn( l_nthread == 1 ); magma_set_lapack_numthreads( 8 ); p_nthread = magma_get_parallel_numthreads(); l_nthread = magma_get_lapack_numthreads(); printf( "set( 8); parallel_numthread=%2d, lapack_numthread=%2d (expect 8)\n", p_nthread, l_nthread ); warn( p_nthread == p_nthread_orig ); warn( l_nthread == 8 ); magma_set_lapack_numthreads( l_nthread_orig ); p_nthread = magma_get_parallel_numthreads(); l_nthread = magma_get_lapack_numthreads(); printf( "set(%2d); parallel_numthread=%2d, lapack_numthread=%2d (expect %2d)\n", l_nthread_orig, p_nthread, l_nthread, l_nthread_orig ); warn( p_nthread == p_nthread_orig ); warn( l_nthread == l_nthread_orig ); #ifndef _MSC_VER // not Windows // test that parsing MAGMA_NUM_THREADS works // TODO need some way to get ncores. This is circular: assume with huge // NUM_THREADS that the routine gives the ncores. The user can verify. setenv("MAGMA_NUM_THREADS", "10000", 1 ); magma_int_t ncores = magma_get_parallel_numthreads(); magma_int_t omp_threads = ncores; const char* omp_str = getenv("OMP_NUM_THREADS"); if ( omp_str != NULL ) { omp_threads = atoi( omp_str ); } printf( "\nusing ncores=%d, omp_num_threads=%d\n\n", ncores, omp_threads ); printf( "$MAGMA_NUM_THREADS nthread expect\n" ); printf( "%%==================================\n" ); unsetenv("MAGMA_NUM_THREADS"); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d (omp_threads)\n\n", "not set", p_nthread, omp_threads ); warn( p_nthread == omp_threads ); setenv("MAGMA_NUM_THREADS", "", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 1 ); warn( p_nthread == 1 ); setenv("MAGMA_NUM_THREADS", "-1", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 1 ); warn( p_nthread == 1 ); setenv("MAGMA_NUM_THREADS", "2junk", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 1 ); warn( p_nthread == 1 ); setenv("MAGMA_NUM_THREADS", "0", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 1 ); warn( p_nthread == 1 ); setenv("MAGMA_NUM_THREADS", "1", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 1 ); warn( p_nthread == 1 ); setenv("MAGMA_NUM_THREADS", "2", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 2 ); warn( p_nthread == min( 2, ncores ) ); setenv("MAGMA_NUM_THREADS", "4", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 4 ); warn( p_nthread == min( 4, ncores ) ); setenv("MAGMA_NUM_THREADS", "8", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 8 ); warn( p_nthread == min( 8, ncores ) ); setenv("MAGMA_NUM_THREADS", "16", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, 16 ); warn( p_nthread == min( 16, ncores ) ); setenv("MAGMA_NUM_THREADS", "1000", 1 ); p_nthread = magma_get_parallel_numthreads(); printf( "%-18s %7d %6d (ncores)\n\n", getenv("MAGMA_NUM_THREADS"), p_nthread, ncores ); warn( p_nthread == min( 1000, ncores ) ); #endif // not Windows }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cherk */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, cublas_perf, cublas_time, cpu_perf, cpu_time; float cublas_error, Cnorm, work[1]; magma_int_t N, K; magma_int_t Ak, An; magma_int_t sizeA, sizeC; magma_int_t lda, ldc, ldda, lddc; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaFloatComplex *h_A, *h_C, *h_Ccublas; magmaFloatComplex_ptr d_A, d_C; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float alpha = MAGMA_D_MAKE( 0.29, -0.86 ); float beta = MAGMA_D_MAKE( -0.48, 0.38 ); magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); opts.lapack |= opts.check; // check (-c) implies lapack (-l) float tol = opts.tolerance * lapackf77_slamch("E"); printf("If running lapack (option --lapack), CUBLAS error is computed\n" "relative to CPU BLAS result.\n\n"); printf("uplo = %s, transA = %s\n", lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA) ); printf(" N K CUBLAS Gflop/s (ms) CPU Gflop/s (ms) CUBLAS error\n"); printf("==================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; K = opts.ksize[itest]; gflops = FLOPS_CHERK(K, N) / 1e9; if ( opts.transA == MagmaNoTrans ) { lda = An = N; Ak = K; } else { lda = An = K; Ak = N; } ldc = N; ldda = ((lda+31)/32)*32; lddc = ((ldc+31)/32)*32; sizeA = lda*Ak; sizeC = ldc*N; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*Ak ); TESTING_MALLOC_CPU( h_C, magmaFloatComplex, ldc*N ); TESTING_MALLOC_CPU( h_Ccublas, magmaFloatComplex, ldc*N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*Ak ); TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N ); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeC, h_C ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( An, Ak, h_A, lda, d_A, ldda ); magma_csetmatrix( N, N, h_C, ldc, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ); cublasCherk( opts.handle, cublas_uplo_const(opts.uplo), cublas_trans_const(opts.transA), N, K, &alpha, d_A, ldda, &beta, d_C, lddc ); cublas_time = magma_sync_wtime( NULL ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetmatrix( N, N, d_C, lddc, h_Ccublas, ldc ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); blasf77_cherk( lapack_uplo_const(opts.uplo), lapack_trans_const(opts.transA), &N, &K, &alpha, h_A, &lda, &beta, h_C, &ldc ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { #ifdef MAGMA_WITH_MKL // MKL (11.1.2) has bug in multi-threaded clanhe; use single thread to work around int threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( 1 ); #endif // compute relative error for both magma & cublas, relative to lapack, // |C_magma - C_lapack| / |C_lapack| Cnorm = lapackf77_clanhe("fro", lapack_uplo_const(opts.uplo), &N, h_C, &ldc, work); blasf77_caxpy( &sizeC, &c_neg_one, h_C, &ione, h_Ccublas, &ione ); cublas_error = lapackf77_clanhe( "fro", lapack_uplo_const(opts.uplo), &N, h_Ccublas, &ldc, work ) / Cnorm; printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, cublas_error, (cublas_error < tol ? "ok" : "failed")); status += ! (cublas_error < tol); #ifdef MAGMA_WITH_MKL // end single thread to work around MKL bug magma_set_lapack_numthreads( threads ); #endif } else { printf("%5d %5d %7.2f (%7.2f) --- ( --- ) --- ---\n", (int) N, (int) K, cublas_perf, 1000.*cublas_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_C ); TESTING_FREE_CPU( h_Ccublas ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_C ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- DGEQRF computes a QR factorization of a DOUBLE PRECISION M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau DOUBLE PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_dgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_dgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_dgeqrf( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magma_int_t *info ) { #define A(i_,j_) (A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dA, ((i_) + (j_)*ldda + dA_offset) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dwork(i_) dwork, ((i_) + dwork_offset) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif /* Constants */ const double c_one = MAGMA_D_ONE; /* Local variables */ magmaDouble_ptr dA, dT, dwork; magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib; /* Function Body */ *info = 0; magma_int_t nb = magma_get_dgeqrf_nb( m, n ); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max( n*nb, 2*nb*nb ); work[0] = magma_dmake_lwork( lwkopt ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min( m, n ); if (min_mn == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = magma_roundup( n, 32 ) - nb; ldda = magma_roundup( m, 32 ); magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multiple-GPU interface */ return magma_dgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info ); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* alloc failed so call non-GPU-resident version */ return magma_dgeqrf_ooc( m, n, A, lda, tau, work, lwork, info ); } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); //used for timing CPU and GPU int iter = 0; float cpu_time = 0.0; float gpu_time = 0.0; int affinity = map_cpu(0); if(affinity != 0) { printf("affinity failed\n"); return -1; } magma_set_lapack_numthreads(1); // for initial setting, better to be automatic in the future // SetGPUFreq(324, 324); // system("echo 1200000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed"); SetGPUFreq(2600, 705); system("echo 2500000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_setspeed"); double gpu_iter1_low = 2096.544434; double gpu_iter1_high = 478.825226; double cpu_iter1_low = 1792.011230; double cpu_iter1_high = 1413.732788; double gpu_pred_high = gpu_iter1_high; double gpu_pred_low = gpu_iter1_low; double cpu_pred_high = cpu_iter1_high; double cpu_pred_low = cpu_iter1_low; double ratio_split_freq = 0; double time_until_interrupt = 0; cudaEvent_t start_cpu, stop_cpu; cudaEvent_t start_gpu, stop_gpu; // switches for different modes bool timing = false; //for initial setting only, greatly impact performance bool dvfs = false; //turn on dvfs energy saving bool relax = false; //turn on relax scheme bool r2h = false; // turn on race to halt //these parameters need to be tuned in future works. double dvfs_converage = 0.5; double prediction_offset_gpu = 0.65; double prediction_offset_cpu = 0.65; //for nvprof profiler, brings slight constant performance overhead //cudaProfilerStart(); if ( (nb > 1) && (nb < min_mn) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_dsetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, queues[0] ); old_i = 0; old_ib = nb; for (i = 0; i < min_mn-nb; i += nb) { ib = min( min_mn-i, nb ); if (i > 0) { /* get i-th panel from device */ magma_queue_sync( queues[1] ); magma_dgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, queues[0] ); if (timing) { //start gpu timing cudaEventCreate(&start_gpu); cudaEventCreate(&stop_gpu); cudaEventRecord(start_gpu, 0); } /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT(0,0), nb, dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] ); double ratio_slack_pred = 1.0 - (double)nb/(m-iter*nb); cpu_pred_high = cpu_pred_high * ratio_slack_pred; cpu_pred_low = cpu_pred_low * ratio_slack_pred; gpu_pred_high = gpu_pred_high * ratio_slack_pred * ratio_slack_pred; gpu_pred_low = gpu_pred_low * ratio_slack_pred * ratio_slack_pred; if (timing) { printf("iter:%d GPU time pred:%f\n", iter, gpu_pred_high); printf("iter:%d CPU time pred:%f\n", iter, cpu_pred_high); } if (iter < dvfs_converage*(min_mn-nb)/nb) { if (cpu_pred_high > gpu_pred_high) { //slack on GPU ratio_split_freq = (cpu_pred_high - gpu_pred_high) / (gpu_pred_high * ((gpu_iter1_low / gpu_iter1_high) - 1)); time_until_interrupt = gpu_pred_low * ratio_split_freq; //printf("iter:%d time_until_interrupt:%f\n", iter, time_until_interrupt); // printf("iter:%d ratio_split_freq:%f\n", iter, ratio_split_freq); if (dvfs) { if ((!relax) || (relax && ratio_split_freq > 0.05)) { if (ratio_split_freq < 1) dvfs_adjust(time_until_interrupt*prediction_offset_gpu, 'g'); else dvfs_adjust(cpu_pred_high, 'g'); } } else if (r2h) { r2h_adjust(gpu_pred_high, cpu_pred_high - gpu_pred_high, 'g'); } } else { //slack on CPU ratio_split_freq = (gpu_pred_high - cpu_pred_high) / (cpu_pred_high * ((cpu_iter1_low / cpu_iter1_high) - 1)); time_until_interrupt = cpu_pred_low * ratio_split_freq; if (dvfs) { if ((!relax) || (relax && ratio_split_freq > 0.05)) { if (ratio_split_freq < 1) dvfs_adjust(time_until_interrupt*prediction_offset_cpu, 'c'); else dvfs_adjust(gpu_pred_high, 'c'); } } else if (r2h) { r2h_adjust(cpu_pred_high, gpu_pred_high - cpu_pred_high, 'c'); } } } if (timing) { //end gpu timing cudaEventRecord(stop_gpu, 0); cudaEventSynchronize(stop_gpu); cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu); cudaEventDestroy(start_gpu); cudaEventDestroy(stop_gpu); printf("iter:%d GPU time:%f\n", iter, gpu_time); } magma_dgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); magma_queue_sync( queues[0] ); } magma_int_t rows = m-i; if (timing) { //start cpu timing cudaEventCreate(&start_cpu); cudaEventCreate(&stop_cpu); cudaEventRecord(start_cpu, 0); } lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib ); if (timing) { //end cpu timing cudaEventRecord(stop_cpu, 0); cudaEventSynchronize(stop_cpu); cudaEventElapsedTime(&cpu_time, start_cpu, stop_cpu); cudaEventDestroy(start_cpu); cudaEventDestroy(stop_cpu); printf("iter:%d CPU time:%f\n", iter, cpu_time); if (gpu_time < cpu_time) { printf("slack: +\n"); } else { printf("slack: -\n"); } } magma_dpanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); /* put i-th V matrix onto device */ magma_dsetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); /* put T matrix onto device */ magma_queue_sync( queues[1] ); magma_dsetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] ); magma_queue_sync( queues[0] ); if (i + ib < n) { if (i+ib < min_mn-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_dq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } old_i = i; old_ib = ib; } iter ++; } //for nvprof profiler. //cudaProfilerStop(); } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < min_mn) { ib = n-i; if (i != 0) { magma_dgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); } magma_int_t rows = m-i; lapackf77_dgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_dgeqrf */
//################################################################################################## static void *magma_zapplyQ_m_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_zapplyQ_m_id_data*)arg) -> id; magma_zapplyQ_m_data* data = ((magma_zapplyQ_m_id_data*)arg) -> data; magma_int_t ngpu = data -> ngpu; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; magmaDoubleComplex *E = data -> E; magma_int_t lde = data -> lde; magmaDoubleComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaDoubleComplex *TAU = data -> TAU; magmaDoubleComplex *T = data -> T; magma_int_t ldt = data -> ldt; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_zbulge_applyQ_v2_m(ngpu, MagmaLeft, n_gpu, n, nb, Vblksiz, E, lde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); magmaDoubleComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ztile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
/** Purpose ------- DSYTRD_HE2HB reduces a real symmetric matrix A to real symmetric band-diagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. This version stores the triangular matrices T used in the accumulated Householder transformations (I - V T V'). Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = MagmaUpper, the Upper band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements above the band diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = MagmaLower, the the Lower band-diagonal of A is overwritten by the corresponding elements of the band-diagonal matrix T, and the elements below the band-diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] dT DOUBLE_PRECISION array on the GPU, dimension N*NB, where NB is the optimal blocksize. On exit dT holds the upper triangular matrices T from the accumulated Householder transformations (I - V T V') used in the factorization. The nb x nb matrices T are ordered consecutively in memory one after another. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value Further Details --------------- If UPLO = MagmaUpper, the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = MagmaLower, the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = MagmaUpper: if UPLO = MagmaLower: ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). @ingroup magma_dsyev_2stage ********************************************************************/ extern "C" magma_int_t magma_dsytrd_sy2sb_mgpu( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, magmaDouble_ptr dAmgpu[], magma_int_t ldda, magmaDouble_ptr dTmgpu[], magma_int_t lddt, magma_int_t ngpu, magma_int_t distblk, magma_queue_t queues[][20], magma_int_t nqueue, magma_int_t *info) { #define A(a_1,a_2) ( A + ((a_2)-1)*( lda) + (a_1)-1) #define tau_ref(a_1) (tau + (a_1)-1) #define dT(a_0, a_1, a_2) (dTmgpu[a_0] + ((a_2)-1)*(lddt) + (a_1)-1) #define dA(a_0, a_1, a_2) (dAmgpu[a_0] + ((a_2)-1)*(ldda) + (a_1)-1) double c_neg_one = MAGMA_D_NEG_ONE; double c_neg_half = MAGMA_D_NEG_HALF; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; double d_one = MAGMA_D_ONE; magma_int_t pm, pn, indi, indj, pk; magma_int_t pm_old=0, pn_old=0, indi_old=0, flipV=-1; magma_int_t iblock, idev, di; int i; int lwkopt; int lquery; assert (nqueue >= 3); assert (nqueue >= (ngpu+1)); *info = 0; int upper = (uplo == MagmaUpper); lquery = (lwork == -1); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < 1 && ! lquery) { *info = -9; } /* Determine the block size. */ lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_D_MAKE( lwkopt, 0 ); } if (*info != 0) return *info; else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); // limit to 16 threads magma_int_t orig_threads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads( min(orig_threads,16) ); magma_int_t gnode[MagmaMaxGPUs][MagmaMaxGPUs+2]; magma_int_t nbcmplx=0; magma_buildconnection_mgpu(gnode, &nbcmplx, ngpu); #ifdef ENABLE_DEBUG printf(" Initializing communication pattern.... GPU-ncmplx %d\n\n", nbcmplx); #endif double *dspace[MagmaMaxGPUs]; double *dwork[MagmaMaxGPUs], *dworkbis[MagmaMaxGPUs]; double *dvall[MagmaMaxGPUs], *dv[MagmaMaxGPUs], *dw[MagmaMaxGPUs]; double *workngpu[MagmaMaxGPUs+1]; magma_event_t redevents[MagmaMaxGPUs][MagmaMaxGPUs*MagmaMaxGPUs+10]; magma_int_t nbevents = MagmaMaxGPUs*MagmaMaxGPUs; magma_int_t lddv = ldda; magma_int_t lddw = lddv; magma_int_t dwrk2siz = ldda*nb*(ngpu+1); magma_int_t worksiz = n*nb; magma_int_t devworksiz = 2*nb*lddv + nb*lddw + nb*ldda + dwrk2siz; // 2*dv(dv0+dv1) + dw + dwork +dworkbis // local allocation and stream creation // TODO check malloc for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_dmalloc( &dspace[dev], devworksiz ); magma_dmalloc_pinned ( &workngpu[dev], worksiz); dvall[dev] = dspace[dev]; dw[dev] = dvall[dev] + 2*nb*lddv; dwork[dev] = dw[dev] + nb*lddw; dworkbis[dev] = dwork[dev] + nb*ldda; magmablasSetKernelStream( queues[ dev ][ 0 ] ); for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&redevents[dev][i],cudaEventDisableTiming); } } magma_dmalloc_pinned ( &workngpu[ngpu], worksiz); double *worktest = NULL; //magma_dmalloc_cpu( &worktest, n*nb ); // not used // ====================== double *hT = work + lwork - nb*nb; lwork -= nb*nb; memset( hT, 0, nb*nb*sizeof(double)); if (upper) { printf("DSYTRD_HE2HB is not yet implemented for upper matrix storage. Exit.\n"); exit(1); } else { /* Reduce the lower triangle of A */ for (i = 1; i <= n-nb; i += nb) { indi = i+nb; indj = i; pm = n - i - nb + 1; //pn = min(i+nb-1, n-nb) -i + 1; pn = nb; /* Get the current panel (no need for the 1st iteration) */ if (i > 1 ) { // dpanel_to_q copy the upper oof diagonal part of // the matrix to work to be restored later. acctually // the zero's and one's putted are not used this is only // because we don't have a function that copy only the // upper part of A to be restored after copying the // lookahead panel that has been computted from GPU to CPU. dpanel_to_q(MagmaUpper, pn-1, A(i, i+1), lda, work); // find the device who own the panel then send it to the CPU. // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((i-1) / distblk) / ngpu; // local block id di = iblock*distblk + (i-1)%distblk; // local index in parent matrix idev = ((i-1) / distblk) % ngpu; // device with this block //printf("Receiving panel ofsize %d %d from idev %d A(%d,%d) \n",(pm+pn), pn,idev,i-1,di); magma_setdevice( idev ); //magma_device_sync(); magma_dgetmatrix_async( (pm+pn), pn, dA(idev, i, di+1), ldda, A( i, i), lda, queues[ idev ][ nqueue-1 ] ); //magma_setdevice( 0 ); //printf("updating dsyr2k on A(%d,%d) of size %d %d \n",indi_old+pn_old-1,indi_old+pn_old-1,pm_old-pn_old,pn_old); // compute DSYR2K_MGPU magmablas_dsyr2k_mgpu2( MagmaLower, MagmaNoTrans, pm_old-pn_old, pn_old, c_neg_one, dv, pm_old, pn_old, dw, pm_old, pn_old, d_one, dAmgpu, ldda, indi_old+pn_old-1, ngpu, distblk, queues, 2 ); //magma_setdevice( 0 ); magma_setdevice( idev ); magma_queue_sync( queues[idev][ nqueue-1 ] ); //magma_setdevice( 0 ); dq_to_panel(MagmaUpper, pn-1, A(i, i+1), lda, work); } /* ========================================================== QR factorization on a panel starting nb off of the diagonal. Prepare the V and T matrices. ========================================================== */ lapackf77_dgeqrf(&pm, &pn, A(indi, indj), &lda, tau_ref(i), work, &lwork, info); /* Form the matrix T */ pk=min(pm,pn); lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &pm, &pk, A(indi, indj), &lda, tau_ref(i), hT, &nb); /* Prepare V - put 0s in the upper triangular part of the panel (and 1s on the diagonal), temporaly storing the original in work */ dpanel_to_q(MagmaUpper, pk, A(indi, indj), lda, work); /* Send V and T from the CPU to the GPU */ // To be able to overlap the GET with the DSYR2K // it should be done on last stream. // TO Avoid a BUG that is overwriting the old_V // used atthis moment by dsyr2k with the new_V // send it now, we decide to have a flipflop // vector of Vs. if step%2=0 use V[0] else use V[nb*n] flipV = ((i-1)/nb)%2; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { dv[dev] = dvall[dev] + flipV*nb*lddv; } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); // send V magma_dsetmatrix_async( pm, pk, A(indi, indj), lda, dv[dev], pm, queues[dev][nqueue-1] ); // Send the triangular factor T to the GPU magma_dsetmatrix_async( pk, pk, hT, nb, dT(dev, 1, i), lddt, queues[dev][nqueue-1] ); } /* ========================================================== Compute W: 1. X = A (V T) 2. W = X - 0.5* V * (T' * (V' * X)) ========================================================== */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // dwork = V T magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ nqueue-1 ] ); magma_queue_sync( queues[dev][nqueue-1] ); magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_one, dv[dev], pm, dT(dev, 1, i), lddt, c_zero, dwork[dev], pm); } // =============================================== // SYNC TO BE SURE THAT BOTH V AND T WERE // RECEIVED AND VT IS COMPUTED and SYR2K is done // =============================================== for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t s = 0; s < nqueue; ++s ) magma_queue_sync( queues[dev][s] ); } // compute DSYMM_MGPU // The broadcast of the result done inside this function // should be done in stream [0] because i am assuming this // for the GEMMs below otherwise I have to SYNC over the // Broadcasting stream. if (ngpu == 1) { magmablasSetKernelStream( queues[ 0 ][ 0 ] ); magma_dsymm(MagmaLeft, uplo, pm, pk, c_one, dAmgpu[0]+(indi-1)*ldda+(indi-1), ldda, dwork[0], pm, c_zero, dw[0], pm); } else { magmablas_dsymm_mgpu_com( MagmaLeft, uplo, pm, pk, c_one, dAmgpu, ldda, indi-1, dwork, pm, c_zero, dw, pm, dworkbis, dwrk2siz, worktest, pm, workngpu, worksiz, ngpu, distblk, queues, nqueue-1, redevents, nbevents, gnode, nbcmplx); } /* dwork = V*T already ==> dwork' = T'*V' * compute T'*V'*X ==> dwork'*W ==> * dwork + pm*nb = ((T' * V') * X) = dwork' * X = dwork' * W */ for( magma_int_t dev = 0; dev < ngpu; ++dev ) { // Here we have to wait until the broadcast of DSYMM has been done. // Note that the broadcast should be done on stream[0] so in a way // we can continue here on the same stream and avoid a sync magma_setdevice( dev ); magmablasSetKernelStream( queues[ dev ][ 0 ] ); // magma_queue_sync( queues[dev][0] ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, pk, pk, pm, c_one, dwork[dev], pm, dw[dev], pm, c_zero, dworkbis[dev], nb); /* W = X - 0.5 * V * T'*V'*X * = X - 0.5 * V * (dwork + pm*nb) = W - 0.5 * V * (dwork + pm*nb) */ magma_dgemm(MagmaNoTrans, MagmaNoTrans, pm, pk, pk, c_neg_half, dv[dev], pm, dworkbis[dev], nb, c_one, dw[dev], pm); } /* restore the panel it is put here to overlap with the previous GEMM*/ dq_to_panel(MagmaUpper, pk, A(indi, indj), lda, work); // =============================================== // SYNC TO BE SURE THAT BOTH V AND W ARE DONE // =============================================== // Synchronise to be sure that W has been computed // because next DSYR2K use streaming and may happen // that lunch a gemm on stream 2 while stream 0 // which compute those 2 GEMM above has not been // computed and also used for the same reason in // the panel update below and also for the last HER2K for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_queue_sync( queues[dev][0] ); } /* ========================================================== Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' ========================================================== */ if (i + nb <= n-nb) { /* There would be next iteration; do lookahead - update the next panel */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ nqueue-1 ] ); //magma_queue_sync( queues[idev][0] ); removed because the sync has been done in the loop above magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dv[idev], pm, dw[idev], pm, c_one, dA(idev, indi, di+1), ldda); magma_dgemm(MagmaNoTrans, MagmaConjTrans, pm, pn, pn, c_neg_one, dw[idev], pm, dv[idev], pm, c_one, dA(idev, indi, di+1), ldda); //printf("updating next panel distblk %d idev %d on A(%d,%d) of size %d %d %d \n",distblk,idev,indi-1,di,pm,pn,pn); } else { /* no look-ahead as this is last iteration */ // below a -1 was added and then a -1 was done on di because of the fortran indexing iblock = ((indi-1) / distblk) / ngpu; // local block id di = iblock*distblk + (indi-1)%distblk; // local index in parent matrix idev = ((indi-1) / distblk) % ngpu; // device with this block magma_setdevice( idev ); magmablasSetKernelStream( queues[ idev ][ 0 ] ); //printf("LAST DSYR2K idev %d on A(%d,%d) of size %d \n",idev, indi-1,di,pk); magma_dsyr2k(MagmaLower, MagmaNoTrans, pk, pk, c_neg_one, dv[idev], pm, dw[idev], pm, d_one, dA(idev, indi, di+1), ldda); /* Send the last block to the CPU */ dpanel_to_q(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); magma_dgetmatrix( pk, pk, dA(idev, indi, di+1), ldda, A(n-pk+1, n-pk+1), lda ); dq_to_panel(MagmaUpper, pk-1, A(n-pk+1, n-pk+2), lda, work); } indi_old = indi; //indj_old = indj; pm_old = pm; pn_old = pn; } // end loop for (i) }// end of LOWER //magma_setdevice( 0 ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magma_free( dspace[dev]); magma_free_pinned(workngpu[dev]); for( magma_int_t e = 0; e < nbevents; ++e ) { magma_event_destroy( redevents[dev][e] ); } } magma_free_pinned(workngpu[ngpu]); magma_free_cpu(worktest); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); magma_set_lapack_numthreads( orig_threads ); work[0] = MAGMA_D_MAKE( lwkopt, 0 ); return *info; } /* magma_dsytrd_sy2sb_mgpu */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. // MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? " " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/** Purpose ------- Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A is stored; - = MagmaLower: Lower triangles of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nb INTEGER The order of the band matrix A. N >= NB >= 0. @param[in] Vblksiz INTEGER The size of the block of householder vectors applied at once. @param[in] A (workspace) COMPLEX_16 array, dimension (LDA, N) On entry the band matrix stored in the following way: @param[in] lda INTEGER The leading dimension of the array A. LDA >= 2*NB. @param[out] d DOUBLE array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). @param[out] e DOUBLE array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = MagmaUpper, E(i) = A(i+1,i) if UPLO = MagmaLower. @param[out] V COMPLEX_16 array, dimension (BLKCNT, LDV, VBLKSIZ) On exit it contains the blocks of householder reflectors BLKCNT is the number of block and it is returned by the funtion MAGMA_BULGE_GET_BLKCNT. @param[in] ldv INTEGER The leading dimension of V. LDV > NB + VBLKSIZ + 1 @param[out] TAU COMPLEX_16 dimension(BLKCNT, VBLKSIZ) ??? @param[in] compT INTEGER if COMPT = 0 T is not computed if COMPT = 1 T is computed @param[out] T COMPLEX_16 dimension(LDT *) if COMPT = 1 on exit contains the matrices T needed for Q2 if COMPT = 0 T is not referenced @param[in] ldt INTEGER The leading dimension of T. LDT > Vblksiz @ingroup magma_zheev_2stage ********************************************************************/ extern "C" magma_int_t magma_zhetrd_hb2st( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz, magmaDoubleComplex *A, magma_int_t lda, double *d, double *e, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magma_int_t compT, magmaDoubleComplex *T, magma_int_t ldt) { #ifdef ENABLE_TIMER real_Double_t timeblg=0.0; #endif magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); //const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t INgrsiz=1; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t nbtiles = magma_ceildiv(n, nb); memset(T, 0, blkcnt*ldt*Vblksiz*sizeof(magmaDoubleComplex)); memset(TAU, 0, blkcnt*Vblksiz*sizeof(magmaDoubleComplex)); memset(V, 0, blkcnt*ldv*Vblksiz*sizeof(magmaDoubleComplex)); magma_int_t* prog; magma_malloc_cpu((void**) &prog, (2*nbtiles+threads+10)*sizeof(magma_int_t)); memset(prog, 0, (2*nbtiles+threads+10)*sizeof(magma_int_t)); magma_zbulge_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zbulge_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; magma_zbulge_data data_bulge(threads, n, nb, nbtiles, INgrsiz, Vblksiz, compT, A, lda, V, ldv, TAU, T, ldt, prog); // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); //timing #ifdef ENABLE_TIMER timeblg = magma_wtime(); #endif // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_zbulge_id_data(thread, &data_bulge); pthread_create(&thread_id[thread], &thread_attr, magma_zhetrd_hb2st_parallel_section, &arg[thread]); } arg[0] = magma_zbulge_id_data(0, &data_bulge); magma_zhetrd_hb2st_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } // timing #ifdef ENABLE_TIMER timeblg = magma_wtime()-timeblg; printf(" time BULGE+T = %f\n", timeblg); #endif magma_free_cpu(thread_id); magma_free_cpu(arg); magma_free_cpu(prog); magma_set_lapack_numthreads(mklth); /*================================================ * store resulting diag and lower diag d and e * note that d and e are always real *================================================*/ /* Make diagonal and superdiagonal elements real, * storing them in d and e */ /* In complex case, the off diagonal element are * not necessary real. we have to make off-diagonal * elements real and copy them to e. * When using HouseHolder elimination, * the ZLARFG give us a real as output so, all the * diagonal/off-diagonal element except the last one are already * real and thus we need only to take the abs of the last * one. * */ #if defined(PRECISION_z) || defined(PRECISION_c) if (uplo == MagmaLower) { for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_Z_REAL( A[i*lda ] ); e[i] = MAGMA_Z_REAL( A[i*lda+1] ); } d[n-1] = MAGMA_Z_REAL(A[(n-1)*lda]); } else { /* MagmaUpper not tested yet */ for (magma_int_t i=0; i < n-1; i++) { d[i] = MAGMA_Z_REAL( A[i*lda+nb] ); e[i] = MAGMA_Z_REAL( A[i*lda+nb-1] ); } d[n-1] = MAGMA_Z_REAL(A[(n-1)*lda+nb]); } /* end MagmaUpper */ #else if ( uplo == MagmaLower ) { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda]; // diag e[i] = A[i*lda+1]; // lower diag } d[n-1] = A[(n-1)*lda]; } else { for (magma_int_t i=0; i < n-1; i++) { d[i] = A[i*lda+nb]; // diag e[i] = A[i*lda+nb-1]; // lower diag } d[n-1] = A[(n-1)*lda+nb]; } #endif return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgetri_batched */ int main( int argc, char** argv) { TESTING_INIT(); // constants const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A, *h_Ainv, *h_R, *work; magmaDoubleComplex_ptr d_A, d_invA; magmaDoubleComplex_ptr *dA_array; magmaDoubleComplex_ptr *dinvA_array; magma_int_t **dipiv_array; magma_int_t *dinfo_array; magma_int_t *ipiv, *cpu_info; magma_int_t *d_ipiv, *d_info; magma_int_t N, n2, lda, ldda, info, info1, info2, lwork; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magmaDoubleComplex tmp; double error, rwork[1]; magma_int_t columns; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); magma_int_t batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% batchCount N CPU Gflop/s (ms) GPU Gflop/s (ms) ||I - A*A^{-1}||_1 / (N*cond(A))\n"); printf("%%===============================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N * batchCount; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default // This is the correct flops but since this getri_batched is based on // 2 trsm = getrs and to know the real flops I am using the getrs one //gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRI( N ))/ 1e9 * batchCount; gflops = (FLOPS_ZGETRF( N, N ) + FLOPS_ZGETRS( N, N ))/ 1e9 * batchCount; // query for workspace size lwork = -1; lapackf77_zgetri( &N, NULL, &lda, NULL, &tmp, &lwork, &info ); if (info != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) info, magma_strerror( info )); } lwork = magma_int_t( MAGMA_Z_REAL( tmp )); TESTING_MALLOC_CPU( cpu_info, magma_int_t, batchCount ); TESTING_MALLOC_CPU( ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_CPU( work, magmaDoubleComplex, lwork*batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Ainv, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_invA, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, N * batchCount ); TESTING_MALLOC_DEV( d_info, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinvA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); TESTING_MALLOC_DEV( dipiv_array, magma_int_t*, batchCount ); /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); columns = N * batchCount; lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_R, &lda ); lapackf77_zlacpy( MagmaFullStr, &N, &columns, h_A, &lda, h_Ainv, &lda ); magma_zsetmatrix( N, columns, h_R, lda, d_A, ldda, opts.queue ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zset_pointer( dA_array, d_A, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_zset_pointer( dinvA_array, d_invA, ldda, 0, 0, ldda * N, batchCount, opts.queue ); magma_iset_pointer( dipiv_array, d_ipiv, 1, 0, 0, N, batchCount, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); info1 = magma_zgetrf_batched( N, N, dA_array, ldda, dipiv_array, dinfo_array, batchCount, opts.queue); info2 = magma_zgetri_outofplace_batched( N, dA_array, ldda, dipiv_array, dinvA_array, ldda, dinfo_array, batchCount, opts.queue); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1, opts.queue ); for (magma_int_t i=0; i < batchCount; i++) { if (cpu_info[i] != 0 ) { printf("magma_zgetrf_batched matrix %d returned error %d\n", (int) i, (int)cpu_info[i] ); } } if (info1 != 0) printf("magma_zgetrf_batched returned argument error %d: %s.\n", (int) info1, magma_strerror( info1 )); if (info2 != 0) printf("magma_zgetri_batched returned argument error %d: %s.\n", (int) info2, magma_strerror( info2 )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (int i=0; i < batchCount; i++) { magma_int_t locinfo; lapackf77_zgetrf(&N, &N, h_Ainv + i*lda*N, &lda, ipiv + i*N, &locinfo); if (locinfo != 0) { printf("lapackf77_zgetrf returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } lapackf77_zgetri(&N, h_Ainv + i*lda*N, &lda, ipiv + i*N, work + i*lwork, &lwork, &locinfo ); if (locinfo != 0) { printf("lapackf77_zgetri returned error %d: %s.\n", (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; printf("%10d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) batchCount, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000. ); } else { printf("%10d %5d --- ( --- ) %7.2f (%7.2f)", (int) batchCount, (int) N, gpu_perf, gpu_time*1000. ); } /* ===================================================================== Check the result =================================================================== */ if ( opts.check ) { magma_igetvector( N*batchCount, d_ipiv, 1, ipiv, 1, opts.queue ); magma_zgetmatrix( N, N*batchCount, d_invA, ldda, h_Ainv, lda, opts.queue ); error = 0; for (magma_int_t i=0; i < batchCount; i++) { for (magma_int_t k=0; k < N; k++) { if (ipiv[i*N+k] < 1 || ipiv[i*N+k] > N ) { printf("error for matrix %d ipiv @ %d = %d\n", (int) i, (int) k, (int) ipiv[i*N+k]); error = -1; } } if (error == -1) { break; } // compute 1-norm condition number estimate, following LAPACK's zget03 double normA, normAinv, rcond, err; normA = lapackf77_zlange( "1", &N, &N, h_A + i*lda*N, &lda, rwork ); normAinv = lapackf77_zlange( "1", &N, &N, h_Ainv + i*lda*N, &lda, rwork ); if ( normA <= 0 || normAinv <= 0 ) { rcond = 0; err = 1 / (tol/opts.tolerance); // == 1/eps } else { rcond = (1 / normA) / normAinv; // R = I // R -= A*A^{-1} // err = ||I - A*A^{-1}|| / ( N ||A||*||A^{-1}|| ) = ||R|| * rcond / N, using 1-norm lapackf77_zlaset( "full", &N, &N, &c_zero, &c_one, h_R + i*lda*N, &lda ); blasf77_zgemm( "no", "no", &N, &N, &N, &c_neg_one, h_A + i*lda*N, &lda, h_Ainv + i*lda*N, &lda, &c_one, h_R + i*lda*N, &lda ); err = lapackf77_zlange( "1", &N, &N, h_R + i*lda*N, &lda, rwork ); err = err * rcond / N; } if ( isnan(err) || isinf(err) ) { error = err; break; } error = max( err, error ); } bool okay = (error < tol); status += ! okay; printf(" %8.2e %s\n", error, (okay ? "ok" : "failed") ); } else { printf("\n"); } TESTING_FREE_CPU( cpu_info ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( work ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Ainv ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_invA ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_info ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dinvA_array ); TESTING_FREE_DEV( dinfo_array ); TESTING_FREE_DEV( dipiv_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
static void *magma_zhetrd_hb2st_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_zbulge_id_data*)arg) -> id; magma_zbulge_data* data = ((magma_zbulge_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t nb = data -> nb; magma_int_t nbtiles = data -> nbtiles; magma_int_t grsiz = data -> grsiz; magma_int_t Vblksiz = data -> Vblksiz; magma_int_t compT = data -> compT; magmaDoubleComplex *A = data -> A; magma_int_t lda = data -> lda; magmaDoubleComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaDoubleComplex *TAU = data -> TAU; magmaDoubleComplex *T = data -> T; magma_int_t ldt = data -> ldt; volatile magma_int_t* prog = data -> prog; pthread_barrier_t* barrier = &(data -> barrier); //magma_int_t sys_corenbr = 1; #ifdef ENABLE_TIMER real_Double_t timeB=0.0, timeT=0.0; #endif // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (compT == 1) { /* compute the Q1 overlapped with the bulge chasing+T. * if all_cores_num=1 it call Q1 on GPU and then bulgechasing. * otherwise the first thread run Q1 on GPU and * the other threads run the bulgechasing. * */ if (allcores_num == 1) { //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER timeB = magma_wtime(); #endif magma_ztile_bulge_parallel(0, 1, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); #ifdef ENABLE_TIMER timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f\n", timeB); #endif //========================= // compute the T's to be used when applying Q2 //========================= #ifdef ENABLE_TIMER timeT = magma_wtime(); #endif magma_ztile_bulge_computeT_parallel(0, 1, V, ldv, TAU, T, ldt, n, nb, Vblksiz); #ifdef ENABLE_TIMER timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f\n", timeT); #endif } else { // allcore_num > 1 magma_int_t id = my_core_id; magma_int_t tot = allcores_num; //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if (id == 0) timeB = magma_wtime(); #endif magma_ztile_bulge_parallel(id, tot, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (id == 0) { timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f\n", timeB); } #endif //========================= // compute the T's to be used when applying Q2 //========================= #ifdef ENABLE_TIMER if (id == 0) timeT = magma_wtime(); #endif magma_ztile_bulge_computeT_parallel(id, tot, V, ldv, TAU, T, ldt, n, nb, Vblksiz); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (id == 0) { timeT = magma_wtime()-timeT; printf(" Finish T's timing= %f\n", timeT); } #endif } // allcore == 1 } else { // WANTZ = 0 //========================= // bulge chasing //========================= #ifdef ENABLE_TIMER if (my_core_id == 0) timeB = magma_wtime(); #endif magma_ztile_bulge_parallel(my_core_id, allcores_num, A, lda, V, ldv, TAU, n, nb, nbtiles, grsiz, Vblksiz, prog); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 0) { timeB = magma_wtime()-timeB; printf(" Finish BULGE timing= %f\n", timeB); } #endif } // WANTZ > 0 #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
extern "C" magma_int_t magma_dbulge_back( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, double *Z, magma_int_t ldz, magmaDouble_ptr dZ, magma_int_t lddz, double *V, magma_int_t ldv, double *TAU, double *T, magma_int_t ldt, magma_int_t* info) { magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); real_Double_t timeaplQ2=0.0; double f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) //double gpu_cpu_perf = 50; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) //double gpu_cpu_perf = 16; // gpu over cpu performance //100% ev // SandyB. - Fermi (M2090) //#else // double gpu_cpu_perf = 27.5; // gpu over cpu performance //100% ev // Westmere - Fermi (M2090) //double gpu_cpu_perf = 37; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) // double gpu_cpu_perf = 130; // gpu over cpu performance //100% ev // Bulldozer - Kepler (K20X) //#endif magma_int_t gpu_cpu_perf = magma_get_dbulge_gcperf(); if (threads > 1) { f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf) ); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //n_gpu=ne; //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if (n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_dapplyQ_data data_applyQ; magma_dapplyQ_data_init(&data_applyQ, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_dapplyQ_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_dapplyQ_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { magma_dapplyQ_id_data_init(&(arg[thread]), thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_dapplyQ_parallel_section, &arg[thread]); } magma_dapplyQ_id_data_init(&(arg[0]), 0, &data_applyQ); magma_dapplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); magma_dapplyQ_data_destroy(&data_applyQ); magma_dsetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ } else { magma_dsetmatrix(n, ne, Z, ldz, dZ, lddz); magma_dbulge_applyQ_v2(MagmaLeft, ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_set_lapack_numthreads(mklth); return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zgeqrf_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf=0, cublas_time=0, cpu_perf, cpu_time; double magma_error, cublas_error, magma_error2, cublas_error2; magmaDoubleComplex *h_A, *h_R, *h_Amagma, *tau, *h_work, tmp[1]; magmaDoubleComplex *d_A, *dtau_magma, *dtau_cublas; magmaDoubleComplex **dA_array = NULL; magmaDoubleComplex **dtau_array = NULL; magma_int_t *dinfo_magma, *dinfo_cublas; magma_int_t M, N, lda, ldda, lwork, n2, info, min_mn; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magma_int_t column; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); batchCount = opts.batchcount; double tol = opts.tolerance * lapackf77_dlamch("E"); printf("%% BatchCount M N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) |R - Q^H*A|_mag |I - Q^H*Q|_mag |R - Q^H*A|_cub |I - Q^H*Q|_cub\n"); printf("%%============================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N * batchCount; ldda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default gflops = (FLOPS_ZGEQRF( M, N ) + FLOPS_ZGEQRT( M, N )) / 1e9 * batchCount; /* Allocate memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn * batchCount ); TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_Amagma, magmaDoubleComplex, n2 ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N * batchCount ); TESTING_MALLOC_DEV( dtau_magma, magmaDoubleComplex, min_mn * batchCount); TESTING_MALLOC_DEV( dtau_cublas, magmaDoubleComplex, min_mn * batchCount); TESTING_MALLOC_DEV( dinfo_magma, magma_int_t, batchCount); TESTING_MALLOC_DEV( dinfo_cublas, magma_int_t, batchCount); TESTING_MALLOC_DEV( dA_array, magmaDoubleComplex*, batchCount ); TESTING_MALLOC_DEV( dtau_array, magmaDoubleComplex*, batchCount ); // to determine the size of lwork lwork = -1; lapackf77_zgeqrf(&M, &N, NULL, &M, NULL, tmp, &lwork, &info); lwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] ); lwork = max(lwork, N*N); TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lwork * batchCount); column = N * batchCount; /* Initialize the matrix */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); lapackf77_zlacpy( MagmaFullStr, &M, &column, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_zsetmatrix( M, column, h_R, lda, d_A, ldda ); magma_zset_pointer( dA_array, d_A, 1, 0, 0, ldda*N, batchCount, opts.queue ); magma_zset_pointer( dtau_array, dtau_magma, 1, 0, 0, min_mn, batchCount, opts.queue ); magma_time = magma_sync_wtime( opts.queue ); info = magma_zgeqrf_batched(M, N, dA_array, ldda, dtau_array, dinfo_magma, batchCount, opts.queue); magma_time = magma_sync_wtime( opts.queue ) - magma_time; magma_perf = gflops / magma_time; magma_zgetmatrix( M, column, d_A, ldda, h_Amagma, lda); if (info != 0) { printf("magma_zgeqrf_batched returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using CUBLAS =================================================================== */ /* cublasZgeqrfBatched is only available from CUBLAS v6.5 */ #if CUDA_VERSION >= 6050 magma_zsetmatrix( M, column, h_R, lda, d_A, ldda ); magma_zset_pointer( dA_array, d_A, 1, 0, 0, ldda*N, batchCount, opts.queue ); magma_zset_pointer( dtau_array, dtau_cublas, 1, 0, 0, min_mn, batchCount, opts.queue ); cublas_time = magma_sync_wtime( opts.queue ); int cublas_info; // not magma_int_t cublasZgeqrfBatched( opts.handle, M, N, dA_array, ldda, dtau_array, &cublas_info, batchCount); cublas_time = magma_sync_wtime( opts.queue ) - cublas_time; cublas_perf = gflops / cublas_time; if (cublas_info != 0) { printf("cublasZgeqrfBatched returned error %d: %s.\n", (int) cublas_info, magma_strerror( cublas_info )); } #endif /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.check ) { cpu_time = magma_wtime(); // #define BATCHED_DISABLE_PARCPU #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_int_t nthreads = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); magma_set_omp_numthreads(nthreads); #pragma omp parallel for schedule(dynamic) #endif for (magma_int_t s=0; s < batchCount; s++) { magma_int_t locinfo; lapackf77_zgeqrf(&M, &N, h_A + s * lda * N, &lda, tau + s * min_mn, h_work + s * lwork, &lwork, &locinfo); if (locinfo != 0) { printf("lapackf77_zgeqrf matrix %d returned error %d: %s.\n", (int) s, (int) locinfo, magma_strerror( locinfo )); } } #if !defined (BATCHED_DISABLE_PARCPU) && defined(_OPENMP) magma_set_lapack_numthreads(nthreads); #endif cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) { printf("lapackf77_zgeqrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ===================================================================== Check the MAGMA CUBLAS result compared to LAPACK =================================================================== */ magma_int_t ldq = M; magma_int_t ldr = min_mn; magmaDoubleComplex *Q, *R; double *work; TESTING_MALLOC_CPU( Q, magmaDoubleComplex, ldq*min_mn ); // M by K TESTING_MALLOC_CPU( R, magmaDoubleComplex, ldr*N ); // K by N TESTING_MALLOC_CPU( work, double, min_mn ); /* check magma result */ magma_error = 0; magma_error2 = 0; magma_zgetvector(min_mn*batchCount, dtau_magma, 1, tau, 1); for (int i=0; i < batchCount; i++) { double err, err2; get_QR_error(M, N, min_mn, h_Amagma + i*lda*N, h_R + i*lda*N, lda, tau + i*min_mn, Q, ldq, R, ldr, h_work, lwork, work, &err, &err2); if ( isnan(err) || isinf(err) ) { magma_error = err; break; } magma_error = max( err, magma_error ); magma_error2 = max( err2, magma_error2 ); } /* check cublas result */ cublas_error = 0; cublas_error2 = 0; #if CUDA_VERSION >= 6050 magma_zgetvector(min_mn*batchCount, dtau_magma, 1, tau, 1); magma_zgetmatrix( M, column, d_A, ldda, h_A, lda); for (int i=0; i < batchCount; i++) { double err, err2; get_QR_error(M, N, min_mn, h_A + i*lda*N, h_R + i*lda*N, lda, tau + i*min_mn, Q, ldq, R, ldr, h_work, lwork, work, &err, &err2); if ( isnan(err) || isinf(err) ) { cublas_error = err; break; } cublas_error = max( err, cublas_error ); cublas_error2 = max( err2, cublas_error2 ); } #endif TESTING_FREE_CPU( Q ); Q = NULL; TESTING_FREE_CPU( R ); R = NULL; TESTING_FREE_CPU( work ); work = NULL; bool okay = (magma_error < tol && magma_error2 < tol); //bool okay_cublas = (cublas_error < tol && cublas_error2 < tol); status += ! okay; printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %15.2e %15.2e %15.2e %15.2e %s\n", (int)batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, magma_error2, cublas_error, cublas_error2, (okay ? "ok" : "failed") ); } else { printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) --- ( --- ) ---\n", (int)batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time ); } TESTING_FREE_CPU( tau ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_Amagma); TESTING_FREE_CPU( h_work ); TESTING_FREE_PIN( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( dtau_magma ); TESTING_FREE_DEV( dtau_cublas ); TESTING_FREE_DEV( dinfo_magma ); TESTING_FREE_DEV( dinfo_cublas ); TESTING_FREE_DEV( dA_array ); TESTING_FREE_DEV( dtau_array ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }