/*
    Returns the V block size for dbulge: nb capped at 64.
    Both architecture branches currently use the same cap; m and nbthreads
    are not consulted.
*/
magma_int_t magma_dbulge_get_Vblksiz( magma_int_t m, magma_int_t nb, magma_int_t nbthreads )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler + SB on the left, 2.x Fermi or 1.x on the right.
    return ( arch >= 300 ) ? min(nb,64) : min(nb,64);
}
/*
    Returns nb for zgebrd. The size argument is ignored and every
    architecture currently yields 32.
*/
magma_int_t magma_get_zgebrd_nb( magma_int_t /*m*/ )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 2.x Fermi and 1.x share the same value.
    return ( arch >= 200 ) ? 32 : 32;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for gehrd based on m
      m < 1024 gives 32 on every architecture; larger m gives 96 on
      2.x Fermi and newer, 64 on 1.x.
*/
magma_int_t magma_get_sgehrd_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    if ( m < 1024 ) {
        return 32;
    }
    return ( arch >= 200 ) ? 96 : 64;
}
/*
    Returns nb for dgelqf based on m.
    On 2.x Fermi and newer the value is delegated to magma_get_dgeqrf_nb;
    on 1.x a three-step table keyed on m is used.
*/
magma_int_t magma_get_dgelqf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 2.x Fermi
    if ( arch >= 200 ) {
        return magma_get_dgeqrf_nb( m );
    }

    // 1.x
    if ( m < 2048 ) {
        return 32;
    }
    return ( m < 4032 ) ? 64 : 128;
}
/*
    Returns the zbulge gcperf value for the current device architecture:
    50 for 3.x, 15000 for 2.x, 10000 for 1.x.
*/
magma_int_t magma_get_zbulge_gcperf( )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler + SB
    if ( arch >= 300 ) {
        return 50;
    }
    // 2.x Fermi, otherwise 1.x
    return ( arch >= 200 ) ? 15000 : 10000;
}
/*
    Returns nb for dbulge: 128 on 2.x Fermi and 3.x Kepler + SB, 64 on 1.x.
    m and nbthreads are not consulted.
*/
magma_int_t magma_get_dbulge_nb( magma_int_t m, magma_int_t nbthreads )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler + SB
    if ( arch >= 300 ) {
        return 128;
    }
    // 2.x Fermi, otherwise 1.x
    return ( arch >= 200 ) ? 128 : 64;
}
/*
    Returns nb for cgeqrf based on m.
    3.x Kepler uses a two-step table; 1.x and 2.x Fermi use a three-step table.
*/
magma_int_t magma_get_cgeqrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return ( m < 4096 ) ? 64 : 128;
    }

    // 1.x and 2.x Fermi
    if ( m < 2048 ) {
        return 32;
    }
    return ( m < 4096 ) ? 64 : 128;
}
/*
    Returns nb for dgeqrf based on m.
    3.x Kepler: 64 / 128 / 256 with breakpoints at 3072 and 10240.
    1.x and 2.x Fermi: 64 / 128 with a breakpoint at 4096.
*/
magma_int_t magma_get_dgeqrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        if ( m < 3072 ) {
            return 64;
        }
        return ( m < 10240 ) ? 128 : 256;
    }

    // 1.x and 2.x Fermi
    return ( m < 4096 ) ? 64 : 128;
}
/*
    Returns nb for the multi-GPU zbulge variant.
    All three architecture tiers currently yield 64; m is not consulted.
*/
magma_int_t magma_get_zbulge_nb_mgpu( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler + SB
    if ( arch >= 300 ) {
        return 64;
    }
    // 2.x Fermi, otherwise 1.x
    return ( arch >= 200 ) ? 64 : 64;
}
/*
    Returns the V block size for zbulge: nb capped by an architecture-
    dependent limit. On 3.x the limit is 64 when nbthreads > 14 and 32
    otherwise; on 2.x Fermi or 1.x it is 48. The first argument is ignored.
*/
magma_int_t magma_zbulge_get_Vblksiz( magma_int_t /*m*/, magma_int_t nb, magma_int_t nbthreads )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 2.x Fermi or 1.x
    if ( arch < 300 ) {
        return min(nb, 48);
    }

    // 3.x Kepler + SB
    return ( nbthreads > 14 ) ? min(nb, 64) : min(nb, 32);
}
/*
    Returns nb for zhegst: 384 on 3.x Kepler, 256 on 2.x Fermi, 64 on 1.x.
    m is not consulted.
*/
magma_int_t magma_get_zhegst_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return 384;
    }
    // 2.x Fermi, otherwise 1.x
    return ( arch >= 200 ) ? 256 : 64;
}
/*
    Returns nb for zgebrd. Both architecture tiers currently yield 32
    and m is not consulted.
*/
magma_int_t magma_get_zgebrd_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 2.x Fermi and 1.x share the same value.
    return ( arch >= 200 ) ? 32 : 32;
}
/*
    Returns nb for chegst based on m.
    3.x Kepler: 384 for m < 2048, else 768. 2.x Fermi: 512. 1.x: 64.
*/
magma_int_t magma_get_chegst_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return ( m < 2048 ) ? 384 : 768;
    }
    // 2.x Fermi, otherwise 1.x
    return ( arch >= 200 ) ? 512 : 64;
}
/*
    Returns nb for zpotrf based on m.
    3.x Kepler: 256. 2.x Fermi: 192 for m < 1500, else 256. 1.x: 64.
*/
magma_int_t magma_get_zpotrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return 256;
    }
    // 1.x
    if ( arch < 200 ) {
        return 64;
    }
    // 2.x Fermi
    return ( m < 1500 ) ? 192 : 256;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for sygst based on m
      3.x Kepler: 768 for m < 4096, else 1536.
      2.x Fermi:  512 for m < 2048, else 1024.
      1.x:        64.
*/
magma_int_t magma_get_ssygst_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return ( m < 4096 ) ? 768 : 1536;
    }
    // 2.x Fermi
    if ( arch >= 200 ) {
        return ( m < 2048 ) ? 512 : 1024;
    }
    // 1.x
    return 64;
}
/*
    Returns nb for zgetrf based on m.
    3.x Kepler: 64 / 256 / 512 with breakpoints at 4096 and 8192.
    2.x Fermi:  64 / 128 with a breakpoint at 4096.
    1.x:        128.
*/
magma_int_t magma_get_zgetrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 1.x
    if ( arch < 200 ) {
        return 128;
    }

    // 2.x Fermi and 3.x Kepler agree below 4096.
    if ( m < 4096 ) {
        return 64;
    }

    // 2.x Fermi
    if ( arch < 300 ) {
        return 128;
    }

    // 3.x Kepler
    return ( m < 8192 ) ? 256 : 512;
}
/*
    Returns nb for the dsygst "_m" variant.

    The value is a fixed 256 for every matrix size and architecture. The
    original source marked it "to be updated", so architecture-specific
    tuning is still pending. The table that used to follow the
    unconditional return could never execute and has been removed, along
    with its device query.

    @param m  Matrix size; currently ignored.
    @return   256.
*/
magma_int_t magma_get_dsygst_nb_m( magma_int_t m )
{
    (void) m;  // not consulted until a tuned table is introduced
    return 256;  // to be updated
}
/*
    Returns nb for dpotrf based on m.
    3.x Kepler: 256 for m < 3072, else 512.
    2.x Fermi:  256.
    1.x:        128 for m < 4256, else 256.
*/
magma_int_t magma_get_dpotrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return ( m < 3072 ) ? 256 : 512;
    }
    // 2.x Fermi
    if ( arch >= 200 ) {
        return 256;
    }
    // 1.x: the two lowest size ranges (< 3328 and < 4256) both give 128.
    return ( m < 4256 ) ? 128 : 256;
}
/*
    Returns nb for zbulge.
    3.x Kepler + SB: 128 when nbthreads > 14, else 64.
    2.x Fermi and 1.x: 64.
    The first argument is ignored.
*/
magma_int_t magma_get_zbulge_nb( magma_int_t /*m*/, magma_int_t nbthreads )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler + SB
    if ( arch >= 300 ) {
        return ( nbthreads > 14 ) ? 128 : 64;
    }
    // 2.x Fermi, otherwise 1.x
    return ( arch >= 200 ) ? 64 : 64;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for potrf based on m
      3.x Kepler: 256 for m < 1500, else 512.
      2.x Fermi:  256 for m < 2048, else 512.
      1.x:        128 / 224 / 288 with breakpoints at 3328 and 4256.
*/
magma_int_t magma_get_spotrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        return ( m < 1500 ) ? 256 : 512;
    }
    // 2.x Fermi
    if ( arch >= 200 ) {
        return ( m < 2048 ) ? 256 : 512;
    }
    // 1.x
    if ( m < 3328 ) {
        return 128;
    }
    return ( m < 4256 ) ? 224 : 288;
}
/*
    Returns nb for dgetrf based on m.
    3.x Kepler: 128 / 256 / 512 with breakpoints at 3072 and 8192.
    2.x Fermi:  128 / 256 / 512 with breakpoints at 3072 and 10240.
    1.x:        64 for m < 2048, else 128.
*/
magma_int_t magma_get_dgetrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 1.x
    if ( arch < 200 ) {
        return ( m < 2048 ) ? 64 : 128;
    }

    // 2.x Fermi and 3.x Kepler agree below 3072.
    if ( m < 3072 ) {
        return 128;
    }

    // Upper breakpoint: 8192 on 3.x Kepler, 10240 on 2.x Fermi.
    const magma_int_t upper = ( arch >= 300 ) ? 8192 : 10240;
    return ( m < upper ) ? 256 : 512;
}
/* ////////////////////////////////////////////////////////////////////////////
   -- Return nb for geqrf based on m
      3.x Kepler: 96 / 128 / 256 / 512 with breakpoints at 4096, 7168, 18432.
      2.x Fermi:  64 / 128 / 256 with breakpoints at 3072 and 8192.
      1.x:        32 / 64 / 128 with breakpoints at 2048 and 4096.
*/
magma_int_t magma_get_sgeqrf_nb( magma_int_t m )
{
    const magma_int_t arch = magma_getdevice_arch();

    // 3.x Kepler
    if ( arch >= 300 ) {
        if ( m < 4096 ) {
            return 96;
        }
        if ( m < 7168 ) {
            return 128;
        }
        return ( m < 18432 ) ? 256 : 512;
    }

    // 2.x Fermi
    if ( arch >= 200 ) {
        if ( m < 3072 ) {
            return 64;
        }
        return ( m < 8192 ) ? 128 : 256;
    }

    // 1.x
    if ( m < 2048 ) {
        return 32;
    }
    return ( m < 4096 ) ? 64 : 128;
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. 
// MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* 
===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! 
okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? 
" " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dlansy */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; double *h_A; double *h_work; double *d_A; double *d_work; magma_int_t N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; bool mkl_warning = false; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_dlansy( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_dlansy( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL ); if ( norm_magma != -1 ) { printf( "expected magmablas_dlansy to return -1 error, but got %f\n", norm_magma ); status = 1; } } } printf( "...return values %s\n\n", (status == 0 ? 
"ok" : "failed") ); } #endif printf(" N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error \n"); printf("=======================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = roundup( N, opts.pad ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(double) / 1e9; TESTING_MALLOC_CPU( h_A, double, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, double, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_dlarnv( &idist, ISEED, &n2, h_A ); //magma_dmake_symmetric( N, h_A, lda ); // make diagonal real -- according to docs, should NOT be necesary //for( int i=0; i < N; ++i ) { // h_A[i + i*lda] = MAGMA_D_MAKE( MAGMA_D_REAL( h_A[i + i*lda] ), 0 ); //} magma_dsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_dlansy( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because it isn't supported on this GPU\n", (int) N, lapacke_norm_const( norm[inorm] )); continue; } if (norm_magma < 0) printf("magmablas_dlansy returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); norm_lapack = lapackf77_dlansy( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; 
cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) printf("lapackf77_dlansy returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); /* ===================================================================== Check the result compared to LAPACK Note: MKL (11.1.0) has bug for uplo=Lower with multiple threads. Try with $MKL_NUM_THREADS = 1. =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; double tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in fabs. #if defined(PRECISION_s) || defined(PRECISION_d) tol2 = 0; #endif } if ( error > tol2 && norm[inorm] == MagmaInfNorm && uplo[iuplo] == MagmaLower ) { mkl_warning = true; } printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (error <= tol2 ? "ok" : "failed") ); status += ! (error <= tol2); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } } // end iuplo, inorm, iter printf( "\n" ); } if ( mkl_warning ) { printf("* Some versions of MKL (e.g., 11.1.0) have a bug in dlansy with uplo=L\n" " and multiple threads. Try again with MKL_NUM_THREADS=1.\n" ); } TESTING_FINALIZE(); return status; }