void BackwardEulerSorUpdater::config( double c, int N ) { Updater::setAlpha(c); // Construct A-matrix LU-decomposition d_A = Eigen::MatrixXd::Zero(N-1,N-1); for (int i=0; i<N-1; i++) { d_A(i,i) = 1+2*c; if (i>=1) d_A(i,i-1) = -c; if (i+1<=N-2) d_A(i,i+1) = -c; } }
void CrankNicolsonSorUpdater::config( double c, int N ) { Updater::setAlpha(c); // Construct A-matrix LU-decomposition d_A = Eigen::MatrixXd::Zero(N-1,N-1); for (int i=0; i<N-1; i++) { d_A(i,i) = 1+c; if (i>=1) d_A(i,i-1) = -0.5*c; if (i+1<=N-2) d_A(i,i+1) = -0.5*c; } // Construct B-matrix d_B = Eigen::MatrixXd::Zero(N-1,N-1); for (int i=0; i<N-1; i++) { d_B(i,i) = 1-c; if (i>=1) d_B(i,i-1) = 0.5*c; if (i+1<=N-2) d_B(i,i+1) = 0.5*c; } }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zlanhe */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; magmaDoubleComplex *h_A; double *h_work; magmaDoubleComplex_ptr d_A; magmaDouble_ptr d_work; magma_int_t i, j, N, n2, lda, ldda; magma_int_t idist = 3; // normal distribution (otherwise max norm is always ~ 1) magma_int_t ISEED[4] = {0,0,0,1}; double error, norm_magma, norm_lapack; magma_int_t status = 0; magma_int_t lapack_nan_fail = 0; magma_int_t lapack_inf_fail = 0; bool mkl_warning = false; magma_opts opts; opts.parse_opts( argc, argv ); double tol = opts.tolerance * lapackf77_dlamch("E"); double tol2; magma_uplo_t uplo[] = { MagmaLower, MagmaUpper }; magma_norm_t norm[] = { MagmaInfNorm, MagmaOneNorm, MagmaMaxNorm, MagmaFrobeniusNorm }; // Double-Complex inf-norm not supported on Tesla (CUDA arch 1.x) #if defined(PRECISION_z) magma_int_t arch = magma_getdevice_arch(); if ( arch < 200 ) { printf("!!!! NOTE: Double-Complex %s and %s norm are not supported\n" "!!!! on CUDA architecture %d; requires arch >= 200.\n" "!!!! It should report \"parameter number 1 had an illegal value\" below.\n\n", MagmaInfNormStr, MagmaOneNormStr, (int) arch ); for( int inorm = 0; inorm < 2; ++inorm ) { for( int iuplo = 0; iuplo < 2; ++iuplo ) { printf( "Testing that magmablas_zlanhe( %s, %s, ... ) returns -1 error...\n", lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] )); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], 1, NULL, 1, NULL, 1 ); if ( norm_magma != -1 ) { printf( "expected magmablas_zlanhe to return -1 error, but got %f\n", norm_magma ); status = 1; } }} printf( "...return values %s\n\n", (status == 0 ? "ok" : "failed") ); } #endif #ifdef MAGMA_WITH_MKL // MKL 11.1 has bug in multi-threaded zlanhe; use single thread to work around. // MKL 11.2 corrects it for inf, one, max norm. 
// MKL 11.2 still segfaults for Frobenius norm, which is not tested here // because MAGMA doesn't implement Frobenius norm yet. MKLVersion mkl_version; mkl_get_version( &mkl_version ); magma_int_t la_threads = magma_get_lapack_numthreads(); bool mkl_single_thread = (mkl_version.MajorVersion <= 11 && mkl_version.MinorVersion < 2); if ( mkl_single_thread ) { printf( "\nNote: using single thread to work around MKL zlanhe bug.\n\n" ); } #endif printf("%% N norm uplo CPU GByte/s (ms) GPU GByte/s (ms) error nan inf\n"); printf("%%=================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int inorm = 0; inorm < 3; ++inorm ) { /* < 4 for Frobenius */ for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; n2 = lda*N; ldda = magma_roundup( N, opts.align ); // read upper or lower triangle gbytes = 0.5*(N+1)*N*sizeof(magmaDoubleComplex) / 1e9; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 ); TESTING_MALLOC_CPU( h_work, double, N ); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N ); TESTING_MALLOC_DEV( d_work, double, N ); /* Initialize the matrix */ lapackf77_zlarnv( &idist, ISEED, &n2, h_A ); magma_zsetmatrix( N, N, h_A, lda, d_A, ldda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_wtime(); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gbytes / gpu_time; if (norm_magma == -1) { printf( "%5d %4c skipped because %s norm isn't supported\n", (int) N, lapacke_norm_const( norm[inorm] ), lapack_norm_const( norm[inorm] )); goto cleanup; } else if (norm_magma < 0) { printf("magmablas_zlanhe returned error %f: %s.\n", norm_magma, magma_strerror( (int) norm_magma )); } /* 
===================================================================== Performs operation using LAPACK =================================================================== */ #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // work around MKL bug in multi-threaded zlanhe magma_set_lapack_numthreads( 1 ); } #endif cpu_time = magma_wtime(); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if (norm_lapack < 0) { printf("lapackf77_zlanhe returned error %f: %s.\n", norm_lapack, magma_strerror( (int) norm_lapack )); } /* ===================================================================== Check the result compared to LAPACK =================================================================== */ error = fabs( norm_magma - norm_lapack ) / norm_lapack; tol2 = tol; if ( norm[inorm] == MagmaMaxNorm ) { // max-norm depends on only one element, so for Real precisions, // MAGMA and LAPACK should exactly agree (tol2 = 0), // while Complex precisions incur roundoff in cuCabs. #ifdef REAL tol2 = 0; #endif } bool okay; okay = (error <= tol2); status += ! okay; mkl_warning |= ! 
okay; /* ==================================================================== Check for NAN and INF propagation =================================================================== */ #define h_A(i_, j_) (h_A + (i_) + (j_)*lda) #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) i = rand() % N; j = rand() % N; magma_int_t tmp; if ( uplo[iuplo] == MagmaLower && i < j ) { tmp = i; i = j; j = tmp; } else if ( uplo[iuplo] == MagmaUpper && i > j ) { tmp = i; i = j; j = tmp; } *h_A(i,j) = MAGMA_Z_NAN; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool nan_okay; nan_okay = isnan(norm_magma); bool la_nan_okay; la_nan_okay = isnan(norm_lapack); lapack_nan_fail += ! la_nan_okay; status += ! nan_okay; *h_A(i,j) = MAGMA_Z_INF; magma_zsetvector( 1, h_A(i,j), 1, d_A(i,j), 1 ); norm_magma = magmablas_zlanhe( norm[inorm], uplo[iuplo], N, d_A, ldda, d_work, N ); norm_lapack = lapackf77_zlanhe( lapack_norm_const( norm[inorm] ), lapack_uplo_const( uplo[iuplo] ), &N, h_A, &lda, h_work ); bool inf_okay; inf_okay = isinf(norm_magma); bool la_inf_okay; la_inf_okay = isinf(norm_lapack); lapack_inf_fail += ! la_inf_okay; status += ! inf_okay; #ifdef MAGMA_WITH_MKL if ( mkl_single_thread ) { // end single thread to work around MKL bug magma_set_lapack_numthreads( la_threads ); } #endif printf("%5d %4c %4c %7.2f (%7.2f) %7.2f (%7.2f) %#9.3g %-6s %6s%1s %6s%1s\n", (int) N, lapacke_norm_const( norm[inorm] ), lapacke_uplo_const( uplo[iuplo] ), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., error, (okay ? "ok" : "failed"), (nan_okay ? "ok" : "failed"), (la_nan_okay ? " " : "*"), (inf_okay ? "ok" : "failed"), (la_inf_okay ? 
" " : "*")); cleanup: TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_work ); fflush( stdout ); } // end iter if ( opts.niter > 1 ) { printf( "\n" ); } }} // end iuplo, inorm printf( "\n" ); } // don't print "failed" here because then run_tests.py thinks MAGMA failed if ( lapack_nan_fail ) { printf( "* Warning: LAPACK did not pass NAN propagation test; upgrade to LAPACK version >= 3.4.2 (Sep. 2012)\n" ); } if ( lapack_inf_fail ) { printf( "* Warning: LAPACK did not pass INF propagation test\n" ); } if ( mkl_warning ) { printf("* MKL (e.g., 11.1) has a bug in zlanhe with multiple threads;\n" " corrected in 11.2 for one, inf, max norms, but still in Frobenius norm.\n" " Try again with MKL_NUM_THREADS=1.\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing slaset_band Code is very similar to testing_slacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); #define h_A(i_,j_) (h_A + (i_) + (j_)*lda) #define d_A(i_,j_) (d_A + (i_) + (j_)*ldda) real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; float c_neg_one = MAGMA_S_NEG_ONE; float *h_A, *h_R; float *d_A; float offdiag = MAGMA_S_MAKE( 1.2000, 6.7000 ); float diag = MAGMA_S_MAKE( 3.1415, 2.7183 ); magma_int_t M, N, nb, cnt, size, lda, ldb, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); nb = (opts.nb == 0 ? 32 : opts.nb); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("K = nb = %d\n", (int) nb ); printf("uplo M N CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("==================================================================\n"); for( int iuplo = 0; iuplo < 2; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { int inset = 0; M = opts.msize[itest] + 2*inset; N = opts.nsize[itest] + 2*inset; lda = M; ldb = lda; ldda = ((M+31)/32)*32; size = lda*N; TESTING_MALLOC_CPU( h_A, float, size ); TESTING_MALLOC_CPU( h_R, float, size ); TESTING_MALLOC_DEV( d_A, float, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_S_MAKE( i + j/10000., j ); } } magma_ssetmatrix( M, N, h_A, lda, d_A, ldda ); /* ===================================================================== Performs operation on CPU Also count number of elements touched. 
=================================================================== */ cpu_time = magma_wtime(); cnt = 0; for( int j=inset; j < N-inset; ++j ) { for( int k=0; k < nb; ++k ) { // set k-th sub- or super-diagonal if ( k == 0 && j < M-inset ) { *h_A(j,j) = diag; cnt += 1; } else if ( uplo[iuplo] == MagmaLower && j+k < M-inset ) { *h_A(j+k,j) = offdiag; cnt += 1; } else if ( uplo[iuplo] == MagmaUpper && j-k >= inset && j-k < M-inset ) { *h_A(j-k,j) = offdiag; cnt += 1; } } } gbytes = cnt / 1e9; cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ==================================================================== Performs operation using MAGMA =================================================================== */ gpu_time = magma_sync_wtime( 0 ); int mm = M - 2*inset; int nn = N - 2*inset; magmablas_slaset_band( uplo[iuplo], mm, nn, nb, offdiag, diag, d_A(inset,inset), ldda ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Check the result =================================================================== */ magma_sgetmatrix( M, N, d_A, ldda, h_R, lda ); //printf( "h_R=" ); magma_sprint( M, N, h_R, lda ); //printf( "h_A=" ); magma_sprint( M, N, h_A, lda ); blasf77_saxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_slange("f", &M, &N, h_R, &lda, work); printf("%4c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapacke_uplo_const( uplo[iuplo] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctranspose Code is very similar to testing_csymmetrize.cpp */ int main( int argc, char** argv) { TESTING_INIT(); // OpenCL use: cl_mem , offset (two arguments); // else use: pointer + offset (one argument). #ifdef HAVE_clBLAS #define d_A(i_, j_) d_A, ((i_) + (j_)*ldda) #define d_B(i_, j_) d_B, ((i_) + (j_)*lddb) #else #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) #define d_B(i_, j_) (d_B + (i_) + (j_)*lddb) #endif real_Double_t gbytes, gpu_perf, gpu_time, gpu_perf2=0, gpu_time2=0, cpu_perf, cpu_time; float error, error2, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B, *h_R; magmaFloatComplex_ptr d_A, d_B; magma_int_t M, N, size, lda, ldda, ldb, lddb; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); #ifdef COMPLEX magma_int_t ntrans = 2; magma_trans_t trans[] = { Magma_ConjTrans, MagmaTrans }; #else magma_int_t ntrans = 1; magma_trans_t trans[] = { MagmaTrans }; #endif printf("%% Inplace transpose requires M == N.\n"); printf("%% Trans M N CPU GByte/s (ms) GPU GByte/s (ms) check Inplace GB/s (ms) check\n"); printf("%%=========================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int itran = 0; itran < ntrans; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default ldb = N; lddb = magma_roundup( N, opts.align ); // multiple of 32 by default // load entire matrix, save entire matrix gbytes = sizeof(magmaFloatComplex) * 2.*M*N / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); // input: M x N TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*M ); // output: N x M TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*M ); // output: N x M TESTING_MALLOC_DEV( d_A, magmaFloatComplex, 
ldda*N ); // input: M x N TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*M ); // output: N x M /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j ); } } for( int j = 0; j < M; ++j ) { for( int i = 0; i < N; ++i ) { h_B[i + j*ldb] = MAGMA_C_MAKE( i + j/10000., j ); } } magma_csetmatrix( N, M, h_B, ldb, d_B(0,0), lddb, opts.queue ); /* ===================================================================== Performs operation using naive out-of-place algorithm (LAPACK doesn't implement transpose) =================================================================== */ cpu_time = magma_wtime(); //for( int j = 1; j < N-1; ++j ) { // inset by 1 row & col // for( int i = 1; i < M-1; ++i ) { // inset by 1 row & col if ( trans[itran] == MagmaTrans ) { for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_B[j + i*ldb] = h_A[i + j*lda]; } } } else { for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_B[j + i*ldb] = conj( h_A[i + j*lda] ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ==================================================================== Performs operation using MAGMA, out-of-place =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A(0,0), ldda, opts.queue ); magma_csetmatrix( N, M, h_B, ldb, d_B(0,0), lddb, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); if ( trans[itran] == MagmaTrans ) { //magmablas_ctranspose( M-2, N-2, d_A(1,1), ldda, d_B(1,1), lddb, opts.queue ); // inset by 1 row & col magmablas_ctranspose( M, N, d_A(0,0), ldda, d_B(0,0), lddb, opts.queue ); } #ifdef HAVE_CUBLAS else { //magmablas_ctranspose_conj( M-2, N-2, d_A(1,1), ldda, d_B(1,1), lddb, opts.queue ); // inset by 1 row & col magmablas_ctranspose_conj( M, N, d_A(0,0), ldda, d_B(0,0), lddb, opts.queue ); } #endif gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / 
gpu_time; /* ==================================================================== Performs operation using MAGMA, in-place =================================================================== */ if ( M == N ) { magma_csetmatrix( M, N, h_A, lda, d_A(0,0), ldda, opts.queue ); gpu_time2 = magma_sync_wtime( opts.queue ); if ( trans[itran] == MagmaTrans ) { //magmablas_ctranspose_inplace( N-2, d_A(1,1), ldda, opts.queue ); // inset by 1 row & col magmablas_ctranspose_inplace( N, d_A(0,0), ldda, opts.queue ); } #ifdef HAVE_CUBLAS else { //magmablas_ctranspose_conj_inplace( N-2, d_A(1,1), ldda, opts.queue ); // inset by 1 row & col magmablas_ctranspose_conj_inplace( N, d_A(0,0), ldda, opts.queue ); } #endif gpu_time2 = magma_sync_wtime( opts.queue ) - gpu_time2; gpu_perf2 = gbytes / gpu_time2; } /* ===================================================================== Check the result =================================================================== */ // check out-of-place transpose (d_B) size = ldb*M; magma_cgetmatrix( N, M, d_B(0,0), lddb, h_R, ldb, opts.queue ); blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_clange("f", &N, &M, h_R, &ldb, work ); if ( M == N ) { // also check in-place tranpose (d_A) magma_cgetmatrix( N, M, d_A(0,0), ldda, h_R, ldb, opts.queue ); blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error2 = lapackf77_clange("f", &N, &M, h_R, &ldb, work ); printf("%5c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %6s %7.2f (%7.2f) %s\n", lapacke_trans_const( trans[itran] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed"), gpu_perf2, gpu_time2, (error2 == 0. ? "ok" : "failed") ); status += ! (error == 0. && error2 == 0.); } else { printf("%5c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %6s --- ( --- )\n", lapacke_trans_const( trans[itran] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! 
(error == 0.); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } } opts.cleanup(); TESTING_FINALIZE(); return status; }
/*
  Convert an angle given in arcminutes, seen at redshift z, to a comoving
  length in Mpc.

  The result is (1+z) * d_A(z) * angle, where the angle is converted from
  arcminutes via PI/180/60 (assumes PI is the constant pi and d_A(z) is a
  distance in Mpc -- both are defined elsewhere; confirm there).
*/
double arcmintoMpc(float z, float arcmin){
    const float one_plus_z = 1+z;

    return one_plus_z*d_A(z)*arcmin*PI/180.0/60.0;
}