/**
 * Prints an m-by-n single-complex matrix referenced by dA.
 *
 * The matrix is copied into a temporary host buffer with magma_cgetmatrix,
 * printed with magma_cprint, and the buffer is released before returning.
 *
 * @param m     Number of rows; must be >= 0.
 * @param n     Number of columns; must be >= 0.
 * @param dA    Matrix to print; passed as the source of magma_cgetmatrix.
 * @param ldda  Leading dimension of dA; must be >= max(1,m).
 *
 * An invalid argument is reported through magma_xerbla (with the 1-based
 * position of the offending argument) and nothing is printed.
 */
void magma_cprint_gpu( magma_int_t m, magma_int_t n, const magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t info = 0;
    if ( m < 0 )
        info = -1;
    else if ( n < 0 )
        info = -2;
    else if ( ldda < max(1,m) )
        info = -4;

    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return;
    }

    // Keep the host leading dimension at least 1 so that m == 0 does not hand
    // a zero leading dimension to the copy and print routines; this mirrors
    // the max(1,m) bound required of ldda above.
    magma_int_t lda = max(1,m);
    magmaFloatComplex* A;

    // NOTE(review): the result of magma_cmalloc_cpu is not checked here; if
    // the allocation can fail, A must not be used -- confirm the intended
    // failure policy for this routine.
    magma_cmalloc_cpu( &A, lda*n );
    magma_cgetmatrix( m, n, dA, ldda, A, lda );
    magma_cprint( m, n, A, lda );
    magma_free_cpu( A );
}
/**
 * Prints an m-by-n single-complex matrix referenced by dA.
 *
 * The matrix is copied into a temporary host buffer with magma_cgetmatrix,
 * printed with magma_cprint, and the buffer is released before returning.
 *
 * If magma_is_devptr( dA ) returns 0, an error message is written to stderr
 * and the program is terminated with exit(1).
 *
 * @param m     Number of rows of the matrix.
 * @param n     Number of columns of the matrix.
 * @param dA    Matrix to print; passed as the source of magma_cgetmatrix.
 * @param ldda  Leading dimension of dA.
 */
void magma_cprint_gpu( magma_int_t m, magma_int_t n, const magmaFloatComplex *dA, magma_int_t ldda )
{
    if ( magma_is_devptr( dA ) == 0 ) {
        fprintf( stderr, "ERROR: cprint_gpu called with host pointer.\n" );
        exit(1);
    }

    // Keep the host leading dimension at least 1 so that an empty matrix
    // (m == 0) does not hand a zero leading dimension to the copy and print
    // routines.
    magma_int_t lda = ( m > 1 ? m : 1 );
    magmaFloatComplex* A;

    // NOTE(review): the result of magma_cmalloc_cpu is not checked here --
    // confirm the intended failure policy for this routine.
    magma_cmalloc_cpu( &A, lda*n );
    magma_cgetmatrix( m, n, dA, ldda, A, lda );
    magma_cprint( m, n, A, lda );
    magma_free_cpu( A );
}
/* //////////////////////////////////////////////////////////////////////////// -- Testing cprint */ int main( int argc, char** argv) { TESTING_INIT(); magmaFloatComplex *hA, *dA; //magma_int_t ione = 1; //magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t M, N, lda, ldda; //size magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = ((M + 31)/32)*32; //size = lda*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( hA, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( dA, magmaFloatComplex, ldda*N ); //lapackf77_clarnv( &ione, ISEED, &size, hA ); for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { hA[i + j*lda] = MAGMA_C_MAKE( i + j*0.01, 0. ); } } magma_csetmatrix( M, N, hA, lda, dA, ldda ); printf( "A=" ); magma_cprint( M, N, hA, lda ); printf( "dA=" ); magma_cprint_gpu( M, N, dA, ldda ); TESTING_FREE_CPU( hA ); TESTING_FREE_DEV( dA ); } } TESTING_FINALIZE(); return status; }
int main( int argc, char** argv) { #define hA(i,j) (hA + (i) + (j)*lda) TESTING_CUDA_INIT(); cuFloatComplex c_zero = MAGMA_C_ZERO; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex *hA, *hR, *dA; //real_Double_t gpu_time, gpu_perf; //int ione = 1; //int ISEED[4] = {0, 0, 0, 1}; int nsize[] = { 32, 64, 96, 256, 100, 200, 512 }; int ntest = sizeof(nsize) / sizeof(int); int n = nsize[ntest-1]; int lda = ((n + 31)/32)*32; int ntile, nb; TESTING_MALLOC ( hA, cuFloatComplex, lda*n ); TESTING_MALLOC ( hR, cuFloatComplex, lda*n ); TESTING_DEVALLOC ( dA, cuFloatComplex, lda*n ); for( int t = 0; t < ntest; ++t ) { n = nsize[t]; lda = ((n + 31)/32)*32; // initialize matrices; entries are (i.j) for A float nf = 100.; for( int j = 0; j < n; ++j ) { // upper for( int i = 0; i < j; ++i ) { *hA(i,j) = MAGMA_C_MAKE( (i + j/nf)/nf, 0. ); } // lower for( int i = j; i < n; ++i ) { *hA(i,j) = MAGMA_C_MAKE( i + j/nf, 0. ); } } printf( "A%d = ", n ); magma_cprint( n, n, hA, lda ); magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize( MagmaLower, n, dA, lda ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d = ", n ); magma_cprint( n, n, hR, lda ); magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize( MagmaUpper, n, dA, lda ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "U%d = ", n ); magma_cprint( n, n, hR, lda ); // ----- //lapackf77_claset( "u", &n, &n, &c_zero, &c_one, hA, &lda ); nb = 64; ntile = n / nb; magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d_%d = ", n, nb ); magma_cprint( n, n, hR, lda ); nb = 32; ntile = n / nb; magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d_%d = ", n, nb ); magma_cprint( n, n, hR, lda ); ntile = (n - nb < 0 ? 
0 : (n - nb) / (2*nb) + 1); magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, 2*nb, nb ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d_%d_2m = ", n, nb ); magma_cprint( n, n, hR, lda ); nb = 25; ntile = n / nb; magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d_%d = ", n, nb ); magma_cprint( n, n, hR, lda ); nb = 25; ntile = (n - nb < 0 ? 0 : (n - nb) / (3*nb) + 1); magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, 3*nb ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d_%d_3n = ", n, nb ); magma_cprint( n, n, hR, lda ); nb = 100; ntile = n / nb; magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaLower, nb, dA, lda, ntile, nb, nb ); magmablas_csymmetrize( MagmaLower, n%nb, &dA[ ntile*nb*(1+lda) ], lda ); // last partial block magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "L%d_%d = ", n, nb ); magma_cprint( n, n, hR, lda ); // ----- nb = 64; ntile = n / nb; magma_csetmatrix( n, n, hA, lda, dA, lda ); magmablas_csymmetrize_tiles( MagmaUpper, nb, dA, lda, ntile, nb, nb ); magma_cgetmatrix( n, n, dA, lda, hR, lda ); printf( "U%d_%d = ", n, nb ); magma_cprint( n, n, hR, lda ); } TESTING_FREE( hA ); TESTING_FREE( hR ); TESTING_DEVFREE( dA ); /* Shutdown */ TESTING_CUDA_FINALIZE(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing claswp */ int main( int argc, char** argv) { /* Initialize */ magma_queue_t queue; magma_device_t device[ MagmaMaxGPUs ]; int num = 0; magma_err_t err; magma_init(); err = magma_get_devices( device, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } err = magma_queue_create( device[0], &queue ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } magmaFloatComplex *h_A1, *h_A2, *h_A3, *h_AT; magmaFloatComplex_ptr d_A1; real_Double_t gpu_time, cpu_time1, cpu_time2; /* Matrix size */ int M=0, N=0, n2, lda, ldat; int size[7] = {1000,2000,3000,4000,5000,6000,7000}; int i, j; int ione = 1; int ISEED[4] = {0,0,0,1}; int *ipiv; int k1, k2, r, c, incx; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) N = atoi(argv[++i]); if (strcmp("-M", argv[i])==0) M = atoi(argv[++i]); } if (M>0 && N>0) printf(" testing_claswp -M %d -N %d\n\n", M, N); else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_claswp -M %d -N %d\n\n", 1024, 1024); M = N = size[6]; } lda = M; n2 = M*N; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( h_A1, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A2, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_A3, magmaFloatComplex, n2 ); TESTING_MALLOC_CPU( h_AT, magmaFloatComplex, n2 ); TESTING_MALLOC_DEV( d_A1, magmaFloatComplex, n2 ); ipiv = (int*)malloc(M * sizeof(int)); if (ipiv == 0) { fprintf (stderr, "!!!! 
host memory allocation error (ipiv)\n"); } printf("\n\n"); printf(" M N CPU_BLAS (sec) CPU_LAPACK (sec) GPU (sec) \n"); printf("=============================================================================\n"); for(i=0; i<7; i++) { if(argc == 1){ M = N = size[i]; } lda = M; ldat = N; n2 = M*N; /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &n2, h_A1 ); lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A1, &lda, h_A2, &lda ); for(r=0;r<M;r++){ for(c=0;c<N;c++){ h_AT[c+r*ldat] = h_A1[r+c*lda]; } } magma_csetmatrix( N, M, h_AT, 0, ldat, d_A1, 0, ldat, queue); for(j=0; j<M; j++) { ipiv[j] = (int)((rand()*1.*M) / (RAND_MAX * 1.)) + 1; } /* * BLAS swap */ /* Column Major */ cpu_time1 = magma_wtime(); for ( j=0; j<M; j++) { if ( j != (ipiv[j]-1)) { blasf77_cswap( &N, h_A1+j, &lda, h_A1+(ipiv[j]-1), &lda); } } cpu_time1 = magma_wtime() - cpu_time1; /* * LAPACK laswp */ cpu_time2 = magma_wtime(); k1 = 1; k2 = M; incx = 1; lapackf77_claswp(&N, h_A2, &lda, &k1, &k2, ipiv, &incx); cpu_time2 = magma_wtime() - cpu_time2; /* * GPU swap */ /* Col swap on transpose matrix*/ gpu_time = magma_wtime(); magma_cpermute_long2(N, d_A1, 0, ldat, ipiv, M, 0, queue); gpu_time = magma_wtime() - gpu_time; /* Check Result */ magma_cgetmatrix( N, M, d_A1, 0, ldat, h_AT, 0, ldat, queue); for(r=0;r<N;r++){ for(c=0;c<M;c++){ h_A3[c+r*lda] = h_AT[r+c*ldat]; } } int check_bl, check_bg, check_lg; check_bl = diffMatrix( h_A1, h_A2, M, N, lda ); check_bg = diffMatrix( h_A1, h_A3, M, N, lda ); check_lg = diffMatrix( h_A2, h_A3, M, N, lda ); printf("%5d %5d %6.2f %6.2f %6.2f %s %s %s\n", M, N, cpu_time1, cpu_time2, gpu_time, (check_bl == 0) ? "SUCCESS" : "FAILED", (check_bg == 0) ? "SUCCESS" : "FAILED", (check_lg == 0) ? 
"SUCCESS" : "FAILED"); if(check_lg !=0){ printf("lapack swap results:\n"); magma_cprint(M, N, h_A1, lda); printf("gpu swap transpose matrix result:\n"); magma_cprint(M, N, h_A3, lda); } if (argc != 1) break; } /* clean up */ TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A1 ); TESTING_FREE_CPU( h_A2 ); TESTING_FREE_CPU( h_A3 ); TESTING_FREE_CPU( h_AT ); TESTING_FREE_DEV( d_A1 ); magma_queue_destroy( queue ); magma_finalize(); }
/* //////////////////////////////////////////////////////////////////////////// -- Testing claset Code is very similar to testing_clacpy.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_R; magmaFloatComplex_ptr d_A; magmaFloatComplex offdiag, diag; magma_int_t M, N, size, lda, ldda; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_uplo_t uplo[] = { MagmaLower, MagmaUpper, MagmaFull }; printf("%% uplo M N offdiag diag CPU GByte/s (ms) GPU GByte/s (ms) check\n"); printf("%%===================================================================================\n"); for( int iuplo = 0; iuplo < 3; ++iuplo ) { for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { for( int ival = 0; ival < 4; ++ival ) { // test combinations of zero & non-zero: // ival offdiag diag // 0 0 0 // 1 0 3.14 // 2 1.23 0 // 3 1.23 3.14 offdiag = MAGMA_C_MAKE( 1.2345, 6.7890 ) * (ival / 2); diag = MAGMA_C_MAKE( 3.1415, 2.7183 ) * (ival % 2); M = opts.msize[itest]; N = opts.nsize[itest]; //M += 2; // space for insets //N += 2; lda = M; ldda = magma_roundup( M, opts.align ); size = lda*N; if ( uplo[iuplo] == MagmaLower ) { // save lower trapezoid (with diagonal) if ( M > N ) { gbytes = sizeof(magmaFloatComplex) * (1.*M*N - 0.5*N*(N-1)) / 1e9; } else { gbytes = sizeof(magmaFloatComplex) * 0.5*M*(M+1) / 1e9; } } else if ( uplo[iuplo] == MagmaUpper ) { // save upper trapezoid (with diagonal) if ( N > M ) { gbytes = sizeof(magmaFloatComplex) * (1.*M*N - 0.5*M*(M-1)) / 1e9; } else { gbytes = sizeof(magmaFloatComplex) * 0.5*N*(N+1) / 1e9; } } else { // save entire matrix gbytes = sizeof(magmaFloatComplex) * 1.*M*N / 1e9; } TESTING_MALLOC_CPU( h_A, magmaFloatComplex, size ); TESTING_MALLOC_CPU( h_R, magmaFloatComplex, size ); TESTING_MALLOC_DEV( d_A, 
magmaFloatComplex, ldda*N ); /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j ); } } /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magmablasSetKernelStream( opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); //magmablas_claset( uplo[iuplo], M-2, N-2, offdiag, diag, d_A+1+ldda, ldda ); // inset by 1 row & col magmablas_claset( uplo[iuplo], M, N, offdiag, diag, d_A, ldda ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); //magma_int_t M2 = M-2; // inset by 1 row & col //magma_int_t N2 = N-2; //lapackf77_claset( lapack_uplo_const( uplo[iuplo] ), &M2, &N2, &offdiag, &diag, h_A+1+lda, &lda ); lapackf77_claset( lapack_uplo_const( uplo[iuplo] ), &M, &N, &offdiag, &diag, h_A, &lda ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; if ( opts.verbose ) { printf( "A= " ); magma_cprint( M, N, h_A, lda ); printf( "dA=" ); magma_cprint_gpu( M, N, d_A, ldda ); } /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_A, ldda, h_R, lda ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione); error = lapackf77_clange("f", &M, &N, h_R, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5s %5d %5d %9.4f %6.4f %7.2f (%7.2f) %7.2f (%7.2f) %s\n", lapack_uplo_const( uplo[iuplo] ), (int) M, (int) N, real(offdiag), real(diag), cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? 
"ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); fflush( stdout ); } } if ( opts.niter > 1 ) { printf( "\n" ); } } printf( "\n" ); } opts.cleanup(); TESTING_FINALIZE(); return status; }