extern "C" int calc_numerical_range(magmaFloatComplex *M, magma_int_t M_lead_dim, float _from, float _step, magma_int_t _steps, magmaFloatComplex *pts) { magma_int_t idx = 0, rslt = 0; magmaFloatComplex p, scalar; std::complex<float> vtmp; float j; magmaFloatComplex *dA = nullptr; magmaFloatComplex *dAth = NULL, *dAthT = NULL, *dX = NULL, *dY = NULL; float *dE = NULL; //float *hE = NULL; //magma_int_t *ipiv = NULL; magma_int_t lda = M_lead_dim; //magma_int_t ldx = lda; magma_int_t info = 0; magma_int_t nb = 0; //magma_vec_t jobvl; //magma_vec_t jobvr; magmaFloatComplex *work = nullptr; magma_int_t lwork = 0; float *rwork = nullptr; magma_int_t lrwork = 0; magma_int_t *iwork = nullptr; magma_int_t liwork = 0; nb = magma_get_cgehrd_nb( M_lead_dim ); lwork = 2 * max(M_lead_dim + M_lead_dim*nb, 2 * M_lead_dim + M_lead_dim*M_lead_dim); // MagmaVec lrwork = 1 + 5 * M_lead_dim + 2 * M_lead_dim*M_lead_dim; // MagmaVec liwork = (3 + 5 * M_lead_dim); // MagmaVec magma_imalloc_cpu(&iwork, liwork); magma_smalloc_cpu(&rwork, lrwork); magma_cmalloc_pinned(&work, lwork); magma_cmalloc_pinned(&dA, lda*M_lead_dim); magma_cmalloc_pinned(&dAth, lda*M_lead_dim); magma_cmalloc_pinned(&dAthT, lda*M_lead_dim); magma_smalloc_pinned(&dE, M_lead_dim); //magma_smalloc_cpu(&hE, M_lead_dim); magma_cmalloc_pinned(&dX, M_lead_dim); magma_cmalloc_pinned(&dY, M_lead_dim); magma_csetmatrix(M_lead_dim, M_lead_dim, M, lda, dA, M_lead_dim, queue); // th=[0:resolution:2*pi] j = _from; for (idx = 0; idx < _steps; idx++) { //scalar = exp( 1im * -j); vtmp.real( 0.0f ); vtmp.imag( -j ); //vtmp = _FCbuild(0.0f, -j); //printf("vtmp = %f + i%f\n", vtmp._Val[0], vtmp._Val[1]); vtmp = exp(vtmp); scalar.x = vtmp.real(); scalar.y = vtmp.imag(); //printf("scalar = %f + i%f\n", scalar.x, scalar.y); magma_ccopy(lda * M_lead_dim, dA, 1, dAth, 1, queue); // Ath = exp(1im * -j) * As magma_cscal(lda * M_lead_dim, scalar, dAth, 1, queue); //magma_cprint_gpu(N, N, dA, lda); //magma_cprint_gpu(N, N, dAth, lda); // AthT = (Ath + Ath') magmablas_ctranspose_conj(M_lead_dim, M_lead_dim, dAth, M_lead_dim, dAthT, M_lead_dim, queue); magmablas_cgeadd(M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dAth, M_lead_dim, dAthT, M_lead_dim, queue); // AthT = AthT / 2 magma_cscal(lda*M_lead_dim, MAGMA_C_MAKE(0.5f, 0.0f), dAthT, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); // e, r = eig(AthT) rslt = magma_cheevd(MagmaVec, MagmaLower, M_lead_dim, dAthT, lda, dE, work, lwork, rwork, lrwork, iwork, liwork, &info); magma_sync_wtime(queue); //printf("magma_cheevd info=%d\n", info); //magma_cprint_gpu(M_lead_dim, M_lead_dim, dAthT, lda); //magma_sprint_gpu(M_lead_dim, 1, dE, M_lead_dim); //magma_sgetvector(M_lead_dim, dE, 1, hE, 1, queue); //printf("%f %f\n", hE[0], hE[1]); // p = r[:,s]' * A * r[:,s] // r = r[:,s] magma_ccopy( M_lead_dim, dAthT + (M_lead_dim*(M_lead_dim-1)), 1, // dAthT + (N), where (N) is a column offset dX, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dX, M_lead_dim); // pp = A * r[:,s] magma_cgemv(MagmaNoTrans, M_lead_dim, M_lead_dim, MAGMA_C_MAKE(1.0f, 0.0f), dA, lda, dX, 1, MAGMA_C_MAKE(0.0f, 0.0f), dY, 1, queue); magma_sync_wtime(queue); //magma_cprint_gpu(M_lead_dim, 1, dY, M_lead_dim); // p = r' * pp p = magma_cdotc(M_lead_dim, dX, 1, dY, 1, queue); magma_sync_wtime(queue); pts[idx] = p; //printf("p = %f %fi\n", p.x, p.y); j += _step; } // end of for (idx = 0; idx < _steps; idx++) magma_free_pinned(dY); magma_free_pinned(dX); //magma_free_cpu(hE); magma_free_pinned(dE); magma_free_pinned(dAthT); magma_free_pinned(dAth); magma_free_pinned(dA); magma_free_pinned(work); magma_free_cpu(rwork); magma_free_cpu(iwork); //magma_free_cpu(w); //magma_free_cpu(A); return rslt; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing ctranspose Code is very similar to testing_csymmetrize.cpp */ int main( int argc, char** argv) { TESTING_INIT(); // OpenCL use: cl_mem , offset (two arguments); // else use: pointer + offset (one argument). #ifdef HAVE_clBLAS #define d_A(i_, j_) d_A, ((i_) + (j_)*ldda) #define d_B(i_, j_) d_B, ((i_) + (j_)*lddb) #else #define d_A(i_, j_) (d_A + (i_) + (j_)*ldda) #define d_B(i_, j_) (d_B + (i_) + (j_)*lddb) #endif real_Double_t gbytes, gpu_perf, gpu_time, gpu_perf2=0, gpu_time2=0, cpu_perf, cpu_time; float error, error2, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B, *h_R; magmaFloatComplex_ptr d_A, d_B; magma_int_t M, N, size, lda, ldda, ldb, lddb; magma_int_t ione = 1; magma_int_t status = 0; magma_opts opts; opts.parse_opts( argc, argv ); #ifdef COMPLEX magma_int_t ntrans = 2; magma_trans_t trans[] = { Magma_ConjTrans, MagmaTrans }; #else magma_int_t ntrans = 1; magma_trans_t trans[] = { MagmaTrans }; #endif printf("%% Inplace transpose requires M == N.\n"); printf("%% Trans M N CPU GByte/s (ms) GPU GByte/s (ms) check Inplace GB/s (ms) check\n"); printf("%%=========================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int itran = 0; itran < ntrans; ++itran ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default ldb = N; lddb = magma_roundup( N, opts.align ); // multiple of 32 by default // load entire matrix, save entire matrix gbytes = sizeof(magmaFloatComplex) * 2.*M*N / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda*N ); // input: M x N TESTING_MALLOC_CPU( h_B, magmaFloatComplex, ldb*M ); // output: N x M TESTING_MALLOC_CPU( h_R, magmaFloatComplex, ldb*M ); // output: N x M TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); // input: M x N TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*M ); // output: N x M /* Initialize the matrix */ for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_A[i + j*lda] = MAGMA_C_MAKE( i + j/10000., j ); } } for( int j = 0; j < M; ++j ) { for( int i = 0; i < N; ++i ) { h_B[i + j*ldb] = MAGMA_C_MAKE( i + j/10000., j ); } } magma_csetmatrix( N, M, h_B, ldb, d_B(0,0), lddb, opts.queue ); /* ===================================================================== Performs operation using naive out-of-place algorithm (LAPACK doesn't implement transpose) =================================================================== */ cpu_time = magma_wtime(); //for( int j = 1; j < N-1; ++j ) { // inset by 1 row & col // for( int i = 1; i < M-1; ++i ) { // inset by 1 row & col if ( trans[itran] == MagmaTrans ) { for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_B[j + i*ldb] = h_A[i + j*lda]; } } } else { for( int j = 0; j < N; ++j ) { for( int i = 0; i < M; ++i ) { h_B[j + i*ldb] = conj( h_A[i + j*lda] ); } } } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ==================================================================== Performs operation using MAGMA, out-of-place =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A(0,0), ldda, opts.queue ); magma_csetmatrix( N, M, h_B, ldb, d_B(0,0), lddb, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); if ( trans[itran] == MagmaTrans ) { //magmablas_ctranspose( M-2, N-2, d_A(1,1), ldda, d_B(1,1), lddb, opts.queue ); // inset by 1 row & col magmablas_ctranspose( M, N, d_A(0,0), ldda, d_B(0,0), lddb, opts.queue ); } #ifdef HAVE_CUBLAS else { //magmablas_ctranspose_conj( M-2, N-2, d_A(1,1), ldda, d_B(1,1), lddb, opts.queue ); // inset by 1 row & col magmablas_ctranspose_conj( M, N, d_A(0,0), ldda, d_B(0,0), lddb, opts.queue ); } #endif gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ==================================================================== Performs operation using MAGMA, in-place =================================================================== */ if ( M == N ) { magma_csetmatrix( M, N, h_A, lda, d_A(0,0), ldda, opts.queue ); gpu_time2 = magma_sync_wtime( opts.queue ); if ( trans[itran] == MagmaTrans ) { //magmablas_ctranspose_inplace( N-2, d_A(1,1), ldda, opts.queue ); // inset by 1 row & col magmablas_ctranspose_inplace( N, d_A(0,0), ldda, opts.queue ); } #ifdef HAVE_CUBLAS else { //magmablas_ctranspose_conj_inplace( N-2, d_A(1,1), ldda, opts.queue ); // inset by 1 row & col magmablas_ctranspose_conj_inplace( N, d_A(0,0), ldda, opts.queue ); } #endif gpu_time2 = magma_sync_wtime( opts.queue ) - gpu_time2; gpu_perf2 = gbytes / gpu_time2; } /* ===================================================================== Check the result =================================================================== */ // check out-of-place transpose (d_B) size = ldb*M; magma_cgetmatrix( N, M, d_B(0,0), lddb, h_R, ldb, opts.queue ); blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error = lapackf77_clange("f", &N, &M, h_R, &ldb, work ); if ( M == N ) { // also check in-place tranpose (d_A) magma_cgetmatrix( N, M, d_A(0,0), ldda, h_R, ldb, opts.queue ); blasf77_caxpy( &size, &c_neg_one, h_B, &ione, h_R, &ione ); error2 = lapackf77_clange("f", &N, &M, h_R, &ldb, work ); printf("%5c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %6s %7.2f (%7.2f) %s\n", lapacke_trans_const( trans[itran] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed"), gpu_perf2, gpu_time2, (error2 == 0. ? "ok" : "failed") ); status += ! (error == 0. && error2 == 0.); } else { printf("%5c %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %6s --- ( --- )\n", lapacke_trans_const( trans[itran] ), (int) M, (int) N, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (error == 0. ? "ok" : "failed") ); status += ! (error == 0.); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_R ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } } opts.cleanup(); TESTING_FINALIZE(); return status; }