/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("voici ngpu %d distblk %d NB %d nstream %d\n ", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = ((N+31)/32)*32; n2 = N*lda; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex_ptr dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { magma_zhetrd_he2hb_mgpu_spec( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); magma_device_sync(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if (WANTZ == 0) JOBZ = 'N'; else JOBZ = 'V'; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); #if defined(USEMKL) mkl_set_num_threads(nt); #endif #if defined(USEACML) omp_set_num_threads(nt); #endif #if defined(PRECISION_z) || defined (PRECISION_c) lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info ); #else lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info ); #endif ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("============================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_dlobpcg( magma_d_sparse_matrix A, magma_d_solver_par *solver_par ){ #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_d_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_dtranspose( m, n, X, m, blockW, n ); \ magma_d_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_d_spmv(alpha, A, x, beta, ax ); \ magmablas_dtranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); double *blockX = solver_par->eigenvectors; double *evalues = solver_par->eigenvalues; double *dwork, *hwork; double *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; double *gramA, *gramB, *gramM; double *gevectors, *h_gramB; double *pointer, *origX = blockX; double *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_dmalloc_pinned( &hwork , lwork ); magma_dmalloc( &blockAX , m*n ); magma_dmalloc( &blockAR , m*n ); magma_dmalloc( &blockAP , m*n ); magma_dmalloc( &blockR , m*n ); magma_dmalloc( &blockP , m*n ); magma_dmalloc( &blockW , m*n ); magma_dmalloc( &dwork , m*n ); magma_dmalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === double residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO; double *residualNorms, *condestGhistory, condestG; double *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_dmalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_dmalloc_cpu(&condestGhistory, maxIterations+1); magma_dmalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); double *hW; magma_dmalloc_pinned(&hW, n*n); magma_dmalloc_pinned(&gevectors, 9*n*n); magma_dmalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_dmalloc(&gramM, n * n); magma_dmalloc(&gramA, 9 * n * n); magma_dmalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) double *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_dmalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial vectors orthonormal === magma_dgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info ); //magma_dorthomgs( m, n, blockX ); magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); // === Update X = X * evectors === magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockX); // === Update AX = AX * evectors === magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockAX); condestGhistory[1] = 7.82; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); /* for(int i=0; i<n; i++){ magma_daxpy(m, MAGMA_D_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1); } */ #if defined(PRECISION_z) || defined(PRECISION_d) magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #else magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #endif magma_dlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu); magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === remove the residuals corresponding to already converged evectors magma_dcompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residulas: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) // magmablas_dlacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m); /* // === make the preconditioned residuals orthogonal to X magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m); */ // === make the active preconditioned residuals orthonormal magma_dgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info ); //magma_dorthomgs( m, cBlockSize, blockR ); // === compute AR magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR ); if (!restart) { // === compact P & AP as well magma_dcompactActive(m, n, blockP, m, activeMask); magma_dcompactActive(m, n, blockAP, m, activeMask); /* // === make P orthogonal to X ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m); // === make P orthogonal to R ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_dgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info ); //magma_dorthomgs( m, cBlockSize, blockP ); //magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP ); magma_dsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize); // magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, // m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #else magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #endif } iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize))); double condestGmean = 0.; for(int i = 0; i<iterationNumber-iter+1; i++) condestGmean += condestGhistory[i]; condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize; /* --- The Raileight-Ritz method for [X R P] ----------------------- [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e., GramA GramB / X'AX X'AR X'AP \ / X'X X'R X'P \ | R'AX R'AR R'AP | y = evalues | R'X R'R R'P | \ P'AX P'AR P'AP / \ P'X P'R P'P / ----------------------------------------------------------------- */ // === assemble GramB; first, set it to I magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram); // identity if (!restart) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram); } magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); lapackf77_dsyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, #endif info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", (int) iterationNumber); } // === assemble GramA; first, set it to I magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram); // identity magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram); if (!restart) { magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram); magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP, m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_D_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_dsygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_dsetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_dgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_daxpy(m*n, c_one, blockP, 1, blockX, 1); magma_dlobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_daxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_dlobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // double res; // magma_dgetmatrix(1, 1, // (double*)residualNorms(0, iterationNumber), 1, // (double*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_dsyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_daxpy(m, MAGMA_D_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_dlacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== double *hresidualNorms; magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_dgetmatrix(n, iterationNumber, (double*)residualNorms, n, (double*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT_MGPU(); magma_timestr_t start, end; double eps, flops, gpu_perf, gpu_time; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; /* Matrix size */ magma_int_t N = 0, n2, lda, lwork,ldt; magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112}; magma_int_t i, info, checkres, once = 0; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; char *uplo = (char *)MagmaLowerStr; magma_int_t ngpu = magma_num_gpus(); magma_int_t nstream = max(3,ngpu+1); magma_int_t WANTZ=0; magma_int_t THREADS=1; magma_int_t NE = 0; magma_int_t NB = 0; magma_int_t distblk =0; magma_int_t ver =0; checkres = 0; //getenv("MAGMA_TESTINGS_CHECK") != NULL; if (argc != 1){ for(i = 1; i<argc; i++){ if (strcmp("-N", argv[i])==0) { N = atoi(argv[++i]); once = 1; } else if (strcmp("-NB", argv[i])==0) { NB = atoi(argv[++i]); } else if (strcmp("-D", argv[i])==0) { distblk = atoi(argv[++i]); } else if (strcmp("-threads", argv[i])==0) { THREADS = atoi(argv[++i]); } else if (strcmp("-wantz", argv[i])==0) { WANTZ = atoi(argv[++i]); } else if (strcmp("-NE", argv[i])==0) { NE = atoi(argv[++i]); } else if ( strcmp("-c", argv[i]) == 0 ) { checkres = 1; } else if ( strcmp("-v", argv[i]) == 0 && i+1 < argc ) { ver = atoi( argv[++i] ); } else if ( strcmp("-nstream", argv[i]) == 0 && i+1 < argc ) { nstream = atoi( argv[++i] ); magma_assert( nstream > 0 && nstream <= 20, "error: -nstream %s is invalid; must be > 0 and <= 20.\n", argv[i] ); } else if ( strcmp("-ngpu", argv[i]) == 0 && i+1 < argc ) { ngpu = atoi( argv[++i] ); magma_assert( ngpu > 0 || ngpu > MagmaMaxGPUs, "error: -ngpu %s is invalid; must be > 0.\n", argv[i] ); } else if (strcmp("-U", argv[i])==0) uplo = (char *)MagmaUpperStr; else if (strcmp("-L", argv[i])==0) uplo = (char *)MagmaLowerStr; } if ( N > 0 ) printf(" testing_zhetrd_he2hb -L|U -N %d -NB %d -wantz %d -threads %d check %d \n\n", N, NB, WANTZ, THREADS, checkres); else { printf("\nUsage: \n"); printf(" testing_zhetrd_he2hb -L|U -N %d -NB -wantz -threads \n\n", 1024); exit(1); } } else { printf("\nUsage: \n"); printf(" testing_zhetrd_he2hb -L|U -N %d\n\n", 1024); N = size[9]; } eps = lapackf77_dlamch( "E" ); lda = N; ldt = N; n2 = lda * N; if(NB<1) NB = 64; //64; //magma_get_zhetrd_he2hb_nb(N); if(NE<1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); /* We suppose the magma NB is bigger than lapack NB */ lwork = N*NB; /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); nstream = max(3,ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex *da[MagmaMaxGPUs],*dT1[MagmaMaxGPUs]; magma_int_t ldda = ((N+31)/32)*32; if((distblk==0)||(distblk<NB)) distblk = max(256,NB); printf("voici ngpu %d distblk %d NB %d nstream %d\n ",ngpu,distblk,NB,nstream); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*NB ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for(i=0; i<10; i++){ if ( !once ) { N = size[i]; } lda = N; n2 = N*lda; /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, ngpu, distblk); //magmaDoubleComplex *dabis; // TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); // magma_zsetmatrix(N,N,h_R,lda,dabis,ldda); for (int count=0; count<1;++count){ magma_setdevice(0); start = get_current_time(); if(ver==30){ magma_zhetrd_he2hb_mgpu_spec(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info); }else{ nstream =3; magma_zhetrd_he2hb_mgpu(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info); } // magma_zhetrd_he2hb(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, dT1[0], &info); end = get_current_time(); printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, NB, distblk, ngpu, ver, GetTimerValue(start,end) / 1000.); } magma_setdevice(0); //goto fin; //return 0; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice(dev); cudaDeviceSynchronize(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); magma_zhetrd_bhe2trc_v5(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt); end = get_current_time(); if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); gpu_perf = flops / GetTimerValue(start,end); gpu_time = GetTimerValue(start,end) / 1000.; /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( checkres ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); char JOBZ; if(WANTZ==0) JOBZ='N'; else JOBZ = 'V'; double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2 = (magmaDoubleComplex *) malloc (lwork2*sizeof(magmaDoubleComplex)); double *rwork2 = (double *) malloc (N*sizeof(double)); double *D2 = (double *) malloc (N*sizeof(double)); magmaDoubleComplex *AINIT = (magmaDoubleComplex *) malloc (N*lda*sizeof(magmaDoubleComplex)); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */ start = get_current_time(); i= min(12,THREADS); #if defined(USEMKL) mkl_set_num_threads( i ); #endif #if defined(USEACML) omp_set_num_threads(i); #endif #if defined(PRECISION_z) || defined (PRECISION_c) lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info ); #else lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info ); #endif ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); //// end = get_current_time(); printf(" Finish CHECK - EIGEN timing= %f threads %d\n", GetTimerValue(start,end) / 1000., i); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; WORKAJETER = (magmaDoubleComplex *) malloc( (2* N * N + N) * sizeof(magmaDoubleComplex) ); RWORKAJETER = (double *) malloc( N * sizeof(double) ); RESU = (double *) malloc(10*sizeof(double)); int MATYPE; memset(RESU,0,10*sizeof(double)); MATYPE=3; double NOTHING=0.0; start = get_current_time(); // check results zcheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2 , D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); end = get_current_time(); printf(" Finish CHECK - results timing= %f\n", GetTimerValue(start,end) / 1000.); #if defined(USEMKL) mkl_set_num_threads( 1 ); #endif #if defined(USEACML) omp_set_num_threads(1); #endif printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if(WANTZ>0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n",RESU[0]); if(WANTZ>0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif #endif printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("============================================================================\n\n\n"); if ( once ) break; } //fin: /* Memory clean up */ TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } magma_setdevice( 0 ); /* Shutdown */ TESTING_FINALIZE_MGPU(); return EXIT_SUCCESS; }