/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zhetrd_he2hb
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t gflops, gpu_time, gpu_perf;
    magmaDoubleComplex *h_A, *h_R, *h_work;
    magmaDoubleComplex *tau;
    double *D, *E;
    magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t status = 0;

    // TODO add these options to parse_opts
    magma_int_t NE      = 0;
    magma_int_t distblk = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    magma_int_t WANTZ = (opts.jobz == MagmaVec);
    double tol = opts.tolerance * lapackf77_dlamch("E");
    if (opts.nb == 0)
        opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N);

    if (NE < 1)
        NE = N; //64; //magma_get_zhetrd_he2hb_nb(N);

    nstream = max(3, opts.ngpu+2);
    magma_queue_t streams[MagmaMaxGPUs][20];
    magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs];
    if ((distblk == 0) || (distblk < opts.nb))
        distblk = max(256, opts.nb);
    printf("voici ngpu %d distblk %d NB %d nstream %d\n ",
           (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream);

    for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) {
        magma_setdevice( dev );
        for( int i = 0; i < nstream; ++i ) {
            magma_queue_create( &streams[dev][i] );
        }
    }
    magma_setdevice( 0 );

    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N     = opts.nsize[itest];
            lda   = N;
            ldt   = N;
            ldda  = ((N+31)/32)*32;
            n2    = N*lda;
            /* We suppose the magma NB is bigger than lapack NB */
            lwork = N*opts.nb;
            //gflops = ....?

            /* Allocate host memory for the matrix */
            TESTING_MALLOC_CPU( tau,    magmaDoubleComplex, N-1   );

            TESTING_MALLOC_PIN( h_A,    magmaDoubleComplex, lda*N );
            TESTING_MALLOC_PIN( h_R,    magmaDoubleComplex, lda*N );
            TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork );
            TESTING_MALLOC_PIN( D, double, N );
            TESTING_MALLOC_PIN( E, double, N );

            for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) {
                magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk;
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( da[dev],  magmaDoubleComplex, ldda*mlocal );
                TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb        );
            }
            
            /* ====================================================================
               Initialize the matrix
               =================================================================== */
            lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
            magma_zmake_hermitian( N, h_A, lda );

            lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            /* Copy the matrix to the GPU */
            magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk);
            //magmaDoubleComplex_ptr dabis;
            //TESTING_MALLOC_DEV( dabis,  magmaDoubleComplex, ldda*N );
            //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda);

            for (int count=0; count < 1; ++count) {
                magma_setdevice(0);
                gpu_time = magma_wtime();
                if (opts.version == 30) {
                    magma_zhetrd_he2hb_mgpu_spec(
                        opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork,
                        da, ldda, dT1, opts.nb, opts.ngpu, distblk,
                        streams, nstream, opts.nthread, &info);
                } else {
                    nstream = 3;
                    magma_zhetrd_he2hb_mgpu(
                        opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork,
                        da, ldda, dT1, opts.nb, opts.ngpu, distblk,
                        streams, nstream, opts.nthread, &info);
                }
                // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, dT1[0], &info);
                gpu_time = magma_wtime() - gpu_time;
                printf("  Finish BAND  N %d  NB %d  dist %d  ngpu %d version %d timing= %f\n",
                       N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time);
            }
            magma_setdevice(0);

            for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) {
                magma_setdevice(dev);
                magma_device_sync();
            }
            magma_setdevice(0);
            magmablasSetKernelStream( NULL );

            // todo neither of these is declared in headers
            // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt);
            // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt);
            
            // todo where is this timer started?
            // gpu_time = magma_wtime() - gpu_time;
            
            // todo what are the gflops?
            gpu_perf = gflops / gpu_time;
            
            if (info != 0)
                printf("magma_zhetrd_he2hb returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Print performance and error.
               =================================================================== */
#if defined(CHECKEIG)
#if defined(PRECISION_z) || defined(PRECISION_d)
            if ( opts.check ) {
                printf("  Total N %5d  flops %6.2f  timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time );
                char JOBZ;
                if (WANTZ == 0)
                    JOBZ = 'N';
                else
                    JOBZ = 'V';
                double nrmI=0.0, nrm1=0.0, nrm2=0.0;
                int    lwork2 = 256*N;
                magmaDoubleComplex *work2, *AINIT;
                double *rwork2, *D2;
                // TODO free this memory !
                magma_zmalloc_cpu( &work2, lwork2 );
                magma_dmalloc_cpu( &rwork2, N );
                magma_dmalloc_cpu( &D2, N );
                magma_zmalloc_cpu( &AINIT, N*lda );
                memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex));
                
                /* =====================================================================
                   Performs operation using LAPACK
                   =================================================================== */
                cpu_time = magma_wtime();
                int nt = min(12, opts.nthread);

                #if defined(USEMKL)
                mkl_set_num_threads(nt);
                #endif
                #if defined(USEACML)
                omp_set_num_threads(nt);
                #endif

                #if defined(PRECISION_z) || defined (PRECISION_c)
                lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info );
                #else
                lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info );
                #endif
                ///* call eigensolver for our resulting tridiag [D E] and for Q */
                //dstedc_withZ('V', N, D, E, h_R, lda);
                ////dsterf_( &N, D, E, &info);
                
                cpu_time = magma_wtime() - cpu_time;
                printf("  Finish CHECK - EIGEN   timing= %f  threads %d\n", cpu_time, nt);

                /* compare result */
                cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2);

                magmaDoubleComplex *WORKAJETER;
                double *RWORKAJETER, *RESU;
                // TODO free this memory !
                magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N)  );
                magma_dmalloc_cpu( &RWORKAJETER, N  );
                magma_dmalloc_cpu( &RESU, 10 );
                int MATYPE;
                memset(RESU, 0, 10*sizeof(double));

                MATYPE=3;
                double NOTHING=0.0;
                cpu_time = magma_wtime();
                // check results
                zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb,
                             AINIT, &lda, &NOTHING, &NOTHING, D2, D,
                             h_R, &lda, WORKAJETER, RWORKAJETER, RESU );
                cpu_time = magma_wtime() - cpu_time;
                printf("  Finish CHECK - results timing= %f\n", cpu_time);
                #if defined(USEMKL)
                mkl_set_num_threads(1);
                #endif
                #if defined(USEACML)
                omp_set_num_threads(1);
                #endif

                printf("\n");
                printf(" ================================================================================================================\n");
                printf("   ==> INFO voici  threads=%d    N=%d    NB=%d   WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ);
                printf(" ================================================================================================================\n");
                printf("            DSBTRD                : %15s \n", "STATblgv9withQ    ");
                printf(" ================================================================================================================\n");
                if (WANTZ > 0)
                    printf(" | A - U S U' | / ( |A| n ulp )   : %15.3E   \n", RESU[0]);
                if (WANTZ > 0)
                    printf(" | I - U U' | / ( n ulp )         : %15.3E   \n", RESU[1]);
                printf(" | D1 - EVEIGS | / (|D| ulp)      : %15.3E   \n",  RESU[2]);
                printf(" max | D1 - EVEIGS |              : %15.3E   \n",  RESU[6]);
                printf(" ================================================================================================================\n\n\n");

                printf(" ****************************************************************************************************************\n");
                printf(" * Hello here are the norm  Infinite (max)=%8.2e  norm one (sum)=%8.2e   norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2);
                printf(" ****************************************************************************************************************\n\n");
            }
#endif  // PRECISION_z || PRECISION_d
#endif  // CHECKEIG

            printf("  Total N %5d  flops %6.2f        timing %6.2f seconds\n", (int) N, 0.0, gpu_time );
            printf("============================================================================\n\n\n");

            TESTING_FREE_CPU( tau    );

            TESTING_FREE_PIN( h_A    );
            TESTING_FREE_PIN( h_R    );
            TESTING_FREE_PIN( h_work );
            TESTING_FREE_PIN( D      );
            TESTING_FREE_PIN( E      );

            for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( da[dev]  );
                TESTING_FREE_DEV( dT1[dev] );
            }
            magma_setdevice( 0 );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) {
        for( int i = 0; i < nstream; ++i ) {
            magma_queue_destroy( streams[dev][i] );
        }
    }

    TESTING_FINALIZE();
    return status;
}
Example #2
0
extern "C" magma_int_t
magma_dlobpcg( magma_d_sparse_matrix A, magma_d_solver_par *solver_par ){


#define  residualNorms(i,iter)  ( residualNorms + (i) + (iter)*n )
#define magmablas_swap(x, y)    { pointer = x; x = y; y = pointer; }
#define hresidualNorms(i,iter)  (hresidualNorms + (i) + (iter)*n )

#define gramA(    m, n)   (gramA     + (m) + (n)*ldgram)
#define gramB(    m, n)   (gramB     + (m) + (n)*ldgram)
#define gevectors(m, n)   (gevectors + (m) + (n)*ldgram) 
#define h_gramB(  m, n)   (h_gramB   + (m) + (n)*ldgram)

#define magma_d_bspmv_tuned(m, n, alpha, A, X, beta, AX)       {        \
            magmablas_dtranspose( m, n, X, m, blockW, n );        	\
            magma_d_vector x, ax;                                       \
            x.memory_location = Magma_DEV;  x.num_rows = m*n;  x.nnz = m*n;  x.val = blockW; \
            ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX;     \
            magma_d_spmv(alpha, A, x, beta, ax );                           \
            magmablas_dtranspose( n, m, blockW, n, X, m );            		\
}




//**************************************************************

    // Memory allocation for the eigenvectors, eigenvalues, and workspace
    solver_par->solver = Magma_LOBPCG;
    magma_int_t m = A.num_rows;
    magma_int_t n =(solver_par->num_eigenvalues);
    double *blockX = solver_par->eigenvectors;
    double *evalues = solver_par->eigenvalues;


    double *dwork, *hwork;
    double *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW;
    double *gramA, *gramB, *gramM;
    double *gevectors, *h_gramB;

    double *pointer, *origX = blockX;
    double *eval_gpu;

    magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n),
                                            1 + 6*3*n + 2* 3*n* 3*n);

    magma_dmalloc_pinned( &hwork   ,        lwork );
    magma_dmalloc(        &blockAX   ,        m*n );
    magma_dmalloc(        &blockAR   ,        m*n );
    magma_dmalloc(        &blockAP   ,        m*n );
    magma_dmalloc(        &blockR    ,        m*n );
    magma_dmalloc(        &blockP    ,        m*n );
    magma_dmalloc(        &blockW    ,        m*n );
    magma_dmalloc(        &dwork     ,        m*n );            
    magma_dmalloc(        &eval_gpu  ,        3*n );




//**********************************************************+

    magma_int_t verbosity = 1;
    magma_int_t *iwork, liwork = 15*n+9;

    // === Set solver parameters ===
    double residualTolerance  = solver_par->epsilon;
    magma_int_t maxIterations = solver_par->maxiter;

    // === Set some constants & defaults ===
    double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO;

    double *residualNorms, *condestGhistory, condestG;
    double *gevalues;
    magma_int_t *activeMask;

    // === Check some parameters for possible quick exit ===
    solver_par->info = 0;
    if (m < 2)
        solver_par->info = -1;
    else if (n > m)
        solver_par->info = -2;

    if (solver_par->info != 0) {
        magma_xerbla( __func__, -(solver_par->info) );
        return solver_par->info;
    }
    magma_int_t *info = &(solver_par->info); // local info variable;

    // === Allocate GPU memory for the residual norms' history ===
    magma_dmalloc(&residualNorms, (maxIterations+1) * n);
    magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) );

    // === Allocate CPU work space ===
    magma_dmalloc_cpu(&condestGhistory, maxIterations+1);
    magma_dmalloc_cpu(&gevalues, 3 * n);
    magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t));

    double *hW;
    magma_dmalloc_pinned(&hW, n*n);
    magma_dmalloc_pinned(&gevectors, 9*n*n); 
    magma_dmalloc_pinned(&h_gramB  , 9*n*n);

    // === Allocate GPU workspace ===
    magma_dmalloc(&gramM, n * n);
    magma_dmalloc(&gramA, 9 * n * n);
    magma_dmalloc(&gramB, 9 * n * n);

    #if defined(PRECISION_z) || defined(PRECISION_c)
    double *rwork;
    magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n);

    magma_dmalloc_cpu(&rwork, lrwork);
    #endif

    // === Set activemask to one ===
    for(int k =0; k<n; k++)
        iwork[k]=1;
    magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n);

    magma_int_t gramDim, ldgram  = 3*n, ikind = 4;
       
    // === Make the initial vectors orthonormal ===
    magma_dgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info );
    //magma_dorthomgs( m, n, blockX );
    
    magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );

    // === Compute the Gram matrix = (X, AX) & its eigenstates ===
    magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                c_one,  blockX, m, blockAX, m, c_zero, gramM, n);

    magma_dsyevd_gpu( MagmaVec, MagmaUpper,
                      n, gramM, n, evalues, hW, n, hwork, lwork,
                      #if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
                      #endif
                      iwork, liwork, info );

    // === Update  X =  X * evectors ===
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one,  blockX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockX);

    // === Update AX = AX * evectors ===
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one,  blockAX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockAX);

    condestGhistory[1] = 7.82;
    magma_int_t iterationNumber, cBlockSize, restart = 1, iter;

    //Chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync(); tempo1=magma_wtime();
    // === Main LOBPCG loop ============================================================
    for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++)
        { 
            // === compute the residuals (R = Ax - x evalues )
            magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);

/*
            for(int i=0; i<n; i++){
               magma_daxpy(m, MAGMA_D_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1);
            }
  */        
            #if defined(PRECISION_z) || defined(PRECISION_d)
                magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
            #else
                magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
            #endif

            magma_dlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu);

            magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

            // === remove the residuals corresponding to already converged evectors
            magma_dcompact(m, n, blockR, m,
                           residualNorms(0, iterationNumber), residualTolerance, 
                           activeMask, &cBlockSize);
            
            if (cBlockSize == 0)
                break;
        
            // === apply a preconditioner P to the active residulas: R_new = P R_old
            // === for now set P to be identity (no preconditioner => nothing to be done )
            // magmablas_dlacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m);

            /*
            // === make the preconditioned residuals orthogonal to X
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m,
                        c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram);
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                        c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m);
            */

            // === make the active preconditioned residuals orthonormal
            magma_dgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info );
            //magma_dorthomgs( m, cBlockSize, blockR );
            
            // === compute AR
            magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR );
 
            if (!restart) {
                // === compact P & AP as well
                magma_dcompactActive(m, n, blockP,  m, activeMask);
                magma_dcompactActive(m, n, blockAP, m, activeMask);
          
                /*
                // === make P orthogonal to X ?
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m,
                            c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram);
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                            c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m);

                // === make P orthogonal to R ?
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                            c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram);
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize,
                            c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m);
                */

                // === Make P orthonormal & properly change AP (without multiplication by A)
                magma_dgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info );
                //magma_dorthomgs( m, cBlockSize, blockP );

                //magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP );
                magma_dsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize);


//                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 
  //                           m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);

            // replacement according to Stan
#if defined(PRECISION_s) || defined(PRECISION_d)
            magmablas_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 
                        m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#else
            magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, 
                            cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#endif
            }

            iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize)));
            double condestGmean = 0.;
            for(int i = 0; i<iterationNumber-iter+1; i++)
                condestGmean += condestGhistory[i];
            condestGmean = condestGmean / (iterationNumber-iter+1);

            if (restart)
                gramDim = n+cBlockSize;
            else
                gramDim = n+2*cBlockSize;

            /* --- The Raileight-Ritz method for [X R P] -----------------------
               [ X R P ]'  [AX  AR  AP] y = evalues [ X R P ]' [ X R P ], i.e.,
       
                      GramA                                 GramB
                / X'AX  X'AR  X'AP \                 / X'X  X'R  X'P \
               |  R'AX  R'AR  R'AP  | y   = evalues |  R'X  R'R  R'P  |
                \ P'AX  P'AR  P'AP /                 \ P'X  P'R  P'P /       
               -----------------------------------------------------------------   */

            // === assemble GramB; first, set it to I
            magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram);  // identity

            if (!restart) {
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                            c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram);
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                            c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram);
            }
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram);

            // === get GramB from the GPU to the CPU and compute its eigenvalues only
            magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);
            lapackf77_dsyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, 
                            hwork, &lwork,
                            #if defined(PRECISION_z) || defined(PRECISION_c)
                            rwork, 
                            #endif
                            info);

            // === check stability criteria if we need to restart
            condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.;
            if ((condestG/condestGmean>2 && condestG>2) || condestG>8) {
                // Steepest descent restart for stability
                restart=1;
                printf("restart at step #%d\n", (int) iterationNumber);
            }

            // === assemble GramA; first, set it to I
            magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram);  // identity

            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram);
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram);

            if (!restart) {
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, 
                            c_one, blockP, m, blockAX, m, c_zero, 
                            gramA(n+cBlockSize,0), ldgram);
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, 
                            c_one, blockP, m, blockAR, m, c_zero, 
                            gramA(n+cBlockSize,n), ldgram);
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, 
                            c_one, blockP, m, blockAP, m, c_zero, 
                            gramA(n+cBlockSize,n+cBlockSize), ldgram);
            }

            /*
            // === Compute X' AX or just use the eigenvalues below ?
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                        c_one, blockX, m, blockAX, m, c_zero,
                        gramA(0,0), ldgram);
            */

            if (restart==0) {
                magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram);
            }
            else {
                gramDim = n+cBlockSize;
                magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram);
            }

            for(int k=0; k<n; k++)
                *gevectors(k,k) = MAGMA_D_MAKE(evalues[k], 0);

            // === the previous eigensolver destroyed what is in h_gramB => must copy it again
            magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);

            magma_int_t itype = 1;
            lapackf77_dsygvd(&itype, "V", "L", &gramDim, 
                             gevectors, &ldgram, h_gramB, &ldgram,
                             gevalues, hwork, &lwork, 
                             #if defined(PRECISION_z) || defined(PRECISION_c)
                             rwork, &lrwork,
                             #endif
                             iwork, &liwork, info);
 
            for(int k =0; k<n; k++)
                evalues[k] = gevalues[k];
            
            // === copy back the result to gramA on the GPU and use it for the updates
            magma_dsetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram);

            if (restart == 0) {
                // === contribution from P to the new X (in new search direction P)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, 
                            c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m);
                magmablas_swap(dwork, blockP);
 
                // === contribution from R to the new X (in new search direction P)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m);

                // === corresponding contribution from AP to the new AX (in AP)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m);
                magmablas_swap(dwork, blockAP);

                // === corresponding contribution from AR to the new AX (in AP)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m);
            }
            else {
                // === contribution from R (only) to the new X
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m);

                // === corresponding contribution from AR (only) to the new AX
                magma_dgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize,
                            c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m);
            }
            
            // === contribution from old X to the new X + the new search direction P
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                        c_one, blockX, m, gramA, ldgram, c_zero, dwork, m);
            magmablas_swap(dwork, blockX);
            //magma_daxpy(m*n, c_one, blockP, 1, blockX, 1);
            magma_dlobpcg_maxpy( m, n, blockP, blockX );    

            
            // === corresponding contribution from old AX to new AX + AP
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                        c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m);
            magmablas_swap(dwork, blockAX);
            //magma_daxpy(m*n, c_one, blockAP, 1, blockAX, 1);
            magma_dlobpcg_maxpy( m, n, blockAP, blockAX );    

            condestGhistory[iterationNumber+1]=condestG;
            if (verbosity==1) {
                // double res;
                // magma_dgetmatrix(1, 1, 
                //                  (double*)residualNorms(0, iterationNumber), 1, 
                //                  (double*)&res, 1);
                // 
                //  printf("Iteration %4d, CBS %4d, Residual: %10.7f\n",
                //         iterationNumber, cBlockSize, res);
                printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); 
                magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1);
            }

            restart = 0;
        }   // === end for iterationNumber = 1,maxIterations =======================


    // fill solver info
    magma_device_sync(); tempo2=magma_wtime();
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    solver_par->numiter = iterationNumber;
    if( solver_par->numiter < solver_par->maxiter){
        solver_par->info = 0;
    }else if( solver_par->init_res > solver_par->final_res )
        solver_par->info = -2;
    else
        solver_par->info = -1;
    
    // =============================================================================
    // === postprocessing;
    // =============================================================================

    // === compute the real AX and corresponding eigenvalues
    magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );
    magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                c_one,  blockX, m, blockAX, m, c_zero, gramM, n);

    magma_dsyevd_gpu( MagmaVec, MagmaUpper,
                      n, gramM, n, gevalues, dwork, n, hwork, lwork, 
                      #if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
                      #endif
                      iwork, liwork, info );
   
    for(int k =0; k<n; k++)
        evalues[k] = gevalues[k];

    // === update X = X * evectors
    magmablas_swap(blockX, dwork);
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, dwork, m, gramM, n, c_zero, blockX, m);

    // === update AX = AX * evectors to compute the final residual
    magmablas_swap(blockAX, dwork);
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, dwork, m, gramM, n, c_zero, blockAX, m);

    // === compute R = AX - evalues X
    magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);
    for(int i=0; i<n; i++)
        magma_daxpy(m, MAGMA_D_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1);

    // === residualNorms[iterationNumber] = || R ||    
    magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

    // === restore blockX if needed
    if (blockX != origX)
        magmablas_dlacpy( MagmaUpperLower, m, n, blockX, m, origX, m);

    printf("Eigenvalues:\n");
    for(int i =0; i<n; i++)
        printf("%e  ", evalues[i]);
    printf("\n\n");

    printf("Final residuals:\n");
    magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1);
    printf("\n\n");

    //=== Print residual history in a file for plotting ====
    double *hresidualNorms;
    magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n);
    magma_dgetmatrix(n, iterationNumber, 
                     (double*)residualNorms, n, 
                     (double*)hresidualNorms, n);

    printf("Residuals are stored in file residualNorms\n");
    printf("Plot the residuals using: myplot \n");
    
    FILE *residuals_file;
    residuals_file = fopen("residualNorms", "w");
    for(int i =1; i<iterationNumber; i++) {
        for(int j = 0; j<n; j++)
            fprintf(residuals_file, "%f ", *hresidualNorms(j,i));
        fprintf(residuals_file, "\n");
    }
    fclose(residuals_file);
    magma_free_cpu(hresidualNorms);

    // === free work space
    magma_free(     residualNorms   );
    magma_free_cpu( condestGhistory );
    magma_free_cpu( gevalues        );
    magma_free_cpu( iwork           );

    magma_free_pinned( hW           );
    magma_free_pinned( gevectors    );
    magma_free_pinned( h_gramB      );

    magma_free(     gramM           );
    magma_free(     gramA           );
    magma_free(     gramB           );
    magma_free(  activeMask         );

    magma_free(     blockAX    );
    magma_free(     blockAR    );
    magma_free(     blockAP    );
    magma_free(     blockR    );
    magma_free(     blockP    );
    magma_free(     blockW    );
    magma_free(     dwork    );   
    magma_free(     eval_gpu    );    

    magma_free_pinned( hwork    );


    #if defined(PRECISION_z) || defined(PRECISION_c)
    magma_free_cpu( rwork           );
    #endif

    return MAGMA_SUCCESS;
}
Example #3
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zhetrd_he2hb
*/
int main( int argc, char** argv)
{
    TESTING_INIT_MGPU();

    magma_timestr_t       start, end;
    double           eps, flops, gpu_perf, gpu_time;
    magmaDoubleComplex *h_A, *h_R, *h_work;
    magmaDoubleComplex *tau;
    double *D, *E;

    /* Matrix size */
    magma_int_t N = 0, n2, lda, lwork,ldt;
    magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};

    magma_int_t i, info, checkres, once = 0;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    char *uplo = (char *)MagmaLowerStr;

    magma_int_t ngpu    = magma_num_gpus();
    magma_int_t nstream = max(3,ngpu+1);
    magma_int_t WANTZ=0;
    magma_int_t THREADS=1;
    magma_int_t NE = 0;
    magma_int_t NB = 0;
    magma_int_t distblk =0;
    magma_int_t ver =0;

    checkres  = 0; //getenv("MAGMA_TESTINGS_CHECK") != NULL;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0) {
                N = atoi(argv[++i]);
                once = 1;
            }
            else if (strcmp("-NB", argv[i])==0) {
                NB = atoi(argv[++i]);
            }
            else if (strcmp("-D", argv[i])==0) {
                distblk = atoi(argv[++i]);
            }
            else if (strcmp("-threads", argv[i])==0) {
                THREADS = atoi(argv[++i]);
            }
            else if (strcmp("-wantz", argv[i])==0) {
                WANTZ = atoi(argv[++i]);
            }
            else if (strcmp("-NE", argv[i])==0) {
                NE = atoi(argv[++i]);
            }
            else if ( strcmp("-c", argv[i]) == 0 ) {
                checkres = 1;
            }
            else if ( strcmp("-v", argv[i]) == 0 && i+1 < argc ) {
                ver = atoi( argv[++i] );
            }
            else if ( strcmp("-nstream", argv[i]) == 0 && i+1 < argc ) {
                nstream = atoi( argv[++i] );
                magma_assert( nstream > 0 && nstream <= 20,
                        "error: -nstream %s is invalid; must be > 0 and <= 20.\n", argv[i] );
            }
            else if ( strcmp("-ngpu", argv[i]) == 0 && i+1 < argc ) {
                ngpu = atoi( argv[++i] );
                magma_assert( ngpu > 0 || ngpu > MagmaMaxGPUs, "error: -ngpu %s is invalid; must be > 0.\n", argv[i] );
            }
            else if (strcmp("-U", argv[i])==0)
                uplo = (char *)MagmaUpperStr;
            else if (strcmp("-L", argv[i])==0)
                uplo = (char *)MagmaLowerStr;
        }
        if ( N > 0 )
            printf("  testing_zhetrd_he2hb -L|U -N %d -NB %d   -wantz %d   -threads %d    check %d \n\n", N, NB, WANTZ, THREADS, checkres);
        else
        {
            printf("\nUsage: \n");
            printf("  testing_zhetrd_he2hb -L|U -N %d -NB  -wantz -threads \n\n", 1024);
            exit(1);
        }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zhetrd_he2hb -L|U -N %d\n\n", 1024);
        N = size[9];
    }
        
    
    eps = lapackf77_dlamch( "E" );
    lda = N;
    ldt = N;
    n2  = lda * N;
    if(NB<1)
        NB  = 64; //64; //magma_get_zhetrd_he2hb_nb(N);

    if(NE<1)
        NE  = N; //64; //magma_get_zhetrd_he2hb_nb(N);

    /* We suppose the magma NB is bigger than lapack NB */
    lwork = N*NB;

    /* Allocate host memory for the matrix */
    TESTING_MALLOC_CPU( tau,    magmaDoubleComplex, N-1   );
    
    TESTING_MALLOC_PIN( h_A,    magmaDoubleComplex, lda*N );
    TESTING_MALLOC_PIN( h_R,    magmaDoubleComplex, lda*N );
    TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork );
    TESTING_MALLOC_PIN( D, double, N );
    TESTING_MALLOC_PIN( E, double, N );

    nstream = max(3,ngpu+2);
    magma_queue_t streams[MagmaMaxGPUs][20];
    magmaDoubleComplex *da[MagmaMaxGPUs],*dT1[MagmaMaxGPUs];
    magma_int_t ldda = ((N+31)/32)*32;
    if((distblk==0)||(distblk<NB))
        distblk = max(256,NB);
    printf("voici ngpu %d distblk %d NB %d nstream %d\n ",ngpu,distblk,NB,nstream);
    
    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_int_t mlocal = ((N / distblk) / ngpu + 1) * distblk;
        magma_setdevice( dev );
        TESTING_MALLOC_DEV( da[dev],  magmaDoubleComplex, ldda*mlocal );
        TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*NB        );
        for( int i = 0; i < nstream; ++i ) {
            magma_queue_create( &streams[dev][i] );
        }
    }
    magma_setdevice( 0 );


    for(i=0; i<10; i++){
        if ( !once ) {
            N = size[i];
        }
        lda  = N;
        n2   = N*lda;
        /* ====================================================================
           Initialize the matrix
           =================================================================== */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        magma_zmake_hermitian( N, h_A, lda );

        lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );


        
        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* Copy the matrix to the GPU */
        magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, ngpu, distblk);
//magmaDoubleComplex *dabis;
//       TESTING_MALLOC_DEV( dabis,  magmaDoubleComplex, ldda*N );
//       magma_zsetmatrix(N,N,h_R,lda,dabis,ldda);
    
    for (int count=0; count<1;++count){
       magma_setdevice(0);
       start = get_current_time();
       if(ver==30){
           magma_zhetrd_he2hb_mgpu_spec(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info);
       }else{
           nstream =3;
           magma_zhetrd_he2hb_mgpu(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, da, ldda, dT1, NB, ngpu, distblk, streams, nstream, THREADS, &info);
       }
       // magma_zhetrd_he2hb(uplo[0], N, NB, h_R, lda, tau, h_work, lwork, dT1[0], &info);
       end = get_current_time();
       printf("  Finish BAND  N %d  NB %d  dist %d  ngpu %d version %d timing= %f\n", N, NB, distblk, ngpu, ver, GetTimerValue(start,end) / 1000.);
    }
       magma_setdevice(0);

//goto fin;
//return 0;
        for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
            magma_setdevice(dev);
            cudaDeviceSynchronize();
        }
        magma_setdevice(0);
        magmablasSetKernelStream( NULL );

        magma_zhetrd_bhe2trc_v5(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt);
        //  magma_zhetrd_bhe2trc(THREADS, WANTZ, uplo[0], NE, N, NB, h_R, lda, D, E, dT1[0], ldt);
        end = get_current_time();
        if (info != 0)
            printf("magma_zhetrd_he2hb returned error %d: %s.\n",
                   (int) info, magma_strerror( info ));

        gpu_perf = flops / GetTimerValue(start,end);
        gpu_time = GetTimerValue(start,end) / 1000.;
        /* =====================================================================
           Print performance and error.
           =================================================================== */

#if defined(CHECKEIG)
#if defined(PRECISION_z)  || defined(PRECISION_d)
        if ( checkres ) {
            printf("  Total N %5d  flops %6.2f  timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time );
            char JOBZ;
            if(WANTZ==0)
                    JOBZ='N';
            else
                    JOBZ = 'V';
            double nrmI=0.0, nrm1=0.0, nrm2=0.0;
            int    lwork2 = 256*N;
            magmaDoubleComplex *work2     = (magmaDoubleComplex *) malloc (lwork2*sizeof(magmaDoubleComplex));
            double *rwork2     = (double *) malloc (N*sizeof(double));
            double *D2          = (double *) malloc (N*sizeof(double));
            magmaDoubleComplex *AINIT    = (magmaDoubleComplex *) malloc (N*lda*sizeof(magmaDoubleComplex));
            memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex));
            /* compute the eigenvalues using lapack routine to be able to compare to it and used as ref */
            start = get_current_time();
            i= min(12,THREADS);

#if defined(USEMKL)
            mkl_set_num_threads( i );
#endif
#if defined(USEACML)
            omp_set_num_threads(i);
#endif

#if defined(PRECISION_z) || defined (PRECISION_c)
            lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, rwork2, &info );
#else
            lapackf77_dsyev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, &info );
#endif
            ///* call eigensolver for our resulting tridiag [D E] and for Q */
            //dstedc_withZ('V', N, D, E, h_R, lda);
            ////dsterf_( &N, D, E, &info);
            ////
            end = get_current_time();
            printf("  Finish CHECK - EIGEN   timing= %f  threads %d\n", GetTimerValue(start,end) / 1000., i);

            /* compare result */
            cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2);


           magmaDoubleComplex *WORKAJETER;
           double *RWORKAJETER, *RESU;
           WORKAJETER  = (magmaDoubleComplex *) malloc( (2* N * N + N) * sizeof(magmaDoubleComplex) );
           RWORKAJETER = (double *) malloc( N * sizeof(double) );
           RESU        = (double *) malloc(10*sizeof(double));
           int MATYPE;
           memset(RESU,0,10*sizeof(double));

           
           MATYPE=3;
           double NOTHING=0.0;
           start = get_current_time();
           // check results
           zcheck_eig_(&JOBZ, &MATYPE, &N, &NB, AINIT, &lda, &NOTHING, &NOTHING, D2 , D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU );
           end = get_current_time();
           printf("  Finish CHECK - results timing= %f\n", GetTimerValue(start,end) / 1000.);
#if defined(USEMKL)
           mkl_set_num_threads( 1 );
#endif
#if defined(USEACML)
           omp_set_num_threads(1);
#endif

           printf("\n");
           printf(" ================================================================================================================\n");
           printf("   ==> INFO voici  threads=%d    N=%d    NB=%d   WANTZ=%d\n", (int) THREADS, (int) N, (int) NB, (int) WANTZ);
           printf(" ================================================================================================================\n");
           printf("            DSBTRD                : %15s \n", "STATblgv9withQ    ");
           printf(" ================================================================================================================\n");
           if(WANTZ>0)
              printf(" | A - U S U' | / ( |A| n ulp )   : %15.3E   \n",RESU[0]);
           if(WANTZ>0)
              printf(" | I - U U' | / ( n ulp )         : %15.3E   \n", RESU[1]);
           printf(" | D1 - EVEIGS | / (|D| ulp)      : %15.3E   \n",  RESU[2]);
           printf(" max | D1 - EVEIGS |              : %15.3E   \n",  RESU[6]);
           printf(" ================================================================================================================\n\n\n");
           
           printf(" ****************************************************************************************************************\n");
           printf(" * Hello here are the norm  Infinite (max)=%8.2e  norm one (sum)=%8.2e   norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2);
           printf(" ****************************************************************************************************************\n\n");

        }
#endif
#endif

      printf("  Total N %5d  flops %6.2f        timing %6.2f seconds\n", (int) N, 0.0, gpu_time );
      printf("============================================================================\n\n\n");

      if ( once )
          break;
    }

//fin:

    /* Memory clean up */
    TESTING_FREE_CPU( tau    );
    
    TESTING_FREE_PIN( h_A    );
    TESTING_FREE_PIN( h_R    );
    TESTING_FREE_PIN( h_work );
    TESTING_FREE_PIN( D      );
    TESTING_FREE_PIN( E      );

    for( magma_int_t dev = 0; dev < ngpu; ++dev ) {
        magma_setdevice( dev );
        TESTING_FREE_DEV( da[dev]  );
        TESTING_FREE_DEV( dT1[dev] );
        for( int i = 0; i < nstream; ++i ) {
            magma_queue_destroy( streams[dev][i] );
        }
    }
    magma_setdevice( 0 );
    
    /* Shutdown */    
    TESTING_FINALIZE_MGPU();
    return EXIT_SUCCESS;
}