/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dpotrf_mgpu
*/
int main( int argc, char** argv )

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double           error, work[1];
    double c_neg_one = MAGMA_D_NEG_ONE;
    double *h_A, *h_R;
    double *d_lA[ MagmaMaxGPUs ];
    magma_int_t N, n2, lda, ldda, max_size, ngpu;
    magma_int_t info, nb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t  status = 0;
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    opts.lapack |= opts.check;  // check (-c) implies lapack (-l)
    double tol = opts.tolerance * lapackf77_dlamch("E");
    printf("ngpu = %d, uplo = %s\n", (int) opts.ngpu, lapack_uplo_const(opts.uplo) );
    printf("    N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            n2     = lda*N;
            nb     = magma_get_dpotrf_nb( N );
            gflops = FLOPS_DPOTRF( N ) / 1e9;
            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, int((N+nb-1)/nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            // Allocate host memory for the matrix
            TESTING_MALLOC_CPU( h_A, double, n2 );
            TESTING_MALLOC_PIN( h_R, double, n2 );
            // Allocate device memory
            // matrix is distributed by block-rows or block-columns
            // this is maximum size that any GPU stores;
            // size is rounded up to full blocks in both rows and columns
            max_size = nb*(1+N/(nb*ngpu)) * nb*((N+nb-1)/nb);
            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], double, max_size );
            /* Initialize the matrix */
            lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
            magma_dmake_hpd( N, h_A, lda );
            lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda );
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                cpu_time = magma_wtime();
                lapackf77_dpotrf( lapack_uplo_const(opts.uplo), &N, h_A, &lda, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dpotrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            if ( opts.uplo == MagmaUpper ) {
                ldda = ((N+nb-1)/nb)*nb;
                magma_dsetmatrix_1D_col_bcyclic( N, N, h_R, lda, d_lA, ldda, ngpu, nb );
            } else {
                ldda = (1+N/(nb*ngpu))*nb;
                magma_dsetmatrix_1D_row_bcyclic( N, N, h_R, lda, d_lA, ldda, ngpu, nb );
            gpu_time = magma_wtime();
            magma_dpotrf_mgpu( ngpu, opts.uplo, N, d_lA, ldda, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_dpotrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            if ( opts.uplo == MagmaUpper ) {
                magma_dgetmatrix_1D_col_bcyclic( N, N, d_lA, ldda, h_R, lda, ngpu, nb );
            } else {
                magma_dgetmatrix_1D_row_bcyclic( N, N, d_lA, ldda, h_R, lda, ngpu, nb );
            /* =====================================================================
               Check the result compared to LAPACK
               =================================================================== */
            for( int dev=0; dev < ngpu; dev++ ){
                magma_setdevice( dev );
            if ( opts.lapack ) {
                error = lapackf77_dlange("f", &N, &N, h_A, &lda, work );
                blasf77_daxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione );
                error = lapackf77_dlange("f", &N, &N, h_R, &lda, work ) / error;
                printf("%5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e   %s\n",
                       (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
                       error, (error < tol ? "ok" : "failed") );
                status += ! (error < tol);
            else {
                printf("%5d     ---   (  ---  )   %7.2f (%7.2f)     ---\n",
                       (int) N, gpu_perf, gpu_time );
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_PIN( h_R );
            for( int dev=0; dev < ngpu; dev++ ){
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            fflush( stdout );
        if ( opts.niter > 1 ) {
            printf( "\n" );

    return status;
/*
   Second test driver for magma_dpotrf_mgpu in this file. It reads -N, -NGPU
   and -UPLO directly from argv and calls the queue-based signature
   magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info ).

   NOTE(review): as it stands this text does not compile. The function's
   braces, several loop headers (indented bodies use i and j with no visible
   `for`), some `else` keywords and the tails of a few magma_dsetmatrix
   argument lists are not present. The comments below describe only what the
   visible lines do; recover the missing structure from the original source
   before building.
   NOTE(review): this is the second definition of `main` in the file.
*/
int main( int argc, char** argv)
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    double *h_A, *h_R;
    magmaDouble_ptr d_lA[MagmaMaxGPUs];
    magma_int_t N = 0, n2, lda, ldda;
    /* default problem sizes; -N overwrites entries 0 and 9 (see below) */
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 };
    magma_int_t i, j, k, info;
    double mz_one = MAGMA_D_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
    /* num_gpus0: requested GPU count (-NGPU); num_gpus: count used for the
       current size; flag: set to 1 once a valid -N has been given */
    magma_int_t num_gpus0 = 1, num_gpus, flag = 0;
    int nb, mb, n_local, nk;

    magma_uplo_t uplo = MagmaLower;

    /* Parse the command line. Each option consumes the following argv entry. */
    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0){
                N = atoi(argv[++i]);
                if (N>0) {
                    size[0] = size[9] = N;
                    flag = 1;
                }else exit(1);
            if(strcmp("-NGPU", argv[i])==0)
                num_gpus0 = atoi(argv[++i]);
            if(strcmp("-UPLO", argv[i])==0){
                if(strcmp("L", argv[++i])==0){
                    uplo = MagmaLower;
                    /* NOTE(review): an `else` before the next assignment appears to be missing */
                    uplo = MagmaUpper;
    /* by indentation this `else` pairs with `if (argc != 1)`: usage text when no arguments are given */
    else {
        printf("\nUsage: \n");
        printf("  testing_dpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0);

    /* looking for max. ldda */
    /* In this pass ldda holds the largest per-GPU buffer size in elements
       (it is passed to TESTING_MALLOC_DEV below), and n2 the largest N*N.
       NOTE(review): the loop header over i is not visible here. */
    ldda = 0;
    n2 = 0;
        N = size[i];
        nb = magma_get_dpotrf_nb(N);
        mb = nb;
        /* use at most ceil(N/nb) GPUs */
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
            /* NOTE(review): an `else` before the next assignment appears to be missing */
            num_gpus = num_gpus0;
        /* (N+mb-1)/mb is N/mb rounded up */
        n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb);
        if(n_local > ldda) ldda = n_local;
        if(n2 < N*N) n2 = N*N;
        /* with -N given only size[0] is considered */
        if(flag != 0) break;

     /* Allocate host memory for the matrix */
    /* both buffers use the largest n2 found above */
    TESTING_MALLOC_PIN( h_A, double, n2 );
    TESTING_MALLOC_PIN( h_R, double, n2 );

    /* Initialize */
    /* two queues per device, stored at indices 2*i and 2*i+1 */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    //magma_queue_t  queues[MagmaMaxGPUs];
    magma_device_t devices[ MagmaMaxGPUs ];
    magma_int_t num = 0;
    magma_int_t err;
    err = magma_getdevices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        /* NOTE(review): the end of this error branch and the loop header over
           i for the queue creation below are not visible */
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );

    /* NOTE(review): ldda is magma_int_t but is printed with %d and no cast -- confirm magma_int_t is int here */
    printf("each buffer size: %d\n", ldda);
    /* allocate local matrix on Buffers */
    for(i=0; i<num_gpus0; i++){
        TESTING_MALLOC_DEV( d_lA[i], double, ldda );

    printf("Using GPUs: %d\n", num_gpus0);
    if(uplo == MagmaUpper){
        printf("\n  testing_dpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0);
        /* NOTE(review): an `else` before the next printf appears to be missing */
        printf("\n  testing_dpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0);
            printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    /* Main loop: one run per entry of size[] */
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        /* N rounded up to a multiple of 32; overwritten again before the device copies below */
        ldda = ((N+31)/32)*32;
        /* FLOPS is not defined in the visible text */
        gflops = FLOPS( (double)N ) * 1e-9;
        /* Initialize the matrix */
        lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
        /* Symmetrize and increase the diagonal */
        /* the loop-local i and j shadow the function-level i and j;
           h_A(i,j) is presumably an indexing macro defined elsewhere -- confirm */
        for( int i = 0; i < N; ++i ) {
            h_A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL(h_A(i,i)) + N, 0 );
            for( int j = 0; j < i; ++j ) {
                h_A(i, j) = MAGMA_D_CNJG( h_A(j,i) );
        lapackf77_dlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda );

        /* Warm up to measure the performance */
        /* an untimed distribute + factorize, repeated and timed further below */
        nb = magma_get_dpotrf_nb(N);
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
            printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus);
            /* NOTE(review): an `else` before the next assignment appears to be missing */
            num_gpus = num_gpus0;
        /* distribute matrix to gpus */
        /* NOTE(review): mb is only assigned in the sizing pass above, while nb
           is recomputed for the current N; the per-block loop header over j
           and the tails of the magma_dsetmatrix calls are not visible */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,

        /* warm-up call; info is not checked after this call */
        magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* distribute matrix to gpus */
        /* h_A is sent again, presumably because the warm-up factorization
           overwrote the device copies -- confirm */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
        /* only the factorization call is timed */
        gpu_time = magma_wtime();
        magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_dpotrf had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
        /* gather matrix from gpus */
        /* the result is copied into h_R, block j coming from GPU (j/nb)%num_gpus
           on queue 2*k; NOTE(review): the uplo test and the Upper loop header
           are not visible */
            // Upper
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dgetmatrix( N, nk,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda,
                                 &h_R[j*lda], lda, queues[2*k]);
            // Lower
            for(j=0; j<N; j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dgetmatrix( nk, N, 
                            d_lA[k], (j/(nb*num_gpus)*nb), ldda, 
                            &h_R[j], lda, queues[2*k] );

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        /* LAPACK is called on h_A with the same uplo; NOTE(review): the `else`
           between the two calls is not visible */
        cpu_time = magma_wtime();
        if(uplo == MagmaLower){
            lapackf77_dpotrf( MagmaLowerStr, &N, h_A, &lda, &info );
            lapackf77_dpotrf( MagmaUpperStr, &N, h_A, &lda, &info );
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_dpotrf had error %d.\n", info );
        cpu_perf = gflops / cpu_time;
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        matnorm = lapackf77_dlange("f", &N, &N, h_A, &lda, work);
        /* h_R := h_R - h_A */
        blasf77_daxpy(&n2, &mz_one, h_A, &ione, h_R, &ione);
        diffnorm = lapackf77_dlange("f", &N, &N, h_R, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n",
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        /* NOTE(review): the statement guarded by this test is not visible;
           the sizing pass above uses `break` in the same situation */
        if (flag != 0)

    /* clean up */
    /* NOTE(review): the loop header over i is not visible, and no release of
       h_A / h_R appears in the visible text */
        TESTING_FREE_DEV( d_lA[i] );
        magma_queue_destroy( queues[2*i]   );
        magma_queue_destroy( queues[2*i+1] );