Ejemplos de MAGMA_D_CNJG en C++ (Cpp)

Ejemplo n.º 1

0

Mostrar archivo

Archivo: magma_dutil.cpp Proyecto: kjbartel/magma

void magma_dmake_hpd( magma_int_t N, double* A, magma_int_t lda )
{
    magma_int_t i, j;
    for( i=0; i<N; ++i ) {
        A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL( A(i,i) ) + N, 0. );
        for( j=0; j<i; ++j ) {
            A(j,i) = MAGMA_D_CNJG( A(i,j) );
        }
    }
}

Ejemplo n.º 2

0

Mostrar archivo

Archivo: testing_dpotrf_msub.cpp Proyecto: kjbartel/clmagma

void init_matrix( magma_int_t N, double *h_A, magma_int_t lda )
{
    magma_int_t ione = 1, n2 = N*lda;
    magma_int_t ISEED[4] = {0,0,0,1};
    lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
    /* Symmetrize and increase the diagonal */
    for (magma_int_t i = 0; i < N; ++i) {
        h_A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL(h_A(i,i)) + N, 0 );
        for (magma_int_t j = 0; j < i; ++j)
            h_A(i, j) = MAGMA_D_CNJG( h_A(j, i) );
    }
}

Ejemplo n.º 3

0

Mostrar archivo

Archivo: dbulge_kernel_v2.cpp Proyecto: EmergentOrder/magma

extern "C" void
magma_dtrdtype2cbHLsym_withQ_v2(magma_int_t n, magma_int_t nb,
                                double *A, magma_int_t lda,
                                double *V, magma_int_t ldv,
                                double *TAU,
                                magma_int_t st, magma_int_t ed,
                                magma_int_t sweep, magma_int_t Vblksiz,
                                double *work)
{
    /*
     WORK (workspace) double real array, dimension NB
    */

    magma_int_t ione = 1;
    magma_int_t vpos, taupos;

    double conjtmp;

    double c_one = MAGMA_D_ONE;

    magma_int_t ldx = lda-1;
    magma_int_t len = ed - st + 1;
    magma_int_t lem = min(ed+nb, n) - ed;

    if (lem > 0) {
        magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, st-1, ldv, &vpos, &taupos);
        /* apply remaining right coming from the top block */
        lapackf77_dlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(ed+1, st), &ldx, work);
    }
    if (lem > 1) {
        magma_bulge_findVTAUpos(n, nb, Vblksiz, sweep-1, ed, ldv, &vpos, &taupos);

        /* remove the first column of the created bulge */
        *V(vpos)  = c_one;
        //memcpy(V(vpos+1), A(ed+2, st), (lem-1)*sizeof(double));
        cblas_dcopy(lem-1, A(ed+2, st), ione, V(vpos+1), ione);
        memset(A(ed+2, st),0,(lem-1)*sizeof(double));

        /* Eliminate the col at st */
        lapackf77_dlarfg( &lem, A(ed+1, st), V(vpos+1), &ione, TAU(taupos) );

        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been removed; */
        conjtmp = MAGMA_D_CNJG(*TAU(taupos));
        lapackf77_dlarfx("L", &lem, &len, V(vpos),  &conjtmp, A(ed+1, st+1), &ldx, work);
    }
}

Ejemplo n.º 4

0

Mostrar archivo

Archivo: dbulge_kernel.cpp Proyecto: rdiahelwe/magma

extern "C" void
magma_dtrdtype2cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    double *A, magma_int_t LDA,
    double *V, double *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    magma_int_t    J1, J2, len, lem, LDX;
    //magma_int_t    i, j;
    magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    double conjtmp;
    double Z_ONE  =  MAGMA_D_ONE;
    //double WORK[NB];
    double *WORK;
    magma_dmalloc_cpu( &WORK, NB );


    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    LDX    = LDA-1;
    J1     = ed+1;
    J2     = min(ed+NB,N);
    len    = ed-st+1;
    lem    = J2-J1+1;
    if (lem > 0) {
        /* apply remaining right commming from the top block */
        lapackf77_dlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);
    }
    if (lem > 1) {
        findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid);
        /* remove the first column of the created bulge */
        *V(vpos)  = Z_ONE;
        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(double));
        memset(A(J1+1, st),0,(lem-1)*sizeof(double));
        /* Eliminate the col at st */
        lapackf77_dlarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );
        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st. col st is the col that has been revomved; */
        conjtmp = MAGMA_D_CNJG(*TAU(taupos));
        lapackf77_dlarfx("L", &lem, &len, V(vpos),  &conjtmp, A(J1, st+1), &LDX, WORK);
    }
    magma_free_cpu(WORK);
}

Ejemplo n.º 5

0

Mostrar archivo

Archivo: dbulge_kernel_v2.cpp Proyecto: EmergentOrder/magma

inline static void
magma_dlarfxsym_v2(magma_int_t n,
                double *A, magma_int_t lda,
                double *V, double *TAU,
                double *work)
{
/*
    WORK (workspace) double real array, dimension N
*/

    magma_int_t ione = 1;
    double dtmp;
    double c_zero   =  MAGMA_D_ZERO;
    double c_neg_one=  MAGMA_D_NEG_ONE;
    double c_half   =  MAGMA_D_HALF;

    /* X = AVtau */
    blasf77_dsymv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione);

    /* compute dtmp= X'*V */
#if defined(PRECISION_z) || defined(PRECISION_c)
    dtmp = c_zero;
    for (magma_int_t j = 0; j < n; j++)
        dtmp = dtmp + MAGMA_D_CNJG(work[j]) * V[j];
    //cblas_ddot_sub(n, work, ione, V, ione, &dtmp);
#else
    dtmp = cblas_ddot(n, work, ione, V, ione);
#endif


    /* compute 1/2 X'*V*t = 1/2*dtmp*tau  */
    dtmp = -dtmp * c_half * (*TAU);

    /* compute W=X-1/2VX'Vt = X - dtmp*V */
    blasf77_daxpy(&n, &dtmp, V, &ione, work, &ione);

    /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */
    blasf77_dsyr2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda);
}

Ejemplo n.º 6

0

Mostrar archivo

Archivo: dbulge_kernel.cpp Proyecto: EmergentOrder/magma

extern "C" void
magma_dlarfxsym(magma_int_t N, double *A, magma_int_t LDA, double *V, double *TAU) {
  magma_int_t IONE=1;
  double dtmp;
  double Z_ZERO =  MAGMA_D_ZERO;
  //double Z_ONE  =  MAGMA_D_ONE;
  double Z_MONE =  MAGMA_D_NEG_ONE;
  double Z_HALF =  MAGMA_D_HALF;
  //double WORK[N];
  double *WORK;
  magma_dmalloc_cpu( &WORK, N );

  /* apply left and right on A(st:ed,st:ed)*/
  //magma_dlarfxsym(len,A(st,st),LDX,V(st),TAU(st));
  /* X = AVtau */
  blasf77_dsymv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE);
  /* je calcul dtmp= X'*V */
#if defined(PRECISION_z) || defined(PRECISION_c)
   dtmp = Z_ZERO;
   for (magma_int_t j = 0; j < N; j++)
      dtmp = dtmp + MAGMA_D_CNJG(WORK[j]) * V[j];
   //cblas_ddot_sub(N, WORK, IONE, V, IONE, &dtmp);
#else
  dtmp = cblas_ddot(N, WORK, IONE, V, IONE);
#endif
  /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau  */
  dtmp = -dtmp * Z_HALF * (*TAU);
  /* je calcul W=X-1/2VX'Vt = X - dtmp*V */
  /*
  for (j = 0; j < N; j++)
      WORK[j] = WORK[j] + (dtmp*V[j]); */
  blasf77_daxpy(&N, &dtmp, V, &IONE, WORK, &IONE);
  /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */
  blasf77_dsyr2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA);
  
  magma_free_cpu(WORK);
}

Ejemplo n.º 7

0

Mostrar archivo

Archivo: dsyr2k_mgpu_spec.cpp Proyecto: EmergentOrder/magma

void magmablas_dsyr2k_mgpu_spec(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k,
    double alpha, double *dA[], magma_int_t lda, magma_int_t aoffset,
                           double *dB[], magma_int_t ldb, magma_int_t boffset,
    double beta,           double *dC[], magma_int_t ldc, magma_int_t coffset,
    magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream )
{
    #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) )
    #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) )
    #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc)
    
    /* Check arguments */
    magma_int_t info = 0;
    if ( uplo != MagmaLower ) {
        info = -1;  // 'u' not yet handled
    } else if ( trans != MagmaNoTrans ) {
        info = -2;  // 'c' not yet handled
    } else if ( n < 0 ) {
        info = -3;
    } else if ( k < 0 ) {
        info = -4;
    } else if ( ((trans == MagmaNoTrans)   && lda < max(1,n)) ||
                ((trans == MagmaTrans) && lda < max(1,k)) ) {
        info = -7;
    } else if ( aoffset < 0 || aoffset > lda ) {
        info = -8;
    } else if ( ((trans == MagmaNoTrans)   && ldb < max(1,n)) ||
                ((trans == MagmaTrans) && ldb < max(1,k)) ) {
        info = -10;
    } else if ( boffset < 0 || boffset > ldb ) {
        info = -11;
    } else if ( ldc < max(1,n) ) {
        info = -13;
    } else if ( coffset < 0 || coffset > ldc ) {
        info = -14;
    } else if ( ngpu <= 0 ) {
        info = -15;
    } else if ( nb <= 0 ) {
        info = -16;
    } else if ( nstream <= 0 ) {
        info = -18;
    }
    if ( info != 0 ) {
        magma_xerbla( __func__, -(info) );
        return;
    }
    
    const double c_one = MAGMA_D_ONE;
    double cbeta = MAGMA_D_MAKE( beta, 0. );
    
    magma_int_t ib, ioff, iblock, idev, di, s;
    
    magma_device_t cdev;
    magma_queue_t cqueue;
    magma_getdevice( &cdev );
    magmablasGetKernelStream( &cqueue );
    
    // loop over all blocks
    // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C
    // blockoffset is offset within first block; for subsequent blocks it is 0
    magma_int_t blockoffset = coffset % nb;
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib     = min( nb-blockoffset, n-i );  // block size
        ioff   = i + coffset;                 // global index in parent matrix
        iblock = (ioff / nb) / ngpu;          // local block id
        idev   = (ioff / nb) % ngpu;          // device with this block
        di     = iblock*nb + blockoffset;     // local index in parent matrix
        
        magma_setdevice( idev );
        s = iblock % nstream;
        magmablasSetKernelStream( streams[ idev ][ s ] );
        
        // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i]
        //printf( "dgemm  n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
        magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k,
                     alpha, dA(idev,0,0), lda,
                            dB(idev,i,0), ldb,
                     cbeta, dC(idev,coffset,di), ldc );
        blockoffset = 0;
    }
    
    // second loop does C = (alpha)*B*A' + C_hat
    alpha = MAGMA_D_CNJG( alpha );
    blockoffset = coffset % nb;
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib     = min( nb-blockoffset, n-i );  // block size
        ioff   = i + coffset;                 // global index in parent matrix
        iblock = (ioff / nb) / ngpu;          // local block id
        idev   = (ioff / nb) % ngpu;          // device with this block
        di     = iblock*nb + blockoffset;     // local index in parent matrix
        
        magma_setdevice( idev );
        s = iblock % nstream;
        magmablasSetKernelStream( streams[ idev ][ s ] );
        
        // C[i:n,i] += (alpha) * B[i:n,0] * A[i,0]'
        //printf( "dgemm  n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i );
        magma_dgemm( MagmaNoTrans, MagmaTrans, n, ib, k,
                     alpha, dB(idev,0,0), ldb,
                            dA(idev,i,0), lda,
                     c_one, dC(idev,coffset,di), ldc );
        blockoffset = 0;
    }
    
    magma_setdevice( cdev );
    magmablasSetKernelStream( cqueue );
}

Ejemplo n.º 8

0

Mostrar archivo

Archivo: testing_dpotrf2_gpu.cpp Proyecto: EmergentOrder/clmagma

int main( int argc, char** argv) 
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    double *hA, *hR;
    magmaDouble_ptr dA;
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8160, 8192 };
    
    magma_int_t i, info;
    double mz_one = MAGMA_D_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){        
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[9] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_dpotrf2_gpu -N %d\n\n", 1024);
    }

    /* Initialize */
    magma_queue_t  queue1, queue2;
    magma_device_t device;
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( &device, 2, &num );
    if ( err != 0 or num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue1 );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue2 );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }

    magma_queue_t queues[2] = {queue1, queue2};

    /* Allocate memory for the largest matrix */
    N    = size[9];
    n2   = N * N;
    ldda = ((N+31)/32) * 32;
    TESTING_MALLOC_CPU( hA, double, n2 );
    TESTING_MALLOC_PIN( hR, double, n2 );
    TESTING_MALLOC_DEV( dA, double, ldda*N );
    
    printf("\n\n");
    printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N; 
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_dlarnv( &ione, ISEED, &n2, hA );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            MAGMA_D_SET2REAL( hA(i,i), MAGMA_D_REAL(hA(i,i)) + N );
            for( int j = 0; j < i; ++j ) {
          hA(i, j) = MAGMA_D_CNJG( hA(j,i) );
            }
        }
        lapackf77_dlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda );

        /* Warm up to measure the performance */
        magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1);
        clFinish(queue1);
        magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues );
        /* ====================================================================
           Performs operation using MAGMA 
           =================================================================== */
        magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1 );
        clFinish(queue1);
        gpu_time = magma_wtime();
        magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_dpotrf2 had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
        
        /* =====================================================================
           Performs operation using LAPACK 
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_dpotrf( MagmaLowerStr, &N, hA, &lda, &info );
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_dpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        magma_dgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue1 );
        matnorm = lapackf77_dlange("f", &N, &N, hA, &lda, work);
        blasf77_daxpy(&n2, &mz_one, hA, &ione, hR, &ione);
        diffnorm = lapackf77_dlange("f", &N, &N, hR, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n", 
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE_CPU( hA );
    TESTING_FREE_PIN( hR );
    TESTING_FREE_DEV( dA );
    magma_queue_destroy( queue1 );
    magma_queue_destroy( queue2 );
    magma_finalize();
}

Ejemplo n.º 9

0

Mostrar archivo

Archivo: testing_dsymmetrize_tiles.cpp Proyecto: cjy7117/FT-MAGMA

/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dsymmetrize
   Code is very similar to testing_dtranspose.cpp
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t    gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time;
    double           error, work[1];
    double  c_neg_one = MAGMA_D_NEG_ONE;
    double *h_A, *h_R;
    magmaDouble_ptr d_A;
    magma_int_t N, nb, size, lda, ldda, mstride, nstride, ntile;
    magma_int_t ione     = 1;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    nb = (opts.nb == 0 ? 64 : opts.nb);
    mstride = 2*nb;
    nstride = 3*nb;
    
    printf("uplo = %s, nb = %d, mstride = %d, nstride = %d\n",
            lapack_uplo_const(opts.uplo), (int) nb, (int) mstride, (int) nstride );
    printf("    N ntile   CPU GByte/s (ms)    GPU GByte/s (ms)    check\n");
    printf("===========================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;
            size   = lda*N;
            
            if ( N < nb ) {
                ntile = 0;
            } else {
                ntile = min( (N - nb)/mstride + 1,
                             (N - nb)/nstride + 1 );
            }
            // load each tile, save each tile
            gbytes = sizeof(double) * 2.*nb*nb*ntile / 1e9;
            
            TESTING_MALLOC_CPU( h_A, double, size   );
            TESTING_MALLOC_CPU( h_R, double, size   );
            
            TESTING_MALLOC_DEV( d_A, double, ldda*N );
            
            /* Initialize the matrix */
            for( int j = 0; j < N; ++j ) {
                for( int i = 0; i < N; ++i ) {
                    h_A[i + j*lda] = MAGMA_D_MAKE( i + j/10000., j );
                }
            }

            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            magma_dsetmatrix( N, N, h_A, lda, d_A, ldda );
            
            gpu_time = magma_sync_wtime( 0 );
            magmablas_dsymmetrize_tiles( opts.uplo, nb, d_A, ldda, ntile, mstride, nstride );
            gpu_time = magma_sync_wtime( 0 ) - gpu_time;
            gpu_perf = gbytes / gpu_time;
            
            /* =====================================================================
               Performs operation using naive in-place algorithm
               (LAPACK doesn't implement symmetrize)
               =================================================================== */
            cpu_time = magma_wtime();
            for( int tile = 0; tile < ntile; ++tile ) {
                int offset = tile*mstride + tile*nstride*lda;
                for( int j = 0; j < nb; ++j ) {
                    for( int i = 0; i < j; ++i ) {
                        if ( opts.uplo == MagmaLower ) {
                            h_A[offset + i + j*lda] = MAGMA_D_CNJG( h_A[offset + j + i*lda] );
                        }
                        else {
                            h_A[offset + j + i*lda] = MAGMA_D_CNJG( h_A[offset + i + j*lda] );
                        }
                    }
                }
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gbytes / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            magma_dgetmatrix( N, N, d_A, ldda, h_R, lda );
            
            blasf77_daxpy(&size, &c_neg_one, h_A, &ione, h_R, &ione);
            error = lapackf77_dlange("f", &N, &N, h_R, &lda, work);

            printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %s\n",
                   (int) N, (int) ntile,
                   cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000.,
                   (error == 0. ? "ok" : "failed") );
            status += ! (error == 0.);
            
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_R );
            
            TESTING_FREE_DEV( d_A );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}

Ejemplo n.º 10

0

Mostrar archivo

Archivo: testing_dpotrf_mgpu.cpp Proyecto: kjbartel/clmagma

int main( int argc, char** argv)
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    double *h_A, *h_R;
    magmaDouble_ptr d_lA[MagmaMaxGPUs];
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 };
    
    magma_int_t i, j, k, info;
    double mz_one = MAGMA_D_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
   
    magma_int_t num_gpus0 = 1, num_gpus, flag = 0;
    int nb, mb, n_local, nk;

    magma_uplo_t uplo = MagmaLower;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0){
                N = atoi(argv[++i]);
                if (N>0) {
                    size[0] = size[9] = N;
                    flag = 1;
                }else exit(1);
            }
            if(strcmp("-NGPU", argv[i])==0)
                num_gpus0 = atoi(argv[++i]);
            if(strcmp("-UPLO", argv[i])==0){
                if(strcmp("L", argv[++i])==0){
                    uplo = MagmaLower;
                }else{
                    uplo = MagmaUpper;
                }            
            }
        }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_dpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0);
    }

    /* looking for max. ldda */
    ldda = 0;
    n2 = 0;
    for(i=0;i<10;i++){
        N = size[i];
        nb = magma_get_dpotrf_nb(N);
        mb = nb;
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
        }else{
            num_gpus = num_gpus0;
        }
        n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb);
        if(n_local > ldda) ldda = n_local;
        if(n2 < N*N) n2 = N*N;
        if(flag != 0) break;
    }

     /* Allocate host memory for the matrix */
    TESTING_MALLOC_PIN( h_A, double, n2 );
    TESTING_MALLOC_PIN( h_R, double, n2 );

    /* Initialize */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    //magma_queue_t  queues[MagmaMaxGPUs];
    magma_device_t devices[ MagmaMaxGPUs ];
    magma_int_t num = 0;
    magma_int_t err;
    magma_init();
    err = magma_getdevices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        exit(-1);
    }
    for(i=0;i<num_gpus;i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
    }

    printf("each buffer size: %d\n", ldda);
    /* allocate local matrix on Buffers */
    for(i=0; i<num_gpus0; i++){
        TESTING_MALLOC_DEV( d_lA[i], double, ldda );
    }

    
    printf("\n\n");
    printf("Using GPUs: %d\n", num_gpus0);
    if(uplo == MagmaUpper){
        printf("\n  testing_dpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0);
    }else{
        printf("\n  testing_dpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0);
    }
            printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_dlarnv( &ione, ISEED, &n2, h_A );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            h_A(i,i) = MAGMA_D_MAKE( MAGMA_D_REAL(h_A(i,i)) + N, 0 );
            for( int j = 0; j < i; ++j ) {
                h_A(i, j) = MAGMA_D_CNJG( h_A(j,i) );
            }
        }
        lapackf77_dlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda );

        /* Warm up to measure the performance */
        nb = magma_get_dpotrf_nb(N);
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
            printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus);
        }else{
            num_gpus = num_gpus0;
        }
        /* distribute matrix to gpus */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
                                 queues[2*k]);
            }
        }else{
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
                                    queues[2*k]);
            }
        }

        magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        
        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* distribute matrix to gpus */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
                                 queues[2*k]);
            }
        }else{
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
                                    queues[2*k]);
            }
        }
    
        gpu_time = magma_wtime();
        magma_dpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_dpotrf had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
       
        /* gather matrix from gpus */
        if(uplo==MagmaUpper){
            // Upper
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dgetmatrix( N, nk,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda,
                                 &h_R[j*lda], lda, queues[2*k]);
            }
        }else{
            // Lower
            for(j=0; j<N; j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_dgetmatrix( nk, N, 
                            d_lA[k], (j/(nb*num_gpus)*nb), ldda, 
                            &h_R[j], lda, queues[2*k] );
            }
        }

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        if(uplo == MagmaLower){
            lapackf77_dpotrf( MagmaLowerStr, &N, h_A, &lda, &info );
        }else{
            lapackf77_dpotrf( MagmaUpperStr, &N, h_A, &lda, &info );
        }
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_dpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        matnorm = lapackf77_dlange("f", &N, &N, h_A, &lda, work);
        blasf77_daxpy(&n2, &mz_one, h_A, &ione, h_R, &ione);
        diffnorm = lapackf77_dlange("f", &N, &N, h_R, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n",
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (flag != 0)
            break;
    }

    /* clean up */
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_R );
    for(i=0;i<num_gpus;i++){
        TESTING_FREE_DEV( d_lA[i] );
        magma_queue_destroy( queues[2*i]   );
        magma_queue_destroy( queues[2*i+1] );
    }
    magma_finalize();
}