예제 #1
0
int main( int argc, char** argv) 
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    double *hA, *hR;
    magmaDouble_ptr dA;
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 8160, 8192 };
    
    magma_int_t i, info;
    double mz_one = MAGMA_D_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){        
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[9] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_dpotrf2_gpu -N %d\n\n", 1024);
    }

    /* Initialize */
    magma_queue_t  queue1, queue2;
    magma_device_t device;
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( &device, 2, &num );
    if ( err != 0 or num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue1 );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device, &queue2 );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }

    magma_queue_t queues[2] = {queue1, queue2};

    /* Allocate memory for the largest matrix */
    N    = size[9];
    n2   = N * N;
    ldda = ((N+31)/32) * 32;
    TESTING_MALLOC_CPU( hA, double, n2 );
    TESTING_MALLOC_PIN( hR, double, n2 );
    TESTING_MALLOC_DEV( dA, double, ldda*N );
    
    printf("\n\n");
    printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N; 
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_dlarnv( &ione, ISEED, &n2, hA );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            MAGMA_D_SET2REAL( hA(i,i), MAGMA_D_REAL(hA(i,i)) + N );
            for( int j = 0; j < i; ++j ) {
          hA(i, j) = MAGMA_D_CNJG( hA(j,i) );
            }
        }
        lapackf77_dlacpy( MagmaFullStr, &N, &N, hA, &lda, hR, &lda );

        /* Warm up to measure the performance */
        magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1);
        clFinish(queue1);
        magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues );
        /* ====================================================================
           Performs operation using MAGMA 
           =================================================================== */
        magma_dsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue1 );
        clFinish(queue1);
        gpu_time = magma_wtime();
        magma_dpotrf2_gpu( MagmaLower, N, dA, 0, ldda, &info, queues );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_dpotrf2 had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
        
        /* =====================================================================
           Performs operation using LAPACK 
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_dpotrf( MagmaLowerStr, &N, hA, &lda, &info );
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_dpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        magma_dgetmatrix( N, N, dA, 0, ldda, hR, 0, lda, queue1 );
        matnorm = lapackf77_dlange("f", &N, &N, hA, &lda, work);
        blasf77_daxpy(&n2, &mz_one, hA, &ione, hR, &ione);
        diffnorm = lapackf77_dlange("f", &N, &N, hR, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n", 
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE_CPU( hA );
    TESTING_FREE_PIN( hR );
    TESTING_FREE_DEV( dA );
    magma_queue_destroy( queue1 );
    magma_queue_destroy( queue2 );
    magma_finalize();
}
예제 #2
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgetrf
*/
int main( int argc, char** argv)
{
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float          error;
    magmaFloatComplex *h_A;
    magma_int_t     *ipiv;
    magma_int_t     M, N, n2, lda, ldda, info, min_mn;
    magma_int_t     status = 0;

    /* Initialize */
    magma_queue_t  queue[2];
    magma_device_t devices[MagmaMaxGPUs];
    int num = 0;
    magma_err_t err;
    magma_init();
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = opts.tolerance * lapackf77_slamch("E");

    err = magma_get_devices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }

    // Create two queues on device opts.device
    err = magma_queue_create( devices[opts.device], &queue[0] );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( devices[opts.device], &queue[1] );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }

    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            gflops = FLOPS_CGETRF( M, N ) / 1e9;
            
            TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn );
            TESTING_MALLOC_PIN( h_A,  magmaFloatComplex, n2 );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_cgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            init_matrix( M, N, h_A, lda );
            
            gpu_time = magma_wtime();
            magma_cgetrf( M, N, h_A, lda, ipiv, &info, queue);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cgetrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e%s\n", error, (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e%s\n", error, (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else {
                printf("     ---   \n");
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_PIN( h_A );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    magma_queue_destroy( queue[0] );
    magma_queue_destroy( queue[1] );
    magma_finalize();

    return status;
}
예제 #3
0
int main( int argc, char** argv)
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs];
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 };
    
    magma_int_t i, j, k, info;
    magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    double      work[1], matnorm, diffnorm;
   
    magma_int_t num_gpus0 = 1, num_gpus, flag = 0;
    int nb, mb, n_local, nk;

    magma_uplo_t uplo = MagmaLower;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0){
                N = atoi(argv[++i]);
                if (N>0) {
                    size[0] = size[9] = N;
                    flag = 1;
                }else exit(1);
            }
            if(strcmp("-NGPU", argv[i])==0)
                num_gpus0 = atoi(argv[++i]);
            if(strcmp("-UPLO", argv[i])==0){
                if(strcmp("L", argv[++i])==0){
                    uplo = MagmaLower;
                }else{
                    uplo = MagmaUpper;
                }            
            }
        }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zpotrf_mgpu -N %d -NGPU %d -UPLO -L\n\n", 1024, num_gpus0);
    }

    /* looking for max. ldda */
    ldda = 0;
    n2 = 0;
    for(i=0;i<10;i++){
        N = size[i];
        nb = magma_get_zpotrf_nb(N);
        mb = nb;
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
        }else{
            num_gpus = num_gpus0;
        }
        n_local = nb*(1+N/(nb*num_gpus))*mb*((N+mb-1)/mb);
        if(n_local > ldda) ldda = n_local;
        if(n2 < N*N) n2 = N*N;
        if(flag != 0) break;
    }

     /* Allocate host memory for the matrix */
    TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, n2 );
    TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, n2 );

    /* Initialize */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    //magma_queue_t  queues[MagmaMaxGPUs];
    magma_device_t devices[ MagmaMaxGPUs ];
    magma_int_t num = 0;
    magma_int_t err;
    magma_init();
    err = magma_getdevices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_getdevices failed: %d\n", (int) err );
        exit(-1);
    }
    for(i=0;i<num_gpus;i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", (int) err );
            exit(-1);
        }
    }

    printf("each buffer size: %d\n", ldda);
    /* allocate local matrix on Buffers */
    for(i=0; i<num_gpus0; i++){
        TESTING_MALLOC_DEV( d_lA[i], magmaDoubleComplex, ldda );
    }

    
    printf("\n\n");
    printf("Using GPUs: %d\n", num_gpus0);
    if(uplo == MagmaUpper){
        printf("\n  testing_zpotrf_mgpu -N %d -NGPU %d -UPLO U\n\n", N, num_gpus0);
    }else{
        printf("\n  testing_zpotrf_mgpu -N %d -NGPU %d -UPLO L\n\n", N, num_gpus0);
    }
            printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        ldda = ((N+31)/32)*32;
        gflops = FLOPS( (double)N ) * 1e-9;
        
        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        /* Symmetrize and increase the diagonal */
        for( int i = 0; i < N; ++i ) {
            h_A(i,i) = MAGMA_Z_MAKE( MAGMA_Z_REAL(h_A(i,i)) + N, 0 );
            for( int j = 0; j < i; ++j ) {
                h_A(i, j) = MAGMA_Z_CNJG( h_A(j,i) );
            }
        }
        lapackf77_zlacpy( MagmaFullStr, &N, &N, h_A, &lda, h_R, &lda );

        /* Warm up to measure the performance */
        nb = magma_get_zpotrf_nb(N);
        if(num_gpus0 > N/nb){
            num_gpus = N/nb;
            if(N%nb != 0) num_gpus ++;
            printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus);
        }else{
            num_gpus = num_gpus0;
        }
        /* distribute matrix to gpus */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
                                 queues[2*k]);
            }
        }else{
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
                                    queues[2*k]);
            }
        }

        magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        
        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* distribute matrix to gpus */
        if(uplo == MagmaUpper){
            // Upper
            ldda = ((N+mb-1)/mb)*mb;    
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( N, nk, 
                                 &h_A[j*lda], lda,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda, 
                                 queues[2*k]);
            }
        }else{
            // Lower
            ldda = (1+N/(nb*num_gpus))*nb;
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zsetmatrix( nk, N, &h_A[j], lda,
                                    d_lA[k], (j/(nb*num_gpus)*nb), ldda,
                                    queues[2*k]);
            }
        }
    
        gpu_time = magma_wtime();
        magma_zpotrf_mgpu( num_gpus, uplo, N, d_lA, 0, ldda, queues, &info );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_zpotrf had error %d.\n", info );

        gpu_perf = gflops / gpu_time;
       
        /* gather matrix from gpus */
        if(uplo==MagmaUpper){
            // Upper
            for(j=0;j<N;j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zgetmatrix( N, nk,
                                 d_lA[k], j/(nb*num_gpus)*nb*ldda, ldda,
                                 &h_R[j*lda], lda, queues[2*k]);
            }
        }else{
            // Lower
            for(j=0; j<N; j+=nb){
                k = (j/nb)%num_gpus;
                nk = min(nb, N-j);
                magma_zgetmatrix( nk, N, 
                            d_lA[k], (j/(nb*num_gpus)*nb), ldda, 
                            &h_R[j], lda, queues[2*k] );
            }
        }

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        if(uplo == MagmaLower){
            lapackf77_zpotrf( MagmaLowerStr, &N, h_A, &lda, &info );
        }else{
            lapackf77_zpotrf( MagmaUpperStr, &N, h_A, &lda, &info );
        }
        cpu_time = magma_wtime() - cpu_time;
        if (info != 0)
            printf( "lapackf77_zpotrf had error %d.\n", info );
        
        cpu_perf = gflops / cpu_time;
        /* =====================================================================
           Check the result compared to LAPACK
           |R_magma - R_lapack| / |R_lapack|
           =================================================================== */
        matnorm = lapackf77_zlange("f", &N, &N, h_A, &lda, work);
        blasf77_zaxpy(&n2, &mz_one, h_A, &ione, h_R, &ione);
        diffnorm = lapackf77_zlange("f", &N, &N, h_R, &lda, work);
        printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n",
                N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
        if (flag != 0)
            break;
    }

    /* clean up */
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_R );
    for(i=0;i<num_gpus;i++){
        TESTING_FREE_DEV( d_lA[i] );
        magma_queue_destroy( queues[2*i]   );
        magma_queue_destroy( queues[2*i+1] );
    }
    magma_finalize();
}
예제 #4
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgeqrf
*/
int main( int argc, char** argv)
{
    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float           error, work[1];
    float  c_neg_one = MAGMA_S_NEG_ONE;
    float *h_A, *h_R, *tau, *h_work, tmp[1];
    magma_int_t M, N, n2, lda, lwork, info, min_mn, nb;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1}, ISEED2[4];
    
    /* Initialize */
    magma_queue_t  queue[2];
    magma_device_t devices[MagmaMaxGPUs];
    int num = 0;
    magma_err_t err;
    magma_init();

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_int_t status = 0;
    float tol, eps = lapackf77_slamch("E");
    tol = opts.tolerance * eps;

    opts.lapack |= ( opts.check == 2 );  // check (-c2) implies lapack (-l)

    err = magma_get_devices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }

    // Create two queues on device opts.device
    err = magma_queue_create( devices[opts.device], &queue[0] );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( devices[opts.device], &queue[1] );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    
    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 1 ) {
        printf("  M     N     CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R-Q'A||_1 / (M*||A||_1) ||I-Q'Q||_1 / M\n");
        printf("===============================================================================================\n");
    } else {
        printf("  M     N     CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
        printf("=======================================================================\n");
    }
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            nb     = magma_get_sgeqrf_nb(M);
            gflops = FLOPS_SGEQRF( M, N ) / 1e9;
            
            lwork = -1;
            lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lwork, &info);
            lwork = (magma_int_t)MAGMA_S_REAL( tmp[0] );
            lwork = max( lwork, max( N*nb, 2*nb*nb ));
            
            TESTING_MALLOC_CPU( tau,    float, min_mn );
            TESTING_MALLOC_CPU( h_A,    float, n2     );
            TESTING_MALLOC_PIN( h_R,    float, n2     );
            TESTING_MALLOC_CPU( h_work, float, lwork  );
            
            /* Initialize the matrix */
            for ( int j=0; j<4; j++ ) ISEED2[j] = ISEED[j]; // saving seeds
            lapackf77_slarnv( &ione, ISEED, &n2, h_A );
            lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            gpu_time = magma_wtime();
            magma_sgeqrf(M, N, h_R, lda, tau, h_work, lwork, &info, queue);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_sgeqrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            if ( opts.lapack ) {
                /* =====================================================================
                   Performs operation using LAPACK
                   =================================================================== */
                float *tau;
                TESTING_MALLOC_CPU( tau, float, min_mn );
                cpu_time = magma_wtime();
                lapackf77_sgeqrf(&M, &N, h_A, &lda, tau, h_work, &lwork, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_sgeqrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                TESTING_FREE_CPU( tau );
            }

            if ( opts.check == 1 ) {
                /* =====================================================================
                   Check the result 
                   =================================================================== */
                magma_int_t lwork = n2+N;
                float *h_W1, *h_W2, *h_W3;
                float *h_RW, results[2];

                TESTING_MALLOC_CPU( h_W1, float, n2 ); // Q
                TESTING_MALLOC_CPU( h_W2, float, n2 ); // R
                TESTING_MALLOC_CPU( h_W3, float, lwork ); // WORK
                TESTING_MALLOC_CPU( h_RW, float, M );  // RWORK
                lapackf77_slarnv( &ione, ISEED2, &n2, h_A );
                lapackf77_sqrt02( &M, &N, &min_mn, h_A, h_R, h_W1, h_W2, &lda, tau, h_W3, &lwork,
                                  h_RW, results );
                results[0] *= eps;
                results[1] *= eps;

                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e                  %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, results[0],results[1] );
                    printf("%s\n", (results[0] < tol ? "" : "  failed"));
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)    %8.2e                  %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, results[0],results[1] );
                    printf("%s\n", (results[0] < tol ? "" : "  failed"));
                }
                status |= ! (results[0] < tol);

                TESTING_FREE_CPU( h_W1 );
                TESTING_FREE_CPU( h_W2 );
                TESTING_FREE_CPU( h_W3 );
                TESTING_FREE_CPU( h_RW );
            } else if ( opts.check == 2 ) {
                /* =====================================================================
                   Check the result compared to LAPACK
                   =================================================================== */
                error = lapackf77_slange("f", &M, &N, h_A, &lda, work);
                blasf77_saxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
                error = lapackf77_slange("f", &M, &N, h_R, &lda, work) / error;
                
                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   %8.2e",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time, error );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)    %8.2e",
                           (int) M, (int) N, gpu_perf, gpu_time, error );
                }
                printf("%s\n", (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else {
                if ( opts.lapack ) {
                    printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)   ---\n",
                           (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
                } else {
                    printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)     ---  \n",
                           (int) M, (int) N, gpu_perf, gpu_time);
                }
            }
            
            TESTING_FREE_CPU( tau );
            TESTING_FREE_CPU( h_A );
            TESTING_FREE_CPU( h_work );
            TESTING_FREE_PIN( h_R );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    magma_queue_destroy( queue[0] );
    magma_queue_destroy( queue[1] );
    magma_finalize();

    return status;
}
예제 #5
0
/* ////////////////////////////////////////////////////////////////////////////
   -- testing any solver
*/
int main(  int argc, char** argv )
{
    magma_int_t info = 0;
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    magma_zopts zopts;
    magma_queue_t queue=NULL;
    magma_queue_create( 0, &queue );
    
    real_Double_t res;
    magma_z_matrix A={Magma_CSR}, A2={Magma_CSR}, 
    A3={Magma_CSR}, A4={Magma_CSR}, A5={Magma_CSR};
    
    int i=1;
    TESTING_CHECK( magma_zparse_opts( argc, argv, &zopts, &i, queue ));

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            TESTING_CHECK( magma_zm_5stencil(  laplace_size, &A, queue ));
        } else {                        // file-matrix test
            TESTING_CHECK( magma_z_csr_mtx( &A,  argv[i], queue ));
        }

        printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n",
                (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz );

        // filename for temporary matrix storage
        const char *filename = "testmatrix.mtx";

        // write to file
        TESTING_CHECK( magma_zwrite_csrtomtx( A, filename, queue ));
        // read from file
        TESTING_CHECK( magma_z_csr_mtx( &A2, filename, queue ));

        // delete temporary matrix
        unlink( filename );
                
        //visualize
        printf("A2:\n");
        TESTING_CHECK( magma_zprint_matrix( A2, queue ));
        
        //visualize
        TESTING_CHECK( magma_zmconvert(A2, &A4, Magma_CSR, Magma_CSRL, queue ));
        printf("A4:\n");
        TESTING_CHECK( magma_zprint_matrix( A4, queue ));
        TESTING_CHECK( magma_zmconvert(A4, &A5, Magma_CSR, Magma_ELL, queue ));
        printf("A5:\n");
        TESTING_CHECK( magma_zprint_matrix( A5, queue ));

        // pass it to another application and back
        magma_int_t m, n;
        magma_index_t *row, *col;
        magmaDoubleComplex *val=NULL;
        TESTING_CHECK( magma_zcsrget( A2, &m, &n, &row, &col, &val, queue ));
        TESTING_CHECK( magma_zcsrset( m, n, row, col, val, &A3, queue ));

        TESTING_CHECK( magma_zmdiff( A, A2, &res, queue ));
        printf("%% ||A-B||_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("%% tester IO:  ok\n");
        else
            printf("%% tester IO:  failed\n");

        TESTING_CHECK( magma_zmdiff( A, A3, &res, queue ));
        printf("%% ||A-B||_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("%% tester matrix interface:  ok\n");
        else
            printf("%% tester matrix interface:  failed\n");

        magma_zmfree(&A, queue );
        magma_zmfree(&A2, queue );
        magma_zmfree(&A4, queue );
        magma_zmfree(&A5, queue );

        i++;
    }
    
    magma_queue_destroy( queue );
    TESTING_CHECK( magma_finalize() );
    return info;
}
예제 #6
0
int main( int argc, char** argv )
{
    magma_init();
    magma_print_environment();
    
    magma_int_t err;
    magma_int_t num = 0;

    magma_device_t dev;

    magma_queue_t queues;
    magma_queue_create( 0, &queues );

    const double c_zero     = MAGMA_D_ZERO;
    const double c_one      = MAGMA_D_ONE;
    const double c_neg_one  = MAGMA_D_NEG_ONE;
    
    double dummy[1];
    magma_int_t M, N, MN,lda, ldb, ldc, ldda, info;
    double *h_A, *h_S, *h_U, *h_VT;
    double *d_Acu, *d_test;
    magmaDouble_ptr d_A, d_U, d_S, d_VT;
    magma_int_t ione  = 1;
    magma_int_t ISEED[4] = {0, 0, 0, 1};
    double tmp;
    double error, rwork[1];
    magma_int_t status = 0;

    M = 3;
    N = 4;
    MN = M*N;
    
    ldda = magma_roundup(N, 32);
    lda = N;

    h_A = (double*)malloc(M*N*sizeof(double));
    cudaMalloc((void**)& d_Acu, M*N*sizeof(double));
    cudaMemset(d_Acu, 0, M*N*sizeof(double));

    cudaPrintMatrix(d_Acu, M, N);

    magma_malloc((void**)&d_A, M*ldda*sizeof(double));
    magma_malloc((void**)&d_test, M*ldda*sizeof(double));
    cudaMemset(d_test, 0, M*ldda*sizeof(double));


    //magma_malloc((void**)&d_A, M*ldda*sizeof(double));
    //magma_malloc((void**)&d_S, N*ldda*sizeof(double));
    //magma_malloc((void**)&d_U, lddbm*lddbn*sizeof(double));
    //magma_malloc((void**)&d_VT, lddcm*lddcn*sizeof(double));

    //printMatrix(d_A, M, K, lddan);
    //exit(0);

    //printf("\n\n ldda: %d, M: %d, N: %d \n\n", (int)lddan, (int)M, (int)N);

    // Initialize the matrix
    lapackf77_dlarnv(&ione, ISEED, &MN, h_A);

    //cudaMemcpy(d_Acu, h_A, M*N*sizeof(double), cudaMemcpyHostToDevice);
    magma_dsetmatrix(N, M, h_A, lda, d_A, ldda, queues);
    
    printMatrix(h_A, M, N, lda, queues);

    printf("========MTOC============\n");
    magma_to_cuda(M, N, d_A, ldda, d_Acu);
    //printMatrix(d_test, M, N, lda, queues);
    printMatrix(d_A, M, N, ldda, queues);
    cudaPrintMatrix(d_Acu, M, N);
    printf("========CTOM============\n");

    cuda_to_magma(M, N, d_Acu, d_test, ldda);

    cudaPrintMatrix(d_Acu, M, N);
    printMatrix(d_test, M, N, ldda, queues);

    //exit(0);
    printf("====================\n");

    //cudaMemcpy(d_A, h_A, n2*sizeof(double), cudaMemcpyHostToDevice);

    //if(M >= N){

    //    printMatrix(h_U, M, N, ldda, queues);
    //    printMatrix(h_S, 1, N, ldda, queues);
    //    printMatrix(h_VT, N, N, ldda, queues);

    //}else{

    //    printMatrix(h_U, M, M, ldda, queues);
    //    printMatrix(h_S, 1, M, ldda, queues);
    //    printMatrix(h_VT, M, N, ldda, queues);

    //}
    

    magma_finalize();

    return 0; 
}
예제 #7
0
int main( int argc, char** argv )
{
    magma_init();
    cublasHandle_t handle;
    cudaSetDevice( 0 );
    cublasCreate( &handle );
    
    magmaFloatComplex *A, *B, *C;
    magmaFloatComplex *dA, *dB, *dC;
    float error, work[1];
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = { 1, 2, 3, 4 };
    magma_int_t n = 10;
    magma_int_t lda = n;
    magma_int_t ldda = ((n+31)/32)*32;
    magma_int_t size = lda*n;
    magma_int_t info;
    
    magma_cmalloc_cpu( &A, lda*n );
    magma_cmalloc_cpu( &B, lda*n );
    magma_cmalloc_cpu( &C, lda*n );
    magma_cmalloc( &dA, ldda*n );
    magma_cmalloc( &dB, ldda*n );
    magma_cmalloc( &dC, ldda*n );
    
    // initialize matrices
    lapackf77_clarnv( &ione, ISEED, &size, A );
    lapackf77_clarnv( &ione, ISEED, &size, B );
    lapackf77_clarnv( &ione, ISEED, &size, C );
    // increase diagonal to be SPD
    for( int i=0; i < n; ++i ) {
        C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 ));
    }
    
    magma_csetmatrix( n, n, A, lda, dA, ldda );
    magma_csetmatrix( n, n, B, lda, dB, ldda );
    magma_csetmatrix( n, n, C, lda, dC, ldda );
    
    // compute with cublas
    cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                 &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda );
    
    magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info );
    if (info != 0)
        printf("magma_cpotrf returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    
    // compute with LAPACK
    blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n,
                   &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda );
    
    lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info );
    if (info != 0)
        printf("lapackf77_cpotrf returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    
    // compute difference
    magma_cgetmatrix( n, n, dC, ldda, A, lda );
    blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione );
    error = lapackf77_clange( "F", &n, &n, A, &lda, work );
    printf( "n %d, error %8.2e\n", (int) n, error );
    
    magma_free( dA );
    magma_free( dB );
    magma_free( dC );
    magma_free_cpu( A );
    magma_free_cpu( B );
    magma_free_cpu( C );
    
    cublasDestroy( handle );
    magma_finalize();
    return 0;
}
예제 #8
0
/* ////////////////////////////////////////////////////////////////////////////
   -- testing any solver
*/
int main(  int argc, char** argv )
{
    magma_int_t info = 0;
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    magma_copts zopts;
    magma_queue_t queue;
    magma_queue_create( 0, &queue );
    
    magmaFloatComplex one = MAGMA_C_MAKE(1.0, 0.0);
    magmaFloatComplex zero = MAGMA_C_MAKE(0.0, 0.0);
    magma_c_matrix A={Magma_CSR}, B={Magma_CSR}, B_d={Magma_CSR};
    magma_c_matrix x={Magma_CSR}, b={Magma_CSR};
    
    int i=1;
    TESTING_CHECK( magma_cparse_opts( argc, argv, &zopts, &i, queue ));
    B.blocksize = zopts.blocksize;
    B.alignment = zopts.alignment;

    TESTING_CHECK( magma_csolverinfo_init( &zopts.solver_par, &zopts.precond_par, queue ));

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            TESTING_CHECK( magma_cm_5stencil(  laplace_size, &A, queue ));
        } else {                        // file-matrix test
            TESTING_CHECK( magma_c_csr_mtx( &A,  argv[i], queue ));
        }

        // for the eigensolver case
        zopts.solver_par.ev_length = A.num_cols;
        TESTING_CHECK( magma_ceigensolverinfo_init( &zopts.solver_par, queue ));

        // scale matrix
        TESTING_CHECK( magma_cmscale( &A, zopts.scaling, queue ));
        
        // preconditioner
        if ( zopts.solver_par.solver != Magma_ITERREF ) {
            TESTING_CHECK( magma_c_precondsetup( A, b, &zopts.solver_par, &zopts.precond_par, queue ) );
        }

        TESTING_CHECK( magma_cmconvert( A, &B, Magma_CSR, zopts.output_format, queue ));
        
        printf( "\n%% matrix info: %lld-by-%lld with %lld nonzeros\n\n",
                            (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz );
        
        printf("matrixinfo = [\n");
        printf("%%   size   (m x n)     ||   nonzeros (nnz)   ||   nnz/m   ||   stored nnz\n");
        printf("%%============================================================================%%\n");
        printf("  %8lld  %8lld      %10lld             %4lld        %10lld\n",
               (long long) B.num_rows, (long long) B.num_cols, (long long) B.true_nnz,
               (long long) (B.true_nnz/B.num_rows), (long long) B.nnz );
        printf("%%============================================================================%%\n");
        printf("];\n");

        TESTING_CHECK( magma_cmtransfer( B, &B_d, Magma_CPU, Magma_DEV, queue ));

        // vectors and initial guess
        TESTING_CHECK( magma_cvinit( &b, Magma_DEV, A.num_rows, 1, one, queue ));
        //magma_cvinit( &x, Magma_DEV, A.num_cols, 1, one, queue );
        //magma_c_spmv( one, B_d, x, zero, b, queue );                 //  b = A x
        //magma_cmfree(&x, queue );
        TESTING_CHECK( magma_cvinit( &x, Magma_DEV, A.num_cols, 1, zero, queue ));
        
        info = magma_c_solver( B_d, b, &x, &zopts, queue );
        if( info != 0 ) {
            printf("%%error: solver returned: %s (%lld).\n",
                    magma_strerror( info ), (long long) info );
        }
        printf("convergence = [\n");
        magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue );
        printf("];\n\n");
        
        zopts.solver_par.verbose = 0;
        printf("solverinfo = [\n");
        magma_csolverinfo( &zopts.solver_par, &zopts.precond_par, queue );
        printf("];\n\n");
        
        printf("precondinfo = [\n");
        printf("%%   setup  runtime\n");        
        printf("  %.6f  %.6f\n",
           zopts.precond_par.setuptime, zopts.precond_par.runtime );
        printf("];\n\n");
        magma_cmfree(&B_d, queue );
        magma_cmfree(&B, queue );
        magma_cmfree(&A, queue );
        magma_cmfree(&x, queue );
        magma_cmfree(&b, queue );
        i++;
    }

    magma_queue_destroy( queue );
    TESTING_CHECK( magma_finalize() );
    return info;
}
예제 #9
0
int main( int argc, char** argv)
{
    real_Double_t    gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error;

    float           matnorm, work[1];
    magmaFloatComplex  c_neg_one = MAGMA_C_NEG_ONE;
    magmaFloatComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaFloatComplex_ptr d_lA[MagmaMaxGPUs];

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2, n_local[4], lda, ldda, lhwork;
    magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000};

    magma_int_t i, k, nk, info, min_mn;
    int max_num_gpus = 2, num_gpus = 2;
    
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
            else if (strcmp("-NGPU", argv[i])==0)
              num_gpus = atoi(argv[++i]);
        }
        if ( M == 0 ) {
            M = N;
        }
        if ( N == 0 ) {
            N = M;
        }
        if (M>0 && N>0)
          printf("  testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", (int) M, (int) N, (int) num_gpus);
        else
            {
                printf("\nUsage: \n");
                printf("  testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 
                       1024, 1024, 1);
                exit(1);
            }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_cgeqrf_gpu -M %d -N %d -NGPU %d\n\n", 1024, 1024, 1);
        M = N = size[9];
    }
    
    ldda   = ((M+31)/32)*32;
    n2     = M * N;
    min_mn = min(M, N);

    magma_int_t nb  = magma_get_cgeqrf_nb(M);

    if (num_gpus > max_num_gpus){
      printf("More GPUs requested than available. Have to change it.\n");
      num_gpus = max_num_gpus;
    }
    printf("Number of GPUs to be used = %d\n", (int) num_gpus);

    /* Initialize */
    magma_queue_t  queues[MagmaMaxGPUs * 2];
    magma_device_t devices[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    for(i=0;i<num_gpus;i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", err );
            exit(-1);
        }
    }
    
    /* Allocate host memory for the matrix */
    TESTING_MALLOC_CPU( tau, magmaFloatComplex, min_mn );
    TESTING_MALLOC_CPU( h_A, magmaFloatComplex, n2     );
    TESTING_MALLOC_CPU( h_R, magmaFloatComplex, n2     );

    for(i=0; i<num_gpus; i++){      
        n_local[i] = ((N/nb)/num_gpus)*nb;
        if (i < (N/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (N/nb)%num_gpus)
            n_local[i] += N%nb;
        
        TESTING_MALLOC_DEV( d_lA[i], magmaFloatComplex, ldda*n_local[i] );
        printf("device %2d n_local = %4d\n", (int) i, (int) n_local[i]);  
    }

    lhwork = -1;
    lapackf77_cgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info);
    lhwork = (magma_int_t)MAGMA_C_REAL( tmp[0] );

    TESTING_MALLOC_CPU( h_work, magmaFloatComplex, lhwork );

    printf("  M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
    printf("======================================================================\n");
    for(i=0; i<10; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn= min(M, N);
        lda   = M;
        n2    = lda*N;
        ldda  = ((M+31)/32)*32;
        gflops = FLOPS( (float)M, (float)N ) * 1e-9;

        /* Initialize the matrix */
        lapackf77_clarnv( &ione, ISEED, &n2, h_A );
        lapackf77_clacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_cgeqrf(&M, &N, h_A, &M, tau, h_work, &lhwork, &info);
        cpu_time = magma_wtime() - cpu_time;
        if (info < 0)
            printf("Argument %d of lapack_cgeqrf had an illegal value.\n", (int) -info);

        cpu_perf = gflops / cpu_time;

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        int j;
        magma_queue_t *trans_queues = (magma_queue_t*)malloc(num_gpus*sizeof(magma_queue_t));
        for(j=0;j<num_gpus;j++){
            trans_queues[j] = queues[2*j];
        }
        
        // warm-up
        magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues);
        magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues);

        magmablas_csetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, num_gpus, nb, trans_queues);
        gpu_time = magma_wtime();
        magma_cgeqrf2_mgpu( num_gpus, M, N, d_lA, ldda, tau, &info, queues);
        gpu_time = magma_wtime() - gpu_time;

        if (info < 0)
          printf("Argument %d of magma_cgeqrf2 had an illegal value.\n", (int) -info);
        
        gpu_perf = gflops / gpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        magmablas_cgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, num_gpus, nb, trans_queues);
        
        matnorm = lapackf77_clange("f", &M, &N, h_A, &M, work);
        blasf77_caxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
        
        printf("%5d %5d  %6.2f (%6.2f)        %6.2f (%6.2f)       %e\n",
               (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
               lapackf77_clange("f", &M, &N, h_R, &M, work) / matnorm);
        
        if (argc != 1)
          break;
    }
    
    /* Memory clean up */
    TESTING_FREE_PIN( tau );
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_work );
    TESTING_FREE_PIN( h_R );

    for(i=0; i<num_gpus; i++){
        TESTING_FREE_DEV( d_lA[i] );
        magma_queue_destroy(queues[2*i]);
        magma_queue_destroy(queues[2*i+1]);
    }

    /* Shutdown */
    magma_finalize();
}
예제 #10
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgeqrf
*/
int main( magma_int_t argc, char** argv) 
{
    cuDoubleComplex *h_A, *h_R, *h_A2, *h_A3, *h_work, *h_work2, *tau, *d_work2;
    cuDoubleComplex *d_A, *d_work;
    float gpu_perf, cpu_perf, cpu2_perf;
    double flops;

    magma_timestr_t start, end;

    /* Matrix size */
    magma_int_t N=0, n2, lda, M=0;
    magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};
    
    magma_int_t i, j, info[1];

    magma_int_t loop = argc;

    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_int_t num_cores = 4;
    magma_int_t num_gpus = 0;

    if (argc != 1){
      for(i = 1; i<argc; i++){      
        if (strcmp("-N", argv[i])==0)
          N = atoi(argv[++i]);
        else if (strcmp("-M", argv[i])==0)
          M = atoi(argv[++i]);
        else if (strcmp("-C", argv[i])==0)
         num_cores = atoi(argv[++i]);
      }
      if ((M>0 && N>0) || (M==0 && N==0)) {
        printf("  testing_zgeqrf_mc -M %d -N %d \n\n", M, N);
        if (M==0 && N==0) {
          M = N = size[9];
          loop = 1;
        }
      } else {
        printf("\nUsage: \n");
        printf("  testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024);
        exit(1);
      }
    } else {
      printf("\nUsage: \n");
      printf("  testing_zgeqrf_mc -M %d -N %d -B 128 -T 1\n\n", 1024, 1024);
      M = N = size[9];
    }

    n2 = M * N;

    magma_int_t min_mn = min(M,N);

    /* Allocate host memory for the matrix */
    h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0]));
    if (h_A2 == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A2)\n");
    }

    magma_int_t lwork = n2;

    h_work2 = (cuDoubleComplex*)malloc(lwork * sizeof(cuDoubleComplex));
    if (h_work2 == 0) {
        fprintf (stderr, "!!!! host memory allocation error (h_work2)\n");
    }

    h_A3 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A3[0]));
    if (h_A3 == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A3)\n");
    }

    tau = (cuDoubleComplex*)malloc(min_mn * sizeof(cuDoubleComplex));
    if (tau == 0) {
      fprintf (stderr, "!!!! host memory allocation error (tau)\n");
    }

    /* Initialize MAGMA hardware context, seeting how many CPU cores 
       and how many GPUs to be used in the consequent computations  */
    magma_context *context;
    context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv);

    printf("\n\n");
    printf("   M     N       LAPACK Gflop/s     Multi-core Gflop/s    ||R||_F / ||A||_F\n");
    printf("===========================================================================\n");
    for(i=0; i<10; i++){

      if (loop == 1) {
        M = N = size[i];
        n2 = M*N;
      }

      flops = FLOPS( (double)M, (double)N ) / 1000000;

      /* Initialize the matrix */
      lapackf77_zlarnv( &ione, ISEED, &n2, h_A2 );
      lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A2, &M, h_A3, &M );

      /* =====================================================================
         Performs operation using LAPACK 
     =================================================================== */

      start = get_current_time();
      lapackf77_zgeqrf(&M, &N, h_A3, &M, tau, h_work2, &lwork, info);
      end = get_current_time();

      if (info[0] < 0)  
        printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]);
 
      cpu2_perf = flops / GetTimerValue(start, end);

      /* =====================================================================
         Performs operation using multicore 
     =================================================================== */

      start = get_current_time();
      magma_zgeqrf_mc(context, &M, &N, h_A2, &M, tau, h_work2, &lwork, info);
      end = get_current_time();

      if (info[0] < 0)  
        printf("Argument %d of sgeqrf had an illegal value.\n", -info[0]);
  
      cpu_perf = flops / GetTimerValue(start, end);
      
      /* =====================================================================
         Check the result compared to LAPACK
         =================================================================== */

      double work[1], matnorm = 1.;
      cuDoubleComplex mone = MAGMA_Z_NEG_ONE;
      magma_int_t one = 1;
      matnorm = lapackf77_zlange("f", &M, &N, h_A2, &M, work);

      blasf77_zaxpy(&n2, &mone, h_A2, &one, h_A3, &one);
      printf("%5d  %5d       %6.2f               %6.2f           %e\n", 
         M,  N, cpu2_perf, cpu_perf,
         lapackf77_zlange("f", &M, &N, h_A3, &M, work) / matnorm);

      if (loop != 1)
    break;
    }

    /* Memory clean up */
    free(h_A2);
    free(tau);
    free(h_A3);
    free(h_work2);

    /* Shut down the MAGMA context */
    magma_finalize(context);

}
예제 #11
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sgeqrs_gpu
*/
int main( int argc, char** argv)
{
//#if defined(PRECISION_s)
    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }
  
    real_Double_t gflops, gpu_perf, gpu_time, cpu_perf, cpu_time;
    float           matnorm, work[1];
    float  c_one     = MAGMA_S_ONE;
    float  c_neg_one = MAGMA_S_NEG_ONE;
    float *h_A, *h_A2, *h_B, *h_X, *h_R, *tau, *hwork, tmp[1];
    magmaFloat_ptr d_A, d_B;

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2;
    magma_int_t lda, ldb, ldda, lddb, lworkgpu, lhwork;
    magma_int_t size[7] = {1024,2048,3072,4032,5184,6016,7000};

    magma_int_t i, info, min_mn, nb, l1, l2;
    magma_int_t ione     = 1;
    magma_int_t nrhs     = 3;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
            else if (strcmp("-nrhs", argv[i])==0)
                nrhs = atoi(argv[++i]);
        }
        if (N>0 && M>0 && M >= N)
            printf("  testing_sgeqrs_gpu -nrhs %d -M %d -N %d\n\n", nrhs, M, N);
        else
            {
                printf("\nUsage: \n");
                printf("  testing_sgeqrs_gpu -nrhs %d  -M %d  -N %d\n\n", nrhs, M, N);
                printf("  M has to be >= N, exit.\n");
                exit(1);
            }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_sgeqrs_gpu -nrhs %d  -M %d  -N %d\n\n", nrhs, 1024, 1024);
        M = N = size[6];
    }

    ldda   = ((M+31)/32)*32;
    lddb   = ldda;
    n2     = M * N;
    min_mn = min(M, N);
    nb     = magma_get_sgeqrf_nb(M);
    lda = ldb = M;
    lworkgpu = (M-N + nb)*(nrhs+2*nb);

    /* Allocate host memory for the matrix */
    TESTING_MALLOC_PIN( tau,  float, min_mn   );
    TESTING_MALLOC_PIN( h_A,  float, lda*N    );
    TESTING_MALLOC_PIN( h_A2, float, lda*N    );
    TESTING_MALLOC_PIN( h_B,  float, ldb*nrhs );
    TESTING_MALLOC_PIN( h_X,  float, ldb*nrhs );
    TESTING_MALLOC_PIN( h_R,  float, ldb*nrhs );

    TESTING_MALLOC_DEV( d_A, float, ldda*N      );
    TESTING_MALLOC_DEV( d_B, float, lddb*nrhs   );

    /*
     * Get size for host workspace
     */
    lhwork = -1;
    lapackf77_sgeqrf(&M, &N, h_A, &M, tau, tmp, &lhwork, &info);
    l1 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
    lhwork = -1;
    lapackf77_sormqr( MagmaLeftStr, MagmaTransStr,
                      &M, &nrhs, &min_mn, h_A, &lda, tau,
                      h_X, &ldb, tmp, &lhwork, &info);
    l2 = (magma_int_t)MAGMA_S_REAL( tmp[0] );
    lhwork = max( max( l1, l2 ), lworkgpu );

    TESTING_MALLOC_PIN( hwork, float, lhwork );

    printf("\n");
    printf("                                         ||b-Ax|| / (N||A||)\n");
    printf("  M     N    CPU GFlop/s   GPU GFlop/s      CPU      GPU    \n");
    printf("============================================================\n");
    for(i=0; i<7; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn= min(M, N);
        ldb = lda = M;
        n2    = lda*N;
        ldda  = ((M+31)/32)*32;
        gflops = (FLOPS_GEQRF( (float)M, (float)N )
                 + FLOPS_GEQRS( (float)M, (float)N, (float)nrhs )) / 1e9;

        /* Initialize the matrices */
        lapackf77_slarnv( &ione, ISEED, &n2, h_A );
        lapackf77_slacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_A2, &lda );

        n2 = M*nrhs;
        lapackf77_slarnv( &ione, ISEED, &n2, h_B );
        lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_R, &ldb );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* Warm up to measure the performance */
        magma_ssetmatrix( M, N,    h_A, 0, lda, d_A, 0, ldda, queue );
        magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue );
        magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda,
                         d_B, 0, lddb, hwork, lworkgpu, &info, queue);
        
        magma_ssetmatrix( M, N,    h_A, 0, lda, d_A, 0, ldda, queue );
        magma_ssetmatrix( M, nrhs, h_B, 0, ldb, d_B, 0, lddb, queue );
        
        gpu_time = magma_wtime();
        magma_sgels_gpu( MagmaNoTrans, M, N, nrhs, d_A, 0, ldda,
                         d_B, 0, lddb, hwork, lworkgpu, &info, queue);
        gpu_time = magma_wtime() - gpu_time;
        if (info < 0)
            printf("Argument %d of magma_sgels had an illegal value.\n", -info);
        
        gpu_perf = gflops / gpu_time;

        // Get the solution in h_X
        magma_sgetmatrix( N, nrhs, d_B, 0, lddb, h_X, 0, ldb, queue );

        // compute the residual
        blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                       &c_neg_one, h_A, &lda,
                                   h_X, &ldb,
                       &c_one,     h_R, &ldb);
        matnorm = lapackf77_slange("f", &M, &N, h_A, &lda, work);

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        lapackf77_slacpy( MagmaUpperLowerStr, &M, &nrhs, h_B, &ldb, h_X, &ldb );

        cpu_time = magma_wtime();
        lapackf77_sgels( MagmaNoTransStr, &M, &N, &nrhs,
                         h_A, &lda, h_X, &ldb, hwork, &lhwork, &info);
        cpu_time = magma_wtime()-cpu_time;
        cpu_perf = gflops / cpu_time;
        if (info < 0)
          printf("Argument %d of lapackf77_sgels had an illegal value.\n", -info);

        blasf77_sgemm( MagmaNoTransStr, MagmaNoTransStr, &M, &nrhs, &N,
                       &c_neg_one, h_A2, &lda,
                                   h_X,  &ldb,
                       &c_one,     h_B,  &ldb);

        printf("%5d %5d   %6.1f       %6.1f       %7.2e   %7.2e\n",
               M, N, cpu_perf, gpu_perf,
               lapackf77_slange("f", &M, &nrhs, h_B, &M, work)/(min_mn*matnorm),
               lapackf77_slange("f", &M, &nrhs, h_R, &M, work)/(min_mn*matnorm) );

        if (argc != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE_PIN( tau );
    TESTING_FREE_PIN( h_A );
    TESTING_FREE_PIN( h_A2 );
    TESTING_FREE_PIN( h_B );
    TESTING_FREE_PIN( h_X );
    TESTING_FREE_PIN( h_R );
    TESTING_FREE_PIN( hwork );
    TESTING_FREE_DEV( d_A );
    TESTING_FREE_DEV( d_B );

    /* Shutdown */
    magma_queue_destroy( queue );
    magma_finalize();
}
예제 #12
0
/* ////////////////////////////////////////////////////////////////////////////
   -- testing any solver
*/
int main(  int argc, char** argv )
{
    magma_int_t info = 0;
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    magma_zopts zopts;
    magma_queue_t queue=NULL;
    magma_queue_create( 0, &queue );

    real_Double_t res;
    magma_z_matrix Z={Magma_CSR}, Z2={Magma_CSR}, A={Magma_CSR}, A2={Magma_CSR}, 
    AT={Magma_CSR}, AT2={Magma_CSR}, B={Magma_CSR};
    int i=1;
    TESTING_CHECK( magma_zparse_opts( argc, argv, &zopts, &i, queue ));

    B.blocksize = zopts.blocksize;
    B.alignment = zopts.alignment;

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            TESTING_CHECK( magma_zm_5stencil(  laplace_size, &Z, queue ));
        } else {                        // file-matrix test
            TESTING_CHECK( magma_z_csr_mtx( &Z,  argv[i], queue ));
        }

        printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n",
                (long long) Z.num_rows, (long long) Z.num_cols, (long long) Z.nnz );
        
        // convert to be non-symmetric
        TESTING_CHECK( magma_zmconvert( Z, &A, Magma_CSR, Magma_CSRL, queue ));
        TESTING_CHECK( magma_zmconvert( Z, &B, Magma_CSR, Magma_CSRU, queue ));

        // transpose
        TESTING_CHECK( magma_zmtranspose( A, &AT, queue ));

        // quite some conversions
                    
        //ELL
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELL, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELL, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //ELLPACKT
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELLPACKT, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELLPACKT, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //ELLRT
        AT2.blocksize = 8;
        AT2.alignment = 8;
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELLRT, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELLRT, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //SELLP
        AT2.blocksize = 8;
        AT2.alignment = 8;
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_SELLP, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_SELLP, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //ELLD
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_ELLD, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_ELLD, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //CSRCOO
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_CSRCOO, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_CSRCOO, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //CSRLIST
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_CSRLIST, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_CSRLIST, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        //CSRD
        TESTING_CHECK( magma_zmconvert( AT, &AT2, Magma_CSR, Magma_CSRD, queue ));
        magma_zmfree(&AT, queue );
        TESTING_CHECK( magma_zmconvert( AT2, &AT, Magma_CSRD, Magma_CSR, queue ));
        magma_zmfree(&AT2, queue );
        
        // transpose
        TESTING_CHECK( magma_zmtranspose( AT, &A2, queue ));
        TESTING_CHECK( magma_zmdiff( A, A2, &res, queue));
        printf("%% ||A-A2||_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("%% conversion tester:  ok\n");
        else
            printf("%% conversion tester:  failed\n");
        
        TESTING_CHECK( magma_zmlumerge( A2, B, &Z2, queue ));

        TESTING_CHECK( magma_zmdiff( Z, Z2, &res, queue));        
        printf("%% ||Z-Z2||_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("%% LUmerge tester:  ok\n");
        else
            printf("%% LUmerge tester:  failed\n");

        magma_zmfree(&A, queue );
        magma_zmfree(&A2, queue );
        magma_zmfree(&AT, queue );
        magma_zmfree(&AT2, queue );
        magma_zmfree(&B, queue );
        magma_zmfree(&Z2, queue );
        magma_zmfree(&Z, queue );

        i++;
    }

    magma_queue_destroy( queue );
    TESTING_CHECK( magma_finalize() );
    return info;
}
예제 #13
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgeqrf
*/
int main( magma_int_t argc, char** argv) 
{
    magma_int_t nquarkthreads=2;
    magma_int_t nthreads=2;
    magma_int_t num_gpus  = 1;
    TRACE = 0;

    //magma_qr_params mp;

    cuDoubleComplex *h_A, *h_R, *h_work, *tau;
    double gpu_perf, cpu_perf, flops;

    magma_timestr_t start, end;

    magma_qr_params *mp = (magma_qr_params*)malloc(sizeof(magma_qr_params));

    /* Matrix size */
    magma_int_t M=0, N=0, n2;
    magma_int_t size[10] = {1024,2048,3072,4032,5184,6016,7040,8064,9088,10112};

    cublasStatus status;
    magma_int_t i, j, info;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    mp->nb=-1;
    mp->ob=-1;
    mp->fb=-1;
    mp->ib=32;

    magma_int_t loop = argc;
    magma_int_t accuracyflag = 1;

    char precision;

    magma_int_t nc = -1;
    magma_int_t ncps = -1;

    if (argc != 1)
      {
    for(i = 1; i<argc; i++){      
      if (strcmp("-N", argv[i])==0)
        N = atoi(argv[++i]);
      else if (strcmp("-M", argv[i])==0)
        M = atoi(argv[++i]);
      else if (strcmp("-F", argv[i])==0)
        mp->fb = atoi(argv[++i]);
      else if (strcmp("-O", argv[i])==0)
        mp->ob = atoi(argv[++i]);
      else if (strcmp("-B", argv[i])==0)
        mp->nb = atoi(argv[++i]);
      else if (strcmp("-b", argv[i])==0)
        mp->ib = atoi(argv[++i]);
      else if (strcmp("-A", argv[i])==0)
        accuracyflag = atoi(argv[++i]);
      else if (strcmp("-P", argv[i])==0)
        nthreads = atoi(argv[++i]);
      else if (strcmp("-Q", argv[i])==0)
        nquarkthreads = atoi(argv[++i]);
      else if (strcmp("-nc", argv[i])==0)
        nc = atoi(argv[++i]);
      else if (strcmp("-ncps", argv[i])==0)
        ncps = atoi(argv[++i]);
    }
    
    if ((M>0 && N>0) || (M==0 && N==0)) 
      {
        printf("  testing_zgeqrf-v2 -M %d -N %d\n\n", M, N);
        if (M==0 && N==0) {
          M = N = size[9];
          loop = 1;
        }
      } 
    else 
      {
        printf("\nUsage: \n");
        printf("  Make sure you set the number of BLAS threads to 1, e.g.,\n");
        printf("   > setenv MKL_NUM_THREADS 1\n");
        printf("   > testing_zgeqrf-v2 -M %d -N %d -B 128 -T 1\n\n", 1024, 1024);
        exit(1);
      }
      } 
    else 
      {
    printf("\nUsage: \n");
    printf("  Make sure you set the number of BLAS threads to 1, e.g.,\n");
        printf("   > setenv MKL_NUM_THREADS 1\n");
        printf("  Set number of cores per socket and number of cores.\n");
    printf("   > testing_zgeqrf-v2 -M %d -N %d -ncps 6 -nc 12\n\n", 1024, 1024);
        printf("  Alternatively, set:\n");
        printf("  Q:  Number of threads for panel factorization.\n");
        printf("  P:  Number of threads for trailing matrix update (CPU).\n");
        printf("  B:  Block size.\n");
        printf("  b:  Inner block size.\n");
        printf("  O:  Block size for trailing matrix update (CPU).\n");
    printf("   > testing_zgeqrf-v2 -M %d -N %d -Q 4 -P 4 -B 128 -b 32 -O 200\n\n", 10112, 10112);
    M = N = size[9];
      }

    /* Auto tune based on number of cores and number of cores per socket if provided */
    if ((nc > 0) && (ncps > 0)) {
      precision = 's';
      #if (defined(PRECISION_d))
        precision = 'd';
      #endif
      #if (defined(PRECISION_c))
        precision = 'c';
      #endif
      #if (defined(PRECISION_z))
        precision = 'z';
      #endif
            
      auto_tune('q', precision, nc, ncps, M, N,
                &(mp->nb), &(mp->ob), &(mp->ib), &nthreads, &nquarkthreads);
          
fprintf(stderr,"%d %d %d %d %d\n",mp->nb,mp->ob,mp->ib,nquarkthreads,nthreads);
          
    }       

    /* Initialize MAGMA hardware context, seeting how many CPU cores
       and how many GPUs to be used in the consequent computations  */
    mp->sync0 = 0;
    magma_context *context;
    context = magma_init((void*)(mp),cpu_thread, nthreads, nquarkthreads, num_gpus, argc, argv);
    context->params = (void *)(mp);

    mp->sync1 = (volatile magma_int_t *) malloc (sizeof(int)*nthreads);

    for (i = 0; i < nthreads; i++)
      mp->sync1[i] = 0;

    n2  = M * N;
    magma_int_t min_mn = min(M, N);
    magma_int_t nb = magma_get_zgeqrf_nb(min_mn);
    magma_int_t lwork = N*nb;

    /* Allocate host memory for the matrix */
    TESTING_MALLOC   ( h_A  , cuDoubleComplex, n2    );
    TESTING_MALLOC   ( tau  , cuDoubleComplex, min_mn);
    TESTING_HOSTALLOC( h_R  , cuDoubleComplex, n2    );
    TESTING_HOSTALLOC(h_work, cuDoubleComplex, lwork );

    printf("\n\n");
    printf("  M     N   CPU GFlop/s   GPU GFlop/s    ||R||_F / ||A||_F\n");
    printf("==========================================================\n");
    for(i=0; i<10; i++){
        if (loop==1){
            M = N = min_mn = size[i];
            n2 = M*N;
        }

        flops = FLOPS( (double)M, (double)N ) / 1000000;

        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &M, h_R, &M );

        //magma_zgeqrf(M, N, h_R, M, tau, h_work, lwork, &info);

        for(j=0; j<n2; j++)
          h_R[j] = h_A[j];

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        magma_qr_init(mp, M, N, h_R, nthreads);

        start = get_current_time();
        magma_zgeqrf3(context, M, N, h_R, M, tau, h_work, lwork, &info);
        end = get_current_time();

        gpu_perf = flops / GetTimerValue(start, end);

    /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        start = get_current_time();
        if (accuracyflag == 1)
          lapackf77_zgeqrf(&M, &N, h_A, &M, tau, h_work, &lwork, &info);
        end = get_current_time();
        if (info < 0)
      printf("Argument %d of zgeqrf had an illegal value.\n", -info);

        cpu_perf = 4.*M*N*min_mn/(3.*1000000*GetTimerValue(start,end));
    
        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        double work[1], matnorm = 1.;
        cuDoubleComplex mone = MAGMA_Z_NEG_ONE;
        magma_int_t one = 1;

        if (accuracyflag == 1){
          matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work);
          blasf77_zaxpy(&n2, &mone, h_A, &one, h_R, &one);
        }

        if (accuracyflag == 1){
          printf("%5d %5d  %6.2f         %6.2f        %e\n",
                 M, N, cpu_perf, gpu_perf,
                 lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm);
        } else {
          printf("%5d %5d                %6.2f          \n",
                 M, N, gpu_perf);
        }

        if (loop != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE    ( h_A  );
    TESTING_FREE    ( tau  );
    TESTING_HOSTFREE(h_work);
    TESTING_HOSTFREE( h_R  );

    /* Shut down the MAGMA context */
    magma_finalize(context);
}
예제 #14
0
int main( int argc, char** argv)
{
    real_Double_t   gflops, magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time;
    double      magma_error, clblas_error, work[1];
    int        transA = MagmaNoTrans;
    int        transB = MagmaNoTrans;

    magma_int_t istart = 1024;
    magma_int_t iend   = 6240;
    magma_int_t M, M0 = 0;
    magma_int_t N, N0 = 0;
    magma_int_t K, K0 = 0;
    magma_int_t i;
    magma_int_t Am, An, Bm, Bn;
    magma_int_t szeA, szeB, szeC;
    magma_int_t lda, ldb, ldc, ldda, lddb, lddc;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    
    double *h_A, *h_B, *h_C, *h_C2, *h_C3;
    magmaDouble_ptr d_A, d_B, d_C;
    double c_neg_one = MAGMA_D_NEG_ONE;
    double alpha = MAGMA_D_MAKE(  0.29, -0.86 );
    double beta  = MAGMA_D_MAKE( -0.48,  0.38 );
    
    int lapack = getenv("MAGMA_RUN_LAPACK") != NULL;
    int count = 1;

    printf("\nUsage: testing_dgemm [-NN|NT|TN|TT|NC|CN|TC|CT|CC] -M m -N n -K k -count c -l\n"
            "  -l  or setting $MAGMA_RUN_LAPACK runs CPU BLAS,\n"
            "      and computes both MAGMA and CLBLAS error using CPU BLAS result.\n"
            "      Else, MAGMA error is computed using CLBLAS result.\n\n");

    for( int i = 1; i < argc; ++i ) {
        if ( strcmp("-N", argv[i]) == 0 && i+1 < argc ){
            N0 = atoi(argv[++i]);
        }
        else if ( strcmp("-M", argv[i]) == 0 && i+1 < argc ){
            M0 = atoi(argv[++i]);
        }
        else if ( strcmp("-K", argv[i]) == 0 && i+1 < argc ){
            K0 = atoi(argv[++i]);
        }
        else if (strcmp("-NN", argv[i])==0){
            transA = transB = MagmaNoTrans;
        }
        else if (strcmp("-TT", argv[i])==0){
            transA = transB = MagmaTrans;
        }
        else if (strcmp("-NT", argv[i])==0){
            transA = MagmaNoTrans;
            transB = MagmaTrans;
        }
        else if (strcmp("-TN", argv[i])==0){
            transA = MagmaTrans;
            transB = MagmaNoTrans;
        }
        else if (strcmp("-NC", argv[i])==0){
            transA = MagmaNoTrans;
            transB = MagmaTrans;
        }
        else if (strcmp("-TC", argv[i])==0){
            transA = MagmaTrans;
            transB = MagmaTrans;
        }
        else if (strcmp("-CN", argv[i])==0){
            transA = MagmaTrans;
            transB = MagmaNoTrans;
        }
        else if (strcmp("-CT", argv[i])==0){
            transA = MagmaTrans;
            transB = MagmaTrans;
        }
        else if (strcmp("-CC", argv[i])==0){
            transA = transB = MagmaTrans;
        }
        else if (strcmp("-l", argv[i])==0) {
            lapack = true;
        }
        else if ( strcmp("-count", argv[i]) == 0 && i+1 < argc ){
            count = atoi(argv[++i]);
        }
        else {
            printf( "invalid argument: %s\n", argv[i] );
            exit(1);
        }
    }

    if ( (M0 != 0) && (N0 != 0) && (K0 != 0) )
        iend = istart + 1;
    
    M = N = K = iend;
    if ( M0 != 0 ) M = M0;
    if ( N0 != 0 ) N = N0;
    if ( K0 != 0 ) K = K0;
    
    if( transA == MagmaNoTrans ) {
        Am = M;
        An = K;
    }  else {
        Am = K;
        An = M;
    }
    
    if( transB == MagmaNoTrans ) {
        Bm = K;
        Bn = N;
    }  else {
        Bm = N;
        Bn = K;
    }
   
    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }


    lda = ldc = M;
    ldb = Bm;
    
    ldda = ((M+31)/32)*32;
    lddb = ((ldb+31)/32)*32;
    lddc = ldda;

    K += 32;
    M += 32;
    N += 32;

    TESTING_MALLOC_CPU( h_A,  double, lda*K );
    TESTING_MALLOC_CPU( h_B,  double, ldb*Bn );
    TESTING_MALLOC_CPU( h_C,  double, ldc*N );
    TESTING_MALLOC_CPU( h_C2, double, ldc*N );
    TESTING_MALLOC_CPU( h_C3, double, ldc*N );

    TESTING_MALLOC_DEV( d_A, double, ldda*K );
    TESTING_MALLOC_DEV( d_B, double, lddb*Bn );
    TESTING_MALLOC_DEV( d_C, double, lddc*N );

    printf("Testing transA = %c  transB = %c\n", *lapack_const(transA), *lapack_const(transB));
    printf("    M     N     K   MAGMA Gflop/s (sec)  CLBLAS Gflop/s (sec)  CPU Gflop/s (sec)  MAGMA error  CLBLAS error\n");
    printf("===========================================================================================================\n");
    for( i=istart; i<iend; i = (int)(i*1.25) ) {
        for( int cnt = 0; cnt < count; ++cnt ) {
            M = N = K = i;
            if ( M0 != 0 ) M = M0;
            if ( N0 != 0 ) N = N0;
            if ( K0 != 0 ) K = K0;
    
            if( transA == MagmaNoTrans ) {
                lda = Am = M;
                An = K;
            }  else {
                lda = Am = K;
                An = M;
            }
    
            if( transB == MagmaNoTrans ) {
                ldb = Bm = K;
                Bn = N;
            }  else {
                ldb = Bm = N;
                Bn = K;
            }
            gflops = FLOPS_DGEMM( M, N, K ) / 1e9;
            ldc = M;
    
            ldda = ((lda+31)/32)*32;
            lddb = ((ldb+31)/32)*32;
            lddc = ((ldc+31)/32)*32;
    
            szeA = lda * An;
            szeB = ldb * Bn;
            szeC = ldc * N;
    
            /* Initialize the matrices */
            lapackf77_dlarnv( &ione, ISEED, &szeA, h_A );
            lapackf77_dlarnv( &ione, ISEED, &szeB, h_B );
            lapackf77_dlarnv( &ione, ISEED, &szeC, h_C );
            
            /* =====================================================================
               Performs operation using MAGMA-BLAS
               =================================================================== */
            magma_dsetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue );
            magma_dsetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue );
            magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue );
    
            magmablas_dgemm_reduce( M, N, K,
                    alpha, d_A, 0, ldda,
                    d_B, 0, lddb,
                    beta,  d_C, 0, lddc, queue );
            magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue );
            magma_queue_sync(queue);
            
            magma_time = magma_wtime();
            magmablas_dgemm_reduce( M, N, K,
                    alpha, d_A, 0, ldda,
                    d_B, 0, lddb,
                    beta,  d_C, 0, lddc, queue );
            magma_queue_sync(queue);
            magma_time = magma_wtime() - magma_time;
            magma_perf = gflops / magma_time;
            
            magma_dgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue );
            
            /* =====================================================================
               Performs operation using CUDA-BLAS
               =================================================================== */
            magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue );
            
            magma_dgemm( transA, transB, M, N, K,
                         alpha, d_A, 0, ldda,
                                d_B, 0, lddb,
                         beta,  d_C, 0, lddc, queue );
            magma_dsetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue );
            magma_queue_sync(queue);
            
            clblas_time = magma_wtime();
            magma_dgemm( transA, transB, M, N, K,
                         alpha, d_A, 0, ldda,
                                d_B, 0, lddb,
                         beta,  d_C, 0, lddc, queue );
            magma_queue_sync(queue);
            clblas_time = magma_wtime() - clblas_time;
            clblas_perf = gflops / clblas_time;
            
            magma_dgetmatrix( M, N, d_C, 0, lddc, h_C3, 0, ldc, queue );
            
            /* =====================================================================
               Performs operation using BLAS
               =================================================================== */
            if ( lapack ) {
                cpu_time = magma_wtime();
                blasf77_dgemm( lapack_const(transA), lapack_const(transB), &M, &N, &K,
                               &alpha, h_A, &lda,
                                       h_B, &ldb,
                               &beta,  h_C, &ldc );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
            }
            
            /* =====================================================================
               Error Computation and Performance Compariosn
               =================================================================== */
            if ( lapack ) {
                // compare both magma & clblas to lapack
                blasf77_daxpy(&szeC, &c_neg_one, h_C, &ione, h_C2, &ione);
                magma_error = lapackf77_dlange("M", &M, &N, h_C2, &ldc, work);
                
                blasf77_daxpy(&szeC, &c_neg_one, h_C, &ione, h_C3, &ione);
                clblas_error = lapackf77_dlange("M", &M, &N, h_C3, &ldc, work);
                
                printf("%5d %5d %5d   %7.2f (%7.4f)    %7.2f (%7.4f)   %7.2f (%7.4f)    %8.2e     %8.2e\n",
                       (int) M, (int) N, (int) K,
                       magma_perf, magma_time, clblas_perf, clblas_time, cpu_perf, cpu_time,
                       magma_error, clblas_error );
            }
            else {
                // compare magma to clblas
                blasf77_daxpy(&szeC, &c_neg_one, h_C3, &ione, h_C2, &ione);
                magma_error = lapackf77_dlange("M", &M, &N, h_C2, &ldc, work);
                
                printf("%5d %5d %5d   %7.2f (%7.4f)    %7.2f (%7.4f)     ---   (  ---  )    %8.2e     ---\n",
                       (int) M, (int) N, (int) K,
                       magma_perf, magma_time, clblas_perf, clblas_time,
                       magma_error );
            }
        }
        if ( count > 1 ) {
            printf( "\n" );
        }
    }

    /* Memory clean up */
    TESTING_FREE_CPU( h_A );
    TESTING_FREE_CPU( h_B );
    TESTING_FREE_CPU( h_C );
    TESTING_FREE_CPU( h_C2 );
    TESTING_FREE_CPU( h_C3 );

    TESTING_FREE_DEV( d_A );
    TESTING_FREE_DEV( d_B );
    TESTING_FREE_DEV( d_C );

    magma_queue_destroy( queue );
    magma_finalize();
}
예제 #15
0
int main( int argc, char** argv)
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    magmaFloatComplex *h_R = NULL, *h_P = NULL;
    magmaFloatComplex_ptr d_lA[MagmaMaxSubs * MagmaMaxGPUs];
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] =
        { 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000 };
    
    magma_int_t i, j, k, check = 0, info;
    magmaFloatComplex mz_one = MAGMA_C_NEG_ONE;
    magma_int_t ione     = 1;
   
    magma_int_t num_gpus0 = 1, num_gpus, num_subs0 = 1, num_subs, tot_subs, flag = 0;
    int nb, n_local, nk;

    magma_uplo_t uplo = MagmaLower;

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i]) == 0){
                N = atoi(argv[++i]);
                if (N > 0) {
                    size[0] = size[9] = N;
                    flag = 1;
                }
            }
            if(strcmp("-NGPU", argv[i]) == 0)
                num_gpus0 = atoi(argv[++i]);
            if(strcmp("-NSUB", argv[i]) == 0)
                num_subs0 = atoi(argv[++i]);
            if(strcmp("-UPLO", argv[i]) == 0)
                uplo = (strcmp("L", argv[++i]) == 0 ? MagmaLower :  MagmaUpper);
            if(strcmp("-check", argv[i]) == 0)
                check = 1;
        }
    }

    /* Initialize */
    magma_queue_t  queues[2*MagmaMaxGPUs];
    magma_device_t devices[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    for(i=0;i<num_gpus0;i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", err );
            exit(-1);
        }
    }

    printf("\nUsing %d GPUs:\n", num_gpus0);
    printf("  testing_cpotrf_msub -N %d -NGPU %d -NSUB %d -UPLO %c %s\n\n", size[0], num_gpus0,num_subs0,
           (uplo == MagmaLower ? 'L' : 'U'),(check == 1 ? "-check" : " "));

    printf("  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F\n");
    printf("========================================================================================\n");
    for(i=0; i<10; i++){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        gflops = FLOPS_CPOTRF( N ) / 1e9;;
        nb = magma_get_cpotrf_nb(N);
        if (num_subs0*num_gpus0 > N/nb) {
            num_gpus = N/nb;
            num_subs = 1;
            if(N%nb != 0) num_gpus ++;
            printf("too many GPUs for the matrix size, using %d GPUs\n", (int)num_gpus);
        } else {
            num_gpus = num_gpus0;
            num_subs = num_subs0;
        }
        tot_subs = num_subs * num_gpus;
        
        /* Allocate host memory for the matrix */
        #ifdef USE_PINNED_CLMEMORY
        cl_mem buffer1 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, n2*sizeof(magmaFloatComplex), NULL, NULL);
        cl_mem buffer2 = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, lda*nb*sizeof(magmaFloatComplex), NULL, NULL);
        for (k=0; k<num_gpus; k++) {
            h_R = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 
                                                          n2*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL);
            h_P = (magmaFloatComplex*)clEnqueueMapBuffer(queues[2*k], buffer2, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 
                                                          lda*nb*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL);
        }
        #else
        TESTING_MALLOC_PIN( h_P, magmaFloatComplex, lda*nb );
        TESTING_MALLOC_PIN( h_R, magmaFloatComplex, n2     );
        #endif
        /* Initialize the matrix */
        init_matrix( N, h_R, lda );

        /* Allocate GPU memory */
        if (uplo == MagmaUpper) {
            ldda    = ((N+nb-1)/nb)*nb;    
            n_local = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb;
        } else {
            ldda    = ((N+nb*tot_subs-1)/(nb*tot_subs))*nb;
            n_local = ((N+nb-1)/nb)*nb;
        }
        for (j=0; j<tot_subs; j++) {
            TESTING_MALLOC_DEV( d_lA[j], magmaFloatComplex, n_local*ldda );
        }

        /* Warm up to measure the performance */
        /* distribute matrix to gpus */
        if (uplo == MagmaUpper) {
            for (j=0; j<N; j+=nb) {
                k = (j/nb)%tot_subs;
                nk = min(nb, N-j);
                magma_csetmatrix(j+nk, nk, 
                                 &h_R[j*lda], 0, lda,
                                 d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, 
                                 queues[2*(k%num_gpus)]);
            }
        } else {
            for (j=0; j<N; j+=nb) {
                nk = min(nb, N-j);
                for (int kk = 0; kk<tot_subs; kk++) {
                    int mk = 0;
                    for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) {
                        int mii = min(nb, N-ii);
                        lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda );
                        mk += mii;
                    }
                    k = ((j+kk*nb)/nb)%tot_subs;
                    if (mk > 0 && nk > 0) {
                        magma_csetmatrix(mk, nk, 
                                         h_P, 0, lda,
                                         d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, 
                                         queues[2*(k%num_gpus)]);
                    }
                }
            }
            /*for (j=0; j<N; j+=nb) {
                k = (j/nb)%tot_subs;
                nk = min(nb, N-j);
                magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda,
                                    d_lA[k], j/(nb*tot_subs)*nb, ldda,
                                    queues[2*(k%num_gpus)]);
            }*/
        }
        magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        /* distribute matrix to gpus */
        if (uplo == MagmaUpper) {
            for (j=0; j<N; j+=nb) {
                k = (j/nb)%tot_subs;
                nk = min(nb, N-j);
                magma_csetmatrix(j+nk, nk, 
                                 &h_R[j*lda], 0, lda,
                                 d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda, 
                                 queues[2*(k%num_gpus)]);
            }
        } else {
            for (j=0; j<N; j+=nb) {
                nk = min(nb, N-j);
                for (int kk = 0; kk<tot_subs; kk++) {
                    int mk = 0;
                    for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) {
                        int mii = min(nb, N-ii);
                        lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_R[ii+j*lda], &lda, &h_P[mk], &lda );
                        mk += mii;
                    }
                    k = ((j+kk*nb)/nb)%tot_subs;
                    if (mk > 0 && nk > 0) {
                        magma_csetmatrix(mk, nk, 
                                         h_P, 0, lda,
                                         d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, 
                                         queues[2*(k%num_gpus)]);
                    }
                }
            }
            /*for (j=0; j<N; j+=nb) {
                k = (j/nb)%tot_subs;
                nk = min(nb, N-j);
                magma_csetmatrix(nk, j+nk, &h_R[j], 0, lda,
                                    d_lA[k], (j/(nb*tot_subs)*nb), ldda,
                                    queues[2*(k%num_gpus)]);
            }*/
        }
    
        gpu_time = magma_wtime();
        magma_cpotrf_msub( num_subs, num_gpus, uplo, N, d_lA, 0, ldda, &info, queues );
        gpu_time = magma_wtime() - gpu_time;
        gpu_perf = gflops / gpu_time;
        if (info != 0)
            printf( "magma_cpotrf had error %d.\n", info );
       
        /* gather matrix from gpus */
        if (uplo==MagmaUpper) {
            for (j=0; j<N; j+=nb) {
                k = (j/nb)%tot_subs;
                nk = min(nb, N-j);
                magma_cgetmatrix(j+nk, nk,
                                 d_lA[k], j/(nb*tot_subs)*nb*ldda, ldda,
                                 &h_R[j*lda], 0, lda, queues[2*(k%num_gpus)]);
            }
        } else {
            for (j=0; j<N; j+=nb) {
                nk = min(nb, N-j);
                for (int kk = 0; kk<tot_subs; kk++) {
                    k = ((j+kk*nb)/nb)%tot_subs;
                    int mk = 0;
                    mk = 0;
                    for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) {
                        mk += min(nb, N-ii);
                    }
                    if (mk > 0 && nk > 0) {
                        magma_cgetmatrix(mk, nk, 
                                         d_lA[k], j*ldda+(j+kk*nb)/(nb*tot_subs)*nb, ldda, 
                                         h_P, 0, lda,
                                         queues[2*(k%num_gpus)]);
                    }
                    mk = 0;
                    for (int ii=j+kk*nb; ii<N; ii+=nb*tot_subs) {
                        int mii = min(nb, N-ii);
                        lapackf77_clacpy( MagmaFullStr, &mii, &nk, &h_P[mk], &lda, &h_R[ii+j*lda], &lda );
                        mk += mii;
                    }
                }
            }
            /*for (j=0; j<N; j+=nb) {
                k = (j/nb)%tot_subs;
                nk = min(nb, N-j);
                magma_cgetmatrix( nk, j+nk, 
                            d_lA[k], (j/(nb*tot_subs)*nb), ldda, 
                            &h_R[j], 0, lda, queues[2*(k%num_gpus)] );
            }*/
        }

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        if (check == 1) {
            float work[1], matnorm, diffnorm;
            magmaFloatComplex *h_A;
            TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 );
            init_matrix( N, h_A, lda );

            cpu_time = magma_wtime();
            if (uplo == MagmaLower) {
                lapackf77_cpotrf( MagmaLowerStr, &N, h_A, &lda, &info );
            } else {
                lapackf77_cpotrf( MagmaUpperStr, &N, h_A, &lda, &info );
            }
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            if (info != 0)
                printf( "lapackf77_cpotrf had error %d.\n", info );
        
            /* =====================================================================
               Check the result compared to LAPACK
               |R_magma - R_lapack| / |R_lapack|
               =================================================================== */
            matnorm = lapackf77_clange("f", &N, &N, h_A, &lda, work);
            blasf77_caxpy(&n2, &mz_one, h_A, &ione, h_R, &ione);
            diffnorm = lapackf77_clange("f", &N, &N, h_R, &lda, work);
            printf( "%5d     %6.2f (%6.2f)     %6.2f (%6.2f)         %e\n",
                    N, cpu_perf, cpu_time, gpu_perf, gpu_time, diffnorm / matnorm );
        
            TESTING_FREE_PIN( h_A );
        } else {
            printf( "%5d      - -     (- -)     %6.2f (%6.2f)          - -\n",
                    N, gpu_perf, gpu_time );
        }
        // free memory
        #ifdef USE_PINNED_CLMEMORY
        for (k=0; k<num_gpus; k++) {
            clEnqueueUnmapMemObject(queues[2*k], buffer1, h_R, 0, NULL, NULL);
            clEnqueueUnmapMemObject(queues[2*k], buffer2, h_P, 0, NULL, NULL);
        }
        clReleaseMemObject(buffer1);
        clReleaseMemObject(buffer2);
        #else
        TESTING_FREE_PIN( h_P );
        TESTING_FREE_PIN( h_R );
        #endif
        for (j=0; j<tot_subs; j++) {
            TESTING_FREE_DEV( d_lA[j] );
        }
        if (flag != 0)
            break;
    }

    /* clean up */
    for (i=0; i<num_gpus; i++) {
        magma_queue_destroy( queues[2*i] );
        magma_queue_destroy( queues[2*i+1] );
    }
    magma_finalize();
    return 0;
}
예제 #16
0
SEXP smagmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri)
{
	magma_init();
	int ndevices;
	ndevices = INTEGER_VALUE(ngpu);
        int idevice;
        for(idevice=0; idevice < ndevices; idevice++)
        {
                magma_setdevice(idevice);
                if(CUBLAS_STATUS_SUCCESS != cublasInit())
                {
                        printf("Error: gpu %d: cublasInit failed\n", idevice);
                        magma_finalize();
                        exit(-1);
                }
        }
//	magma_print_devices();
	
	int In, INB;
	In = INTEGER_VALUE(n);
	INB = INTEGER_VALUE(NB);
	double *PA = NUMERIC_POINTER(A);
	float *sPA = calloc(In*In, sizeof(float));
	int i,j;
	for(i = 0; i < In; i++)
        {
                for(j = 0; j < In; j++)
                {
                        sPA[i*In + j] = (float) PA[i*In + j];
                }
        }
	magma_int_t N, status, info, nGPUs;
	N = In;
	status = 0;
	nGPUs = ndevices;
	
	//INB = magma_get_dpotrf_nb(N);
//	INB = 224;
//	printf("INB = %d\n", INB);
	//ngpu = ndevices;
//	printf("ngpu = %d\n", ngpu);
	//max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB);
//	printf("max_size = %d\n", max_size);
	//int imax_size = max_size;
	//double *dA;
	//magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double));
	
	//ldda = (1+N/(INB*ndevices))*INB;
//	printf("ldda = %d\n", ldda);
	//magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB);
	//magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info);
	int lTri;
	lTri = INTEGER_VALUE(lowerTri);
	if(lTri)
		magma_spotrf_m(nGPUs, MagmaLower, N, sPA, N, &info);
	else
		magma_spotrf_m(nGPUs, MagmaUpper, N, sPA, N, &info);
	if(info != 0)
	{
		printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror(info));
	}
	
	//magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB);
	//for(dev = 0; dev < ndevices; dev++)
	//{
		//magma_setdevice(dev);
		//cudaFree(dA[dev]);
	//}
	magma_finalize();
	cublasShutdown();
	
	//caste sPA back to double and set upper or lower triangle to zero if necessary:
	int IZeroTri = INTEGER_VALUE(zeroTri);
        int zeroUTri = IZeroTri & lTri;
        int zeroLTri = IZeroTri & !lTri;
        if(!IZeroTri) {
                for(i = 1; i< In; i++) {
                        for(j=1; j < In; j++) {
                                PA[i*In + j] = (double) sPA[i*In + j];
                        }
        }
        } else if(zeroUTri) {
                for(i = 1; i< In; i++) {
                        for(j=1; j < In; j++) {
                                if(i > j)
                                        PA[i*In + j] = 0;
                                else
                                        PA[i*In + j] = (double) sPA[i*In + j];
                        }
                }
        } else {
                for(i = 1; i< In; i++) {
                        for(j=1; j < In; j++) {
                                if(i < j)
                                        PA[i*In + j] = 0;
                                else
                                        PA[i*In + j] = (double) sPA[i*In + j];
                        }
                }
        }
	
	UNPROTECT(1);
	free(sPA);
	return(R_NilValue);
}
예제 #17
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf
*/
int main( int argc, char** argv)
{
    real_Double_t    gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex_ptr d_A;
    magma_int_t     *ipiv;

    /* Matrix size */
    magma_int_t M = 0, N = 0, n2, lda, ldda;
#if defined (PRECISION_z)
    magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7000,7000,7000,7000};
#else
    magma_int_t size[10] = {1024,2048,3072,4032,4992,5952,7104,8064,9000,10000};
#endif
    magma_int_t i, info, min_mn;
    //magma_int_t nb, maxn, ret;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-M", argv[i])==0)
                M = atoi(argv[++i]);
        }
        if (M>0 && N>0)
            printf("  testing_zgetrf -M %d -N %d\n\n", M, N);
        else
            {
                printf("\nUsage: \n");
                printf("  testing_zgetrf -M %d -N %d\n\n", 1024, 1024);
                exit(1);
            }
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zgetrf_gpu -M %d -N %d\n\n", 1024, 1024);
        M = N = size[9];
    }

    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;

    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    
    ldda   = ((M+31)/32)*32;
    //maxn   = ((N+31)/32)*32;
    n2     = M * N;
    min_mn = min(M, N);
    //nb     = magma_get_zgetrf_nb(min_mn);

    /* Allocate host memory for the matrix */
    TESTING_MALLOC_CPU( ipiv, magma_int_t,        min_mn );
    TESTING_MALLOC_CPU( h_A,  magmaDoubleComplex, n2     );
    TESTING_MALLOC_PIN( h_R,  magmaDoubleComplex, n2     );
    TESTING_MALLOC_DEV( d_A,  magmaDoubleComplex, ldda*N );

    printf("\n\n");
    printf("  M     N    CPU GFlop/ (sec)s   GPU GFlop/s (sec)   ||PA-LU||/(||A||*N)\n");
    printf("========================================================================\n");
    for(i=0; i<10; i++){
        if (argc == 1){
            M = N = size[i];
        }
        min_mn= min(M, N);
        lda   = M;
        n2    = lda*N;
        ldda  = ((M+31)/32)*32;
        gflops = FLOPS( (double)M, (double)N ) *1e-9;

        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        lapackf77_zlacpy( MagmaUpperLowerStr, &M, &N, h_A, &lda, h_R, &lda );

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = magma_wtime();
        lapackf77_zgetrf(&M, &N, h_A, &lda, ipiv, &info);
        cpu_time = magma_wtime() - cpu_time;
        if (info < 0)
            printf("Argument %d of zgetrf had an illegal value.\n", -info);

        cpu_perf = gflops / cpu_time;

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue );
        magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue );

        magma_zsetmatrix( M, N, h_R, 0, lda, d_A, 0, ldda, queue );
        gpu_time = magma_wtime();
        magma_zgetrf_gpu( M, N, d_A, 0, ldda, ipiv, &info, queue );
        gpu_time = magma_wtime() - gpu_time;
        if (info < 0)
            printf("Argument %d of zgetrf had an illegal value.\n", -info);

        gpu_perf = gflops / gpu_time;

        /* =====================================================================
           Check the factorization
           =================================================================== */
        magma_zgetmatrix( M, N, d_A, 0, ldda, h_A, 0, lda, queue );
        error = get_LU_error(M, N, h_R, lda, h_A, ipiv);
        
        printf("%5d %5d  %6.2f (%6.2f)     %6.2f (%6.2f)      %e\n",
               M, N, cpu_perf, cpu_time, gpu_perf, gpu_time, error);

        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE_CPU( ipiv );
    TESTING_FREE_CPU( h_A );
    TESTING_FREE_PIN( h_R );
    TESTING_FREE_DEV( d_A );

    magma_queue_destroy( queue );
    magma_finalize();
}
예제 #18
0
int main( int argc, char** argv)
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;
    float      error, work[1];
    int         transA = MagmaNoTrans;
    int         transB = MagmaNoTrans;
    float Cnorm;

    magma_int_t istart = 1024;
    magma_int_t iend   = 8194;
    magma_int_t M, M0 = 0;
    magma_int_t N, N0 = 0;
    magma_int_t K, K0 = 0;
    magma_int_t i;
    magma_int_t Am, An, Bm, Bn;
    magma_int_t szeA, szeB, szeC;
    magma_int_t lda, ldb, ldc, ldda, lddb, lddc;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    
    magmaFloatComplex *h_A, *h_B, *h_C, *h_C2;
    magmaFloatComplex_ptr d_A, d_B, d_C;
    magmaFloatComplex mzone = MAGMA_C_NEG_ONE;
    magmaFloatComplex alpha = MAGMA_C_MAKE(  0.29, -0.86 );
    magmaFloatComplex beta  = MAGMA_C_MAKE( -0.48,  0.38 );

    if (argc != 1){
        for(i=1; i<argc; i++){
            if ( strcmp("-N", argv[i]) == 0 ){
                N0 = atoi(argv[++i]);
            }
            else if ( strcmp("-M", argv[i]) == 0 ){
                M0 = atoi(argv[++i]);
            }
            else if ( strcmp("-K", argv[i]) == 0 ){
                K0 = atoi(argv[++i]);
            }
            else if (strcmp("-NN", argv[i])==0){
                transA = transB = MagmaNoTrans;
            }
            else if (strcmp("-TT", argv[i])==0){
                transA = transB = MagmaTrans;
            }
            else if (strcmp("-NT", argv[i])==0){
                transA = MagmaNoTrans;
                transB = MagmaTrans;
            }
            else if (strcmp("-TN", argv[i])==0){
                transA = MagmaTrans;
                transB = MagmaNoTrans;
            }
#if defined(PRECISION_z) || defined(PRECISION_c)
            else if (strcmp("-NC", argv[i])==0){
                transA = MagmaNoTrans;
                transB = MagmaConjTrans;
            }
            else if (strcmp("-TC", argv[i])==0){
                transA = MagmaTrans;
                transB = MagmaConjTrans;
            }
            else if (strcmp("-CN", argv[i])==0){
                transA = MagmaConjTrans;
                transB = MagmaNoTrans;
            }
            else if (strcmp("-CT", argv[i])==0){
                transA = MagmaConjTrans;
                transB = MagmaTrans;
            }
            else if (strcmp("-CC", argv[i])==0){
                transA = transB = MagmaConjTrans;
            }
#endif
        }
    }

    if ( (M0 != 0) && (N0 != 0) && (K0 != 0) )
        iend = istart + 1;
    
    M = N = K = iend;
    if ( M0 != 0 ) M = M0;
    if ( N0 != 0 ) N = N0;
    if ( K0 != 0 ) K = K0;
    
    if( transA == MagmaNoTrans ) {
        Am = M;
        An = K;
    }  else {
        Am = K;
        An = M;
    }
    
    if( transB == MagmaNoTrans ) {
        Bm = K;
        Bn = N;
    }  else {
        Bm = N;
        Bn = K;
    }
    
    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }

    lda = ldc = M;
    ldb = Bm;
    
    ldda = lddc = ((M+31)/32)*32;
    lddb = ((ldb+31)/32)*32;

    K+=32;
    M+=32;
    N +=32;

    TESTING_MALLOC_CPU( h_A,  magmaFloatComplex, lda*K );
    TESTING_MALLOC_CPU( h_B,  magmaFloatComplex, ldb*Bn );
    TESTING_MALLOC_CPU( h_C,  magmaFloatComplex, ldc*N );
    TESTING_MALLOC_CPU( h_C2, magmaFloatComplex, ldc*N );

    TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*K );
    TESTING_MALLOC_DEV( d_B, magmaFloatComplex, lddb*Bn );
    TESTING_MALLOC_DEV( d_C, magmaFloatComplex, lddc*N );

    printf("\nUsage: \n");
    printf("  testing_cgemm [-NN|NT|TN|TT] [-N %d] \n\n", 1024);

    printf("\n");
    printf("Testing transA = %c  transB = %c\n", transA, transB);
    printf("    M    N    K   clAmdBlas GFLop/s (sec)    CPU GFlop/s (sec)     error\n");
    printf("===========================================================================\n");
    for(i=istart; i<iend; i = (int)(i*1.25) )
    {
        M = N = K = i;
        if ( M0 != 0 ) M = M0;
        if ( N0 != 0 ) N = N0;
        if ( K0 != 0 ) K = K0;

        if( transA == MagmaNoTrans ) {
            lda = Am = M;
            An = K;
        }  else {
            lda = Am = K;
            An = M;
        }

        if( transB == MagmaNoTrans ) {
            ldb = Bm = K;
            Bn = N;
        }  else {
            ldb = Bm = N;
            Bn = K;
        }
        gflops = FLOPS( (float)M, (float)N, (float)K ) * 1e-9;
        ldc = M;

        ldda = ((lda+31)/32)*32;
        lddb = ((ldb+31)/32)*32;
        lddc = ((ldc+31)/32)*32;

        szeA = lda * An;
        szeB = ldb * Bn;
        szeC = ldc * N;

        /* Initialize the matrices */
        lapackf77_clarnv( &ione, ISEED, &szeA, h_A );
        lapackf77_clarnv( &ione, ISEED, &szeB, h_B );
        lapackf77_clarnv( &ione, ISEED, &szeC, h_C );
        
        /* =====================================================================
           Performs operation using MAGMA-BLAS
           =================================================================== */
        magma_csetmatrix( Am, An, h_A, 0, lda, d_A, 0, ldda, queue );
        magma_csetmatrix( Bm, Bn, h_B, 0, ldb, d_B, 0, lddb, queue );
        magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue );
    
        magma_cgemm( transA, transB, M, N, K,
                     alpha, d_A, 0, ldda,
                     d_B, 0, lddb,
                     beta,  d_C, 0, lddc, queue );
        magma_csetmatrix( M, N, h_C, 0, ldc, d_C, 0, lddc, queue );
        magma_queue_sync( queue );

        gpu_time = magma_wtime();
        magma_cgemm( transA, transB, M, N, K,
                     alpha, d_A, 0, ldda,
                     d_B, 0, lddb,
                     beta,  d_C, 0, lddc, queue );
        magma_queue_sync( queue);
        gpu_time = magma_wtime() - gpu_time;
        gpu_perf = gflops / gpu_time;
        
        magma_cgetmatrix( M, N, d_C, 0, lddc, h_C2, 0, ldc, queue );
        
        /* =====================================================================
           Performs operation using CPU-BLAS
           =================================================================== */

        cpu_time = magma_wtime();
        blasf77_cgemm( lapack_const(transA), lapack_const(transB),
                       &M, &N, &K,
                       &alpha, h_A, &lda,
                       h_B, &ldb,
                       &beta,  h_C, &ldc );
        cpu_time = magma_wtime() - cpu_time;
        cpu_perf = gflops / cpu_time;
        
        // |C_magma - C_lapack| / |C_lapack|
        Cnorm = lapackf77_clange( "M", &M, &N, h_C, &ldc, work );

        /* =====================================================================
           Error Computation and Performance Compariosn
           =================================================================== */
        blasf77_caxpy(&szeC, &mzone, h_C, &ione, h_C2, &ione);
        error = lapackf77_clange("M", &M, &N, h_C2, &ldc, work)/Cnorm;
        printf("%5d %5d %5d    %8.2f (%6.2f)    %6.2f (%6.2f)    %e\n",
               M, N, K, gpu_perf, gpu_time, cpu_perf, cpu_time, error);
    }

    /* Memory clean up */
    TESTING_FREE_CPU( h_A );
    TESTING_FREE_CPU( h_B );
    TESTING_FREE_CPU( h_C );
    TESTING_FREE_CPU( h_C2 );

    TESTING_FREE_DEV( d_A );
    TESTING_FREE_DEV( d_B );
    TESTING_FREE_DEV( d_C );

    magma_queue_destroy( queue );
    magma_finalize();
}
예제 #19
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgesv_gpu
*/
int main(int argc , char **argv)
{
    real_Double_t gflops, gpu_perf, gpu_time;
    double Rnorm, Anorm, Xnorm, *work;
    magmaDoubleComplex *hA, *hB, *hX;
    magmaDoubleComplex_ptr dA, dB;
    magma_int_t     *ipiv;
    magma_int_t N = 0, n2, lda, ldb, ldda, lddb;
    magma_int_t size[7] =
        { 1024, 2048, 3072, 4032, 5184, 6048, 7000};
    
    magma_int_t i, info, szeB;
    magmaDoubleComplex z_one = MAGMA_Z_ONE;
    magmaDoubleComplex mz_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t NRHS = 100;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            if (strcmp("-R", argv[i])==0)
                NRHS = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[6] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zgesv_gpu -N <matrix size> -R <right hand sides>\n\n");
    }
    
    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device[ MagmaMaxGPUs ];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    err = magma_queue_create( device[0], &queue );
    if ( err != 0 ) {
        fprintf( stderr, "magma_queue_create failed: %d\n", err );
        exit(-1);
    }
    
    /* Allocate memory for the largest matrix */
    N    = size[6];
    n2   = N * N;
    ldda = ((N+31)/32) * 32;
   // ldda = N;
    lddb = ldda;
    TESTING_MALLOC_PIN( ipiv, magma_int_t,        N         );
    TESTING_MALLOC_PIN( hA,   magmaDoubleComplex, n2        );
    TESTING_MALLOC_PIN( hB,   magmaDoubleComplex, N*NRHS    );
    TESTING_MALLOC_PIN( hX,   magmaDoubleComplex, N*NRHS    );
    TESTING_MALLOC_PIN( work, double,             N         );
    TESTING_MALLOC_DEV( dA,   magmaDoubleComplex, ldda*N    );
    TESTING_MALLOC_DEV( dB,   magmaDoubleComplex, lddb*NRHS );

    printf("\n\n");
    printf("    N   NRHS   GPU GFlop/s (sec)   ||B - AX|| / ||A||*||X||\n");
    printf("===========================================================\n");
    for( i = 0; i < 7; i++ ) {
        N   = size[i];
        lda = N;
        ldb = lda;
        n2  = lda*N;
        szeB = ldb*NRHS;
        ldda = ((N+31)/32)*32;
        //ldda = N;
        lddb = ldda;
        gflops = ( FLOPS_GETRF( (double)N, (double)N ) +
                  FLOPS_GETRS( (double)N, (double)NRHS ) ) / 1e9;

        /* Initialize the matrices */
        lapackf77_zlarnv( &ione, ISEED, &n2, hA );
        lapackf77_zlarnv( &ione, ISEED, &szeB, hB );

        /* Warm up to measure the performance */
        magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue );
        magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue );
        magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue );

        //=====================================================================
        // Solve Ax = b through an LU factorization
        //=====================================================================
        magma_zsetmatrix( N, N, hA, 0, lda, dA, 0, ldda, queue );
        magma_zsetmatrix( N, NRHS, hB, 0, lda, dB, 0, lddb, queue );
        gpu_time = magma_wtime();
        magma_zgesv_gpu( N, NRHS, dA, 0, ldda, ipiv, dB, 0, lddb, &info, queue );
        gpu_time = magma_wtime() - gpu_time;
        if (info != 0)
            printf( "magma_zposv had error %d.\n", info );

        gpu_perf = gflops / gpu_time;

        /* =====================================================================
           Residual
           =================================================================== */
        magma_zgetmatrix( N, NRHS, dB, 0, lddb, hX, 0, ldb, queue );
        Anorm = lapackf77_zlange("I", &N, &N,    hA, &lda, work);
        Xnorm = lapackf77_zlange("I", &N, &NRHS, hX, &ldb, work);

        blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &NRHS, &N,
                        &z_one,  hA, &lda,
                        hX, &ldb,
                        &mz_one, hB, &ldb );

        Rnorm = lapackf77_zlange("I", &N, &NRHS, hB, &ldb, work);

        printf( "%5d  %5d   %7.2f (%7.2f)   %8.2e\n",
                N, NRHS, gpu_perf, gpu_time, Rnorm/(Anorm*Xnorm) );

        if (argc != 1)
            break;
    }

    /* clean up */
    TESTING_FREE_PIN( hA );
    TESTING_FREE_PIN( hB );
    TESTING_FREE_PIN( hX );
    TESTING_FREE_PIN( work );
    TESTING_FREE_PIN( ipiv );
    TESTING_FREE_DEV( dA );
    TESTING_FREE_DEV( dB );
    magma_queue_destroy( queue );
    magma_finalize();
}
예제 #20
0
int main( int argc, char** argv )
{
    magma_init();
    cublasHandle_t handle;
    cudaSetDevice( 0 );
    cublasCreate( &handle );
    
    magmaFloatComplex *A, *B, *C;
    magmaFloatComplex *dA, *dB, *dC;
    float error, work[1];
    magmaFloatComplex c_one     = MAGMA_C_ONE;
    magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = { 1, 2, 3, 4 };
    magma_int_t n, lda, ldda, size, info;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = opts.tolerance * lapackf77_slamch("E");
    
    printf("    N   |dC - C|/|C|\n");
    printf("====================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // for this simple case, all matrices are N-by-N
            n = opts.nsize[itest];
            lda = n;
            ldda = ((n+31)/32)*32;
            
            magma_cmalloc_cpu( &A, lda*n );
            magma_cmalloc_cpu( &B, lda*n );
            magma_cmalloc_cpu( &C, lda*n );
            magma_cmalloc( &dA, ldda*n );
            magma_cmalloc( &dB, ldda*n );
            magma_cmalloc( &dC, ldda*n );
            
            // initialize matrices
            size = lda*n;
            lapackf77_clarnv( &ione, ISEED, &size, A );
            lapackf77_clarnv( &ione, ISEED, &size, B );
            lapackf77_clarnv( &ione, ISEED, &size, C );
            // increase diagonal to be SPD
            for( int i=0; i < n; ++i ) {
                C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 ));
            }
            
            magma_csetmatrix( n, n, A, lda, dA, ldda );
            magma_csetmatrix( n, n, B, lda, dB, ldda );
            magma_csetmatrix( n, n, C, lda, dC, ldda );
            
            // compute with cublas
            cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                         &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda );
            
            magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info );
            if (info != 0)
                printf("magma_cpotrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            // compute with LAPACK
            blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n,
                           &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda );
            
            lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info );
            if (info != 0)
                printf("lapackf77_cpotrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            // compute difference, |dC - C| / |C|
            magma_cgetmatrix( n, n, dC, ldda, A, lda );
            blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione );
            error = lapackf77_clange( "F", &n, &n, A, &lda, work )
                  / lapackf77_clange( "F", &n, &n, C, &lda, work );
            printf( "%5d   %8.2e   %s\n",
                    (int) n, error, (error < tol ? "ok" : "failed"));
            status += ! (error < tol);
            
            magma_free( dA );
            magma_free( dB );
            magma_free( dC );
            magma_free_cpu( A );
            magma_free_cpu( B );
            magma_free_cpu( C );
            fflush( stdout );
        }
    }
    
    cublasDestroy( handle );
    magma_finalize();
    return status;
}
예제 #21
0
int main( int argc, char **argv ) {
  printf("Starting\n");
  int size;
  cudaError_t cudaStat;
  magma_err_t magmaStat;
  cublasStatus_t stat;
  cublasHandle_t handle;
  int it,i;

  cublasOperation_t N = 'N';
  cublasOperation_t T = 'T';
  char N2 = 'N';
  char T2 = 'T';
  double one = 1., zero=0.;
  char uplo = 'L';
  int info;
  
  int err; double* A; double* B;
  magmaStat = magma_init();

  int use_pinned;
  if(argc > 1) {
    use_pinned = atoi(argv[1]);
  } else use_pinned = 0;
  printf("Setting use_pinned to %d\n", use_pinned);

  for( size = 256; size <= 8192; size*=2 ) {
 
     if(use_pinned) {
       // allocate pinned memory on CPU
       err = magma_dmalloc_pinned( &A,  size*size );  assert( err == 0 );
       err = magma_dmalloc_pinned( &B,  size*size );  assert( err == 0 );
     } else {
       // allocate standard memory on CPU
       A = (double*) malloc( sizeof(double)*size*size );
       B = (double*) malloc( sizeof(double)*size*size );
     }

    cudaDeviceSynchronize();
    double tInit = read_timer();     
    double *dA,*dB;
    // allocate memory on GPU
    magma_malloc( (void**) &dA, sizeof(double)*size*size );
    magma_malloc( (void**) &dB, sizeof(double)*size*size );
    
    cudaDeviceSynchronize();
    double tAlloc = read_timer();     
 
    fillMatrix(B, size*size);
 

    cudaDeviceSynchronize();
    double tInit2 = read_timer();

    // transfer data to GPU
    magma_dsetmatrix( size, size, B, size, dB, size );

    cudaDeviceSynchronize();
    double tTransferToGPU = read_timer();

    // matrix multiply
    magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size );
    // magma_dgemm is apparently synonymous with magmablas_dgemm

    cudaDeviceSynchronize();
    double tMatMult = read_timer();
 
    // Cholesky decomposition on GPU with GPU interface (called with object on GPU)
    magma_dpotrf_gpu( 'L', size, dA, size, &info );

    cudaDeviceSynchronize();
    double tChol = read_timer();

    // transfer data back to CPU
    magma_dgetmatrix( size, size, dA, size, A, size );
    cudaDeviceSynchronize();
    double tTransferFromGPU = read_timer();
 
    // standard BLAS matrix multiply on CPU
    dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size );

    cudaDeviceSynchronize();
    double tMatMultBlas = read_timer();

    // Cholesky decomposition on GPU with CPU interface (called with object on CPU)
    magma_dpotrf( 'L', size, A, size, &info );

    cudaDeviceSynchronize();
    double tCholCpuInterface = read_timer();

    // recreate A = B * B (could just do a save and copy instead....)
    dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size );

    cudaDeviceSynchronize();
    double tInit3 = read_timer();

    // standard Lapack Cholesky decomposition on CPU
    dpotrf_(&uplo, &size, A, &size, &info);
  
    cudaDeviceSynchronize();
    double tCholCpu= read_timer();
 

    printf("====================================================\n");
    printf("Timing results for n = %d\n", size);
    printf("GPU memory allocation time: %f\n", tAlloc - tInit);
    printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2);
    printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU);
    printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU);
    printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult);
    printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas);
    printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3);
    printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol);

    if(use_pinned) {
      magma_free_pinned(A);
      magma_free_pinned(B);
    } else {
      free(A);
      free(B);
    }
    magma_free(dA);
    magma_free(dB);
 
  }
  return EXIT_SUCCESS;
}
예제 #22
0
SEXP magmaCholeskyFinal(SEXP A, SEXP n, SEXP NB, SEXP id, SEXP zeroTri, SEXP lowerTri)
{
	magma_init();
//	magma_print_devices();
	
	double *h_R;
	int In, INB, ID;
	In = INTEGER_VALUE(n);
	INB = INTEGER_VALUE(NB);
	ID = INTEGER_VALUE(id);
	double *PA = NUMERIC_POINTER(A);
	int i,j;

	magma_int_t N, n2, lda, status, info, max_size;
	N=In;
   	lda = N;
   	n2 = lda*N;
  	

/*	for(i = 0; i < In; i++)
	{
		for(j = 0; j < In; j++)
		{
			printf("%.8f ", PA[i+j*In]);
		}
		printf("\n");
	}	*/

	if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) {      
        fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); 
        magma_finalize();                                                  
        exit(-1);   
     	}
        

        lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda );
	
	N = In;
	status = 0;
	magma_setdevice(ID);
	//printf("Modified by Vinay in one GPU\n");
	//INB = magma_get_dpotrf_nb(N);
//	INB = 224;
//	printf("INB = %d\n", INB);
	//ngpu = ndevices;
//	printf("ngpu = %d\n", ngpu);
	//max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB);
//	printf("max_size = %d\n", max_size);
	//int imax_size = max_size;
	//double *dA;
	//magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double));
	//ldda = (1+N/(INB*ndevices))*INB;
//	printf("ldda = %d\n", ldda);
	//magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB);
	//magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info);
	int lTri;
	lTri = INTEGER_VALUE(lowerTri);
	if(lTri)
		magma_dpotrf(MagmaLower, N, h_R, N, &info);
	else
		magma_dpotrf(MagmaUpper, N, h_R, N, &info);
	if(info != 0)
	{
		printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info));
	}
		
	lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda );
	//magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB);
	//for(dev = 0; dev < ndevices; dev++)
	//{
		//magma_setdevice(dev);
		//cudaFree(dA[dev]);
	//}
	magma_free_pinned(h_R);
	magma_finalize();
	cublasShutdown();
	/*
	int IZeroTri;
        IZeroTri = INTEGER_VALUE(zeroTri);
	if(IZeroTri & lTri) {
		for(i = 1; i < In; i++)
        	{
       			for(j=0; j< i; j++)
                	{
                       		PA[i*In+j] = 0.0;
                	}
        	}
	}
	else if(IZeroTri)
		for(i = 0; i < In; i++)
                {
                        for(j=i+1; j < In; j++)
                        {
                                PA[i*In+j] = 0.0;
                        }
                }*/
	return(R_NilValue);
}
예제 #23
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zpotrf_mc
*/
int main( magma_int_t argc, char** argv)
{
    cuDoubleComplex *h_A, *h_R, *h_work, *h_A2;
    cuDoubleComplex *d_A;
    float gpu_perf, cpu_perf, cpu_perf2;

    magma_timestr_t start, end;

    /* Matrix size */
    magma_int_t N=0, n2, lda;
    magma_int_t size[10] = {1024,2048,3072,4032,5184,6048,7200,8064,8928,10080};

    magma_int_t i, j, info[1];

    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};

    magma_int_t num_cores = 4;
    int num_gpus = 0;

    magma_int_t loop = argc;

    if (argc != 1) {
        for(i = 1; i<argc; i++) {
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
            else if (strcmp("-C", argv[i])==0)
                num_cores = atoi(argv[++i]);
        }
        if (N==0) {
            N = size[9];
            loop = 1;
        } else {
            size[0] = size[9] = N;
        }
    } else {
        printf("\nUsage: \n");
        printf("  testing_zpotrf_mc -N %d -B 128 \n\n", 1024);
        N = size[9];
    }

    lda = N;
    n2 = size[9] * size[9];

    /* Allocate host memory for the matrix */
    h_A = (cuDoubleComplex*)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
    }

    /* Allocate host memory for the matrix */
    h_A2 = (cuDoubleComplex*)malloc(n2 * sizeof(h_A2[0]));
    if (h_A2 == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A2)\n");
    }

    /* Initialize MAGMA hardware context, seeting how many CPU cores
       and how many GPUs to be used in the consequent computations  */
    magma_context *context;
    context = magma_init(NULL, NULL, 0, num_cores, num_gpus, argc, argv);


    printf("\n\n");
    printf("  N    Multicore GFlop/s    ||R||_F / ||A||_F\n");
    printf("=============================================\n");
    for(i=0; i<10; i++)
    {
        N = lda = size[i];
        n2 = N*N;

        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );

        for(j=0; j<N; j++)
            MAGMA_Z_SET2REAL( h_A[j*lda+j], ( MAGMA_Z_GET_X(h_A[j*lda+j]) + 2000. ) );

        for(j=0; j<n2; j++)
            h_A2[j] = h_A[j];

        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */

        //lapackf77_zpotrf("L", &N, h_A, &lda, info);
        lapackf77_zpotrf("U", &N, h_A, &lda, info);

        if (info[0] < 0)
            printf("Argument %d of zpotrf had an illegal value.\n", -info[0]);

        /* =====================================================================
           Performs operation using multi-core
           =================================================================== */
        start = get_current_time();
        //magma_zpotrf_mc(context, "L", &N, h_A2, &lda, info);
        magma_zpotrf_mc(context, "U", &N, h_A2, &lda, info);
        end = get_current_time();

        if (info[0] < 0)
            printf("Argument %d of magma_zpotrf_mc had an illegal value.\n", -info[0]);

        cpu_perf2 = FLOPS( (double)N ) / (1000000.*GetTimerValue(start,end));

        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        double work[1], matnorm = 1.;
        cuDoubleComplex mone = MAGMA_Z_NEG_ONE;
        int one = 1;

        matnorm = lapackf77_zlange("f", &N, &N, h_A, &N, work);
        blasf77_zaxpy(&n2, &mone, h_A, &one, h_A2, &one);
        printf("%5d     %6.2f                %e\n",
               size[i], cpu_perf2,
               lapackf77_zlange("f", &N, &N, h_A2, &N, work) / matnorm);

        if (loop != 1)
            break;
    }

    /* Memory clean up */
    free(h_A);
    free(h_A2);

    /* Shut down the MAGMA context */
    magma_finalize(context);


}
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing zgetrf
*/
int main( int argc, char** argv)
{
    real_Double_t    gflops, gpu_perf, cpu_perf, gpu_time, cpu_time, error;
    magmaDoubleComplex *h_A, *h_R;
    magmaDoubleComplex_ptr d_A, dwork;
    magma_int_t N = 0, n2, lda, ldda;
    magma_int_t size[10] = { 1024, 2048, 3072, 4032, 5184, 5600, 5600, 5600, 5600, 5600 };
    magma_int_t ntest = 10;
    
    magma_int_t i, info;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0, 0, 0, 1};
    magmaDoubleComplex *work;
    magmaDoubleComplex tmp;
    double rwork[1];
    magma_int_t *ipiv;
    magma_int_t lwork, ldwork;
    double A_norm, R_norm;
    
    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0)
                N = atoi(argv[++i]);
        }
        if (N>0) size[0] = size[ntest-1] = N;
        else exit(1);
    }
    else {
        printf("\nUsage: \n");
        printf("  testing_zgetri_gpu -N %d\n\n", 1024);
    }
    
    /* query for Lapack workspace size */
    N     = size[ntest-1];
    lda   = N;
    work  = &tmp;
    lwork = -1;
    lapackf77_zgetri( &N, h_A, &lda, ipiv, work, &lwork, &info );
    if (info != 0)
        printf("lapackf77_zgetri returned error %d\n", (int) info);
    lwork = int( MAGMA_Z_REAL( *work ));

    /* query for Magma workspace size */
    ldwork = N * magma_get_zgetri_nb( N );

    /* Initialize */
    magma_queue_t  queue;
    magma_device_t device;
    int num = 0;
    magma_err_t err;

    magma_init();
    err = magma_get_devices( &device, 1, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( device, &queue );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    
	/* Allocate memory */
    n2   = N * N;
    ldda = ((N+31)/32) * 32;
    TESTING_MALLOC(    ipiv,  magma_int_t,     N      );
    TESTING_MALLOC(    work,  magmaDoubleComplex, lwork  );
    TESTING_MALLOC(    h_A,   magmaDoubleComplex, n2     );
    TESTING_MALLOC_HOST( h_R,   magmaDoubleComplex, n2     );
    TESTING_MALLOC_DEV(  d_A,   magmaDoubleComplex, ldda*N );
    TESTING_MALLOC_DEV(  dwork, magmaDoubleComplex, ldwork );

    printf("  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F\n");
    printf("========================================================\n");
    for( i=0; i < ntest; i++ ){
        N   = size[i];
        lda = N;
        n2  = lda*N;
        gflops = FLOPS_ZGETRI( (double)N ) / 1e9;
        
        ldda = ((N+31)/32)*32;

        /* Initialize the matrix */
        lapackf77_zlarnv( &ione, ISEED, &n2, h_A );
        A_norm = lapackf77_zlange( "f", &N, &N, h_A, &lda, rwork );

        /* Factor the matrix. Both MAGMA and LAPACK will use this factor. */
        magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue );
        magma_zgetrf_gpu( N, N, d_A, 0, ldda, ipiv, &info, queue );
        magma_zgetmatrix( N, N, d_A, 0, ldda, h_A, 0, lda, queue );
        
        // check for exact singularity
        //h_A[ 10 + 10*lda ] = MAGMA_Z_MAKE( 0.0, 0.0 );
        //magma_zsetmatrix( N, N, h_A, lda, d_A, ldda );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        //warm-up
		magma_zgetri_gpu( N,    d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue );
        
		magma_zsetmatrix( N, N, h_A, 0, lda, d_A, 0, ldda, queue );
		gpu_time = get_time();
        magma_zgetri_gpu( N,    d_A, 0, ldda, ipiv, dwork, 0, ldwork, &info, queue );
        gpu_time = get_time()-gpu_time;
        if (info != 0)
            printf("magma_zgetri_gpu returned error %d\n", (int) info);

        gpu_perf = gflops / gpu_time;
        
        magma_zgetmatrix( N, N, d_A, 0, ldda, h_R, 0, lda, queue );
         
        /* =====================================================================
           Performs operation using LAPACK
           =================================================================== */
        cpu_time = get_time();
        lapackf77_zgetri( &N,     h_A, &lda, ipiv, work, &lwork, &info );
        cpu_time = get_time() - cpu_time;
        if (info != 0)
            printf("lapackf77_zgetri returned error %d\n", (int) info);
        
        cpu_perf = gflops / cpu_time;
        
        /* =====================================================================
           Check the result compared to LAPACK
           =================================================================== */
        blasf77_zaxpy( &n2, &c_neg_one, h_A, &ione, h_R, &ione );
        R_norm = lapackf77_zlange( "f", &N, &N, h_R, &lda, rwork );
        
        printf( "%5d    %6.2f         %6.2f        %e\n",
                (int) N, cpu_perf, gpu_perf, R_norm / A_norm );
        
        if (argc != 1)
            break;
    }

    /* Memory clean up */
    TESTING_FREE(     ipiv  );
    TESTING_FREE(     work  );
    TESTING_FREE(     h_A   );
    TESTING_FREE_HOST( h_R   );
    TESTING_FREE_DEV(  d_A   );
    TESTING_FREE_DEV(  dwork );

    /* Shutdown */
    magma_queue_destroy( queue );
    magma_finalize();
}