int main( int argc, char** argv)
{
    real_Double_t gflops, gpu_perf, cpu_perf, gpu_time, cpu_time;

    double  matnorm, work[1];
    magmaDoubleComplex  c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex *h_A, *h_R, *tau, *h_work, tmp[1];
    magmaDoubleComplex_ptr d_lA[MagmaMaxGPUs];

    /* Matrix size */
    magma_int_t M = 0, N = 0, flag = 0, n2, check = 0;
    magma_int_t n_local[MagmaMaxGPUs*MagmaMaxSubs], lda, ldda, lhwork;
    magma_int_t size[10] = {1000,2000,3000,4000,5000,6000,7000,8000,9000,10000};

    magma_int_t i, info, min_mn, nb;
    int num_gpus = 1, num_subs = 1, tot_subs = 1;
    magma_int_t ione = 1;

    M = N = size[9];
    if (argc != 1){
        for(i = 1; i<argc; i++){
            if (strcmp("-N", argv[i])==0) {
                N = atoi(argv[++i]);
                flag = 1;
            } else if (strcmp("-M", argv[i])==0) {
                M = atoi(argv[++i]);
                flag = 1;
            } else if (strcmp("-NGPU", argv[i])==0) {
                num_gpus = atoi(argv[++i]);
            } else if (strcmp("-NSUB", argv[i])==0) {
                num_subs = atoi(argv[++i]);
            } else if (strcmp("-check", argv[i])==0) {
                check = 1;
            }
        }
        if ( M == 0 ) {
            M = N;
        }
        if ( N == 0 ) {
            N = M;
        }
    }
    

    if (num_gpus > MagmaMaxGPUs){
      printf("More GPUs requested than available. Have to change it.\n");
      num_gpus = MagmaMaxGPUs;
    }
    if (num_subs > MagmaMaxSubs) {
      printf("More buffers requested than available. Have to change it.\n");
      num_subs = MagmaMaxSubs;
    }
    tot_subs = num_gpus * num_subs;

    printf("\nNumber of GPUs to be used = %d\n", (int) num_gpus);
    printf("Usage: \n");
    printf("  testing_zgeqrf_msub -M %d -N %d -NGPU %d -NSUB %d %s\n\n", M, N, num_gpus, num_subs, (check == 1 ? "-check" : " "));

    /* Initialize */
    magma_queue_t  queues[2*MagmaMaxGPUs];
    magma_device_t devices[MagmaMaxGPUs];
    int num = 0;
    magma_err_t err;
    magma_init();
    err = magma_get_devices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
        fprintf( stderr, "magma_get_devices failed: %d\n", err );
        exit(-1);
    }
    for (i=0; i<num_gpus; i++){
        err = magma_queue_create( devices[i], &queues[2*i] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", err );
            exit(-1);
        }
        err = magma_queue_create( devices[i], &queues[2*i+1] );
        if ( err != 0 ) {
            fprintf( stderr, "magma_queue_create failed: %d\n", err );
            exit(-1);
        }
    }
    printf( "\n" );
    
    printf("  M     N     CPU GFlop/s (sec.)     GPU GFlop/s (sec)   ||R||_F / ||A||_F\n");
    printf("==========================================================================\n");
    for(i=0; i<10; i++){
        if (flag == 0) {
            M = N = size[i];
        }
        nb     = magma_get_zgeqrf_nb(M);
        min_mn = min(M, N);
        lda    = M;
        n2     = lda*N;
        ldda   = ((M+31)/32)*32;
        gflops = FLOPS_ZGEQRF( (double)M, (double)N ) / 1e9;

        /* Allocate host memory for the matrix */
        TESTING_MALLOC_CPU( tau, magmaDoubleComplex, min_mn );
        TESTING_MALLOC_CPU( h_R, magmaDoubleComplex, n2 );

        /* Allocate host workspace */
        lhwork = -1;
        lapackf77_zgeqrf(&M, &N, h_R, &M, tau, tmp, &lhwork, &info);
        lhwork = (magma_int_t)MAGMA_Z_REAL( tmp[0] );
        TESTING_MALLOC_CPU( h_work, magmaDoubleComplex, lhwork );

        /* Allocate device memory for the matrix */
        for (int j=0; j<tot_subs; j++) {      
            n_local[j] = ((N/nb)/tot_subs)*nb;
            if (j < (N/nb)%tot_subs)
                n_local[j] += nb;
            else if (j == (N/nb)%tot_subs)
                n_local[j] += N%nb;
      
            TESTING_MALLOC_DEV( d_lA[j], magmaDoubleComplex, ldda*n_local[j] );
        }

        /* Initialize the matrix */
        init_matrix( M, N, h_R, lda );

        /* ====================================================================
           Performs operation using MAGMA
           =================================================================== */
        magma_queue_t *trans_queues = (magma_queue_t*)malloc(tot_subs*sizeof(magma_queue_t));
        for (int j=0; j<tot_subs; j++) {
            trans_queues[j] = queues[2*(j%num_gpus)];
        }
        
        // warm-up
        magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues);
        magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues);

        magmablas_zsetmatrix_1D_bcyclic(M, N, h_R, lda, d_lA, ldda, tot_subs, nb, trans_queues);
        gpu_time = magma_wtime();
        magma_zgeqrf_msub(num_subs, num_gpus, M, N, d_lA, ldda, tau, &info, queues);
        gpu_time = magma_wtime() - gpu_time;
        gpu_perf = gflops / gpu_time;

        if (info < 0)
          printf("Argument %d of magma_zgeqrf_msub had an illegal value.\n", (int) -info);
        
        if (check == 1) {
            /* =====================================================================
               Check the result compared to LAPACK
               =================================================================== */
            magmablas_zgetmatrix_1D_bcyclic(M, N, d_lA, ldda, h_R, lda, tot_subs, nb, trans_queues);
            TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, n2 );
            init_matrix( M, N, h_A, lda );

            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            cpu_time = magma_wtime();
            lapackf77_zgeqrf(&M, &N, h_A, &lda, tau, h_work, &lhwork, &info);
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;

            if (info < 0)
                printf("Argument %d of lapack_zgeqrf had an illegal value.\n", (int) -info);

            matnorm = lapackf77_zlange("f", &M, &N, h_A, &M, work);
            blasf77_zaxpy(&n2, &c_neg_one, h_A, &ione, h_R, &ione);
        
            printf("%5d %5d      %6.2f (%6.2f)       %6.2f (%6.2f)       %e\n",
                   (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time,
                   lapackf77_zlange("f", &M, &N, h_R, &M, work) / matnorm);

            TESTING_FREE_PIN( h_A );
        } else {
            printf("%5d %5d            -- ( -- )       %6.2f (%6.2f)           --\n",
                   (int) M, (int) N, gpu_perf, gpu_time );
        }
        /* Memory clean up */
        TESTING_FREE_PIN( tau );
        TESTING_FREE_PIN( h_work );
        TESTING_FREE_PIN( h_R );
        for (int j=0; j<tot_subs; j++) {
            TESTING_FREE_DEV( d_lA[j] );
        }

        if (flag != 0)
          break;
    }
    
    for (i=0; i<num_gpus; i++) {
        magma_queue_destroy(queues[2*i]);
        magma_queue_destroy(queues[2*i+1]);
    }

    /* Shutdown */
    magma_finalize();
}
// Example #2
// 0
extern "C" magma_int_t
magma_zgeqrf4(magma_int_t num_gpus, magma_int_t m, magma_int_t n, 
              cuDoubleComplex *a,    magma_int_t lda, cuDoubleComplex *tau, 
              cuDoubleComplex *work, magma_int_t lwork,
              magma_int_t *info )
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======
    ZGEQRF4 computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
    A = Q * R using multiple GPUs. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    When M <= NB or N <= NB the factorization is done on the CPU by
    LAPACK zgeqrf instead of on the GPUs.

    Arguments
    =========
    NUM_GPUS 
            (input) INTEGER
            The number of GPUs to be used for the factorization.
            1 <= NUM_GPUS <= 4.
 
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX_16 array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_zgeqrf_nb(M).
            (The argument check itself only rejects LWORK < max(1,N).)

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed
                  (MAGMA_ERR_DEVICE_ALLOC).

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    /* Capacity of the per-GPU arrays below; also the upper bound on num_gpus. */
    enum { max_gpus = 4 };

    cuDoubleComplex *da[max_gpus];
    cuDoubleComplex c_one = MAGMA_Z_ONE;

    int i, j, k, ldda;

    *info = 0;
    int nb = magma_get_zgeqrf_nb(min(m, n));

    int lwkopt = n * nb;
    work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
    int lquery = (lwork == -1);
    /* num_gpus must be at least 1: it is used as a divisor when the columns
       are distributed over the GPUs below. */
    if (num_gpus < 1 || num_gpus > max_gpus) {
        *info = -1;
    } else if (m < 0) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return for an empty matrix. */
    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return *info;
    }

    /* Device leading dimension: m rounded up to a multiple of 32. */
    ldda    = ((m+31)/32)*32;

    /* Column count per GPU: whole nb-wide blocks are dealt out round-robin,
       and the trailing partial block (n%nb columns) goes to the next GPU. */
    magma_int_t  n_local[max_gpus];
    for(i=0; i<num_gpus; i++){
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;

        magma_setdevice(i);
        
        if (MAGMA_SUCCESS != magma_zmalloc( &da[i], ldda*n_local[i] )) {
            /* Release the buffers already obtained on devices 0..i-1 so a
               failed call does not leak device memory. */
            for(j=0; j<i; j++){
                magma_setdevice(j);
                magma_free( da[j] );
            }
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    if (m > nb && n > nb) {

        /* Copy the matrix to the GPUs in 1D block cyclic distribution */
        magmablas_zsetmatrix_1D_bcyclic(m, n, a, lda, da, ldda, num_gpus, nb);

        /* Factor using the GPU interface */
        magma_zgeqrf2_mgpu( num_gpus, m, n, da, ldda, tau, info);

        /* Copy the matrix back from the GPUs to the CPU */
        magmablas_zgetmatrix_1D_bcyclic(m, n, da, ldda, a, lda, num_gpus, nb);

    } else {

        /* Matrix no larger than one block in some dimension: factor on the CPU. */
        lapackf77_zgeqrf(&m, &n, a, &lda, tau, work, &lwork, info);

    }


    /* Free the allocated GPU memory */
    for(i=0; i<num_gpus; i++){
        magma_setdevice(i);
        magma_free( da[i] );
    }

    return *info;
} /* magma_zgeqrf4 */