Пример #1
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sswap, sswapblk, spermute, slaswp, slaswpx
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    float *h_A1, *h_A2;
    float *d_A1, *d_A2;
    float *h_R1, *h_R2;
    
    // row-major and column-major performance
    real_Double_t row_perf0, col_perf0;
    real_Double_t row_perf1, col_perf1;
    real_Double_t row_perf2, col_perf2;
    real_Double_t row_perf3;
    real_Double_t row_perf4;
    real_Double_t row_perf5, col_perf5;
    real_Double_t row_perf6, col_perf6;
    real_Double_t row_perf7;
    real_Double_t cpu_perf;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2;
    magma_int_t *d_ipiv;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    magma_queue_t queue = 0;
    
    printf("            cublasSswap       sswap             sswapblk          slaswp   spermute slaswp2  slaswpx           scopymatrix      CPU      (all in )\n");
    printf("    N   nb  row-maj/col-maj   row-maj/col-maj   row-maj/col-maj   row-maj  row-maj  row-maj  row-maj/col-maj   row-blk/col-blk  slaswp   (GByte/s)\n");
    printf("==================================================================================================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // each test is assigned one bit in the check bitmask, bit=1 is failure.
            // shift keeps track of which bit is for current test
            int shift = 1;
            int check = 0;
            N = opts.nsize[i];
            lda    = N;
            ldda   = ((N+31)/32)*32;
            nb     = (opts.nb > 0 ? opts.nb : magma_get_sgetrf_nb( N ));
            // for each swap, does 2N loads and 2N stores
            gbytes = sizeof(float) * 4.*N*nb / 1e9;
            
            TESTING_MALLOC_PIN( h_A1, float, lda*N );
            TESTING_MALLOC_PIN( h_A2, float, lda*N );
            TESTING_MALLOC_PIN( h_R1, float, lda*N );
            TESTING_MALLOC_PIN( h_R2, float, lda*N );
            
            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );
            
            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, float, ldda*N );
            TESTING_MALLOC_DEV( d_A2, float, ldda*N );
            
            for( j=0; j < nb; j++ ) {
                ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1;
            }
            
            /* =====================================================================
             * cublasSswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasSswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasSswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* =====================================================================
             * sswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_sswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_sswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* =====================================================================
             * sswapblk, blocked version (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_sswapblk( 'R', N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_sswapblk( 'C', N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* =====================================================================
             * spermute_long (1 matrix)
             */
            
            /* Row Major */
            memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) );  // spermute updates ipiv2
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_spermute_long2( N, d_A1, ldda, ipiv2, nb, 0 );
            time = magma_sync_wtime( queue ) - time;
            row_perf3 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* =====================================================================
             * LAPACK-style slaswp (1 matrix)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_slaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* =====================================================================
             * LAPACK-style slaswp (1 matrix) - d_ipiv on GPU
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_slaswp2( N, d_A1, ldda, 1, nb, d_ipiv );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* =====================================================================
             * LAPACK-style slaswpx (extended for row- and col-major) (1 matrix)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_slaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_slaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;
            
            time = magma_wtime();
            lapackf77_slaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* =====================================================================
             * Copy matrix.
             */
            
            time = magma_sync_wtime( queue );
            magma_scopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;
            
            time = magma_sync_wtime( queue );
            magma_scopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;
            
            printf("%5d  %3d  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f / %6.2f  %6.2f  %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf3, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x100) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x400) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failures") );
            
            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );
            
            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );
            
            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return 0;
}
Пример #2
0
/**
    Purpose
    -------
    SGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Use two buffer to send panels.

    Arguments
    ---------
    @param[in]
    num_gpus INTEGER
            The number of GPUs to be used for the factorization.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       REAL array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_sgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_sgetrf2_mgpu(magma_int_t num_gpus,
         magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
         float *d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
         float *d_lAP[], float *w, magma_int_t ldw,
         magma_queue_t streaml[][2], magma_int_t *info)
{
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (w+((j)%num_gpus)*nb*ldw)

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    magma_int_t id, i_local, i_local2, nb0, nb1, h = 2+num_gpus;
    float *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (num_gpus*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    if ( num_gpus > ceil((float)n/nb) ) {
        *info = -1;
        return *info;
    }

    /* Use hybrid blocked code. */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (i=0; i < num_gpus; i++) {
        magma_setdevice(i);
        
        n_local[i] = ((n/nb)/num_gpus)*nb;
        if (i < (n/nb)%num_gpus)
            n_local[i] += nb;
        else if (i == (n/nb)%num_gpus)
            n_local[i] += n%nb;
        
        /* workspaces */
        d_panel[i] = &(d_lAP[i][h*nb*maxm]);   /* temporary panel storage */
    }
    trace_init( 1, num_gpus, 2, (CUstream_st**)streaml );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    magma_setdevice(0);
    magmablasSetKernelStream(streaml[0][1]);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_stranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_sgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, streaml[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    s = mindim / nb;
    for( i=0; i < s; i++ ) {
        /* Set the GPU number that holds the current panel */
        id = i%num_gpus;
        magma_setdevice(id);
        
        /* Set the local index where the current panel is */
        i_local = i/num_gpus;
        cols  = maxm - i*nb;
        rows  = m - i*nb;
        
        /* synchrnoize i-th panel from id-th gpu into work */
        magma_queue_sync( streaml[id][1] );
        
        /* i-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_sgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + i*nb;
        }
        trace_cpu_end( 0 );
        
        /* start sending the panel to all the gpus */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_ssetmatrix_async( rows, nb,
                                    W(i),     ldw,
                                    &d_lAP[d][(i%h)*nb*maxm], cols,
                                    streaml[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1)%num_gpus;
        }
        
        /* apply the pivoting */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            
            trace_gpu_start( d, 1, "pivot", "pivot" );
            if ( dd == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb, i*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb, i*nb );
            trace_gpu_end( d, 1 );
            d = (d+1)%num_gpus;
        }
    
    
        /* update the trailing-matrix/look-ahead */
        d = (i+1)%num_gpus;
        for( dd=0; dd < num_gpus; dd++ ) {
            magma_setdevice(d);
            
            /* storage for panel */
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,i,i_local);
                ldpan[d] = lddat;
                /* next column */
                i_local2 = i_local+1;
            } else {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                i_local2 = i_local;
                if ( d < id ) i_local2 ++;
            }
            /* the size of the next column */
            if ( s > (i+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/num_gpus);
                if ( d < s%num_gpus ) nb0 -= nb;
            }
            if ( d == (i+1)%num_gpus) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                magmablasSetKernelStream(streaml[d][1]);
                
                /* make sure all the pivoting has been applied */
                magma_queue_sync(streaml[d][0]);
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
                magma_queue_sync(streaml[d][1]);
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - i_local2*nb;
                magmablasSetKernelStream(streaml[d][0]);
                
                /* synchronization to make sure panel arrived on gpu */
                magma_queue_sync(streaml[d][1]);
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_stranspose( rows, nb, &d_lAP[d][(i%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            }
            
            /* gpu updating the trailing matrix */
            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, i, i_local2), lddat);
            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(i+1)*nb, nb,
                         c_neg_one, dAT(d, i,   i_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, i+1, i_local2),         lddat );
        
            if ( d == (i+1)%num_gpus ) {
                /* Set the local index where the current panel is */
                int loff    = i+1;
                int i_local = (i+1)/num_gpus;
                int ldda    = maxm - (i+1)*nb;
                int cols    = m - (i+1)*nb;
                nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_stranspose( nb0, m-(i+1)*nb, dAT(d,loff,i_local), lddat, &d_lAP[d][((i+1)%h)*nb*maxm], ldda );
             
                    /* send the panel to cpu */
                    magma_sgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((i+1)%h)*nb*maxm], ldda,
                                            W(i+1), ldw, streaml[d][1] );

                    trace_gpu_end( d, 1 );
                }
            } else {
                trace_gpu_end( d, 0 );
            }
            
            d = (d+1)%num_gpus;
        }
    
        /* update the remaining matrix by gpu owning the next panel */
        if ( (i+1) < s ) {
            int i_local = (i+1)/num_gpus;
            int rows  = m - (i+1)*nb;
            
            d = (i+1)%num_gpus;
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (i_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,i,i_local+1),  lddat );
            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(i_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,i,i_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,i+1,  i_local+1),        lddat );
            trace_gpu_end( d, 0 );
        }
    } /* end of for i=1..s */
    /* ------------------------------------------------------------------------------ */
    
    /* Set the GPU number that holds the last panel */
    id = s%num_gpus;
    
    /* Set the local index where the last panel is */
    i_local = s/num_gpus;
    
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    
    if ( nb0 > 0 ) {
        magma_setdevice(id);
        
        /* wait for the last panel on cpu */
        magma_queue_sync( streaml[id][1] );
    
        /* factor on cpu */
        lapackf77_sgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        
        /* send the factor to gpus */
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            i_local2 = i_local;
            if ( d < id ) i_local2 ++;
            
            if ( d == id || n_local[d] > i_local2*nb ) {
                magma_ssetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        &d_lAP[d][(s%h)*nb*maxm],
                                        cols, streaml[d][1] );
            }
        }
        
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][0]);
            if ( d == 0 )
                magmablas_spermute_long2( lddat, dAT(d,0,0), lddat, ipiv, nb0, s*nb );
            else
                magmablas_spermute_long3(        dAT(d,0,0), lddat, ipiv, nb0, s*nb );
        }
        
        for( d=0; d < num_gpus; d++ ) {
            magma_setdevice(d);
            magmablasSetKernelStream(streaml[d][1]);
            
            /* wait for the pivoting to be done */
            magma_queue_sync( streaml[d][0] );
            
            i_local2 = i_local;
            if ( d < id ) i_local2++;
            if ( d == id ) {
                /* the panel belond to this gpu */
                panel_local[d] = dAT(d,s,i_local);
                
                /* next column */
                nb1 = n_local[d] - i_local*nb-nb0;
                
                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                
                if ( nb1 > 0 ) {
                    magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,i_local)+nb0, lddat);
                }
            } else if ( n_local[d] > i_local2*nb ) {
                /* the panel belong to another gpu */
                panel_local[d] = d_panel[d];
                
                /* next column */
                nb1 = n_local[d] - i_local2*nb;
                
                magmablas_stranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,i_local2), lddat);
            }
        }
    } /* if ( nb0 > 0 ) */
    
    /* clean up */
    trace_finalize( "sgetrf_mgpu.svg","trace.css" );
    for( d=0; d < num_gpus; d++ ) {
        magma_setdevice(d);
        magma_queue_sync( streaml[d][0] );
        magma_queue_sync( streaml[d][1] );
        magmablasSetKernelStream(NULL);
    }
    magma_setdevice(0);
    
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_SGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_sgetrf2_mgpu */