void magma_task_dev_dmalloc_pinned(Schedule* sched_obj )
 {
     magma_int_t deviceID;
    magma_int_t size;
    double **A; 
    void *dep_ptr;
    
#if (dbglevel >=1)
    ca_trace_start();
#endif
//    printf("doing dmalloc\n");
    schedule_unpack_args_4(sched_obj, deviceID, size, A, dep_ptr);

    magma_setdevice(deviceID);
//    printf("doing dmalloc %p\n",dep_ptr);

    //printf("using malloc instead, *** TODO: fix\n");
    //A = (double**) malloc(size * sizeof(double));

    magma_dmalloc_pinned(A, size);
    
//    printf("end doing dmalloc\n");
#if (dbglevel >=1)
ca_trace_end_gpu('O');
ca_trace_end_cpu('C');
#endif
 }
void magma_task_dev_dfree_pinned_index(Schedule* sched_obj )
 {
     magma_int_t deviceID;
    double **A; 
    magma_int_t index;
    void *dep_ptr;
#if (dbglevel >=1)
    ca_trace_start();
#endif
//    printf("doing dmalloc\n");
    schedule_unpack_args_4(sched_obj, deviceID, A, index, dep_ptr);

    magma_setdevice(deviceID);
//    printf("doing dmalloc %p\n",dep_ptr);

    //printf("using malloc instead, *** TODO: fix\n");
    //A = (double**) malloc(size * sizeof(double));

    //printf("*** using simpl free\n");
    //free(A[index]);

    //A[index]=NULL;
    magma_free_pinned(A[index]);
#if (dbglevel >=1)
ca_trace_end_gpu('0');
ca_trace_end_cpu('C');
#endif

 }
  void magma_task_dev_dgetmatrix_async_transpose(Schedule* sched_obj )               
 {
     magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *dA_src;
    magma_int_t dA_LD;
    double *A_dst;
    magma_int_t LDA;
    magma_queue_t stream1;
    double *dwork;
    magma_int_t dwork_LD;
    
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

//    printf("Matrix_get_transpose\n");
    //schedule_unpack_args_8(sched_obj, m,  nb, dA_src,  dA_LD, A_dst,  LDA, dwork, dwork_LD);
    schedule_unpack_args_11(sched_obj,  deviceID, m,  nb, dA_src,  dA_LD, A_dst,  LDA, stream1, dwork, dwork_LD, dep_ptr);

    magma_setdevice(deviceID);
//    printf("Matrix_get_transpose m:%d, nb:%d, dep_ptr:%p\n",m,nb,dep_ptr);

    //magma_dgetmatrix_transpose( m,  nb, dA_src,  dA_LD, A_dst,  LDA);
     // task_getpanel(gpu_nrows, ncols, dAT(K,K+A_N), dAT_LD, A(K,K+A_N), A_LD);
#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(nb, m, dA_src, dA_LD, "dA before getMatrix");
#endif
//            pthread_mutex_lock(&mutex_dAP);
    /*1.transpose dA to dwork*/
    magmablas_dtranspose2s(dwork, dwork_LD, dA_src, dA_LD,  nb, m, compute_stream[deviceID]);
//    printf("Matrix_get_transpose m:%d, nb:%d 1:done\n",m,nb);
#if (dbglevel==10)
//            ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after dA transpose");
#endif

    magma_queue_sync(compute_stream[deviceID]);

    /*2. copy dwork to A: send the panel to GPU*/
    magma_dgetmatrix_async(m, nb, dwork, dwork_LD, A_dst, LDA, stream1);
//    printf("Matrix_get_transpose m:%d, nb:%d 2:done\n",m,nb);
//    pthread_mutex_unlock(&mutex_dAP);

    magma_queue_sync(stream1); //make sure dwork is set before the transpose

#if (dbglevel==10)
            ca_dbg_printMat(m, nb, A_dst, LDA, "A after getMatrix");
#endif 
#if (dbglevel >=1)
ca_trace_end_gpu('G');
ca_trace_end_cpu('C');
#endif

//    printf("End Matrix_get_transpose m:%d, nb:%d\n",m,nb);
 }
 void magma_task_dev_dsetmatrix_transpose(Schedule* sched_obj)
 {
     magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *A_src;
    magma_int_t LDA;
    double *dA_dst;
    magma_int_t dA_LD;
    double *dwork;
    magma_int_t dwork_LD;

//    double *tmpdA;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_9(sched_obj, deviceID, m,  nb, A_src,  LDA, dA_dst,  dA_LD, dwork, dwork_LD);

    magma_setdevice(deviceID);
    //magma_dsetmatrix_transpose( m,  nb, A_src,  LDA, dA_dst,  dA_LD);
      //task_setpanel(A_m, nb, A(0,K), A_LD, dA(0,K), dA_LD)

#if (dbglevel==10)
            ca_dbg_printMat(m, nb, A_src, LDA, "A before setMatrix");
            
            //ca_dbg_printMat_gpu(2, 2, dwork, dwork_LD, "dwork for testing");

            //cudaMemcpy(&tmpdA, dwork, sizeof(double*), cudaMemcpyHostToDevice);
            //ca_dbg_printMat_gpu(2, 2, tmpdA, dwork_LD, "dlAP_set[dd] for testing");

#endif
//            pthread_mutex_lock(&mutex_dAP);
    /*1. copy A to dwork: send to the GPU*/
    magma_dsetmatrix(m, nb, A_src, LDA, dwork, dwork_LD);
#if (dbglevel==10)
//            ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after setMatrix");
#endif
    /*2.transpose dwork to dA*/
    magmablas_dtranspose2(dA_dst, dA_LD, dwork, dwork_LD, m, nb);
//    pthread_mutex_unlock(&mutex_dAP);
#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(nb, m, dA_dst, dA_LD, "dA after setMatrix");
#endif
#if (dbglevel >=1)
ca_trace_end_gpu('T');
ca_trace_end_cpu('C');
#endif
 }
void magma_task_dev_dgemm(Schedule* sched_obj  )
{
    magma_int_t deviceID;
    magma_trans_t transA;
    magma_trans_t transB;
    magma_int_t m;
    magma_int_t n;
    magma_int_t k;
    double alpha;
    double *dA;
    magma_int_t lda;
    double *dB;
    magma_int_t ldb;
    double beta;
    double *dC;
    magma_int_t ldc;

#if (dbglevel >=1)
    ca_trace_start();
#endif
    schedule_unpack_args_14(sched_obj, deviceID, transA,  transB,  m,  n,  k, alpha, dA,  lda, dB,  ldb, beta, dC,  ldc);

magma_setdevice(deviceID);

#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(m, k, dA, lda, "A before magma_dgemm");
            ca_dbg_printMat_transpose_gpu(k, n, dB, ldb, "B before magma_dgemm");
            ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C before magma_dgemm");
#endif

        pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    
    magma_dgemm( transA,  transB,  m,  n,  k, alpha, dA,  lda, dB,  ldb, beta, dC,  ldc );
     //task_magma_dgemm(MagmaNoTrans,MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD);
    pthread_mutex_unlock(&mutex_compute_stream);
#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C after magma_dgemm");
#endif
#if (dbglevel >=1)
ca_trace_end_gpu('S');
ca_trace_end_cpu('C');
#endif
}
Exemple #6
0
void magma_task_dfree_pinned(Schedule* sched_obj )
 {
    double *A; 
    void *dep_ptr;
#if (dbglevel >=1)
    ca_trace_start();
#endif
//    printf("doing dmalloc\n");
    schedule_unpack_args_2(sched_obj, A, dep_ptr);
//    printf("doing dmalloc %p\n",dep_ptr);

    //printf("using malloc instead, *** TODO: fix\n");
    //A = (double**) malloc(size * sizeof(double));

    magma_free_pinned(A);
    
#if (dbglevel >=1)
ca_trace_end_1gpu('O');
ca_trace_end_cpu('C');
#endif
 }
void magma_task_dev_dtrsm(Schedule* sched_obj  )
{
    magma_int_t deviceID;
    magma_side_t side;
    magma_uplo_t uplo;
    magma_trans_t trans;
    magma_diag_t diag;
    magma_int_t m;
    magma_int_t n;
    double alpha;
    double *dA;
    magma_int_t lda;
    double *dB;
    magma_int_t ldb;
#if (dbglevel >=1)
    ca_trace_start();
#endif
    schedule_unpack_args_12(sched_obj, deviceID, side,  uplo,  trans,  diag,  m,  n, alpha, dA,  lda, dB,  ldb);

    #if (dbglevel==10)
       ca_dbg_printMat_transpose_gpu(m, n, dB, ldb, "A(K,K:N) before magma_dtrsm");
    #endif

    magma_setdevice(deviceID);
    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    
    magma_dtrsm( side,  uplo,  trans,  diag,  m,  n, alpha, dA,  lda, dB,  ldb );
     //task_magma_dtrsm('R', 'U', MagmaNoTrans, 'U', gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD);
    pthread_mutex_unlock(&mutex_compute_stream);

    #if (dbglevel==10)
       ca_dbg_printMat_transpose_gpu(m, n, dB, ldb, "A(K,K:N) after magma_dtrsm");
    #endif

#if (dbglevel >=1)
ca_trace_end_gpu('U');
ca_trace_end_cpu('C');
#endif
}
Exemple #8
0
void magma_task_dgemm(Schedule* sched_obj  )
{
    magma_trans_t transA;
    magma_trans_t transB;
    magma_int_t m;
    magma_int_t n;
    magma_int_t k;
    double alpha;
    double *dA;
    magma_int_t lda;
    double *dB;
    magma_int_t ldb;
    double beta;
    double *dC;
    magma_int_t ldc;

#if (dbglevel >=1)
    ca_trace_start();
#endif
    schedule_unpack_args_13(sched_obj,transA,  transB,  m,  n,  k, alpha, dA,  lda, dB,  ldb, beta, dC,  ldc);

#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(m, k, dA, lda, "A before magma_dgemm");
            ca_dbg_printMat_transpose_gpu(k, n, dB, ldb, "B before magma_dgemm");
            ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C before magma_dgemm");
#endif

    
    magma_dgemm( transA,  transB,  m,  n,  k, alpha, dA,  lda, dB,  ldb, beta, dC,  ldc );
     //magma_task_dgemm('N','N', gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD);

#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C after magma_dgemm");
#endif
#if (dbglevel >=1)
ca_trace_end_1gpu('S');
ca_trace_end_cpu('C');
#endif
}
Exemple #9
0
void magma_task_dlaswp(Schedule* sched_obj   )
{
    magma_int_t n;
    double *dA;
    magma_int_t lda;
    magma_int_t i1;
    magma_int_t i2;
    magma_int_t *ipiv;
    magma_int_t inci;
#if (dbglevel >=1)
    ca_trace_start();
#endif
    schedule_unpack_args_7(sched_obj,n, dA,  lda,  i1,  i2,  ipiv,  inci);


    magmablas_dlaswp(  n, dA,  lda,  i1,  i2,  ipiv,  inci );
     //magma_task_dlaswp(gpu_ncols, dAT(K,K+A_N), dAT_LD, c_one, nb, &ipiv[K], c_one);
#if (dbglevel >=1)
ca_trace_end_1gpu('W');
ca_trace_end_cpu('C');
#endif
}
Exemple #10
0
 void magma_task_dsetmatrix_transpose(Schedule* sched_obj)
 {
    magma_int_t m;
    magma_int_t nb;
    double *A_src;
    magma_int_t LDA;
    double *dA_dst;
    magma_int_t dA_LD;
    double *dwork;
    magma_int_t dwork_LD;
#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_8(sched_obj, m,  nb, A_src,  LDA, dA_dst,  dA_LD, dwork, dwork_LD);

    //magma_dsetmatrix_transpose( m,  nb, A_src,  LDA, dA_dst,  dA_LD);
      //task_setpanel(A_m, nb, A(0,K), A_LD, dA(0,K), dA_LD)

#if (dbglevel==10)
            ca_dbg_printMat(m, nb, A_src, LDA, "A before setMatrix");
#endif
//            pthread_mutex_lock(&mutex_dAP);
    /*1. copy A to dwork: send to the GPU*/
    magma_dsetmatrix(m, nb, A_src, LDA, dwork, dwork_LD);
#if (dbglevel==10)
//            ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after setMatrix");
#endif
    /*2.transpose dwork to dA*/
    magmablas_dtranspose2(dA_dst, dA_LD, dwork, dwork_LD, m, nb);
//    pthread_mutex_unlock(&mutex_dAP);
#if (dbglevel==10)
            ca_dbg_printMat_transpose_gpu(nb, m, dA_dst, dA_LD, "dA after setMatrix");
#endif
#if (dbglevel >=1)
ca_trace_end_1gpu('T');
ca_trace_end_cpu('C');
#endif
 }
Exemple #11
0
  void magma_task_dev_queue_sync(Schedule* sched_obj)
 {
     magma_int_t deviceID;
    magma_queue_t stream1;

    void *dep_ptr;
//    double *tmpdA;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_3(sched_obj, deviceID, stream1,  dep_ptr);

    magma_setdevice(deviceID);

    magma_queue_sync(stream1);

#if (dbglevel >=1)
ca_trace_end_gpu('O');
ca_trace_end_cpu('C');
#endif
 }
Exemple #12
0
void magma_task_dev_dlaswp(Schedule* sched_obj   )
{
    magma_int_t deviceID;
    magma_int_t n;
    double *dA;
    magma_int_t lda;
    magma_int_t i1;
    magma_int_t i2;
    magma_int_t *ipiv;
    magma_int_t inci;
#if (dbglevel >=1)
    ca_trace_start();
#endif
    schedule_unpack_args_8(sched_obj, deviceID, n, dA,  lda,  i1,  i2,  ipiv,  inci);

    magma_setdevice(deviceID);

    #if (dbglevel==10)
       ca_dbg_printMat_transpose_gpu(n, n, dA, lda, "A(n,n) before magma_dlaswp");
    #endif

    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    
    magmablas_dlaswp(  n, dA,  lda,  i1,  i2,  ipiv,  inci );

    pthread_mutex_unlock(&mutex_compute_stream);
     //task_magma_dlaswp(gpu_ncols, dAT(K,K+A_N), dAT_LD, c_one, nb, &ipiv[K], c_one);
    #if (dbglevel==10)
       ca_dbg_printMat_transpose_gpu(n, n, dA, lda, "A(n,n) after magma_dlaswp");
    #endif

#if (dbglevel >=1)
ca_trace_end_gpu('W');
ca_trace_end_cpu('C');
#endif
}
Exemple #13
0
void magma_task_dtrsm(Schedule* sched_obj  )
{
    magma_side_t side;
    magma_uplo_t uplo;
    magma_trans_t trans;
    magma_diag_t diag;
    magma_int_t m;
    magma_int_t n;
    double alpha;
    double *dA;
    magma_int_t lda;
    double *dB;
    magma_int_t ldb;
#if (dbglevel >=1)
    ca_trace_start();
#endif
    schedule_unpack_args_11(sched_obj,side,  uplo,  trans,  diag,  m,  n, alpha, dA,  lda, dB,  ldb);
    magma_dtrsm( side,  uplo,  trans,  diag,  m,  n, alpha, dA,  lda, dB,  ldb );
     //magma_task_dtrsm('R', 'U', 'N', 'U', gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD);
#if (dbglevel >=1)
ca_trace_end_1gpu('U');
ca_trace_end_cpu('C');
#endif
}
extern "C" magma_int_t 
magma_dgetrf_mgpu_work_amc_v3(magma_int_t num_gpus,
magma_int_t m, magma_int_t n,  
double **dlA, magma_int_t dlA_LD, 
magma_int_t *ipiv, magma_int_t *info,
/*workspace on the cpu side*/
double *AWORK, magma_int_t AWORK_LD, magma_int_t AWORK_n
) 
{ 
/*  -- MAGMA (version 1.5.0-beta3) -- 
       Univ. of Tennessee, Knoxville 
       Univ. of California, Berkeley 
       Univ. of Colorado, Denver 
       November 2011 
 
    Purpose 
    ======= 
 
    DGETRF_REC_ASYNC computes an LU factorization of a general M-by-N matrix A 
    using partial pivoting with row interchanges. The technique used for the panel factorization
    is the parallel recursif LU (see lawn 259).
 
    The factorization has the form 
       A = P * L * U 
    where P is a permutation matrix, L is lower triangular with unit 
    diagonal elements (lower trapezoidal if m > n), and U is upper 
    triangular (upper trapezoidal if m < n). 
 
    This is the right-looking Level 3 BLAS version of the algorithm. 
 
    Arguments 
    ========= 
    NUM_GPUS
            (input) INTEGER
            The number of GPUS to be used for the factorization.
 
    M       (input) INTEGER 
            The number of rows of the matrix A.  M >= 0. 
 
    N       (input) INTEGER 
            The number of columns of the matrix A.  N >= 0. 
 
    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). 
            On entry, the M-by-N matrix to be factored. 
            On exit, the factors L and U from the factorization 
            A = P*L*U; the unit diagonal elements of L are not stored. 
 
    LDDA     (input) INTEGER 
            The leading dimension of the array A.  LDDA >= max(1,M). 
 
    IPIV    (output) INTEGER array, dimension (min(M,N)) 
            The pivot indices; for 1 <= i <= min(M,N), row i of the 
            matrix was interchanged with row IPIV(i). 
 
    INFO    (output) INTEGER 
            = 0:  successful exit 
            < 0:  if INFO = -i, the i-th argument had an illegal value 
                  or another error occured, such as memory allocation failed. 
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization 
                  has been completed, but the factor U is exactly 
                  singular, and division by zero will occur if it is used 
                  to solve a system of equations.

    =====================================================================    */ 
 
 
 
    double c_one     = MAGMA_D_ONE; 
    double c_neg_one = MAGMA_D_NEG_ONE; 
 
    int ONE = 1; 
 
    magma_int_t iinfo, nb; 
    magma_int_t mindim; 
    magma_int_t nrows, ncols; 
    //double *work; 
 
 
     magma_int_t dm_max, dn_max; 
     magma_int_t I, J, K, M, N, U_K, L; 
     
     //magma_int_t A_m, A_n, A_N; 
     //magma_int_t Am_max, An_max; 
     //magma_int_t A_nb; 
     
 
     //magma_int_t A_K; 
     double **dlAT; 
     magma_int_t dlAT_LD; 
      
      
     double *dlAP_get[MagmaMaxGPUs]; //*dlAP_set[MagmaMaxGPUs]
      
     
     double *dlAP_set[MagmaMaxGPUs];
     magma_int_t dlAP_LD;

     double *dlpanel[MagmaMaxGPUs];
     magma_int_t dlpanel_LD;

     int *n_local, *nr_local;

     //magma_int_t nrows, ncols; 
     magma_int_t gpu_nrows, gpu_ncols; 
  
     int nbcores; /*Number of cores available for the whole factorization*/ 
     int panel_num_threads; /*Number of threads for the panel*/ 
     double dcpu; /*percentage of the matrix to allocate on the CPUs*/ 
  
    int B_rows;

    double t1;
    
    /*Workspace*/
    // magma_int_t AWORK_NMAX;
    // magma_int_t AWORK_m, AWORK_n, AWORK_N;

     /* Recommanded dimension in the workspace*/ 
     int A_m, A_n, A_N, A_NMAX, A_LD;
     int A_NP1;
     double *A;

     amc_args_t *args;
    /*magma_event_t *A_event;*/ /*Control bucket*/
     magma_queue_t mstream[MagmaMaxGPUs][3]; /*0: H2D, 1: compute, 2:D2H*/
     int dd;

//     double *tmpdA;

    /* Check arguments */ 
    *info = 0; 
    if (m < 0) 
        *info = -1; 
    else if (n < 0) 
        *info = -2; 
    else if (dlA_LD < max(1,m)) 
        *info = -4; 
    else if (AWORK_LD < max(1,m)) 
        *info = -5;

    if (*info != 0) { 
        magma_xerbla( __func__, -(*info) ); 
        return *info; 
    } 
 
    /* Quick return if possible */ 
    if (m == 0 || n == 0) 
        return *info; 
 
 


      
     /*Get parameters*/ 
    args = magma_amc_args_get_default();
     nb= args->nb;

     nbcores = args->P;  
     panel_num_threads = args->Pr; 
     dcpu = args->dcpu;

     /* Check and fix parameters */
     if(nb==0)
        nb     = magma_get_dgetrf_nb(m) ;/*magma dgetrf block size*/ 
    else
        nb = args->nb;

     if(nb>n) nb = n; 
     if(panel_num_threads>nbcores) panel_num_threads = nbcores;

     /*check the buffer size*/

     if(AWORK_n<nb){
         printf("Not enough buffer. Should be greater than the block size: %d\n", nb);
         exit(1);
     }

     /* Compute the number of blocks columns to factorize*/
     N  = (int) ceil( (double) min(m, n) / nb);

     /* Compute the maximum number of panels we can store in the workspace*/
     A_NMAX = (int) (AWORK_n/ nb);

     /*Compute the recommanded number of panels for the cpu part*/
     A_N = NSplit(N, dcpu);

     /* Compute the recommanded number of columns for the cpu part*/
     A_n = A_N*nb;//(int) ceil(n*dcpu);

     //if(A_n<nb) 
     //     A_n = nb;//make sure workspace has at least one block column

     /*Make sure we work with multiple of 32*/
     /*
     if(A_n%32!=0) {
         A_n = ((A_n + 31)/32)*32;
     }
     */

     /* Compute the recommanded number of panels for the cpu part*/
    // A_N = (int) (A_n/ nb);
     
     /* Check if there are enough workspace. In case the user gave a workspace lower than the optimal*/
     /* NOTE: using small workspace may reduce performance*/

     if(A_N>A_NMAX){    

#if (dbglevel >=1)
        printf("[DBG_WARNING] Resizing buffer to feet user preferences. Recommanded:%d, Max given:%d\n",A_N, A_NMAX); 
#endif
        A_N = A_NMAX;

        /*Make A_n a multiple of nb*/
        A_n = A_N*nb;
    }
      
     A = AWORK;
     A_m = m;
     A_LD = AWORK_LD;


#if (dbglevel >=1)
    /* Initialize the tracing*/
    ca_dbg_trace_init(nbcores,num_gpus); //nbcores + 1 GPU
#endif

#if (dbglevel >=1)
    t1 = magma_wtime();
#endif

    /* create the streams */
    //mstream = (magma_queue_t *)    malloc(num_gpus*sizeof(magma_queue_t));

    for(dd=0;dd<num_gpus;dd++){
       magma_setdevice(dd); //required
       magma_queue_create(&mstream[dd][0]);
       magma_queue_create(&mstream[dd][1]);
       magma_queue_create(&mstream[dd][2]);

       /*Set the stream for internal computations*/
       //magmablasSetKernelStream(0); /*Use 0 instead of mstream[dd][1], MagmasetkernelStream is not thread safe*/ /*TODO: mae it safe*/

       //task_dev_set_compute_stream(dd, mstream[dd][1]);
       magma_task_dev_set_compute_stream(dd, 0); //set to mstream 1 later
    }

    


     /* Matrix dimension */
     dm_max = m;
     dn_max = n;

    /*Make sure m and n are multiple of 32*/
     
     if(dm_max%32!=0) dm_max = ((dm_max + 31)/32)*32;
     if(dn_max%32!=0) dn_max = ((dn_max + 31)/32)*32;
     
     

     /* local dimensions of the matrix for each GPU*/
     n_local = (int *)    malloc(num_gpus*sizeof(int)); /*This do no change during the execution*/
     nr_local = (int *)    malloc(num_gpus*sizeof(int)); /*Change after each update of the trailing submatrix*/

     for(dd=0;dd<num_gpus;dd++){
        n_local[dd] = numcols2p(dd, n, nb, num_gpus); //loc2p(dd, N, num_gpus)*nb;    
        nr_local[dd] = n_local[dd];
     }


     /*Allocate a workspace for the panels transposition*/ 

     dlAP_LD = dm_max; 
     //if(dAP_LD%32!=0) dAP_LD = ((dAP_LD + 31)/32)*32;/*Make dAP_LD multiple of 32*/
    /// dlAP_set = (double **)    malloc(num_gpus*sizeof(double*));
     //dlAP_get = (double **)    malloc(num_gpus*sizeof(double*));

     for(dd=0;dd<num_gpus;dd++){

         magma_setdevice(dd);
        
         if (MAGMA_SUCCESS != magma_dmalloc( &dlAP_set[dd], dlAP_LD*nb)) { 
                *info = MAGMA_ERR_DEVICE_ALLOC; 
                return *info; 
        } 
        
        /*
        if (MAGMA_SUCCESS != magma_dmalloc(&tmpdA, dlAP_LD*nb)) { 
                *info = MAGMA_ERR_DEVICE_ALLOC; 
                return *info; 
        }
        */
        if ( magma_is_devptr(dlAP_set[dd] ) == 0 ) {
            fprintf( stderr, "ERROR: dlAP_set[dd] is host pointer.\n" );
            //exit(1);
        }

        
        //cudaMemcpy(dlAP_set[dd],&tmpdA,sizeof(double*), cudaMemcpyDeviceToHost);

        #if (dbglevel==10) 
        printf("0.4\n");
            
            //ca_dbg_printMat_gpu(2, 2, dlAP_set[dd], dlAP_LD, "dlAP_set[dd] for testing");
            //cudaMemcpy(&tmpdA, &dlAP_set[dd], sizeof(double*), cudaMemcpyHostToDevice);
            //ca_dbg_printMat_gpu(2, 2, tmpdA, dlAP_LD, "dlAP_set[dd] for testing");
            //printf("0.5: int to continue"); scanf("%d", &I);
        #endif

         if (MAGMA_SUCCESS != magma_dmalloc(&dlAP_get[dd], dlAP_LD*nb)) { 
                //magma_free(dlAP_set); //TODO: free all previous buffers
                *info = MAGMA_ERR_DEVICE_ALLOC; 
                return *info; 
        }
     }

    /* Workspace for the panels */

    // dlpanel = (double **)    malloc(num_gpus*sizeof(double*));
     
     for(dd=0;dd<num_gpus;dd++){
         magma_setdevice(dd);

         if (MAGMA_SUCCESS != magma_dmalloc(&dlpanel[dd], nb*dm_max)) { 

                
                *info = MAGMA_ERR_DEVICE_ALLOC; 
                return *info; 
        }
      }

      dlpanel_LD = nb;
      

    /*local matrix storage*/
    dlAT = (double **)    malloc(num_gpus*sizeof(double*));

    
    dlAT_LD = n_local[0];

    if(dlAT_LD%32!=0) dlAT_LD = ((dlAT_LD + 31)/32)*32;

    for(dd=0;dd<num_gpus;dd++){
         magma_setdevice(dd);

        if (MAGMA_SUCCESS != magma_dmalloc(&dlAT[dd], dlAT_LD*dm_max )) { 
                for(J=0;J<dd;J++){
                    magma_setdevice(J);
                    magma_free( dlAP_set[J]); 
                    magma_free( dlAP_get[J]);
                    magma_free(dlpanel[J]);
                    magma_free(dlAT[J]);
                }
                //free(dlAP_set); 
                //free(dlAP_get);
                //free(dlpanel);
                free(dlAT);
            *info = MAGMA_ERR_DEVICE_ALLOC; 
            return *info; 
        }



    }


#if (dbglevel >=1)
    printf("[DBG] Time workspace memory alloc (dAP): %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif


    /*1. Transfer the first column blocks of the matrix from the GPU to the CPUs.*/ 
    
    //magma_dgetmatrix(A_m, A_n, dA, dA_LD, A, A_LD); 
    magma_dgetmatrix_1D_col_bcyclic(A_m, A_n, dlA, dlA_LD, A, A_LD, num_gpus, nb);

#if (dbglevel >=1)
    printf("[DBG] Time First getmatrix: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10) 
    printf("1.0\n");
    ca_dbg_printMat(A_m, A_n, A, A_LD,"A after first getMatrix"); 

    /*
    for(dd=0;dd<num_gpus;dd++){
        //Fill the matrix with zero for easy visualization of the matrix in debug mode
        for(I=0;I<dlAT_LD*dm_max;I++)  dlAT[dd][I] = 0.0;
    }
    */
//    ca_dbg_printMat_mgpu(num_gpus,  m, n_local, dlAT, dlAT_LD,"matrix dAlT^T empty");
//    ca_dbg_printMat_transpose_mgpu(num_gpus,  n_local, m, dlAT, dlAT_LD,"matrix dAT empty");
printf("2.0\n");
#endif

    /*Update the remaining number of columns on the GPUs.*/
    for(dd=0;dd<num_gpus;dd++){
        nr_local[dd] = nr_local[dd] - numcols2p(dd, A_n, nb, num_gpus); //;n_local[dd] - loc2p(dd, A_N, num_gpus)*nb;        
    }

#if (dbglevel==10) 
    ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlA, dlA_LD,"matrix dA to factorize");

    printf("3.0\n");    
#endif


for(dd=0;dd<num_gpus;dd++){
    magma_setdevice(dd);
    //magmablasSetKernelStream(mstream[dd][1]);    
    magmablas_dtranspose2(dlAT[dd], dlAT_LD, dlA[dd], dlA_LD, m, n_local[dd]);
}

///


for(dd=0;dd<num_gpus;dd++){
    magma_setdevice(dd);
    magma_task_dev_set_compute_stream(dd, mstream[dd][1]);
}



#if (dbglevel >=1)
    printf("[DBG] Time First transposition: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10) 
    //ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"matrix dAT to factorize");
/*
    dd = GID(A_N);
    magma_setdevice(dd);

    ca_dbg_printMat_transpose_gpu(nb, m, dlAT(0, A_N), dlAT_LD,"matrix dAT(0, A_N)");
    
    magma_setdevice(0);
    ca_dbg_printMat_transpose_gpu(m, nb, dlA(0, A_N), dlA_LD,"matrix dA(0, A_N)");
    */

    printf("4.0\n");
    printf("int to continue"); scanf("%d", &I);
#endif

/*
#if (dbglevel==10) 
    ca_dbg_printMat_transpose_mgpu(num_gpus, m, n_local, dlAT, dlAT_LD,"matrix dAT to factorize");
#endif
*/


     /* Compute the maximun number of steps*/
     mindim = min(m, n); 
     M      = (int) ceil( (double) m / nb); 
     N      = (int) ceil( (double) mindim / nb); /*N = n/nb*/


     /* 3. Let the asynchronous algorithm begin*/ 
     
#if (dbglevel >=1)
     printf("Starting recursif code ... m:%d, n:%d, nb:%d, nbcores:%d, N:%d, A_N:%d\n", m, n, nb, nbcores, N, A_N); //Summary
#endif



     /*Initialize the scheduler*/ 
     magma_schedule_init(nbcores, num_gpus); 



     K = 0; 
     /*initialize parallel recursif panel environment*/
     CORE_zgetrf_reclap_init();

     magma_schedule_set_task_priority(INT_MAX-1);
     
     /*Schedule the first panel factorization*/ 
     magma_insert_core_dgetrf_rec(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));  
     //magma_insert_core_dgetrf(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, colptr(K)); 
 
     /*Transfer the factorized panel in the buffer of GPU (dlpanel)*/

     for(dd=0;dd<num_gpus;dd++){
         
        
        ///magma_insert_dev_dsetmatrix_transpose(dd, A_m, nb, A(0,K), A_LD, dlpanel(dd,K), dlpanel_LD, dlAP_set[dd], dlAP_LD, colptr(K), dlpanel[dd]);
        magma_insert_dev_dsetmatrix_async_transpose(dd, A_m, nb, A(0,K), A_LD, dlpanel(dd,K), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K), dlpanel(dd,K)); //dlpanel[dd]

     }
#if (dbglevel==10) 
    magma_schedule_barrier();

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(nb, m, dlpanel(dd,K), dlpanel_LD,"dlpanel[dd] after setmatrix_async"); //dlpanel[dd]
    }
    printf("4.5: int to continue"); scanf("%d", &I);
#endif
     /*Transfer also the factorized panel on its right position in the final matrix (transposition included)*/ 
     /*TODO: this may use cudaMemcpyDeviceToDevice and initiate the transfer from dlpanel*/
     dd = GID(K);
     //magma_insert_dev_dsetmatrix_transpose(dd, A_m, nb, A(0,K), A_LD, dlAT(0,K), dlAT_LD, dlAP_set[dd], dlAP_LD, colptr(K), dlAT(0,K)); 
     magma_insert_dev_dsetmatrix_async_transpose(dd, A_m, nb, A(0,K), A_LD, dlAT(0,K), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K), dlAT(0,K)); 
 
#if (dbglevel==10) 
    magma_schedule_barrier(); 
    ca_dbg_printMat(m, nb, A(0,0), A_LD,"A(0,0)");
    
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(nb, m, dlpanel[dd], dlpanel_LD,"dlpanel[dd] after setmatrix to dlAT");
    }

    ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"dlA"); 
 printf("5.0: int to continue"); scanf("%d", &I);
#endif 

     for(K=0;K<=N-1;K++){ 
     
         /*compute the new value of the cpu number of blocks*/
         A_N = NSplit(N-K, dcpu);

          /*insert the coarse update of the trailing submatrix corresponding to panel K to the GPU, that is submatrix A[K+1:M, K+1+d-1:N]*/ 
          
         //if(K==0) /*TODO: move outside loop*/
          //{

         /*NOTE: Here we work on the matrix transpose*/

         /*Set the priority max for the GPU computations*/
            magma_schedule_set_task_priority(INT_MAX);
            //// magma_schedule_set_task_priority(INT_MAX - N*K);

         gpu_nrows = m - (K+1)*nb;///

         for(J=K+A_N;J<min(K+A_N+num_gpus,N);J++){

            /*Determine the device which own the first column of the group of columns to update*/
             dd = GID(J);

            /*Determine the number of columns to apply the update. */
             nr_local[dd] = numcols2p(dd, n - (K+1+A_N-1)*nb, nb, num_gpus);

              gpu_ncols = nr_local[dd]; //n - (K+1+A_N-1)*nb; 
            
              if(gpu_ncols >0) 
              { 
 
                  /*schedule a swap of the trailing submatrix in the gpus using ipiv[K]*/ 
                  /*dependency dAT((K+1)-1, (K+A_N)-1) = dAT(K, K+A_N-1) with previous dgemm*/              
              
                  magma_insert_dev_dlaswp(dd, gpu_ncols, dlAT(K, J), dlAT_LD, ONE, nb, ipiv(K), ONE, dlAT(K, J-1)); /*non blocking*/                  
                  //printf("debug barrier\n");
                  //magma_schedule_barrier();
                  //&(dlpanel[dd][dlpanel_LD*nb*K])
                  magma_insert_dev_dtrsm(dd, MagmaRight,  MagmaUpper, MagmaNoTrans, MagmaUnit, gpu_ncols, nb, c_one, dlpanel(dd,K), dlpanel_LD, dlAT(K,J), dlAT_LD);/*non blocking*/ 
 
                  /* aij^T = aij^T - (lik.ukj)^T = aij^T - ukj^T.lik^T*/ //&(dlpanel[dd][dlpanel_LD*nb*(K+1)])
                  magma_insert_dev_dgemm(dd, MagmaNoTrans,MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dlAT(K,J), dlAT_LD, dlpanel(dd,K+1), dlpanel_LD, c_one, dlAT(K+1,J), dlAT_LD);/*non blocking*/    
              

                  /*Transfer asynchronously one column (column K+A_N) from the GPU to the CPU to balance work*/                
                 //// if(K+A_N<N) 
                 //// { 
                    ////ncols = min(nb, gpu_ncols); 
 
                    //////magma_schedule_set_task_priority(INT_MAX);

                    ////magma_insert_dgetmatrix_transpose(gpu_nrows, ncols, dAT(K+1,K+A_N), dAT_LD, A(K+1,K+A_N), A_LD, dAP, dAP_LD, colptr(K+A_N)); //blocking
                 //// }
              
              } 
         }

          //}
          /*iterate over the rest of the columns to update the trailing submatrix on the cpu*/ 
          for(J=K+1;J<=min(K+A_N-1, N-1);J++){ 
 
               ncols = min(nb, n - J*nb); 
 
               /*Set the priority max for column having the next panel (look ahead of deep 1),
               and process the rest of the update in a right looking way*/
               if(J==K+1)
                   magma_schedule_set_task_priority(INT_MAX -2 );
                  //// magma_schedule_set_task_priority(INT_MAX - N*K -1);
               else
                   magma_schedule_set_task_priority(INT_MAX -3 - J );//- N*K
                  //// magma_schedule_set_task_priority(INT_MAX - N*K -3 -J);
               //magma_schedule_set_task_priority(INT_MAX - J);

               /*dependency colptr(J): make sure column J is sent from GPU, and all previous update was done*/
               magma_insert_core_dlaswp(ncols, A(K,J), A_LD, ONE, nb, ipiv(K), ONE, colptr(J)); 
 
               magma_insert_core_dtrsm('L', 'L', 'N', 'U', nb, ncols, c_one, A(K,K), A_LD, A(K,J), A_LD, colptr(J)); 
 
             /*Compute the number of blocs rows to group together before the update. To avoid scheduling overhead.*/
              B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
              B_rows = max(B_rows,4); /*maximun of 4*/ 
              //B_rows = max(B_rows,1);
              //printf("B_rows:%d\n",B_rows);
               for(I=K+1; I<=M-1; I+=B_rows){ 
     
                    nrows = min(B_rows*nb, m-I*nb); 
                    
                    /*dep colptr(K):make sure the panel is not overwritten or swapped since dgemm use A[I,K]*/
                    /*dep colptr(J): Gather all dgemm on one column and create dependencies with previous dgemm and the next panel*/
                    magma_insert_core_dgemm('N','N', nrows, ncols, nb, c_neg_one, A(I,K), A_LD, A(K,J), A_LD, c_one, A(I,J), A_LD, colptr(K), colptr(J)); 
               } 
 
                

               if(J==K+1) 
               { 
                    /*Look ahead and insert the next panel*/ 
                    nrows = m - (K+1)*nb; 
                    ncols = min(nb, n - (K+1)*nb); 
 
                    /*Schedule the next panel factorization with maximum priority*/ 
                    magma_schedule_set_task_priority(INT_MAX -1);
                    ///magma_schedule_set_task_priority(0); //TEST: testing prio_0
                   //// magma_schedule_set_task_priority(INT_MAX - N*K - 2);

                   magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1)); 
                   // magma_insert_core_dgetrf(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, colptr(K+1)); 
 
                    /*Transfer the factorized panel in the buffer of GPU (dlpanel)*/

                     for(dd=0;dd<num_gpus;dd++){
                         //&(dlpanel[dd][dlpanel_LD*nb*(K+1)])
                        ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel[dd]);
                        magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel(dd,K+1));//, dlpanel[dd]
                     }

                    /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/ 
                    U_K = max(0, K+1 - A_N +1); 
                    nrows = m - U_K*nb; 
 
                    ///magma_schedule_set_task_priority(INT_MAX);
                    /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/ 
                    ///magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP, dAP_LD, A(K+1,K+1), dAT(K+1,K+1)); 
                    //magma_insert_dev_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
 
                    
                     /*Transfer also the factorized panel on its right position in the final matrix (transposition included)*/ 
                     /*TODO: this may use cudaMemcpyDeviceToDevice and initiate the transfer from dlpanel*/
                     dd = GID(K+1);
                     ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(K+1,K+1));
                     magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(0,K+1));///

               } 
 
          } 
 
            /*compute the next number of blocks colums */
          A_NP1 = NSplit(N-(K+1), dcpu) - NSplit(N-K, dcpu) + 1;

           /*Transfer asynchronously (A_NP1 - A_N) block column (column K+A_N) from the GPU to the CPU to balance work*/  
            /*Make sure this is inserted after all dgemm because it schedules to replace a current panel for the case A_N< N*/
          for(L=K+A_N;L<K+A_N+A_NP1;L++)
          {
               if(L<N) { 

                 /*Determine the device which own column K+A_N*/
                 dd = GID(L);

                 gpu_ncols = nr_local[dd];

                 ncols = min(nb, gpu_ncols); 
 
                 magma_schedule_set_task_priority(INT_MAX);

                 ///magma_insert_dev_dgetmatrix_transpose(dd, gpu_nrows, ncols, dlAT(K+1,K+A_N), dlAT_LD, A(K+1,K+A_N), A_LD, dlAP_get[dd], dlAP_LD, colptr(K+A_N)); //blocking
             
                 /*make sure the computations are done on stream 1 and send a block column on stream 2*/
                 magma_insert_dev_queue_sync(dd, mstream[dd][1], dlAT(K+1,L)); 
                 magma_insert_dev_dgetmatrix_async_transpose(dd, gpu_nrows, ncols, dlAT(K+1,L), dlAT_LD, A(K+1,L), A_LD, mstream[dd][2], dlAP_get[dd], dlAP_LD, colptr(L));
                 /*Update the remaining number of columns*/
                //// nr_local[dd]-=nb;

                  /*if A_N==1, there is no look-ahead, so insert the panel here*/
                   if((A_N==1) && (L==K+A_N)){
                      /*Look ahead and insert the next panel*/ 
                      nrows = m - (K+1)*nb; 
                      ncols = min(nb, n - (K+1)*nb); 
                      /*Schedule the next panel factorization with maximum priority*/ 
                      magma_schedule_set_task_priority(INT_MAX -1);
                            ///magma_schedule_set_task_priority(0); //TEST: testing prio_0
                           //// magma_schedule_set_task_priority(INT_MAX - N*K - 2);

                      magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1)); 
                      //magma_insert_core_dgetrf(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, colptr(K+1)); 
 
                      /*Transfer the factorized panel in the buffer of GPU (dlpanel)*/

                      for(dd=0;dd<num_gpus;dd++){
                          //&(dlpanel[dd][dlpanel_LD*nb*(K+1)])
                        ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel[dd]);
                        magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel(dd,K+1));//dlpanel[dd]
                      }

                            /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/ 
                      U_K = max(0, K+1 - A_N +1); 
                      nrows = m - U_K*nb; 
 
                            ///magma_schedule_set_task_priority(INT_MAX);
                            /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/ 
                            ///magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP, dAP_LD, A(K+1,K+1), dAT(K+1,K+1)); 
                            //magma_insert_dev_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));

                      /*Transfer also the factorized panel on its right position in the final matrix (transposition included)*/ 
                      /*TODO: this may use cudaMemcpyDeviceToDevice and initiate the transfer from dlpanel*/
                      dd = GID(K+1);
                      ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(K+1,K+1));
                      magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(0,K+1));///dlAT(K+1,K+1)
                   }
               }

         }
#if (dbglevel==10) 
  
  magma_schedule_barrier(); 
  ca_dbg_printMat(m, A_n, A, A_LD,"A"); 
  ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"dAT (Step K)"); 
  
  nrows = m - K*nb; 
  ncols = min(nb, n - K*nb);

  dd = GID(K);
  magma_setdevice(dd);
  ca_dbg_printMat_transpose_gpu(ncols, nrows, dlAT(K,K), dlAT_LD,"dAT(K,K)");

  if(K<=5){
  printf("Step K:%d done. Int to continue: ",K); scanf("%d", &I);
  }

#endif 
           

     } //Step K done
 /*Wait for all thread termination*/
 magma_schedule_barrier(); 


 /*make sure everything arrived*/ ///needed?
 for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
       magma_queue_sync(mstream[dd][0]);
       magma_queue_sync(mstream[dd][1]);
       magma_queue_sync(mstream[dd][2]);
}

     /*TODO: don't need quark here*/
     /*Perform a sequence of left swap on the matrix corresponding to the different panel*/ 
     for(K=1;K<=N-1;K++){ 
 
#if (dbglevel >=1)
    ca_trace_start();
#endif

    nrows = min(nb,m - K*nb);

    ncols = min(K*nb,n);

    for(dd=0;dd<=min(num_gpus-1, K-1);dd++){
        
        gpu_ncols = numcols2p(dd, ncols, nb, num_gpus); 

        J = dd;
        if(gpu_ncols>0){
            magma_setdevice(dd);
            //pthread_mutex_lock(&mutex_compute_stream);
            magmablasSetKernelStream(mstream[dd][1]);
            magmablas_dlaswp(gpu_ncols, dlAT(K, J), dlAT_LD, ONE, nrows, ipiv(K), ONE);
            //pthread_mutex_lock(&mutex_compute_stream);
        }

    }       
        

        

#if (dbglevel >=1)
ca_trace_end_1gpu('W');
#endif
     } 
     
#if (dbglevel==10) 
    ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"dAT after lswap"); 
#endif

/*Shutdown the scheduler*/
     magma_schedule_delete();

/*update permutation vector indexes*/ 
     for(K=1;K<=N-1;K++){ 
 
        nrows = min(nb, n-K*nb); 
        for(J=0;J<=nrows-1;J++){ 
            ipiv[K*nb+J] += K*nb; 
        } 
     } 

#if dbglevel>=1
    printf("[DBG] Time Factorization:%f\n",magma_wtime()-t1); 
    t1 = magma_wtime();
#endif

    /* 4. Transpose back the matrix in/out of place*/
    for(dd=0;dd<num_gpus;dd++){

        //n_local[dd] = numcols2p(dd, n, nb, num_gpus); //loc2p(dd, N, num_gpus)*nb;

        magma_setdevice(dd);
        magmablasSetKernelStream(mstream[dd][1]);
        magmablas_dtranspose2(dlA[dd], dlA_LD, dlAT[dd], dlAT_LD, n_local[dd], m);
    }


    for(dd=0;dd<num_gpus;dd++){ //needed
        magma_setdevice(dd);
        magmablasSetKernelStream(NULL);
    }

#if dbglevel>=1
    printf("[DBG] Time Final in/out of place transpose:%f\n",magma_wtime()-t1); 
    t1 = magma_wtime();

#endif



#if (dbglevel==10)     
    ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlA, dlA_LD,"dA = LU"); 
#endif 

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
       magma_queue_destroy(mstream[dd][0]);
       magma_queue_destroy(mstream[dd][1]);
       magma_queue_destroy(mstream[dd][2]);
    }

    //free(mstream);

    // printf("Step 4: time:%f\n",magma_wtime()-t1); 
// t1 = magma_wtime();
    free(n_local);
    free(nr_local);
//    free(k_local);
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        magma_free( dlAP_set[dd]); 
        magma_free( dlAP_get[dd]);
        magma_free(dlpanel[dd]);
        magma_free(dlAT[dd]);
    }

    //free(dlAP_set); 
    //free(dlAP_get);
    //free(dlpanel);
    free(dlAT);

#if dbglevel>=1
    printf("[DBG] Time memory free (dAP):%f\n",magma_wtime()-t1); 
    t1 = magma_wtime();
#endif

#if dbglevel>=1
    /*Finalize the tracing*/
    ca_dbg_trace_finalize();
    printf("[DBG] Time llog:%f\n",magma_wtime()-t1); 
#endif

    return *info; 
}   /* End of MAGMA_DGETRF_REC_ASYNC_WORK_GPU */
Exemple #15
0
  void magma_task_dev_dsetmatrix_async_transpose(Schedule* sched_obj)
 {
     magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *A_src;
    magma_int_t LDA;
    double *dA_dst;
    magma_int_t dA_LD;
    magma_queue_t stream1;

    double *dwork;
    magma_int_t dwork_LD;

//    double *tmpdA;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_10(sched_obj, deviceID, m,  nb, A_src,  LDA, dA_dst,  dA_LD, stream1, dwork, dwork_LD);

    magma_setdevice(deviceID);
    //magma_dsetmatrix_transpose( m,  nb, A_src,  LDA, dA_dst,  dA_LD);
      //task_setpanel(A_m, nb, A(0,K), A_LD, dA(0,K), dA_LD)

    /*make sure any operation on the device completed*/
    magma_queue_sync(compute_stream[deviceID]);

#if (dbglevel==10)
            ca_dbg_printMat(m, nb, A_src, LDA, "A before setMatrix");
            
            //ca_dbg_printMat_gpu(2, 2, dwork, dwork_LD, "dwork for testing");

            //cudaMemcpy(&tmpdA, dwork, sizeof(double*), cudaMemcpyHostToDevice);
            //ca_dbg_printMat_gpu(2, 2, tmpdA, dwork_LD, "dlAP_set[dd] for testing");

#endif
//            pthread_mutex_lock(&mutex_dAP);
    /*1. copy A to dwork: send to the GPU*/
    magma_dsetmatrix_async(m, nb, A_src, LDA, dwork, dwork_LD, stream1);
#if (dbglevel==10)
    magma_queue_sync(stream1);
    ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after setMatrix");
#endif

    magma_queue_sync(stream1); //make sure dwork is set before the transpose

    /*2.transpose dwork to dA*/
    magmablas_dtranspose2s(dA_dst, dA_LD, dwork, dwork_LD, m, nb, compute_stream[deviceID]);
//    pthread_mutex_unlock(&mutex_dAP);
#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(nb, m, dA_dst, dA_LD, "dA after setMatrix");
#endif

    

#if (dbglevel >=1)
ca_trace_end_gpu('T');
ca_trace_end_cpu('C');
#endif
 }
extern "C" magma_int_t 
magma_dgetrf_gpu_work_amc(
magma_int_t m, magma_int_t n,  
double *dA, magma_int_t dA_LD, 
magma_int_t *ipiv, magma_int_t *info,
/*workspace on the cpu side*/
double *AWORK, magma_int_t AWORK_LD, magma_int_t AWORK_n
) 
{ 
/*  -- MAGMA (version 1.5.0-beta3) -- 
       Univ. of Tennessee, Knoxville 
       Univ. of California, Berkeley 
       Univ. of Colorado, Denver 
       November 2011 
 
    Purpose 
    ======= 
 
    DGETRF_GPU_WORK_AMC computes an LU factorization of a general M-by-N matrix A 
    using partial pivoting with row interchanges. The technique used for the panel factorization
    is the parallel recursif LU (see lawn 259).
 
    The factorization has the form 
       A = P * L * U 
    where P is a permutation matrix, L is lower triangular with unit 
    diagonal elements (lower trapezoidal if m > n), and U is upper 
    triangular (upper trapezoidal if m < n). 
 
    This is the right-looking Level 3 BLAS version of the algorithm. 
 
    Arguments 
    ========= 
 
    M       (input) INTEGER 
            The number of rows of the matrix A.  M >= 0. 
 
    N       (input) INTEGER 
            The number of columns of the matrix A.  N >= 0. 
 
    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). 
            On entry, the M-by-N matrix to be factored. 
            On exit, the factors L and U from the factorization 
            A = P*L*U; the unit diagonal elements of L are not stored. 
 
    LDDA     (input) INTEGER 
            The leading dimension of the array A.  LDDA >= max(1,M). 
 
    IPIV    (output) INTEGER array, dimension (min(M,N)) 
            The pivot indices; for 1 <= i <= min(M,N), row i of the 
            matrix was interchanged with row IPIV(i). 
 
    INFO    (output) INTEGER 
            = 0:  successful exit 
            < 0:  if INFO = -i, the i-th argument had an illegal value 
                  or another error occured, such as memory allocation failed. 
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization 
                  has been completed, but the factor U is exactly 
                  singular, and division by zero will occur if it is used 
                  to solve a system of equations.

    =====================================================================    */ 
 
 
 
    double c_one     = MAGMA_D_ONE; 
    double c_neg_one = MAGMA_D_NEG_ONE; 
 
    int ONE = 1; 
 
    magma_int_t iinfo, nb; 
    magma_int_t mindim; 
    magma_int_t nrows, ncols; 
    //double *work; 
 
 
     magma_int_t dm_max, dn_max; 
     magma_int_t I, J, K, M, N, U_K; 
  
     //magma_int_t A_K; 
     double *dAT; 
     magma_int_t dAT_LD; 
      
      
     double *dAP_set,*dAP_get; 
     magma_int_t dAP_LD; 
      
     

     //magma_int_t nrows, ncols; 
     magma_int_t gpu_nrows, gpu_ncols; 
  
     int nbcores; /*Number of cores available for the whole factorization*/ 
     int panel_num_threads; /*Number of threads for the panel*/ 
     double dcpu; /*percentage of the matrix to allocate on the CPUs*/ 
  
    int B_rows;

    double t1;
    

     /* Recommanded dimension in the workspace*/ 
     int A_m, A_n, A_N, A_NMAX, A_LD;
     double *A;
#ifdef USE_CALU     
     int i_nrows;
#endif

     amc_args_t *args;
    /*magma_event_t *A_event;*/ /*Control bucket*/



    /* Check arguments */ 
    *info = 0; 
    if (m < 0) 
        *info = -1; 
    else if (n < 0) 
        *info = -2; 
    else if (dA_LD < max(1,m)) 
        *info = -4; 
    else if (AWORK_LD < max(1,m)) 
        *info = -5;

    if (*info != 0) { 
        magma_xerbla( __func__, -(*info) ); 
        return *info; 
    } 
 
    /* Quick return if possible */ 
    if (m == 0 || n == 0) 
        return *info; 

      
     /*Get parameters*/ 
    args = magma_amc_args_get_default();
     nb= args->nb;

     nbcores = args->P;  
     panel_num_threads = args->Pr; 
     dcpu = args->dcpu;

     /* Check and fix parameters */
     if(nb==0)
        nb     = magma_get_dgetrf_nb(m) ;/*magma dgetrf block size*/ 
    else
        nb = args->nb;

     if(nb>n) nb = n; 
     if(panel_num_threads>nbcores) panel_num_threads = nbcores;

     /* Compute the maximum number of panels we can store in the workspace*/
     A_NMAX = (int) (AWORK_n/ nb);

     /* Compute the recommanded number of columns for the cpu part*/
     A_n = (int) ceil(n*dcpu);

     /*Make sure we work with multiple of 32*/
     /*
     if(A_n%32!=0) {
         A_n = ((A_n + 31)/32)*32;
     }
     */

     /* Compute the recommanded number of panels for the cpu part*/
     A_N = (int) (A_n/ nb);
     
     /* Check if there are enough workspace. In case the user gave a workspace lower than the optimal*/
     /* NOTE: using small workspace may reduce performance*/
     if(A_N>A_NMAX){    
#if (dbglevel >=1)
        printf("[DBG_WARNING] Resizing buffer to feet user preferences. Recommanded:%d, Max given:%d\n",A_N, A_NMAX); 
#endif
        A_N = A_NMAX;
    }
      
     

     A = AWORK;
     A_m = m;
     A_LD = AWORK_LD;


#if (dbglevel >=1)
    /* Initialize the tracing*/
    ca_dbg_trace_init(nbcores,1); //nbcores + 1 GPU
#endif

#if (dbglevel >=1)
    t1 = magma_wtime();
#endif

     /*Transfer the first column block of the matrix from the GPU to the CPUs*/ 
    
    magma_dgetmatrix(A_m, A_n, dA, dA_LD, A, A_LD); 

#if (dbglevel >=1)
    printf("[DBG] Time First getmatrix: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif
#if (dbglevel==10)  
    ca_dbg_printMat(m, A_n, A, A_LD,"A after first getMatrix"); 
#endif


     /*Allocate a workspace for the panels transposition*/ 
     dAP_LD = m; 
     if(dAP_LD%32!=0) dAP_LD = ((dAP_LD + 31)/32)*32;/*Make dAP_LD multiple of 32*/

     if (MAGMA_SUCCESS != magma_dmalloc(&dAP_set, dAP_LD*nb)) { 
            *info = MAGMA_ERR_DEVICE_ALLOC; 
            return *info; 
    } 

     if (MAGMA_SUCCESS != magma_dmalloc(&dAP_get, dAP_LD*nb)) { 
            magma_free(dAP_set);
            *info = MAGMA_ERR_DEVICE_ALLOC; 
            return *info; 
    }


#if (dbglevel >=1)
    printf("[DBG] Time workspace memory alloc (dAP): %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /*Transpose the gpu part of the matrix in/out of place*/

    if ((m == n) ){  //&& (m % 32 == 0) && (dA_LD%32 == 0)
         dAT = dA;
         dAT_LD= dA_LD;
       magmablas_dtranspose_inplace(m, dAT, dAT_LD); 
    } 
    else { 
        
      
     dm_max = m;
     dn_max = n;

    /*Make sure m and n are multiple of 32*/
     
     if(dm_max%32!=0) dm_max = ((dm_max + 31)/32)*32;
     if(dn_max%32!=0) dn_max = ((dn_max + 31)/32)*32;
     
     if (MAGMA_SUCCESS != magma_dmalloc(&dAT, dm_max*dn_max )) { 
        magma_free(dAP_set); 
        magma_free(dAP_get);
        *info = MAGMA_ERR_DEVICE_ALLOC; 
        return *info; 
     }

     dAT_LD = dn_max; 
     magmablas_dtranspose2( dAT, dAT_LD, dA, dA_LD, m, n );  
   }

#if (dbglevel >=1)
    printf("[DBG] Time First transposition: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10) 
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD,"matrix dAT to factorize"); 
#endif



     /* Compute the maximun number of steps*/
     mindim = min(m, n); 
     M      = (int) ceil( (double) m / nb); 
     N      = (int) ceil( (double) mindim / nb); /*N = n/nb*/


     /*Let the asynchronous algorithm begin*/ 
     
#if (dbglevel >=1)
     printf("Starting recursif code ... m:%d, n:%d, nb:%d, nbcores:%d, N:%d, A_N:%d\n", m, n, nb, nbcores, N, A_N); //Summary
#endif



     /*Initialize the scheduler*/ 
     magma_schedule_init(nbcores, 1); 


     K = 0; 
#ifdef USE_CALU
     /*initialize calu environment*/
     core_dtslu_alloc(panel_num_threads, A_m, nb);
     core_dtslu_init(panel_num_threads);

     /*Initialize rows indice: required*/
     for(I=0;I<A_m;I++) ipiv[I]=I;
#else
     /*initialize parallel recursif panel environment*/
     CORE_zgetrf_reclap_init();
#endif


     magma_schedule_set_task_priority(INT_MAX-1);

     /*Schedule the first panel factorization*/ 
#ifdef USE_CALU
     magma_insert_core_dtslu(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));

     B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
     B_rows = max(B_rows,4); /*maximun of 4*/ 
     //B_rows = max(B_rows,1);

     for(I=K+1; I<=M-1; I+=B_rows){ 
     
        i_nrows = min(B_rows*nb, m-I*nb);
        magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, nb, c_one, A(0,K), A_LD, A(I,K), A_LD, colptr(K));
     }
#else
     magma_insert_core_dgetrf_rec(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));  
#endif
 
     /*Transfer the factorized panel to the GPU (transposition included)*/ 
     magma_insert_dsetmatrix_transpose(A_m, nb, A(0,K), A_LD, dAT(0,K), dAT_LD, dAP_set, dAP_LD, colptr(K), dAT(K,K)); 
 
 
#if (dbglevel==10) 
    magma_schedule_barrier(); 
    ca_dbg_printMat(m, nb, A(0,0), A_LD,"A(0,0)"); 
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD,"dA"); 
#endif 
 
     for(K=0;K<=N-1;K++){ 
     
          /*insert the coarse update of the trailing submatrix corresponding to panel K to the GPU, that is submatrix A[K+1:M, K+1+d-1:N]*/ 

          gpu_nrows = m - (K+1)*nb; 
          gpu_ncols = n - (K+1+A_N-1)*nb; 
 
          if(gpu_ncols >0) 
          { 
 
              /*NOTE: Here we work on the matrix transpose*/

              /*Set the priority max for the GPU computations*/
              magma_schedule_set_task_priority(INT_MAX);
             //// magma_schedule_set_task_priority(INT_MAX - N*K);

              /*schedule a swap of the trailing submatrix in the gpu using ipiv[K]*/ 
              /*dependency dAT((K+1)-1, (K+A_N)-1) = dAT(K, K+A_N-1) with previous dgemm*/
              magma_insert_dlaswp(gpu_ncols, dAT(K, K+A_N), dAT_LD, ONE, nb, ipiv(K), ONE, dAT(K, K+A_N-1)); /*non blocking*/                  
              //printf("debug barrier\n");
              //magma_schedule_barrier();
              magma_insert_dtrsm(MagmaRight,  MagmaUpper, MagmaNoTrans, MagmaUnit, gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD);/*non blocking*/ 
 
              /* aij^T = aij^T - (lik.ukj)^T = aij^T - ukj^T.lik^T*/ 
              magma_insert_dgemm(MagmaNoTrans,MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD);/*non blocking*/    
       
          } 
          
          /*iterate over the rest of the columns to update the trailing submatrix on the cpu*/ 
          for(J=K+1;J<=min(K+A_N-1, N-1);J++){ 
 
               ncols = min(nb, n - J*nb); 
 
               /*Set the priority max for column having the next panel (look ahead of deep 1),
               and process the rest of the update in a right looking way*/
               if(J==K+1)
                   magma_schedule_set_task_priority(INT_MAX -2 );
                  //// magma_schedule_set_task_priority(INT_MAX - N*K -1);
               else
                   magma_schedule_set_task_priority(INT_MAX -3 - J );//- N*K


               /*dependency colptr(J): make sure column J is sent from GPU, and all previous update was done*/
               magma_insert_core_dlaswp(ncols, A(K,J), A_LD, ONE, nb, ipiv(K), ONE, colptr(J)); 
 
               magma_insert_core_dtrsm('L', 'L', 'N', 'U', nb, ncols, c_one, A(K,K), A_LD, A(K,J), A_LD, colptr(J)); 
 
             /*Compute the number of blocs rows to group together before the update. To avoid scheduling overhead.*/
              B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
              //B_rows = max(B_rows,4); /*maximun of 4*/ 
              //B_rows = max(B_rows,1);

               for(I=K+1; I<=M-1; I+=B_rows){ 
     
                    nrows = min(B_rows*nb, m-I*nb); 
                    
                    /*dep colptr(K):make sure the panel is not overwritten or swapped since dgemm use A[I,K]*/
                    /*dep colptr(J): Gather all dgemm on one column and create dependencies with previous dgemm and the next panel*/
                    magma_insert_core_dgemm('N','N', nrows, ncols, nb, c_neg_one, A(I,K), A_LD, A(K,J), A_LD, c_one, A(I,J), A_LD, colptr(K), colptr(J)); 
               } 

               if(J==K+1) 
               { 
                    /*Look ahead and insert the next panel*/ 
                    nrows = m - (K+1)*nb; 
                    ncols = min(nb, n - (K+1)*nb); 
 
                    /*Schedule the next panel factorization with maximum priority*/ 
                    magma_schedule_set_task_priority(INT_MAX -1);
#ifdef USE_CALU
                    magma_insert_core_dtslu(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                    B_rows = (int) ceil((double) (M-(K+1)-1)/panel_num_threads);
                    B_rows = max(B_rows,4); /*maximun of 4*/ 
                     //B_rows = max(B_rows,1);

                     for(I=K+2; I<=M-1; I+=B_rows){ 
     
                        i_nrows = min(B_rows*nb, m-I*nb);
                        magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, ncols, c_one, A(K+1,K+1), A_LD, A(I,K+1), A_LD, colptr(K+1));
                        //dtrsm("R", "U", "N", "N", &nrowPblock, &panel_NB, &dONE, &(A[M*pos+pos]), &LDA, &(A[lpos]), &LDA); //
                     }

#else
                   magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1)); 
#endif 
 
                    /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/ 
                    U_K = max(0, K+1 - A_N +1); 
                    nrows = m - U_K*nb; 
 
                    /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/ 
                    magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
 
               } 
 
          } 
 
           /*Transfer asynchronously one column (column K+A_N) from the GPU to the CPU to balance work*/  
            /*Make sure this is inserted after all dgemm before it schedules to replace a current panel in case A_N< N*/
           if(K+A_N<N) { 
              ncols = min(nb, gpu_ncols); 
 
              magma_schedule_set_task_priority(INT_MAX);

              magma_insert_dgetmatrix_transpose(gpu_nrows, ncols, dAT(K+1,K+A_N), dAT_LD, A(K+1,K+A_N), A_LD, dAP_get, dAP_LD, colptr(K+A_N)); //blocking
           

          /*if A_N==1 there is no look-ahead, so insert the panel here*/
           if(A_N==1){
              /*Look ahead and insert the next panel*/ 
              nrows = m - (K+1)*nb; 
              ncols = min(nb, n - (K+1)*nb); 
              /*Schedule the next panel factorization with maximum priority*/ 
              magma_schedule_set_task_priority(INT_MAX -1);

#ifdef USE_CALU
                magma_insert_core_dtslu(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1)); 

                B_rows = (int) ceil((double) (M-(K+1)-1)/panel_num_threads);
                B_rows = max(B_rows,4); /*maximun of 4*/ 
                //B_rows = max(B_rows,1);

                for(I=K+2; I<=M-1; I+=B_rows){ 
     
                    i_nrows = min(B_rows*nb, m-I*nb);
                    magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, ncols, c_one, A(K+1,K+1), A_LD, A(I,K+1), A_LD, colptr(K+1));
                    //dtrsm("R", "U", "N", "N", &nrowPblock, &panel_NB, &dONE, &(A[M*pos+pos]), &LDA, &(A[lpos]), &LDA); //
                }

#else
                 magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1)); 
                 //magma_insert_core_dgetrf(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, colptr(K+1));
#endif

               
 
                /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/ 
                U_K = max(0, K+1 - A_N +1); 
                nrows = m - U_K*nb; 
 
                    ///magma_schedule_set_task_priority(INT_MAX);
                    /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/ 
                magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
           }
         }
#if (dbglevel==10)     
    magma_schedule_barrier(); 
    ca_dbg_printMat(m, A_n, A, A_LD,"A"); 
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD,"dA"); 
#endif 
         
     } //Step K done

 /*Wait for all thread termination*/
 magma_schedule_barrier(); 

     /*TODO: don't need quark here*/
     /*Perform a sequence of left swap on the matrix corresponding to the different panel*/ 
     for(K=1;K<=N-1;K++){ 
 
#if (dbglevel >=1)
    ca_trace_start();
#endif
        nrows = min(nb,m - K*nb); 
 
        ncols = min(K*nb,n); 

        /*dep dAT(K-1): Make sure the last swap is completed, and also the dgemm using the panel*/

       // magma_insert_dlaswp(ncols, dAT(K, 0), dAT_LD, ONE, nrows, ipiv(K), ONE, dAT(K-1,0)); 
        magmablas_dlaswp(ncols, dAT(K, 0), dAT_LD, ONE, nrows, ipiv(K), ONE);

#if (dbglevel >=1)
ca_trace_end_1gpu('W');
#endif
     } 
 
     
        
/*Shutdown the scheduler*/
     magma_schedule_delete();

/*update permutation vector indexes*/ 
     for(K=1;K<=N-1;K++){ 
 
        nrows = min(nb, n-K*nb); 
        for(J=0;J<=nrows-1;J++){ 
            ipiv[K*nb+J] += K*nb; 
        } 
     } 

#if dbglevel>=1
    printf("[DBG] Time Factorization:%f\n",magma_wtime()-t1); 
    t1 = magma_wtime();
#endif


 /*No need for synchro, since dtranspose is blocking*/
   if (m == n) {
      magmablas_dtranspose_inplace(m, dAT, dAT_LD); //( m, dAT, dAT_LD ); 
      dA = dAT; 
   } 
   else { 
      magmablas_dtranspose2( dA, dA_LD, dAT, dAT_LD, n, m ); 
      magma_free( dAT ); 
   } 

#if dbglevel>=1
    printf("[DBG] Time Final in/out of place transpose:%f\n",magma_wtime()-t1); 
    t1 = magma_wtime();
#endif


#ifdef USE_CALU
    core_dtslu_free();
#endif 

   magma_free( dAP_set ); 
   magma_free( dAP_get );

#if dbglevel>=1
    printf("[DBG] Time memory free (dAP):%f\n",magma_wtime()-t1); 
    t1 = magma_wtime();
#endif

#if (dbglevel==10)     
    ca_dbg_printMat_transpose_gpu(m, n, dA, dA_LD,"dA = LU"); 
#endif 


#if dbglevel>=1
    /*Finalize the tracing*/
    ca_dbg_trace_finalize();
    printf("[DBG] Time llog:%f\n",magma_wtime()-t1); 
#endif

    return *info; 
}   /* End of MAGMA_DGETRF_REC_ASYNC_WORK_GPU */