/* Scheduler task wrapper: apply row interchanges (DLASWP) on the current GPU.
 * All arguments are packed inside sched_obj by the scheduler and unpacked here;
 * parameter semantics follow the LAPACK dlaswp convention (n columns, swap rows
 * i1..i2 using pivot vector ipiv with increment inci). */
void magma_task_dlaswp(Schedule* sched_obj )
{
    magma_int_t n;         /* number of columns of the matrix to swap */
    double *dA;            /* device matrix */
    magma_int_t lda;       /* leading dimension of dA */
    magma_int_t i1;        /* first pivot row of the range (1-based) */
    magma_int_t i2;        /* last pivot row of the range */
    magma_int_t *ipiv;     /* pivot indices */
    magma_int_t inci;      /* increment between consecutive ipiv entries */

#if (dbglevel >=1)
    ca_trace_start();
#endif

    /* Unpack the 7 task arguments in the exact order they were packed. */
    schedule_unpack_args_7(sched_obj,n, dA, lda, i1, i2, ipiv, inci);

    magmablas_dlaswp( n, dA, lda, i1, i2, ipiv, inci );

    //magma_task_dlaswp(gpu_ncols, dAT(K,K+A_N), dAT_LD, c_one, nb, &ipiv[K], c_one);

#if (dbglevel >=1)
    ca_trace_end_1gpu('W');
    ca_trace_end_cpu('C');
#endif
}
/* Scheduler task wrapper: apply row interchanges (DLASWP) on a specific device.
 * Unpacks deviceID plus the usual dlaswp arguments from sched_obj, selects the
 * device, and runs the swap on that device's compute stream. The mutex guards
 * magmablasSetKernelStream, which is not thread safe (see note in the setup
 * code of the factorization routine below). */
void magma_task_dev_dlaswp(Schedule* sched_obj )
{
    magma_int_t deviceID;  /* target GPU */
    magma_int_t n;         /* number of columns to swap */
    double *dA;            /* device matrix */
    magma_int_t lda;       /* leading dimension of dA */
    magma_int_t i1;        /* first pivot row of the range (1-based) */
    magma_int_t i2;        /* last pivot row of the range */
    magma_int_t *ipiv;     /* pivot indices */
    magma_int_t inci;      /* increment between consecutive ipiv entries */

#if (dbglevel >=1)
    ca_trace_start();
#endif

    /* Unpack the 8 task arguments in the exact order they were packed. */
    schedule_unpack_args_8(sched_obj, deviceID, n, dA, lda, i1, i2, ipiv, inci);

    magma_setdevice(deviceID);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(n, n, dA, lda, "A(n,n) before magma_dlaswp");
#endif

    /* Serialize stream selection + kernel launch: magmablasSetKernelStream is
     * global state shared by all worker threads. */
    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    magmablas_dlaswp( n, dA, lda, i1, i2, ipiv, inci );
    pthread_mutex_unlock(&mutex_compute_stream);

    //task_magma_dlaswp(gpu_ncols, dAT(K,K+A_N), dAT_LD, c_one, nb, &ipiv[K], c_one);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(n, n, dA, lda, "A(n,n) after magma_dlaswp");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('W');
    ca_trace_end_cpu('C');
#endif
}
/**
    Purpose
    -------
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input.  GPU memory is allocated
    in the routine.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    It uses 2 queues to overlap communication and computation.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf(
    magma_int_t m, magma_int_t n,
    double *A, magma_int_t lda,
    magma_int_t *ipiv,
    magma_int_t *info)
{
    /* Indexing macros. clBLAS passes (buffer, offset) pairs; the CUDA path
     * uses plain pointer arithmetic. Blocks are addressed in units of nb. */
#ifdef HAVE_clBLAS
    #define dA(i_, j_)   dA, ((i_)*nb + (j_)*nb*ldda + dA_offset)
    #define dAT(i_, j_)  dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset)
    #define dwork(i_)    dwork, (i_)
#else
    #define dA(i_, j_)  ( dA + (i_)*nb + (j_)*nb*ldda)
    #define dAT(i_, j_) ( dAT + (i_)*nb*lddat + (j_)*nb)
    #define dwork(i_)   (dwork + (i_))
#endif

    // Constants
    const double c_one     = MAGMA_D_ONE;
    const double c_neg_one = MAGMA_D_NEG_ONE;

    // Local variables
    double *work;                      // host panel buffer (set to alias A below)
    magmaDouble_ptr dA, dAT, dwork;    // device: matrix, its transpose, panel workspace
    magma_int_t iinfo, nb;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    nb = magma_get_dgetrf_nb( m, n );

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code. */
        lapackf77_dgetrf( &m, &n, A, &lda, ipiv, info );
    }
    else {
        /* Use hybrid blocked code. */
        magma_int_t maxm, maxn, ldda, lddat, maxdim;
        magma_int_t i, j, rows, cols, s = min(m, n)/nb;   // s = number of full nb-panels

        /* Round dimensions up to a multiple of 32 for aligned device storage. */
        maxm = magma_roundup( m, 32 );
        maxn = magma_roundup( n, 32 );
        maxdim = max( maxm, maxn );

        lddat = maxn;
        ldda  = maxm;

        /* set number of GPUs */
        magma_int_t ngpu = magma_num_gpus();
        if ( ngpu > 1 ) {
            /* call multi-GPU non-GPU-resident interface */
            magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }

        magma_queue_t queues[2] = { NULL, NULL };
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queues[0] );
        magma_queue_create( cdev, &queues[1] );

        /* check the memory requirement */
        size_t mem_size = magma_queue_mem_size( queues[0] );
        mem_size /= sizeof(double);

        /* h = number of extra nb-wide panels of headroom to reserve;
         * NB = widest sub-matrix (in columns) that fits in 80% of device memory. */
        magma_int_t h = 1+(2+ngpu);
        magma_int_t ngpu2 = ngpu;
        magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
        if ( ngr_nb_char != NULL )
            NB = max( nb, min( NB, atoi(ngr_nb_char) ) );

        if ( ngpu > ceil((double)NB/nb) ) {
            ngpu2 = (magma_int_t)ceil((double)NB/nb);
            h = 1+(2+ngpu2);
            NB = (magma_int_t)(0.8*mem_size/maxm - h*nb);
        }

        if ( ngpu2*NB < n ) {
            /* require too much memory, so call non-GPU-resident version */
            magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
            return *info;
        }

        work = A;   /* panels are factored on the host directly in A */

        if (maxdim*maxdim < 2*maxm*maxn) {
            // if close to square, allocate square matrix and transpose in-place
            // dwork is nb*maxm for panel, and maxdim*maxdim for A
            if (MAGMA_SUCCESS != magma_dmalloc( &dwork, nb*maxm + maxdim*maxdim )) {
                /* alloc failed so call non-GPU-resident version */
                magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;

            ldda = lddat = maxdim;
            magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );

            dAT = dA;
            magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
        }
        else {
            // if very rectangular, allocate dA and dAT and transpose out-of-place
            // dwork is nb*maxm for panel, and maxm*maxn for A
            if (MAGMA_SUCCESS != magma_dmalloc( &dwork, (nb + maxn)*maxm )) {
                /* alloc failed so call non-GPU-resident version */
                magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            dA = dwork + nb*maxm;

            magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] );

            if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) {
                /* alloc failed so call non-GPU-resident version */
                magma_free( dwork );
                magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info );
                return *info;
            }
            magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] );
        }

        /* Factor the first panel on the CPU (matrix was just uploaded above). */
        lapackf77_dgetrf( &m, &nb, work, &lda, ipiv, &iinfo );

        for( j = 0; j < s; j++ ) {
            // get j-th panel from device
            cols = maxm - j*nb;

            if (j > 0) {
                /* Download panel j (transposed back to column-major) on queue 1
                 * while queue 0 keeps updating the trailing matrix (lookahead). */
                magmablas_dtranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] );
                magma_queue_sync( queues[0] );

                magma_dgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] );

                /* Finish the update of the trailing matrix with panel j-1. */
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat, queues[0] );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,   j-1), lddat,
                             c_one,     dAT(j,   j+1), lddat, queues[0] );

                // do the cpu part
                rows = m - j*nb;
                magma_queue_sync( queues[1] );   /* wait for panel download */
                lapackf77_dgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo );
            }

            /* Record first singularity, offset into the global matrix. */
            if (*info == 0 && iinfo > 0)
                *info = iinfo + j*nb;

            // put j-th panel onto device
            magma_dsetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] );

            /* Convert panel-local pivots to global row indices, then apply
             * the swaps to the whole transposed matrix. */
            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] );

            magma_queue_sync( queues[1] );       /* panel upload complete */

            magmablas_dtranspose( cols, nb, dwork(0), cols, dAT(j,j), lddat, queues[0] );

            // do the small non-parallel computations (next panel update)
            if (s > (j+1)) {
                /* Narrow update: only the next panel's nb columns (lookahead). */
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
            else {
                /* Last full panel: update the entire remaining trailing matrix. */
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j ), lddat,
                                    dAT(j, j+1), lddat, queues[0] );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queues[0] );
            }
        }

        /* Factor the final (possibly partial) panel of width nb0 on the CPU. */
        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;
            cols = maxm - s*nb;

            magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] );
            magma_dgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] );
            magma_queue_sync( queues[0] );

            // do the cpu part
            lapackf77_dgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo );
            if (*info == 0 && iinfo > 0)
                *info = iinfo + s*nb;

            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] );

            // put j-th panel onto device
            magma_dsetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] );
            magmablas_dtranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] );

            /* Final triangular solve for the columns right of the last panel. */
            magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s, s),     lddat,
                                dAT(s, s)+nb0, lddat, queues[0] );
        }

        // undo transpose
        if (maxdim*maxdim < 2*maxm*maxn) {
            magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] );
            magma_dgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] );
        }
        else {
            magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] );
            magma_dgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] );
            magma_free( dAT );
        }
        magma_free( dwork );

        magma_queue_destroy( queues[0] );
        magma_queue_destroy( queues[1] );
    }

    return *info;
} /* magma_dgetrf */
/* Asynchronous multi-GPU LU factorization (AMC v3): the CPU factors panels
 * with a parallel recursive LU while the GPUs update the trailing submatrix
 * of the transposed, 1-D block-cyclically distributed matrix. Work is driven
 * by a dynamic task scheduler (magma_insert_* / magma_schedule_*). */
extern "C" magma_int_t magma_dgetrf_mgpu_work_amc_v3(magma_int_t num_gpus,
magma_int_t m, magma_int_t n,
double **dlA, magma_int_t dlA_LD,
magma_int_t *ipiv, magma_int_t *info,
/*workspace on the cpu side*/
double *AWORK, magma_int_t AWORK_LD, magma_int_t AWORK_n
)
{
/*  -- MAGMA (version 1.5.0-beta3) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2011

    Purpose
    =======
    DGETRF_REC_ASYNC computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges. The technique used for the
    panel factorization is the parallel recursive LU (see lawn 259).

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========
    NUM_GPUS
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA    (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    int ONE = 1;

    magma_int_t iinfo, nb;
    magma_int_t mindim;
    magma_int_t nrows, ncols;
    //double *work;

    magma_int_t dm_max, dn_max;      /* device dimensions rounded up to x32 */

    magma_int_t I, J, K, M, N, U_K, L;

    //magma_int_t A_m, A_n, A_N;
    //magma_int_t Am_max, An_max;
    //magma_int_t A_nb;
    //magma_int_t A_K;

    double **dlAT;                   /* per-GPU transposed local matrix */
    magma_int_t dlAT_LD;

    double *dlAP_get[MagmaMaxGPUs];  /* per-GPU transpose buffer, GPU -> CPU */
    //*dlAP_set[MagmaMaxGPUs]
    double *dlAP_set[MagmaMaxGPUs];  /* per-GPU transpose buffer, CPU -> GPU */
    magma_int_t dlAP_LD;

    double *dlpanel[MagmaMaxGPUs];   /* per-GPU copy of the current panel */
    magma_int_t dlpanel_LD;

    int *n_local, *nr_local;

    //magma_int_t nrows, ncols;
    magma_int_t gpu_nrows, gpu_ncols;

    int nbcores;           /*Number of cores available for the whole factorization*/
    int panel_num_threads; /*Number of threads for the panel*/
    double dcpu;           /*percentage of the matrix to allocate on the CPUs*/

    int B_rows;

    double t1;

    /*Workspace*/
    // magma_int_t AWORK_NMAX;
    // magma_int_t AWORK_m, AWORK_n, AWORK_N;

    /* Recommended dimension in the workspace*/
    int A_m, A_n, A_N, A_NMAX, A_LD;
    int A_NP1;
    double *A;

    amc_args_t *args;
    /*magma_event_t *A_event;*/ /*Control bucket*/

    magma_queue_t mstream[MagmaMaxGPUs][3]; /*0: H2D, 1: compute, 2:D2H*/
    int dd;

    // double *tmpdA;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (dlA_LD < max(1,m))
        *info = -4;
    else if (AWORK_LD < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /*Get parameters*/
    args = magma_amc_args_get_default();
    nb= args->nb;
    nbcores = args->P;
    panel_num_threads = args->Pr;
    dcpu = args->dcpu;

    /* Check and fix parameters */
    if(nb==0)
        nb = magma_get_dgetrf_nb(m) ;/*magma dgetrf block size*/
    else
        nb = args->nb;

    if(nb>n)
        nb = n;

    if(panel_num_threads>nbcores)
        panel_num_threads = nbcores;

    /*check the buffer size*/
    if(AWORK_n<nb){
        printf("Not enough buffer. Should be greater than the block size: %d\n", nb);
        exit(1);
    }

    /* Compute the number of blocks columns to factorize*/
    N  = (int) ceil( (double) min(m, n) / nb);

    /* Compute the maximum number of panels we can store in the workspace*/
    A_NMAX = (int) (AWORK_n/ nb);

    /*Compute the recommended number of panels for the cpu part*/
    A_N = NSplit(N, dcpu);

    /* Compute the recommended number of columns for the cpu part*/
    A_n = A_N*nb;//(int) ceil(n*dcpu);

    //if(A_n<nb)
    //    A_n = nb;//make sure workspace has at least one block column

    /*Make sure we work with multiple of 32*/
    /*
    if(A_n%32!=0) {
        A_n = ((A_n + 31)/32)*32;
    }
    */

    /* Compute the recommended number of panels for the cpu part*/
    // A_N = (int) (A_n/ nb);

    /* Check if there are enough workspace. In case the user gave a workspace lower than the optimal*/
    /* NOTE: using small workspace may reduce performance*/
    if(A_N>A_NMAX){
#if (dbglevel >=1)
        printf("[DBG_WARNING] Resizing buffer to feet user preferences. Recommanded:%d, Max given:%d\n",A_N, A_NMAX);
#endif
        A_N = A_NMAX;

        /*Make A_n a multiple of nb*/
        A_n = A_N*nb;
    }

    A = AWORK;
    A_m = m;
    A_LD = AWORK_LD;

#if (dbglevel >=1)
    /* Initialize the tracing*/
    ca_dbg_trace_init(nbcores,num_gpus); //nbcores + 1 GPU
#endif

#if (dbglevel >=1)
    t1 = magma_wtime();
#endif

    /* create the streams */
    //mstream = (magma_queue_t *) malloc(num_gpus*sizeof(magma_queue_t));

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd); //required
        magma_queue_create(&mstream[dd][0]);
        magma_queue_create(&mstream[dd][1]);
        magma_queue_create(&mstream[dd][2]);

        /*Set the stream for internal computations*/
        //magmablasSetKernelStream(0); /*Use 0 instead of mstream[dd][1], MagmasetkernelStream is not thread safe*/
        /*TODO: mae it safe*/
        //task_dev_set_compute_stream(dd, mstream[dd][1]);
        magma_task_dev_set_compute_stream(dd, 0); //set to mstream 1 later
    }

    /* Matrix dimension */
    dm_max = m;
    dn_max = n;

    /*Make sure m and n are multiple of 32*/
    if(dm_max%32!=0)
        dm_max = ((dm_max + 31)/32)*32;

    if(dn_max%32!=0)
        dn_max = ((dn_max + 31)/32)*32;

    /* local dimensions of the matrix for each GPU*/
    n_local = (int *) malloc(num_gpus*sizeof(int)); /*This do no change during the execution*/
    nr_local = (int *) malloc(num_gpus*sizeof(int)); /*Change after each update of the trailing submatrix*/

    for(dd=0;dd<num_gpus;dd++){
        n_local[dd] = numcols2p(dd, n, nb, num_gpus); //loc2p(dd, N, num_gpus)*nb;
        nr_local[dd] = n_local[dd];
    }

    /*Allocate a workspace for the panels transposition*/
    dlAP_LD = dm_max;
    //if(dAP_LD%32!=0) dAP_LD = ((dAP_LD + 31)/32)*32;/*Make dAP_LD multiple of 32*/

    ///dlAP_set = (double **) malloc(num_gpus*sizeof(double*));
    //dlAP_get = (double **) malloc(num_gpus*sizeof(double*));

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);

        if (MAGMA_SUCCESS != magma_dmalloc( &dlAP_set[dd], dlAP_LD*nb)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        /*
        if (MAGMA_SUCCESS != magma_dmalloc(&tmpdA, dlAP_LD*nb)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        */
        if ( magma_is_devptr(dlAP_set[dd] ) == 0 ) {
            fprintf( stderr, "ERROR: dlAP_set[dd] is host pointer.\n" );
            //exit(1);
        }

        //cudaMemcpy(dlAP_set[dd],&tmpdA,sizeof(double*), cudaMemcpyDeviceToHost);

#if (dbglevel==10)
        printf("0.4\n");
        //ca_dbg_printMat_gpu(2, 2, dlAP_set[dd], dlAP_LD, "dlAP_set[dd] for testing");

        //cudaMemcpy(&tmpdA, &dlAP_set[dd], sizeof(double*), cudaMemcpyHostToDevice);
        //ca_dbg_printMat_gpu(2, 2, tmpdA, dlAP_LD, "dlAP_set[dd] for testing");
        //printf("0.5: int to continue"); scanf("%d", &I);
#endif

        if (MAGMA_SUCCESS != magma_dmalloc(&dlAP_get[dd], dlAP_LD*nb)) {
            //magma_free(dlAP_set); //TODO: free all previous buffers
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    /* Workspace for the panels */
    // dlpanel = (double **)    malloc(num_gpus*sizeof(double*));

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        if (MAGMA_SUCCESS != magma_dmalloc(&dlpanel[dd], nb*dm_max)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    dlpanel_LD = nb;

    /*local matrix storage*/
    dlAT = (double **) malloc(num_gpus*sizeof(double*));

    dlAT_LD = n_local[0];
    if(dlAT_LD%32!=0)
        dlAT_LD = ((dlAT_LD + 31)/32)*32;

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        if (MAGMA_SUCCESS != magma_dmalloc(&dlAT[dd], dlAT_LD*dm_max )) {
            /* Roll back: free everything allocated on earlier devices. */
            for(J=0;J<dd;J++){
                magma_setdevice(J);
                magma_free( dlAP_set[J]);
                magma_free( dlAP_get[J]);
                magma_free(dlpanel[J]);
                magma_free(dlAT[J]);
            }
            //free(dlAP_set);
            //free(dlAP_get);
            //free(dlpanel);
            free(dlAT);
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

#if (dbglevel >=1)
    printf("[DBG] Time workspace memory alloc (dAP): %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /*1. Transfer the first column blocks of the matrix from the GPU to the CPUs.*/
    //magma_dgetmatrix(A_m, A_n, dA, dA_LD, A, A_LD);
    magma_dgetmatrix_1D_col_bcyclic(A_m, A_n, dlA, dlA_LD, A, A_LD, num_gpus, nb);

#if (dbglevel >=1)
    printf("[DBG] Time First getmatrix: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    printf("1.0\n");
    ca_dbg_printMat(A_m, A_n, A, A_LD,"A after first getMatrix");
    /*
    for(dd=0;dd<num_gpus;dd++){
        //Fill the matrix with zero for easy visualization of the matrix in debug mode
        for(I=0;I<dlAT_LD*dm_max;I++) dlAT[dd][I] = 0.0;
    }
    */
    // ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlAT, dlAT_LD,"matrix dAlT^T empty");
    // ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"matrix dAT empty");
    printf("2.0\n");
#endif

    /*Update the remaining number of columns on the GPUs.*/
    for(dd=0;dd<num_gpus;dd++){
        nr_local[dd] = nr_local[dd] - numcols2p(dd, A_n, nb, num_gpus); //;n_local[dd] - loc2p(dd, A_N, num_gpus)*nb;
    }

#if (dbglevel==10)
    ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlA, dlA_LD,"matrix dA to factorize");
    printf("3.0\n");
#endif

    /*2. Transpose the local matrix on every GPU (the algorithm works on A^T). */
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        //magmablasSetKernelStream(mstream[dd][1]);
        magmablas_dtranspose2(dlAT[dd], dlAT_LD, dlA[dd], dlA_LD, m, n_local[dd]);
    }

    ///
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        magma_task_dev_set_compute_stream(dd, mstream[dd][1]);
    }

#if (dbglevel >=1)
    printf("[DBG] Time First transposition: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    //ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"matrix dAT to factorize");
    /*
    dd = GID(A_N);
    magma_setdevice(dd);
    ca_dbg_printMat_transpose_gpu(nb, m, dlAT(0, A_N), dlAT_LD,"matrix dAT(0, A_N)");
    magma_setdevice(0);
    ca_dbg_printMat_transpose_gpu(m, nb, dlA(0, A_N), dlA_LD,"matrix dA(0, A_N)");
    */
    printf("4.0\n");
    printf("int to continue"); scanf("%d", &I);
#endif

    /*
#if (dbglevel==10)
    ca_dbg_printMat_transpose_mgpu(num_gpus, m, n_local, dlAT, dlAT_LD,"matrix dAT to factorize");
#endif
    */

    /* Compute the maximun number of steps*/
    mindim = min(m, n);
    M      = (int) ceil( (double) m / nb);
    N      = (int) ceil( (double) mindim / nb); /*N = n/nb*/

    /* 3. Let the asynchronous algorithm begin*/

#if (dbglevel >=1)
    printf("Starting recursif code ... m:%d, n:%d, nb:%d, nbcores:%d, N:%d, A_N:%d\n", m, n, nb, nbcores, N, A_N); //Summary
#endif

    /*Initialize the scheduler*/
    magma_schedule_init(nbcores, num_gpus);

    K = 0;

    /*initialize parallel recursive panel environment*/
    CORE_zgetrf_reclap_init();

    magma_schedule_set_task_priority(INT_MAX-1);

    /*Schedule the first panel factorization*/
    magma_insert_core_dgetrf_rec(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));
    //magma_insert_core_dgetrf(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, colptr(K));

    /*Transfer the factorized panel in the buffer of GPU (dlpanel)*/
    for(dd=0;dd<num_gpus;dd++){
        ///magma_insert_dev_dsetmatrix_transpose(dd, A_m, nb, A(0,K), A_LD, dlpanel(dd,K), dlpanel_LD, dlAP_set[dd], dlAP_LD, colptr(K), dlpanel[dd]);
        magma_insert_dev_dsetmatrix_async_transpose(dd, A_m, nb, A(0,K), A_LD, dlpanel(dd,K), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K), dlpanel(dd,K)); //dlpanel[dd]
    }

#if (dbglevel==10)
    magma_schedule_barrier();
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(nb, m, dlpanel(dd,K), dlpanel_LD,"dlpanel[dd] after setmatrix_async"); //dlpanel[dd]
    }
    printf("4.5: int to continue"); scanf("%d", &I);
#endif

    /*Transfer also the factorized panel on its right position in the final matrix (transposition included)*/
    /*TODO: this may use cudaMemcpyDeviceToDevice and initiate the transfer from dlpanel*/
    dd = GID(K);
    //magma_insert_dev_dsetmatrix_transpose(dd, A_m, nb, A(0,K), A_LD, dlAT(0,K), dlAT_LD, dlAP_set[dd], dlAP_LD, colptr(K), dlAT(0,K));
    magma_insert_dev_dsetmatrix_async_transpose(dd, A_m, nb, A(0,K), A_LD, dlAT(0,K), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K), dlAT(0,K));

#if (dbglevel==10)
    magma_schedule_barrier();
    ca_dbg_printMat(m, nb, A(0,0), A_LD,"A(0,0)");
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(nb, m, dlpanel[dd], dlpanel_LD,"dlpanel[dd] after setmatrix to dlAT");
    }
    ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"dlA");
    printf("5.0: int to continue"); scanf("%d", &I);
#endif

    /* Main loop: at step K, GPUs update columns K+A_N.. while the CPU updates
     * and factors the next A_N panels (lookahead of depth 1). */
    for(K=0;K<=N-1;K++){
        /*compute the new value of the cpu number of blocks*/
        A_N = NSplit(N-K, dcpu);

        /*insert the coarse update of the trailing submatrix corresponding to panel K to the GPU, that is submatrix A[K+1:M, K+1+d-1:N]*/

        //if(K==0) /*TODO: move outside loop*/
        //{

        /*NOTE: Here we work on the matrix transpose*/

        /*Set the priority max for the GPU computations*/
        magma_schedule_set_task_priority(INT_MAX);
        //// magma_schedule_set_task_priority(INT_MAX - N*K);

        gpu_nrows = m - (K+1)*nb; ///

        for(J=K+A_N;J<min(K+A_N+num_gpus,N);J++){
            /*Determine the device which own the first column of the group of columns to update*/
            dd = GID(J);

            /*Determine the number of columns to apply the update. */
            nr_local[dd] = numcols2p(dd, n - (K+1+A_N-1)*nb, nb, num_gpus);

            gpu_ncols = nr_local[dd]; //n - (K+1+A_N-1)*nb;

            if(gpu_ncols >0)
            {
                /*schedule a swap of the trailing submatrix in the gpus using ipiv[K]*/
                /*dependency dAT((K+1)-1, (K+A_N)-1) = dAT(K, K+A_N-1) with previous dgemm*/
                magma_insert_dev_dlaswp(dd, gpu_ncols, dlAT(K, J), dlAT_LD, ONE, nb, ipiv(K), ONE, dlAT(K, J-1)); /*non blocking*/

                //printf("debug barrier\n");
                //magma_schedule_barrier();

                //&(dlpanel[dd][dlpanel_LD*nb*K])
                magma_insert_dev_dtrsm(dd, MagmaRight,  MagmaUpper, MagmaNoTrans, MagmaUnit, gpu_ncols, nb, c_one, dlpanel(dd,K), dlpanel_LD, dlAT(K,J), dlAT_LD); /*non blocking*/

                /* aij^T = aij^T - (lik.ukj)^T = aij^T - ukj^T.lik^T*/
                //&(dlpanel[dd][dlpanel_LD*nb*(K+1)])
                magma_insert_dev_dgemm(dd, MagmaNoTrans,MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dlAT(K,J), dlAT_LD, dlpanel(dd,K+1), dlpanel_LD, c_one, dlAT(K+1,J), dlAT_LD); /*non blocking*/

                /*Transfer asynchronously one column (column K+A_N) from the GPU to the CPU to balance work*/
                //// if(K+A_N<N)
                //// {
                ////ncols = min(nb, gpu_ncols);

                //////magma_schedule_set_task_priority(INT_MAX);

                ////magma_insert_dgetmatrix_transpose(gpu_nrows, ncols, dAT(K+1,K+A_N), dAT_LD, A(K+1,K+A_N), A_LD, dAP, dAP_LD, colptr(K+A_N)); //blocking
                //// }
            }
        }
        //}

        /*iterate over the rest of the columns to update the trailing submatrix on the cpu*/
        for(J=K+1;J<=min(K+A_N-1, N-1);J++){
            ncols = min(nb, n - J*nb);

            /*Set the priority max for column having the next panel (look ahead of deep 1), and process the rest of the update in a right looking way*/
            if(J==K+1)
                magma_schedule_set_task_priority(INT_MAX -2 );
                //// magma_schedule_set_task_priority(INT_MAX - N*K -1);
            else
                magma_schedule_set_task_priority(INT_MAX -3 - J ); //- N*K
                //// magma_schedule_set_task_priority(INT_MAX - N*K -3 -J);
            //magma_schedule_set_task_priority(INT_MAX - J);

            /*dependency colptr(J): make sure column J is sent from GPU, and all previous update was done*/
            magma_insert_core_dlaswp(ncols, A(K,J), A_LD, ONE, nb, ipiv(K), ONE, colptr(J));

            magma_insert_core_dtrsm('L', 'L', 'N', 'U', nb, ncols, c_one, A(K,K), A_LD, A(K,J), A_LD, colptr(J));

            /*Compute the number of blocs rows to group together before the update. To avoid scheduling overhead.*/
            B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
            B_rows = max(B_rows,4); /*maximun of 4*/
            //B_rows = max(B_rows,1);
            //printf("B_rows:%d\n",B_rows);

            for(I=K+1; I<=M-1; I+=B_rows){
                nrows = min(B_rows*nb, m-I*nb);

                /*dep colptr(K):make sure the panel is not overwritten or swapped since dgemm use A[I,K]*/
                /*dep colptr(J): Gather all dgemm on one column and create dependencies with previous dgemm and the next panel*/
                magma_insert_core_dgemm('N','N', nrows, ncols, nb, c_neg_one, A(I,K), A_LD, A(K,J), A_LD, c_one, A(I,J), A_LD, colptr(K), colptr(J));
            }

            if(J==K+1)
            {
                /*Look ahead and insert the next panel*/
                nrows = m - (K+1)*nb;
                ncols = min(nb, n - (K+1)*nb);

                /*Schedule the next panel factorization with maximum priority*/
                magma_schedule_set_task_priority(INT_MAX -1);
                ///magma_schedule_set_task_priority(0); //TEST: testing prio_0
                //// magma_schedule_set_task_priority(INT_MAX - N*K - 2);

                magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));
                // magma_insert_core_dgetrf(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, colptr(K+1));

                /*Transfer the factorized panel in the buffer of GPU (dlpanel)*/
                for(dd=0;dd<num_gpus;dd++){
                    //&(dlpanel[dd][dlpanel_LD*nb*(K+1)])
                    ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel[dd]);
                    magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel(dd,K+1)); //, dlpanel[dd]
                }

                /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/
                U_K = max(0, K+1 - A_N +1);
                nrows = m - U_K*nb;

                ///magma_schedule_set_task_priority(INT_MAX);
                /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/
                ///magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP, dAP_LD, A(K+1,K+1), dAT(K+1,K+1));
                //magma_insert_dev_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));

                /*Transfer also the factorized panel on its right position in the final matrix (transposition included)*/
                /*TODO: this may use cudaMemcpyDeviceToDevice and initiate the transfer from dlpanel*/
                dd = GID(K+1);
                ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(K+1,K+1));
                magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(0,K+1)); ///
            }
        }

        /*compute the next number of blocks colums */
        A_NP1 = NSplit(N-(K+1), dcpu) - NSplit(N-K, dcpu) + 1;

        /*Transfer asynchronously (A_NP1 - A_N) block column (column K+A_N) from the GPU to the CPU to balance work*/
        /*Make sure this is inserted after all dgemm because it schedules to replace a current panel for the case A_N< N*/
        for(L=K+A_N;L<K+A_N+A_NP1;L++)
        {
            if(L<N) {
                /*Determine the device which own column K+A_N*/
                dd = GID(L);

                gpu_ncols = nr_local[dd];

                ncols = min(nb, gpu_ncols);

                magma_schedule_set_task_priority(INT_MAX);

                ///magma_insert_dev_dgetmatrix_transpose(dd, gpu_nrows, ncols, dlAT(K+1,K+A_N), dlAT_LD, A(K+1,K+A_N), A_LD, dlAP_get[dd], dlAP_LD, colptr(K+A_N)); //blocking

                /*make sure the computations are done on stream 1 and send a block column on stream 2*/
                magma_insert_dev_queue_sync(dd, mstream[dd][1], dlAT(K+1,L));
                magma_insert_dev_dgetmatrix_async_transpose(dd, gpu_nrows, ncols, dlAT(K+1,L), dlAT_LD, A(K+1,L), A_LD, mstream[dd][2], dlAP_get[dd], dlAP_LD, colptr(L));

                /*Update the remaining number of columns*/
                ////
                nr_local[dd]-=nb;

                /*if A_N==1, there is no look-ahead, so insert the panel here*/
                if((A_N==1) && (L==K+A_N)){
                    /*Look ahead and insert the next panel*/
                    nrows = m - (K+1)*nb;
                    ncols = min(nb, n - (K+1)*nb);

                    /*Schedule the next panel factorization with maximum priority*/
                    magma_schedule_set_task_priority(INT_MAX -1);
                    ///magma_schedule_set_task_priority(0); //TEST: testing prio_0
                    //// magma_schedule_set_task_priority(INT_MAX - N*K - 2);

                    magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));
                    //magma_insert_core_dgetrf(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, colptr(K+1));

                    /*Transfer the factorized panel in the buffer of GPU (dlpanel)*/
                    for(dd=0;dd<num_gpus;dd++){
                        //&(dlpanel[dd][dlpanel_LD*nb*(K+1)])
                        ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel[dd]);
                        magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(K+1, K+1), A_LD, dlpanel(dd, K+1), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel(dd,K+1)); //dlpanel[dd]
                    }

                    /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/
                    U_K = max(0, K+1 - A_N +1);
                    nrows = m - U_K*nb;

                    ///magma_schedule_set_task_priority(INT_MAX);
                    /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/
                    ///magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP, dAP_LD, A(K+1,K+1), dAT(K+1,K+1));
                    //magma_insert_dev_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));

                    /*Transfer also the factorized panel on its right position in the final matrix (transposition included)*/
                    /*TODO: this may use cudaMemcpyDeviceToDevice and initiate the transfer from dlpanel*/
                    dd = GID(K+1);
                    ///magma_insert_dev_dsetmatrix_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(K+1,K+1));
                    magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols, A(U_K, K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(0,K+1)); ///dlAT(K+1,K+1)
                }
            }
        }

#if (dbglevel==10)
        magma_schedule_barrier();
        ca_dbg_printMat(m, A_n, A, A_LD,"A");
        ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"dAT (Step K)");

        nrows = m - K*nb;
        ncols = min(nb, n - K*nb);
        dd = GID(K);
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(ncols, nrows, dlAT(K,K), dlAT_LD,"dAT(K,K)");

        if(K<=5){
            printf("Step K:%d done. Int to continue: ",K);
            scanf("%d", &I);
        }
#endif
    } //Step K done

    /*Wait for all thread termination*/
    magma_schedule_barrier();

    /*make sure everything arrived*/ ///needed?
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        magma_queue_sync(mstream[dd][0]);
        magma_queue_sync(mstream[dd][1]);
        magma_queue_sync(mstream[dd][2]);
    }

    /*TODO: don't need quark here*/
    /*Perform a sequence of left swap on the matrix corresponding to the different panel*/
    for(K=1;K<=N-1;K++){
#if (dbglevel >=1)
        ca_trace_start();
#endif
        nrows = min(nb,m - K*nb);
        ncols = min(K*nb,n);

        for(dd=0;dd<=min(num_gpus-1, K-1);dd++){
            gpu_ncols = numcols2p(dd, ncols, nb, num_gpus);
            J = dd;
            if(gpu_ncols>0){
                magma_setdevice(dd);
                //pthread_mutex_lock(&mutex_compute_stream);
                magmablasSetKernelStream(mstream[dd][1]);
                magmablas_dlaswp(gpu_ncols, dlAT(K, J), dlAT_LD, ONE, nrows, ipiv(K), ONE);
                //pthread_mutex_lock(&mutex_compute_stream);
            }
        }

#if (dbglevel >=1)
        ca_trace_end_1gpu('W');
#endif
    }

#if (dbglevel==10)
    ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD,"dAT after lswap");
#endif

    /*Shutdown the scheduler*/
    magma_schedule_delete();

    /*update permutation vector indexes: convert panel-local pivots to global rows*/
    for(K=1;K<=N-1;K++){
        nrows = min(nb, n-K*nb);
        for(J=0;J<=nrows-1;J++){
            ipiv[K*nb+J] += K*nb;
        }
    }

#if dbglevel>=1
    printf("[DBG] Time Factorization:%f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /* 4. Transpose back the matrix in/out of place*/
    for(dd=0;dd<num_gpus;dd++){
        //n_local[dd] = numcols2p(dd, n, nb, num_gpus); //loc2p(dd, N, num_gpus)*nb;
        magma_setdevice(dd);
        magmablasSetKernelStream(mstream[dd][1]);
        magmablas_dtranspose2(dlA[dd], dlA_LD, dlAT[dd], dlAT_LD, n_local[dd], m);
    }

    for(dd=0;dd<num_gpus;dd++){ //needed
        magma_setdevice(dd);
        magmablasSetKernelStream(NULL);
    }

#if dbglevel>=1
    printf("[DBG] Time Final in/out of place transpose:%f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlA, dlA_LD,"dA = LU");
#endif

    /* Release streams, host bookkeeping arrays, and device workspaces. */
    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        magma_queue_destroy(mstream[dd][0]);
        magma_queue_destroy(mstream[dd][1]);
        magma_queue_destroy(mstream[dd][2]);
    }
    //free(mstream);

    // printf("Step 4: time:%f\n",magma_wtime()-t1);
    // t1 = magma_wtime();

    free(n_local);
    free(nr_local);
    // free(k_local);

    for(dd=0;dd<num_gpus;dd++){
        magma_setdevice(dd);
        magma_free( dlAP_set[dd]);
        magma_free( dlAP_get[dd]);
        magma_free(dlpanel[dd]);
        magma_free(dlAT[dd]);
    }

    //free(dlAP_set);
    //free(dlAP_get);
    //free(dlpanel);
    free(dlAT);

#if dbglevel>=1
    printf("[DBG] Time memory free (dAP):%f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if dbglevel>=1
    /*Finalize the tracing*/
    ca_dbg_trace_finalize();
    printf("[DBG] Time llog:%f\n",magma_wtime()-t1);
#endif

    return *info;
} /* End of MAGMA_DGETRF_REC_ASYNC_WORK_GPU */
/** Purpose ------- DGESSM applies the factors L computed by DGETRF_INCPIV to a real M-by-N tile A. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] k INTEGER The number of columns of the matrix L. K >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by DGETRF_INCPIV. @param[in] dL1 DOUBLE_PRECISION array, dimension(LDDL1, N) The IB-by-K matrix in which is stored L^(-1) as returned by GETRF_INCPIV @param[in] lddl1 INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). @param[in] dL DOUBLE_PRECISION array, dimension(LDDL, N) The M-by-K lower triangular tile on the gpu. @param[in] lddl INTEGER The leading dimension of the array L. LDDL >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension (LDDA, N) On entry, the M-by-N tile A on the gpu. On exit, updated by the application of L on the gpu. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). 
@ingroup magma_dgesv_tile ********************************************************************/ extern "C" magma_int_t magma_dgessm_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, magmaDouble_ptr dL1, magma_int_t lddl1, magmaDouble_ptr dL, magma_int_t lddl, magmaDouble_ptr dA, magma_int_t ldda, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ldda + (j) ) #define L(i,j) (dL + (i) + (j)*lddl ) #define dL1(j) (dL1 + (j)*lddl1) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; int i, sb; magmaDouble_ptr dAT; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; if ( order == MagmaColMajor ) { magmablas_dgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for (i = 0; i < k; i += ib) { sb = min(ib, k-i); magmablas_dlaswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, dL1(i), lddl1, AT(i, 0), ldda); #else magma_dtrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, L( i, i), lddl, AT(i, 0), ldda); #endif if ( (i+sb) < m) { magma_dgemm( MagmaNoTrans, MagmaTrans, n, m-(i+sb), sb, c_neg_one, AT(i, 0), ldda, L( i+sb, i), lddl, c_one, AT(i+sb, 0), ldda ); } } if ( order == MagmaColMajor ) { magmablas_dgetmo_in( dA, dAT, ldda, m, n ); } return *info; } /* magma_dgessm_gpu */
extern "C" magma_int_t
magma_dgetrf_gpu( magma_int_t m, magma_int_t n,
                  magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda,
                  magma_int_t *ipiv,
                  magma_queue_t queue,
                  magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Hybrid strategy: panels are factored on the CPU (LAPACK dgetrf) while
    the GPU performs the trailing-matrix updates on a transposed copy dAT.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA    (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

/* NOTE (clMAGMA): device "pointers" are (buffer, offset) pairs, so each
   macro below expands to TWO call arguments. */
#define dA(i_, j_)  dA,  dA_offset  + (i_)*nb       + (j_)*nb*ldda
#define dAT(i_, j_) dAT, dAT_offset + (i_)*nb*lddat + (j_)*nb
#define dAP(i_, j_) dAP,              (i_)          + (j_)*maxm
#define work(i_)   (work + (i_))

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t iinfo, nb;
    magma_int_t maxm, maxn, mindim;
    magma_int_t i, j, rows, s, lddat, ldwork;
    magmaDouble_ptr dAT, dAP;      // transposed working copy, panel buffer
    double *work;                  // pinned/host panel workspace
    size_t dAT_offset;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_dgetrf_nb(m);
    s      = mindim / nb;          // number of full nb-wide panel steps

    if (nb <= 1 || nb >= min(m,n)) {
        /* Matrix too small for blocking: copy to host and use CPU code. */
        if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, m*n )) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        magma_dgetmatrix( m, n, dA(0,0), ldda, work(0), m, queue );
        lapackf77_dgetrf( &m, &n, work, &m, ipiv, info );
        magma_dsetmatrix( m, n, work(0), m, dA(0,0), ldda, queue );
        magma_free_cpu( work );
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;   // round dims up to multiples of 32
        maxn = ((n + 31)/32)*32;

        if ( MAGMA_SUCCESS != magma_dmalloc( &dAP, nb*maxm )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        // square matrices can be done in place;
        // rectangular requires copy to transpose
        if ( m == n ) {
            dAT = dA;
            dAT_offset = dA_offset;
            lddat = ldda;
            magmablas_dtranspose_inplace( m, dAT(0,0), lddat, queue );
        }
        else {
            lddat = maxn;  // N-by-M
            dAT_offset = 0;
            if ( MAGMA_SUCCESS != magma_dmalloc( &dAT, lddat*maxm )) {
                magma_free( dAP );
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }
            magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queue );
        }

        ldwork = maxm;
        if ( MAGMA_SUCCESS != magma_dmalloc_cpu( &work, ldwork*nb )) {
            magma_free( dAP );
            if ( dA != dAT )
                magma_free( dAT );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        for( j=0; j < s; j++ ) {
            // download j-th panel (untranspose via dAP, then transfer to host)
            magmablas_dtranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queue );
            magma_dgetmatrix( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queue );

            if ( j > 0 ) {
                /* Look-ahead: finish the bulk update of columns (j+1..)
                   from panel j-1 while the CPU factors the new panel. */
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (j+1)*nb, nb,
                             c_one, dAT(j-1,j-1), lddat,
                                    dAT(j-1,j+1), lddat, queue );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-j*nb, nb,
                             c_neg_one, dAT(j-1,j+1), lddat,
                                        dAT(j,  j-1), lddat,
                             c_one,     dAT(j,  j+1), lddat, queue );
            }

            // do the cpu part
            rows = m - j*nb;
            lapackf77_dgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + j*nb;   // report first singular pivot globally

            /* Shift panel-local pivots to global row indices. */
            for( i=j*nb; i < j*nb + nb; ++i ) {
                ipiv[i] += j*nb;
            }
            magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queue );

            // upload j-th panel (host -> dAP, then re-transpose into dAT)
            magma_dsetmatrix( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queue );
            magmablas_dtranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queue );

            // do the small non-parallel computations (next panel update)
            if ( s > (j+1) ) {
                /* Update only the next panel's nb columns (look-ahead). */
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queue );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queue );
            }
            else {
                /* Last step: update the whole remaining trailing matrix. */
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-s*nb, nb,
                             c_one, dAT(j, j  ), lddat,
                                    dAT(j, j+1), lddat, queue );
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(j+1)*nb, m-(j+1)*nb, nb,
                             c_neg_one, dAT(j,   j+1), lddat,
                                        dAT(j+1, j  ), lddat,
                             c_one,     dAT(j+1, j+1), lddat, queue );
            }
        }

        /* Clean-up loop: factor the final (possibly narrower) panel. */
        magma_int_t nb0 = min( m - s*nb, n - s*nb );
        if ( nb0 > 0 ) {
            rows = m - s*nb;

            magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queue );
            magma_dgetmatrix( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queue );

            // do the cpu part
            lapackf77_dgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo );
            if ( *info == 0 && iinfo > 0 )
                *info = iinfo + s*nb;

            for( i=s*nb; i < s*nb + nb0; ++i ) {
                ipiv[i] += s*nb;
            }
            magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queue );

            // upload j-th panel
            magma_dsetmatrix( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queue );
            magmablas_dtranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queue );

            magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n-s*nb-nb0, nb0,
                         c_one, dAT(s,s),     lddat,
                                dAT(s,s)+nb0, lddat, queue );
        }

        // undo transpose
        if ( dA == dAT ) {
            magmablas_dtranspose_inplace( m, dAT(0,0), lddat, queue );
        }
        else {
            magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queue );
            magma_free( dAT );
        }

        magma_free( dAP );
        magma_free_cpu( work );
    }
    return *info;
} /* magma_dgetrf_gpu */
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dswap, dswapblk, dpermute, dlaswp, dlaswpx
*/
int main( int argc, char** argv)
{
    /* Benchmark/correctness driver: each row-swap variant is timed on the GPU
       and compared against a CPU reference built with blasf77_dswap /
       lapackf77_dlaswp.  Each sub-test sets one bit of 'check' on mismatch. */
    TESTING_INIT();

    double *h_A1, *h_A2;           // host input matrices
    double *d_A1, *d_A2;           // device copies
    double *h_R1, *h_R2;           // host buffers for device results

    // row-major and column-major performance
    real_Double_t row_perf0, col_perf0;
    real_Double_t row_perf1, col_perf1;
    real_Double_t row_perf2, col_perf2;
    real_Double_t row_perf3;
    real_Double_t row_perf4;
    real_Double_t row_perf5, col_perf5;
    real_Double_t row_perf6, col_perf6;
    real_Double_t row_perf7;
    real_Double_t cpu_perf;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2;
    magma_int_t *d_ipiv;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_queue_t queue = 0;

    printf("            cublasDswap       dswap             dswapblk          dlaswp   dpermute dlaswp2  dlaswpx           dcopymatrix      CPU      (all in )\n");
    printf("    N   nb  row-maj/col-maj   row-maj/col-maj   row-maj/col-maj   row-maj  row-maj  row-maj  row-maj/col-maj   row-blk/col-blk  dlaswp   (GByte/s)\n");
    printf("==================================================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // For an N x N matrix, swap nb rows or nb columns using various methods.
            // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure.
            // The variable 'shift' keeps track of which bit is for current test
            int shift = 1;
            int check = 0;
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;     // device leading dim rounded to 32
            nb     = (opts.nb > 0 ? opts.nb : magma_get_dgetrf_nb( N ));
            nb     = min( N, nb );
            // each swap does 2N loads and 2N stores, for nb swaps
            gbytes = sizeof(double) * 4.*N*nb / 1e9;

            TESTING_MALLOC_PIN( h_A1, double, lda*N );
            TESTING_MALLOC_PIN( h_A2, double, lda*N );
            TESTING_MALLOC_PIN( h_R1, double, lda*N );
            TESTING_MALLOC_PIN( h_R2, double, lda*N );

            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );

            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, double, ldda*N );
            TESTING_MALLOC_DEV( d_A2, double, ldda*N );

            // random 1-based pivots in [1, N]
            for( j=0; j < nb; j++ ) {
                ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1;
            }

            /* =====================================================================
             * cublasDswap, row-by-row (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;

            // CPU reference for the same swaps
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswap, row-by-row (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswapblk, blocked version (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dpermute_long (1 matrix)
             */

            /* Row Major */
            memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) );  // dpermute updates ipiv2
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 );
            time = magma_sync_wtime( queue ) - time;
            row_perf3 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dlaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix) - d_ipiv on GPU
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            // pivot upload is (deliberately) included in the timed region
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_dlaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswpx (extended for row- and col-major) (1 matrix)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;

            /* LAPACK swap on CPU, as a reference and for the CPU timing column. */
            time = magma_wtime();
            lapackf77_dlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;

            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * Copy matrix.
             */

            time = magma_sync_wtime( queue );
            magma_dcopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;

            time = magma_sync_wtime( queue );
            magma_dcopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;

            // '*' after a rate flags the failing sub-test (matching check's bit)
            printf("%5d  %3d  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f / %6.2f  %6.2f  %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf3, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x100) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x400) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failed") );
            status += ! (check == 0);

            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );

            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );

            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
/** Purpose ------- DGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] hA DOUBLE_PRECISION array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL DOUBLE_PRECISION array, dimension(LDHL, min(M,N)), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL DOUBLE_PRECISION array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] ipiv INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. @param[out] dWORK DOUBLE_PRECISION array, dimension(LDDWORK, 2*IB), on gpu. Workspace. 
    @param[in]
    lddwork INTEGER
            The leading dimension of the array dWORK.  LDDWORK >= max(NB, 1).

    @param[out]
    info    INTEGER
            - PLASMA_SUCCESS successful exit
            - < 0 if INFO = -k, the k-th argument had an illegal value
            - > 0 if INFO = k, U(k,k) is exactly zero. The factorization
                has been completed, but the factor U is exactly
                singular, and division by zero will occur if it is used
                to solve a system of equations.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib,
                         double *hA, magma_int_t ldha,
                         double *dA, magma_int_t ldda,
                         double *hL, magma_int_t ldhl,
                         double *dL, magma_int_t lddl,
                         magma_int_t *ipiv,
                         double *dwork, magma_int_t lddwork,
                         magma_int_t *info)
{
    /* Incremental-pivoting hybrid LU of one tile: panels are factored on the
       CPU (in hA) while the GPU updates the trailing columns on the
       transposed copy dAT.  When WITHOUTTRTRI is not defined, the inverse of
       each diagonal L block is precomputed (hL2/dL2) so updates use dtrmm
       instead of dtrsm. */
#define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib)
#define hA(i,j) (hA  + (i)*ib + (j)*ib*ldha)
#define hL(j)   (hL  + (j)*ib*ldhl )
#define hL2(j)  (hL2 + (j)*ib*ldhl )
#define dL(j)   (dL  + (j)*ib*lddl )
#define dL2(j)  (dL2 + (j)*ib*lddl )

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t iinfo;
    magma_int_t maxm, mindim;
    magma_int_t i, rows, cols, s, ii, sb;
    double *dAT;
#ifndef WITHOUTTRTRI
    /* Lower halves of the L workspaces store the inverted diagonal blocks. */
    double *dL2 = dL + ib;
    double *hL2 = hL + ib;
#endif

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (ldda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    s      = mindim / ib;

    if ( ib >= mindim ) {
        /* Use CPU code. */
        lapackf77_dgetrf(&m, &n, hA, &ldha, ipiv, info);

#ifndef WITHOUTTRTRI
        /* Invert the factored diagonal block and mirror it to the GPU. */
        CORE_dlacpy(PlasmaUpperLower, mindim, mindim,
                    (double*)hA, ldha,
                    (double*)hL2, ldhl );

        CORE_dtrtri( PlasmaLower, PlasmaUnit, mindim,
                     (double*)hL2, ldhl, info );
        if (*info != 0 ) {
            fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
        }

        magma_dsetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl );
#endif

        if ( order == MagmaRowMajor ) {
            magma_dsetmatrix( m, n, hA, ldha, dwork, lddwork );
            magmablas_dtranspose( m, n, dwork, lddwork, dA, ldda );
        } else {
            magma_dsetmatrix( m, n, hA, ldha, dA, ldda );
        }
    }
    else {
        /* Use hybrid blocked code. */
        maxm = ((m + 31)/32)*32;

        if ( order == MagmaColMajor ) {
            magmablas_dgetmo_in( dA, dAT, ldda, m, n );
        } else {
            dAT = dA;
        }

        for( i=0; i < s; i++ ) {
            ii = i * ib;
            sb = min(ib, mindim-ii);
            cols = maxm - ii;

            if ( i > 0 ) {
                // download i-th panel
                magmablas_dtranspose( sb, m, AT(0,i), ldda, dwork, maxm );
                magma_dgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha );

                // make sure that gpu queue is empty
                //magma_device_sync();

#ifndef WITHOUTTRTRI
                /* Apply the inverted diagonal block of panel i-1. */
                magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, dL2(i-1),    lddl,
                                    AT(i-1,i+1), ldda );
#else
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n - (ii+sb), ib,
                             c_one, AT(i-1,i-1), ldda,
                                    AT(i-1,i+1), ldda );
#endif
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-(ii+sb), m-ii, ib,
                             c_neg_one, AT(i-1,i+1), ldda,
                                        AT(i,  i-1), ldda,
                             c_one,     AT(i,  i+1), ldda );
            }

            // do the cpu part
            rows = m - ii;
            lapackf77_dgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo);
            if ( (*info == 0) && (iinfo > 0) )
                *info = iinfo + ii;

            {
                /* Shift the panel-local pivot indices to tile-global rows. */
                int j;
                int fin = ii + sb;
                for (j=ii; j < fin; j++) {
                    ipiv[j] = ii + ipiv[j];
                }
            }
            magmablas_dlaswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 );

#ifndef WITHOUTTRTRI
            CORE_dlacpy(PlasmaLower, sb, sb,
                        (double*)hA(i, i), ldha,
                        (double*)hL2(i), ldhl );

            CORE_dtrtri( PlasmaLower, PlasmaUnit, sb,
                         (double*)hL2(i), ldhl, info );
            if (*info != 0 ) {
                fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info);
            }
            magma_dsetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl );
#endif

            // upload i-th panel
            magma_dsetmatrix( rows, sb, hA(i, i), ldha, dwork, cols );
            magmablas_dtranspose( rows, sb, dwork, cols, AT(i,i), ldda );

            // do the small non-parallel computations
            if ( s > (i+1) ) {
                /* Look-ahead: update only the next panel's columns. */
#ifndef WITHOUTTRTRI
                magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             sb, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             sb, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             sb, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
            else {
                /* Update of the last panel */
#ifndef WITHOUTTRTRI
                magma_dtrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, dL2(i),     lddl,
                                    AT(i, i+1), ldda);
#else
                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             n-mindim, sb,
                             c_one, AT(i, i  ), ldda,
                                    AT(i, i+1), ldda);
#endif
                /* m-(ii+sb) should be always 0 */
                magma_dgemm( MagmaNoTrans, MagmaNoTrans,
                             n-mindim, m-(ii+sb), sb,
                             c_neg_one, AT(i,   i+1), ldda,
                                        AT(i+1, i  ), ldda,
                             c_one,     AT(i+1, i+1), ldda );
            }
        }

        /* Write the transposed working copy back into dA. */
        if ( order == MagmaColMajor ) {
            magmablas_dgetmo_out( dA, dAT, ldda, m, n );
        }
    }
    return *info;
}
extern "C" magma_int_t magma_dgetrf_gpu_work_amc( magma_int_t m, magma_int_t n, double *dA, magma_int_t dA_LD, magma_int_t *ipiv, magma_int_t *info, /*workspace on the cpu side*/ double *AWORK, magma_int_t AWORK_LD, magma_int_t AWORK_n ) { /* -- MAGMA (version 1.5.0-beta3) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= DGETRF_GPU_WORK_AMC computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The technique used for the panel factorization is the parallel recursif LU (see lawn 259). The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */
    /* NOTE(review): this is the body of the hybrid CPU+GPU asynchronous LU
       factorization whose signature precedes the Purpose comment above
       (closing marker below: MAGMA_DGETRF_REC_ASYNC_WORK_GPU).
       The leftmost A_N panels live in the caller-provided CPU workspace
       AWORK; the rest of the (transposed) matrix stays on the GPU. */

    double c_one = MAGMA_D_ONE;          /* scalar  1.0 for the BLAS task calls */
    double c_neg_one = MAGMA_D_NEG_ONE;  /* scalar -1.0 for the update GEMMs */
    int ONE = 1;                         /* increment argument for the laswp calls */

    magma_int_t iinfo, nb;
    magma_int_t mindim;
    magma_int_t nrows, ncols;
    //double *work;

    magma_int_t dm_max, dn_max;
    magma_int_t I, J, K, M, N, U_K;      /* block indices: the matrix is M x N blocks of size nb */
    //magma_int_t A_K;
    double *dAT;                         /* transposed matrix on the GPU */
    magma_int_t dAT_LD;                  /* leading dimension of dAT */

    double *dAP_set,*dAP_get;            /* GPU workspaces used to transpose panels in transit */
    magma_int_t dAP_LD;

    //magma_int_t nrows, ncols;
    magma_int_t gpu_nrows, gpu_ncols;    /* extent of the trailing submatrix updated on the GPU */

    int nbcores; /*Number of cores available for the whole factorization*/
    int panel_num_threads; /*Number of threads for the panel*/
    double dcpu; /*percentage of the matrix to allocate on the CPUs*/

    int B_rows;                          /* number of block rows grouped per CPU update task */

    double t1;                           /* timer for debug traces */

    /* Recommended dimension in the workspace*/
    int A_m, A_n, A_N, A_NMAX, A_LD;
    double *A;                           /* CPU part of the matrix (aliases AWORK) */

#ifdef USE_CALU
    int i_nrows;
#endif

    amc_args_t *args;                    /* scheduler/tuning parameters (nb, P, Pr, dcpu) */
    /*magma_event_t *A_event;*/ /*Control bucket*/

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (dA_LD < max(1,m))
        *info = -4;
    else if (AWORK_LD < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /*Get parameters*/
    args = magma_amc_args_get_default();
    nb= args->nb;
    nbcores = args->P;
    panel_num_threads = args->Pr;
    dcpu = args->dcpu;

    /* Check and fix parameters */
    if(nb==0)
        nb = magma_get_dgetrf_nb(m) ;/*magma dgetrf block size*/
    else
        nb = args->nb;

    if(nb>n) nb = n;
    if(panel_num_threads>nbcores) panel_num_threads = nbcores;

    /* Compute the maximum number of panels we can store in the workspace*/
    A_NMAX = (int) (AWORK_n/ nb);

    /* Compute the recommended number of columns for the cpu part*/
    A_n = (int) ceil(n*dcpu);
    /*Make sure we work with multiple of 32*/
    /*
    if(A_n%32!=0) {
        A_n = ((A_n + 31)/32)*32;
    }
    */

    /* Compute the recommended number of panels for the cpu part*/
    A_N = (int) (A_n/ nb);

    /* Check if there are enough workspace. In case the user gave a workspace lower than the optimal*/
    /* NOTE: using small workspace may reduce performance*/
    if(A_N>A_NMAX){
#if (dbglevel >=1)
        printf("[DBG_WARNING] Resizing buffer to feet user preferences. Recommanded:%d, Max given:%d\n",A_N, A_NMAX);
#endif
        A_N = A_NMAX;
    }

    /* The CPU workspace provided by the caller holds the leftmost A_N panels */
    A = AWORK;
    A_m = m;
    A_LD = AWORK_LD;

#if (dbglevel >=1)
    /* Initialize the tracing*/
    ca_dbg_trace_init(nbcores,1); //nbcores + 1 GPU
#endif

#if (dbglevel >=1)
    t1 = magma_wtime();
#endif

    /*Transfer the first column block of the matrix from the GPU to the CPUs*/
    magma_dgetmatrix(A_m, A_n, dA, dA_LD, A, A_LD);

#if (dbglevel >=1)
    printf("[DBG] Time First getmatrix: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat(m, A_n, A, A_LD,"A after first getMatrix");
#endif

    /*Allocate a workspace for the panels transposition*/
    dAP_LD = m;
    if(dAP_LD%32!=0) dAP_LD = ((dAP_LD + 31)/32)*32;/*Make dAP_LD multiple of 32*/

    if (MAGMA_SUCCESS != magma_dmalloc(&dAP_set, dAP_LD*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_dmalloc(&dAP_get, dAP_LD*nb)) {
        /* avoid leaking the first workspace on failure */
        magma_free(dAP_set);
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

#if (dbglevel >=1)
    printf("[DBG] Time workspace memory alloc (dAP): %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /*Transpose the gpu part of the matrix in/out of place*/
    if ((m == n) ){ //&& (m % 32 == 0) && (dA_LD%32 == 0)
        /* square case: transpose in place, dAT aliases dA */
        dAT = dA;
        dAT_LD= dA_LD;
        magmablas_dtranspose_inplace(m, dAT, dAT_LD);
    }
    else {
        /* rectangular case: allocate a padded (multiple-of-32) transposed copy */
        dm_max = m;
        dn_max = n;

        /*Make sure m and n are multiple of 32*/
        if(dm_max%32!=0) dm_max = ((dm_max + 31)/32)*32;
        if(dn_max%32!=0) dn_max = ((dn_max + 31)/32)*32;

        if (MAGMA_SUCCESS != magma_dmalloc(&dAT, dm_max*dn_max )) {
            magma_free(dAP_set);
            magma_free(dAP_get);
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        dAT_LD = dn_max;
        magmablas_dtranspose2( dAT, dAT_LD, dA, dA_LD, m, n );
    }

#if (dbglevel >=1)
    printf("[DBG] Time First transposition: %f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD,"matrix dAT to factorize");
#endif

    /* Compute the maximum number of steps*/
    mindim = min(m, n);
    M = (int) ceil( (double) m / nb);
    N = (int) ceil( (double) mindim / nb); /*N = n/nb*/

    /*Let the asynchronous algorithm begin*/

#if (dbglevel >=1)
    printf("Starting recursif code ... m:%d, n:%d, nb:%d, nbcores:%d, N:%d, A_N:%d\n", m, n, nb, nbcores, N, A_N); //Summary
#endif

    /*Initialize the scheduler*/
    magma_schedule_init(nbcores, 1);

    K = 0;
#ifdef USE_CALU
    /*initialize calu environment*/
    core_dtslu_alloc(panel_num_threads, A_m, nb);
    core_dtslu_init(panel_num_threads);

    /*Initialize rows indices: required*/
    for(I=0;I<A_m;I++) ipiv[I]=I;
#else
    /*initialize parallel recursive panel environment*/
    CORE_zgetrf_reclap_init();
#endif

    magma_schedule_set_task_priority(INT_MAX-1);

    /*Schedule the first panel factorization*/
#ifdef USE_CALU
    magma_insert_core_dtslu(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));

    B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
    B_rows = max(B_rows,4); /*maximum of 4*/
    //B_rows = max(B_rows,1);

    for(I=K+1; I<=M-1; I+=B_rows){
        i_nrows = min(B_rows*nb, m-I*nb);
        magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, nb, c_one, A(0,K), A_LD, A(I,K), A_LD, colptr(K));
    }
#else
    magma_insert_core_dgetrf_rec(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));
#endif

    /*Transfer the factorized panel to the GPU (transposition included)*/
    magma_insert_dsetmatrix_transpose(A_m, nb, A(0,K), A_LD, dAT(0,K), dAT_LD, dAP_set, dAP_LD, colptr(K), dAT(K,K));

#if (dbglevel==10)
    magma_schedule_barrier();
    ca_dbg_printMat(m, nb, A(0,0), A_LD,"A(0,0)");
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD,"dA");
#endif

    /* Main factorization loop: at step K the GPU applies swap/TRSM/GEMM to the
       columns beyond the CPU window while the CPU cores update their own
       columns and factor panel K+1 ahead of time (look-ahead of depth 1).
       NOTE(review): A(i,j), dAT(i,j), ipiv(k) and colptr(k) are macros defined
       elsewhere in this file — presumably block-offset accessors; verify
       against their definitions. */
    for(K=0;K<=N-1;K++){

        /*insert the coarse update of the trailing submatrix corresponding to panel K to the GPU, that is submatrix A[K+1:M, K+1+d-1:N]*/
        gpu_nrows = m - (K+1)*nb;
        gpu_ncols = n - (K+1+A_N-1)*nb;

        if(gpu_ncols >0)
        {
            /*NOTE: Here we work on the matrix transpose*/

            /*Set the priority max for the GPU computations*/
            magma_schedule_set_task_priority(INT_MAX);
            //// magma_schedule_set_task_priority(INT_MAX - N*K);

            /*schedule a swap of the trailing submatrix in the gpu using ipiv[K]*/
            /*dependency dAT((K+1)-1, (K+A_N)-1) = dAT(K, K+A_N-1) with previous dgemm*/
            magma_insert_dlaswp(gpu_ncols, dAT(K, K+A_N), dAT_LD, ONE, nb, ipiv(K), ONE, dAT(K, K+A_N-1)); /*non blocking*/

            //printf("debug barrier\n");
            //magma_schedule_barrier();

            magma_insert_dtrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD);/*non blocking*/

            /* aij^T = aij^T - (lik.ukj)^T = aij^T - ukj^T.lik^T*/
            magma_insert_dgemm(MagmaNoTrans,MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD);/*non blocking*/
        }

        /*iterate over the rest of the columns to update the trailing submatrix on the cpu*/
        for(J=K+1;J<=min(K+A_N-1, N-1);J++){

            ncols = min(nb, n - J*nb);

            /*Set the priority max for column having the next panel (look ahead of deep 1), and process the rest of the update in a right looking way*/
            if(J==K+1)
                magma_schedule_set_task_priority(INT_MAX -2 );
                //// magma_schedule_set_task_priority(INT_MAX - N*K -1);
            else
                magma_schedule_set_task_priority(INT_MAX -3 - J );//- N*K

            /*dependency colptr(J): make sure column J is sent from GPU, and all previous update was done*/
            magma_insert_core_dlaswp(ncols, A(K,J), A_LD, ONE, nb, ipiv(K), ONE, colptr(J));

            magma_insert_core_dtrsm('L', 'L', 'N', 'U', nb, ncols, c_one, A(K,K), A_LD, A(K,J), A_LD, colptr(J));

            /*Compute the number of block rows to group together before the update. To avoid scheduling overhead.*/
            B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
            //B_rows = max(B_rows,4); /*maximum of 4*/
            //B_rows = max(B_rows,1);

            for(I=K+1; I<=M-1; I+=B_rows){
                nrows = min(B_rows*nb, m-I*nb);

                /*dep colptr(K):make sure the panel is not overwritten or swapped since dgemm use A[I,K]*/
                /*dep colptr(J): Gather all dgemm on one column and create dependencies with previous dgemm and the next panel*/
                magma_insert_core_dgemm('N','N', nrows, ncols, nb, c_neg_one, A(I,K), A_LD, A(K,J), A_LD, c_one, A(I,J), A_LD, colptr(K), colptr(J));
            }

            if(J==K+1)
            {
                /*Look ahead and insert the next panel*/
                nrows = m - (K+1)*nb;
                ncols = min(nb, n - (K+1)*nb);

                /*Schedule the next panel factorization with maximum priority*/
                magma_schedule_set_task_priority(INT_MAX -1);

#ifdef USE_CALU
                magma_insert_core_dtslu(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                B_rows = (int) ceil((double) (M-(K+1)-1)/panel_num_threads);
                B_rows = max(B_rows,4); /*maximum of 4*/
                //B_rows = max(B_rows,1);

                for(I=K+2; I<=M-1; I+=B_rows){
                    i_nrows = min(B_rows*nb, m-I*nb);
                    magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, ncols, c_one, A(K+1,K+1), A_LD, A(I,K+1), A_LD, colptr(K+1));
                    //dtrsm("R", "U", "N", "N", &nrowPblock, &panel_NB, &dONE, &(A[M*pos+pos]), &LDA, &(A[lpos]), &LDA); //
                }
#else
                magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));
#endif

                /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/
                U_K = max(0, K+1 - A_N +1);
                nrows = m - U_K*nb;

                /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/
                magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
            }
        }

        /*Transfer asynchronously one column (column K+A_N) from the GPU to the CPU to balance work*/
        /*Make sure this is inserted after all dgemm before it schedules to replace a current panel in case A_N< N*/
        if(K+A_N<N) {
            ncols = min(nb, gpu_ncols);

            magma_schedule_set_task_priority(INT_MAX);
            magma_insert_dgetmatrix_transpose(gpu_nrows, ncols, dAT(K+1,K+A_N), dAT_LD, A(K+1,K+A_N), A_LD, dAP_get, dAP_LD, colptr(K+A_N)); //blocking

            /*if A_N==1 there is no look-ahead, so insert the panel here*/
            if(A_N==1){
                /*Look ahead and insert the next panel*/
                nrows = m - (K+1)*nb;
                ncols = min(nb, n - (K+1)*nb);

                /*Schedule the next panel factorization with maximum priority*/
                magma_schedule_set_task_priority(INT_MAX -1);

#ifdef USE_CALU
                magma_insert_core_dtslu(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                B_rows = (int) ceil((double) (M-(K+1)-1)/panel_num_threads);
                B_rows = max(B_rows,4); /*maximum of 4*/
                //B_rows = max(B_rows,1);

                for(I=K+2; I<=M-1; I+=B_rows){
                    i_nrows = min(B_rows*nb, m-I*nb);
                    magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, ncols, c_one, A(K+1,K+1), A_LD, A(I,K+1), A_LD, colptr(K+1));
                    //dtrsm("R", "U", "N", "N", &nrowPblock, &panel_NB, &dONE, &(A[M*pos+pos]), &LDA, &(A[lpos]), &LDA); //
                }
#else
                magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));
                //magma_insert_core_dgetrf(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, colptr(K+1));
#endif

                /*Determine the upper part of the matrix done by the CPU on that column and send it to the GPU with the panel*/
                U_K = max(0, K+1 - A_N +1);
                nrows = m - U_K*nb;

                ///magma_schedule_set_task_priority(INT_MAX);
                /*Transfer the upper part of the matrix for that column and the factorized panel to the GPU*/
                magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
            }
        }

#if (dbglevel==10)
        magma_schedule_barrier();
        ca_dbg_printMat(m, A_n, A, A_LD,"A");
        ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD,"dA");
#endif
    } //Step K done

    /*Wait for all thread termination*/
    magma_schedule_barrier();

    /*TODO: don't need quark here*/
    /*Perform a sequence of left swap on the matrix corresponding to the different panel*/
    for(K=1;K<=N-1;K++){
#if (dbglevel >=1)
        ca_trace_start();
#endif
        nrows = min(nb,m - K*nb);
        ncols = min(K*nb,n);

        /*dep dAT(K-1): Make sure the last swap is completed, and also the dgemm using the panel*/
        // magma_insert_dlaswp(ncols, dAT(K, 0), dAT_LD, ONE, nrows, ipiv(K), ONE, dAT(K-1,0));
        magmablas_dlaswp(ncols, dAT(K, 0), dAT_LD, ONE, nrows, ipiv(K), ONE);

#if (dbglevel >=1)
        ca_trace_end_1gpu('W');
#endif
    }

    /*Shutdown the scheduler*/
    magma_schedule_delete();

    /*update permutation vector indexes: convert per-panel-relative pivots to global row indices*/
    for(K=1;K<=N-1;K++){
        nrows = min(nb, n-K*nb);
        for(J=0;J<=nrows-1;J++){
            ipiv[K*nb+J] += K*nb;
        }
    }

#if dbglevel>=1
    printf("[DBG] Time Factorization:%f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /*No need for synchro, since dtranspose is blocking*/
    if (m == n) {
        magmablas_dtranspose_inplace(m, dAT, dAT_LD); //( m, dAT, dAT_LD );
        dA = dAT;
    }
    else {
        magmablas_dtranspose2( dA, dA_LD, dAT, dAT_LD, n, m );
        magma_free( dAT );
    }

#if dbglevel>=1
    printf("[DBG] Time Final in/out of place transpose:%f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#ifdef USE_CALU
    core_dtslu_free();
#endif

    magma_free( dAP_set );
    magma_free( dAP_get );

#if dbglevel>=1
    printf("[DBG] Time memory free (dAP):%f\n",magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dA, dA_LD,"dA = LU");
#endif

#if dbglevel>=1
    /*Finalize the tracing*/
    ca_dbg_trace_finalize();
    printf("[DBG] Time llog:%f\n",magma_wtime()-t1);
#endif

    return *info;
} /* End of MAGMA_DGETRF_REC_ASYNC_WORK_GPU */