void magma_task_dev_dmalloc_pinned(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_int_t size;
    double **A;
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_4(sched_obj, deviceID, size, A, dep_ptr);
    magma_setdevice(deviceID);

    /* dep_ptr only carries the scheduler dependency; it is not dereferenced here */
    magma_dmalloc_pinned(A, size);

#if (dbglevel >=1)
    ca_trace_end_gpu('O');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_dfree_pinned_index(Schedule* sched_obj)
{
    magma_int_t deviceID;
    double **A;
    magma_int_t index;
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_4(sched_obj, deviceID, A, index, dep_ptr);
    magma_setdevice(deviceID);

    /* Free one pinned panel buffer; dep_ptr only carries the scheduler dependency */
    //free(A[index]); A[index] = NULL; /* plain-malloc alternative */
    magma_free_pinned(A[index]);

#if (dbglevel >=1)
    ca_trace_end_gpu('O');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_dgetmatrix_async_transpose(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *dA_src;
    magma_int_t dA_LD;
    double *A_dst;
    magma_int_t LDA;
    magma_queue_t stream1;
    double *dwork;
    magma_int_t dwork_LD;
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_11(sched_obj, deviceID, m, nb, dA_src, dA_LD, A_dst, LDA, stream1, dwork, dwork_LD, dep_ptr);
    magma_setdevice(deviceID);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(nb, m, dA_src, dA_LD, "dA before getMatrix");
#endif

    /* 1. Transpose dA to dwork on the compute stream */
    magmablas_dtranspose2s(dwork, dwork_LD, dA_src, dA_LD, nb, m, compute_stream[deviceID]);
    magma_queue_sync(compute_stream[deviceID]); /* dwork must be ready before the copy */

    /* 2. Copy dwork to A: send the panel to the CPU */
    magma_dgetmatrix_async(m, nb, dwork, dwork_LD, A_dst, LDA, stream1);
    magma_queue_sync(stream1); /* make sure the copy completed before dwork is reused */

#if (dbglevel==10)
    ca_dbg_printMat(m, nb, A_dst, LDA, "A after getMatrix");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('G');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_dsetmatrix_transpose(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_int_t m;
    magma_int_t nb;
    double *A_src;
    magma_int_t LDA;
    double *dA_dst;
    magma_int_t dA_LD;
    double *dwork;
    magma_int_t dwork_LD;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_9(sched_obj, deviceID, m, nb, A_src, LDA, dA_dst, dA_LD, dwork, dwork_LD);
    magma_setdevice(deviceID);

#if (dbglevel==10)
    ca_dbg_printMat(m, nb, A_src, LDA, "A before setMatrix");
#endif

    /* 1. Copy A to dwork: send the panel to the GPU */
    magma_dsetmatrix(m, nb, A_src, LDA, dwork, dwork_LD);

    /* 2. Transpose dwork to dA */
    magmablas_dtranspose2(dA_dst, dA_LD, dwork, dwork_LD, m, nb);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(nb, m, dA_dst, dA_LD, "dA after setMatrix");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('T');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_dgemm(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_trans_t transA;
    magma_trans_t transB;
    magma_int_t m, n, k;
    double alpha;
    double *dA; magma_int_t lda;
    double *dB; magma_int_t ldb;
    double beta;
    double *dC; magma_int_t ldc;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_14(sched_obj, deviceID, transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc);
    magma_setdevice(deviceID);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, k, dA, lda, "A before magma_dgemm");
    ca_dbg_printMat_transpose_gpu(k, n, dB, ldb, "B before magma_dgemm");
    ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C before magma_dgemm");
#endif

    /* magmablasSetKernelStream sets a global, so the stream switch and the call are serialized */
    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    magma_dgemm(transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc);
    pthread_mutex_unlock(&mutex_compute_stream);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C after magma_dgemm");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('S');
    ca_trace_end_cpu('C');
#endif
}
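/*
 * Note on the locking above: in this MAGMA generation, magmablasSetKernelStream
 * selects the kernel stream through a process-wide global, so the stream switch
 * and the BLAS call must be serialized when several scheduler workers target
 * different devices. Below is a minimal sketch of the same pattern as a reusable
 * helper; the helper name is illustrative (not part of this file's API) and it
 * assumes the mutex_compute_stream and compute_stream[] globals used by the
 * tasks above.
 */
static inline void task_guarded_dgemm(int dev,
                                      magma_trans_t transA, magma_trans_t transB,
                                      magma_int_t m, magma_int_t n, magma_int_t k,
                                      double alpha, double *dA, magma_int_t lda,
                                      double *dB, magma_int_t ldb,
                                      double beta, double *dC, magma_int_t ldc)
{
    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[dev]); /* global setting: must be protected */
    magma_dgemm(transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc);
    pthread_mutex_unlock(&mutex_compute_stream);   /* safe: the call is already queued on the stream */
}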
void magma_task_dfree_pinned(Schedule* sched_obj)
{
    double *A;
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_2(sched_obj, A, dep_ptr);

    /* dep_ptr only carries the scheduler dependency */
    magma_free_pinned(A);

#if (dbglevel >=1)
    ca_trace_end_1gpu('O');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_dtrsm(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_side_t side;
    magma_uplo_t uplo;
    magma_trans_t trans;
    magma_diag_t diag;
    magma_int_t m, n;
    double alpha;
    double *dA; magma_int_t lda;
    double *dB; magma_int_t ldb;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_12(sched_obj, deviceID, side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dB, ldb, "A(K,K:N) before magma_dtrsm");
#endif

    magma_setdevice(deviceID);

    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    magma_dtrsm(side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb);
    pthread_mutex_unlock(&mutex_compute_stream);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dB, ldb, "A(K,K:N) after magma_dtrsm");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('U');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dgemm(Schedule* sched_obj)
{
    magma_trans_t transA;
    magma_trans_t transB;
    magma_int_t m, n, k;
    double alpha;
    double *dA; magma_int_t lda;
    double *dB; magma_int_t ldb;
    double beta;
    double *dC; magma_int_t ldc;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_13(sched_obj, transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, k, dA, lda, "A before magma_dgemm");
    ca_dbg_printMat_transpose_gpu(k, n, dB, ldb, "B before magma_dgemm");
    ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C before magma_dgemm");
#endif

    magma_dgemm(transA, transB, m, n, k, alpha, dA, lda, dB, ldb, beta, dC, ldc);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dC, ldc, "C after magma_dgemm");
#endif

#if (dbglevel >=1)
    ca_trace_end_1gpu('S');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dlaswp(Schedule* sched_obj)
{
    magma_int_t n;
    double *dA;
    magma_int_t lda;
    magma_int_t i1, i2;
    magma_int_t *ipiv;
    magma_int_t inci;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_7(sched_obj, n, dA, lda, i1, i2, ipiv, inci);
    magmablas_dlaswp(n, dA, lda, i1, i2, ipiv, inci);

#if (dbglevel >=1)
    ca_trace_end_1gpu('W');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dsetmatrix_transpose(Schedule* sched_obj)
{
    magma_int_t m, nb;
    double *A_src; magma_int_t LDA;
    double *dA_dst; magma_int_t dA_LD;
    double *dwork; magma_int_t dwork_LD;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_8(sched_obj, m, nb, A_src, LDA, dA_dst, dA_LD, dwork, dwork_LD);

#if (dbglevel==10)
    ca_dbg_printMat(m, nb, A_src, LDA, "A before setMatrix");
#endif

    /* 1. Copy A to dwork: send the panel to the GPU */
    magma_dsetmatrix(m, nb, A_src, LDA, dwork, dwork_LD);

    /* 2. Transpose dwork to dA */
    magmablas_dtranspose2(dA_dst, dA_LD, dwork, dwork_LD, m, nb);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(nb, m, dA_dst, dA_LD, "dA after setMatrix");
#endif

#if (dbglevel >=1)
    ca_trace_end_1gpu('T');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_queue_sync(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_queue_t stream1;
    void *dep_ptr;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_3(sched_obj, deviceID, stream1, dep_ptr);
    magma_setdevice(deviceID);
    magma_queue_sync(stream1);

#if (dbglevel >=1)
    ca_trace_end_gpu('O');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dev_dlaswp(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_int_t n;
    double *dA;
    magma_int_t lda;
    magma_int_t i1, i2;
    magma_int_t *ipiv;
    magma_int_t inci;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_8(sched_obj, deviceID, n, dA, lda, i1, i2, ipiv, inci);
    magma_setdevice(deviceID);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(n, n, dA, lda, "A(n,n) before magma_dlaswp");
#endif

    pthread_mutex_lock(&mutex_compute_stream);
    magmablasSetKernelStream(compute_stream[deviceID]);
    magmablas_dlaswp(n, dA, lda, i1, i2, ipiv, inci);
    pthread_mutex_unlock(&mutex_compute_stream);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(n, n, dA, lda, "A(n,n) after magma_dlaswp");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('W');
    ca_trace_end_cpu('C');
#endif
}
void magma_task_dtrsm(Schedule* sched_obj)
{
    magma_side_t side;
    magma_uplo_t uplo;
    magma_trans_t trans;
    magma_diag_t diag;
    magma_int_t m, n;
    double alpha;
    double *dA; magma_int_t lda;
    double *dB; magma_int_t ldb;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_11(sched_obj, side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb);
    magma_dtrsm(side, uplo, trans, diag, m, n, alpha, dA, lda, dB, ldb);

#if (dbglevel >=1)
    ca_trace_end_1gpu('U');
    ca_trace_end_cpu('C');
#endif
}
extern "C" magma_int_t magma_dgetrf_mgpu_work_amc_v3(magma_int_t num_gpus, magma_int_t m, magma_int_t n, double **dlA, magma_int_t dlA_LD, magma_int_t *ipiv, magma_int_t *info, /*workspace on the cpu side*/ double *AWORK, magma_int_t AWORK_LD, magma_int_t AWORK_n ) { /* -- MAGMA (version 1.5.0-beta3) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= DGETRF_REC_ASYNC computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The technique used for the panel factorization is the parallel recursif LU (see lawn 259). The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
    ===================================================================== */

    double c_one = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    int ONE = 1;

    magma_int_t iinfo, nb;
    magma_int_t mindim;
    magma_int_t nrows, ncols;
    magma_int_t dm_max, dn_max;
    magma_int_t I, J, K, M, N, U_K, L;

    double **dlAT;
    magma_int_t dlAT_LD;

    double *dlAP_get[MagmaMaxGPUs];
    double *dlAP_set[MagmaMaxGPUs];
    magma_int_t dlAP_LD;

    double *dlpanel[MagmaMaxGPUs];
    magma_int_t dlpanel_LD;

    int *n_local, *nr_local;
    magma_int_t gpu_nrows, gpu_ncols;

    int nbcores;            /* Number of cores available for the whole factorization */
    int panel_num_threads;  /* Number of threads for the panel */
    double dcpu;            /* Percentage of the matrix to allocate on the CPUs */

    int B_rows;
    double t1;

    /* Recommended dimensions for the CPU workspace */
    int A_m, A_n, A_N, A_NMAX, A_LD;
    int A_NP1;
    double *A;

    amc_args_t *args;

    magma_queue_t mstream[MagmaMaxGPUs][3]; /* 0: H2D, 1: compute, 2: D2H */
    int dd;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (dlA_LD < max(1,m))
        *info = -4;
    else if (AWORK_LD < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Get parameters */
    args = magma_amc_args_get_default();
    nb = args->nb;
    nbcores = args->P;
    panel_num_threads = args->Pr;
    dcpu = args->dcpu;

    /* Check and fix parameters */
    if (nb == 0)
        nb = magma_get_dgetrf_nb(m); /* magma dgetrf block size */
    if (nb > n)
        nb = n;
    if (panel_num_threads > nbcores)
        panel_num_threads = nbcores;

    /* Check the buffer size */
    if (AWORK_n < nb) {
        printf("Not enough buffer. Should be at least the block size: %d\n", nb);
        exit(1);
    }

    /* Compute the number of block columns to factorize */
    N = (int) ceil( (double) min(m, n) / nb);

    /* Compute the maximum number of panels we can store in the workspace */
    A_NMAX = (int) (AWORK_n / nb);

    /* Compute the recommended number of panels for the CPU part */
    A_N = NSplit(N, dcpu);

    /* Compute the recommended number of columns for the CPU part */
    A_n = A_N*nb;

    /* Check if there is enough workspace. The user may give a workspace
       smaller than the optimal one; NOTE: a small workspace may reduce
       performance. */
    if (A_N > A_NMAX) {
#if (dbglevel >=1)
        printf("[DBG_WARNING] Resizing buffer to fit user preferences. Recommended: %d, Max given: %d\n", A_N, A_NMAX);
#endif
        A_N = A_NMAX;
        /* Make A_n a multiple of nb */
        A_n = A_N*nb;
    }

    A = AWORK;
    A_m = m;
    A_LD = AWORK_LD;

#if (dbglevel >=1)
    /* Initialize the tracing */
    ca_dbg_trace_init(nbcores, num_gpus); /* nbcores + num_gpus GPUs */
#endif

#if (dbglevel >=1)
    t1 = magma_wtime();
#endif

    /* Create the streams */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd); /* required */
        magma_queue_create(&mstream[dd][0]);
        magma_queue_create(&mstream[dd][1]);
        magma_queue_create(&mstream[dd][2]);

        /* Set the stream for internal computations. Use stream 0 for now,
           because magmablasSetKernelStream is not thread safe (TODO: make it
           safe); switched to mstream[dd][1] after the first transposition. */
        magma_task_dev_set_compute_stream(dd, 0);
    }

    /* Matrix dimensions, rounded up to multiples of 32 */
    dm_max = m;
    dn_max = n;
    if (dm_max%32 != 0) dm_max = ((dm_max + 31)/32)*32;
    if (dn_max%32 != 0) dn_max = ((dn_max + 31)/32)*32;

    /* Local dimensions of the matrix for each GPU */
    n_local  = (int *) malloc(num_gpus*sizeof(int)); /* does not change during the execution */
    nr_local = (int *) malloc(num_gpus*sizeof(int)); /* changes after each update of the trailing submatrix */

    for (dd = 0; dd < num_gpus; dd++) {
        n_local[dd] = numcols2p(dd, n, nb, num_gpus);
        nr_local[dd] = n_local[dd];
    }

    /* Allocate a workspace for the panel transpositions */
    dlAP_LD = dm_max; /* already a multiple of 32 via dm_max */

    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);

        if (MAGMA_SUCCESS != magma_dmalloc(&dlAP_set[dd], dlAP_LD*nb)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        if (magma_is_devptr(dlAP_set[dd]) == 0) {
            fprintf(stderr, "ERROR: dlAP_set[dd] is a host pointer.\n");
        }

#if (dbglevel==10)
        printf("0.4\n");
#endif

        if (MAGMA_SUCCESS != magma_dmalloc(&dlAP_get[dd], dlAP_LD*nb)) {
            /* TODO: free all previously allocated buffers */
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

    /* Workspace for the panels */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        if (MAGMA_SUCCESS != magma_dmalloc(&dlpanel[dd], nb*dm_max)) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }
    dlpanel_LD = nb;

    /* Local matrix storage */
    dlAT = (double **) malloc(num_gpus*sizeof(double*));
    dlAT_LD = n_local[0];
    if (dlAT_LD%32 != 0) dlAT_LD = ((dlAT_LD + 31)/32)*32;

    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        if (MAGMA_SUCCESS != magma_dmalloc(&dlAT[dd], dlAT_LD*dm_max)) {
            for (J = 0; J < dd; J++) {
                magma_setdevice(J);
                magma_free(dlAP_set[J]);
                magma_free(dlAP_get[J]);
                magma_free(dlpanel[J]);
                magma_free(dlAT[J]);
            }
            free(dlAT);
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
    }

#if (dbglevel >=1)
    printf("[DBG] Time workspace memory alloc (dAP): %f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif
    /* 1. Transfer the first column blocks of the matrix from the GPUs to the CPU */
    magma_dgetmatrix_1D_col_bcyclic(A_m, A_n, dlA, dlA_LD, A, A_LD, num_gpus, nb);

#if (dbglevel >=1)
    printf("[DBG] Time First getmatrix: %f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    printf("1.0\n");
    ca_dbg_printMat(A_m, A_n, A, A_LD, "A after first getMatrix");
    printf("2.0\n");
#endif

    /* Update the remaining number of columns on the GPUs */
    for (dd = 0; dd < num_gpus; dd++) {
        nr_local[dd] = nr_local[dd] - numcols2p(dd, A_n, nb, num_gpus);
    }

#if (dbglevel==10)
    ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlA, dlA_LD, "matrix dA to factorize");
    printf("3.0\n");
#endif

    /* 2. Transpose the local part of the matrix on each GPU */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        magmablas_dtranspose2(dlAT[dd], dlAT_LD, dlA[dd], dlA_LD, m, n_local[dd]);
    }

    /* Now route the compute tasks to the dedicated streams */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        magma_task_dev_set_compute_stream(dd, mstream[dd][1]);
    }

#if (dbglevel >=1)
    printf("[DBG] Time First transposition: %f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    printf("4.0\n");
    printf("int to continue");
    scanf("%d", &I);
#endif

    /* Compute the maximum number of steps */
    mindim = min(m, n);
    M = (int) ceil( (double) m / nb);
    N = (int) ceil( (double) mindim / nb); /* N = n/nb */

    /* 3. Let the asynchronous algorithm begin */
#if (dbglevel >=1)
    printf("Starting recursive code ... m:%d, n:%d, nb:%d, nbcores:%d, N:%d, A_N:%d\n",
           m, n, nb, nbcores, N, A_N); /* Summary */
#endif

    /* Initialize the scheduler */
    magma_schedule_init(nbcores, num_gpus);

    K = 0;

    /* Initialize the parallel recursive panel environment */
    CORE_zgetrf_reclap_init();

    magma_schedule_set_task_priority(INT_MAX-1);

    /* Schedule the first panel factorization */
    magma_insert_core_dgetrf_rec(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));

    /* Transfer the factorized panel to the panel buffer of each GPU (dlpanel) */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_insert_dev_dsetmatrix_async_transpose(dd, A_m, nb, A(0,K), A_LD,
            dlpanel(dd,K), dlpanel_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD,
            colptr(K), dlpanel(dd,K));
    }

#if (dbglevel==10)
    magma_schedule_barrier();
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(nb, m, dlpanel(dd,K), dlpanel_LD, "dlpanel[dd] after setmatrix_async");
    }
    printf("4.5: int to continue");
    scanf("%d", &I);
#endif

    /* Transfer the factorized panel also to its final position in the matrix
       (transposition included). TODO: this may use cudaMemcpyDeviceToDevice
       and initiate the transfer from dlpanel. */
    dd = GID(K);
    magma_insert_dev_dsetmatrix_async_transpose(dd, A_m, nb, A(0,K), A_LD,
        dlAT(0,K), dlAT_LD, mstream[dd][0], dlAP_set[dd], dlAP_LD,
        colptr(K), dlAT(0,K));

#if (dbglevel==10)
    magma_schedule_barrier();
    ca_dbg_printMat(m, nb, A(0,0), A_LD, "A(0,0)");
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(nb, m, dlpanel[dd], dlpanel_LD, "dlpanel[dd] after setmatrix to dlAT");
    }
    ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD, "dlA");
    printf("5.0: int to continue");
    scanf("%d", &I);
#endif

    for (K = 0; K <= N-1; K++) {

        /* Compute the new number of CPU block columns */
        A_N = NSplit(N-K, dcpu);

        /* Insert the coarse update of the trailing submatrix corresponding to
           panel K on the GPUs, i.e. submatrix A[K+1:M, K+1+A_N-1:N].
           NOTE: we work on the matrix transpose. */

        /* Set the maximum priority for the GPU computations */
        magma_schedule_set_task_priority(INT_MAX);

        gpu_nrows = m - (K+1)*nb;

        for (J = K+A_N; J < min(K+A_N+num_gpus, N); J++) {

            /* Determine the device that owns the first column of the group of columns to update */
            dd = GID(J);

            /* Determine the number of columns for which to apply the update */
            nr_local[dd] = numcols2p(dd, n - (K+1+A_N-1)*nb, nb, num_gpus);
            gpu_ncols = nr_local[dd];

            if (gpu_ncols > 0) {

                /* Schedule a swap of the trailing submatrix on the GPUs using ipiv[K].
                   Dependency dAT((K+1)-1, (K+A_N)-1) = dAT(K, K+A_N-1) with the previous dgemm. */
                magma_insert_dev_dlaswp(dd, gpu_ncols, dlAT(K, J), dlAT_LD, ONE, nb, ipiv(K), ONE, dlAT(K, J-1)); /* non blocking */

                magma_insert_dev_dtrsm(dd, MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                    gpu_ncols, nb, c_one, dlpanel(dd,K), dlpanel_LD, dlAT(K,J), dlAT_LD); /* non blocking */

                /* aij^T = aij^T - (lik.ukj)^T = aij^T - ukj^T.lik^T */
                magma_insert_dev_dgemm(dd, MagmaNoTrans, MagmaNoTrans,
                    gpu_ncols, gpu_nrows, nb, c_neg_one, dlAT(K,J), dlAT_LD,
                    dlpanel(dd,K+1), dlpanel_LD, c_one, dlAT(K+1,J), dlAT_LD); /* non blocking */
            }
        }

        /* Iterate over the rest of the columns to update the trailing submatrix on the CPU */
        for (J = K+1; J <= min(K+A_N-1, N-1); J++) {

            ncols = min(nb, n - J*nb);

            /* Set the maximum priority for the column holding the next panel
               (look-ahead of depth 1) and process the rest of the update in a
               right-looking way */
            if (J == K+1)
                magma_schedule_set_task_priority(INT_MAX - 2);
            else
                magma_schedule_set_task_priority(INT_MAX - 3 - J);

            /* Dependency colptr(J): make sure column J has been sent from the
               GPU and all previous updates are done */
            magma_insert_core_dlaswp(ncols, A(K,J), A_LD, ONE, nb, ipiv(K), ONE, colptr(J));

            magma_insert_core_dtrsm('L', 'L', 'N', 'U', nb, ncols, c_one, A(K,K), A_LD, A(K,J), A_LD, colptr(J));

            /* Compute the number of block rows to group together before the
               update, to avoid scheduling overhead */
            B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
            B_rows = max(B_rows, 4); /* group at least 4 block rows */

            for (I = K+1; I <= M-1; I += B_rows) {
                nrows = min(B_rows*nb, m - I*nb);

                /* dep colptr(K): make sure the panel is not overwritten or
                   swapped while dgemm uses A[I,K].
                   dep colptr(J): gather all dgemm on one column and create the
                   dependencies with the previous dgemm and the next panel. */
                magma_insert_core_dgemm('N', 'N', nrows, ncols, nb, c_neg_one,
                    A(I,K), A_LD, A(K,J), A_LD, c_one, A(I,J), A_LD, colptr(K), colptr(J));
            }

            if (J == K+1) {
                /* Look ahead and insert the next panel */
                nrows = m - (K+1)*nb;
                ncols = min(nb, n - (K+1)*nb);

                /* Schedule the next panel factorization with maximum priority */
                magma_schedule_set_task_priority(INT_MAX - 1);

                magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                /* Transfer the factorized panel to the panel buffer of each GPU (dlpanel) */
                for (dd = 0; dd < num_gpus; dd++) {
                    magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols,
                        A(K+1,K+1), A_LD, dlpanel(dd,K+1), dlpanel_LD, mstream[dd][0],
                        dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel(dd,K+1));
                }

                /* Determine the upper part of the matrix done by the CPU on
                   that column and send it to the GPU with the panel */
                U_K = max(0, K+1 - A_N + 1);
                nrows = m - U_K*nb;

                /* Transfer the upper part of the column and the factorized
                   panel to their final position on the GPU (transposition
                   included). TODO: this may use cudaMemcpyDeviceToDevice and
                   initiate the transfer from dlpanel. */
                dd = GID(K+1);
                magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols,
                    A(U_K,K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, mstream[dd][0],
                    dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(0,K+1));
            }
        }

        /* Compute the next number of CPU block columns */
        A_NP1 = NSplit(N-(K+1), dcpu) - NSplit(N-K, dcpu) + 1;

        /* Transfer asynchronously (A_NP1 - A_N) block columns (starting at
           column K+A_N) from the GPUs to the CPU to balance the work. Make
           sure this is inserted after all dgemm, because it schedules the
           replacement of a current panel for the case A_N < N. */
        for (L = K+A_N; L < K+A_N+A_NP1; L++) {
            if (L < N) {

                /* Determine the device that owns column L */
                dd = GID(L);
                gpu_ncols = nr_local[dd];
                ncols = min(nb, gpu_ncols);

                magma_schedule_set_task_priority(INT_MAX);

                /* Make sure the computations on stream 1 are done before
                   sending a block column on stream 2 */
                magma_insert_dev_queue_sync(dd, mstream[dd][1], dlAT(K+1,L));
                magma_insert_dev_dgetmatrix_async_transpose(dd, gpu_nrows, ncols,
                    dlAT(K+1,L), dlAT_LD, A(K+1,L), A_LD, mstream[dd][2],
                    dlAP_get[dd], dlAP_LD, colptr(L));

                /* If A_N == 1 there is no look-ahead, so insert the panel here */
                if ((A_N == 1) && (L == K+A_N)) {
                    /* Look ahead and insert the next panel */
                    nrows = m - (K+1)*nb;
                    ncols = min(nb, n - (K+1)*nb);

                    /* Schedule the next panel factorization with maximum priority */
                    magma_schedule_set_task_priority(INT_MAX - 1);

                    magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                    /* Transfer the factorized panel to the panel buffer of each GPU (dlpanel) */
                    for (dd = 0; dd < num_gpus; dd++) {
                        magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols,
                            A(K+1,K+1), A_LD, dlpanel(dd,K+1), dlpanel_LD, mstream[dd][0],
                            dlAP_set[dd], dlAP_LD, colptr(K+1), dlpanel(dd,K+1));
                    }

                    /* Determine the upper part of the matrix done by the CPU
                       on that column and send it to the GPU with the panel */
                    U_K = max(0, K+1 - A_N + 1);
                    nrows = m - U_K*nb;

                    /* Transfer the upper part of the column and the factorized
                       panel to their final position on the GPU (transposition
                       included). TODO: this may use cudaMemcpyDeviceToDevice
                       and initiate the transfer from dlpanel. */
                    dd = GID(K+1);
                    magma_insert_dev_dsetmatrix_async_transpose(dd, nrows, ncols,
                        A(U_K,K+1), A_LD, dlAT(U_K,K+1), dlAT_LD, mstream[dd][0],
                        dlAP_set[dd], dlAP_LD, colptr(K+1), dlAT(0,K+1));
                }
            }
        }

#if (dbglevel==10)
        magma_schedule_barrier();
        ca_dbg_printMat(m, A_n, A, A_LD, "A");
        ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD, "dAT (Step K)");

        nrows = m - K*nb;
        ncols = min(nb, n - K*nb);

        dd = GID(K);
        magma_setdevice(dd);
        ca_dbg_printMat_transpose_gpu(ncols, nrows, dlAT(K,K), dlAT_LD, "dAT(K,K)");

        if (K <= 5) {
            printf("Step K:%d done. Int to continue: ", K);
            scanf("%d", &I);
        }
#endif
    } /* Step K done */

    /* Wait for the termination of all threads */
    magma_schedule_barrier();

    /* Make sure everything arrived */ /* needed? */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        magma_queue_sync(mstream[dd][0]);
        magma_queue_sync(mstream[dd][1]);
        magma_queue_sync(mstream[dd][2]);
    }

    /* TODO: quark is not needed here */
    /* Perform the sequence of left swaps on the matrix corresponding to the different panels */
    for (K = 1; K <= N-1; K++) {
#if (dbglevel >=1)
        ca_trace_start();
#endif
        nrows = min(nb, m - K*nb);
        ncols = min(K*nb, n);

        for (dd = 0; dd <= min(num_gpus-1, K-1); dd++) {
            gpu_ncols = numcols2p(dd, ncols, nb, num_gpus);
            J = dd;
            if (gpu_ncols > 0) {
                magma_setdevice(dd);
                magmablasSetKernelStream(mstream[dd][1]);
                magmablas_dlaswp(gpu_ncols, dlAT(K, J), dlAT_LD, ONE, nrows, ipiv(K), ONE);
            }
        }
#if (dbglevel >=1)
        ca_trace_end_1gpu('W');
#endif
    }

#if (dbglevel==10)
    ca_dbg_printMat_transpose_mgpu(num_gpus, n_local, m, dlAT, dlAT_LD, "dAT after lswap");
#endif

    /* Shut down the scheduler */
    magma_schedule_delete();

    /* Update the permutation vector indices */
    for (K = 1; K <= N-1; K++) {
        nrows = min(nb, n - K*nb);
        for (J = 0; J <= nrows-1; J++) {
            ipiv[K*nb+J] += K*nb;
        }
    }

#if dbglevel>=1
    printf("[DBG] Time Factorization:%f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /* 4. Transpose the matrix back, in/out of place */
    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        magmablasSetKernelStream(mstream[dd][1]);
        magmablas_dtranspose2(dlA[dd], dlA_LD, dlAT[dd], dlAT_LD, n_local[dd], m);
    }

    for (dd = 0; dd < num_gpus; dd++) { /* needed */
        magma_setdevice(dd);
        magmablasSetKernelStream(NULL);
    }

#if dbglevel>=1
    printf("[DBG] Time Final in/out of place transpose:%f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat_mgpu(num_gpus, m, n_local, dlA, dlA_LD, "dA = LU");
#endif

    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        magma_queue_destroy(mstream[dd][0]);
        magma_queue_destroy(mstream[dd][1]);
        magma_queue_destroy(mstream[dd][2]);
    }

    free(n_local);
    free(nr_local);

    for (dd = 0; dd < num_gpus; dd++) {
        magma_setdevice(dd);
        magma_free(dlAP_set[dd]);
        magma_free(dlAP_get[dd]);
        magma_free(dlpanel[dd]);
        magma_free(dlAT[dd]);
    }
    free(dlAT);

#if dbglevel>=1
    printf("[DBG] Time memory free (dAP):%f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if dbglevel>=1
    /* Finalize the tracing */
    ca_dbg_trace_finalize();
    printf("[DBG] Time llog:%f\n", magma_wtime()-t1);
#endif

    return *info;
} /* End of MAGMA_DGETRF_REC_ASYNC_WORK_GPU */
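/*
 * Hedged usage sketch for magma_dgetrf_mgpu_work_amc_v3. Everything here
 * except the routine itself is an assumption for illustration: the matrix is
 * already distributed 1-D block-cyclically over the GPUs in dlA[], and the
 * caller provides a pinned host workspace AWORK; the choice of 8 panels for
 * the workspace width is arbitrary, not a requirement of the routine.
 */
static magma_int_t example_call_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                                     double **dlA, magma_int_t dlA_LD, magma_int_t *ipiv)
{
    magma_int_t info = 0;
    magma_int_t nb     = magma_get_dgetrf_nb(m); /* same blocking the routine defaults to */
    magma_int_t aworkn = 8*nb;                   /* assumed: room for 8 panels */
    double *AWORK;

    /* pinned CPU workspace; AWORK_LD >= m is required by the argument checks */
    if (MAGMA_SUCCESS != magma_dmalloc_pinned(&AWORK, (size_t)m * aworkn))
        return MAGMA_ERR_HOST_ALLOC;

    magma_dgetrf_mgpu_work_amc_v3(num_gpus, m, n, dlA, dlA_LD, ipiv, &info,
                                  AWORK, m, aworkn);

    magma_free_pinned(AWORK);
    return info;
}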
void magma_task_dev_dsetmatrix_async_transpose(Schedule* sched_obj)
{
    magma_int_t deviceID;
    magma_int_t m, nb;
    double *A_src; magma_int_t LDA;
    double *dA_dst; magma_int_t dA_LD;
    magma_queue_t stream1;
    double *dwork; magma_int_t dwork_LD;

#if (dbglevel >=1)
    ca_trace_start();
#endif

    schedule_unpack_args_10(sched_obj, deviceID, m, nb, A_src, LDA, dA_dst, dA_LD, stream1, dwork, dwork_LD);
    magma_setdevice(deviceID);

    /* Make sure any operation on the device completed */
    magma_queue_sync(compute_stream[deviceID]);

#if (dbglevel==10)
    ca_dbg_printMat(m, nb, A_src, LDA, "A before setMatrix");
#endif

    /* 1. Copy A to dwork: send the panel to the GPU */
    magma_dsetmatrix_async(m, nb, A_src, LDA, dwork, dwork_LD, stream1);

#if (dbglevel==10)
    magma_queue_sync(stream1);
    ca_dbg_printMat_gpu(m, nb, dwork, dwork_LD, "dwork after setMatrix");
#endif

    magma_queue_sync(stream1); /* make sure dwork is set before the transpose */

    /* 2. Transpose dwork to dA on the compute stream */
    magmablas_dtranspose2s(dA_dst, dA_LD, dwork, dwork_LD, m, nb, compute_stream[deviceID]);

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(nb, m, dA_dst, dA_LD, "dA after setMatrix");
#endif

#if (dbglevel >=1)
    ca_trace_end_gpu('T');
    ca_trace_end_cpu('C');
#endif
}
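/*
 * The two magma_queue_sync calls in the task above make the copy/transpose
 * ordering explicit but block the host thread. A minimal sketch of the same
 * ordering expressed with MAGMA events instead, assuming the magma_event_*
 * API of this MAGMA generation; the helper name and parameters are
 * illustrative, and the file keeps the blocking form above.
 */
static void set_then_transpose_with_event(magma_int_t m, magma_int_t nb,
                                          const double *A_src, magma_int_t LDA,
                                          double *dwork, magma_int_t dwork_LD,
                                          double *dA_dst, magma_int_t dA_LD,
                                          magma_queue_t copy_stream,
                                          magma_queue_t comp_stream)
{
    magma_event_t copy_done;
    magma_event_create(&copy_done);

    /* H2D copy on the copy stream, then transpose on the compute stream;
       the event makes the transpose wait without blocking the host */
    magma_dsetmatrix_async(m, nb, A_src, LDA, dwork, dwork_LD, copy_stream);
    magma_event_record(copy_done, copy_stream);
    magma_queue_wait_event(comp_stream, copy_done);
    magmablas_dtranspose2s(dA_dst, dA_LD, dwork, dwork_LD, m, nb, comp_stream);

    magma_event_destroy(copy_done); /* deferred by the driver until the wait completes */
}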
extern "C" magma_int_t magma_dgetrf_gpu_work_amc( magma_int_t m, magma_int_t n, double *dA, magma_int_t dA_LD, magma_int_t *ipiv, magma_int_t *info, /*workspace on the cpu side*/ double *AWORK, magma_int_t AWORK_LD, magma_int_t AWORK_n ) { /* -- MAGMA (version 1.5.0-beta3) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2011 Purpose ======= DGETRF_GPU_WORK_AMC computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The technique used for the panel factorization is the parallel recursif LU (see lawn 259). The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
    ===================================================================== */

    double c_one = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    int ONE = 1;

    magma_int_t iinfo, nb;
    magma_int_t mindim;
    magma_int_t nrows, ncols;
    magma_int_t dm_max, dn_max;
    magma_int_t I, J, K, M, N, U_K;

    double *dAT;
    magma_int_t dAT_LD;

    double *dAP_set, *dAP_get;
    magma_int_t dAP_LD;

    magma_int_t gpu_nrows, gpu_ncols;

    int nbcores;            /* Number of cores available for the whole factorization */
    int panel_num_threads;  /* Number of threads for the panel */
    double dcpu;            /* Percentage of the matrix to allocate on the CPUs */

    int B_rows;
    double t1;

    /* Recommended dimensions for the CPU workspace */
    int A_m, A_n, A_N, A_NMAX, A_LD;
    double *A;

#ifdef USE_CALU
    int i_nrows;
#endif

    amc_args_t *args;

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (dA_LD < max(1,m))
        *info = -4;
    else if (AWORK_LD < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Get parameters */
    args = magma_amc_args_get_default();
    nb = args->nb;
    nbcores = args->P;
    panel_num_threads = args->Pr;
    dcpu = args->dcpu;

    /* Check and fix parameters */
    if (nb == 0)
        nb = magma_get_dgetrf_nb(m); /* magma dgetrf block size */
    if (nb > n)
        nb = n;
    if (panel_num_threads > nbcores)
        panel_num_threads = nbcores;

    /* Compute the maximum number of panels we can store in the workspace */
    A_NMAX = (int) (AWORK_n / nb);

    /* Compute the recommended number of columns for the CPU part */
    A_n = (int) ceil(n*dcpu);

    /* Compute the recommended number of panels for the CPU part */
    A_N = (int) (A_n / nb);

    /* Check if there is enough workspace. The user may give a workspace
       smaller than the optimal one; NOTE: a small workspace may reduce
       performance. */
    if (A_N > A_NMAX) {
#if (dbglevel >=1)
        printf("[DBG_WARNING] Resizing buffer to fit user preferences. Recommended: %d, Max given: %d\n", A_N, A_NMAX);
#endif
        A_N = A_NMAX;
    }

    A = AWORK;
    A_m = m;
    A_LD = AWORK_LD;

#if (dbglevel >=1)
    /* Initialize the tracing */
    ca_dbg_trace_init(nbcores, 1); /* nbcores + 1 GPU */
#endif

#if (dbglevel >=1)
    t1 = magma_wtime();
#endif

    /* Transfer the first column block of the matrix from the GPU to the CPU */
    magma_dgetmatrix(A_m, A_n, dA, dA_LD, A, A_LD);

#if (dbglevel >=1)
    printf("[DBG] Time First getmatrix: %f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat(m, A_n, A, A_LD, "A after first getMatrix");
#endif

    /* Allocate a workspace for the panel transpositions */
    dAP_LD = m;
    if (dAP_LD%32 != 0) dAP_LD = ((dAP_LD + 31)/32)*32; /* make dAP_LD a multiple of 32 */

    if (MAGMA_SUCCESS != magma_dmalloc(&dAP_set, dAP_LD*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    if (MAGMA_SUCCESS != magma_dmalloc(&dAP_get, dAP_LD*nb)) {
        magma_free(dAP_set);
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

#if (dbglevel >=1)
    printf("[DBG] Time workspace memory alloc (dAP): %f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /* Transpose the GPU part of the matrix in/out of place */
    if (m == n) {
        dAT = dA;
        dAT_LD = dA_LD;
        magmablas_dtranspose_inplace(m, dAT, dAT_LD);
    }
    else {
        dm_max = m;
        dn_max = n;

        /* Make sure m and n are multiples of 32 */
        if (dm_max%32 != 0) dm_max = ((dm_max + 31)/32)*32;
        if (dn_max%32 != 0) dn_max = ((dn_max + 31)/32)*32;

        if (MAGMA_SUCCESS != magma_dmalloc(&dAT, dm_max*dn_max)) {
            magma_free(dAP_set);
            magma_free(dAP_get);
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }

        dAT_LD = dn_max;
        magmablas_dtranspose2(dAT, dAT_LD, dA, dA_LD, m, n);
    }

#if (dbglevel >=1)
    printf("[DBG] Time First transposition: %f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD, "matrix dAT to factorize");
#endif

    /* Compute the maximum number of steps */
    mindim = min(m, n);
    M = (int) ceil( (double) m / nb);
    N = (int) ceil( (double) mindim / nb); /* N = n/nb */

    /* Let the asynchronous algorithm begin */
#if (dbglevel >=1)
    printf("Starting recursive code ... m:%d, n:%d, nb:%d, nbcores:%d, N:%d, A_N:%d\n",
           m, n, nb, nbcores, N, A_N); /* Summary */
#endif

    /* Initialize the scheduler */
    magma_schedule_init(nbcores, 1);

    K = 0;

#ifdef USE_CALU
    /* Initialize the CALU environment */
    core_dtslu_alloc(panel_num_threads, A_m, nb);
    core_dtslu_init(panel_num_threads);

    /* Initialize the row indices: required */
    for (I = 0; I < A_m; I++) ipiv[I] = I;
#else
    /* Initialize the parallel recursive panel environment */
    CORE_zgetrf_reclap_init();
#endif

    magma_schedule_set_task_priority(INT_MAX-1);

    /* Schedule the first panel factorization */
#ifdef USE_CALU
    magma_insert_core_dtslu(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));

    B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);
    B_rows = max(B_rows, 4); /* group at least 4 block rows */

    for (I = K+1; I <= M-1; I += B_rows) {
        i_nrows = min(B_rows*nb, m - I*nb);
        magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, nb, c_one, A(0,K), A_LD, A(I,K), A_LD, colptr(K));
    }
#else
    magma_insert_core_dgetrf_rec(A_m, nb, A(0,K), A_LD, ipiv(0), &iinfo, panel_num_threads, colptr(K));
#endif

    /* Transfer the factorized panel to the GPU (transposition included) */
    magma_insert_dsetmatrix_transpose(A_m, nb, A(0,K), A_LD, dAT(0,K), dAT_LD, dAP_set, dAP_LD, colptr(K), dAT(K,K));

#if (dbglevel==10)
    magma_schedule_barrier();
    ca_dbg_printMat(m, nb, A(0,0), A_LD, "A(0,0)");
    ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD, "dA");
#endif

    for (K = 0; K <= N-1; K++) {

        /* Insert the coarse update of the trailing submatrix corresponding to
           panel K on the GPU, i.e. submatrix A[K+1:M, K+1+A_N-1:N] */
        gpu_nrows = m - (K+1)*nb;
        gpu_ncols = n - (K+1+A_N-1)*nb;

        if (gpu_ncols > 0) {

            /* NOTE: we work on the matrix transpose */

            /* Set the maximum priority for the GPU computations */
            magma_schedule_set_task_priority(INT_MAX);

            /* Schedule a swap of the trailing submatrix on the GPU using ipiv[K].
               Dependency dAT((K+1)-1, (K+A_N)-1) = dAT(K, K+A_N-1) with the previous dgemm. */
            magma_insert_dlaswp(gpu_ncols, dAT(K, K+A_N), dAT_LD, ONE, nb, ipiv(K), ONE, dAT(K, K+A_N-1)); /* non blocking */

            magma_insert_dtrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, gpu_ncols, nb, c_one, dAT(K,K), dAT_LD, dAT(K,K+A_N), dAT_LD); /* non blocking */

            /* aij^T = aij^T - (lik.ukj)^T = aij^T - ukj^T.lik^T */
            magma_insert_dgemm(MagmaNoTrans, MagmaNoTrans, gpu_ncols, gpu_nrows, nb, c_neg_one, dAT(K,K+A_N), dAT_LD, dAT(K+1,K), dAT_LD, c_one, dAT(K+1,K+A_N), dAT_LD); /* non blocking */
        }

        /* Iterate over the rest of the columns to update the trailing submatrix on the CPU */
        for (J = K+1; J <= min(K+A_N-1, N-1); J++) {

            ncols = min(nb, n - J*nb);

            /* Set the maximum priority for the column holding the next panel
               (look-ahead of depth 1) and process the rest of the update in a
               right-looking way */
            if (J == K+1)
                magma_schedule_set_task_priority(INT_MAX - 2);
            else
                magma_schedule_set_task_priority(INT_MAX - 3 - J);

            /* Dependency colptr(J): make sure column J has been sent from the
               GPU and all previous updates are done */
            magma_insert_core_dlaswp(ncols, A(K,J), A_LD, ONE, nb, ipiv(K), ONE, colptr(J));

            magma_insert_core_dtrsm('L', 'L', 'N', 'U', nb, ncols, c_one, A(K,K), A_LD, A(K,J), A_LD, colptr(J));

            /* Compute the number of block rows to group together before the
               update, to avoid scheduling overhead */
            B_rows = (int) ceil((double) (M-K-1)/panel_num_threads);

            for (I = K+1; I <= M-1; I += B_rows) {
                nrows = min(B_rows*nb, m - I*nb);

                /* dep colptr(K): make sure the panel is not overwritten or
                   swapped while dgemm uses A[I,K].
                   dep colptr(J): gather all dgemm on one column and create the
                   dependencies with the previous dgemm and the next panel. */
                magma_insert_core_dgemm('N', 'N', nrows, ncols, nb, c_neg_one, A(I,K), A_LD, A(K,J), A_LD, c_one, A(I,J), A_LD, colptr(K), colptr(J));
            }

            if (J == K+1) {
                /* Look ahead and insert the next panel */
                nrows = m - (K+1)*nb;
                ncols = min(nb, n - (K+1)*nb);

                /* Schedule the next panel factorization with maximum priority */
                magma_schedule_set_task_priority(INT_MAX - 1);

#ifdef USE_CALU
                magma_insert_core_dtslu(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                B_rows = (int) ceil((double) (M-(K+1)-1)/panel_num_threads);
                B_rows = max(B_rows, 4); /* group at least 4 block rows */

                for (I = K+2; I <= M-1; I += B_rows) {
                    i_nrows = min(B_rows*nb, m - I*nb);
                    magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, ncols, c_one, A(K+1,K+1), A_LD, A(I,K+1), A_LD, colptr(K+1));
                }
#else
                magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));
#endif

                /* Determine the upper part of the matrix done by the CPU on
                   that column and send it to the GPU with the panel */
                U_K = max(0, K+1 - A_N + 1);
                nrows = m - U_K*nb;

                /* Transfer the upper part of the matrix for that column and
                   the factorized panel to the GPU */
                magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
            }
        }

        /* Transfer asynchronously one column (column K+A_N) from the GPU to
           the CPU to balance the work. Make sure this is inserted after all
           dgemm, because it schedules the replacement of a current panel for
           the case A_N < N. */
        if (K+A_N < N) {
            ncols = min(nb, gpu_ncols);

            magma_schedule_set_task_priority(INT_MAX);

            magma_insert_dgetmatrix_transpose(gpu_nrows, ncols, dAT(K+1,K+A_N), dAT_LD, A(K+1,K+A_N), A_LD, dAP_get, dAP_LD, colptr(K+A_N)); /* blocking */

            /* If A_N == 1 there is no look-ahead, so insert the panel here */
            if (A_N == 1) {
                /* Look ahead and insert the next panel */
                nrows = m - (K+1)*nb;
                ncols = min(nb, n - (K+1)*nb);

                /* Schedule the next panel factorization with maximum priority */
                magma_schedule_set_task_priority(INT_MAX - 1);

#ifdef USE_CALU
                magma_insert_core_dtslu(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));

                B_rows = (int) ceil((double) (M-(K+1)-1)/panel_num_threads);
                B_rows = max(B_rows, 4); /* group at least 4 block rows */

                for (I = K+2; I <= M-1; I += B_rows) {
                    i_nrows = min(B_rows*nb, m - I*nb);
                    magma_insert_core_dtrsm_gatherv('R', 'U', 'N', 'N', i_nrows, ncols, c_one, A(K+1,K+1), A_LD, A(I,K+1), A_LD, colptr(K+1));
                }
#else
                magma_insert_core_dgetrf_rec(nrows, ncols, A(K+1,K+1), A_LD, ipiv(K+1), &iinfo, panel_num_threads, colptr(K+1));
#endif

                /* Determine the upper part of the matrix done by the CPU on
                   that column and send it to the GPU with the panel */
                U_K = max(0, K+1 - A_N + 1);
                nrows = m - U_K*nb;

                /* Transfer the upper part of the matrix for that column and
                   the factorized panel to the GPU */
                magma_insert_dsetmatrix_transpose(nrows, ncols, A(U_K, K+1), A_LD, dAT(U_K, K+1), dAT_LD, dAP_set, dAP_LD, colptr(K+1), dAT(K+1,K+1));
            }
        }

#if (dbglevel==10)
        magma_schedule_barrier();
        ca_dbg_printMat(m, A_n, A, A_LD, "A");
        ca_dbg_printMat_transpose_gpu(m, n, dAT, dAT_LD, "dA");
#endif
    } /* Step K done */

    /* Wait for the termination of all threads */
    magma_schedule_barrier();

    /* TODO: quark is not needed here */
    /* Perform the sequence of left swaps on the matrix corresponding to the different panels */
    for (K = 1; K <= N-1; K++) {
#if (dbglevel >=1)
        ca_trace_start();
#endif
        nrows = min(nb, m - K*nb);
        ncols = min(K*nb, n);

        /* dep dAT(K-1): make sure the last swap is completed, as well as the
           dgemm using the panel */
        magmablas_dlaswp(ncols, dAT(K, 0), dAT_LD, ONE, nrows, ipiv(K), ONE);

#if (dbglevel >=1)
        ca_trace_end_1gpu('W');
#endif
    }

    /* Shut down the scheduler */
    magma_schedule_delete();

    /* Update the permutation vector indices */
    for (K = 1; K <= N-1; K++) {
        nrows = min(nb, n - K*nb);
        for (J = 0; J <= nrows-1; J++) {
            ipiv[K*nb+J] += K*nb;
        }
    }

#if dbglevel>=1
    printf("[DBG] Time Factorization:%f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

    /* No need for a synchronization here, since dtranspose is blocking */
    if (m == n) {
        magmablas_dtranspose_inplace(m, dAT, dAT_LD);
        dA = dAT;
    }
    else {
        magmablas_dtranspose2(dA, dA_LD, dAT, dAT_LD, n, m);
        magma_free(dAT);
    }

#if dbglevel>=1
    printf("[DBG] Time Final in/out of place transpose:%f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#ifdef USE_CALU
    core_dtslu_free();
#endif

    magma_free(dAP_set);
    magma_free(dAP_get);

#if dbglevel>=1
    printf("[DBG] Time memory free (dAP):%f\n", magma_wtime()-t1);
    t1 = magma_wtime();
#endif

#if (dbglevel==10)
    ca_dbg_printMat_transpose_gpu(m, n, dA, dA_LD, "dA = LU");
#endif

#if dbglevel>=1
    /* Finalize the tracing */
    ca_dbg_trace_finalize();
    printf("[DBG] Time llog:%f\n", magma_wtime()-t1);
#endif

    return *info;
} /* End of MAGMA_DGETRF_REC_ASYNC_WORK_GPU */
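/*
 * Hedged usage sketch for magma_dgetrf_gpu_work_amc, mirroring the multi-GPU
 * example earlier in this file: dA already holds the matrix on the current
 * device, and the workspace sizing (8 panels here) is an assumption for
 * illustration, not a requirement of the routine.
 */
static magma_int_t example_call_1gpu(magma_int_t m, magma_int_t n,
                                     double *dA, magma_int_t dA_LD, magma_int_t *ipiv)
{
    magma_int_t info = 0;
    magma_int_t nb     = magma_get_dgetrf_nb(m);
    magma_int_t aworkn = 8*nb;
    double *AWORK;

    /* pinned CPU workspace; AWORK_LD >= m is required by the argument checks */
    if (MAGMA_SUCCESS != magma_dmalloc_pinned(&AWORK, (size_t)m * aworkn))
        return MAGMA_ERR_HOST_ALLOC;

    magma_dgetrf_gpu_work_amc(m, n, dA, dA_LD, ipiv, &info, AWORK, m, aworkn);

    magma_free_pinned(AWORK);
    return info;
}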