/**
    Recursive panel step of the batched lower Cholesky factorization.

    Works on an m-by-n panel (m >= n) of every matrix in the batch.  When the
    panel is narrow enough (n <= min_recpnb, or a single column) it is handed
    to magma_dpotrf_panel_batched.  Otherwise the panel is split column-wise
    into [A1 A2] with n1 = n/2 and n2 = n - n1 columns:
      1. recurse on A1 (m rows, n1 columns),
      2. update A2 := A2 - L21 * L21^T with a batched GEMM, where L21 is the
         block of A1 that starts at row n1,
      3. recurse on A2 (m - n1 rows, n2 columns).

    @param[in]     uplo          Only the lower case is implemented; MagmaUpper
                                 is rejected with -100.
    @param[in]     m             Number of rows of the panel.
    @param[in]     n             Number of columns of the panel; must be <= m.
    @param[in]     min_recpnb    Panel width at or below which recursion stops.
    @param[in,out] dA_array      Array of pointers to the panels, one per batch entry.
    @param[in]     ldda          Leading dimension of each panel.
    @param         dX_array, dX_length, dinvA_array, dinvA_length
                                 Workspaces forwarded unchanged to the panel kernel.
    @param         dW0_displ     Pointer-array workspace; overwritten here to
                                 address A2 during the update.
    @param         dW1_displ ... dW4_displ
                                 Pointer-array workspaces forwarded to the panel kernel.
    @param         info_array, gbstep
                                 Forwarded unchanged to the panel kernel.
    @param[in]     batchCount    Number of matrices in the batch.
    @param[in]     myhandle      cuBLAS handle forwarded to the panel kernel.
    @param[in]     queue         Queue passed to every kernel launched here.

    @return  0 on success; 1 on the quick return (m == 0 or n == 0);
             -100 for MagmaUpper; -101 if m < n;
             MAGMA_ERR_DEVICE_ALLOC if the pointer workspace cannot be allocated.
*/
extern "C" magma_int_t
magma_dpotrf_recpanel_batched(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t min_recpnb,
    double** dA_array,    magma_int_t ldda,
    double** dX_array,    magma_int_t dX_length,
    double** dinvA_array, magma_int_t dinvA_length,
    double** dW0_displ, double** dW1_displ,
    double** dW2_displ, double** dW3_displ,
    double** dW4_displ,
    magma_int_t *info_array, magma_int_t gbstep,
    magma_int_t batchCount, cublasHandle_t myhandle, magma_queue_t queue)
{
    // Quick return if possible
    if (m == 0 || n == 0) {
        return 1;
    }
    if (uplo == MagmaUpper) {
        printf("Upper side is unavailable \n");
        return -100;
    }
    if (m < n) {
        printf("error m < n %d < %d \n", (int) m, (int) n);
        return -101;
    }

    magma_int_t arginfo = 0;
    double alpha = MAGMA_D_NEG_ONE;
    double beta  = MAGMA_D_ONE;

    double **dA_displ = NULL;
    magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ));
    if (dA_displ == NULL) {
        // Without this check a NULL pointer array would be passed to the kernels.
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // A one-column panel cannot be split any further: n/2 would be 0 and the
    // second half would have the same width again, recursing forever when
    // min_recpnb < 1.
    if (n <= min_recpnb || n <= 1) {
        // bottom of the recursion: factor the whole panel directly
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue);
        magma_dpotrf_panel_batched(
                           uplo, m, n,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, myhandle, queue);
    }
    else {
        // split A column-wise into [A1 A2]:
        // panel on A1, update of A2, then panel on A2
        magma_int_t n1 = n/2;
        magma_int_t n2 = n - n1;
        magma_int_t m2 = m - n1;

        // panel on A1 = A(0:m, 0:n1)
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue);
        arginfo = magma_dpotrf_recpanel_batched(
                           uplo, m, n1, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, myhandle, queue);
        // A positive value is only the "nothing to do" quick return.
        if (arginfo < 0) goto cleanup;

        // update A2: A(n1:m, n1:n) -= A(n1:m, 0:n1) * A(n1:m, 0:n1)^T
        // (the GEMM is m2 x n2 with inner dimension n1, so only the first n2
        //  rows of the second operand are read)
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, n1, 0,  batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, n1, n1, batchCount, queue);
        magmablas_dgemm_batched(MagmaNoTrans, MagmaConjTrans, m2, n2, n1,
                           alpha, dA_displ, ldda,
                           dA_displ, ldda,
                           beta,  dW0_displ, ldda,
                           batchCount, queue);

        // panel on A2 = A(n1:m, n1:n)
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, n1, n1, batchCount, queue);
        arginfo = magma_dpotrf_recpanel_batched(
                           uplo, m2, n2, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, myhandle, queue);
        if (arginfo < 0) goto cleanup;
        arginfo = 0;
    }

cleanup:
    magma_free(dA_displ);
    return arginfo;
}
/**
    Recursive tile variant of the batched lower Cholesky panel factorization.

    The leading n-by-n tile of every matrix is factored recursively; the rows
    below it (m > n) are then obtained with one triangular solve against the
    whole factored tile.  For a tile wider than min_recpnb the steps are, with
    n1 = n/2 and n2 = n - n1:
      1. recurse on A11 (n1 x n1),
      2. triangular solve on A21 (n2 x n1) using A11,
      3. update A22 := A22 - A21 * A21^T,
      4. recurse on A22 (n2 x n2).
    A tile of width <= min_recpnb (or a single column) goes straight to
    magma_dpotrf_panel_batched with all m rows.

    @param[in]     uplo          Only the lower case is implemented; MagmaUpper
                                 is rejected with -100.
    @param[in]     m             Number of rows of the panel.
    @param[in]     n             Number of columns of the panel; must be <= m.
    @param[in]     min_recpnb    Tile width at or below which recursion stops.
    @param[in,out] dA_array      Array of pointers to the panels, one per batch entry.
    @param[in]     ldda          Leading dimension of each panel.
    @param         dX_array      Workspace used as the solution buffer of the
                                 triangular solves (leading dimension n2 or m-n).
    @param         dX_length     Forwarded to the panel kernel.
    @param         dinvA_array, dinvA_length
                                 Workspace for the triangular solves and the panel kernel.
    @param         dW0_displ     Pointer-array workspace; overwritten here.
    @param         dW1_displ ... dW4_displ
                                 Pointer-array workspaces forwarded to the kernels.
    @param         info_array, gbstep
                                 Forwarded unchanged to the panel kernel.
    @param[in]     batchCount    Number of matrices in the batch.
    @param[in]     myhandle      cuBLAS handle forwarded to the panel kernel.
    @param[in]     queue         Queue passed to every kernel launched here.

    @return  0 on success; 1 on the quick return (m == 0 or n == 0);
             -100 for MagmaUpper; -101 if m < n;
             MAGMA_ERR_DEVICE_ALLOC if the pointer workspace cannot be allocated.
*/
extern "C" magma_int_t
magma_dpotrf_rectile_batched(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t min_recpnb,
    double** dA_array,    magma_int_t ldda,
    double** dX_array,    magma_int_t dX_length,
    double** dinvA_array, magma_int_t dinvA_length,
    double** dW0_displ, double** dW1_displ,
    double** dW2_displ, double** dW3_displ,
    double** dW4_displ,
    magma_int_t *info_array, magma_int_t gbstep,
    magma_int_t batchCount, cublasHandle_t myhandle, magma_queue_t queue)
{
    // Quick return if possible
    if (m == 0 || n == 0) {
        return 1;
    }
    if (uplo == MagmaUpper) {
        printf("Upper side is unavailable \n");
        return -100;
    }
    if (m < n) {
        printf("error m < n %d < %d \n", (int) m, (int) n);
        return -101;
    }

    magma_int_t arginfo = 0;
    double alpha = MAGMA_D_NEG_ONE;
    double beta  = MAGMA_D_ONE;

    double **dA_displ = NULL;
    magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ));
    if (dA_displ == NULL) {
        // Without this check a NULL pointer array would be passed to the kernels.
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // A one-column tile cannot be split: n/2 would be 0 and the A22 recursion
    // would see the same width again, never terminating when min_recpnb < 1.
    if (n <= min_recpnb || n <= 1) {
        // bottom of the recursion: factor the full m-row panel directly
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue);
        magma_dpotrf_panel_batched(
                           uplo, m, n,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, myhandle, queue);
    }
    else {
        // A is viewed as [A11 A12; A21 A22; A31 A32]; only the leading
        // n x n tile is handled inside this branch.
        magma_int_t n1 = n/2;
        magma_int_t n2 = n - n1;

        // recursive factorization of A11 = A(0:n1, 0:n1)
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue);
        arginfo = magma_dpotrf_rectile_batched(
                           uplo, n1, n1, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, myhandle, queue);
        // A positive value is only the "nothing to do" quick return.
        if (arginfo < 0) goto cleanup;

        // triangular solve on A21 = A(n1:n, 0:n1) using the factored A11
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, 0,  0, batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, n1, 0, batchCount, queue);
        magmablas_dtrsm_work_batched(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                           1, n2, n1,
                           MAGMA_D_ONE,
                           dA_displ,  ldda,
                           dW0_displ, ldda,
                           dX_array,  n2,
                           dinvA_array, dinvA_length,
                           dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           0, batchCount, queue);

        // update A22 = A(n1:n, n1:n) -= A21 * A21^T
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, n1, 0,  batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, n1, n1, batchCount, queue);
        magmablas_dgemm_batched(MagmaNoTrans, MagmaConjTrans, n2, n2, n1,
                           alpha, dA_displ, ldda,
                           dA_displ, ldda,
                           beta,  dW0_displ, ldda,
                           batchCount, queue);

        // recursive factorization of A22
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, n1, n1, batchCount, queue);
        arginfo = magma_dpotrf_rectile_batched(
                           uplo, n2, n2, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, myhandle, queue);
        if (arginfo < 0) goto cleanup;
        arginfo = 0;
    }

    if (m > n) {
        // final triangular solve on the rows below the tile, A(n:m, 0:n),
        // using the complete n x n factor at A(0,0)
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, 0, 0, batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, n, 0, batchCount, queue);
        magmablas_dtrsm_work_batched(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit,
                           1, m-n, n,
                           MAGMA_D_ONE,
                           dA_displ,  ldda,
                           dW0_displ, ldda,
                           dX_array,  m-n,
                           dinvA_array, dinvA_length,
                           dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           0, batchCount, queue);
    }

cleanup:
    magma_free(dA_displ);
    return arginfo;
}
/**
    Purpose
    -------
    DGETRF_BATCHED computes an LU factorization with partial pivoting and row
    interchanges for each M-by-N matrix A of a batch.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm: for each
    block column of width BATRF_NB the panel is factored, the row swaps are
    applied to the left and right of the panel, the block row of U is obtained
    with a batched triangular solve, and the trailing matrix is updated with a
    GEMM.  The trailing update uses one magma_dgemm per matrix, spread over 32
    internal streams, when both trailing dimensions exceed a crossover size,
    and a single batched GEMM otherwise.

    The kernel stream is set to NULL while the routine runs and the stream
    that was current at entry is restored before returning.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of each matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of each matrix A.  N >= 0.

    @param[in,out]
    dA_array  Array of pointers, dimension (batchCount), stored on the GPU
            (it is copied to the host with magma_getvector).
            Each entry points to a DOUBLE_PRECISION array on the GPU,
            dimension (LDDA,N).
            On entry, each pointer is an M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of each array A.  LDDA >= max(1,M).

    @param[out]
    ipiv_array  Array of pointers, dimension (batchCount), one per matrix.
            Each entry is an INTEGER array, dimension (min(M,N)), holding
            the pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info_array  INTEGER array, dimension (batchCount), on the GPU.
            Set to zero on entry to this routine and then passed to the
            panel factorization.
            NOTE(review): presumably holds one LAPACK-style status per
            matrix (> 0 meaning U(i,i) is exactly zero) -- confirm against
            the panel routine.

    @param[in]
    batchCount  INTEGER
            The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue passed to the batched kernels.

    @return
      -     = 0:  successful exit
      -     < 0:  if -i, the i-th argument had an illegal value,
                  or MAGMA_ERR_DEVICE_ALLOC if a workspace allocation failed.
      -     otherwise the non-zero value returned by the panel factorization.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf_batched(
        magma_int_t m, magma_int_t n,
        double **dA_array,
        magma_int_t ldda,
        magma_int_t **ipiv_array,
        magma_int_t *info_array,
        magma_int_t batchCount, magma_queue_t queue)
{
#define A(i_, j_)  (A + (i_) + (j_)*ldda)

    magma_int_t min_mn = min(m, n);
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    /* Check arguments */
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible (m and n are non-negative here, so this is
       the same as m == 0 || n == 0) */
    if (min_mn == 0)
        return arginfo;

    if( m > 2048 || n > 2048 ){
        printf("=========================================================================================\n");
        printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n");
        printf("=========================================================================================\n");
    }

//#define ENABLE_TIMER3

#if defined(ENABLE_TIMER3)
    real_Double_t   tall=0.0, tloop=0., talloc=0., tdalloc=0.;
    tall   = magma_sync_wtime(0);
    talloc = magma_sync_wtime(0);
#endif

    double neg_one = MAGMA_D_NEG_ONE;
    double one     = MAGMA_D_ONE;
    magma_int_t ib, i, k, pm;
    magma_int_t nb = BATRF_NB;
    magma_int_t gemm_crossover = nb > 32 ? 127 : 160;
    // magma_int_t gemm_crossover = n;// use only stream gemm

#if defined(USE_CUOPT)
    cublasHandle_t myhandle;
    cublasCreate_v2(&myhandle);
#else
    cublasHandle_t myhandle=NULL;
#endif

    /* pointer-array workspaces, one entry per matrix of the batch */
    magma_int_t **dipiv_displ = NULL;
    double **dA_displ    = NULL;
    double **dW0_displ   = NULL;
    double **dW1_displ   = NULL;
    double **dW2_displ   = NULL;
    double **dW3_displ   = NULL;
    double **dW4_displ   = NULL;
    double **dinvA_array = NULL;
    double **dwork_array = NULL;

    magma_malloc((void**)&dipiv_displ, batchCount * sizeof(*dipiv_displ));
    magma_malloc((void**)&dA_displ,    batchCount * sizeof(*dA_displ));
    magma_malloc((void**)&dW0_displ,   batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ,   batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,   batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,   batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ,   batchCount * sizeof(*dW4_displ));
    magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array));
    magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array));

    magma_int_t invA_msize  = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB;
    magma_int_t dwork_msize = n*nb;
    magma_int_t **pivinfo_array = NULL;
    magma_int_t *pivinfo        = NULL;
    double *dinvA      = NULL;
    double *dwork      = NULL;  // dinvA and dwork are workspace in dtrsm
    double **cpuAarray = NULL;  // host copy of dA_array for the streamed gemm
    magma_dmalloc( &dinvA, invA_msize * batchCount );
    magma_dmalloc( &dwork, dwork_msize * batchCount );
    magma_malloc((void**)&pivinfo_array, batchCount * sizeof(*pivinfo_array));
    magma_malloc((void**)&pivinfo, batchCount * m * sizeof(magma_int_t));
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*));

    /* check allocation */
    if ( dA_displ  == NULL || dW0_displ == NULL || dW1_displ   == NULL || dW2_displ   == NULL ||
         dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL ||
         dinvA     == NULL || dwork     == NULL || cpuAarray   == NULL ||
         dipiv_displ == NULL || pivinfo_array == NULL || pivinfo == NULL) {
        magma_free(dA_displ);
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dinvA_array);
        magma_free(dwork_array);
        magma_free( dinvA );
        magma_free( dwork );
        // memory from magma_malloc_cpu must go back through magma_free_cpu
        magma_free_cpu(cpuAarray);
        magma_free(dipiv_displ);
        magma_free(pivinfo_array);
        magma_free(pivinfo);
#if defined(USE_CUOPT)
        // the handle was created above; do not leak it on this early exit
        cublasDestroy_v2(myhandle);
#endif
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    /* zero the trsm workspaces and build the per-matrix pointer arrays */
    magmablas_dlaset_q(MagmaFull, invA_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dinvA, invA_msize, queue);
    magmablas_dlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, dwork_msize, queue);
    dset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue);
    dset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue);
    set_ipointer(pivinfo_array, pivinfo, 1, 0, 0, m, batchCount, queue);

    // printf(" I am in dgetrfbatched\n");
    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);   // remembered so it can be restored on exit
    magma_int_t streamid;
    const magma_int_t nbstreams=32;
    magma_queue_t stream[nbstreams];
    for(i=0; i<nbstreams; i++){
        magma_queue_create( &stream[i] );
    }
    magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1);

#if defined(ENABLE_TIMER3)
    printf(" I am after malloc\n");
    talloc = magma_sync_wtime(0) - talloc;
    tloop  = magma_sync_wtime(0);
#endif

    for(i = 0; i < min_mn; i+=nb)
    {
        magmablasSetKernelStream(NULL);

        ib = min(nb, min_mn-i);
        pm = m-i;
        // the column offset is 0, so the leading dimension given for ipiv is not used
        magma_idisplace_pointers(dipiv_displ, ipiv_array, ldda, i, 0, batchCount, queue);
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue);
        //===============================================
        //  panel factorization
        //===============================================
#if 0
        arginfo = magma_dgetf2_batched(
                pm, ib,
                dA_displ, ldda,
                dW1_displ, dW2_displ, dW3_displ,
                dipiv_displ,
                info_array, i, batchCount, myhandle);
#else
        arginfo = magma_dgetrf_recpanel_batched(
                pm, ib, 16,
                dA_displ, ldda,
                dipiv_displ, pivinfo_array,
                dwork_array, nb,
                dinvA_array, invA_msize,
                dW0_displ, dW1_displ, dW2_displ,
                dW3_displ, dW4_displ,
                info_array, i,
                batchCount, myhandle, queue);
#endif
        if(arginfo != 0 ) goto fin;
        //===============================================
        // end of panel
        //===============================================

#define RUN_ALL
#ifdef RUN_ALL
        // setup pivinfo before adjusting ipiv
        setup_pivinfo_batched(pivinfo_array, dipiv_displ, pm, ib, batchCount, queue);
        adjust_ipiv_batched(dipiv_displ, ib, i, batchCount, queue);

        // stepinit_ipiv(pivinfo_array, pm, batchCount);// for debug and check swap, it create an ipiv

#if 0
        dlaswp_batched( i, dA_displ, ldda,
                i, i+ib,
                dipiv_displ, pivinfo_array, batchCount);
#else
        // apply the row swaps to the i columns left of the panel
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, 0, batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, 0, batchCount, queue);
        magma_dlaswp_rowparallel_batched( i, dA_displ, ldda,
                dW0_displ, ldda,
                i, i+ib,
                pivinfo_array, batchCount, queue);
#endif

        if( (i + ib) < n)
        {
            // swap right side and trsm
            magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i+ib, batchCount, queue);
            dset_pointer(dwork_array, dwork, nb, 0, 0, dwork_msize, batchCount, queue); // I don't think it is needed Azzam
            magma_dlaswp_rowparallel_batched( n-(i+ib), dA_displ, ldda,
                    dwork_array, nb,
                    i, i+ib,
                    pivinfo_array, batchCount, queue);

            magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue);
            magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i+ib, batchCount, queue);
            magmablas_dtrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1,
                    ib, n-i-ib,
                    MAGMA_D_ONE,
                    dA_displ,    ldda, // dA
                    dwork_array, nb,   // dB
                    dW0_displ,   ldda, // dX
                    dinvA_array, invA_msize,
                    dW1_displ,   dW2_displ,
                    dW3_displ,   dW4_displ,
                    0, batchCount, queue);

            if( (i + ib) < m)
            {
                // if gemm size is >160 use a streamed classical cublas gemm since it is faster
                // the batched is faster only when M=N<=160 for K40c
                //-------------------------------------------
                //          USE STREAM  GEMM
                //-------------------------------------------
                if( (m-i-ib) > gemm_crossover  && (n-i-ib) > gemm_crossover)
                {
                    // The streamed gemm runs on other streams, so it would
                    // have to wait for the TRSM and the swaps.  The rest of
                    // the code uses the NULL stream, which does that
                    // synchronisation by itself, so no explicit sync is issued.
                    //magma_queue_sync(NULL);
                    for(k=0; k<batchCount; k++)
                    {
                        streamid = k%nbstreams;
                        magmablasSetKernelStream(stream[streamid]);
                        magma_dgemm(MagmaNoTrans, MagmaNoTrans,
                                m-i-ib, n-i-ib, ib,
                                neg_one, cpuAarray[k] + (i+ib)+i*ldda, ldda,
                                         cpuAarray[k] + i+(i+ib)*ldda, ldda,
                                one,     cpuAarray[k] + (i+ib)+(i+ib)*ldda, ldda);
                    }
                    // The next panel must not start before its part of the
                    // update is finished; again the NULL stream used by the
                    // rest of the code provides that ordering.
                    //magma_device_sync();
                }
                //-------------------------------------------
                //          USE BATCHED GEMM
                //-------------------------------------------
                else
                {
                    magma_ddisplace_pointers(dA_displ, dA_array, ldda, i+ib, i, batchCount, queue);
                    magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue);
                    magma_ddisplace_pointers(dW2_displ, dA_array, ldda, i+ib, i+ib, batchCount, queue);
                    //printf("caling batched dgemm %d %d %d \n", m-i-ib, n-i-ib, ib);
                    magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib,
                            neg_one, dA_displ,  ldda,
                                     dW1_displ, ldda,
                            one,     dW2_displ, ldda,
                            batchCount, queue);
                } // end of batched/stream gemm
            } // end of  if( (i + ib) < m)
        } // end of if( (i + ib) < n)
#endif
    }// end of for

fin:
    magma_queue_sync(NULL);

#if defined(ENABLE_TIMER3)
    tloop   = magma_sync_wtime(0) - tloop;
    tdalloc = magma_sync_wtime(0);
#endif

    for(i=0; i<nbstreams; i++){
        magma_queue_destroy( stream[i] );
    }
    magmablasSetKernelStream(cstream);

#if defined(USE_CUOPT)
    cublasDestroy_v2(myhandle);
#endif

    magma_free(dA_displ);
    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dinvA_array);
    magma_free(dwork_array);
    magma_free( dinvA );
    magma_free( dwork );
    // memory from magma_malloc_cpu must go back through magma_free_cpu
    magma_free_cpu(cpuAarray);
    magma_free(dipiv_displ);
    magma_free(pivinfo_array);
    magma_free(pivinfo);

#if defined(ENABLE_TIMER3)
    tdalloc = magma_sync_wtime(0) - tdalloc;
    tall = magma_sync_wtime(0) - tall;
    printf("here is the timing from inside dgetrf_batched talloc: %10.5f tloop: %10.5f tdalloc: %10.5f tall: %10.5f sum: %10.5f\n", talloc, tloop, tdalloc, tall, talloc+tloop+tdalloc );
#endif

    return arginfo;
}
/**
    Recursive panel step of the batched LU factorization without pivoting.

    Works on an m-by-n panel of every matrix in the batch.  A panel of width
    <= min_recpnb (or a single column) is handed to
    magma_dgetrf_panel_nopiv_batched_q.  A wider panel is split column-wise
    into [A1 A2] with n1 = n/2 and n2 = n - n1 columns:
      1. recurse on A1 (m rows, n1 columns),
      2. triangular solve on the top n1 rows of A2 with the unit lower
         factor stored in A1,
      3. update the remaining rows of A2 with a batched GEMM,
      4. recurse on the updated part of A2 (m - n1 rows, n2 columns).

    @param[in]     m             Number of rows of the panel.
    @param[in]     n             Number of columns of the panel.
    @param[in]     min_recpnb    Panel width at or below which recursion stops.
    @param[in,out] dA_array      Array of pointers to the panels, one per batch entry.
    @param[in]     ldda          Leading dimension of each panel.
    @param         dX_array      Workspace used as the solution buffer of the
                                 triangular solve (leading dimension n1).
    @param         dX_length     Forwarded to the panel kernel.
    @param         dinvA_array, dinvA_length
                                 Workspace for the triangular solve and the panel kernel.
    @param         dW1_displ     Pointer-array workspace; overwritten here.
    @param         dW2_displ ... dW4_displ
                                 Pointer-array workspaces forwarded to the kernels.
    @param         dW5_displ     Pointer-array workspace; overwritten here.
    @param         info_array    Forwarded unchanged to the panel kernel.
    @param[in]     gbstep        Offset forwarded to the panel kernel; the
                                 recursion on A2 passes gbstep + n1.
    @param[in]     batchCount    Number of matrices in the batch.
    @param[in]     stream        Queue forwarded to the panel kernel.
    @param[in]     myhandle      cuBLAS handle forwarded to the panel kernel.

    @return  0 on success (including the quick return for m == 0 or n == 0),
             the first non-zero value returned by a panel or recursive call,
             or MAGMA_ERR_DEVICE_ALLOC if the pointer workspace cannot be
             allocated.
*/
extern "C" magma_int_t
magma_dgetrf_recpanel_nopiv_batched_q(
    magma_int_t m, magma_int_t n, magma_int_t min_recpnb,
    double** dA_array,    magma_int_t ldda,
    double** dX_array,    magma_int_t dX_length,
    double** dinvA_array, magma_int_t dinvA_length,
    double** dW1_displ, double** dW2_displ,
    double** dW3_displ, double** dW4_displ,
    double** dW5_displ,
    magma_int_t *info_array, magma_int_t gbstep,
    magma_int_t batchCount, magma_queue_t stream, cublasHandle_t myhandle)
{
    // Quick return if possible
    if (m == 0 || n == 0) {
        return 0;
    }

    magma_int_t arginfo = 0;

    double **dA_displ = NULL;
    magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ));
    if (dA_displ == NULL) {
        // Without this check a NULL pointer array would be passed to the kernels.
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // Every exit below goes through "cleanup" so dA_displ is always released;
    // returning directly from the error paths would leak it.
    //
    // A one-column panel cannot be split: n/2 would be 0 and the A2 recursion
    // would see the same width again, never terminating when min_recpnb < 1.
    if (n <= min_recpnb || n <= 1) {
        // bottom of the recursion: factor the panel directly
        arginfo = magma_dgetrf_panel_nopiv_batched_q(
                           m, n,
                           dA_array, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ, dW5_displ,
                           info_array, gbstep, batchCount, stream, myhandle);
    }
    else {
        // split A column-wise into [A1 A2]:
        // panel on A1, update of A2, then panel on A2
        magma_int_t n1 = n/2;
        magma_int_t n2 = n - n1;
        magma_int_t m2 = m - n1;

        // panel on A1 = A(0:m, 0:n1)
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount);
        arginfo = magma_dgetrf_recpanel_nopiv_batched_q(
                           m, n1, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ, dW5_displ,
                           info_array, gbstep, batchCount, stream, myhandle);
        if (arginfo != 0) goto cleanup;

        // update A2: first a triangular solve on its top block A(0:n1, n1:n)
        // with the unit lower triangle of A(0:n1, 0:n1) ...
        magma_ddisplace_pointers(dW5_displ, dA_array, ldda, 0, n1, batchCount);
        magmablas_dtrsm_work_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1,
                           n1, n2,
                           MAGMA_D_ONE,
                           dA_displ,  ldda, // dA
                           dW5_displ, ldda, // dB
                           dX_array,  n1,   // dX
                           dinvA_array, dinvA_length,
                           dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           1, batchCount);

        // ... then A(n1:m, n1:n) -= A(n1:m, 0:n1) * A(0:n1, n1:n)
        magma_ddisplace_pointers(dW1_displ, dA_array, ldda, n1, 0,  batchCount);
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, n1, n1, batchCount);
        magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m2, n2, n1,
                           MAGMA_D_NEG_ONE, dW1_displ, ldda,
                           dW5_displ, ldda,
                           MAGMA_D_ONE,  dA_displ, ldda,
                           batchCount);

        // panel on the updated A2 = A(n1:m, n1:n)
        arginfo = magma_dgetrf_recpanel_nopiv_batched_q(
                           m2, n2, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ, dW5_displ,
                           info_array, gbstep+n1, batchCount, stream, myhandle);
    }

cleanup:
    magma_free(dA_displ);
    return arginfo;
}
/**
    Batched construction of the k-by-k triangular factor T from the
    reflector vectors in v_array and the scalars in tau_array, one T per
    batch entry.

    Phase 1 forms V^H * V (k x k, inner dimension n) for every batch entry
    into a staging buffer "Tstep" and then zeroes its lower triangle.
    Tstep is work_array when k > nb and T_array itself otherwise.

    Phase 2 walks over block columns of width nb, from left to right.  For
    block column j:
      - the rectangle above the diagonal block (rows 0..j-1) is computed with
        a GEMM that multiplies the already finished leading part of T by the
        matching rectangle of Tstep, followed by a series of recdtrmv kernel
        calls over chunks of at most nb rows;
      - when stair_T == 0 the diagonal block itself is computed with the
        dtrmv kernel.

    @param[in]  n            Inner dimension of the V^H * V product
                             (number of rows of V).
    @param[in]  k            Order of T (number of reflectors).  Nothing is
                             done when k <= 0.
    @param[in]  stair_T      0 to compute all of T; > 0 when the diagonal
                             blocks of size stair_T are already computed, in
                             which case only the rectangular parts are
                             updated.  Must not exceed max_shared_bsiz.
    @param[in]  v_array      Array of pointers to the V matrices.
    @param[in]  ldv          Leading dimension of each V.
    @param[in]  tau_array    Array of pointers to the tau vectors.
    @param[in,out] T_array   Array of pointers to the T matrices.
    @param[in]  ldt          Leading dimension of each T.
    @param      work_array   Array of pointers to workspaces, used as Tstep
                             when k > nb.
    @param[in]  lwork        Size of each workspace; must be >= k*ldt.
    @param[in]  batchCount   Number of matrices in the batch.
    @param[in]  myhandle     cuBLAS handle used for the batched GEMMs when
                             RFT_MAG_GEM is not defined.
    @param[in]  queue        Queue passed to the MAGMA kernels.

    @return  0 on success or when there is nothing to do;
             -10 if lwork is too small; -3 if stair_T exceeds max_shared_bsiz;
             MAGMA_ERR_DEVICE_ALLOC if a pointer workspace cannot be allocated.
*/
extern "C" magma_int_t
magma_dlarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T,
                double **v_array, magma_int_t ldv,
                double **tau_array, double **T_array, magma_int_t ldt,
                double **work_array, magma_int_t lwork,
                magma_int_t batchCount, cublasHandle_t myhandle, magma_queue_t queue)
{
    if( k <= 0) return 0;
    // the diagonal blocks are already done and there is nothing above them
    if( stair_T > 0 && k <= stair_T) return 0;

    magma_int_t maxnb = max_shared_bsiz;

    if( lwork < k*ldt)
    {
        magma_xerbla( __func__, -(10) );
        return -10;
    }

    if( stair_T > 0 && stair_T > maxnb)
    {
        magma_xerbla( __func__, -(3) );
        return -3;
    }

    magma_int_t DEBUG=0;
    magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T;

    magma_int_t i, j, prev_n, mycol, rows;

    double **dW1_displ    = NULL;
    double **dW2_displ    = NULL;
    double **dW3_displ    = NULL;
    double **dTstep_array = NULL;

    magma_malloc((void**)&dW1_displ,    batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,    batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,    batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array));

    /* check allocation: a NULL pointer array must never reach the kernels */
    if ( dW1_displ == NULL || dW2_displ == NULL ||
         dW3_displ == NULL || dTstep_array == NULL ) {
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dTstep_array);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    // Tstep = k > nb ? work : T
    if(k > nb)
    {
        magma_ddisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue);
    }
    else
    {
        magma_ddisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue);
    }

    //magma_int_t ldtstep = k > nb ? k : ldt;
    magma_int_t ldtstep = ldt; // same leading dimension whichever buffer is used
    // stair_T = 0 meaning all T
    // stair_T > 0 meaning the triangular portion of T has been computed.
    //             the value of stair_T is the nb of these triangulars

    // Phase 1: Tstep := V^H * V for the whole k x k block.
    // TODO add cublas to check perf
#ifdef RFT_MAG_GEM
    magmablas_dgemm_batched( MagmaConjTrans, MagmaNoTrans,
                             k, k, n,
                             one,  v_array, ldv,
                                   v_array, ldv,
                             zero, dTstep_array, ldtstep,
                             batchCount, queue);
#else
    cublasDgemmBatched(myhandle, CUBLAS_OP_C, CUBLAS_OP_N, k, k, n,
                             &one, (const double**) v_array, ldv,
                                   (const double**) v_array, ldv,
                             &zero, dTstep_array, ldtstep, batchCount);
#endif

    // clear the lower triangle of Tstep (both dlaset values are zero)
    magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount, queue);
    // no need for it as T is expected to be lower zero
    //if(k > nb) magmablas_dlaset_batched(MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount);

    // Phase 2 (TRMV):
    //   T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1)   i=[1:k]
    // The TRMV is split over blocks of columns of size nb.
    // The update has to be done from top to bottom, so:
    // 1- a gemm using the previously computed columns
    //    of T updates the rectangular upper portion above
    //    the triangle of the current columns;
    // 2- the columns are then updated by a serial
    //    loop of gemv over themselves.  Since the
    //    shared memory is limited to nb, these nb columns
    //    are split vertically in chunks of nb rows.
    for(j=0; j<k; j+=nb)
    {
        prev_n = j;
        mycol  = min(nb, k-j);
        // note that myrow = prev_n + mycol;
        if(prev_n>0 && mycol>0){
            if(DEBUG==3) printf("doing gemm on the rectangular portion of size %d %d of T(%d,%d)\n",(int) prev_n,(int) mycol,0,(int) j);

            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue);
            magma_ddisplace_pointers(dW2_displ, T_array, ldt, 0, j, batchCount, queue);
#ifdef RFT_MAG_GEM
            magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans,
                                     prev_n, mycol, prev_n,
                                     one,  T_array, ldt,
                                           dW1_displ, ldtstep,
                                     zero, dW2_displ, ldt,
                                     batchCount, queue );
#else
            cublasDgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                     prev_n, mycol, prev_n,
                                     &one, (const double**) T_array, ldt,
                                           (const double**) dW1_displ, ldtstep,
                                     &zero, dW2_displ, ldt, batchCount);
#endif

            // update my rectangular portion (prev_n,mycol) using sequence of gemv
            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
            magma_ddisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue);

            for(i=0; i<prev_n; i+=nb)
            {
                rows = min(nb,prev_n-i);
                if(DEBUG==3) printf(" doing recdtrmv on the rectangular portion of size %d %d of T(%d,%d)\n",(int) rows,(int) mycol,(int) i,(int) j);

                if(rows>0 && mycol>0)
                {
                    magma_ddisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue);
                    magmablas_dlarft_recdtrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue);
                }
            }
        }

        // the upper rectangular portion is updated, now if needed update the triangular portion
        if(stair_T == 0){
            if(DEBUG==3) printf("doing dtrmv on the triangular portion of size %d %d of T(%d,%d)\n",(int) mycol,(int) mycol,(int) j,(int) j);

            if(mycol>0)
            {
                magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
                magma_ddisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue);
                magma_ddisplace_pointers(dW2_displ, T_array, ldt, j, j, batchCount, queue);
                magmablas_dlarft_dtrmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue);
            }
        }
    }// end of j

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dTstep_array);

    return 0;
}