extern "C" magma_int_t magma_dpotf2_dtrsm_batched( magma_uplo_t uplo, magma_int_t m, magma_int_t n, double **dA_array, magma_int_t lda, double **dA_displ, double **dB_displ, double **dC_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { magma_int_t j; magma_int_t arginfo = 0; if ( m > MAX_NTHREADS ) { printf("magma_dpotf2_dtrsm_batched m=%lld > %lld not supported today\n", (long long) m, (long long) MAX_NTHREADS ); arginfo = -13; return arginfo; } // Quick return if possible if (n == 0) { return arginfo; } double alpha = MAGMA_D_NEG_ONE; double beta = MAGMA_D_ONE; if (uplo == MagmaUpper) { printf("Upper side is unavailable\n"); } else { for (j = 0; j < n; j++) { magma_dpotf2_ddot_batched(j, dA_array, lda, j, info_array, gbstep, batchCount, queue); // including ddot product and update a(j,j) if (j < n) { #ifdef COMPLEX magma_dlacgv_batched(j, dA_array, lda, j, batchCount, queue); #endif magma_ddisplace_pointers(dA_displ, dA_array, lda, j+1, 0, batchCount, queue); magma_ddisplace_pointers(dB_displ, dA_array, lda, j, 0, batchCount, queue); magma_ddisplace_pointers(dC_displ, dA_array, lda, j+1, j, batchCount, queue); // Compute elements J+1:N of column J = A(j+1:n,1:j-1) * A(j,1:j-1) (row). magmablas_dgemv_batched( MagmaNoTrans, m-j-1, j, alpha, dA_displ, lda, dB_displ, lda, beta, dC_displ, 1, batchCount, queue ); #ifdef COMPLEX magma_dlacgv_batched(j, dA_array, lda, j, batchCount, queue); #endif magma_dpotf2_dscal_batched(m-j, dA_array, 1, j+j*lda, info_array, batchCount, queue); } } } return arginfo; }
/** \n This is an internal routine. ********************************************************************/ extern "C" magma_int_t magma_dpotrf_panel_batched( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, double** dA_array, magma_int_t ldda, double** dX_array, magma_int_t dX_length, double** dinvA_array, magma_int_t dinvA_length, double** dW0_displ, double** dW1_displ, double** dW2_displ, double** dW3_displ, double** dW4_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { magma_int_t arginfo = 0; //=============================================== // panel factorization //=============================================== if (n < nb) { printf("magma_dpotrf_panel error n < nb %d < %d \n",(int) n, (int) nb); return -101; } #if 0 arginfo = magma_dpotf2_batched( uplo, n, nb, dA_array, ldda, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); #else arginfo = magma_dpotf2_batched( uplo, nb, nb, dA_array, ldda, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); if ((n-nb) > 0) { magma_ddisplace_pointers(dW0_displ, dA_array, ldda, nb, 0, batchCount, queue); magmablas_dtrsm_work_batched( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, n-nb, nb, MAGMA_D_ONE, dA_array, ldda, dW0_displ, ldda, dX_array, n-nb, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue ); } #endif return arginfo; }
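/*
    In the panel above, potf2 factors the nb x nb diagonal block into L11 and the
    TRSM overwrites the (n-nb) x nb block A21 below it with A21 * L11^{-T}.
    A minimal single-matrix sketch of that triangular solve in plain C;
    panel_trsm_ref is an illustrative name, not a MAGMA routine.
*/
// Overwrite the mm x nb block A21 with A21 * L11^{-T}, where L11 is lower triangular.
static void panel_trsm_ref(int mm, int nb, const double *L11, int ldl,
                           double *A21, int lda)
{
    for (int j = 0; j < nb; ++j) {              // solve for column j of the result
        double ljj = L11[j + j*ldl];
        for (int i = 0; i < mm; ++i) {
            double s = A21[i + j*lda];
            for (int k = 0; k < j; ++k)         // subtract the already-solved columns
                s -= A21[i + k*lda] * L11[j + k*ldl];
            A21[i + j*lda] = s / ljj;
        }
    }
}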
extern "C" magma_int_t magma_dgetrf_panel_nopiv_batched_q( magma_int_t m, magma_int_t nb, double** dA_array, magma_int_t ldda, double** dX_array, magma_int_t dX_length, double** dinvA_array, magma_int_t dinvA_length, double** dW0_displ, double** dW1_displ, double** dW2_displ, double** dW3_displ, double** dW4_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t stream, cublasHandle_t myhandle) { magma_int_t arginfo = 0; //=============================================== // panel factorization //=============================================== if(m < nb){ printf("magma_dgetrf_panel_nopiv_batched_q m < nb %d < %d \n",(int) m, (int) nb); return -101; } #if 0 arginfo = magma_dgetf2_nopiv_batched( m, nb, dA_array, ldda, dW1_displ, dW2_displ, dW3_displ, info_array, gbstep, batchCount, myhandle); if (arginfo != 0) return arginfo; #else arginfo = magma_dgetf2_nopiv_batched( nb, nb, dA_array, ldda, dW1_displ, dW2_displ, dW3_displ, info_array, gbstep, batchCount, myhandle); if (arginfo != 0) return arginfo; if((m-nb) > 0){ magma_ddisplace_pointers(dW0_displ, dA_array, ldda, nb, 0, batchCount); magmablas_dtrsm_work_batched(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 1, m-nb, nb, MAGMA_D_ONE, dA_array, ldda, dW0_displ, ldda, dX_array, m-nb, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount); } #endif return 0; }
/***************************************************************************//**
    Purpose
    -------
    DGEQRF computes a QR factorization of a real M-by-N matrix A:
    A = Q * R.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N).
             On entry, the M-by-N matrix A.
             On exit, the elements on and above the diagonal of the array
             contain the min(M,N)-by-N upper trapezoidal matrix R (R is
             upper triangular if m >= n); the elements below the diagonal,
             with the array TAU, represent the orthogonal matrix Q as a
             product of min(m,n) elementary reflectors (see Further Details).

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,M).
            To benefit from coalesced memory accesses LDDA must be
            divisible by 16.

    @param[in,out]
    dR_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, dimension (LDDR, N/NB).
             dR should be of size (LDDR, N) when provide_RT > 0 and
             of size (LDDR, NB) otherwise. NB is the local blocking size.
             On exit, the elements of R are stored in dR only when provide_RT > 0.

    @param[in]
    lddr    INTEGER
            The leading dimension of the array dR.
            LDDR >= min(M,N) when provide_RT == 1,
            otherwise LDDR >= min(NB, min(M,N)).
            NB is the local blocking size.
            To benefit from coalesced memory accesses LDDR must be
            divisible by 16.

    @param[in,out]
    dT_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array on the GPU, dimension (LDDT, N/NB).
             dT should be of size (LDDT, N) when provide_RT > 0 and
             of size (LDDT, NB) otherwise. NB is the local blocking size.
             On exit, the elements of T are stored in dT only when provide_RT > 0.

    @param[in]
    lddt    INTEGER
            The leading dimension of the array dT.
            LDDT >= min(NB, min(M,N)). NB is the local blocking size.
            To benefit from coalesced memory accesses LDDT must be
            divisible by 16.

    @param[out]
    dtau_array Array of pointers, dimension (batchCount).
             Each is a DOUBLE PRECISION array, dimension (min(M,N)).
             The scalar factors of the elementary reflectors
             (see Further Details).

    @param[in]
    provide_RT INTEGER
               provide_RT = 0: no R and no T in output;
                           dR and dT are used as local workspace to store the
                           R and T of each step.
               provide_RT = 1: the whole R of size (min(M,N), N) and the
                           nb x nb blocks of T are provided in output.
               provide_RT = 2: the nb x nb diagonal blocks of R and of T are
                           provided in output.

    @param[out]
    info_array  Array of INTEGERs, dimension (batchCount), for corresponding matrices.
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.

    @param[in]
    batchCount  INTEGER
                The number of matrices to operate on.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
@ingroup magma_geqrf_batched *******************************************************************************/ extern "C" magma_int_t magma_dgeqrf_expert_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, double **dR_array, magma_int_t lddr, double **dT_array, magma_int_t lddt, double **dtau_array, magma_int_t provide_RT, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) /* Local Parameter */ magma_int_t nb = magma_get_dgeqrf_batched_nb(m); magma_int_t nnb = 8; magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; else if (lddr < min_mn && provide_RT == 1) arginfo = -6; else if (lddr < min(min_mn, nb)) arginfo = -6; else if (lddt < min(min_mn, nb)) arginfo = -8; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if (min_mn == 0 ) return arginfo; if ( m > 2048 || n > 2048 ) { printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t i, k, ib=nb, jb=nnb, offset_RT=0, use_stream; magma_int_t ldw, offset; double **dW0_displ = NULL; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dW4_displ = NULL; double **dW5_displ = NULL; double **dR_displ = NULL; double **dT_displ = NULL; double *dwork = NULL; double **cpuAarray = NULL; double **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_displ, batchCount * sizeof(*dR_displ)); magma_malloc((void**)&dT_displ, batchCount * sizeof(*dT_displ)); magma_dmalloc(&dwork, (2 * nb * n) * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(double*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_displ == NULL || dT_displ == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_ddisplace_pointers(dR_displ, dR_array, lddr, 0, 0, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, 0, batchCount, queue); // set dwork to zero because our GEMM routine does propagate NAN when C=betaC+alphaA*B and beta=0 magmablas_dlaset_q( MagmaFull, 2*nb, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, 2*nb, queue ); // set dR and dT 
to zero. if provide_RT == 0 only a tile of size nbxnb is used and overwritten at each step magmablas_dlaset_batched( MagmaFull, lddr, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dR_displ, lddr, batchCount, queue ); magmablas_dlaset_batched( MagmaFull, lddt, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dT_displ, lddt, batchCount, queue ); /* if ( provide_RT > 0 ) { magmablas_dlaset_q( MagmaFull, lddr, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } else { magmablas_dlaset_q( MagmaFull, lddr, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } */ magma_int_t streamid; const magma_int_t nbstreams=10; magma_queue_t queues[nbstreams]; for (i=0; i < nbstreams; i++) { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[i] ); } magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1, queue); magma_getvector( batchCount, sizeof(double*), dT_array, 1, cpuTarray, 1, queue); for (i=0; i < min_mn; i += nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); if ( provide_RT > 0 ) { offset_RT = i; magma_ddisplace_pointers(dR_displ, dR_array, lddr, (provide_RT == 1 ? offset_RT:0), offset_RT, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, offset_RT, batchCount, queue); } //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_dgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dR_displ, lddr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, queue); //=============================================== // end of panel //=============================================== //=============================================== // update trailing matrix //=============================================== if ( (n-ib-i) > 0) { //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; magma_dset_pointer( dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; magma_dset_pointer( dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); // set the diagonal of v as one and the upper triangular part as zero already set inside geqrf_panel //magmablas_dlaset_batched( MagmaUpper, ib, ib, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); //magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_dlarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dW4_displ, nb*lddt, batchCount, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- use_stream = magma_drecommend_cublas_gemm_stream(MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib); if ( use_stream ) { magma_queue_sync(queue); for (k=0; k < batchCount; k++) { streamid = k%nbstreams; // the queue gemm must take cpu pointer 
                    magma_dlarfb_gpu_gemm( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                        m-i, n-i-ib, ib,
                        cpuAarray[k] + i + i * ldda, ldda,
                        cpuTarray[k] + offset_RT*lddt, lddt,
                        cpuAarray[k] + i + (i+ib) * ldda, ldda,
                        dwork + nb * n * k, -1,
                        dwork + nb * n * batchCount + nb * n * k, -1, queues[streamid] );
                }

                // synchronize to be sure that the next panel does not start before
                // the update (at least of the next panel) has finished.
                // if queue is NULL, no need to sync
                if ( queue != NULL ) {
                    for (magma_int_t s=0; s < nbstreams; s++)
                        magma_queue_sync(queues[s]);
                }
            }
            //-------------------------------------------
            //          USE BATCHED GEMM
            //-------------------------------------------
            else {
                // direct the trailing matrix to dW1_displ
                magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue);
                magma_dlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                    m-i, n-i-ib, ib,
                    (const double**)dW0_displ, ldda,
                    (const double**)dT_displ,  lddt,
                    dW1_displ, ldda,
                    dW4_displ, ldw,
                    dW5_displ, ldw,
                    batchCount, queue );
            }
        } // update the trailing matrix
        //===============================================

        // copy dR back to V after the trailing matrix update,
        // only when provide_RT == 0; otherwise the nb x nb block of V keeps its 1/0 diagonal pattern.
        // The upper portion of V could be set totally to 0 here.
        if ( provide_RT == 0 ) {
            magmablas_dlacpy_batched( MagmaUpper, ib, ib, dR_displ, lddr, dW0_displ, ldda, batchCount, queue );
        }
    }

    magma_queue_sync(queue);

    for (k=0; k < nbstreams; k++) {
        magma_queue_destroy( queues[k] );
    }

    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dW5_displ);
    magma_free(dR_displ);
    magma_free(dT_displ);
    magma_free(dwork);
    magma_free_cpu(cpuAarray);
    magma_free_cpu(cpuTarray);
    return arginfo;
}
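/*
    Hedged usage sketch for magma_dgeqrf_expert_batched (illustrative only):
    it shows how the pointer arrays the routine expects can be built from
    contiguous device buffers with the magma_dset_pointer helper used above.
    The matrix sizes, the padding of ldda, the provide_RT = 1 setting, and the
    function name example_dgeqrf_expert_batched_usage are assumptions made for
    this example; error checking and the upload of the input matrices are omitted.
*/
static magma_int_t
example_dgeqrf_expert_batched_usage(magma_queue_t queue, magma_int_t batchCount)
{
    magma_int_t m = 64, n = 32;
    magma_int_t ldda  = ((m + 31)/32)*32;                 // padded leading dimension
    magma_int_t nb    = magma_get_dgeqrf_batched_nb( m );
    magma_int_t minmn = min( m, n );
    magma_int_t lddr  = minmn;                            // provide_RT == 1: whole R returned
    magma_int_t lddt  = min( nb, minmn );

    double *dA, *dR, *dT, *dtau;
    magma_dmalloc( &dA,   ldda * n * batchCount );
    magma_dmalloc( &dR,   lddr * n * batchCount );
    magma_dmalloc( &dT,   lddt * n * batchCount );
    magma_dmalloc( &dtau, minmn    * batchCount );

    double **dA_array, **dR_array, **dT_array, **dtau_array;
    magma_int_t *info_array;
    magma_malloc( (void**)&dA_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&dR_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&dT_array,   batchCount * sizeof(double*) );
    magma_malloc( (void**)&dtau_array, batchCount * sizeof(double*) );
    magma_malloc( (void**)&info_array, batchCount * sizeof(magma_int_t) );

    // point entry i of each pointer array at matrix i inside the contiguous buffers
    magma_dset_pointer( dA_array,   dA,   ldda, 0, 0, ldda*n, batchCount, queue );
    magma_dset_pointer( dR_array,   dR,   lddr, 0, 0, lddr*n, batchCount, queue );
    magma_dset_pointer( dT_array,   dT,   lddt, 0, 0, lddt*n, batchCount, queue );
    magma_dset_pointer( dtau_array, dtau, 1,    0, 0, minmn,  batchCount, queue );

    // ... fill dA with the batch of matrices to factor, then:
    magma_int_t info = magma_dgeqrf_expert_batched(
                           m, n, dA_array, ldda, dR_array, lddr,
                           dT_array, lddt, dtau_array, 1 /* provide_RT */,
                           info_array, batchCount, queue );

    magma_free( dA_array );   magma_free( dR_array );
    magma_free( dT_array );   magma_free( dtau_array );
    magma_free( info_array );
    magma_free( dA );  magma_free( dR );  magma_free( dT );  magma_free( dtau );
    return info;
}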
/**
    \n This is an internal routine.
********************************************************************/
extern "C" magma_int_t
magma_dpotrf_recpanel_batched(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magma_int_t min_recpnb,
    double** dA_array,    magma_int_t ldda,
    double** dX_array,    magma_int_t dX_length,
    double** dinvA_array, magma_int_t dinvA_length,
    double** dW0_displ, double** dW1_displ,
    double** dW2_displ, double** dW3_displ,
    double** dW4_displ,
    magma_int_t *info_array, magma_int_t gbstep,
    magma_int_t batchCount, magma_queue_t queue)
{
    magma_int_t arginfo = 0;

    // Quick return if possible
    if (m == 0 || n == 0) {
        return arginfo;
    }

    if (uplo == MagmaUpper) {
        printf("Upper side is unavailable \n");
        arginfo = -1;
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    if (m < n) {
        printf("error m < n %d < %d \n", (int) m, (int) n);
        arginfo = -101;
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    double **dA_displ = NULL;
    magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ));

    double alpha = MAGMA_D_NEG_ONE;
    double beta  = MAGMA_D_ONE;
    magma_int_t panel_nb = n;
    if (panel_nb <= min_recpnb) {
        //printf("calling bottom panel recursive with m=%d nb=%d\n",m,n);
        // panel factorization
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue);
        //magma_dpotrf_rectile_batched(uplo, m, panel_nb, 16,
        arginfo = magma_dpotrf_panel_batched(uplo, m, panel_nb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, queue);
    }
    else {
        // split A into two panels [A1 A2]:
        // factor the panel A1, update A2, then factor the panel A2
        magma_int_t n1 = n/2;
        magma_int_t n2 = n-n1;
        magma_int_t m1 = m;
        magma_int_t m2 = m-n1;
        magma_int_t p1 = 0;
        magma_int_t p2 = n1;

        // panel on A1
        //printf("calling recursive panel on A1 with m=%d nb=%d min_recpnb %d\n",m1,n1,min_recpnb);
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue);
        arginfo = magma_dpotrf_recpanel_batched(
                           uplo, m1, n1, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, queue);
        if (arginfo != 0) {
            magma_free(dA_displ);
            return arginfo;
        }

        // update A2
        //printf("calling update A2 with m=%d n=%d k=%d\n",m2,n2,n1);
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, p1+n1, p1, batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, p1+n1, p2, batchCount, queue);
        magma_dgemm_batched( MagmaNoTrans, MagmaConjTrans,
                             m2, n2, n1,
                             alpha, dA_displ, ldda,
                                    dA_displ, ldda,
                             beta,  dW0_displ, ldda,
                             batchCount, queue );

        // panel on A2
        //printf("calling recursive panel on A2 with m=%d nb=%d min_recpnb %d\n",m2,n2,min_recpnb);
        magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount, queue);
        arginfo = magma_dpotrf_recpanel_batched(
                           uplo, m2, n2, min_recpnb,
                           dA_displ, ldda,
                           dX_array, dX_length,
                           dinvA_array, dinvA_length,
                           dW0_displ, dW1_displ, dW2_displ,
                           dW3_displ, dW4_displ,
                           info_array, gbstep,
                           batchCount, queue);
    }

    magma_free(dA_displ);
    return arginfo;
}
/** \n This is an internal routine. ********************************************************************/ extern "C" magma_int_t magma_dpotrf_rectile_batched( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t min_recpnb, double** dA_array, magma_int_t ldda, double** dX_array, magma_int_t dX_length, double** dinvA_array, magma_int_t dinvA_length, double** dW0_displ, double** dW1_displ, double** dW2_displ, double** dW3_displ, double** dW4_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { //magma_int_t DEBUG=0; // Quick return if possible if (m == 0 || n == 0) { return 1; } if (uplo == MagmaUpper) { printf("Upper side is unavailable \n"); return -100; } if (m < n) { printf("error m < n %d < %d \n", (int) m, (int) n); return -101; } double **dA_displ = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); double alpha = MAGMA_D_NEG_ONE; double beta = MAGMA_D_ONE; magma_int_t panel_nb = n; if (panel_nb <= min_recpnb) { // if (DEBUG == 1) printf("calling bottom panel recursive with n=%d\n",(int) panel_nb); // panel factorization magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue); magma_dpotrf_panel_batched( uplo, m, panel_nb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); } else { // split A over two [A11 A12; A21 A22; A31 A32] // panel on tile A11, // trsm on A21, using A11 // update on A22 then panel on A22. // finally a trsm on [A31 A32] using the whole [A11 A12; A21 A22] magma_int_t n1 = n/2; magma_int_t n2 = n-n1; magma_int_t p1 = 0; magma_int_t p2 = n1; // panel on A11 //if (DEBUG == 1) printf("calling recursive panel on A11=A(%d,%d) with n=%d min_recpnb %d\n",(int) p1, (int) p1, (int) n1, (int) min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue); magma_dpotrf_rectile_batched( uplo, n1, n1, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); // TRSM on A21 //if (DEBUG == 1) printf("calling trsm on A21=A(%d,%d) using A11 == A(%d,%d) with m=%d k=%d \n",p2,p1,p1,p1,n2,n1); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, p2, p1, batchCount, queue); magmablas_dtrsm_work_batched( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, n2, n1, MAGMA_D_ONE, dA_displ, ldda, dW0_displ, ldda, dX_array, n2, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue ); // update A22 //if (DEBUG == 1) printf("calling update A22=A(%d,%d) using A21 == A(%d,%d) with m=%d n=%d k=%d\n",p2,p2,p2,p1,n2,n2,n1); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p1, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, p2, p2, batchCount, queue); // NEED TO BE REPLACED BY HERK magma_dgemm_batched( MagmaNoTrans, MagmaConjTrans, n2, n2, n1, alpha, dA_displ, ldda, dA_displ, ldda, beta, dW0_displ, ldda, batchCount, queue ); // panel on A22 //if (DEBUG == 1) printf("calling recursive panel on A22=A(%d,%d) with n=%d min_recpnb %d\n",p2,p2,n2,min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount, queue); magma_dpotrf_rectile_batched( uplo, n2, n2, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, 
gbstep, batchCount, queue); } if (m > n) { // TRSM on A3: //if (DEBUG == 1) printf("calling trsm AT THE END on A3=A(%d,%d): using A1222 == A(%d,%d) with m=%d k=%d \n",n,0,0,0,m-n,n); magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, n, 0, batchCount, queue); magmablas_dtrsm_work_batched( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, m-n, n, MAGMA_D_ONE, dA_displ, ldda, dW0_displ, ldda, dX_array, m-n, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue ); } magma_free(dA_displ); return 0; }
extern "C" magma_int_t magma_dgetf2_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, double **dW0_displ, double **dW1_displ, double **dW2_displ, magma_int_t **ipiv_array, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { magma_int_t arginfo = 0; if (m < 0) { arginfo = -1; } else if (n < 0 ) { arginfo = -2; } else if (ldda < max(1,m)) { arginfo = -4; } if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } // Quick return if possible if (m == 0 || n == 0) { return arginfo; } double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t nb = BATF2_NB; magma_int_t min_mn = min(m, n); magma_int_t gbj, panelj, step, ib; for( panelj=0; panelj < min_mn; panelj += nb) { ib = min(nb, min_mn-panelj); for (step=0; step < ib; step++) { gbj = panelj+step; //size_t required_shmem_size = zamax*(sizeof(double)+sizeof(int)) + (m-panelj+2)*sizeof(double); //if ( (m-panelj) > 0) if ( (m-panelj) > MAX_NTHREADS) //if ( required_shmem_size > (MAX_SHARED_ALLOWED*1024)) { //printf("running non shared version\n"); // find the max of the column gbj arginfo = magma_idamax_batched(m-gbj, dA_array, 1, gbj, ldda, ipiv_array, info_array, gbstep, batchCount, queue); if (arginfo != 0 ) return arginfo; // Apply the interchange to columns 1:N. swap the whole row arginfo = magma_dswap_batched(n, dA_array, ldda, gbj, ipiv_array, batchCount, queue); if (arginfo != 0 ) return arginfo; // Compute elements J+1:M of J-th column. if (gbj < m) { arginfo = magma_dscal_dger_batched( m-gbj, ib-step, gbj, dA_array, ldda, info_array, gbstep, batchCount, queue ); if (arginfo != 0 ) return arginfo; } } else { //printf("running --- shared version\n"); arginfo = magma_dcomputecolumn_batched(m-panelj, panelj, step, dA_array, ldda, ipiv_array, info_array, gbstep, batchCount, queue); if (arginfo != 0 ) return arginfo; // Apply the interchange to columns 1:N. swap the whole row arginfo = magma_dswap_batched(n, dA_array, ldda, gbj, ipiv_array, batchCount, queue); if (arginfo != 0 ) return arginfo; } } if ( (n-panelj-ib) > 0) { // continue the update of the selected ib row column panelj+ib:n(TRSM) magma_dgetf2trsm_batched(ib, n-panelj-ib, dA_array, panelj, ldda, batchCount, queue); // do the blocked DGER = DGEMM for the remaining panelj+ib:n columns magma_ddisplace_pointers(dW0_displ, dA_array, ldda, ib+panelj, panelj, batchCount, queue); magma_ddisplace_pointers(dW1_displ, dA_array, ldda, panelj, ib+panelj, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dA_array, ldda, ib+panelj, ib+panelj, batchCount, queue); magma_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m-(panelj+ib), n-(panelj+ib), ib, c_neg_one, dW0_displ, ldda, dW1_displ, ldda, c_one, dW2_displ, ldda, batchCount, queue ); } } //magma_free_cpu(cpuAarray); return 0; }
//===================================================================================================================
//===================================================================================================================
//===================================================================================================================
extern "C" magma_int_t
magma_dlarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T,
                double **v_array, magma_int_t ldv,
                double **tau_array, double **T_array, magma_int_t ldt,
                double **work_array, magma_int_t lwork,
                magma_int_t batchCount, magma_queue_t queue)
{
    double c_one  = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;

    if ( k <= 0) return 0;
    if ( stair_T > 0 && k <= stair_T) return 0;

    magma_int_t maxnb = max_shared_bsiz;

    if ( lwork < k*ldt) {
        magma_xerbla( __func__, -(10) );
        return -10;
    }

    if ( stair_T > 0 && stair_T > maxnb) {
        magma_xerbla( __func__, -(3) );
        return -3;
    }

    magma_int_t DEBUG=0;
    magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T;
    magma_int_t i, j, prev_n, mycol, rows;

    double **dW1_displ  = NULL;
    double **dW2_displ  = NULL;
    double **dW3_displ  = NULL;
    double **dTstep_array  = NULL;

    magma_malloc((void**)&dW1_displ,  batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,  batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,  batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dTstep_array,  batchCount * sizeof(*dTstep_array));

    //double *Tstep = k > nb ? work : T;
    if (k > nb) {
        magma_ddisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue);
    }
    else {
        magma_ddisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue);
    }

    //magma_int_t ldtstep = k > nb ? k : ldt;
    magma_int_t ldtstep = ldt; // to be removed
    // stair_T == 0 means compute all of T;
    // stair_T > 0 means the triangular portion of T has already been computed,
    //             and the value of stair_T is the blocking size nb of those triangular blocks.
    // GEMM computes the whole upper portion W = V^T * V of T (phase 1)
    // TODO: add cuBLAS to check performance

    magma_dgemm_batched( MagmaConjTrans, MagmaNoTrans,
                         k, k, n,
                         c_one,  v_array, ldv,
                                 v_array, ldv,
                         c_zero, dTstep_array, ldtstep,
                         batchCount, queue );

    magmablas_dlaset_batched( MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount, queue );
    // no need for it as T is expected to be lower zero
    //if (k > nb) magmablas_dlaset_batched( MagmaLower, k, k, MAGMA_D_ZERO, MAGMA_D_ZERO, dTstep_array, ldtstep, batchCount, queue );

    //TRMV
    //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k]
    // TRMV is split over blocks of columns of size nb.
    // The update should be done from top to bottom, so:
    // 1- a gemm using the previously computed columns
    //    of T to update the rectangular upper portion above
    //    the triangle of my columns;
    // 2- the columns are then updated by a serial loop of gemv
    //    over themselves. Since we limit the shared memory to nb,
    //    these nb columns are split vertically into chunks of nb rows.

    dim3 grid(1, 1, batchCount);

    for (j=0; j < k; j += nb) {
        prev_n = j;
        mycol  = min(nb, k-j);
        // note that myrow = prev_n + mycol;
        if (prev_n > 0 && mycol > 0) {
            if (DEBUG == 3) {
                printf("doing gemm on the rectangular portion of size %d %d of T(%d,%d)\n",
                        (int) prev_n, (int) mycol, 0, (int) j );
            }

            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue);
            magma_ddisplace_pointers(dW2_displ, T_array,      ldt,     0, j, batchCount, queue);
            magma_dgemm_batched( MagmaNoTrans, MagmaNoTrans,
                                 prev_n, mycol, prev_n,
                                 c_one,  T_array, ldt,
                                         dW1_displ, ldtstep,
                                 c_zero, dW2_displ, ldt,
                                 batchCount, queue );

            // update my rectangular portion (prev_n, mycol) using a sequence of gemv
            magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
            magma_ddisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue);

            for (i=0; i < prev_n; i += nb) {
                rows = min(nb,prev_n-i);
                if (DEBUG == 3) {
                    printf("    doing recdtrmv on the rectangular portion of size %d %d of T(%d,%d)\n",
                            (int) rows, (int) mycol, (int) i, (int) j );
                }

                if (rows > 0 && mycol > 0) {
                    magma_ddisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue);
                    magmablas_dlarft_recdtrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue);
                }
            }
        }

        // the upper rectangular portion is updated; now, if needed, update the triangular portion
        if (stair_T == 0) {
            if (DEBUG == 3) {
                printf("doing dtrmv on the triangular portion of size %d %d of T(%d,%d)\n",
                        (int) mycol, (int) mycol, (int) j, (int) j );
            }

            if (mycol > 0) {
                magma_ddisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
                magma_ddisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue);
                magma_ddisplace_pointers(dW2_displ, T_array,      ldt,     j, j, batchCount, queue);
                magmablas_dlarft_dtrmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue);
            }
        }
    } // end of j

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dTstep_array);

    return 0;
}
extern "C" magma_int_t magma_dpotf2_batched( magma_uplo_t uplo, magma_int_t m, magma_int_t n, double **dA_array, magma_int_t lda, double **dA_displ, double **dW_displ, double **dB_displ, double **dC_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { magma_int_t arginfo=0; // Quick return if possible if (n == 0) { return 1; } double alpha = MAGMA_D_NEG_ONE; double beta = MAGMA_D_ONE; magma_int_t nb = POTF2_NB; magma_int_t j, ib, rows; magma_int_t crossover = magma_get_dpotrf_batched_crossover(); if (uplo == MagmaUpper) { printf("Upper side is unavailable\n"); } else { if ( n <= crossover ) { arginfo = magma_dpotrf_lpout_batched(uplo, n, dA_array, lda, gbstep, info_array, batchCount, queue); } else { for (j = 0; j < n; j += nb) { ib = min(nb, n-j); rows = m-j; if ( (rows <= POTF2_TILE_SIZE) && (ib <= POTF2_TILE_SIZE) ) { magma_ddisplace_pointers(dA_displ, dA_array, lda, j, j, batchCount, queue); arginfo = magma_dpotf2_tile_batched( uplo, rows, ib, dA_displ, lda, info_array, gbstep, batchCount, queue); } else { magma_ddisplace_pointers(dA_displ, dA_array, lda, j, j, batchCount, queue); magma_dpotf2_dtrsm_batched( uplo, rows, ib, dA_displ, lda, dW_displ, dB_displ, dC_displ, info_array, gbstep, batchCount, queue); } #if 1 //#define RIGHT_LOOKING if ( (n-j-ib) > 0) { #ifdef RIGHT_LOOKING magma_ddisplace_pointers(dA_displ, dA_array, lda, j+ib, j, batchCount, queue); magma_ddisplace_pointers(dC_displ, dA_array, lda, j+ib, j+ib, batchCount, queue); magma_dgemm_batched( MagmaNoTrans, MagmaConjTrans, m-j-ib, n-j-ib, ib, alpha, dA_displ, lda, dA_displ, lda, beta, dC_displ, lda, batchCount, queue ); #else // update next subpanel magma_ddisplace_pointers(dA_displ, dA_array, lda, j+ib, 0, batchCount, queue); magma_ddisplace_pointers(dC_displ, dA_array, lda, j+ib, j+ib, batchCount, queue); magma_dgemm_batched( MagmaNoTrans, MagmaConjTrans, m-j-ib, min((n-j-ib),ib), j+ib, alpha, dA_displ, lda, dA_displ, lda, beta, dC_displ, lda, batchCount, queue ); #endif } // end of if ( (n-j-ib) > 0) #endif } } } return arginfo; }
extern "C" magma_int_t magma_dgeqrf_panel_batched( magma_int_t m, magma_int_t n, magma_int_t nb, double** dA_array, magma_int_t ldda, double** tau_array, double** dT_array, magma_int_t ldt, double** dR_array, magma_int_t ldr, double** dW0_displ, double** dW1_displ, double *dwork, double** dW2_displ, double** dW3_displ, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { magma_int_t j, jb; magma_int_t ldw = nb; magma_int_t minmn = min(m,n); for( j=0; j < minmn; j += nb) { jb = min(nb, minmn-j); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); magma_ddisplace_pointers(dW2_displ, tau_array, 1, j, 0, batchCount, queue); magma_ddisplace_pointers(dW3_displ, dR_array, ldr, j, j, batchCount, queue); // //sub-panel factorization magma_dgeqr2_batched( m-j, jb, dW0_displ, ldda, dW2_displ, info_array, batchCount, queue); //copy th whole rectangular n,jb from of dA to dR (it's lower portion (which is V's) will be set to zero if needed at the end) magma_ddisplace_pointers(dW0_displ, dA_array, ldda, 0, j, batchCount, queue); magma_ddisplace_pointers(dW3_displ, dR_array, ldr, 0, j, batchCount, queue); magmablas_dlacpy_batched( MagmaFull, minmn, jb, dW0_displ, ldda, dW3_displ, ldr, batchCount, queue ); //set the upper jbxjb portion of V dA(j,j) to 1/0s (note that the rectangular on the top of this triangular of V still non zero but has been copied to dR). magma_ddisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); magmablas_dlaset_batched( MagmaUpper, jb, jb, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); if ( (n-j-jb) > 0) //update the trailing matrix inside the panel { magma_dlarft_sm32x32_batched(m-j, jb, dW0_displ, ldda, dW2_displ, dT_array, ldt, batchCount, queue); magma_ddisplace_pointers( dW1_displ, dA_array, ldda, j, j + jb, batchCount, queue ); magma_dset_pointer( dW2_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); magma_dset_pointer( dW3_displ, dwork + ldw*n*batchCount, 1, 0, 0, ldw*n, batchCount, queue ); magma_dlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-j, n-j-jb, jb, (const double**)dW0_displ, ldda, (const double**)dT_array, ldt, dW1_displ, ldda, dW2_displ, ldw, dW3_displ, ldw, batchCount, queue ); } } // copy the remaining portion of dR from dA in case m < n if ( m < n ) { magma_ddisplace_pointers(dW0_displ, dA_array, ldda, 0, minmn, batchCount, queue); magma_ddisplace_pointers(dW3_displ, dR_array, ldr, 0, minmn, batchCount, queue); magmablas_dlacpy_batched( MagmaFull, minmn, n-minmn, dW0_displ, ldda, dW3_displ, ldr, batchCount, queue ); } // to be consistent set the whole upper nbxnb of V to 0/1s, in this case no need to set it inside dgeqrf_batched magma_ddisplace_pointers(dW0_displ, dA_array, ldda, 0, 0, batchCount, queue); magmablas_dlaset_batched( MagmaUpper, minmn, n, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); return MAGMA_SUCCESS; }
extern "C" magma_int_t magma_dgetf2_nopiv_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, double **dW0_displ, double **dW1_displ, double **dW2_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { magma_int_t arginfo = 0; if (m < 0) { arginfo = -1; } else if (n < 0 ) { arginfo = -2; } else if (ldda < max(1,m)) { arginfo = -4; } if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } // Quick return if possible if (m == 0 || n == 0) { return arginfo; } double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; magma_int_t nb = BATF2_NB; magma_int_t min_mn = min(m, n); magma_int_t gbj, panelj, step, ib; for( panelj=0; panelj < min_mn; panelj += nb) { ib = min(nb, min_mn-panelj); for (step=0; step < ib; step++) { gbj = panelj+step; #if 0 size_t required_shmem_size = ((m-panelj)*ib)*sizeof(double); if ( required_shmem_size > (MAX_SHARED_ALLOWED*1024)) #else if ( (m-panelj) > 0) #endif { // Compute elements J+1:M of J-th column. if (gbj < m) { arginfo = magma_dscal_dger_batched( m-gbj, ib-step, gbj, dA_array, ldda, info_array, gbstep, batchCount, queue ); if (arginfo != 0 ) return arginfo; } } else { // TODO } } if ( (n-panelj-ib) > 0) { // continue the update of the selected ib row column panelj+ib:n(TRSM) magma_dgetf2trsm_batched(ib, n-panelj-ib, dA_array, panelj, ldda, batchCount, queue); // do the blocked DGER = DGEMM for the remaining panelj+ib:n columns magma_ddisplace_pointers(dW0_displ, dA_array, ldda, ib+panelj, panelj, batchCount, queue); magma_ddisplace_pointers(dW1_displ, dA_array, ldda, panelj, ib+panelj, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dA_array, ldda, ib+panelj, ib+panelj, batchCount, queue); magma_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m-(panelj+ib), n-(panelj+ib), ib, c_neg_one, dW0_displ, ldda, dW1_displ, ldda, c_one, dW2_displ, ldda, batchCount, queue ); } } //magma_free_cpu(cpuAarray); return 0; }
extern "C" magma_int_t magma_dgetrf_recpanel_nopiv_batched_q( magma_int_t m, magma_int_t n, magma_int_t min_recpnb, double** dA_array, magma_int_t ldda, double** dX_array, magma_int_t dX_length, double** dinvA_array, magma_int_t dinvA_length, double** dW1_displ, double** dW2_displ, double** dW3_displ, double** dW4_displ, double** dW5_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t stream, cublasHandle_t myhandle) { // Quick return if possible if (m == 0 || n == 0) { return 0; } magma_int_t arginfo = 0; double **dA_displ = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_int_t panel_nb = n; if(panel_nb <= min_recpnb){ // if(DEBUG>0)printf("calling bottom panel recursive with m=%d nb=%d\n",m,n); // panel factorization //magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount); arginfo = magma_dgetrf_panel_nopiv_batched_q( m, panel_nb, dA_array, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep, batchCount, stream, myhandle); if (arginfo != 0) return arginfo; } else{ // split A over two [A A2] // panel on A1, update on A2 then panel on A1 magma_int_t n1 = n/2; magma_int_t n2 = n-n1; magma_int_t m1 = m; magma_int_t m2 = m-n1; magma_int_t p1 = 0; magma_int_t p2 = n1; // panel on A1 //printf("calling recursive panel on A1 with m=%d nb=%d min_recpnb %d\n",m1,n1,min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount); arginfo = magma_dgetrf_recpanel_nopiv_batched_q( m1, n1, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep, batchCount, stream, myhandle); if (arginfo != 0) return arginfo; // update A2 //printf("calling update A2 with m=%d n=%d k=%d\n",m2,n2,n1); magma_ddisplace_pointers(dW5_displ, dA_array, ldda, p1, p2, batchCount); magmablas_dtrsm_work_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1, n1, n2, MAGMA_D_ONE, dA_displ, ldda, // dA dW5_displ, ldda, // dB dX_array, n1, // dX dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount); magma_ddisplace_pointers(dW1_displ, dA_array, ldda, p2, 0, batchCount); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount); magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m2, n2, n1, MAGMA_D_NEG_ONE, dW1_displ, ldda, dW5_displ, ldda, MAGMA_D_ONE, dA_displ, ldda, batchCount); // panel on A2 //printf("calling recursive panel on A2 with m=%d nb=%d min_recpnb %d\n",m2,n2,min_recpnb); arginfo = magma_dgetrf_recpanel_nopiv_batched_q( m2, n2, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep+p2, batchCount, stream, myhandle); if (arginfo != 0) return arginfo; } magma_free(dA_displ); return 0; }
/**
    Purpose
    -------
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf_batched(
        magma_int_t m, magma_int_t n,
        double **dA_array,
        magma_int_t ldda,
        magma_int_t **ipiv_array,
        magma_int_t *info_array,
        magma_int_t batchCount, magma_queue_t queue)
{
#define A(i_, j_)    (A + (i_) + (j_)*ldda)

    magma_int_t min_mn = min(m, n);
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    /* Check arguments */
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible */
    if (min_mn == 0)
        return arginfo;

    if ( m > 2048 || n > 2048 ) {
        printf("=========================================================================================\n");
        printf(" WARNING: batched routines are designed for small sizes. It might be better to use the\n Native/Hybrid classical routines if you want performance.\n");
        printf("=========================================================================================\n");
    }

//#define ENABLE_TIMER3
#if defined(ENABLE_TIMER3)
    real_Double_t tall=0.0, tloop=0., talloc=0., tdalloc=0.;
    tall   = magma_sync_wtime(0);
    talloc = magma_sync_wtime(0);
#endif

    double neg_one = MAGMA_D_NEG_ONE;
    double one  = MAGMA_D_ONE;
    magma_int_t ib, i, k, pm;
    magma_int_t nb = BATRF_NB;
    magma_int_t gemm_crossover = nb > 32 ?
127 : 160; // magma_int_t gemm_crossover = n;// use only stream gemm #if defined(USE_CUOPT) cublasHandle_t myhandle; cublasCreate_v2(&myhandle); #else cublasHandle_t myhandle=NULL; #endif magma_int_t **dipiv_displ = NULL; double **dA_displ = NULL; double **dW0_displ = NULL; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dW4_displ = NULL; double **dinvA_array = NULL; double **dwork_array = NULL; magma_malloc((void**)&dipiv_displ, batchCount * sizeof(*dipiv_displ)); magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t dwork_msize = n*nb; magma_int_t **pivinfo_array = NULL; magma_int_t *pivinfo = NULL; double* dinvA = NULL; double* dwork = NULL;// dinvA and dwork are workspace in dtrsm double **cpuAarray = NULL; magma_dmalloc( &dinvA, invA_msize * batchCount); magma_dmalloc( &dwork, dwork_msize * batchCount ); magma_malloc((void**)&pivinfo_array, batchCount * sizeof(*pivinfo_array)); magma_malloc((void**)&pivinfo, batchCount * m * sizeof(magma_int_t)); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*)); /* check allocation */ if ( dA_displ == NULL || dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL || cpuAarray == NULL || dipiv_displ == NULL || pivinfo_array == NULL || pivinfo == NULL) { magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); magma_free(dipiv_displ); magma_free(pivinfo_array); magma_free(pivinfo); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_dlaset_q(MagmaFull, invA_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dinvA, invA_msize, queue); magmablas_dlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, dwork_msize, queue); dset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue); dset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); set_ipointer(pivinfo_array, pivinfo, 1, 0, 0, m, batchCount, queue); // printf(" I am in dgetrfbatched\n"); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(i=0; i<nbstreams; i++){ magma_queue_create( &stream[i] ); } magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1); #if defined(ENABLE_TIMER3) printf(" I am after malloc\n"); talloc = magma_sync_wtime(0) - talloc; tloop = magma_sync_wtime(0); #endif for(i = 0; i < min_mn; i+=nb) { magmablasSetKernelStream(NULL); ib = min(nb, min_mn-i); pm = m-i; magma_idisplace_pointers(dipiv_displ, ipiv_array, ldda, i, 0, batchCount, queue); magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue); 
//=============================================== // panel factorization //=============================================== #if 0 arginfo = magma_dgetf2_batched( pm, ib, dA_displ, ldda, dW1_displ, dW2_displ, dW3_displ, dipiv_displ, info_array, i, batchCount, myhandle); #else arginfo = magma_dgetrf_recpanel_batched( pm, ib, 16, dA_displ, ldda, dipiv_displ, pivinfo_array, dwork_array, nb, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, i, batchCount, myhandle, queue); #endif if(arginfo != 0 ) goto fin; //=============================================== // end of panel //=============================================== #define RUN_ALL #ifdef RUN_ALL // setup pivinfo before adjusting ipiv setup_pivinfo_batched(pivinfo_array, dipiv_displ, pm, ib, batchCount, queue); adjust_ipiv_batched(dipiv_displ, ib, i, batchCount, queue); // stepinit_ipiv(pivinfo_array, pm, batchCount);// for debug and check swap, it create an ipiv #if 0 dlaswp_batched( i, dA_displ, ldda, i, i+ib, dipiv_displ, pivinfo_array, batchCount); #else magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, 0, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, 0, batchCount, queue); magma_dlaswp_rowparallel_batched( i, dA_displ, ldda, dW0_displ, ldda, i, i+ib, pivinfo_array, batchCount, queue); #endif if( (i + ib) < n) { // swap right side and trsm magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i+ib, batchCount, queue); dset_pointer(dwork_array, dwork, nb, 0, 0, dwork_msize, batchCount, queue); // I don't think it is needed Azzam magma_dlaswp_rowparallel_batched( n-(i+ib), dA_displ, ldda, dwork_array, nb, i, i+ib, pivinfo_array, batchCount, queue); magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i+ib, batchCount, queue); magmablas_dtrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1, ib, n-i-ib, MAGMA_D_ONE, dA_displ, ldda, // dA dwork_array, nb, // dB dW0_displ, ldda, // dX dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue); if( (i + ib) < m) { // if gemm size is >160 use a streamed classical cublas gemm since it is faster // the batched is faster only when M=N<=160 for K40c //------------------------------------------- // USE STREAM GEMM //------------------------------------------- if( (m-i-ib) > gemm_crossover && (n-i-ib) > gemm_crossover) { //printf("caling streamed dgemm %d %d %d \n", m-i-ib, n-i-ib, ib); // since it use different stream I need to wait the TRSM and swap. 
                    // But since the code uses the NULL stream everywhere, the NULL stream
                    // does the synchronization by itself, so the explicit sync is not needed.
                    //magma_queue_sync(NULL);
                    for (k=0; k < batchCount; k++)
                    {
                        streamid = k%nbstreams;
                        magmablasSetKernelStream(stream[streamid]);
                        magma_dgemm(MagmaNoTrans, MagmaNoTrans,
                                m-i-ib, n-i-ib, ib,
                                neg_one, cpuAarray[k] + (i+ib)+i*ldda, ldda,
                                         cpuAarray[k] + i+(i+ib)*ldda, ldda,
                                one,     cpuAarray[k] + (i+ib)+(i+ib)*ldda, ldda);
                    }
                    // need to synchronize to be sure that dgetf2 does not start before
                    // the update (at least of the next panel) has finished.
                    // BUT there is no need for it as long as the other portion of the code
                    // uses the NULL stream, which does the sync by itself.
                    //magma_device_sync();
                }
                //-------------------------------------------
                //          USE BATCHED GEMM
                //-------------------------------------------
                else
                {
                    magma_ddisplace_pointers(dA_displ,  dA_array, ldda, i+ib,    i, batchCount, queue);
                    magma_ddisplace_pointers(dW1_displ, dA_array, ldda,    i, i+ib, batchCount, queue);
                    magma_ddisplace_pointers(dW2_displ, dA_array, ldda, i+ib, i+ib, batchCount, queue);
                    //printf("calling batched dgemm %d %d %d \n", m-i-ib, n-i-ib, ib);
                    magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib,
                                             neg_one, dA_displ,  ldda,
                                                      dW1_displ, ldda,
                                             one,     dW2_displ, ldda,
                                             batchCount, queue);
                } // end of batched/stream gemm
            } // end of if( (i + ib) < m)
        } // end of if( (i + ib) < n)
#endif
    } // end of for

fin:
    magma_queue_sync(NULL);

#if defined(ENABLE_TIMER3)
    tloop   = magma_sync_wtime(0) - tloop;
    tdalloc = magma_sync_wtime(0);
#endif

    for (i=0; i < nbstreams; i++) {
        magma_queue_destroy( stream[i] );
    }
    magmablasSetKernelStream(cstream);

#if defined(USE_CUOPT)
    cublasDestroy_v2(myhandle);
#endif

    magma_free(dA_displ);
    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dinvA_array);
    magma_free(dwork_array);
    magma_free( dinvA );
    magma_free( dwork );
    magma_free_cpu(cpuAarray);
    magma_free(dipiv_displ);
    magma_free(pivinfo_array);
    magma_free(pivinfo);

#if defined(ENABLE_TIMER3)
    tdalloc = magma_sync_wtime(0) - tdalloc;
    tall = magma_sync_wtime(0) - tall;
    printf("here is the timing from inside dgetrf_batched talloc: %10.5f tloop: %10.5f tdalloc: %10.5f tall: %10.5f sum: %10.5f\n", talloc, tloop, tdalloc, tall, talloc+tloop+tdalloc );
#endif

    return arginfo;
}
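/*
    Hedged usage sketch for magma_dgetrf_batched (illustrative only): setting up
    the pointer arrays and factoring a batch of square matrices. It reuses the
    dset_pointer / set_ipointer helpers that appear above; helper names may differ
    between MAGMA versions. The sizes, the padding of ldda, and the function name
    example_dgetrf_batched_usage are assumptions made for this example; error
    checking and the upload of the input matrices are omitted.
*/
static magma_int_t
example_dgetrf_batched_usage(magma_queue_t queue, magma_int_t batchCount)
{
    magma_int_t m = 128, n = 128;
    magma_int_t ldda  = ((m + 31)/32)*32;     // padded leading dimension
    magma_int_t minmn = min( m, n );

    double       *dA;
    magma_int_t  *dipiv, *dinfo;
    double      **dA_array;
    magma_int_t **dipiv_array;

    magma_dmalloc( &dA,    ldda * n * batchCount );
    magma_imalloc( &dipiv, minmn    * batchCount );
    magma_imalloc( &dinfo, batchCount );
    magma_malloc( (void**)&dA_array,    batchCount * sizeof(double*) );
    magma_malloc( (void**)&dipiv_array, batchCount * sizeof(magma_int_t*) );

    dset_pointer( dA_array,    dA,    ldda, 0, 0, ldda*n, batchCount, queue );
    set_ipointer( dipiv_array, dipiv, 1,    0, 0, minmn,  batchCount, queue );

    // ... upload the matrices into dA, then factor the whole batch:
    magma_int_t info = magma_dgetrf_batched( m, n, dA_array, ldda,
                                             dipiv_array, dinfo,
                                             batchCount, queue );

    // dinfo[i] follows the LAPACK convention documented above
    // (0 = success, > 0 = U(i,i) exactly singular).
    magma_free( dA_array );
    magma_free( dipiv_array );
    magma_free( dA );
    magma_free( dipiv );
    magma_free( dinfo );
    return info;
}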