extern "C" magma_int_t magma_cgeqrf_batched( magma_int_t m, magma_int_t n, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **tau_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) // A(i, j) means at i row, j column magma_int_t min_mn = min(m, n); cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); /* Check arguments */ magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if(min_mn == 0 ) return arginfo; if( m > 2048 || n > 2048 ){ printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t nb = 32; magma_int_t nnb = 8; magma_int_t i, k, ib=nb, jb=nnb; magma_int_t ldw, ldt, ldr, offset; cublasHandle_t myhandle; cublasCreate_v2(&myhandle); magmaFloatComplex **dW0_displ = NULL; magmaFloatComplex **dW1_displ = NULL; magmaFloatComplex **dW2_displ = NULL; magmaFloatComplex **dW3_displ = NULL; magmaFloatComplex **dW4_displ = NULL; magmaFloatComplex **dW5_displ = NULL; magmaFloatComplex *dwork = NULL; magmaFloatComplex *dT = NULL; magmaFloatComplex *dR = NULL; magmaFloatComplex **dR_array = NULL; magmaFloatComplex **dT_array = NULL; magmaFloatComplex **cpuAarray = NULL; magmaFloatComplex **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); // used in clarfb magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array)); magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array)); ldt = ldr = min(nb, min_mn); magma_cmalloc(&dwork, (2 * nb * n) * batchCount); magma_cmalloc(&dR, ldr * n * batchCount); magma_cmalloc(&dT, ldt * ldt * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaFloatComplex*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(magmaFloatComplex*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_free(dwork); free(cpuAarray); free(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_claset_q(MagmaFull, ldr, n*batchCount , MAGMA_C_ZERO, MAGMA_C_ZERO, dR, ldr, queue); magmablas_claset_q(MagmaFull, ldt, ldt*batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dT, ldt, queue); cset_pointer(dR_array, dR, 1, 0, 0, ldr*min(nb, min_mn), batchCount, queue); cset_pointer(dT_array, dT, 1, 0, 0, ldt*min(nb, 
min_mn), batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(i=0; i<nbstreams; i++){ magma_queue_create( &stream[i] ); } magma_getvector( batchCount, sizeof(magmaFloatComplex*), dA_array, 1, cpuAarray, 1); magma_getvector( batchCount, sizeof(magmaFloatComplex*), dT_array, 1, cpuTarray, 1); magmablasSetKernelStream(NULL); for(i=0; i<min_mn;i+=nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_cdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_cdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue); //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_cgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_array, ldt, dR_array, ldr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, myhandle, queue); //=============================================== // end of panel //=============================================== //direct panel matrix V in dW0_displ, magma_cdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); // copy the upper part of V into dR cgeqrf_copy_upper_batched(ib, jb, dW0_displ, ldda, dR_array, ldr, batchCount, queue); //=============================================== // update trailing matrix //=============================================== //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; cset_pointer(dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; cset_pointer(dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); if( (n-ib-i) > 0) { // set the diagonal of v as one and the upper triangular part as zero magmablas_claset_batched(MagmaUpper, ib, ib, MAGMA_C_ZERO, MAGMA_C_ONE, dW0_displ, ldda, batchCount, queue); magma_cdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_clarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_array, ldt, dW4_displ, nb*ldt, batchCount, myhandle, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- if( (m-i) > 100 && (n-i-ib) > 100) { // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_device_sync(); for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); // the stream gemm must take cpu pointer magma_clarfb_gpu_gemm(MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, cpuAarray[k] + i + i * ldda, ldda, cpuTarray[k], ldt, cpuAarray[k] + i + (i+ib) * ldda, ldda, dwork + nb * n * k, -1, dwork + nb * n * batchCount + nb * n * k, -1); } // need to synchronise to be sure that panel does not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); magmablasSetKernelStream(NULL); } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { //direct trailing matrix in dW1_displ 
magma_cdisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_clarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, (const magmaFloatComplex**)dW0_displ, ldda, (const magmaFloatComplex**)dT_array, ldt, dW1_displ, ldda, dW4_displ, ldw, dW5_displ, ldw, batchCount, myhandle, queue); } }// update the trailing matrix //=============================================== // copy dR back to V after the trailing matrix update magmablas_clacpy_batched(MagmaUpper, ib, ib, dR_array, ldr, dW0_displ, ldda, batchCount, queue); } for(k=0; k<nbstreams; k++){ magma_queue_destroy( stream[k] ); } magmablasSetKernelStream(cstream); cublasDestroy_v2(myhandle); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_free(dwork); free(cpuAarray); free(cpuTarray); return arginfo; }
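/*
 * A minimal usage sketch for magma_cgeqrf_batched -- illustrative only,
 * assuming magma.h is included and omitting error checks.  The pointer
 * arrays are built with cset_pointer using the same argument convention
 * as inside the routine above: entry k points to the k-th matrix stored
 * contiguously in dA (stride ldda*n) and to the k-th tau vector (stride
 * min_mn).
 */
static void example_cgeqrf_batched( magma_int_t m, magma_int_t n, magma_int_t batchCount )
{
    magma_queue_t queue;
    magma_queue_create( &queue );

    magma_int_t ldda   = ((m+31)/32)*32;   // padded leading dimension
    magma_int_t min_mn = min(m, n);

    magmaFloatComplex *dA = NULL, *dtau = NULL;
    magmaFloatComplex **dA_array = NULL, **dtau_array = NULL;
    magma_int_t *dinfo_array = NULL;

    magma_cmalloc( &dA,   ldda * n * batchCount );
    magma_cmalloc( &dtau, min_mn  * batchCount );
    magma_malloc( (void**)&dA_array,    batchCount * sizeof(magmaFloatComplex*) );
    magma_malloc( (void**)&dtau_array,  batchCount * sizeof(magmaFloatComplex*) );
    magma_malloc( (void**)&dinfo_array, batchCount * sizeof(magma_int_t) );

    // ... fill dA with the batch of m-by-n matrices (e.g. magma_csetmatrix) ...

    cset_pointer( dA_array,   dA,   ldda, 0, 0, ldda*n, batchCount, queue );
    cset_pointer( dtau_array, dtau, 1,    0, 0, min_mn, batchCount, queue );

    magma_cgeqrf_batched( m, n, dA_array, ldda, dtau_array,
                          dinfo_array, batchCount, queue );

    // R is in the upper triangles of dA, the Householder vectors below them,
    // and per-matrix status codes are in dinfo_array.

    magma_free( dA );        magma_free( dtau );
    magma_free( dA_array );  magma_free( dtau_array );
    magma_free( dinfo_array );
    magma_queue_destroy( queue );
}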
extern "C" magma_int_t magma_chetrd(char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, float *d, float *e, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CHETRD reduces a complex Hermitian matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**H * A * Q = T. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). D (output) COMPLEX array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) COMPLEX array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB is the optimal blocksize given by magma_get_chetrd_nb(). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). 
The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t ldda = lda; magma_int_t nb = magma_get_chetrd_nb(n); magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; float d_one = MAGMA_D_ONE; magma_int_t kk, nx; magma_int_t i, j, i_n; magma_int_t iinfo; magma_int_t ldwork, lddwork, lwkopt; magma_int_t lquery; *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } /* Determine the block size. */ ldwork = lddwork = n; lwkopt = n * nb; if (*info == 0) { work[0] = MAGMA_C_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magmaFloatComplex *da; if (MAGMA_SUCCESS != magma_cmalloc( &da, n*ldda + 2*n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloatComplex *dwork = da + (n)*ldda; if (n < 2048) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ magma_csetmatrix( n, n, A(0, 0), lda, dA(0, 0), ldda ); /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ kk = n - (n - nx + nb - 1) / nb * nb; for (i = n - nb; i >= kk; i -= nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i!=n-nb) magma_cgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), lda ); magma_clatrd(uplo, i+nb, nb, A(0, 0), lda, e, tau, work, ldwork, dA(0, 0), ldda, dwork, lddwork); /* Update the unreduced submatrix A(0:i-2,0:i-2), using an update of the form: A := A - V*W' - W*V' */ magma_csetmatrix( i + nb, nb, work, ldwork, dwork, lddwork ); magma_cher2k(uplo, MagmaNoTrans, i, nb, c_neg_one, dA(0, i), ldda, dwork, lddwork, d_one, dA(0, 0), ldda); /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { *A(j-1,j) = MAGMA_C_MAKE( e[j - 1], 0 ); d[j] = MAGMA_C_REAL( *A(j, j) ); } } magma_cgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), lda ); /* Use unblocked code to reduce the last or only block */ lapackf77_chetd2(uplo_, &kk, A(0, 0), &lda, d, e, tau, &iinfo); } else { /* Copy the matrix to the GPU */ if (1<=n-nx) magma_csetmatrix( n, n, A(0,0), lda, dA(0,0), ldda ); #ifdef FAST_HEMV // TODO this leaks memory from da, above magmaFloatComplex *dwork2; if (MAGMA_SUCCESS != magma_cmalloc( &dwork2, n*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i!=0) magma_cgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), lda ); #ifdef FAST_HEMV magma_clatrd2(uplo, n-i, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA(i, i), ldda, dwork, lddwork, dwork2, n*n); #else 
magma_clatrd(uplo, n-i, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA(i, i), ldda, dwork, lddwork); #endif /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' */ magma_csetmatrix( n-i, nb, work, ldwork, dwork, lddwork ); magma_cher2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one, dA(i+nb, i), ldda, &dwork[nb], lddwork, d_one, dA(i+nb, i+nb), ldda); /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { *A(j+1,j) = MAGMA_C_MAKE( e[j], 0 ); d[j] = MAGMA_C_REAL( *A(j, j) ); } } #ifdef FAST_HEMV magma_free( dwork2 ); #endif /* Use unblocked code to reduce the last or only block */ if (1<=n-nx) magma_cgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), lda ); i_n = n-i; lapackf77_chetrd(uplo_, &i_n, A(i, i), &lda, &d[i], &e[i], &tau[i], work, &lwork, &iinfo); } magma_free( da ); work[0] = MAGMA_C_MAKE( lwkopt, 0 ); return *info; } /* magma_chetrd */
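/*
 * Calling-sequence sketch for magma_chetrd (illustrative names; assumes
 * magma.h is included).  It uses the standard LAPACK-style workspace query
 * with lwork = -1 before the actual reduction; A is assumed to hold the
 * Hermitian matrix on entry.
 */
static void example_chetrd( magma_int_t n, magmaFloatComplex *A, magma_int_t lda )
{
    magma_int_t info, lwork;
    float *d, *e;
    magmaFloatComplex *tau, *work, query;

    magma_smalloc_cpu( &d,   n );
    magma_smalloc_cpu( &e,   n-1 );
    magma_cmalloc_cpu( &tau, n-1 );

    lwork = -1;                                    // workspace query
    magma_chetrd( 'L', n, A, lda, d, e, tau, &query, lwork, &info );
    lwork = (magma_int_t) MAGMA_C_REAL( query );   // optimal size (n*nb)
    magma_cmalloc_cpu( &work, lwork );

    // reduce the lower triangle of A to real tridiagonal form T:
    // d = diagonal of T, e = subdiagonal, tau = reflector scalars
    magma_chetrd( 'L', n, A, lda, d, e, tau, work, lwork, &info );

    magma_free_cpu( d );   magma_free_cpu( e );
    magma_free_cpu( tau ); magma_free_cpu( work );
}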
int main( int argc, char** argv ) { magma_init(); cublasHandle_t handle; cudaSetDevice( 0 ); cublasCreate( &handle ); magmaFloatComplex *A, *B, *C; magmaFloatComplex *dA, *dB, *dC; float error, work[1]; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_int_t ISEED[4] = { 1, 2, 3, 4 }; magma_int_t n, lda, ldda, size, info; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf(" N |dC - C|/|C|\n"); printf("====================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // for this simple case, all matrices are N-by-N n = opts.nsize[itest]; lda = n; ldda = ((n+31)/32)*32; magma_cmalloc_cpu( &A, lda*n ); magma_cmalloc_cpu( &B, lda*n ); magma_cmalloc_cpu( &C, lda*n ); magma_cmalloc( &dA, ldda*n ); magma_cmalloc( &dB, ldda*n ); magma_cmalloc( &dC, ldda*n ); // initialize matrices size = lda*n; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); // increase diagonal to be SPD for( int i=0; i < n; ++i ) { C[i+i*lda] = MAGMA_C_ADD( C[i+i*lda], MAGMA_C_MAKE( n*n, 0 )); } magma_csetmatrix( n, n, A, lda, dA, ldda ); magma_csetmatrix( n, n, B, lda, dB, ldda ); magma_csetmatrix( n, n, C, lda, dC, ldda ); // compute with cublas cublasCgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda ); magma_cpotrf_gpu( MagmaLower, n, dC, ldda, &info ); if (info != 0) printf("magma_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute with LAPACK blasf77_cgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n, &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda ); lapackf77_cpotrf( MagmaLowerStr, &n, C, &lda, &info ); if (info != 0) printf("lapackf77_cpotrf returned error %d: %s.\n", (int) info, magma_strerror( info )); // compute difference, |dC - C| / |C| magma_cgetmatrix( n, n, dC, ldda, A, lda ); blasf77_caxpy( &size, &c_neg_one, C, &ione, A, &ione ); error = lapackf77_clange( "F", &n, &n, A, &lda, work ) / lapackf77_clange( "F", &n, &n, C, &lda, work ); printf( "%5d %8.2e %s\n", (int) n, error, (error < tol ? "ok" : "failed")); status += ! (error < tol); magma_free( dA ); magma_free( dB ); magma_free( dC ); magma_free_cpu( A ); magma_free_cpu( B ); magma_free_cpu( C ); fflush( stdout ); } } cublasDestroy( handle ); magma_finalize(); return status; }
extern "C" magma_int_t magma_cungqr2(magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. This version recomputes the T matrices on the CPU and sends them to the GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define A(i,j) ( A + (i) + (j)*lda ) #define dA(i,j) (dA + (i) + (j)*ldda) magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t nb = magma_get_cgeqrf_nb(min(m, n)); magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, ldda; magma_int_t i, ib, ki, kk; //, iinfo; magma_int_t lddwork; magmaFloatComplex *dA, *dV, *dW, *dT, *T; magmaFloatComplex *work; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); } else { ki = 0; kk = 0; } // Allocate GPU work space // ldda*n for matrix dA // ldda*nb for dV // lddwork*nb for dW larfb workspace ldda = ((m + 31) / 32) * 32; lddwork = ((n + 31) / 32) * 32; if (MAGMA_SUCCESS != magma_cmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dA + ldda*n; dW = dA + ldda*n + ldda*nb; dT = dA + ldda*n + ldda*nb + lddwork*nb; // Allocate CPU work space lwork = (n+m+nb) * nb; magma_cmalloc_cpu( &work, lwork ); T = work; if (work == NULL) { magma_free( dA ); magma_free_cpu( work ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magmaFloatComplex *V = work + (n+nb)*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; /* lapackf77_cungqr( &m_kk, &n_kk, &k_kk, A(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo ); */ lapackf77_clacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk); lapackf77_claset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &k_kk, V, &m_kk, &tau[kk], work, &k_kk); lapackf77_clarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr, &m_kk, &n_kk, &k_kk, V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk ); if (kk > 0) { magma_csetmatrix( m_kk, n_kk, A(kk, kk), lda, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_claset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } } if (kk > 0) { // Use blocked code // stream: set Aii (V) --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min(nb, k - i); // Send current panel to the GPU mi = m - i; lapackf77_claset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda ); magma_csetmatrix_async( mi, ib, A(i, i), lda, dV, ldda, stream ); lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &mi, &ib, A(i,i), &lda, &tau[i], T, &nb); magma_csetmatrix_async( ib, ib, T, nb, dT , nb, stream ); // set panel to identity magmablas_claset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_claset_identity( mi, ib, dA(i, i), ldda ); magma_queue_sync( stream ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT, nb, dA(i, i), ldda, dW, lddwork ); } } // copy result back to CPU magma_cgetmatrix( m, n, dA(0, 0), ldda, A(0, 0), lda); } magmablasSetKernelStream( NULL ); magma_queue_destroy( stream ); magma_free( dA ); magma_free_cpu( work ); return *info; } /* magma_cungqr */
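/*
 * Sketch of forming Q explicitly after a QR factorization (illustrative
 * names; assumes magma.h is included).  magma_cgeqrf leaves the Householder
 * vectors in A and the scalars in tau; magma_cungqr2 then overwrites the
 * first n columns of A with the orthonormal Q.  The lwork = n*nb sizing is
 * the usual rule of thumb, not taken from this file.
 */
static void example_form_q( magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda )
{
    magma_int_t info, min_mn = min(m, n);
    magma_int_t nb    = magma_get_cgeqrf_nb( min_mn );
    magma_int_t lwork = n * nb;
    magmaFloatComplex *tau, *work;

    magma_cmalloc_cpu( &tau,  min_mn );
    magma_cmalloc_cpu( &work, lwork );

    magma_cgeqrf ( m, n, A, lda, tau, work, lwork, &info );   // A = Q*R
    magma_cungqr2( m, n, min_mn, A, lda, tau, &info );         // A(:,1:n) <- Q

    magma_free_cpu( tau );
    magma_free_cpu( work );
}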
extern "C" magma_int_t magma_cbulge_applyQ_v2_m(magma_int_t ngpu, magma_side_t side, magma_int_t NE, magma_int_t N, magma_int_t NB, magma_int_t Vblksiz, magmaFloatComplex *E, magma_int_t lde, magmaFloatComplex *V, magma_int_t ldv, magmaFloatComplex *T, magma_int_t ldt, magma_int_t *info) { //%=========================== //% local variables //%=========================== magma_int_t Vm, Vn, mt, nt; magma_int_t myrow, mycol, blkj, blki; magma_int_t blkid,vpos,tpos; magma_int_t firstrow, nbcolinvolvd; magma_int_t versionL = 113; magma_int_t versionR = 92; magma_int_t Vchunksiz = 10; *info=0; /* Quick return */ if ( NE == 0 ) { return *info; } if ( N == 0 ) { return *info; } if ( NB == 0 ) { return *info; } /* ========================================== * some infos for developer * Initialisation and checking nb of cores * ==========================================*/ /* we have 2 algo for left (113 114) and 2 algo for right (91 92) * which correspond to versionL versionR. * They are very similar (detail explained in tech report and matlab code) * however version 114 and 92 improve locality. * while version 113 is used in case WNATZ=1 (construct Q2) which allow * the construction to be done in an optimized way taking into * consideration that the matrix is Identity so making less flops. * */ // Initialize streaming and events magma_device_sync(); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_int_t nbevents =2, nstream=2; magma_queue_t streams[MagmaMaxGPUs][20]; magma_event_t myevent[MagmaMaxGPUs][20]; for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } for( magma_int_t i = 0; i < nbevents; ++i ) { cudaEventCreateWithFlags(&myevent[dev][i],cudaEventDisableTiming); } } // Azzam 21/11/2012 // NOTE THAT dwork was of size 2*NE*Vblksiz+... // but I am thinking why not modifing it to NE*Vblksiz+... // BUT NO because the 2* is used because of making 2 streams working and so // they might be using dwork in parallel magmaFloatComplex *dE[MagmaMaxGPUs]; magmaFloatComplex *dwork[MagmaMaxGPUs], *dwork0[MagmaMaxGPUs], *dwork1[MagmaMaxGPUs]; //magmaFloatComplex *dwvt[MagmaMaxGPUs]; magmaFloatComplex *dwvt0[MagmaMaxGPUs], *dwvt1[MagmaMaxGPUs]; magmaFloatComplex *dT0[MagmaMaxGPUs], *dV0[MagmaMaxGPUs], *dT1[MagmaMaxGPUs], *dV1[MagmaMaxGPUs]; magma_int_t dev; magma_int_t ldde = N; magma_int_t lddv = ldv; magma_int_t lddt = ldt; magma_int_t ne_loc = magma_ceildiv(NE, ngpu); if (ne_loc < 256) ne_loc=256; magma_int_t dwVTsiz = lddv*Vblksiz; // lddv*lddv + lddv*NE; // lddv*Vblksiz; magma_int_t dworksiz = ne_loc*Vblksiz; // lddv*Vblksiz; // NE*Vblksiz; ngpu = min(ngpu, magma_ceildiv(NE,ne_loc)); // Don't use GPU that will not have data. // copy dE to GPUs for (dev=0; dev < ngpu; ++dev) { magma_setdevice( dev ); if (MAGMA_SUCCESS != magma_cmalloc( &dE[dev], ldde * ne_loc)) { printf ("!!!! magma_cbulge_applyQ magma_alloc failed for: dE\n" ); exit(-1); } if (MAGMA_SUCCESS != magma_cmalloc( &dwork[dev], 2*dworksiz + 2*dwVTsiz + 2*Vchunksiz* (Vblksiz* (lddv+lddt)) )) { printf ("!!!! 
magma_cbulge_applyQ magma_alloc failed for: dwork\n" ); exit(-1); } dwork0[dev] = dwork[dev]; // size = dworksiz; dwork1[dev] = dwork0[dev] + dworksiz; // size = dworksiz; dwvt0[dev] = dwork[dev] + 2*dworksiz; // size = dwVTsiz; dwvt1[dev] = dwvt0[dev] + dwVTsiz; // size = dwVTsiz; dV0[dev] = dwork[dev] + 2*dworksiz + 2*dwVTsiz; dT0[dev] = dV0[dev] + Vchunksiz*Vblksiz*lddv; dV1[dev] = dT0[dev] + Vchunksiz*Vblksiz*lddt; dT1[dev] = dV1[dev] + Vchunksiz*Vblksiz*lddv; magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_csetmatrix_async( N, ie_loc, E+lde*ne_loc*dev, lde, dE(dev, 0, 0), ldde, streams[dev][1] ); } // make overlapped copy magma_int_t ncpy = 0; magma_int_t copyed=0, copyst=0; magma_int_t blkcnt,nothing, mysiz, flip, vld,tld, locpos; findVTsiz(N, NB, Vblksiz, &blkcnt, ¬hing); flip = 0; /* SIDE LEFT meaning apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> so traverse Vs in reverse order (forward) from q_n to q_1 * Also E is splitten by row meaning each apply consist in a block of row (horizontal block) */ /* SIDE RIGHT meaning apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> so tarverse Vs in normal order (forward) from q_1 to q_n * Also E is splitten by col meaning each apply consist in a block of col (vertical block) */ #ifdef ENABLE_DEBUG printf(" APPLY Q_v22_m GPU with NGPU %d N %d, NE %d, NB %d, Vblksiz %d, versionL %d versionR %d SIDE %c \n", ngpu, N, NE, NB, Vblksiz, versionL, versionR, side); #endif /* * MagmamaLeft */ if (side == MagmaLeft) { /* * Version 113: * loop over the block_col (nt) and for each find the * number of tiles (mt) in this block_col. then loop over mt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ if ( versionL == 113 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=nt-1; blkj >= 0; blkj--) { /* the index of the first row on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=mt; blki > 0; blki--) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d vpos %d \n",blkj,blki,Vm, Vn,mycol,vpos); magma_bulge_findpos113(N, NB, Vblksiz, mycol, myrow, &blkid); // COPY Vchunksiz Vs and Vchunksiz Ts to GPU and store it in dV0/dV1 and dT0/dT1 if (ncpy == 0) { // flip = 1 for this. 
copyst = 0; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied if (mysiz > 0) { ncpy = 1; flip = 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_csetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]); magma_csetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]); } //printf("doing the first copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); } } if (blkid == copyst) { flip = ncpy % 2; copyst = copyed; // meaning that copy will start copying from blkid =copyst copyed = min(copyst+Vchunksiz, blkcnt); // meaning that copy will end copying at blkid =copyed-1==> next copy had to start at copyed mysiz = copyed-copyst; // the size of the chunk to be copied //printf(" get to copy blkid %d blkid+(2*Vchunksiz) %d copyst %d copyed %d\n",blkid,blkid+(Vchunksiz),copyst,copyed); if (mysiz > 0) { ncpy = ncpy + 1; vpos = copyst*Vblksiz*ldv; tpos = copyst*Vblksiz*ldt; vld = mysiz * ldv; tld = mysiz * ldt; if (flip == 0) { // now I am working on dV0 so copy the next and put it on dV1 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV1 dT1\n",mysiz,copyst,copyed,vpos,tpos); for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 1 ] ); magma_csetmatrix_async(vld, Vblksiz, V(vpos), vld, dV1[dev], vld, streams[dev][1]); magma_csetmatrix_async(tld, Vblksiz, T(tpos), tld, dT1[dev], tld, streams[dev][1]); } } else { // now I am working on dV1 so copy the next and put it on dV0 //printf("doing overlapping copy of mysiz %2d copyst %2d copyed %2d vpos %8d tpos %8d into dV0 dT0\n",mysiz,copyst,copyed,vpos,tpos); for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream( streams[ dev ][ 0 ] ); magma_csetmatrix_async(vld, Vblksiz, V(vpos), vld, dV0[dev], vld, streams[dev][0]); magma_csetmatrix_async(tld, Vblksiz, T(tpos), tld, dT0[dev], tld, streams[dev][0]); } } } } if ((Vm > 0) && (Vn > 0)) { locpos = blkid%Vchunksiz; magma_int_t lcvpos = locpos*Vblksiz*lddv; magma_int_t lctpos = locpos*Vblksiz*lddt; //printf("voici blkj %d blki %d Vm %d Vn %d mycol %d locvpos %5d loctpos %5d blkid %2d using data in dV%1d dT%1d \n",blkj,blki,Vm, Vn,mycol,lcvpos,lctpos, blkid,flip,flip); if (flip == 0) { for( dev = 0; dev < ngpu; ++dev ) { magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_int_t nr_bl = magma_ceildiv(ie_loc,10000); //nr of blocks magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][1] ); for (magma_int_t i=0; i < ie_loc; i += sz_bl) { ib = min(sz_bl, ie_loc-i); //magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], ib); magma_clarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0[dev]+lcvpos, lddv, dT0[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork0[dev], 
ib, dwvt0[dev], Vm); } magma_event_record( myevent[dev][0], streams[dev][0] ); } } else { for( dev = 0; dev < ngpu; ++dev ) { magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_int_t nr_bl = magma_ceildiv(ie_loc,10000); //nr of blocks magma_int_t sz_bl = magma_ceildiv(ie_loc,nr_bl*64)*64; //maximum size of blocks (to have blocks of around the same size and multiple of 64) magma_int_t ib; //size of current block magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][1]); magma_queue_wait_event( streams[dev][1], myevent[dev][0] ); for (magma_int_t i=0; i < ie_loc; i += sz_bl) { ib = min(sz_bl, ie_loc-i); //magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib); magma_clarfb_gpu_gemm( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV1[dev]+lcvpos, lddv, dT1[dev]+lctpos, lddt, dE(dev,myrow,i), ldde, dwork1[dev], ib, dwvt1[dev], Vm); } magma_event_record( myevent[dev][1], streams[dev][1] ); } } } // end for (Vm &Vn) > 0 } // end for blki } // end for blkj } // end if version=113 /* * Version 114: * loop over the block_row (mt) and for each find diagonally the * number of tiles (nt) in this block_row. then loop over nt, find * the size of the V's(Vm,Vn) and apply it to the corresponding * portion of E. */ else { printf("versionL 114 not implemented in cbulge_applyQ_v2_m\n"); exit(-1); mt = magma_ceildiv((N-1),NB); for (blki = mt; blki > 0; blki--) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = nt-1; blkj >= 0; blkj--) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. 
* Note that Vs and Ts have special storage done * by the bulgechasing function*/ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_csetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_csetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); //printf("voici blki %d rownbm %d mycol %d coled %d blkid %d vpos %d tpos %d\n", blki, rownbm, mycol, coled, blkid, vpos, tpos); for (magma_int_t i=0; i < NE; i += sz_bl) { ib = min(sz_bl, NE-i); magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, Vm, ib, Vn, dV0, lddv, dT0, lddt, dE(myrow,i), ldde, dwork, NE); } */ } // end for (Vm &Vn) > 0 } // end for blkj } // end for blki } // end version 114 } // end LEFT /* * MagmaRight */ else { printf("Side 'R' not implemented in cbulge_applyQ_v2_m\n"); exit(-1); /* * Version 91: */ if ( versionR == 91 ) { nt = magma_ceildiv((N-1),Vblksiz); for (blkj=0; blkj < nt; blkj++) { /* the index of the first myrow on the top of block (blkj) */ firstrow = blkj * Vblksiz + 1; /*find the number of tile for this block */ if ( blkj == nt-1 ) mt = magma_ceildiv( N - firstrow, NB); else mt = magma_ceildiv( N - (firstrow+1), NB); /*loop over the tiles find the size of the Vs and apply it */ for (blki=1; blki <= mt; blki++) { /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + (mt-blki)*NB; Vm = min( NB+Vblksiz-1, N-myrow); if ( (blkj == nt-1) && (blki == mt) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } mycol = blkj*Vblksiz; if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. * Note that Vs and Ts have special storage done * by the bulgechasing function*/ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_csetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_csetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); */ } // end for (Vm &Vn) > 0 } // end for blki } // end fo blkj } // end of version 91 /* * Version 92: */ else { mt = magma_ceildiv((N-1),NB); for (blki = 1; blki <= mt; blki++) { /* nbcolinvolvd = number of column corresponding to this block_row (blki) */ nbcolinvolvd = min(N-1, blki*NB); /*find the number of tile for this block (diagonal row of tiles) */ nt = magma_ceildiv(nbcolinvolvd,Vblksiz); /*loop over the tiles find the size of the Vs and apply it */ for (blkj = 0; blkj < nt; blkj++) { /* the index of the first row of the first col meaning * the block on the top left (blki) */ firstrow = (mt-blki)*NB+1; /*calculate the size of each losange of Vs= (Vm,Vn)*/ myrow = firstrow + blkj*Vblksiz; mycol = blkj*Vblksiz; Vm = min( NB+Vblksiz-1, N-myrow); if ( ( blkj == nt-1 ) && ( blki == mt ) ) { Vn = min (Vblksiz, Vm); } else { Vn = min (Vblksiz, Vm-1); } if ((Vm > 0) && (Vn > 0)) { /*calculate the pointer to the Vs and the Ts. 
* Note that Vs and Ts have special storage done * by the bulgechasing function*/ /* magma_bulge_findVTpos(N, NB, Vblksiz, mycol, myrow, ldv, ldt, &vpos, &tpos); magma_csetmatrix_async(Vm, Vn, V(vpos), ldv, dV0, lddv, NULL); magma_csetmatrix_async(Vn, Vn, T(tpos), ldt, dT0, lddt, NULL); magma_clarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaColumnwise, NE, Vm, Vn, dV0, lddv, dT0, lddt, dE(0, myrow), ldde, dwork, NE); */ } // end for (Vm &Vn) > 0 } //end for blkj } // end for blki } //end of version 92 } // end RIGHT // copy back the dE form each GPU for( dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][1] ); magma_queue_wait_event( streams[dev][0], myevent[dev][0] ); magma_int_t ie_loc = min(ne_loc, NE - ne_loc*dev); magma_cgetmatrix_async( N, ie_loc, dE(dev, 0, 0), ldde, E+lde*ne_loc*dev, lde, streams[dev][0] ); magma_event_record( myevent[dev][0], streams[dev][0] ); } for( magma_int_t dev = 0; dev < ngpu; ++dev ) { magma_setdevice( dev ); magmablasSetKernelStream(streams[dev][0]); magma_queue_wait_event( streams[dev][0], myevent[dev][0] ); magma_device_sync(); // no need for synchronize magma_free(dwork[dev]); magma_free(dE[dev]); for( magma_int_t i = 0; i < nbevents; ++i ) { magma_event_destroy( myevent[dev][i] ); } for( magma_int_t i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; }
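/*
 * The chunked dV0/dV1, dT0/dT1 copies above implement double buffering:
 * while one buffer feeds the larfb applications, the next chunk of V and T
 * is uploaded into the other buffer on a second stream, and events order
 * the hand-off across streams.  A stripped-down sketch of the same pattern
 * in plain CUDA follows (hypothetical process_chunk kernel; the host buffer
 * should be pinned for the copies to actually overlap with compute).
 */
#include <cuda_runtime.h>

__global__ void process_chunk( const float *buf, int n )
{
    // placeholder for the work that consumes the chunk (cf. clarfb above)
}

static void double_buffered( const float *host, int nchunks, int chunk )
{
    float       *dbuf[2];
    cudaStream_t strm[2];
    for (int b = 0; b < 2; ++b) {
        cudaMalloc( &dbuf[b], chunk * sizeof(float) );
        cudaStreamCreate( &strm[b] );
    }
    for (int k = 0; k < nchunks; ++k) {
        int b = k % 2;   // the "flip": even chunks use buffer 0, odd chunks buffer 1
        // Within strm[b] this copy waits for the previous kernel that used
        // dbuf[b]; meanwhile the other stream keeps computing on its buffer.
        cudaMemcpyAsync( dbuf[b], host + (size_t)k * chunk,
                         chunk * sizeof(float), cudaMemcpyHostToDevice, strm[b] );
        process_chunk<<< 1, 256, 0, strm[b] >>>( dbuf[b], chunk );
    }
    cudaDeviceSynchronize();
    for (int b = 0; b < 2; ++b) {
        cudaFree( dbuf[b] );
        cudaStreamDestroy( strm[b] );
    }
}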
/**
    Purpose
    -------
    CGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input.  GPU memory is
    allocated in the routine.  The matrix may exceed the GPU memory.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of the big panel is done by calling the
    multi-GPU interface.  Pivots are applied on the GPU within the big
    panel.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use.  ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory,
            e.g. allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a memory allocation
                  failure.
      -     > 0:  if INFO = i, U(i,i) is exactly zero.  The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is
                  used to solve a system of equations.
@ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((float)NB/nb) ) { ngpu = (int)ceil((float)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((float)N/nb) ) { ngpu = (int)ceil((float)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_csetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_claswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_cgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_cgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_CGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_cgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
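/*
 * Minimal calling sketch for magma_cgetrf_m (illustrative names; assumes
 * magma.h is included).  The matrix lives in host memory -- pinned memory
 * helps the staged transfers, as the documentation above notes -- and may
 * be larger than the aggregate GPU memory; the routine stages big panels
 * to the GPUs internally, and the MAGMA_NGR_NB environment variable read
 * above can cap the big-panel width.
 */
static void example_getrf_m( magma_int_t ngpu, magma_int_t n )
{
    magma_int_t info, *ipiv;
    magmaFloatComplex *A;

    magma_cmalloc_pinned( &A, (size_t)n * n );
    magma_imalloc_cpu( &ipiv, n );

    // ... initialize the n-by-n matrix A (column-major, lda = n) ...

    magma_cgetrf_m( ngpu, n, n, A, n, ipiv, &info );
    if (info != 0)
        printf( "magma_cgetrf_m returned info = %d\n", (int) info );

    // A now holds L and U; ipiv holds the 1-based pivot indices,
    // ready for a triangular solve (e.g. lapackf77_cgetrs).

    magma_free_pinned( A );
    magma_free_cpu( ipiv );
}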
extern "C" magma_int_t magma_ctrtri(char uplo, char diag, magma_int_t n, cuFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= CTRTRI computes the inverse of a real upper or lower triangular matrix A. This is the Level 3 BLAS version of the algorithm. Arguments ========= UPLO (input) CHARACTER*1 = 'U': A is upper triangular; = 'L': A is lower triangular. DIAG (input) CHARACTER*1 = 'N': A is non-unit triangular; = 'U': A is unit triangular. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, A(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. ===================================================================== */ /* Local variables */ char uplo_[2] = {uplo, 0}; char diag_[2] = {diag, 0}; magma_int_t ldda, nb, nn, j, jb; cuFloatComplex c_zero = MAGMA_C_ZERO; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; cuFloatComplex *work; int upper = lapackf77_lsame(uplo_, "U"); int nounit = lapackf77_lsame(diag_, "N"); *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) *info = -1; else if ((! nounit) && (! 
lapackf77_lsame(diag_, "U"))) *info = -2; else if (n < 0) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; /* Check for singularity if non-unit */ if (nounit) { for ( j=0; j<n; ++j ) { if ( MAGMA_C_EQUAL( *A(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } /* Determine the block size for this environment */ nb = magma_get_cpotrf_nb(n); ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_cmalloc( &work, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if (nb <= 1 || nb >= n) lapackf77_ctrtri(uplo_, diag_, &n, a, &lda, info); else { if (upper) { /* Compute inverse of upper triangular matrix */ for (j=0; j<n; j=j+nb) { jb = min(nb, (n-j)); magma_csetmatrix( jb, (n-j), A(j, j), lda, dA(j, j), ldda ); /* Compute rows 1:j-1 of current block column */ magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j),ldda); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j),ldda); //cublasGetMatrix(j ,jb, sizeof( cuFloatComplex), //dA(0, j), ldda, A(0, j), lda); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, stream[1] ); magma_cgetmatrix_async( j, jb, dA(0, j), ldda, A(0, j), lda, stream[0] ); magma_queue_sync( stream[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri(MagmaUpperStr, diag_, &jb, A(j,j), &lda, info); magma_csetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } else { /* Compute inverse of lower triangular matrix */ nn=((n-1)/nb)*nb+1; for(j=nn-1; j>=0; j=j-nb) { jb=min(nb,(n-j)); if((j+jb) < n) { magma_csetmatrix( (n-j), jb, A(j, j), lda, dA(j, j), ldda ); /* Compute rows j+jb:n of current block column */ magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda ); //cublasGetMatrix((n-j), jb, sizeof( cuFloatComplex),dA(j, j), ldda, A(j, j), lda); magma_cgetmatrix_async( n-j-jb, jb, dA(j+jb, j), ldda, A(j+jb, j), lda, stream[1] ); magma_cgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, stream[0] ); magma_queue_sync( stream[0] ); } /* Compute inverse of current diagonal block */ lapackf77_ctrtri(MagmaLowerStr, diag_, &jb, A(j,j), &lda, info); magma_csetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( work ); return *info; }
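/*
 * Typical use of magma_ctrtri: invert the triangular Cholesky factor in
 * place after magma_cpotrf.  A hypothetical fragment assuming the
 * char-based CPU interface of this MAGMA generation; only the info codes
 * are checked.
 */
static void example_trtri( magma_int_t n, cuFloatComplex *A, magma_int_t lda )
{
    magma_int_t info;

    magma_cpotrf( 'L', n, A, lda, &info );           // A(lower) = L,  A = L * L**H
    if (info == 0)
        magma_ctrtri( 'L', 'N', n, A, lda, &info );  // A(lower) = L^{-1}
    if (info != 0)
        printf( "factorization/inversion failed, info = %d\n", (int) info );
}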
/***************************************************************************//** Purpose ------- CPOTRS solves a system of linear equations A*X = B with a Hermitian positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA_array Array of pointers, dimension (batchCount). Each is a COMPLEX array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by CPOTRF. @param[in] ldda INTEGER The leading dimension of each array A. LDDA >= max(1,N). @param[in,out] dB_array Array of pointers, dimension (batchCount). Each is a COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, each pointer is a right hand side matrix B. On exit, the corresponding solution matrix X. @param[in] lddb INTEGER The leading dimension of each array B. LDDB >= max(1,N). @param[in] batchCount INTEGER The number of matrices to operate on. @param[in] queue magma_queue_t Queue to execute in. @ingroup magma_potrs_batched *******************************************************************************/ extern "C" magma_int_t magma_cpotrs_batched( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **dB_array, magma_int_t lddb, magma_int_t batchCount, magma_queue_t queue) { magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) info = -1; if ( n < 0 ) info = -2; if ( nrhs < 0) info = -3; if ( ldda < max(1, n) ) info = -5; if ( lddb < max(1, n) ) info = -7; if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return info; } magmaFloatComplex **dW1_displ = NULL; magmaFloatComplex **dW2_displ = NULL; magmaFloatComplex **dW3_displ = NULL; magmaFloatComplex **dW4_displ = NULL; magmaFloatComplex **dinvA_array = NULL; magmaFloatComplex **dwork_array = NULL; magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = magma_roundup( n, CTRTRI_BATCHED_NB )*CTRTRI_BATCHED_NB; magma_int_t dwork_msize = n*nrhs; magmaFloatComplex* dinvA = NULL; magmaFloatComplex* dwork = NULL; // dinvA and dwork are workspace in ctrsm magma_cmalloc( &dinvA, invA_msize * batchCount); magma_cmalloc( &dwork, dwork_msize * batchCount ); /* check allocation */ if ( dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL ) { magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_claset_q( MagmaFull, invA_msize, batchCount, MAGMA_C_ZERO, 
MAGMA_C_ZERO, dinvA, invA_msize, queue ); magmablas_claset_q( MagmaFull, dwork_msize, batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dwork, dwork_msize, queue ); magma_cset_pointer( dwork_array, dwork, n, 0, 0, dwork_msize, batchCount, queue ); magma_cset_pointer( dinvA_array, dinvA, CTRTRI_BATCHED_NB, 0, 0, invA_msize, batchCount, queue ); if ( uplo == MagmaUpper) { if (nrhs > 1) { // A = U^T U // solve U^{T}X = B ==> dworkX = U^-T * B magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); // solve U X = dwork ==> X = U^-1 * dwork magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); } else { // A = U^T U // solve U^{T}X = B ==> dworkX = U^-T * B magmablas_ctrsv_outofplace_batched( MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, dA_array, ldda, // dA dB_array, 1, // dB dwork_array, // dX //output batchCount, queue, 0 ); // solve U X = dwork ==> X = U^-1 * dwork magmablas_ctrsv_outofplace_batched( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA_array, ldda, // dA dwork_array, 1, // dB dB_array, // dX //output batchCount, queue, 0 ); } } else { if (nrhs > 1) { // A = L L^T // solve LX= B ==> dwork = L^{-1} B magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); // solve L^{T}X= dwork ==> X = L^{-T} dwork magmablas_ctrsm_outofplace_batched( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); } else { // A = L L^T // solve LX= B ==> dwork = L^{-1} B magmablas_ctrsv_outofplace_batched( MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA_array, ldda, // dA dB_array, 1, // dB dwork_array, // dX //output batchCount, queue, 0 ); // solve L^{T}X= dwork ==> X = L^{-T} dwork magmablas_ctrsv_outofplace_batched( MagmaLower, MagmaConjTrans, MagmaNonUnit, n, dA_array, ldda, // dA dwork_array, 1, // dB dB_array, // dX //output batchCount, queue, 0 ); } } magma_queue_sync(queue); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); return info; }
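/*
    Example (illustrative sketch, not part of the original source): factoring
    and solving a batch of Hermitian positive definite systems that already
    reside on the GPU in the contiguous buffers dA (ldda*n elements per
    matrix) and dB (lddb*nrhs elements per right-hand-side block).  The
    wrapper name example_cpotrs_batched_usage is hypothetical, and the call
    to magma_cpotrf_batched assumes its usual interface
    (uplo, n, dA_array, ldda, dinfo_array, batchCount, queue).
    Allocation error checks are omitted for brevity.
*/
static magma_int_t
example_cpotrs_batched_usage(
    magma_int_t n, magma_int_t nrhs, magma_int_t batchCount,
    magmaFloatComplex *dA, magma_int_t ldda,
    magmaFloatComplex *dB, magma_int_t lddb,
    magma_queue_t queue)
{
    magmaFloatComplex **dA_array = NULL, **dB_array = NULL;
    magma_int_t *dinfo_array = NULL;
    magma_int_t info = 0;

    magma_malloc((void**)&dA_array,    batchCount * sizeof(*dA_array));
    magma_malloc((void**)&dB_array,    batchCount * sizeof(*dB_array));
    magma_malloc((void**)&dinfo_array, batchCount * sizeof(*dinfo_array));

    /* build the device pointer arrays: matrix k starts at dA + k*ldda*n,
       its right-hand sides at dB + k*lddb*nrhs */
    magma_cset_pointer(dA_array, dA, ldda, 0, 0, ldda*n,    batchCount, queue);
    magma_cset_pointer(dB_array, dB, lddb, 0, 0, lddb*nrhs, batchCount, queue);

    /* Cholesky-factor each A_k in place; per-matrix status stays on the
       device in dinfo_array */
    info = magma_cpotrf_batched(MagmaLower, n, dA_array, ldda,
                                dinfo_array, batchCount, queue);

    /* two batched triangular solves per system, done by the routine above */
    if (info == 0)
        info = magma_cpotrs_batched(MagmaLower, n, nrhs,
                                    dA_array, ldda, dB_array, lddb,
                                    batchCount, queue);

    magma_free(dA_array);
    magma_free(dB_array);
    magma_free(dinfo_array);
    return info;
}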
/** Purpose ------- CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu) On entry, the Hermitian matrix dA distributed over GPUs (d_lA[d] points to the local matrix on the d-th GPU). It is distributed in 1D block column or row cyclic (with the block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrf_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *info) { magma_int_t j, nb, d, lddp, h; const char* uplo_ = lapack_uplo_const( uplo ); magmaFloatComplex *work; bool upper = (uplo == MagmaUpper); magmaFloatComplex *dwork[MagmaMaxGPUs]; magma_queue_t queues[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; *info = 0; nb = magma_get_cpotrf_nb(n); if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper) { lddp = nb*(n/(nb*ngpu)); if ( n%(nb*ngpu) != 0 ) lddp += min(nb, n-ngpu*lddp); if ( ldda < lddp ) *info = -4; } else if ( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); if (ngpu == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. 
*/ magma_setdevice(0); magma_queue_create( 0, &queues[0][0] ); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( n, n, d_lA[0], ldda, work, n, queues[0][0] ); lapackf77_cpotrf(uplo_, &n, work, &n, info); magma_csetmatrix( n, n, work, n, d_lA[0], ldda, queues[0][0] ); magma_free_pinned( work ); magma_queue_destroy( queues[0][0] ); } else { lddp = magma_roundup( n, nb ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dwork[d], ngpu*nb*lddp )) { for( j=0; j < d; j++ ) { magma_setdevice(j); magma_free( dwork[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < 3; j++ ) { magma_queue_create( d, &queues[d][j] ); } for( j=0; j < 5; j++ ) { magma_event_create( &event[d][j] ); } } magma_setdevice(0); h = 1; //ngpu; //magma_ceildiv( n, nb ); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (upper) { /* with three queues */ magma_cpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, h, queues, event, info); } else { /* with three queues */ magma_cpotrf3_mgpu(ngpu, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, h, queues, event, info); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < 3; j++ ) { magma_queue_sync( queues[d][j] ); magma_queue_destroy( queues[d][j] ); } for( j=0; j < 5; j++ ) magma_event_destroy( event[d][j] ); magma_free( dwork[d] ); } magma_free_pinned( work ); } /* end of not lapack */ magma_setdevice( orig_dev ); return *info; } /* magma_cpotrf_mgpu */
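/*
    Example (illustrative sketch, not part of the original source): using the
    multi-GPU Cholesky above with uplo = MagmaUpper.  The host matrix is
    scattered in the 1-D block column cyclic layout the routine expects:
    block column J (width nb) is owned by GPU J % ngpu and stored at local
    block column J / ngpu.  The helper name example_cpotrf_mgpu_usage is
    hypothetical; it assumes ngpu does not exceed the number of block columns
    and uses the queue-based magma_v2 copy interface.  Only the upper
    triangular part is copied, since the lower part is not referenced.
*/
static magma_int_t
example_cpotrf_mgpu_usage(
    magma_int_t ngpu, magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda)
{
    magma_int_t nb   = magma_get_cpotrf_nb(n);
    magma_int_t ldda = magma_roundup(n, 32);   /* satisfies the LDDA requirement */
    magma_int_t nbl  = magma_ceildiv(n, nb);   /* number of block columns */
    magmaFloatComplex_ptr d_lA[MagmaMaxGPUs];
    magma_queue_t queues[MagmaMaxGPUs];
    magma_int_t d, J, j, jb, info = 0;

    for (d = 0; d < ngpu; d++) {
        magma_int_t nbl_loc = (nbl + ngpu - 1 - d) / ngpu;  /* block columns on GPU d */
        magma_setdevice(d);
        magma_queue_create(d, &queues[d]);
        magma_cmalloc(&d_lA[d], ldda * nbl_loc * nb);
    }
    /* scatter the upper triangle, one block column at a time */
    for (J = 0; J < nbl; J++) {
        j  = J*nb;
        jb = min(nb, n-j);
        d  = J % ngpu;
        magma_setdevice(d);
        magma_csetmatrix(j+jb, jb, A + j*lda, lda,
                         d_lA[d] + (J/ngpu)*nb*ldda, ldda, queues[d]);
    }
    magma_cpotrf_mgpu(ngpu, MagmaUpper, n, d_lA, ldda, &info);

    /* gather the factor U back into the upper triangle of A */
    for (J = 0; J < nbl; J++) {
        j  = J*nb;
        jb = min(nb, n-j);
        d  = J % ngpu;
        magma_setdevice(d);
        magma_cgetmatrix(j+jb, jb, d_lA[d] + (J/ngpu)*nb*ldda, ldda,
                         A + j*lda, lda, queues[d]);
    }
    for (d = 0; d < ngpu; d++) {
        magma_setdevice(d);
        magma_queue_sync(queues[d]);
        magma_queue_destroy(queues[d]);
        magma_free(d_lA[d]);
    }
    return info;
}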
extern "C" magma_int_t magma_cbicgstab_merge3( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; // solver variables magmaFloatComplex alpha, beta, omega, rho_old, rho_new, *skp_h={0}; float nom, nom0, betanom, nomb; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows; // workspace magma_c_matrix q={Magma_CSR}, r={Magma_CSR}, rr={Magma_CSR}, p={Magma_CSR}, v={Magma_CSR}, s={Magma_CSR}, t={Magma_CSR}; magmaFloatComplex *d1=NULL, *d2=NULL, *skp=NULL; d1 = NULL; d2 = NULL; skp = NULL; CHECK( magma_cmalloc( &d1, dofs*(2) )); CHECK( magma_cmalloc( &d2, dofs*(2) )); // array for the parameters CHECK( magma_cmalloc( &skp, 8 )); // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] CHECK( magma_cvinit( &q, Magma_DEV, dofs*6, 1, c_zero, queue )); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; rr.num_cols = 1; rr.storage_type = Magma_DENSE; r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; r.num_cols = 1; r.storage_type = Magma_DENSE; p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; p.num_cols = 1; p.storage_type = Magma_DENSE; v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; v.num_cols = 1; v.storage_type = Magma_DENSE; s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; s.num_cols = 1; s.storage_type = Magma_DENSE; t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; t.num_cols = 1; t.storage_type = Magma_DENSE; rr.dval = q(0); r.dval = q(1); p.dval = q(2); v.dval = q(3); s.dval = q(4); t.dval = q(5); // solver setup CHECK( magma_cresidualvec( A, b, *x, &r, &nom0, queue)); magma_ccopy( dofs, r.dval, 1, q(0), 1, queue ); // rr = r magma_ccopy( dofs, r.dval, 1, q(1), 1, queue ); // q = r betanom = nom0; nom = nom0*nom0; rho_new = magma_cdotc( dofs, r.dval, 1, r.dval, 1, queue ); // rho=<rr,r> rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. 
); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters CHECK( magma_cmalloc_cpu( &skp_h, 8 )); nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if ( nomb == 0.0 ){ nomb=1.0; } solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_C_MAKE(nom, 0.0); magma_csetvector( 8, skp_h, 1, skp, 1, queue ); CHECK( magma_c_spmv( c_one, A, r, c_zero, v, queue )); // z = A r nomb = magma_scnrm2( dofs, b.dval, 1, queue ); if( nom0 < solver_par->atol || nom0/nomb < solver_par->rtol ){ info = MAGMA_SUCCESS; goto cleanup; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); solver_par->numiter = 0; solver_par->spmv_count = 0; // start iteration do { solver_par->numiter++; // computes p=r+beta*(p-omega*v) CHECK( magma_cbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue )); CHECK( magma_c_spmv( c_one, A, p, c_zero, v, queue )); // v = Ap solver_par->spmv_count++; CHECK( magma_cmdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue )); CHECK( magma_cbicgmerge4( 1, skp, queue )); CHECK( magma_cbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue )); // s=r-alpha*v CHECK( magma_c_spmv( c_one, A, s, c_zero, t, queue )); // t=As solver_par->spmv_count++; CHECK( magma_cmdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue )); CHECK( magma_cbicgmerge4( 2, skp, queue )); CHECK( magma_cbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval, s.dval, t.dval, x->dval, skp, queue )); // check stopping criterion magma_cgetvector_async( 1 , skp+5, 1, skp_h+5, 1, queue ); betanom = sqrt(MAGMA_C_REAL(skp_h[5])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < solver_par->atol || betanom/nomb < solver_par->rtol ) { break; } } while ( solver_par->numiter+1 <= solver_par->maxiter ); tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; CHECK( magma_cresidualvec( A, b, *x, &r, &residual, queue)); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter ) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_SLOW_CONVERGENCE; if( solver_par->iter_res < solver_par->atol || solver_par->iter_res/solver_par->init_res < solver_par->rtol ){ info = MAGMA_SUCCESS; } } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } info = MAGMA_DIVERGENCE; } cleanup: magma_cmfree(&q, queue ); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); solver_par->info = info; return info; } /* cbicgstab_merge */
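/*
    Example (illustrative sketch, not part of the original source): driving
    the merged BiCGSTAB solver above.  The system matrix dA and right-hand
    side db are assumed to already live on the device (Magma_DEV); the
    iteration limit and tolerances below are hypothetical values, and error
    handling (the CHECK macro used elsewhere) is omitted.  With verbose = 0
    the res_vec/timing arrays are never touched, so they do not need to be
    allocated here.
*/
static magma_int_t
example_cbicgstab_merge3_usage(
    magma_c_matrix dA, magma_c_matrix db, magma_queue_t queue)
{
    magma_c_matrix dx = {Magma_CSR};
    magma_c_solver_par solver_par;
    magma_int_t info;

    solver_par.maxiter = 1000;    /* hypothetical iteration limit */
    solver_par.rtol    = 1e-6;    /* hypothetical relative tolerance */
    solver_par.atol    = 1e-12;   /* hypothetical absolute tolerance */
    solver_par.verbose = 0;       /* no per-iteration residual history */

    /* start from the zero vector */
    magma_cvinit(&dx, Magma_DEV, dA.num_rows, 1, MAGMA_C_ZERO, queue);

    info = magma_cbicgstab_merge3(dA, db, &dx, &solver_par, queue);
    /* ... use the solution dx; solver_par.final_res holds the residual ... */

    magma_cmfree(&dx, queue);
    return info;
}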
/***************************************************************************//** Purpose ------- CGERBT solves a system of linear equations A * X = B where A is a general n-by-n matrix and X and B are n-by-nrhs matrices. Random Butterfly Tranformation is applied on A and B, then the LU decomposition with no pivoting is used to factor A as A = L * U, where L is unit lower triangular, and U is upper triangular. The factored form of A is then used to solve the system of equations A * X = B. Arguments --------- @param[in] gen magma_bool_t - = MagmaTrue: new matrices are generated for U and V - = MagmaFalse: matrices U and V given as parameter are used @param[in] n INTEGER The order of the matrix A. n >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. nrhs >= 0. @param[in,out] dA COMPLEX array, dimension (LDDA,n). On entry, the M-by-n matrix to be factored. On exit, the factors L and U from the factorization A = L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,n). @param[in,out] dB COMPLEX array, dimension (LDDB,nrhs) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,n). @param[in,out] U COMPLEX array, dimension (2,n) Random butterfly matrix, if gen = MagmaTrue U is generated and returned as output; else we use U given as input. CPU memory @param[in,out] V COMPLEX array, dimension (2,n) Random butterfly matrix, if gen = MagmaTrue V is generated and returned as output; else we use U given as input. CPU memory @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @ingroup magma_gerbt *******************************************************************************/ extern "C" magma_int_t magma_cgerbt_gpu( magma_bool_t gen, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magmaFloatComplex *U, magmaFloatComplex *V, magma_int_t *info) { #define dB(i_, j_) (dB + (i_) + (j_)*lddb) /* Function Body */ *info = 0; if ( ! (gen == MagmaTrue) && ! (gen == MagmaFalse) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (nrhs == 0 || n == 0) return *info; magmaFloatComplex_ptr dU=NULL, dV=NULL; magma_int_t j; /* Allocate memory for the buterfly matrices */ if (MAGMA_SUCCESS != magma_cmalloc( &dU, 2*n ) || MAGMA_SUCCESS != magma_cmalloc( &dV, 2*n )) { magma_free( dU ); magma_free( dV ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Initialize Butterfly matrix on the CPU */ if (gen == MagmaTrue) init_butterfly( 2*n, U, V ); /* Copy the butterfly to the GPU */ magma_csetvector( 2*n, U, 1, dU, 1, queue ); magma_csetvector( 2*n, V, 1, dV, 1, queue ); /* Perform Partial Random Butterfly Transformation on the GPU */ magmablas_cprbt( n, dA, ldda, dU, dV, queue ); /* Compute U^T * b on the GPU*/ for (j= 0; j < nrhs; j++) { magmablas_cprbt_mtv( n, dU, dB(0,j), queue ); } magma_queue_destroy( queue ); magma_free( dU ); magma_free( dV ); return *info; }
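/*
    Example (illustrative sketch, not part of the original source): applying
    the partial random butterfly transformation to a system already resident
    on the GPU.  U and V are host arrays of 2*n elements each; with
    gen = MagmaTrue the routine generates them.  On return dA holds the
    transformed matrix and dB holds U^T * B, so the system can then be
    factored without pivoting and the solution recovered by multiplying with
    V (that part of the solve is not shown).  The wrapper name is
    hypothetical.
*/
static magma_int_t
example_cgerbt_usage(
    magma_int_t n, magma_int_t nrhs,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magmaFloatComplex_ptr dB, magma_int_t lddb)
{
    magmaFloatComplex *U = NULL, *V = NULL;
    magma_int_t info = 0;

    magma_cmalloc_cpu(&U, 2*n);
    magma_cmalloc_cpu(&V, 2*n);
    if (U == NULL || V == NULL) {
        magma_free_cpu(U);
        magma_free_cpu(V);
        return MAGMA_ERR_HOST_ALLOC;
    }

    magma_cgerbt_gpu(MagmaTrue, n, nrhs, dA, ldda, dB, lddb, U, V, &info);

    /* ... no-pivoting LU factorization/solve on dA, dB, then apply V ... */

    magma_free_cpu(U);
    magma_free_cpu(V);
    return info;
}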
extern "C" magma_int_t magma_cgehrd(magma_int_t n, magma_int_t ilo, magma_int_t ihi, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magmaFloatComplex *dT, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CGEHRD reduces a COMPLEX general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments ========= N (input) INTEGER The order of the matrix A. N >= 0. ILO (input) INTEGER IHI (input) INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to CGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). TAU (output) COMPLEX array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. dT (output) COMPLEX array on the GPU, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. Further Details =============== The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. 
Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices in dT, for later use in magma_cunghr. ===================================================================== */ #define A( i, j ) ( A + (i) + (j)*lda) #define dA( i, j ) (dA + (i) + (j-ilo)*ldda) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_zero = MAGMA_C_ZERO; magma_int_t nb = magma_get_cgehrd_nb(n); magma_int_t ldda = n; // assumed in clahru magma_int_t nh, iws; magma_int_t iinfo; magma_int_t ldwork; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = MAGMA_C_MAKE( iws, 0 ); lquery = lwork == -1; if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // GPU workspace is: // nb*ldda for dwork for clahru // nb*ldda for dV // n*ldda for dA magmaFloatComplex *dwork; if (MAGMA_SUCCESS != magma_cmalloc( &dwork, 2*nb*ldda + n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloatComplex *dV = dwork + nb*ldda; magmaFloatComplex *dA = dwork + nb*ldda*2; ldwork = n; magma_int_t i; magmaFloatComplex *T, *dTi; magma_cmalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular czero_nbxnb_block(nb, dV, ldda); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for(i = 0; i < ilo; ++i) tau[i] = c_zero; for(i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; for(i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; magmablas_claset( 'F', nb, n, dT, nb ); // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // Copy the matrix to the GPU magma_csetmatrix( n, n-ilo, A(0,ilo), lda, dA, ldda ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_cgetmatrix( ihi-i, nb, dA(i,i), ldda, A (i,i), lda ); // add 1 to i for 1-based index magma_clahr2( ihi, i+1, nb, dA(0,i), dV, A (0,i), lda, &tau[i], T, nb, work, ldwork); // Copy T from the CPU to dT on the GPU dTi = dT + (i - ilo)*nb; magma_csetmatrix( nb, nb, T, nb, dTi, nb ); magma_clahru( n, ihi, i, nb, A (0,i), lda, dA(0,i), // dA dA(i,i), // dY, stored over current panel dV, dTi, dwork ); } // Copy remainder to host magma_cgetmatrix( n, n-i, dA(0,i), ldda, A (0,i), lda ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_cgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = MAGMA_C_MAKE( iws, 0 ); magma_free( dwork ); magma_free_cpu( T ); return *info; } /* magma_cgehrd */
/** Purpose ------- CTRTRI computes the inverse of a real upper or lower triangular matrix A. This is the Level 3 BLAS version of the algorithm. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: A is upper triangular; - = MagmaLower: A is lower triangular. @param[in] diag magma_diag_t - = MagmaNonUnit: A is non-unit triangular; - = MagmaUnit: A is unit triangular. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the triangular matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = MagmaUnit, the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, A(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. @ingroup magma_cgesv_aux ********************************************************************/ extern "C" magma_int_t magma_ctrtri(magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A + (i) + (j)*lda ) #define dA(i, j) (dA + (i) + (j)*ldda) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); const char* diag_ = lapack_diag_const( diag ); magma_int_t ldda, nb, nn, j, jb; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dA; int upper = (uplo == MagmaUpper); int nounit = (diag == MagmaNonUnit); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (! 
nounit && diag != MagmaUnit) *info = -2; else if (n < 0) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; /* Check for singularity if non-unit */ if (nounit) { for (j=0; j < n; ++j) { if ( MAGMA_C_EQUAL( *A(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } /* Determine the block size for this environment */ nb = magma_get_cpotrf_nb(n); ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_cmalloc( &dA, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if (nb <= 1 || nb >= n) lapackf77_ctrtri(uplo_, diag_, &n, A, &lda, info); else { if (upper) { /* Compute inverse of upper triangular matrix */ for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); magma_csetmatrix( jb, (n-j), A(j, j), lda, dA(j, j), ldda ); /* Compute rows 1:j-1 of current block column */ magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j),ldda); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j),ldda); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, stream[1] ); magma_cgetmatrix_async( j, jb, dA(0, j), ldda, A(0, j), lda, stream[0] ); magma_queue_sync( stream[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri(MagmaUpperStr, diag_, &jb, A(j,j), &lda, info); magma_csetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } else { /* Compute inverse of lower triangular matrix */ nn=((n-1)/nb)*nb+1; for (j=nn-1; j >= 0; j -= nb) { jb=min(nb,(n-j)); if ((j+jb) < n) { magma_csetmatrix( (n-j), jb, A(j, j), lda, dA(j, j), ldda ); /* Compute rows j+jb:n of current block column */ magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda ); magma_cgetmatrix_async( n-j-jb, jb, dA(j+jb, j), ldda, A(j+jb, j), lda, stream[1] ); magma_cgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, stream[0] ); magma_queue_sync( stream[0] ); } /* Compute inverse of current diagonal block */ lapackf77_ctrtri(MagmaLowerStr, diag_, &jb, A(j,j), &lda, info); magma_csetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); return *info; }
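/*
    Example (illustrative sketch, not part of the original source): inverting
    an upper triangular, non-unit host matrix in place with the routine
    above.  On successful exit A holds inv(A) in the same triangular storage;
    a positive info reports the 1-based index of an exactly-zero diagonal
    entry, i.e. a singular matrix.  The wrapper name is hypothetical.
*/
static magma_int_t
example_ctrtri_usage(magma_int_t n, magmaFloatComplex *A, magma_int_t lda)
{
    magma_int_t info = 0;
    magma_ctrtri(MagmaUpper, MagmaNonUnit, n, A, lda, &info);
    return info;
}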
/** Purpose ------- CUNMQL overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by CGEQLF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQLF in the last k columns of its array argument A. A is modified by the routine but restored on exit. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQLF. @param[in,out] C COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance if SIDE = MagmaLeft, LWORK >= N*NB; if SIDE = MagmaRight, LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_cunmql( magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *C, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) magmaFloatComplex *T, *T2; magma_int_t i, i1, i2, ib, nb, mi, ni, nq, nq_i, nw, step; magma_int_t iinfo, ldwork, lwkopt; *info = 0; bool left = (side == MagmaLeft); bool notran = (trans == MagmaNoTrans); bool lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } /* Test the input arguments */ if (! left && side != MagmaRight) { *info = -1; } else if (! 
notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { nb = magma_get_cgelqf_nb( m, n ); lwkopt = max(1,nw)*nb; work[0] = magma_cmake_lwork( lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = MAGMA_C_ONE; return *info; } ldwork = nw; if ( nb >= k ) { /* Use CPU code */ lapackf77_cunmql( lapack_side_const(side), lapack_trans_const(trans), &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, &iinfo ); } else { /* Use hybrid CPU-GPU code */ /* Allocate work space on the GPU. * nw*nb for dwork (m or n) by nb * nq*nb for dV (n or m) by nb * nb*nb for dT * lddc*n for dC. */ magma_int_t lddc = magma_roundup( m, 32 ); magmaFloatComplex *dwork, *dV, *dT, *dC; magma_cmalloc( &dwork, (nw + nq + nb)*nb + lddc*n ); if ( dwork == NULL ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dV = dwork + nw*nb; dT = dV + nq*nb; dC = dT + nb*nb; /* work space on CPU. * nb*nb for T * nb*nb for T2, used to save and restore diagonal block of panel */ magma_cmalloc_pinned( &T, 2*nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } T2 = T + nb*nb; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, C, ldc, dC, lddc, queue ); if ( (left && notran) || (! left && ! notran) ) { i1 = 0; i2 = k; step = nb; } else { i1 = ((k - 1) / nb) * nb; i2 = 0; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } for (i = i1; (step < 0 ? i >= i2 : i < i2); i += step) { ib = min(nb, k - i); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ nq_i = nq - k + i + ib; lapackf77_clarft("Backward", "Columnwise", &nq_i, &ib, A(0,i), &lda, &tau[i], T, &ib); /* 1) set lower triangle of panel in A to identity, 2) copy the panel from A to the GPU, and 3) restore A */ magma_cpanel_to_q( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); magma_csetmatrix( nq_i, ib, A(0,i), lda, dV, nq_i, queue ); magma_cq_to_panel( MagmaLower, ib, A(nq_i-ib,i), lda, T2 ); if (left) { /* H or H**H is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i + ib; } else { /* H or H**H is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i + ib; } /* Apply H or H**H; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dT, ib, queue ); magma_clarfb_gpu( side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dV, nq_i, dT, ib, dC, lddc, dwork, ldwork, queue ); } magma_cgetmatrix( m, n, dC, lddc, C, ldc, queue ); magma_queue_destroy( queue ); magma_free( dwork ); magma_free_pinned( T ); } work[0] = magma_cmake_lwork( lwkopt ); return *info; } /* magma_cunmql */
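/*
    Example (illustrative sketch, not part of the original source): applying
    Q**H from the left to an m-by-n matrix C, where Q is defined by the k
    elementary reflectors returned by CGEQLF in the last k columns of A and
    in tau.  The workspace size is obtained with a query (lwork = -1) and the
    routine is then called again with the allocated buffer.  The wrapper name
    is hypothetical.
*/
static magma_int_t
example_cunmql_usage(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau,
    magmaFloatComplex *C, magma_int_t ldc)
{
    magma_int_t info = 0, lwork = -1;
    magmaFloatComplex query;
    magmaFloatComplex *work = NULL;

    /* workspace query: only work[0] is written */
    magma_cunmql(MagmaLeft, Magma_ConjTrans, m, n, k,
                 A, lda, tau, C, ldc, &query, lwork, &info);
    if (info != 0)
        return info;

    lwork = (magma_int_t) MAGMA_C_REAL(query);
    magma_cmalloc_cpu(&work, lwork);
    if (work == NULL)
        return MAGMA_ERR_HOST_ALLOC;

    magma_cunmql(MagmaLeft, Magma_ConjTrans, m, n, k,
                 A, lda, tau, C, ldc, work, lwork, &info);

    magma_free_cpu(work);
    return info;
}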
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] d_lA COMPLEX array of pointers on the GPU, dimension (ngpu). On entry, the M-by-N matrix A distributed over GPUs (d_lA[d] points to the local matrix on d-th GPU). It uses 1D block column cyclic format with the block size of nb, and each local matrix is stored by column. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array d_lA. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_mgpu( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex_ptr d_lA[], magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm; magma_int_t i, j, d, lddat, lddwork; magmaFloatComplex *d_lAT[MagmaMaxGPUs]; magmaFloatComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t queues[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* create the queues */ for( d=0; d < ngpu; d++ ) { magma_queue_create( d, &queues[d][0] ); magma_queue_create( d, &queues[d][1] ); } /* Function Body */ nb = magma_get_cgetrf_nb( m, n ); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, d_lA[0], ldda, work, m, queues[0][0] ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, d_lA[0], ldda, queues[0][0] ); magma_free_cpu(work); } else { /* Use hybrid blocked code. 
*/ magma_device_t orig_dev; magma_getdevice( &orig_dev ); maxm = magma_roundup( m, 32 ); if ( ngpu > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = magma_roundup( ((magma_ceildiv( n, nb )/ngpu)*nb), 32 ); lddat = magma_ceildiv( n, nb ); /* number of block columns */ lddat = magma_ceildiv( lddat, ngpu ); /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = magma_roundup( lddat, 32 ); /* make it a multiple of 32 */ for (i=0; i < ngpu; i++) { magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/ngpu)*nb; if (i < (n/nb)%ngpu) n_local[i] += nb; else if (i == (n/nb)%ngpu) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+ngpu)*nb*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j <= i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j < i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose( m, n_local[i], d_lA[i], ldda, d_lAT[i], lddat, queues[i][1] ); } for (i=0; i < ngpu; i++) { magma_setdevice(i); magma_queue_sync(queues[i][0]); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*ngpu )) { for (i=0; i < ngpu; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and queues */ magma_cgetrf2_mgpu(ngpu, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, queues, info); /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ctranspose( n_local[d], m, d_lAT[d], lddat, d_lA[d], ldda, queues[d][0] ); magma_queue_sync(queues[d][0]); magma_queue_sync(queues[d][1]); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); } /* end of for d=1,..,ngpu */ magma_setdevice( orig_dev ); magma_free_pinned( work ); } /* clean up */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_destroy( queues[d][0] ); magma_queue_destroy( queues[d][1] ); } return *info; }
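/*
    Example (illustrative sketch, not part of the original source): the
    simplest legal call to the multi-GPU LU above.  With ngpu = 1 the 1-D
    block column cyclic layout degenerates to an ordinary column-major
    matrix, so d_lA[0] can simply point to the whole m-by-n matrix already
    resident on the GPU; the pivot indices are returned on the host.  The
    wrapper name is hypothetical.
*/
static magma_int_t
example_cgetrf_mgpu_usage(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex_ptr dA, magma_int_t ldda,
    magma_int_t *ipiv)
{
    magma_int_t info = 0;
    magmaFloatComplex_ptr d_lA[1];
    d_lA[0] = dA;
    magma_cgetrf_mgpu(1, m, n, d_lA, ldda, ipiv, &info);
    return info;
}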
/** Purpose ------- CHEEVD_2STAGE computes all eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. It uses a two-stage algorithm for the tridiagonalization. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= LQ2 + N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= LQ2 + 2*N + N**2. where LQ2 is the size needed to store the Q2 matrix and is returned by magma_bulge_get_lq2. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] rwork (workspace) REAL array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. @param[in] lrwork INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. 
If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. \n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_cheev_driver ********************************************************************/ extern "C" magma_int_t magma_cheevdx_2stage( magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, magmaFloatComplex *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { #define A( i_,j_) (A + (i_) + (j_)*lda) #define A2(i_,j_) (A2 + (i_) + (j_)*lda2) const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t ione = 1; magma_int_t izero = 0; float d_one = 1.; float d__1; float eps; float anrm; magma_int_t imax; float rmin, rmax; float sigma; //magma_int_t iinfo; magma_int_t lwmin, lrwmin, liwmin; magma_int_t lower; magma_int_t wantz; magma_int_t iscale; float safmin; float bignum; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t len; float* dwork; /* determine the number of threads */ magma_int_t parallel_threads = magma_get_parallel_numthreads(); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! 
(lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_cbulge_nb(n,parallel_threads); magma_int_t Vblksiz = magma_cbulge_get_Vblksiz(n, nb, parallel_threads); magma_int_t ldt = Vblksiz; magma_int_t ldv = nb + Vblksiz; magma_int_t blkcnt = magma_bulge_get_blkcnt(n, nb, Vblksiz); magma_int_t lq2 = magma_cbulge_get_lq2(n, parallel_threads); if (wantz) { lwmin = lq2 + 2*n + n*n; lrwmin = 1 + 5*n + 2*n*n; liwmin = 5*n + 3; } else { lwmin = lq2 + n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_slamch("Epsilon"); work[0] = MAGMA_C_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -14; } else if ((lrwork < lrwmin) && ! lquery) { *info = -16; } else if ((liwork < liwmin) && ! lquery) { *info = -18; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (n == 1) { w[0] = MAGMA_C_REAL(A[0]); if (wantz) { A[0] = MAGMA_C_ONE; } return *info; } timer_printf("using %d parallel_threads\n", (int) parallel_threads); /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ magma_int_t ntiles = n/nb; if ( ( ntiles < 2 ) || ( n <= 128 ) ) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_cheevd(jobz_, uplo_, &n, A, &lda, w, work, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); *m = n; return *info; } /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = lapackf77_clanhe("M", uplo_, &n, A, &lda, rwork); iscale = 0; if (anrm > 0. 
&& anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { lapackf77_clascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info); } magma_int_t indT2 = 0; magma_int_t indTAU2 = indT2 + blkcnt*ldt*Vblksiz; magma_int_t indV2 = indTAU2+ blkcnt*Vblksiz; magma_int_t indtau1 = indV2 + blkcnt*ldv*Vblksiz; magma_int_t indwrk = indtau1+ n; //magma_int_t indwk2 = indwrk + n*n; magma_int_t llwork = lwork - indwrk; //magma_int_t llwrk2 = lwork - indwk2; magma_int_t inde = 0; magma_int_t indrwk = inde + n; magma_int_t llrwk = lrwork - indrwk; magma_timer_t time=0, time_total=0; timer_start( time_total ); timer_start( time ); magmaFloatComplex *dT1; if (MAGMA_SUCCESS != magma_cmalloc( &dT1, n*nb)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_chetrd_he2hb(uplo, n, nb, A, lda, &work[indtau1], &work[indwrk], llwork, dT1, info); timer_stop( time ); timer_printf( " time chetrd_he2hb = %6.2f\n", time ); timer_start( time ); /* copy the input matrix into WORK(INDWRK) with band storage */ /* PAY ATTENTION THAT work[indwrk] should be able to be of size lda2*n which it should be checked in any future modification of lwork.*/ magma_int_t lda2 = 2*nb; //nb+1+(nb-1); magmaFloatComplex* A2 = &work[indwrk]; memset(A2, 0, n*lda2*sizeof(magmaFloatComplex)); for (magma_int_t j = 0; j < n-nb; j++) { len = nb+1; blasf77_ccopy( &len, A(j,j), &ione, A2(0,j), &ione ); memset(A(j,j), 0, (nb+1)*sizeof(magmaFloatComplex)); *A(nb+j,j) = c_one; } for (magma_int_t j = 0; j < nb; j++) { len = nb-j; blasf77_ccopy( &len, A(j+n-nb,j+n-nb), &ione, A2(0,j+n-nb), &ione ); memset(A(j+n-nb,j+n-nb), 0, (nb-j)*sizeof(magmaFloatComplex)); } timer_stop( time ); timer_printf( " time chetrd_convert = %6.2f\n", time ); timer_start( time ); magma_chetrd_hb2st(uplo, n, nb, Vblksiz, A2, lda2, w, &rwork[inde], &work[indV2], ldv, &work[indTAU2], wantz, &work[indT2], ldt); timer_stop( time ); timer_stop( time_total ); timer_printf( " time chetrd_hb2st = %6.2f\n", time ); timer_printf( " time chetrd = %6.2f\n", time_total ); /* For eigenvalues only, call SSTERF. For eigenvectors, first call CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call CUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! 
wantz) { timer_start( time ); lapackf77_ssterf(&n, w, &rwork[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); timer_stop( time ); timer_printf( " time dstedc = %6.2f\n", time ); } else { timer_start( time_total ); timer_start( time ); if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_cstedx(range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info); magma_free( dwork ); timer_stop( time ); timer_printf( " time cstedx = %6.2f\n", time ); timer_start( time ); magmaFloatComplex *dZ; magma_int_t lddz = n; magmaFloatComplex *da; magma_int_t ldda = n; magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); if (MAGMA_SUCCESS != magma_cmalloc( &dZ, *m*lddz)) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_cmalloc( &da, n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_cbulge_back(uplo, n, nb, *m, Vblksiz, &work[indwrk + n * (il-1)], n, dZ, lddz, &work[indV2], ldv, &work[indTAU2], &work[indT2], ldt, info); timer_stop( time ); timer_printf( " time cbulge_back = %6.2f\n", time ); timer_start( time ); magma_csetmatrix( n, n, A, lda, da, ldda ); magma_cunmqr_gpu_2stages(MagmaLeft, MagmaNoTrans, n-nb, *m, n-nb, da+nb, ldda, dZ+nb, n, dT1, nb, info); magma_cgetmatrix( n, *m, dZ, lddz, A, lda ); magma_free(dT1); magma_free(dZ); magma_free(da); timer_stop( time ); timer_stop( time_total ); timer_printf( " time cunmqr + copy = %6.2f\n", time ); timer_printf( " time eigenvectors backtransf. = %6.2f\n", time_total ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_C_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; return *info; } /* magma_cheevdx_2stage */
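/*
    Example (illustrative sketch, not part of the original source): computing
    all eigenvalues and eigenvectors of a Hermitian host matrix with the
    two-stage routine above.  For this complex precision the rwork/lrwork
    arguments are present.  Optimal workspace sizes are obtained with a query
    (lwork = lrwork = liwork = -1), then the routine is called again with the
    allocated buffers.  The wrapper name is hypothetical; allocation checks
    are omitted for brevity.
*/
static magma_int_t
example_cheevdx_2stage_usage(
    magma_int_t n, magmaFloatComplex *A, magma_int_t lda, float *w)
{
    magma_int_t info = 0, m = 0;
    magmaFloatComplex aux_work[1];
    float       aux_rwork[1];
    magma_int_t aux_iwork[1];
    magmaFloatComplex *work;
    float       *rwork;
    magma_int_t *iwork;
    magma_int_t lwork, lrwork, liwork;

    /* workspace query */
    magma_cheevdx_2stage(MagmaVec, MagmaRangeAll, MagmaLower, n, A, lda,
                         0, 0, 0, 0, &m, w,
                         aux_work, -1, aux_rwork, -1, aux_iwork, -1, &info);

    lwork  = (magma_int_t) MAGMA_C_REAL(aux_work[0]);
    lrwork = (magma_int_t) aux_rwork[0];
    liwork = aux_iwork[0];
    magma_cmalloc_cpu(&work,  lwork);
    magma_smalloc_cpu(&rwork, lrwork);
    magma_imalloc_cpu(&iwork, liwork);

    magma_cheevdx_2stage(MagmaVec, MagmaRangeAll, MagmaLower, n, A, lda,
                         0, 0, 0, 0, &m, w,
                         work, lwork, rwork, lrwork, iwork, liwork, &info);

    magma_free_cpu(work);
    magma_free_cpu(rwork);
    magma_free_cpu(iwork);
    return info;
}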
/** Purpose ------- CGEEV computes for an N-by-N complex nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**H * A = lambda(j) * u(j)**H where u(j)**H denotes the conjugate transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments --------- @param[in] jobvl magma_vec_t - = MagmaNoVec: left eigenvectors of A are not computed; - = MagmaVec: left eigenvectors of are computed. @param[in] jobvr magma_vec_t - = MagmaNoVec: right eigenvectors of A are not computed; - = MagmaVec: right eigenvectors of A are computed. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] w COMPLEX array, dimension (N) W contains the computed eigenvalues. @param[out] VL COMPLEX array, dimension (LDVL,N) If JOBVL = MagmaVec, the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = MagmaNoVec, VL is not referenced. u(j) = VL(:,j), the j-th column of VL. @param[in] ldvl INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = MagmaVec, LDVL >= N. @param[out] VR COMPLEX array, dimension (LDVR,N) If JOBVR = MagmaVec, the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = MagmaNoVec, VR is not referenced. v(j) = VR(:,j), the j-th column of VR. @param[in] ldvr INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = MagmaVec, LDVR >= N. @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) REAL array, dimension (2*N) @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. 
@ingroup magma_cgeev_driver ********************************************************************/ extern "C" magma_int_t magma_cgeev_m( magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, #ifdef COMPLEX magmaFloatComplex *w, #else float *wr, float *wi, #endif magmaFloatComplex *VL, magma_int_t ldvl, magmaFloatComplex *VR, magma_int_t ldvr, magmaFloatComplex *work, magma_int_t lwork, #ifdef COMPLEX float *rwork, #endif magma_int_t *info ) { #define VL(i,j) (VL + (i) + (j)*ldvl) #define VR(i,j) (VR + (i) + (j)*ldvr) const magma_int_t ione = 1; const magma_int_t izero = 0; float d__1, d__2; magmaFloatComplex tmp; float scl; float dum[1], eps; float anrm, cscale, bignum, smlnum; magma_int_t i, k, ilo, ihi; magma_int_t ibal, ierr, itau, iwrk, nout, liwrk, nb; magma_int_t scalea, minwrk, irwork, lquery, wantvl, wantvr, select[1]; magma_side_t side = MagmaRight; irwork = 0; *info = 0; lquery = (lwork == -1); wantvl = (jobvl == MagmaVec); wantvr = (jobvr == MagmaVec); if (! wantvl && jobvl != MagmaNoVec) { *info = -1; } else if (! wantvr && jobvr != MagmaNoVec) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -8; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -10; } /* Compute workspace */ nb = magma_get_cgehrd_nb( n ); if (*info == 0) { minwrk = (1+nb)*n; work[0] = MAGMA_C_MAKE( minwrk, 0 ); if (lwork < minwrk && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } #if defined(Version3) || defined(Version4) || defined(Version5) magmaFloatComplex *dT; if (MAGMA_SUCCESS != magma_cmalloc( &dT, nb*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif #if defined(Version4) || defined(Version5) magmaFloatComplex *T; if (MAGMA_SUCCESS != magma_cmalloc_cpu( &T, nb*n )) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif /* Get machine constants */ eps = lapackf77_slamch( "P" ); smlnum = lapackf77_slamch( "S" ); bignum = 1. / smlnum; lapackf77_slabad( &smlnum, &bignum ); smlnum = magma_ssqrt( smlnum ) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_clange( "M", &n, &n, A, &lda, dum ); scalea = 0; if (anrm > 0. 
&& anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_clascl( "G", &izero, &izero, &anrm, &cscale, &n, &n, A, &lda, &ierr ); } /* Balance the matrix * (CWorkspace: none) * (RWorkspace: need N) * - this space is reserved until after gebak */ ibal = 0; lapackf77_cgebal( "B", &n, A, &lda, &ilo, &ihi, &rwork[ibal], &ierr ); /* Reduce to upper Hessenberg form * (CWorkspace: need 2*N, prefer N + N*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cgehrd */ itau = 0; iwrk = itau + n; liwrk = lwork - iwrk; #if defined(Version1) // Version 1 - LAPACK lapackf77_cgehrd( &n, &ilo, &ihi, A, &lda, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version2) // Version 2 - LAPACK consistent HRD magma_cgehrd2( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, &ierr ); #elif defined(Version3) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored, magma_cgehrd( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, dT, &ierr ); #elif defined(Version4) || defined(Version5) // Version 4 - Multi-GPU, T on host magma_cgehrd_m( n, ilo, ihi, A, lda, &work[itau], &work[iwrk], liwrk, T, &ierr ); magma_csetmatrix( nb, n, T, nb, dT, nb ); #endif if (wantvl) { /* Want left eigenvectors * Copy Householder vectors to VL */ side = MagmaLeft; lapackf77_clacpy( MagmaLowerStr, &n, &n, A, &lda, VL, &ldvl ); /* Generate unitary matrix in VL * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cunghr */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_cunghr( &n, &ilo, &ihi, VL, &ldvl, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) || defined(Version4) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_cunghr( n, ilo, ihi, VL, ldvl, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_cunghr_m( n, ilo, ihi, VL, ldvl, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VL * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "S", "V", &n, &ilo, &ihi, A, &lda, w, VL, &ldvl, &work[iwrk], &liwrk, info ); if (wantvr) { /* Want left and right eigenvectors * Copy Schur vectors to VR */ side = MagmaBothSides; lapackf77_clacpy( "F", &n, &n, VL, &ldvl, VR, &ldvr ); } } else if (wantvr) { /* Want right eigenvectors * Copy Householder vectors to VR */ side = MagmaRight; lapackf77_clacpy( "L", &n, &n, A, &lda, VR, &ldvr ); /* Generate unitary matrix in VR * (CWorkspace: need 2*N-1, prefer N + (N-1)*NB) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by cunghr */ #if defined(Version1) || defined(Version2) // Version 1 & 2 - LAPACK lapackf77_cunghr( &n, &ilo, &ihi, VR, &ldvr, &work[itau], &work[iwrk], &liwrk, &ierr ); #elif defined(Version3) || defined(Version4) // Version 3 - LAPACK consistent MAGMA HRD + T matrices stored magma_cunghr( n, ilo, ihi, VR, ldvr, &work[itau], dT, nb, &ierr ); #elif defined(Version5) // Version 5 - Multi-GPU, T on host magma_cunghr_m( n, ilo, ihi, VR, ldvr, &work[itau], T, nb, &ierr ); #endif /* Perform QR iteration, accumulating Schur vectors in VR * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( 
"S", "V", &n, &ilo, &ihi, A, &lda, w, VR, &ldvr, &work[iwrk], &liwrk, info ); } else { /* Compute eigenvalues only * (CWorkspace: need 1, prefer HSWORK (see comments) ) * (RWorkspace: N) * - including N reserved for gebal/gebak, unused by chseqr */ iwrk = itau; liwrk = lwork - iwrk; lapackf77_chseqr( "E", "N", &n, &ilo, &ihi, A, &lda, w, VR, &ldvr, &work[iwrk], &liwrk, info ); } /* If INFO > 0 from CHSEQR, then quit */ if (*info > 0) { goto CLEANUP; } if (wantvl || wantvr) { /* Compute left and/or right eigenvectors * (CWorkspace: need 2*N) * (RWorkspace: need 2*N) * - including N reserved for gebal/gebak, unused by ctrevc */ irwork = ibal + n; #if TREVC_VERSION == 1 lapackf77_ctrevc( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr ); #elif TREVC_VERSION == 2 liwrk = lwork - iwrk; lapackf77_ctrevc3( lapack_side_const(side), "B", select, &n, A, &lda, VL, &ldvl, VR, &ldvr, &n, &nout, &work[iwrk], &liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 3 magma_ctrevc3( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 4 magma_ctrevc3_mt( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #elif TREVC_VERSION == 5 magma_ctrevc3_mt_gpu( side, MagmaBacktransVec, select, n, A, lda, VL, ldvl, VR, ldvr, n, &nout, &work[iwrk], liwrk, &rwork[irwork], &ierr ); #else #error Unknown TREVC_VERSION #endif } if (wantvl) { /* Undo balancing of left eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_cgebak( "B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, VL, &ldvl, &ierr ); /* Normalize left eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / magma_cblas_scnrm2( n, VL(0,i), 1 ); blasf77_csscal( &n, &scl, VL(0,i), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_C_REAL( *VL(k,i) ); d__2 = MAGMA_C_IMAG( *VL(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &rwork[irwork], &ione ) - 1; // subtract 1; k is 0-based tmp = MAGMA_C_CNJG( *VL(k,i) ) / magma_ssqrt( rwork[irwork + k] ); blasf77_cscal( &n, &tmp, VL(0,i), &ione ); *VL(k,i) = MAGMA_C_MAKE( MAGMA_C_REAL( *VL(k,i) ), 0 ); } } if (wantvr) { /* Undo balancing of right eigenvectors * (CWorkspace: none) * (RWorkspace: need N) */ lapackf77_cgebak( "B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, VR, &ldvr, &ierr ); /* Normalize right eigenvectors and make largest component real */ for (i = 0; i < n; ++i) { scl = 1. / magma_cblas_scnrm2( n, VR(0,i), 1 ); blasf77_csscal( &n, &scl, VR(0,i), &ione ); for (k = 0; k < n; ++k) { /* Computing 2nd power */ d__1 = MAGMA_C_REAL( *VR(k,i) ); d__2 = MAGMA_C_IMAG( *VR(k,i) ); rwork[irwork + k] = d__1*d__1 + d__2*d__2; } k = blasf77_isamax( &n, &rwork[irwork], &ione ) - 1; // subtract 1; k is 0-based tmp = MAGMA_C_CNJG( *VR(k,i) ) / magma_ssqrt( rwork[irwork + k] ); blasf77_cscal( &n, &tmp, VR(0,i), &ione ); *VR(k,i) = MAGMA_C_MAKE( MAGMA_C_REAL( *VR(k,i) ), 0 ); } } CLEANUP: /* Undo scaling if necessary */ if (scalea) { // converged eigenvalues, stored in WR[i+1:n] and WI[i+1:n] for i = INFO magma_int_t nval = n - (*info); magma_int_t ld = max( nval, 1 ); lapackf77_clascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w + (*info), &ld, &ierr ); if (*info > 0) { // first ilo columns were already upper triangular, // so the corresponding eigenvalues are also valid. 
nval = ilo - 1; lapackf77_clascl( "G", &izero, &izero, &cscale, &anrm, &nval, &ione, w, &n, &ierr ); } } #if defined(Version3) || defined(Version4) || defined(Version5) magma_free( dT ); #endif #if defined(Version4) || defined(Version5) magma_free_cpu( T ); #endif work[0] = MAGMA_C_MAKE( (float) minwrk, 0. ); // TODO use optwrk as in dgeev return *info; } /* magma_cgeev */
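/* Example (illustrative sketch, not part of the MAGMA sources): a typical call
   sequence for magma_cgeev_m, assuming the complex interface (COMPLEX defined)
   and an already initialized MAGMA library.  Only right eigenvectors are
   requested; the helper name and its arguments are hypothetical and error
   checking is omitted for brevity. */
void example_cgeev_m( magma_int_t n, magmaFloatComplex *A, magma_int_t lda )
{
    magma_int_t info, ldvr = n;
    magmaFloatComplex *w, *VR, *work;
    magmaFloatComplex query;
    float *rwork;
    magma_cmalloc_cpu( &w,  n );
    magma_cmalloc_cpu( &VR, ldvr*n );
    magma_smalloc_cpu( &rwork, 2*n );

    /* workspace query: lwork = -1 only returns the optimal size in work[0];
       VL = NULL is not referenced since jobvl = MagmaNoVec */
    magma_cgeev_m( MagmaNoVec, MagmaVec, n, A, lda, w,
                   NULL, 1, VR, ldvr, &query, -1, rwork, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_C_REAL( query );
    magma_cmalloc_cpu( &work, lwork );

    /* actual computation: eigenvalues in w, right eigenvectors in VR */
    magma_cgeev_m( MagmaNoVec, MagmaVec, n, A, lda, w,
                   NULL, 1, VR, ldvr, work, lwork, rwork, &info );

    magma_free_cpu( w );     magma_free_cpu( VR );
    magma_free_cpu( work );  magma_free_cpu( rwork );
}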
/** Purpose ------- CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H) If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L. B must have been previously factorized as U**H*U or L*L**H by CPOTRF. Arguments --------- @param[in] itype INTEGER = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H); = 2 or 3: compute U*A*U**H or L**H*A*L. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored and B is factored as U**H*U; - = MagmaLower: Lower triangle of A is stored and B is factored as L*L**H. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the transformed matrix, stored in the same format as A. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] B COMPLEX array, dimension (LDB,N) The triangular factor from the Cholesky factorization of B, as returned by CPOTRF. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_chegst(magma_int_t itype, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define B(i, j) (B + (j)*ldb + (i)) #define dA(i, j) (dw + (j)*ldda + (i)) #define dB(i, j) (dw + n*ldda + (j)*lddb + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nb; magma_int_t k, kb, kb2; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF; magmaFloatComplex *dw; magma_int_t ldda = n; magma_int_t lddb = n; float d_one = 1.0; int upper = (uplo == MagmaUpper); /* Test the input parameters. */ *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! 
upper && uplo != MagmaLower) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (ldb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; if (MAGMA_SUCCESS != magma_cmalloc( &dw, 2*n*n )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } nb = magma_get_chegst_nb(n); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_csetmatrix( n, n, A(0, 0), lda, dA(0, 0), ldda ); magma_csetmatrix( n, n, B(0, 0), ldb, dB(0, 0), lddb ); /* Use hybrid blocked code */ if (itype == 1) { if (upper) { /* Compute inv(U')*A*inv(U) */ for (k = 0; k < n; k += nb) { kb = min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the upper triangle of A(k:n,k:n) */ lapackf77_chegst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { magma_ctrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k,k), lddb, dA(k,k+kb), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_cher2k(MagmaUpper, MagmaConjTrans, n-k-kb, kb, c_neg_one, dA(k,k+kb), ldda, dB(k,k+kb), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(k+kb, k+kb), lda, stream[1] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_ctrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k+kb,k+kb), lddb, dA(k,k+kb), ldda); magma_queue_sync( stream[1] ); } } magma_queue_sync( stream[0] ); } else { /* Compute inv(L)*A*inv(L') */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the lower triangle of A(k:n,k:n) */ lapackf77_chegst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { magma_ctrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k,k), lddb, dA(k+kb,k), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_cher2k(MagmaLower, MagmaNoTrans, n-k-kb, kb, c_neg_one, dA(k+kb,k), ldda, dB(k+kb,k), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(k+kb, k+kb), lda, stream[1] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k+kb,k+kb), lddb, dA(k+kb,k), ldda); } magma_queue_sync( stream[1] ); } } magma_queue_sync( stream[0] ); } else { if (upper) { /* Compute U*A*U' */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(k, k), lda, stream[0] ); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, k, kb, c_one, dB(0,0), lddb, dA(0,k), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_queue_sync( stream[1] ); magma_cher2k(MagmaUpper, MagmaNoTrans, k, kb, c_one, dA(0,k), ldda, dB(0,k), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, 
c_one, dA(0, k), ldda); magma_ctrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, k, kb, c_one, dB(k,k), lddb, dA(0,k), ldda); } magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(k, k), &lda, B(k, k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } else { /* Compute L'*A*L */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(k, k), lda, stream[0] ); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, kb, k, c_one, dB(0,0), lddb, dA(k,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_queue_sync( stream[1] ); magma_cher2k(MagmaLower, MagmaConjTrans, k, kb, c_one, dA(k,0), ldda, dB(k,0), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_ctrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, kb, k, c_one, dB(k,k), lddb, dA(k,0), ldda); } magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(k,k), &lda, B(k,k), &ldb, info); magma_csetmatrix_async( kb, kb, A(k, k), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } } magma_cgetmatrix( n, n, dA(0, 0), ldda, A(0, 0), lda ); magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dw ); return *info; } /* magma_chegst_gpu */
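/* Example (illustrative sketch, not part of the MAGMA sources): reducing the
   Hermitian-definite generalized problem A*x = lambda*B*x (ITYPE = 1) to the
   standard form described above.  B is Cholesky-factored first, as CHEGST
   requires; the helper name and its arguments are hypothetical and error
   checks are omitted. */
void example_chegst( magma_int_t n,
                     magmaFloatComplex *A, magma_int_t lda,
                     magmaFloatComplex *B, magma_int_t ldb )
{
    magma_int_t info;

    /* factor B = L * L**H */
    magma_cpotrf( MagmaLower, n, B, ldb, &info );

    /* overwrite A with inv(L) * A * inv(L**H) */
    magma_chegst( 1, MagmaLower, n, A, lda, B, ldb, &info );

    /* A can now be passed to a standard Hermitian eigensolver (e.g. cheevd);
       eigenvectors of the original problem are recovered afterwards with a
       triangular solve against L**H */
}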
extern "C" magma_int_t magma_cgelqf_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGELQF computes an LQ factorization of a COMPLEX M-by-N matrix dA: dA = L * Q. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). LDA (input) INTEGER The leading dimension of the array dA. LDA >= max(1,M). TAU (output) COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -10 internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( dA+(a_2)*(lda) + (a_1)) magmaFloatComplex *dAT; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t maxm, maxn, maxdim, nb; magma_int_t iinfo; int lquery; *info = 0; nb = magma_get_cgelqf_nb(m); work[0] = MAGMA_C_MAKE( (float)(m*nb), 0 ); lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min(m, n) == 0) { work[0] = c_one; return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); int ldat = maxn; dAT = dA; if ( m == n ) { ldat = lda; magmablas_ctranspose_inplace( m, dAT, lda ); } else { if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn ) ){ *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose2( dAT, ldat, dA, lda, m, n ); } magma_cgeqrf2_gpu(n, m, dAT, ldat, tau, &iinfo); if ( m == n ) { magmablas_ctranspose_inplace( m, dAT, ldat ); } else { magmablas_ctranspose2( dA, lda, dAT, ldat, n, m ); magma_free( dAT ); } return *info; } /* magma_cgelqf_gpu */
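/* Example (illustrative sketch, not part of the MAGMA sources): LQ-factorizing a
   matrix that already resides on the GPU.  The workspace is placed in pinned host
   memory as the comments above recommend; the helper name and its arguments are
   hypothetical and error handling is omitted. */
void example_cgelqf_gpu( magma_int_t m, magma_int_t n,
                         magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t info, nb = magma_get_cgelqf_nb( m );
    magma_int_t lwork = m*nb;
    magmaFloatComplex *tau, *work;
    magma_cmalloc_cpu( &tau, min(m,n) );
    magma_cmalloc_pinned( &work, lwork );

    magma_cgelqf_gpu( m, n, dA, ldda, tau, work, lwork, &info );
    /* on exit, dA holds L on and below the diagonal and the reflectors of Q above it */

    magma_free_cpu( tau );
    magma_free_pinned( work );
}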
extern "C" magma_int_t magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of each big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a memory allocation failure. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) //#define PROFILE #ifdef PROFILE float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's divisible by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for a scalar or a single tile. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ 
magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (magma_queue_t **)stream, &iinfo); magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS_CGETRF( m, n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
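/* Example (illustrative sketch, not part of the MAGMA sources): out-of-core LU
   factorization of a square host matrix that may exceed GPU memory, followed by a
   CPU triangular solve with the resulting factors.  The helper name and its
   arguments are hypothetical; pinned memory for A is recommended by the
   documentation above, and error checks are omitted. */
void example_cgetrf_m( magma_int_t ngpu, magma_int_t n,
                       magmaFloatComplex *A, magma_int_t lda,
                       magmaFloatComplex *b )   /* right-hand side, length n */
{
    magma_int_t info, ione = 1, *ipiv;
    magma_malloc_cpu( (void**) &ipiv, n*sizeof(magma_int_t) );

    /* P*L*U factorization; pivots come back in the LAPACK (1-based) convention */
    magma_cgetrf_m( ngpu, n, n, A, lda, ipiv, &info );

    if ( info == 0 ) {
        /* reuse the factors to solve A*x = b on the CPU */
        lapackf77_cgetrs( MagmaNoTransStr, &n, &ione, A, &lda, ipiv, b, &n, &info );
    }
    magma_free_cpu( ipiv );
}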
/** Purpose ------- CTRTRI computes the inverse of a complex upper or lower triangular matrix A. This is the Level 3 BLAS version of the algorithm. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: A is upper triangular; - = MagmaLower: A is lower triangular. @param[in] diag magma_diag_t - = MagmaNonUnit: A is non-unit triangular; - = MagmaUnit: A is unit triangular. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the triangular matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = MagmaUnit, the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, A(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_ctrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i_, j_) ( A + (i_) + (j_)*lda ) #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif // Constants const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const char* uplo_ = lapack_uplo_const( uplo ); const char* diag_ = lapack_diag_const( diag ); // Local variables magma_int_t ldda, nb, nn, j, jb; magmaFloatComplex_ptr dA; bool upper = (uplo == MagmaUpper); bool nounit = (diag == MagmaNonUnit); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (!
nounit && diag != MagmaUnit) *info = -2; else if (n < 0) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if ( n == 0 ) return *info; // Check for singularity if non-unit if (nounit) { for (j=0; j < n; ++j) { if ( MAGMA_C_EQUAL( *A(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } // Determine the block size for this environment nb = magma_get_cpotrf_nb( n ); ldda = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_cmalloc( &dA, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); //magma_queue_create( cdev, &queues[1] ); // unused if (nb <= 1 || nb >= n) { lapackf77_ctrtri( uplo_, diag_, &n, A, &lda, info ); } else if (upper) { // Compute inverse of upper triangular matrix for (j=0; j < n; j += nb) { jb = min( nb, n-j ); if (j > 0) { // Send current block column (with diagonal) to device // This must finish before trtri below magma_csetmatrix( j+jb, jb, A(0,j), lda, dA(0,j), ldda, queues[0] ); // Compute rows 0:j of current block column magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, diag, j, jb, c_one, dA(0,0), ldda, dA(0,j), ldda, queues[0] ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, diag, j, jb, c_neg_one, dA(j,j), ldda, dA(0,j), ldda, queues[0] ); // Get above diagonal from device // TODO: could be on another queue, after trmm/trsm finish magma_cgetmatrix_async( j, jb, dA(0,j), ldda, A(0,j), lda, queues[0] ); } // Compute inverse of current diagonal block // TODO: problem if diagonal has not finished sending yet? lapackf77_ctrtri( MagmaUpperStr, diag_, &jb, A(j,j), &lda, info ); if (j+jb < n) { // Send inverted diagonal block to device magma_csetmatrix( jb, jb, A(j,j), lda, dA(j,j), ldda, queues[0] ); } } } else { // Compute inverse of lower triangular matrix nn = ((n-1)/nb)*nb; for (j=nn; j >= 0; j -= nb) { jb = min( nb, n-j ); if (j+jb < n) { // Send current block row (with diagonal) to device // This must finish before trtri below magma_csetmatrix( n-j, jb, A(j,j), lda, dA(j,j), ldda, queues[0] ); // Compute rows j+jb:n of current block column magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, diag, n-j-jb, jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb,j), ldda, queues[0] ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, diag, n-j-jb, jb, c_neg_one, dA(j,j), ldda, dA(j+jb,j), ldda, queues[0] ); // Get below diagonal block from device magma_cgetmatrix_async( n-j-jb, jb, dA(j+jb,j), ldda, A(j+jb,j), lda, queues[0] ); } // Compute inverse of current diagonal block lapackf77_ctrtri( MagmaLowerStr, diag_, &jb, A(j,j), &lda, info ); if (j > 0) { // Send inverted diagonal block to device magma_csetmatrix( jb, jb, A(j,j), lda, dA(j,j), ldda, queues[0] ); } } } magma_queue_destroy( queues[0] ); //magma_queue_destroy( queues[1] ); // unused magma_free( dA ); return *info; }
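/* Example (illustrative sketch, not part of the MAGMA sources): inverting the
   triangular Cholesky factor of a Hermitian positive definite matrix, the typical
   first half of a POTRI-style inversion.  The helper name and its arguments are
   hypothetical and error handling is omitted. */
void example_ctrtri( magma_int_t n, magmaFloatComplex *A, magma_int_t lda )
{
    magma_int_t info;

    /* A = U**H * U; the upper triangle of A now holds U */
    magma_cpotrf( MagmaUpper, n, A, lda, &info );

    if ( info == 0 ) {
        /* overwrite the upper triangle of A with inv(U) */
        magma_ctrtri( MagmaUpper, MagmaNonUnit, n, A, lda, &info );
        if ( info > 0 ) {
            /* A(info,info) was exactly zero: the factor is singular */
        }
    }
}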
extern "C" magma_int_t magma_cmtransposeconjugate( magma_c_matrix A, magma_c_matrix *B, magma_queue_t queue ) { // for symmetric matrices: convert to csc using cusparse magma_int_t info = 0; cusparseHandle_t handle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrB=NULL; magma_c_matrix ACSR={Magma_CSR}, BCSR={Magma_CSR}; magma_c_matrix A_d={Magma_CSR}, B_d={Magma_CSR}; if( A.storage_type == Magma_CSR && A.memory_location == Magma_DEV ) { // fill in information for B B->storage_type = A.storage_type; B->diagorder_type = A.diagorder_type; B->memory_location = Magma_DEV; B->num_rows = A.num_cols; // transposed B->num_cols = A.num_rows; // transposed B->nnz = A.nnz; B->true_nnz = A.true_nnz; if ( A.fill_mode == MagmaFull ) { B->fill_mode = MagmaFull; } else if ( A.fill_mode == MagmaLower ) { B->fill_mode = MagmaUpper; } else if ( A.fill_mode == MagmaUpper ) { B->fill_mode = MagmaLower; } B->dval = NULL; B->drow = NULL; B->dcol = NULL; // memory allocation CHECK( magma_cmalloc( &B->dval, B->nnz )); CHECK( magma_index_malloc( &B->drow, B->num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, B->nnz )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &handle )); CHECK_CUSPARSE( cusparseSetStream( handle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrB )); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrB, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrB, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseCcsr2csc( handle, A.num_rows, A.num_cols, A.nnz, A.dval, A.drow, A.dcol, B->dval, B->dcol, B->drow, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO) ); CHECK( magma_cmconjugate( B, queue )); } else if ( A.memory_location == Magma_CPU ){ CHECK( magma_cmtransfer( A, &A_d, A.memory_location, Magma_DEV, queue )); CHECK( magma_cmtransposeconjugate( A_d, &B_d, queue )); CHECK( magma_cmtransfer( B_d, B, Magma_DEV, A.memory_location, queue )); } else { CHECK( magma_cmconvert( A, &ACSR, A.storage_type, Magma_CSR, queue )); CHECK( magma_cmtransposeconjugate( ACSR, &BCSR, queue )); CHECK( magma_cmconvert( BCSR, B, Magma_CSR, A.storage_type, queue )); } cleanup: cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroy( handle ); magma_cmfree( &A_d, queue ); magma_cmfree( &B_d, queue ); magma_cmfree( &ACSR, queue ); magma_cmfree( &BCSR, queue ); if( info != 0 ){ magma_cmfree( B, queue ); } return info; }
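/* Example (illustrative sketch, not part of the MAGMA-sparse sources): forming the
   conjugate transpose A^H of a sparse matrix.  The input may live on the CPU or the
   GPU in any supported storage format; the routine above converts and transfers as
   needed.  The helper name is hypothetical and the usual CHECK error macros are
   omitted. */
magma_int_t example_cmtransposeconjugate( magma_c_matrix A, magma_queue_t queue )
{
    magma_int_t info = 0;
    magma_c_matrix AH = {Magma_CSR};

    info = magma_cmtransposeconjugate( A, &AH, queue );
    /* ... use AH, e.g. to test whether A is Hermitian or to build 0.5*(A + A^H) ... */

    magma_cmfree( &AH, queue );
    return info;
}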
/** Purpose ------- CUNMQL overwrites the general complex M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = Magma_ConjTrans: Q**H * C C * Q**H @endverbatim where Q is a complex unitary matrix defined as the product of k elementary reflectors Q = H(k) . . . H(2) H(1) as returned by CGEQLF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**H from the Left; - = MagmaRight: apply Q or Q**H from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = Magma_ConjTrans: Conjugate transpose, apply Q**H. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] dA COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQLF in the last k columns of its array argument A. The diagonal and the lower part are destroyed, the reflectors are not modified. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = MagmaLeft; LDDA >= max(1,N) if SIDE = MagmaRight. @param[in] tau COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQLF. @param[in,out] dC COMPLEX array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H*C or C*Q**H or C*Q. @param[in] lddc INTEGER The leading dimension of the array C. LDDC >= max(1,M). @param[in] wA (workspace) COMPLEX array, dimension (LDWA,M) if SIDE = MagmaLeft (LDWA,N) if SIDE = MagmaRight The vectors which define the elementary reflectors, as returned by CHETRD_GPU. @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,M) if SIDE = MagmaLeft; LDWA >= max(1,N) if SIDE = MagmaRight. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgeqlf_comp ********************************************************************/ extern "C" magma_int_t magma_cunmql2_gpu(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex *dC, magma_int_t lddc, magmaFloatComplex *wA, magma_int_t ldwa, magma_int_t *info) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dC(i_,j_) (dC + (i_) + (j_)*lddc) #define wA(i_,j_) (wA + (i_) + (j_)*ldwa) /* Allocate work space on the GPU */ magmaFloatComplex *dwork; magma_cmalloc( &dwork, 2*(m + 64)*64 ); magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t i, i__4; magmaFloatComplex T[2*4160] /* was [65][64] */; magma_int_t i1, i2, step, ib, nb, mi, ni, nq, nw; magma_int_t ldwork; int left, notran; wA -= 1 + ldwa; dC -= 1 + lddc; --tau; *info = 0; left = (side == MagmaLeft); notran = (trans == MagmaNoTrans); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = max(1,n); } else { nq = n; nw = max(1,m); } if (! left && side != MagmaRight) { *info = -1; } else if (! 
notran && trans != Magma_ConjTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } // size of the block nb = 64; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) { return *info; } ldwork = nw; /* Use hybrid CPU-GPU code */ if ((left && notran) || (! left && ! notran)) { i1 = 1; i2 = k; step = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; } else { mi = m; } // set nb-1 sub-diagonals to 0, and diagonal to 1. // This way we can copy V directly to the GPU, // already with the lower triangle parts already set to identity. magmablas_claset_band( MagmaLower, k, k, nb, c_zero, c_one, dA, ldda ); for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) { ib = min(nb, k - i + 1); /* Form the triangular factor of the block reflector H = H(i+ib-1) . . . H(i+1) H(i) */ i__4 = nq - k + i + ib - 1; lapackf77_clarft("Backward", "Columnwise", &i__4, &ib, wA(1,i), &ldwa, &tau[i], T, &ib); if (left) { /* H or H' is applied to C(1:m-k+i+ib-1,1:n) */ mi = m - k + i + ib - 1; } else { /* H or H' is applied to C(1:m,1:n-k+i+ib-1) */ ni = n - k + i + ib - 1; } /* Apply H or H'; First copy T to the GPU */ magma_csetmatrix( ib, ib, T, ib, dwork+i__4*ib, ib ); magma_clarfb_gpu(side, trans, MagmaBackward, MagmaColumnwise, mi, ni, ib, dA(0,i-1), ldda, dwork+i__4*ib, ib, // dA using 0-based indices here dC(1,1), lddc, dwork+i__4*ib + ib*ib, ldwork); } magma_free( dwork ); return *info; } /* magma_cunmql */
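/* Example (illustrative sketch, not part of the MAGMA sources): applying Q**H from
   a QL factorization to an m-by-nrhs matrix C stored on the GPU.  A (host, m-by-k,
   m >= k) is factored with magma_cgeqlf, and the reflectors are then needed both on
   the host (passed as wA) and on the device (dA).  The helper name, the workspace
   size lwork = k*nb, and the assumption lddc >= m are illustrative only; error
   handling is omitted. */
void example_cunmql2_gpu( magma_int_t m, magma_int_t k, magma_int_t nrhs,
                          magmaFloatComplex *A,  magma_int_t lda,
                          magmaFloatComplex *dC, magma_int_t lddc )
{
    magma_int_t info, nb = magma_get_cgeqlf_nb( m );
    magma_int_t lwork = k*nb;
    magma_int_t ldda  = magma_roundup( m, 32 );
    magmaFloatComplex *tau, *work, *dA;
    magma_cmalloc_cpu( &tau,  k );
    magma_cmalloc_cpu( &work, lwork );
    magma_cmalloc( &dA, ldda*k );

    /* QL factorization on the CPU; reflectors end up in the last k columns of A */
    magma_cgeqlf( m, k, A, lda, tau, work, lwork, &info );

    /* device copy of the reflectors */
    magma_csetmatrix( m, k, A, lda, dA, ldda );

    /* C := Q**H * C */
    magma_cunmql2_gpu( MagmaLeft, Magma_ConjTrans, m, nrhs, k,
                       dA, ldda, tau, dC, lddc, A, lda, &info );

    magma_free( dA );
    magma_free_cpu( tau );
    magma_free_cpu( work );
}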
/** Purpose ------- Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization of A. The underdetermined problem (m < n) is not currently handled. Arguments --------- @param[in] trans magma_trans_t - = MagmaNoTrans: the linear system involves A. Only TRANS=MagmaNoTrans is currently handled. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by CGEQRF3. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= M. @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array HWORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_cgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgels_driver ********************************************************************/ extern "C" magma_int_t magma_cgels3_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dB, magma_int_t lddb, magmaFloatComplex *hwork, magma_int_t lwork, magma_int_t *info) { magmaFloatComplex *dT, *tau; magma_int_t k; magma_int_t nb = magma_get_cgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_C_MAKE( (float)lwkopt, 0. ); *info = 0; /* For now, MagmaNoTrans is the only case handled */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handled for now */ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_C_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_cmalloc( &dT, ldtwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_cmalloc_cpu( &tau, k ); if ( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgeqrf3_gpu( m, n, dA, ldda, tau, dT, info ); if ( *info == 0 ) { magma_cgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hwork, lwork, info ); } magma_free( dT ); magma_free_cpu(tau); return *info; }
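/* Example (illustrative sketch, not part of the MAGMA sources): solving the
   overdetermined least-squares problem min || A*X - B || with A and B already on
   the GPU.  The workspace size follows the LWORK formula documented above; the
   helper name and its arguments are hypothetical and error handling is omitted. */
void example_cgels3_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs,
                         magmaFloatComplex *dA, magma_int_t ldda,
                         magmaFloatComplex *dB, magma_int_t lddb )
{
    magma_int_t info, nb = magma_get_cgeqrf_nb( m );
    magma_int_t lwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    magmaFloatComplex *hwork;
    magma_cmalloc_cpu( &hwork, lwork );

    magma_cgels3_gpu( MagmaNoTrans, m, n, nrhs, dA, ldda, dB, lddb,
                      hwork, lwork, &info );
    /* on exit, the leading n-by-nrhs block of dB holds the solution X */

    magma_free_cpu( hwork );
}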
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaFloatComplex *A, *B, *C, *C2, *LU; magmaFloatComplex *dA, *dB, *dC1, *dC2; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.5, 0.1 ); magmaFloatComplex beta = MAGMA_C_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_cmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_cmalloc( &dA, size ); assert( err == 0 ); err = magma_cmalloc( &dB, size ); assert( err == 0 ); err = magma_cmalloc( &dC1, size ); assert( err == 0 ); err = magma_cmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test CSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetmatrix( m, n, A, ld, dB, ld ); magma_cswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_cswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_cgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_clange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "cswap diff %.2g\n", error ); } else { printf( "cswap skipped for n < 3\n" ); } // ----- test ICAMAX // get argmax of column of A magma_csetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_icamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIcamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? 
assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "icamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test CGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetvector( maxn, B, 1, dB, 1 ); magma_csetvector( maxn, C, 1, dC1, 1 ); magma_csetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_cgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasCaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMV( m, n ) / 1e9; printf( "cgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetvector( m, B, 1, dB, 1 ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_chemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMV( m ) / 1e9; printf( "chemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. 
lapackf77_clacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_cgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_csetmatrix( m, m, LU, ld, dA, ld ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ctrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ctrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test CGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_csetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_csetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMM( m, n, k ) / 1e9; printf( "cgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetmatrix( m, n, B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_chemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; 
gflops = FLOPS_CHEMM( side[is], m, n ) / 1e9; printf( "chemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_csetmatrix( n, k, A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHERK( k, n ) / 1e9; printf( "cherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_csetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHER2K( k, n ) / 1e9; printf( "cher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. 
t2 = magma_sync_wtime( 0 ); cublasCtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRMM( side[is], m, n ) / 1e9; printf( "ctrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test CTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( side[is], m, n ) / 1e9; printf( "ctrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
/** Purpose ------- CGELQF computes an LQ factorization of a COMPLEX M-by-N matrix A: A = L * Q. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -10 internal GPU memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). @ingroup magma_cgelqf_comp ********************************************************************/ extern "C" magma_int_t magma_cgelqf( magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info) { magmaFloatComplex *dA, *dAT; magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t maxm, maxn, maxdim, nb; magma_int_t iinfo, ldda; int lquery; /* Function Body */ *info = 0; nb = magma_get_cgelqf_nb(m); work[0] = MAGMA_C_MAKE( (float)(m*nb), 0 ); lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min(m, n) == 0) { work[0] = c_one; return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); if (maxdim*maxdim < 2*maxm*maxn) { ldda = maxdim; if (MAGMA_SUCCESS != magma_cmalloc( &dA, maxdim*maxdim )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_csetmatrix( m, n, A, lda, dA, ldda ); dAT = dA; magmablas_ctranspose_inplace( ldda, dAT, ldda ); } else { ldda = maxn; if (MAGMA_SUCCESS != magma_cmalloc( &dA, 2*maxn*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_csetmatrix( m, n, A, lda, dA, maxm ); dAT = dA + maxn * maxm; magmablas_ctranspose( m, n, dA, maxm, dAT, ldda ); } magma_cgeqrf2_gpu(n, m, dAT, ldda, tau, &iinfo); if (maxdim*maxdim < 2*maxm*maxn) { magmablas_ctranspose_inplace( ldda, dAT, ldda ); magma_cgetmatrix( m, n, dA, ldda, A, lda ); } else { magmablas_ctranspose( n, m, dAT, ldda, dA, maxm ); magma_cgetmatrix( m, n, dA, maxm, A, lda ); } magma_free( dA ); return *info; } /* magma_cgelqf */
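/*
   Illustrative usage sketch for magma_cgelqf above (not part of the library
   source). It follows the two-step pattern from the documentation: a
   workspace query with lwork = -1, then the actual LQ factorization. The
   allocation sizes and the use of pinned host memory are assumptions made
   for the example; error handling is kept minimal.
*/
#include "magma.h"

static magma_int_t example_cgelqf( magma_int_t m, magma_int_t n )
{
    magma_int_t info, lda = m;
    magma_int_t min_mn = (m < n ? m : n);
    magmaFloatComplex *A, *tau, *work, query;

    // pinned host memory gives higher transfer performance, as noted above
    magma_cmalloc_pinned( &A, lda*n );
    magma_malloc_cpu( (void**) &tau, min_mn*sizeof(magmaFloatComplex) );

    // ... fill A with the m-by-n input matrix here (omitted) ...

    // 1) workspace query: only work[0] (here: query) is set, nothing is factored
    magma_cgelqf( m, n, A, lda, tau, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) MAGMA_C_REAL( query );
    magma_cmalloc_pinned( &work, lwork );

    // 2) factorization: L ends up on and below the diagonal of A, the
    //    reflectors defining Q above the diagonal together with tau
    magma_cgelqf( m, n, A, lda, tau, work, lwork, &info );

    magma_free_pinned( work );
    magma_free_cpu( tau );
    magma_free_pinned( A );
    return info;
}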
extern "C" magma_int_t magma_cunmqr2_gpu(const char side, const char trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *da, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex *dc, magma_int_t lddc, magmaFloatComplex *wa, magma_int_t ldwa, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. DA (input) COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF in the first k columns of its array argument A. The diagonal and the upper part are destroyed, the reflectors are not modified. LDDA (input) INTEGER The leading dimension of the array DA. LDDA >= max(1,M) if SIDE = 'L'; LDDA >= max(1,N) if SIDE = 'R'. TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF. DC (device input/output) COMPLEX array, dimension (LDDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by (Q*C) or (Q**H * C) or (C * Q**H) or (C*Q). LDDC (input) INTEGER The leading dimension of the array C. LDDC >= max(1,M). WA (input/workspace) COMPLEX array, dimension (LDWA,M) if SIDE = 'L' (LDWA,N) if SIDE = 'R' The vectors which define the elementary reflectors, as returned by CHETRD_GPU. LDWA (input) INTEGER The leading dimension of the array A. LDWA >= max(1,M) if SIDE = 'L'; LDWA >= max(1,N) if SIDE = 'R'. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ char side_[2] = {side, 0}; char trans_[2] = {trans, 0}; /* Allocate work space on the GPU */ magmaFloatComplex *dwork; magma_int_t wa_offset, dc_offset, i__4, lddwork; magma_int_t i; magmaFloatComplex t[2*4160] /* was [65][64] */; magma_int_t i1, i2, step, ib, ic, jc, nb, mi, ni, nq, nw; int left, notran; wa_offset = 1 + ldwa; wa -= wa_offset; --tau; dc_offset = 1 + lddc; dc -= dc_offset; *info = 0; left = lapackf77_lsame(side_, "L"); notran = lapackf77_lsame(trans_, "N"); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; magma_cmalloc( &dwork, (n + 64)*64 ); } else { nq = n; nw = m; magma_cmalloc( &dwork, (m + 64)*64 ); } if (! left && ! lapackf77_lsame(side_, "R")) { *info = -1; } else if (! notran && ! 
lapackf77_lsame(trans_, "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (ldda < max(1,nq)) { *info = -7; } else if (lddc < max(1,m)) { *info = -10; } else if (ldwa < max(1,nq)) { *info = -12; } // size of the block nb = 64; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { return *info; } /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; step = nb; } else { i1 = ((k - 1)/nb)*nb + 1; i2 = 1; step = -nb; } // silence "uninitialized" warnings mi = 0; ni = 0; if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } magmablas_csetdiag1subdiag0('L', k, nb, da, ldda); // for i=i1 to i2 by step for (i = i1; (step < 0 ? i >= i2 : i <= i2); i += step) { ib = min(nb, k - i + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i + 1; lapackf77_clarft("F", "C", &i__4, &ib, &wa[i + i*ldwa], &ldwa, &tau[i], t, &ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i + 1; ic = i; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i + 1; jc = i; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_csetmatrix( ib, ib, t, ib, dwork, ib ); magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, da + (i - 1) + (i - 1)*ldda , ldda, dwork, ib, &dc[ic + jc*lddc], lddc, dwork + ib*ib, lddwork); } magma_free( dwork ); return *info; } /* magma_cunmqr */
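/*
   Illustrative sketch of calling magma_cunmqr2_gpu above (not part of the
   library source). It assumes the caller already holds the Householder
   vectors from CGEQRF both on the device (dA, m-by-k, ldda >= m) and in a
   host mirror (wA, ldwa >= m), the scalar factors tau on the host, and the
   matrix dC (m-by-n, device) to be overwritten. All names and leading
   dimensions are assumptions of the example.
*/
#include "magma.h"

static magma_int_t example_apply_Q(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex *dA, magma_int_t ldda,
    magmaFloatComplex *tau,
    magmaFloatComplex *wA, magma_int_t ldwa,
    magmaFloatComplex *dC, magma_int_t lddc )
{
    magma_int_t info;
    // side = 'L', trans = 'N': overwrite dC with Q * C;
    // as documented above, the diagonal and upper part of dA are destroyed
    magma_cunmqr2_gpu( 'L', 'N', m, n, k,
                       dA, ldda, tau, dC, lddc, wA, ldwa, &info );
    return info;
}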
extern "C" magma_int_t magma_cpotrf_m(magma_int_t num_gpus0, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF_OOC computes the Cholesky factorization of a complex Hermitian positive definite matrix A. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix A may not fit entirely in the GPU memory. The factorization has the form A = U**H * U, if UPLO = 'U', or A = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U**H * U or A = L * L**H. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ /* Local variables */ float d_one = 1.0; float d_neg_one = -1.0; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; char uplo_[2] = {uplo, 0}; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dwork[MagmaMaxGPUs], *dt[MagmaMaxGPUs]; magma_int_t ldda, lddla, nb, iinfo, n_local[MagmaMaxGPUs], J2, d, num_gpus; magma_int_t j, jj, jb, J, JB, NB, MB, h; magma_queue_t stream[MagmaMaxGPUs][3]; magma_event_t event[MagmaMaxGPUs][5]; #ifdef ROW_MAJOR_PROFILE magma_timestr_t start, end, start0, end0; float chol_time = 1.0; #endif *info = 0; if ((! upper) && (! 
lapackf77_lsame(uplo_, "L"))) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_dpotrf_nb(n); if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } //ldda = ((n+31)/32)*32; ldda = ((n+nb-1)/nb)*nb; lddla = ((nb*((n+nb*num_gpus-1)/(nb*num_gpus))+31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); MB = n; /* number of rows in the big panel */ NB = (magma_int_t)((0.8*freeMem-max(2,num_gpus)*nb*ldda-(n+nb)*nb)/lddla); /* number of columns in the big panel */ //NB = min(5*nb,n); if( NB >= n ) { #ifdef CHECK_CPOTRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CPOTRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = (NB/nb) * nb; /* making sure it's devisable by nb */ } #ifdef CHECK_CPOTRF_OOC if( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, lddla=%d, freeMem=%.2e).\n",n,NB,nb,lddla,(float)freeMem ); fflush(stdout); #endif for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dt[d], NB*lddla + max(2,num_gpus)*nb*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork[d] = &dt[d][max(2,num_gpus)*nb*ldda]; for( j=0; j<3; j++ ) magma_queue_create( &stream[d][j] ); for( j=0; j<5; j++ ) magma_event_create( &event[d][j] ); magma_device_sync(); // synch the device } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE start0 = get_current_time(); #endif if (nb <= 1 || nb >= n) { lapackf77_cpotrf(uplo_, &n, a, &lda, info); } else { /* Use hybrid blocked code. */ if (upper) { /* =========================================================== * * Compute the Cholesky factorization A = U'*U. 
* * big panel is divided by block-row and distributed in block * * column cyclic format */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-rows */ magma_chtodpo( num_gpus, &uplo, JB, n, J, J, nb, a, lda, dwork, NB, stream, &iinfo); #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif /* update with the previous big-panels */ for( j=0; j<J; j+=nb ) { /* upload the diagonal of the block column (broadcast to all GPUs) */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( nb, JB, A(j, J), lda, dTup(d, 0, J), nb, stream[d][0] ); n_local[d] = 0; } /* distribute off-diagonal blocks to GPUs */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( nb, jb, A(j, jj), lda, dTup(d, 0, J+JB+n_local[d]), nb, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ /* -- process the big diagonal block of the big panel */ for( jj=0; jj<JB; jj+=nb ) { // jj is 'local' column index within the big panel d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); // the last stream (2) used to process off-diagonal J2 = nb*J2; jb = min(nb,JB-jj); // number of columns in this current block-row magma_cgemm( MagmaConjTrans, MagmaNoTrans, jj, jb, nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+jj), nb, c_one, dAup(d, 0, J2), NB); magma_cherk(MagmaUpper, MagmaConjTrans, jb, nb, d_neg_one, dTup(d, 0, J+jj), nb, d_one, dAup(d, jj, J2), NB); } /* -- process the remaining big off-diagonal block of the big panel */ if( n > J+JB ) { for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = ((n-J)/(nb*num_gpus))*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting the local number of columns in the diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaConjTrans, MagmaNoTrans, JB, n_local[d], nb, c_neg_one, dTup(d, 0, J ), nb, dTup(d, 0, J+JB), nb, c_one, dAup(d, 0, J2), NB); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* end of updates with previous rows */ /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, // dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, JB, n-J, J, J, nb, dwork, NB, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal (and diagonal!!!) 
big panel */ magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, NB, a, lda, dwork, NB, stream, &iinfo); //magma_cdtohpo(num_gpus, &uplo, JB, n, J, J, nb, 0, a, lda, dwork, NB, stream, &iinfo); } } else { /* ========================================================= * * Compute the Cholesky factorization A = L*L'. */ /* for each big-panel */ for( J=0; J<n; J+=NB ) { JB = min(NB,n-J); if( num_gpus0 > (n-J)/nb ) { num_gpus = (n-J)/nb; if( (n-J)%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } /* load the new big-panel by block-columns */ magma_chtodpo( num_gpus, &uplo, n, JB, J, J, nb, a, lda, dwork, lddla, stream, &iinfo); /* update with the previous big-panels */ #ifdef ROW_MAJOR_PROFILE start = get_current_time(); #endif for( j=0; j<J; j+=nb ) { /* upload the diagonal of big panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_csetmatrix_async( JB, nb, A(J, j), lda, dT(d, J, 0), ldda, stream[d][0] ); n_local[d] = 0; } /* upload off-diagonals */ for( jj=J+JB; jj<n; jj+=nb ) { d = ((jj-J)/nb)%num_gpus; magma_setdevice(d); jb = min(nb, n-jj); magma_csetmatrix_async( jb, nb, A(jj, j), lda, dT(d, J+JB+n_local[d], 0), ldda, stream[d][0] ); n_local[d] += jb; } /* wait for the communication */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); } /* update the current big-panel using the previous block-row */ for( jj=0; jj<JB; jj+=nb ) { /* diagonal */ d = (jj/nb)%num_gpus; J2 = jj/(nb*num_gpus); magma_setdevice(d); magmablasSetKernelStream(stream[d][J2%2]); J2 = nb*J2; jb = min(nb,JB-jj); magma_cgemm( MagmaNoTrans, MagmaConjTrans, jb, jj, nb, c_neg_one, dT(d, J+jj, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); magma_cherk(MagmaLower, MagmaNoTrans, jb, nb, d_neg_one, dT(d, J+jj, 0), ldda, d_one, dA(d, J2, jj), lddla); } if( n > J+JB ) { /* off-diagonal */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][2]); /* local number of columns in the big panel */ n_local[d] = (((n-J)/nb)/num_gpus)*nb; if (d < ((n-J)/nb)%num_gpus) n_local[d] += nb; else if (d == ((n-J)/nb)%num_gpus) n_local[d] += (n-J)%nb; /* subtracting local number of columns in diagonal */ J2 = nb*(JB/(nb*num_gpus)); if( d < (JB/nb)%num_gpus ) J2+=nb; n_local[d] -= J2; magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d], JB, nb, c_neg_one, dT(d, J+JB, 0), ldda, dT(d, J, 0), ldda, c_one, dA(d, J2, 0), lddla); } } /* wait for the previous updates */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( jj=0; jj<3; jj++ ) magma_queue_sync( stream[d][jj] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } /* factor the big panel */ h = (JB+nb-1)/nb; // big diagonal of big panel will be on CPU // using two streams //magma_cpotrf2_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, // dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); // using three streams magma_cpotrf3_mgpu(num_gpus, uplo, n-J, JB, J, J, nb, dwork, lddla, dt, ldda, a, lda, h, stream, event, &iinfo); if( iinfo != 0 ) { *info = J+iinfo; break; } #ifdef ROW_MAJOR_PROFILE end = get_current_time(); chol_time += GetTimerValue(start, end); #endif /* upload the off-diagonal big panel */ magma_cdtohpo( num_gpus, &uplo, n, JB, J, J, nb, JB, a, lda, dwork, lddla, stream, &iinfo); } /* end of for J */ } /* if upper */ } /* if nb */ #ifdef ROW_MAJOR_PROFILE end0 = get_current_time(); #endif if( num_gpus0 > n/nb ) { num_gpus = n/nb; if( n%nb != 0 ) num_gpus ++; } else { num_gpus = num_gpus0; } for (d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { if( stream[d][j] != NULL 
) magma_queue_destroy( stream[d][j] ); } magma_free( dt[d] ); for( j=0; j<5; j++ ) { magma_event_destroy( event[d][j] ); } } magma_setdevice(0); #ifdef ROW_MAJOR_PROFILE printf("\n n=%d NB=%d nb=%d\n",n,NB,nb); printf(" Without memory allocation: %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, GetTimerValue(start0, end0), FLOPS_CPOTRF(n)/(1000000*GetTimerValue(start0, end0))); printf(" Performance %f / %f = %f GFlop/s\n", FLOPS_CPOTRF(n)/1000000, chol_time, FLOPS_CPOTRF(n)/(1000000*chol_time)); #endif return *info; } /* magma_cpotrf_ooc */
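/*
   Illustrative usage sketch for magma_cpotrf_m above (not part of the
   library source): out-of-core Cholesky factorization of a Hermitian
   positive definite matrix held in host memory, internally staged through
   up to num_gpus GPUs. Pinned host memory is recommended by the
   documentation above; the matrix contents are omitted and assumed.
*/
#include "magma.h"

static magma_int_t example_cpotrf_m( magma_int_t num_gpus, magma_int_t n )
{
    magma_int_t info, lda = n;
    magmaFloatComplex *A;

    magma_cmalloc_pinned( &A, lda*n );
    // ... fill A with a Hermitian positive definite matrix here (omitted) ...

    // factor the lower triangle: on exit the lower part of A holds L with A = L * L**H
    magma_cpotrf_m( num_gpus, 'L', n, A, lda, &info );
    // info > 0 would mean the leading minor of that order is not positive
    // definite, as documented above

    magma_free_pinned( A );
    return info;
}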
magma_int_t magma_cgmres( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par ){ // prepare solver feedback solver_par->solver = Magma_GMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE, c_mone = MAGMA_C_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; float nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); magmaFloatComplex *H, *HH, *y, *h1; magma_cmalloc_pinned( &H, (ldh+1)*ldh ); magma_cmalloc_pinned( &y, ldh ); magma_cmalloc_pinned( &HH, ldh*ldh ); magma_cmalloc_pinned( &h1, ldh ); // GPU workspace magma_c_vector r, q, q_t; magma_c_vinit( &r, Magma_DEV, dofs, c_zero ); magma_c_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; magmaFloatComplex *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_cmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_cmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_cscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_ccopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_scnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_C_MAKE( nom0, 0. ); magma_csetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_ccopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/||r|| magma_cscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); magma_c_spmv( c_one, A, q_t, c_zero, r ); // r = A q[k] if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_cdotc(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_caxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing cgemv with scnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . 
r magmablas_cgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_ccopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_cgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef SCNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_cgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_cgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef SCNRM2SCALE magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_scnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_cgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_ccopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_cscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_cdotc_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_cdotc(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_C_REAL(H(k+1,k))); } magma_csetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_cgemv(MagmaNoTrans, dofs, m, c_one, q(0), dofs, dy, 1, c_one, x->val, 1); magma_c_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_caxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_C_MAKE( magma_scnrm2(dofs, r.val, 1), 0. 
); // RNorm = H[1][0] = || r || RNorm = MAGMA_C_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_c_vfree(&r); magma_c_vfree(&q); // free GPU streams and events //magma_queue_destroy( stream[0] ); //magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_cgmres */
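/*
   Illustrative driver sketch for magma_cgmres above (not part of the
   library source). It assumes the sparse matrix A has already been
   assembled and transferred to the device (memory_location == Magma_DEV)
   by the caller; the header name "magmasparse.h" and the parameter values
   chosen below (restart, maxiter, epsilon, verbose, ortho) are assumptions
   made for the example, matching the fields read by the solver above.
*/
#include "magma.h"
#include "magmasparse.h"

static magma_int_t example_cgmres( magma_c_sparse_matrix A )
{
    magmaFloatComplex c_one  = MAGMA_C_ONE;
    magmaFloatComplex c_zero = MAGMA_C_ZERO;
    magma_c_vector b, x;

    // right-hand side b = 1 and solution vector x = 0, both on the device
    magma_c_vinit( &b, Magma_DEV, A.num_rows, c_one  );
    magma_c_vinit( &x, Magma_DEV, A.num_rows, c_zero );

    magma_c_solver_par solver_par = {};  // zero unused fields (res_vec, timing, ...)
    solver_par.restart = 30;             // Krylov subspace size before restart
    solver_par.maxiter = 1000;           // maximum number of outer iterations
    solver_par.epsilon = 1e-6;           // stopping tolerance (enters r0 above)
    solver_par.verbose = 0;              // no residual history is recorded
    solver_par.ortho   = Magma_MGS;      // modified Gram-Schmidt orthogonalization

    magma_int_t stat = magma_cgmres( A, b, &x, &solver_par );
    // solver_par.info, numiter, init_res and final_res describe the run

    magma_c_vfree( &b );
    magma_c_vfree( &x );
    return stat;
}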
extern "C" magma_int_t magma_chegvr(magma_int_t itype, char jobz, char range, char uplo, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *b, magma_int_t ldb, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex *z, magma_int_t ldz, magma_int_t *isuppz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= CHEGVR computes all the eigenvalues, and optionally, the eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. Whenever possible, CHEEVR calls CSTEGR to compute the eigenspectrum using Relatively Robust Representations. CSTEGR computes eigenvalues by the dqds algorithm, while orthogonal eigenvectors are computed from various "good" L D L^T representations (also known as Relatively Robust Representations). Gram-Schmidt orthogonalization is avoided as far as possible. More specifically, the various steps of the algorithm are as follows. For the i-th unreduced block of T, (a) Compute T - sigma_i = L_i D_i L_i^T, such that L_i D_i L_i^T is a relatively robust representation, (b) Compute the eigenvalues, lambda_j, of L_i D_i L_i^T to high relative accuracy by the dqds algorithm, (c) If there is a cluster of close eigenvalues, "choose" sigma_i close to the cluster, and go to step (a), (d) Given the approximate eigenvalue lambda_j of L_i D_i L_i^T, compute the corresponding eigenvector by forming a rank-revealing twisted factorization. The desired accuracy of the output can be specified by the input parameter ABSTOL. For more details, see "A new O(n^2) algorithm for the symmetric tridiagonal eigenvalue/eigenvector problem", by Inderjit Dhillon, Computer Science Division Technical Report No. UCB//CSD-97-971, UC Berkeley, May 1997. Note 1 : CHEEVR calls CSTEGR when the full spectrum is requested on machines which conform to the ieee-754 floating point standard. CHEEVR calls SSTEBZ and CSTEIN on non-ieee machines and when partial spectrum requests are made. Normal execution of CSTEGR may create NaNs and infinities and hence may abort due to a floating point exception in environments which do not handle NaNs and infinities in the ieee standard default manner. Arguments ========= ITYPE (input) INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. JOBZ (input) CHARACTER*1 = 'N': Compute eigenvalues only; = 'V': Compute eigenvalues and eigenvectors. UPLO (input) CHARACTER*1 = 'U': Upper triangles of A and B are stored; = 'L': Lower triangles of A and B are stored. N (input) INTEGER The order of the matrices A and B. N >= 0. A (input/output) COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. 
On exit, if JOBZ = 'V', then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. If JOBZ = 'N', then on exit the upper triangle (if UPLO='U') or the lower triangle (if UPLO='L') of A, including the diagonal, is destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). B (input/output) COMPLEX array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = 'U', the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = 'L', the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. ABSTOL (input) REAL The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to ABSTOL + EPS * max( |a|,|b| ) , where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. See "Computing Small Singular Values of Bidiagonal Matrices with Guaranteed High Relative Accuracy," by Demmel and Kahan, LAPACK Working Note #3. If high relative accuracy is important, set ABSTOL to SLAMCH( 'Safe minimum' ). Doing so will guarantee that eigenvalues are computed to high relative accuracy when possible in future releases. The current code does not make any guarantees about high relative accuracy, but furutre releases will. See J. Barlow and J. Demmel, "Computing Accurate Eigensystems of Scaled Diagonally Dominant Matrices", LAPACK Working Note #7, for a discussion of which matrices define their eigenvalues to high relative accuracy. M (output) INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = 'A', M = N, and if RANGE = 'I', M = IU-IL+1. W (output) REAL array, dimension (N) If INFO = 0, the eigenvalues in ascending order. Z (output) COMPLEX array, dimension (LDZ, max(1,M)) If JOBZ = 'V', then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). If JOBZ = 'N', then Z is not referenced. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = 'V', the exact value of M is not known in advance and an upper bound must be used. LDZ (input) INTEGER The leading dimension of the array Z. LDZ >= 1, and if JOBZ = 'V', LDZ >= max(1,N). ISUPPZ (output) INTEGER ARRAY, dimension ( 2*max(1,M) ) The support of the eigenvectors in Z, i.e., the indices indicating the nonzero elements in Z. The i-th eigenvector is nonzero only in elements ISUPPZ( 2*i-1 ) through ISUPPZ( 2*i ). 
********* Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1 WORK (workspace/output) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The length of the array WORK. LWORK >= max(1,2*N). For optimal efficiency, LWORK >= (NB+1)*N, where NB is the max of the blocksize for CHETRD and for CUNMTR as returned by ILAENV. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace/output) REAL array, dimension (LRWORK) On exit, if INFO = 0, RWORK(1) returns the optimal (and minimal) LRWORK. LRWORK (input) INTEGER The length of the array RWORK. LRWORK >= max(1,24*N). If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the RWORK array, returns this value as the first entry of the RWORK array, and no error message related to LRWORK is issued by XERBLA. IWORK (workspace/output) INTEGER array, dimension (LIWORK) On exit, if INFO = 0, IWORK(1) returns the optimal (and minimal) LIWORK. LIWORK (input) INTEGER The dimension of the array IWORK. LIWORK >= max(1,10*N). If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the IWORK array, returns this value as the first entry of the IWORK array, and no error message related to LIWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: Internal error Further Details =============== Based on contributions by Inderjit Dhillon, IBM Almaden, USA Osni Marques, LBNL/NERSC, USA Ken Stanley, Computer Science Division, University of California at Berkeley, USA ===================================================================== */ char uplo_[2] = {uplo, 0}; char jobz_[2] = {jobz, 0}; char range_[2] = {range, 0}; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex *da; magmaFloatComplex *db; magmaFloatComplex *dz; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lddz = n; magma_int_t lower; char trans[1]; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin, lrwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = lapackf77_lsame(jobz_, MagmaVecStr); lower = lapackf77_lsame(uplo_, MagmaLowerStr); alleig = lapackf77_lsame(range_, "A"); valeig = lapackf77_lsame(range_, "V"); indeig = lapackf77_lsame(range_, "I"); lquery = lwork == -1; *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || lapackf77_lsame(jobz_, MagmaNoVecStr))) { *info = -3; } else if (! (lower || lapackf77_lsame(uplo_, MagmaUpperStr))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else if (ldz < 1 || (wantz && ldz < n)) { *info = -18; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_chetrd_nb(n); lwmin = n * (nb + 1); lrwmin = 24 * n; liwmin = 10 * n; work[0] = MAGMA_C_MAKE( lwmin, 0 ); rwork[0] = lrwmin; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -21; } else if ((lrwork < lrwmin) && ! lquery) { *info = -23; } else if ((liwork < liwmin) && ! 
lquery) { *info = -25; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (MAGMA_SUCCESS != magma_cmalloc( &da, n*ldda ) || MAGMA_SUCCESS != magma_cmalloc( &db, n*lddb ) || MAGMA_SUCCESS != magma_cmalloc( &dz, n*lddz )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_csetmatrix( n, n, b, ldb, db, lddb ); magma_csetmatrix_async( n, n, a, lda, da, ldda, stream ); magma_cpotrf_gpu(uplo_[0], n, db, lddb, info); if (*info != 0) { *info = n + *info; return *info; } magma_queue_sync( stream ); magma_cgetmatrix_async( n, n, db, lddb, b, ldb, stream ); /* Transform problem to standard eigenvalue problem and solve. */ magma_chegst_gpu(itype, uplo, n, da, ldda, db, lddb, info); magma_cheevr_gpu(jobz, range, uplo, n, da, ldda, vl, vu, il, iu, abstol, m, w, dz, lddz, isuppz, a, lda, z, ldz, work, lwork, rwork, lrwork, iwork, liwork, info); if (wantz && *info == 0) { /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { *(unsigned char *)trans = MagmaConjTrans; } else { *(unsigned char *)trans = MagmaNoTrans; } magma_ctrsm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, c_one, db, lddb, dz, lddz); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { *(unsigned char *)trans = MagmaNoTrans; } else { *(unsigned char *)trans = MagmaConjTrans; } magma_ctrmm(MagmaLeft, uplo, *trans, MagmaNonUnit, n, *m, c_one, db, lddb, dz, lddz); } magma_cgetmatrix( n, *m, dz, lddz, z, ldz ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); magma_free( da ); magma_free( db ); magma_free( dz ); return *info; } /* chegvr */
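/*
   Illustrative usage sketch for magma_chegvr above (not part of the library
   source): all eigenvalues and eigenvectors of A*x = lambda*B*x (itype = 1,
   range = 'A', lower triangles stored). The workspace sizes are obtained
   with a query (lwork = lrwork = liwork = -1) as documented above; matrix
   contents and host allocation details are assumptions of the example.
*/
#include "magma.h"

static magma_int_t example_chegvr(
    magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda,
    magmaFloatComplex *B, magma_int_t ldb,
    float *w,                               /* eigenvalues, length n  */
    magmaFloatComplex *Z, magma_int_t ldz ) /* eigenvectors, ldz >= n */
{
    magma_int_t info, m, lwork, lrwork, liwork;
    magma_int_t *isuppz, *iwork, iwork_query;
    magmaFloatComplex *work, work_query;
    float *rwork, rwork_query;

    magma_malloc_cpu( (void**) &isuppz, 2*n*sizeof(magma_int_t) );

    // 1) workspace query: only the optimal sizes are returned in the *_query scalars
    magma_chegvr( 1, 'V', 'A', 'L', n, A, lda, B, ldb,
                  0.f, 0.f, 0, 0, 0.f, &m, w, Z, ldz, isuppz,
                  &work_query, -1, &rwork_query, -1, &iwork_query, -1, &info );
    lwork  = (magma_int_t) MAGMA_C_REAL( work_query );
    lrwork = (magma_int_t) rwork_query;
    liwork = iwork_query;

    magma_malloc_cpu( (void**) &work,  lwork *sizeof(magmaFloatComplex) );
    magma_malloc_cpu( (void**) &rwork, lrwork*sizeof(float) );
    magma_malloc_cpu( (void**) &iwork, liwork*sizeof(magma_int_t) );

    // 2) solve: eigenvalues in w, eigenvectors in Z,
    //    B is overwritten by its Cholesky factor as documented above
    magma_chegvr( 1, 'V', 'A', 'L', n, A, lda, B, ldb,
                  0.f, 0.f, 0, 0, 0.f, &m, w, Z, ldz, isuppz,
                  work, lwork, rwork, lrwork, iwork, liwork, &info );

    magma_free_cpu( work );  magma_free_cpu( rwork );
    magma_free_cpu( iwork ); magma_free_cpu( isuppz );
    return info;
}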