/** Returns index of element of vector x having max. absolute value; i.e., max (infinity) norm. @param[in] n Number of elements in vector x. n >= 0. @param[in] dx COMPLEX array on GPU device. The n element vector x of dimension (1 + (n-1)*incx). @param[in] incx Stride between consecutive elements of dx. incx > 0. @ingroup magma_cblas1 */ extern "C" magma_int_t magma_icamax( magma_int_t n, magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx, magma_queue_t queue ) { magma_ptr dimax, scratchBuff; magma_malloc( &dimax, sizeof(unsigned int) ); magma_malloc( &scratchBuff, (2*n+1)*sizeof(magmaFloatComplex) ); cl_int err = clblasiCamax( n, dimax, 0, dx, dx_offset, incx, scratchBuff, 1, &queue, 0, NULL, g_event); unsigned int imax_cpu; magma_getvector( 1, sizeof(unsigned int), dimax, 0, 1, &imax_cpu, 1, queue ); clFlush(queue); magma_free( dimax ); magma_free( scratchBuff ); return imax_cpu; }
SEXP magQR(SEXP a) { SEXP gpu = GET_SLOT(a, install("gpu")), b = PROTECT(NEW_OBJECT(MAKE_CLASS("magmaQR"))); int *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1], MIN_MN = (M < N ? M : N), NB = magma_get_dgeqrf_nb(M), *pivot, info; double *A, *tau; A = REAL(SET_VECTOR_ELT(b, 0, AS_NUMERIC(duplicate(a)))); SET_VECTOR_ELT(b, 1, ScalarInteger(MIN_MN)); tau = REAL(SET_VECTOR_ELT(b, 2, NEW_NUMERIC(MIN_MN))); pivot = INTEGER(SET_VECTOR_ELT(b, 3, NEW_INTEGER(N))); int i; for(i = 1; i <= N; i++) *pivot++ = i; if(LOGICAL_VALUE(gpu)) { int LENT = (2*MIN_MN + (N+31)/32*32)*NB; double *dA, *dT, *work; SET_SLOT(b, install("work"), NEW_NUMERIC(LENT)); work = REAL(GET_SLOT(b, install("work"))); magma_malloc((void**)&dA, (M*N)*sizeof(double)); magma_malloc((void**)&dT, LENT*sizeof(double)); magma_dsetmatrix(M, N, A, M, dA, M); magma_dgeqrf3_gpu(M, N, dA, M, tau, dT, &info); magma_dgetmatrix(M, N, dA, M, A, M); magma_dgetvector(LENT, dT, 1, work, 1); magma_free(dA); magma_free(dT); } else { int LWORK = N * NB; double *hA, *hwork; magma_malloc_pinned((void**)&hA, (M*N)*sizeof(double)); magma_malloc_pinned((void**)&hwork, LWORK*sizeof(double)); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, A, &M, hA, &M); magma_dgeqrf_ooc(M, N, hA, M, tau, hwork, LWORK, &info); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, hA, &M, A, &M); magma_free_pinned(hA); magma_free_pinned(hwork); } if(info < 0) error("illegal argument %d in 'magQR'", -1 * info); UNPROTECT(1); return b; }
SEXP magMultmv(SEXP a, SEXP transa, SEXP x, SEXP right) { SEXP gpu = magGetGPU(a, x), y = PROTECT(NEW_OBJECT(MAKE_CLASS("magma"))); int RHS = LOGICAL_VALUE(right), TA = (LOGICAL_VALUE(transa) ^ !RHS), *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1], LENX = LENGTH(x), LENY = DIMA[TA], LDA=M; char TRANSA = (TA ? 'T' : 'N'); double *A = REAL(PROTECT(AS_NUMERIC(a))), *X = REAL(PROTECT(AS_NUMERIC(x))), *dA, *dX, *dY; if(DIMA[!TA] != LENX) error("non-conformable matrices"); y = SET_SLOT(y, install(".Data"), allocMatrix(REALSXP, (RHS ? LENY : 1), (RHS ? 1 : LENY))); SET_SLOT(y, install("gpu"), duplicate(gpu)); magma_malloc((void**)&dA, (M*N)*sizeof(double)); magma_malloc((void**)&dX, LENX*sizeof(double)); magma_malloc((void**)&dY, LENY*sizeof(double)); magma_dsetmatrix(M, N, A, LDA, dA, LDA); magma_dsetvector(LENX, X, 1, dX, 1); if(LOGICAL_VALUE(gpu)) { magmablas_dgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1); } else { cublasDgemv(TRANSA, M, N, 1.0, dA, LDA, dX, 1, 0.0, dY, 1); } magma_dgetvector(LENY, dY, 1, REAL(y), 1); magma_free(dA); magma_free(dX); magma_free(dY); UNPROTECT(3); return y; }
SEXP magMultmm(SEXP a, SEXP transa, SEXP b, SEXP transb) { SEXP gpu = magGetGPU(a, b), c = PROTECT(NEW_OBJECT(MAKE_CLASS("magma"))); int TA = LOGICAL_VALUE(transa), TB = LOGICAL_VALUE(transb), *DIMA = INTEGER(GET_DIM(a)), *DIMB = INTEGER(GET_DIM(b)), M = DIMA[TA], N = DIMB[!TB], K = DIMA[!TA], LDA = DIMA[0], LDB = DIMB[0], LDC = M; char TRANSA = (TA ? 'T' : 'N'), TRANSB = (TB ? 'T' : 'N'); double *A = REAL(PROTECT(AS_NUMERIC(a))), *B = REAL(PROTECT(AS_NUMERIC(b))), *dA, *dB, *dC; if(DIMB[TB] != K) error("non-conformable matrices"); c = SET_SLOT(c, install(".Data"), allocMatrix(REALSXP, M, N)); SET_SLOT(c, install("gpu"), duplicate(gpu)); magma_malloc((void**)&dA, (M*K)*sizeof(double)); magma_malloc((void**)&dB, (K*N)*sizeof(double)); magma_malloc((void**)&dC, (M*N)*sizeof(double)); magma_dsetmatrix(DIMA[0], DIMA[1], A, LDA, dA, LDA); magma_dsetmatrix(DIMB[0], DIMB[1], B, LDB, dB, LDB); if(LOGICAL_VALUE(gpu)) magmablas_dgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC); else cublasDgemm(TRANSA, TRANSB, M, N, K, 1.0, dA, LDA, dB, LDB, 0.0, dC, LDC); magma_dgetmatrix(M, N, dC, LDC, REAL(c), LDC); magma_free(dA); magma_free(dB); magma_free(dC); UNPROTECT(3); return c; }
SEXP magChol(SEXP a) { SEXP gpu = GET_SLOT(a, install("gpu")), b = PROTECT(NEW_OBJECT(MAKE_CLASS("magma"))); int *DIMA = INTEGER(GET_DIM(a)), N = DIMA[0], N2 = N * N, LDB = N, info; double *B; if(DIMA[1] != N) error("non-square matrix"); b = SET_SLOT(b, install(".Data"), AS_NUMERIC(a)); SET_SLOT(b, install("gpu"), duplicate(gpu)); B = REAL(b); if(LOGICAL_VALUE(gpu)) { double *dB; magma_malloc((void**)&dB, N2*sizeof(double)); magma_dsetmatrix(N, N, B, LDB, dB, LDB); magma_dpotrf_gpu(magma_uplo_const('U'), N, dB, LDB, &info); magma_dgetmatrix(N, N, dB, LDB, B, LDB); magma_free(dB); } else { double *hB; magma_malloc_pinned((void**)&hB, N2*sizeof(double)); lapackf77_dlacpy(MagmaUpperStr, &N, &N, B, &LDB, hB, &LDB); magma_dpotrf(magma_uplo_const('U'), N, hB, N, &info); lapackf77_dlacpy(MagmaUpperStr, &N, &N, hB, &LDB, B, &LDB); magma_free_pinned(hB); } if(info < 0) error("illegal argument %d in 'magChol", -1 * info); else if(info > 0) error("leading minor of order %d is not positive definite", info); int i, j; for(j = 0; j < N; j++) { for(i = j + 1; i < N; i++) { B[i + j * N] = 0.0; } } UNPROTECT(1); return b; }
SEXP magLU(SEXP a) { SEXP gpu = GET_SLOT(a, install("gpu")), b = PROTECT(NEW_OBJECT(MAKE_CLASS("magmaLU"))); int *DIMA = INTEGER(GET_DIM(a)), M = DIMA[0], N = DIMA[1], LDA = M, MIN_MN = M < N ? M : N, *ipiv, info; double *A = REAL(PROTECT(AS_NUMERIC(a))); b = SET_SLOT(b, install(".Data"), AS_NUMERIC(a)); SET_SLOT(b, install("pivot"), NEW_INTEGER(MIN_MN)); ipiv = INTEGER(GET_SLOT(b, install("pivot"))); SET_SLOT(b, install("gpu"), duplicate(gpu)); if(LOGICAL_VALUE(gpu)) { double *dA; magma_malloc((void**)&dA, (M*N)*sizeof(double)); magma_dsetmatrix(M, N, A, LDA, dA, LDA); magma_dgetrf_gpu(M, N, dA, LDA, ipiv, &info); magma_dgetmatrix(M, N, dA, LDA, REAL(b), LDA); magma_free(dA); } else { double *hA; magma_malloc_pinned((void**)&hA, (M*N)*sizeof(double)); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, A, &LDA, hA, &LDA); magma_dgetrf(M, N, hA, LDA, ipiv, &info); lapackf77_dlacpy(MagmaUpperLowerStr, &M, &N, hA, &LDA, REAL(b), &LDA); magma_free_pinned(hA); } if(info < 0) error("illegal argument %d in 'magLU'", -1 * info); else if(info > 0) error("factor U is singular"); UNPROTECT(2); return b; }
extern "C" magma_err_t magma_ssytrd(char uplo, magma_int_t n, float *a, magma_int_t lda, float *d, float *e, float *tau, float *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver April 2012 Purpose ======= SSYTRD reduces a real symmetric matrix A to real symmetric tridiagonal form T by an orthogonal similarity transformation: Q**T * A * Q = T. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) On entry, the symmetric matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. On exit, if UPLO = 'U', the diagonal and first superdiagonal of A are overwritten by the corresponding elements of the tridiagonal matrix T, and the elements above the first superdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors; if UPLO = 'L', the diagonal and first subdiagonal of A are over- written by the corresponding elements of the tridiagonal matrix T, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). D (output) REAL array, dimension (N) The diagonal elements of the tridiagonal matrix T: D(i) = A(i,i). E (output) REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix T: E(i) = A(i,i+1) if UPLO = 'U', E(i) = A(i+1,i) if UPLO = 'L'. TAU (output) REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= 1. For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== If UPLO = 'U', the matrix Q is represented as a product of elementary reflectors Q = H(n-1) . . . H(2) H(1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(i+1:n) = 0 and v(i) = 1; v(1:i-1) is stored on exit in A(1:i-1,i+1), and tau in TAU(i). If UPLO = 'L', the matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(n-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0 and v(i+1) = 1; v(i+2:n) is stored on exit in A(i+2:n,i), and tau in TAU(i). The contents of A on exit are illustrated by the following examples with n = 5: if UPLO = 'U': if UPLO = 'L': ( d e v2 v3 v4 ) ( d ) ( d e v3 v4 ) ( e d ) ( d e v4 ) ( v1 e d ) ( d e ) ( v1 v2 e d ) ( d ) ( v1 v2 v3 e d ) where d and e denote diagonal and off-diagonal elements of T, and vi denotes an element of the vector defining H(i). ===================================================================== */ char uplo_[2] = {uplo, 0}; magma_int_t ldda = lda; magma_int_t nb = magma_get_ssytrd_nb(n); float c_neg_one = MAGMA_S_NEG_ONE; float c_one = MAGMA_S_ONE; float d_one = MAGMA_D_ONE; magma_int_t kk, nx; magma_int_t i, j, i_n; magma_int_t iinfo; magma_int_t ldwork, lddwork, lwkopt; magma_int_t lquery; *info = 0; int upper = lapackf77_lsame(uplo_, "U"); lquery = lwork == -1; if (! upper && ! lapackf77_lsame(uplo_, "L")) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,n)) { *info = -4; } else if (lwork < nb*n && ! lquery) { *info = -9; } if (*info == 0) { /* Determine the block size. */ ldwork = lddwork = n; lwkopt = n * nb; // ACD // MAGMA_S_SET2REAL( work[0], lwkopt ); MAGMA_S_SET2REAL( work[0], (float) lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ if (n == 0) { work[0] = c_one; return *info; } magmaFloat_ptr da; size_t da_offset = 0; if (MAGMA_SUCCESS != magma_malloc( &da, (n*ldda + 2*n*nb )*sizeof(float))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmaFloat_ptr dwork = da; size_t dwork_offset = da_offset + (n)*ldda; if (n < 2048) nx = n; else nx = 512; if (upper) { /* Copy the matrix to the GPU */ magma_ssetmatrix( n, n, A(0, 0), 0, lda, dA(0, 0), ldda, queue ); /* Reduce the upper triangle of A. Columns 1:kk are handled by the unblocked method. */ kk = n - (n - nx + nb - 1) / nb * nb; for (i = n - nb; i >= kk; i -= nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i!=n-nb) magma_sgetmatrix( i+nb, nb, dA(0, i), ldda, A(0, i), 0, lda, queue ); magma_slatrd(uplo, i+nb, nb, A(0, 0), lda, e, tau, work, ldwork, dA(0, 0), ldda, dwork, dwork_offset, lddwork, queue); /* Update the unreduced submatrix A(0:i-2,0:i-2), using an update of the form: A := A - V*W' - W*V' */ magma_ssetmatrix( i + nb, nb, work, 0, ldwork, dwork, dwork_offset, lddwork, queue ); magma_ssyr2k(magma_uplo_const(uplo), MagmaNoTrans, i, nb, c_neg_one, dA(0, i), ldda, dwork, dwork_offset, lddwork, d_one, dA(0, 0), ldda, queue); /* Copy superdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { MAGMA_S_SET2REAL( *A(j-1, j), e[j - 1] ); d[j] = MAGMA_S_REAL( *A(j, j) ); } } magma_sgetmatrix( kk, kk, dA(0, 0), ldda, A(0, 0), 0, lda, queue ); /* Use unblocked code to reduce the last or only block */ lapackf77_ssytd2(uplo_, &kk, A(0, 0), &lda, d, e, tau, &iinfo); } else { /* Copy the matrix to the GPU */ if (1<=n-nx) magma_ssetmatrix( n, n, A(0,0), 0, lda, dA(0,0), ldda, queue ); #ifdef FAST_SYMV // TODO this leaks memory from da, above magmaFloat_ptr dwork2; if (MAGMA_SUCCESS != magma_malloc( &dwork2, (n*n)*sizeof(float) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } size_t dwork2_offset = 0; #endif /* Reduce the lower triangle of A */ for (i = 0; i < n-nx; i += nb) { /* Reduce columns i:i+nb-1 to tridiagonal form and form the matrix W which is needed to update the unreduced part of the matrix */ /* Get the current panel (no need for the 1st iteration) */ if (i!=0) magma_sgetmatrix( n-i, nb, dA(i, i), ldda, A(i, i), 0, lda, queue ); #ifdef FAST_SYMV // unported magma_slatrd2(uplo, n-i, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA(i, i), ldda, dwork, lddwork, dwork2, n*n); #else magma_slatrd(uplo, n-i, nb, A(i, i), lda, &e[i], &tau[i], work, ldwork, dA(i, i), ldda, dwork, dwork_offset, lddwork, queue); #endif /* Update the unreduced submatrix A(i+ib:n,i+ib:n), using an update of the form: A := A - V*W' - W*V' */ magma_ssetmatrix( n-i, nb, work, 0, ldwork, dwork, dwork_offset, lddwork, queue ); magma_ssyr2k(MagmaLower, MagmaNoTrans, n-i-nb, nb, c_neg_one, dA(i+nb, i), ldda, dwork, (dwork_offset+nb), lddwork, d_one, dA(i+nb, i+nb), ldda, queue); /* Copy subdiagonal elements back into A, and diagonal elements into D */ for (j = i; j < i+nb; ++j) { MAGMA_S_SET2REAL( *A(j+1, j), e[j] ); d[j] = MAGMA_S_REAL( *A(j, j) ); } } #ifdef FAST_SYMV magma_free( dwork2 ); #endif /* Use unblocked code to reduce the last or only block */ if (1<=n-nx) magma_sgetmatrix( n-i, n-i, dA(i, i), ldda, A(i, i), 0, lda, queue ); i_n = n-i; lapackf77_ssytrd(uplo_, &i_n, A(i, i), &lda, &d[i], &e[i], &tau[i], work, &lwork, &iinfo); } magma_free( da ); // ACD // MAGMA_S_SET2REAL( work[0], lwkopt ); MAGMA_S_SET2REAL( work[0], (float) lwkopt ); return *info; } /* magma_ssytrd */
extern "C" magma_int_t magma_dcumiccsetup( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; #if CUDA_VERSION >= 7000 csric02Info_t info_M=NULL; void *pBuffer = NULL; #endif magma_d_matrix hA={Magma_CSR}, hACSR={Magma_CSR}, U={Magma_CSR}; CHECK( magma_dmtransfer( A, &hA, A.memory_location, Magma_CPU, queue )); U.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hACSR, hA.storage_type, Magma_CSR, queue )); // in case using fill-in if( precond->levels > 0 ){ magma_d_matrix hAL={Magma_CSR}, hAUt={Magma_CSR}; CHECK( magma_dsymbilu( &hACSR, precond->levels, &hAL, &hAUt, queue )); magma_dmfree(&hAL, queue); magma_dmfree(&hAUt, queue); } CHECK( magma_dmconvert( hACSR, &U, Magma_CSR, Magma_CSRL, queue )); magma_dmfree( &hACSR, queue ); CHECK( magma_dmtransfer(U, &(precond->M), Magma_CPU, Magma_DEV, queue )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &(precond->cuinfo) )); // use kernel to manually check for zeros n the diagonal CHECK( magma_ddiagcheck( precond->M, queue ) ); #if CUDA_VERSION >= 7000 // this version has the bug fixed where a zero on the diagonal causes a crash CHECK_CUSPARSE( cusparseCreateCsric02Info(&info_M) ); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); int buffersize; int structural_zero; int numerical_zero; CHECK_CUSPARSE( cusparseDcsric02_bufferSize( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, &buffersize ) ); CHECK( magma_malloc((void**)&pBuffer, buffersize) ); CHECK_CUSPARSE( cusparseDcsric02_analysis( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer )); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &numerical_zero ) ); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &structural_zero ) ); CHECK_CUSPARSE( cusparseDcsric02( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer) ); #else // this version contains the bug but is needed for backward compability CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrA, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrA, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); CHECK_CUSPARSE( cusparseDcsric0( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); #endif CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // copy the matrix to precond->L and (transposed) to precond->U CHECK( magma_dmtransfer(precond->M, &(precond->L), Magma_DEV, Magma_DEV, queue )); CHECK( magma_dmtranspose( precond->L, &(precond->U), queue )); // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); } /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; magma_d_matrix hD, hR, hAt CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: #if CUDA_VERSION >= 7000 magma_free( pBuffer ); cusparseDestroyCsric02Info( info_M ); #endif cusparseDestroySolveAnalysisInfo( precond->cuinfo ); cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroyMatDescr( descrA ); cusparseDestroy( cusparseHandle ); magma_dmfree(&U, queue ); magma_dmfree(&hA, queue ); return info; }
/** \n This is an internal routine. ********************************************************************/ extern "C" magma_int_t magma_dpotrf_rectile_batched( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t min_recpnb, double** dA_array, magma_int_t ldda, double** dX_array, magma_int_t dX_length, double** dinvA_array, magma_int_t dinvA_length, double** dW0_displ, double** dW1_displ, double** dW2_displ, double** dW3_displ, double** dW4_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { //magma_int_t DEBUG=0; // Quick return if possible if (m == 0 || n == 0) { return 1; } if (uplo == MagmaUpper) { printf("Upper side is unavailable \n"); return -100; } if (m < n) { printf("error m < n %d < %d \n", (int) m, (int) n); return -101; } double **dA_displ = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); double alpha = MAGMA_D_NEG_ONE; double beta = MAGMA_D_ONE; magma_int_t panel_nb = n; if (panel_nb <= min_recpnb) { // if (DEBUG == 1) printf("calling bottom panel recursive with n=%d\n",(int) panel_nb); // panel factorization magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue); magma_dpotrf_panel_batched( uplo, m, panel_nb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); } else { // split A over two [A11 A12; A21 A22; A31 A32] // panel on tile A11, // trsm on A21, using A11 // update on A22 then panel on A22. // finally a trsm on [A31 A32] using the whole [A11 A12; A21 A22] magma_int_t n1 = n/2; magma_int_t n2 = n-n1; magma_int_t p1 = 0; magma_int_t p2 = n1; // panel on A11 //if (DEBUG == 1) printf("calling recursive panel on A11=A(%d,%d) with n=%d min_recpnb %d\n",(int) p1, (int) p1, (int) n1, (int) min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue); magma_dpotrf_rectile_batched( uplo, n1, n1, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); // TRSM on A21 //if (DEBUG == 1) printf("calling trsm on A21=A(%d,%d) using A11 == A(%d,%d) with m=%d k=%d \n",p2,p1,p1,p1,n2,n1); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, p2, p1, batchCount, queue); magmablas_dtrsm_work_batched( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, n2, n1, MAGMA_D_ONE, dA_displ, ldda, dW0_displ, ldda, dX_array, n2, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue ); // update A22 //if (DEBUG == 1) printf("calling update A22=A(%d,%d) using A21 == A(%d,%d) with m=%d n=%d k=%d\n",p2,p2,p2,p1,n2,n2,n1); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p1, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, p2, p2, batchCount, queue); // NEED TO BE REPLACED BY HERK magma_dgemm_batched( MagmaNoTrans, MagmaConjTrans, n2, n2, n1, alpha, dA_displ, ldda, dA_displ, ldda, beta, dW0_displ, ldda, batchCount, queue ); // panel on A22 //if (DEBUG == 1) printf("calling recursive panel on A22=A(%d,%d) with n=%d min_recpnb %d\n",p2,p2,n2,min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount, queue); magma_dpotrf_rectile_batched( uplo, n2, n2, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); } if (m > n) { // TRSM on A3: //if (DEBUG == 1) printf("calling trsm AT THE END on A3=A(%d,%d): using A1222 == A(%d,%d) with m=%d k=%d \n",n,0,0,0,m-n,n); magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, n, 0, batchCount, queue); magmablas_dtrsm_work_batched( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, m-n, n, MAGMA_D_ONE, dA_displ, ldda, dW0_displ, ldda, dX_array, m-n, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount, queue ); } magma_free(dA_displ); return 0; }
extern "C" magma_int_t magma_cunmqr(magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex *c, magma_int_t ldc, magmaFloatComplex *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- MAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= CUNMQR overwrites the general complex M-by-N matrix C with SIDE = 'L' SIDE = 'R' TRANS = 'N': Q * C C * Q TRANS = 'T': Q**H * C C * Q**H where Q is a complex orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Q is of order M if SIDE = 'L' and of order N if SIDE = 'R'. Arguments ========= SIDE (input) CHARACTER*1 = 'L': apply Q or Q**H from the Left; = 'R': apply Q or Q**H from the Right. TRANS (input) CHARACTER*1 = 'N': No transpose, apply Q; = 'T': Transpose, apply Q**H. M (input) INTEGER The number of rows of the matrix C. M >= 0. N (input) INTEGER The number of columns of the matrix C. N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = 'L', M >= K >= 0; if SIDE = 'R', N >= K >= 0. A (input) COMPLEX array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF in the first k columns of its array argument A. A is modified by the routine but restored on exit. LDA (input) INTEGER The leading dimension of the array A. If SIDE = 'L', LDA >= max(1,M); if SIDE = 'R', LDA >= max(1,N). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF. C (input/output) COMPLEX array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**H * C or C * Q**H or C*Q. LDC (input) INTEGER The leading dimension of the array C. LDC >= max(1,M). WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(0) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. If SIDE = 'L', LWORK >= max(1,N); if SIDE = 'R', LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = 'L', and LWORK >= M*NB if SIDE = 'R', where NB is the optimal blocksize. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ magmaFloatComplex c_one = MAGMA_C_ONE; magma_side_t side_ = side; magma_trans_t trans_ = trans; /* Allocate work space on the GPU */ magmaFloatComplex_ptr dwork, dc; magma_malloc( &dc, (m)*(n)*sizeof(magmaFloatComplex) ); magma_malloc( &dwork, (m + n + 64)*64*sizeof(magmaFloatComplex) ); /* Copy matrix C from the CPU to the GPU */ magma_csetmatrix( m, n, c, 0, ldc, dc, 0, m, queue ); //dc -= (1 + m); size_t dc_offset = -(1+m); magma_int_t a_offset, c_offset, i__4, lddwork; magma_int_t i__; magmaFloatComplex t[2*4160] /* was [65][64] */; magma_int_t i1, i2, i3, ib, ic, jc, nb, mi, ni, nq, nw; int left, notran, lquery; magma_int_t iinfo, lwkopt; a_offset = 1 + lda; a -= a_offset; --tau; c_offset = 1 + ldc; c -= c_offset; *info = 0; left = lapackf77_lsame(lapack_const(side_), "L"); notran = lapackf77_lsame(lapack_const(trans_), "N"); lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && ! lapackf77_lsame(lapack_const(side_), "R")) { *info = -1; } else if (! notran && ! lapackf77_lsame(lapack_const(trans_), "T")) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } if (*info == 0) { /* Determine the block size. NB may be at most NBMAX, where NBMAX is used to define the local array T. */ nb = 64; lwkopt = max(1,nw) * nb; MAGMA_C_SET2REAL( work[0], lwkopt ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_cunmqr(lapack_const(side_), lapack_const(trans_), &m, &n, &k, &a[a_offset], &lda, &tau[1], &c[c_offset], &ldc, work, &lwork, &iinfo); } else { /* Use hybrid CPU-GPU code */ if ( ( left && (! notran) ) || ( (! left) && notran ) ) { i1 = 1; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb + 1; i2 = 1; i3 = -nb; } if (left) { ni = n; jc = 1; } else { mi = m; ic = 1; } for (i__ = i1; i3 < 0 ? i__ >= i2 : i__ <= i2; i__ += i3) { ib = min(nb, k - i__ + 1); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ i__4 = nq - i__ + 1; lapackf77_clarft("F", "C", &i__4, &ib, &a[i__ + i__ * lda], &lda, &tau[i__], t, &ib); /* 1) Put 0s in the upper triangular part of A; 2) copy the panel from A to the GPU, and 3) restore A */ cpanel_to_q(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); magma_csetmatrix( i__4, ib, &a[i__ + i__ * lda], 0, lda, dwork, 0, i__4, queue ); cq_to_panel(MagmaUpper, ib, &a[i__ + i__ * lda], lda, t+ib*ib); if (left) { /* H or H' is applied to C(i:m,1:n) */ mi = m - i__ + 1; ic = i__; } else { /* H or H' is applied to C(1:m,i:n) */ ni = n - i__ + 1; jc = i__; } if (left) lddwork = ni; else lddwork = mi; /* Apply H or H'; First copy T to the GPU */ magma_csetmatrix( ib, ib, t, 0, ib, dwork, i__4*ib, ib, queue ); magma_clarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dwork, 0, i__4, dwork, i__4*ib, ib, dc, dc_offset+(ic + jc * m), m, dwork, (i__4*ib + ib*ib), lddwork, queue); } magma_cgetmatrix( m, n, dc, dc_offset+(1+m), m, &c[c_offset], 0, ldc, queue ); } MAGMA_C_SET2REAL( work[0], lwkopt ); //dc += (1 + m); magma_free( dc ); magma_free( dwork ); return *info; } /* magma_cunmqr */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zposv_batched */ int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, cpu_perf, cpu_time, gpu_perf, gpu_time; double err = 0.0, Rnorm, Anorm, Xnorm, *work; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *h_A, *h_B, *h_X; magmaDoubleComplex_ptr d_A, d_B; magma_int_t *dinfo_array; magma_int_t N, nrhs, lda, ldb, ldda, lddb, info, sizeA, sizeB; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount = 1; magmaDoubleComplex **dA_array = NULL; magmaDoubleComplex **dB_array = NULL; magma_queue_t queue = magma_stream; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); nrhs = opts.nrhs; batchCount = opts.batchcount ; printf("uplo = %s\n", lapack_uplo_const(opts.uplo) ); printf("BatchCount N NRHS CPU GFlop/s (sec) GPU GFlop/s (sec) ||B - AX|| / N*||A||*||X||\n"); printf("================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldb = lda; ldda = ((N+31)/32)*32; lddb = ldda; gflops = ( FLOPS_ZPOTRF( N) + FLOPS_ZPOTRS( N, nrhs ) ) / 1e9 * batchCount; sizeA = lda*N*batchCount; sizeB = ldb*nrhs*batchCount; TESTING_MALLOC_CPU( h_A, magmaDoubleComplex, sizeA ); TESTING_MALLOC_CPU( h_B, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( h_X, magmaDoubleComplex, sizeB ); TESTING_MALLOC_CPU( work, double, N); TESTING_MALLOC_DEV( d_A, magmaDoubleComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_B, magmaDoubleComplex, lddb*nrhs*batchCount ); TESTING_MALLOC_DEV( dinfo_array, magma_int_t, batchCount ); magma_malloc((void**)&dA_array, batchCount * sizeof(*dA_array)); magma_malloc((void**)&dB_array, batchCount * sizeof(*dB_array)); /* Initialize the matrices */ lapackf77_zlarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_zlarnv( &ione, ISEED, &sizeB, h_B ); for(int i=0; i<batchCount; i++) { magma_zmake_hpd( N, h_A + i * lda * N, lda );// need modification } magma_zsetmatrix( N, N*batchCount, h_A, lda, d_A, ldda ); magma_zsetmatrix( N, nrhs*batchCount, h_B, ldb, d_B, lddb ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ zset_pointer(dA_array, d_A, ldda, 0, 0, ldda*N, batchCount, queue); zset_pointer(dB_array, d_B, lddb, 0, 0, lddb*nrhs, batchCount, queue); gpu_time = magma_wtime(); info = magma_zposv_batched(opts.uplo, N, nrhs, dA_array, ldda, dB_array, lddb, dinfo_array, batchCount, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; // check correctness of results throught "dinfo_magma" and correctness of argument throught "info" magma_int_t *cpu_info = (magma_int_t*) malloc(batchCount*sizeof(magma_int_t)); magma_getvector( batchCount, sizeof(magma_int_t), dinfo_array, 1, cpu_info, 1); for(int i=0; i<batchCount; i++) { if(cpu_info[i] != 0 ){ printf("magma_zposv_batched matrix %d returned internal error %d\n",i, (int)cpu_info[i] ); } } if (info != 0) printf("magma_zposv_batched returned argument error %d: %s.\n", (int) info, magma_strerror( info )); //===================================================================== // Residual //===================================================================== magma_zgetmatrix( N, nrhs*batchCount, d_B, lddb, h_X, ldb ); for(magma_int_t s=0; s<batchCount; s++) { Anorm = lapackf77_zlange("I", &N, &N, h_A + s * lda * N, &lda, work); Xnorm = lapackf77_zlange("I", &N, &nrhs, h_X + s * ldb * nrhs, &ldb, work); blasf77_zgemm( MagmaNoTransStr, MagmaNoTransStr, &N, &nrhs, &N, &c_one, h_A + s * lda * N, &lda, h_X + s * ldb * nrhs, &ldb, &c_neg_one, h_B + s * ldb * nrhs, &ldb); Rnorm = lapackf77_zlange("I", &N, &nrhs, h_B + s * ldb * nrhs, &ldb, work); double error = Rnorm/(N*Anorm*Xnorm); if ( isnan(error) || isinf(error) ) { err = error; break; } err = max(err, error); } status += ! (err < tol); /* ==================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(magma_int_t s=0; s<batchCount; s++) { lapackf77_zposv( lapack_uplo_const(opts.uplo), &N, &nrhs, h_A + s * lda * N, &lda, h_B + s * ldb * nrhs, &ldb, &info ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_zposv returned err %d: %s.\n", (int) info, magma_strerror( info )); printf( "%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, cpu_perf, cpu_time, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } else { printf( "%10d %5d %5d --- ( --- ) %7.2f (%7.2f) %8.2e %s\n", (int)batchCount, (int) N, (int) nrhs, gpu_perf, gpu_time, err, (err < tol ? "ok" : "failed")); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( work ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_DEV( dinfo_array ); magma_free(dA_array); magma_free(dB_array); free(cpu_info); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_sgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, float *a, magma_int_t lda, float *WR, float *WI, float *vl, magma_int_t ldvl, float *vr, magma_int_t ldvr, float *work, magma_int_t lwork, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= SGEEV computes for an N-by-N real nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**T * A = lambda(j) * u(j)**T where u(j)**T denotes the transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments ========= JOBVL (input) CHARACTER*1 = 'N': left eigenvectors of A are not computed; = 'V': left eigenvectors of are computed. JOBVR (input) CHARACTER*1 = 'N': right eigenvectors of A are not computed; = 'V': right eigenvectors of A are computed. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) DOUBLE PRECISION array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). WR (output) DOUBLE PRECISION array, dimension (N) WI (output) DOUBLE PRECISION array, dimension (N) WR and WI contain the real and imaginary parts, respectively, of the computed eigenvalues. Complex conjugate pairs of eigenvalues appear consecutively with the eigenvalue having the positive imaginary part first. VL (output) DOUBLE PRECISION array, dimension (LDVL,N) If JOBVL = 'V', the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = 'N', VL is not referenced. u(j) = VL(:,j), the j-th column of VL. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = 'V', LDVL >= N. VR (output) DOUBLE PRECISION array, dimension (LDVR,N) If JOBVR = 'V', the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = 'N', VR is not referenced. v(j) = VR(:,j), the j-th column of VR. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = 'V', LDVR >= N. WORK (workspace/output) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. ===================================================================== */ magma_int_t c__1 = 1; magma_int_t c__0 = 0; magma_int_t c_n1 = -1; magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; float d__1, d__2; magma_int_t i__, k, ihi, ilo; float r__, cs, sn, scl; float dum[1], eps; magma_int_t ibal; float anrm; magma_int_t ierr, itau, iwrk, nout; magma_int_t scalea; float cscale; float bignum; magma_int_t minwrk; magma_int_t wantvl; float smlnum; magma_int_t lquery, wantvr, select[1]; magma_int_t nb = 0; magmaFloat_ptr dT; //magma_timestr_t start, end; char side[2] = {0, 0}; magma_vec_t jobvl_ = jobvl; magma_vec_t jobvr_ = jobvr; *info = 0; lquery = lwork == -1; wantvl = lapackf77_lsame(lapack_const(jobvl_), "V"); wantvr = lapackf77_lsame(lapack_const(jobvr_), "V"); if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) { *info = -1; } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -9; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -11; } /* Compute workspace */ if (*info == 0) { nb = magma_get_sgehrd_nb(n); minwrk = (2+nb)*n; work[0] = (float) minwrk; if (lwork < minwrk && ! lquery) { *info = -13; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } // if eigenvectors are needed #if defined(VERSION3) if (MAGMA_SUCCESS != magma_malloc( &dT, nb*n*sizeof(float) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif // subtract row and col for 1-based indexing a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; vl_dim1 = ldvl; vl_offset = 1 + vl_dim1; vl -= vl_offset; vr_dim1 = ldvr; vr_offset = 1 + vr_dim1; vr -= vr_offset; --work; /* Get machine constants */ eps = lapackf77_slamch("P"); smlnum = lapackf77_slamch("S"); bignum = 1. / smlnum; lapackf77_slabad(&smlnum, &bignum); smlnum = magma_ssqrt(smlnum) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_slange("M", &n, &n, &a[a_offset], &lda, dum); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_slascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, &ierr); } /* Balance the matrix (Workspace: need N) */ ibal = 1; lapackf77_sgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &work[ibal], &ierr); /* Reduce to upper Hessenberg form (Workspace: need 3*N, prefer 2*N+N*NB) */ itau = ibal + n; iwrk = itau + n; i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) /* * Version 1 - LAPACK */ lapackf77_sgehrd(&n, &ilo, &ihi, &a[a_offset], &lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION2) /* * Version 2 - LAPACK consistent HRD */ magma_sgehrd2(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, */ magma_sgehrd(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for sgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.); if (wantvl) { /* Want left eigenvectors Copy Householder vectors to VL */ side[0] = 'L'; lapackf77_slacpy(MagmaLowerStr, &n, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl); /* * Generate orthogonal matrix in VL * (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_sorghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_sorghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* * Perform QR iteration, accumulating Schur vectors in VL * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info); if (wantvr) { /* Want left and right eigenvectors Copy Schur vectors to VR */ side[0] = 'B'; lapackf77_slacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr); } } else if (wantvr) { /* Want right eigenvectors Copy Householder vectors to VR */ side[0] = 'R'; lapackf77_slacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr); /* * Generate orthogonal matrix in VR * (Workspace: need 3*N-1, prefer 2*N+(N-1)*NB) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_sorghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_sorghr(n, ilo, ihi, &vr[vr_offset], ldvr, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for sorghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* * Perform QR iteration, accumulating Schur vectors in VR * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_shseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } else { /* * Compute eigenvalues only * (Workspace: need N+1, prefer N+HSWORK (see comments) ) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_shseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, WR, WI, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } /* If INFO > 0 from SHSEQR, then quit */ if (*info > 0) { fprintf(stderr, "SHSEQR returned with info = %d\n", (int) *info); goto L50; } if (wantvl || wantvr) { /* * Compute left and/or right eigenvectors * (Workspace: need 4*N) */ lapackf77_strevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &ierr); } if (wantvl) { /* * Undo balancing of left eigenvectors * (Workspace: need N) */ lapackf77_sgebak("B", "L", &n, &ilo, &ihi, &work[ibal], &n, &vl[vl_offset], &ldvl, &ierr); /* Normalize left eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { if ( WI[i__-1] == 0.) { scl = cblas_snrm2(n, &vl[i__ * vl_dim1 + 1], 1); scl = 1. / scl; cblas_sscal(n, (scl), &vl[i__ * vl_dim1 + 1], 1); } else if (WI[i__-1] > 0.) { d__1 = cblas_snrm2(n, &vl[ i__ * vl_dim1 + 1], 1); d__2 = cblas_snrm2(n, &vl[(i__ + 1) * vl_dim1 + 1], 1); scl = lapackf77_slapy2(&d__1, &d__2); scl = 1. / scl; cblas_sscal(n, (scl), &vl[ i__ * vl_dim1 + 1], 1); cblas_sscal(n, (scl), &vl[(i__ + 1) * vl_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { /* Computing 2nd power */ d__1 = vl[k + i__ * vl_dim1]; /* Computing 2nd power */ d__2 = vl[k + (i__ + 1) * vl_dim1]; work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_isamax */ k = cblas_isamax(n, &work[iwrk], 1)+1; lapackf77_slartg(&vl[k + i__ * vl_dim1], &vl[k + (i__ + 1) * vl_dim1], &cs, &sn, &r__); cblas_srot(n, &vl[ i__ * vl_dim1 + 1], 1, &vl[(i__ + 1) * vl_dim1 + 1], 1, cs, (sn)); vl[k + (i__ + 1) * vl_dim1] = 0.; } } } if (wantvr) { /* * Undo balancing of right eigenvectors * (Workspace: need N) */ lapackf77_sgebak("B", "R", &n, &ilo, &ihi, &work[ibal], &n, &vr[vr_offset], &ldvr, &ierr); /* Normalize right eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { if (WI[i__-1] == 0.) { scl = 1. / cblas_snrm2(n, &vr[i__ * vr_dim1 + 1], 1); cblas_sscal(n, (scl), &vr[i__ * vr_dim1 + 1], 1); } else if (WI[i__-1] > 0.) { d__1 = cblas_snrm2(n, &vr[ i__ * vr_dim1 + 1], 1); d__2 = cblas_snrm2(n, &vr[(i__ + 1) * vr_dim1 + 1], 1); scl = lapackf77_slapy2(&d__1, &d__2); scl = 1. / scl; cblas_sscal(n, (scl), &vr[ i__ * vr_dim1 + 1], 1); cblas_sscal(n, (scl), &vr[(i__ + 1) * vr_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { /* Computing 2nd power */ d__1 = vr[k + i__ * vr_dim1]; /* Computing 2nd power */ d__2 = vr[k + (i__ + 1) * vr_dim1]; work[iwrk + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_isamax */ k = cblas_isamax(n, &work[iwrk], 1)+1; lapackf77_slartg(&vr[k + i__ * vr_dim1], &vr[k + (i__ + 1) * vr_dim1], &cs, &sn, &r__); cblas_srot(n, &vr[ i__ * vr_dim1 + 1], 1, &vr[(i__ + 1) * vr_dim1 + 1], 1, cs, (sn)); vr[k + (i__ + 1) * vr_dim1] = 0.; } } } /* Undo scaling if necessary */ L50: if (scalea) { i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WR + (*info), &i__2, &ierr); i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WI + (*info), &i__2, &ierr); if (*info > 0) { i__1 = ilo - 1; lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WR, &n, &ierr); i__1 = ilo - 1; lapackf77_slascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, WI, &n, &ierr); } } #if defined(VERSION3) magma_free( dT ); #endif return *info; } /* magma_sgeev */
extern "C" magma_int_t magma_zlobpcg( magma_z_matrix A, magma_z_solver_par *solver_par, magma_z_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = 0; #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define SWAP(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_z_bspmv_tuned(m, n, alpha, A, X, beta, AX, queue) { \ magma_z_matrix x={Magma_CSR}, ax={Magma_CSR}; \ x.memory_location = Magma_DEV; x.num_rows = m; x.num_cols = n; x.major = MagmaColMajor; x.nnz = m*n; x.dval = X; x.storage_type = Magma_DENSE; \ ax.memory_location= Magma_DEV; ax.num_rows = m; ax.num_cols = n; ax.major = MagmaColMajor; ax.nnz = m*n; ax.dval = AX; ax.storage_type = Magma_DENSE; \ CHECK( magma_z_spmv(alpha, A, x, beta, ax, queue )); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n = (solver_par->num_eigenvalues); magmaDoubleComplex *blockX = solver_par->eigenvectors; double *evalues = solver_par->eigenvalues; solver_par->numiter = 0; solver_par->spmv_count = 0; magmaDoubleComplex *dwork=NULL, *hwork=NULL; magmaDoubleComplex *blockP=NULL, *blockAP=NULL, *blockR=NULL, *blockAR=NULL, *blockAX=NULL, *blockW=NULL; magmaDoubleComplex *gramA=NULL, *gramB=NULL, *gramM=NULL; magmaDoubleComplex *gevectors=NULL, *h_gramB=NULL; dwork = NULL; hwork = NULL; blockP = NULL; blockR = NULL; blockAP = NULL; blockAR = NULL; blockAX = NULL; blockW = NULL; gramA = NULL; gramB = NULL; gramM = NULL; gevectors = NULL; h_gramB = NULL; magmaDoubleComplex *pointer, *origX = blockX; double *eval_gpu=NULL; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2, tempop1, tempop2; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_int_t *iwork={0}, liwork = 15*n+9; magma_int_t gramDim, ldgram = 3*n, ikind = 3; magmaDoubleComplex *hW={0}; // === Set solver parameters === double residualTolerance = solver_par->rtol; magma_int_t maxIterations = solver_par->maxiter; double tmp; double r0=0; // set in 1st iteration // === Set some constants & defaults === magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double *residualNorms={0}, *condestGhistory={0}, condestG={0}; double *gevalues={0}; magma_int_t *activeMask={0}; double *hresidualNorms={0}; #ifdef COMPLEX double *rwork={0}; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); CHECK( magma_dmalloc_cpu(&rwork, lrwork)); #endif CHECK( magma_zmalloc_pinned( &hwork , lwork )); CHECK( magma_zmalloc( &blockAX , m*n )); CHECK( magma_zmalloc( &blockAR , m*n )); CHECK( magma_zmalloc( &blockAP , m*n )); CHECK( magma_zmalloc( &blockR , m*n )); CHECK( magma_zmalloc( &blockP , m*n )); CHECK( magma_zmalloc( &blockW , m*n )); CHECK( magma_zmalloc( &dwork , m*n )); CHECK( magma_dmalloc( &eval_gpu , 3*n )); //**********************************************************+ // === Check some parameters for possible quick exit === solver_par->info = MAGMA_SUCCESS; if (m < 2) info = MAGMA_DIVERGENCE; else if (n > m) info = MAGMA_SLOW_CONVERGENCE; if (solver_par->info != 0) { magma_xerbla( __func__, -(info) ); goto cleanup; } solver_par->info = info; // local info variable; // === Allocate GPU memory for the residual norms' history === CHECK( magma_dmalloc(&residualNorms, (maxIterations+1) * n)); CHECK( magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) )); // === Allocate CPU work space === CHECK( magma_dmalloc_cpu(&condestGhistory, maxIterations+1)); CHECK( magma_dmalloc_cpu(&gevalues, 3 * n)); CHECK( magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t))); CHECK( magma_zmalloc_pinned(&hW, n*n)); CHECK( magma_zmalloc_pinned(&gevectors, 9*n*n)); CHECK( magma_zmalloc_pinned(&h_gramB , 9*n*n)); // === Allocate GPU workspace === CHECK( magma_zmalloc(&gramM, n * n)); CHECK( magma_zmalloc(&gramA, 9 * n * n)); CHECK( magma_zmalloc(&gramB, 9 * n * n)); // === Set activemask to one === for(magma_int_t k =0; k<n; k++){ iwork[k]=1; } magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n , activeMask, n, queue); #if defined(PRECISION_s) ikind = 3; #endif // === Make the initial vectors orthonormal === magma_zgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, &info ); //magma_zorthomgs( m, n, blockX, queue ); magma_z_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX, queue ); solver_par->spmv_count++; // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n, queue ); magma_zheevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); // === Update X = X * evectors === magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m, queue ); SWAP(blockW, blockX); // === Update AX = AX * evectors === magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m, queue ); SWAP(blockW, blockAX); condestGhistory[1] = 7.82; tempo1 = magma_sync_wtime( queue ); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_zlacpy( MagmaFull, m, n, blockAX, m, blockR, m, queue ); /* for(magma_int_t i=0; i<n; i++) { magma_zaxpy( m, MAGMA_Z_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1, queue ); } */ magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n, queue ); CHECK( magma_zlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu, queue )); magmablas_dznrm2_cols( m, n, blockR, m, residualNorms(0, iterationNumber), queue ); // === remove the residuals corresponding to already converged evectors CHECK( magma_zcompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize, queue )); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residulas: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) //magmablas_zlacpy( MagmaFull, m, cBlockSize, blockR, m, blockW, m, queue ); //SWAP(blockW, blockR); // preconditioner magma_z_matrix bWv={Magma_CSR}, bRv={Magma_CSR}; bWv.memory_location = Magma_DEV; bWv.num_rows = m; bWv.num_cols = cBlockSize; bWv.major = MagmaColMajor; bWv.nnz = m*cBlockSize; bWv.dval = blockW; bRv.memory_location = Magma_DEV; bRv.num_rows = m; bRv.num_cols = cBlockSize; bRv.major = MagmaColMajor; bRv.nnz = m*cBlockSize; bRv.dval = blockR; tempop1 = magma_sync_wtime( queue ); CHECK( magma_z_applyprecond_left( MagmaNoTrans, A, bRv, &bWv, precond_par, queue )); CHECK( magma_z_applyprecond_right( MagmaNoTrans, A, bWv, &bRv, precond_par, queue )); tempop2 = magma_sync_wtime( queue ); precond_par->runtime += tempop2-tempop1; // === make the preconditioned residuals orthogonal to X if( precond_par->solver != Magma_NONE){ magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram, queue ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_neg_one, blockX, m, gramB(0,0), ldgram, c_one, blockR, m, queue ); } // === make the active preconditioned residuals orthonormal magma_zgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, &info ); #if defined(PRECISION_s) // re-orthogonalization SWAP(blockX, dwork); magma_zgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, &info ); #endif //magma_zorthomgs( m, cBlockSize, blockR, queue ); // === compute AR magma_z_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR, queue ); solver_par->spmv_count++; if (!restart) { // === compact P & AP as well CHECK( magma_zcompactActive(m, n, blockP, m, activeMask, queue )); CHECK( magma_zcompactActive(m, n, blockAP, m, activeMask, queue )); /* // === make P orthogonal to X ? magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram, queue ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_neg_one, blockX, m, gramB(0,0), ldgram, c_one, blockP, m, queue ); // === make P orthogonal to R ? magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram, queue ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_neg_one, blockR, m, gramB(0,0), ldgram, c_one, blockP, m, queue ); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_zgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, &info ); #if defined(PRECISION_s) // re-orthogonalization SWAP(blockX, dwork); magma_zgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, &info ); #endif //magma_zorthomgs( m, cBlockSize, blockP, queue ); //magma_z_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP, queue ); magma_zsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize, queue ); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m, queue ); #else magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m, queue ); #endif } iter = max( 1, iterationNumber - 10 - int(log(1.*cBlockSize)) ); double condestGmean = 0.; for(magma_int_t i = 0; i<iterationNumber-iter+1; i++){ condestGmean += condestGhistory[i]; } condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize; /* --- The Raileight-Ritz method for [X R P] ----------------------- [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e., GramA GramB / X'AX X'AR X'AP \ / X'X X'R X'P \ | R'AX R'AR R'AP | y = evalues | R'X R'R R'P | \ P'AX P'AR P'AP / \ P'X P'R P'P / ----------------------------------------------------------------- */ // === assemble GramB; first, set it to I magmablas_zlaset( MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram, queue ); // identity if (!restart) { magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram, queue ); magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram, queue ); } magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram, queue ); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_zgetmatrix( gramDim, gramDim, gramB, ldgram, h_gramB, ldgram, queue ); lapackf77_zheev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #ifdef COMPLEX rwork, #endif &info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", int(iterationNumber)); } // === assemble GramA; first, set it to I magmablas_zlaset( MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram, queue ); // identity magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram, queue ); magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram, queue ); if (!restart) { magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram, queue ); magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram, queue ); magma_zgemm( MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP, m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram, queue ); } /* // === Compute X' AX or just use the eigenvalues below ? magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram, queue ); */ if (restart==0) { magma_zgetmatrix( gramDim, gramDim, gramA, ldgram, gevectors, ldgram, queue ); } else { gramDim = n+cBlockSize; magma_zgetmatrix( gramDim, gramDim, gramA, ldgram, gevectors, ldgram, queue ); } for(magma_int_t k=0; k<n; k++) *gevectors(k,k) = MAGMA_Z_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_zgetmatrix( gramDim, gramDim, gramB, ldgram, h_gramB, ldgram, queue ); magma_int_t itype = 1; lapackf77_zhegvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #ifdef COMPLEX rwork, &lrwork, #endif iwork, &liwork, &info); for(magma_int_t k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_zsetmatrix( gramDim, gramDim, gevectors, ldgram, gramA, ldgram, queue ); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m, queue ); SWAP(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m, queue ); // === corresponding contribution from AP to the new AX (in AP) magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m, queue ); SWAP(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m, queue ); } else { // === contribution from R (only) to the new X magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m, queue ); // === corresponding contribution from AR (only) to the new AX magma_zgemm( MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m, queue ); } // === contribution from old X to the new X + the new search direction P magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m, queue ); SWAP(dwork, blockX); //magma_zaxpy( m*n, c_one, blockP, 1, blockX, 1, queue ); CHECK( magma_zlobpcg_maxpy( m, n, blockP, blockX, queue )); // === corresponding contribution from old AX to new AX + AP magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m, queue ); SWAP(dwork, blockAX); //magma_zaxpy( m*n, c_one, blockAP, 1, blockAX, 1, queue ); CHECK( magma_zlobpcg_maxpy( m, n, blockAP, blockAX, queue )); condestGhistory[iterationNumber+1]=condestG; magma_dgetmatrix( 1, 1, residualNorms(0, iterationNumber), 1, &tmp, 1, queue ); if ( iterationNumber == 1 ) { solver_par->init_res = tmp; r0 = tmp * solver_par->rtol; if ( r0 < ATOLERANCE ) r0 = ATOLERANCE; } solver_par->final_res = tmp; if ( tmp < r0 ) { break; } if (cBlockSize == 0) { break; } if ( solver_par->verbose!=0 ) { if ( iterationNumber%solver_par->verbose == 0 ) { // double res; // magma_zgetmatrix( 1, 1, // (magmaDoubleComplex*)residualNorms(0, iterationNumber), 1, // (magmaDoubleComplex*)&res, 1, queue ); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", int(iterationNumber), int(cBlockSize)); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if ( solver_par->numiter < solver_par->maxiter) { info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) info = MAGMA_SLOW_CONVERGENCE; else info = MAGMA_DIVERGENCE; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_z_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX, queue ); magma_zgemm( MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n, queue ); magma_zheevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #ifdef COMPLEX rwork, lrwork, #endif iwork, liwork, &info ); for(magma_int_t k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors SWAP(blockX, dwork); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m, queue ); // === update AX = AX * evectors to compute the final residual SWAP(blockAX, dwork); magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m, queue ); // === compute R = AX - evalues X magmablas_zlacpy( MagmaFull, m, n, blockAX, m, blockR, m, queue ); for(magma_int_t i=0; i<n; i++) magma_zaxpy( m, MAGMA_Z_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1, queue ); // === residualNorms[iterationNumber] = || R || magmablas_dznrm2_cols( m, n, blockR, m, residualNorms(0, iterationNumber), queue ); // === restore blockX if needed if (blockX != origX) magmablas_zlacpy( MagmaFull, m, n, blockX, m, origX, m, queue ); printf("Eigenvalues:\n"); for(magma_int_t i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Prmagma_int_t residual history in a file for plotting ==== CHECK( magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n)); magma_dgetmatrix( n, iterationNumber, residualNorms, n, hresidualNorms, n, queue ); solver_par->iter_res = *hresidualNorms(0, iterationNumber-1); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(magma_int_t i =1; i<iterationNumber; i++) { for(magma_int_t j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); cleanup: magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); if (blockX != (solver_par->eigenvectors)) magma_free( blockX ); if (blockAX != (solver_par->eigenvectors)) magma_free( blockAX ); if (blockAR != (solver_par->eigenvectors)) magma_free( blockAR ); if (blockAP != (solver_par->eigenvectors)) magma_free( blockAP ); if (blockR != (solver_par->eigenvectors)) magma_free( blockR ); if (blockP != (solver_par->eigenvectors)) magma_free( blockP ); if (blockW != (solver_par->eigenvectors)) magma_free( blockW ); if (dwork != (solver_par->eigenvectors)) magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #ifdef COMPLEX magma_free_cpu( rwork ); rwork = NULL; #endif return info; }
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_batched( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex **dA_array, magma_int_t ldda, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define A(i_, j_) (A + (i_) + (j_)*ldda) double d_alpha = -1.0; double d_beta = 1.0; cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if ( uplo != MagmaUpper && uplo != MagmaLower) { arginfo = -1; } else if (n < 0) { arginfo = -2; } else if (ldda < max(1,n)) { arginfo = -4; } if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } // Quick return if possible if (n == 0) { return arginfo; } if( n > 2048 ){ printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t j, k, ib; magma_int_t nb = POTRF_NB; magma_int_t gemm_crossover = 127;//nb > 32 ? 127 : 160; #if defined(USE_CUOPT) cublasHandle_t myhandle; cublasCreate_v2(&myhandle); #else cublasHandle_t myhandle=NULL; #endif magmaDoubleComplex **dA_displ = NULL; magmaDoubleComplex **dW0_displ = NULL; magmaDoubleComplex **dW1_displ = NULL; magmaDoubleComplex **dW2_displ = NULL; magmaDoubleComplex **dW3_displ = NULL; magmaDoubleComplex **dW4_displ = NULL; magmaDoubleComplex **dinvA_array = NULL; magmaDoubleComplex **dwork_array = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t dwork_msize = n*nb; magmaDoubleComplex* dinvA = NULL; magmaDoubleComplex* dwork = NULL;// dinvA and dwork are workspace in ztrsm magmaDoubleComplex **cpuAarray = NULL; magma_zmalloc( &dinvA, invA_msize * batchCount); magma_zmalloc( &dwork, dwork_msize * batchCount ); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaDoubleComplex*)); /* check allocation */ if ( dA_displ == NULL || dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL || cpuAarray == NULL ) { magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_zlaset_q(MagmaFull, invA_msize, batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dinvA, invA_msize, queue); magmablas_zlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dwork, dwork_msize, queue); zset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue); zset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(k=0; k<nbstreams; k++){ magma_queue_create( &stream[k] ); } magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dA_array, 1, cpuAarray, 1); magmablasSetKernelStream(NULL); if (uplo == MagmaUpper) { printf("Upper side is unavailable \n"); goto fin; } else { for(j = 0; j < n; j+=nb) { ib = min(nb, n-j); #if 1 //=============================================== // panel factorization //=============================================== magma_zdisplace_pointers(dA_displ, dA_array, ldda, j, j, batchCount, queue); zset_pointer(dwork_array, dwork, 1, 0, 0, dwork_msize, batchCount, queue); zset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); #if 0 arginfo = magma_zpotrf_panel_batched( uplo, n-j, ib, dA_displ, ldda, dwork_array, dwork_msize, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, j, batchCount, myhandle); #else //arginfo = magma_zpotrf_rectile_batched( arginfo = magma_zpotrf_recpanel_batched( uplo, n-j, ib, 32, dA_displ, ldda, dwork_array, dwork_msize, dinvA_array, invA_msize, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, j, batchCount, myhandle, queue); #endif if(arginfo != 0 ) goto fin; //=============================================== // end of panel //=============================================== #endif #if 1 //real_Double_t gpu_time; //gpu_time = magma_sync_wtime(NULL); if( (n-j-ib) > 0){ if( (n-j-ib) > gemm_crossover) { //------------------------------------------- // USE STREAM HERK //------------------------------------------- // since it use different stream I need to wait the panel. // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_queue_sync(NULL); /* you must know the matrix layout inorder to do it */ for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); // call herk, class zherk must call cpu pointer magma_zherk(MagmaLower, MagmaNoTrans, n-j-ib, ib, d_alpha, (const magmaDoubleComplex*) cpuAarray[k] + j+ib+j*ldda, ldda, d_beta, cpuAarray[k] + j+ib+(j+ib)*ldda, ldda); } // need to synchronise to be sure that panel do not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); magmablasSetKernelStream(NULL); } else { //------------------------------------------- // USE BATCHED GEMM(which is a HERK in fact, since it only access the lower part) //------------------------------------------- magma_zdisplace_pointers(dA_displ, dA_array, ldda, j+ib, j, batchCount, queue); magma_zdisplace_pointers(dW1_displ, dA_array, ldda, j+ib, j+ib, batchCount, queue); magmablas_zherk_batched(uplo, MagmaNoTrans, n-j-ib, ib, d_alpha, dA_displ, ldda, d_beta, dW1_displ, ldda, batchCount, queue); } } //gpu_time = magma_sync_wtime(NULL) - gpu_time; //real_Double_t flops = (n-j-ib) * (n-j-ib) * ib / 1e9 * batchCount; //real_Double_t gpu_perf = flops / gpu_time; //printf("Rows= %d, Colum=%d, herk time = %7.2fms, Gflops= %7.2f\n", n-j-ib, ib, gpu_time*1000, gpu_perf); #endif } } fin: magma_queue_sync(NULL); for(k=0; k<nbstreams; k++){ magma_queue_destroy( stream[k] ); } magmablasSetKernelStream(cstream); #if defined(USE_CUOPT) cublasDestroy_v2(myhandle); #endif magma_free(dA_displ); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); free(cpuAarray); return arginfo; }
extern "C" magma_int_t magma_dgels_gpu( magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDouble_ptr dA, size_t dA_offset, magma_int_t ldda, magmaDouble_ptr dB, size_t dB_offset, magma_int_t lddb, double *hwork, magma_int_t lwork, magma_int_t *info, magma_queue_t queue ) { /* -- clMagma (version 0.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver April 2012 Purpose ======= Solves the overdetermined, least squares problem min || A*X - C || using the QR factorization A. The underdetermined problem (m < n) is not currently handled. Arguments ========= TRANS (input) CHARACTER*1 = 'N': the linear system involves A. Only trans='N' is currently handled. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, A is overwritten by details of its QR factorization as returned by DGEQRF. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. DB (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) DOUBLE_PRECISION array, dimension MAX(1,LWORK). On exit, if INFO = 0, HWORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array HWORK, LWORK >= max(1,NRHS). For optimum performance LWORK >= (M-N+NB)*(NRHS + 2*NB), where NB is the blocksize given by magma_get_dgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the HWORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) dA, (dA_offset+(a_1)+(a_2)*(ldda)) magmaDouble_ptr dT; double *tau; magma_int_t k; magma_int_t nb = magma_get_dgeqrf_nb(m); magma_int_t lwkopt = (m-n+nb)*(nrhs+2*nb); long int lquery = (lwork == -1); hwork[0] = MAGMA_D_MAKE( (double)lwkopt, 0. ); *info = 0; /* For now, N is the only case working */ if ( trans != MagmaNoTrans ) *info = -1; else if (m < 0) *info = -2; else if (n < 0 || m < n) /* LQ is not handle for now*/ *info = -3; else if (nrhs < 0) *info = -4; else if (ldda < max(1,m)) *info = -6; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = MAGMA_D_ONE; return *info; } /* * Allocate temporary buffers */ int ldtwork = ( 2*k + ((n+31)/32)*32 )*nb; if (nb < nrhs) ldtwork = ( 2*k + ((n+31)/32)*32 )*nrhs; if (MAGMA_SUCCESS != magma_malloc( &dT, ldtwork*sizeof(double) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } tau = (double*) malloc( k * sizeof(double) ); if( tau == NULL ) { magma_free( dT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } size_t dT_offset = 0; magma_dgeqrf_gpu( m, n, dA, dA_offset, ldda, tau, dT, dT_offset, info, queue ); if ( *info == 0 ) { magma_dgeqrs_gpu( m, n, nrhs, dA, dA_offset, ldda, tau, dT, dT_offset, dB, dB_offset, lddb, hwork, lwork, info, queue ); } magma_free( dT ); free(tau); return *info; }
int main( int argc, char **argv ) { printf("Starting\n"); int size; cudaError_t cudaStat; magma_err_t magmaStat; cublasStatus_t stat; cublasHandle_t handle; int it,i; cublasOperation_t N = 'N'; cublasOperation_t T = 'T'; char N2 = 'N'; char T2 = 'T'; double one = 1., zero=0.; char uplo = 'L'; int info; int err; double* A; double* B; magmaStat = magma_init(); int use_pinned; if(argc > 1) { use_pinned = atoi(argv[1]); } else use_pinned = 0; printf("Setting use_pinned to %d\n", use_pinned); for( size = 256; size <= 8192; size*=2 ) { if(use_pinned) { // allocate pinned memory on CPU err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); } else { // allocate standard memory on CPU A = (double*) malloc( sizeof(double)*size*size ); B = (double*) malloc( sizeof(double)*size*size ); } cudaDeviceSynchronize(); double tInit = read_timer(); double *dA,*dB; // allocate memory on GPU magma_malloc( (void**) &dA, sizeof(double)*size*size ); magma_malloc( (void**) &dB, sizeof(double)*size*size ); cudaDeviceSynchronize(); double tAlloc = read_timer(); fillMatrix(B, size*size); cudaDeviceSynchronize(); double tInit2 = read_timer(); // transfer data to GPU magma_dsetmatrix( size, size, B, size, dB, size ); cudaDeviceSynchronize(); double tTransferToGPU = read_timer(); // matrix multiply magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size ); // magma_dgemm is apparently synonymous with magmablas_dgemm cudaDeviceSynchronize(); double tMatMult = read_timer(); // Cholesky decomposition on GPU with GPU interface (called with object on GPU) magma_dpotrf_gpu( 'L', size, dA, size, &info ); cudaDeviceSynchronize(); double tChol = read_timer(); // transfer data back to CPU magma_dgetmatrix( size, size, dA, size, A, size ); cudaDeviceSynchronize(); double tTransferFromGPU = read_timer(); // standard BLAS matrix multiply on CPU dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tMatMultBlas = read_timer(); // Cholesky decomposition on GPU with CPU interface (called with object on CPU) magma_dpotrf( 'L', size, A, size, &info ); cudaDeviceSynchronize(); double tCholCpuInterface = read_timer(); // recreate A = B * B (could just do a save and copy instead....) dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); cudaDeviceSynchronize(); double tInit3 = read_timer(); // standard Lapack Cholesky decomposition on CPU dpotrf_(&uplo, &size, A, &size, &info); cudaDeviceSynchronize(); double tCholCpu= read_timer(); printf("====================================================\n"); printf("Timing results for n = %d\n", size); printf("GPU memory allocation time: %f\n", tAlloc - tInit); printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); if(use_pinned) { magma_free_pinned(A); magma_free_pinned(B); } else { free(A); free(B); } magma_free(dA); magma_free(dB); } return EXIT_SUCCESS; }
extern "C" magma_err_t magma_cungqr(magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex *a, magma_int_t lda, magmaFloatComplex *tau, magmaFloatComplex_ptr dT, size_t dT_offset, magma_int_t nb, magma_int_t *info, magma_queue_t queue ) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver April 2012 Purpose ======= CUNGQR generates an M-by-N COMPLEX matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by CGEQRF. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. A (input/output) COMPLEX array A, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by CGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDA (input) INTEGER The first dimension of the array A. LDA >= max(1,M). TAU (input) COMPLEX array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by CGEQRF_GPU. DT (input) COMPLEX array on the GPU device. DT contains the T matrices used in blocking the elementary reflectors H(i), e.g., this can be the 6th argument of magma_cgeqrf_gpu. NB (input) INTEGER This is the block size used in CGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define a_ref(i,j) ( a + (j)*lda + (i)) #define da_ref(i,j) da, (da_offset + (j)*ldda + (i)) #define t_ref(a_1) dT, (dT_offset + (a_1)*nb) magma_int_t i__1, i__2, i__3; magma_int_t lwork, ldda; static magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork = min(m, n); magmaFloatComplex *work; magmaFloatComplex_ptr da, dwork; size_t da_offset, dwork_offset; magma_event_t event = NULL; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (lda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) return *info; /* Allocate GPU work space */ ldda = ((m+31)/32)*32; lddwork = ((lddwork+31)/32)*32; if (MAGMA_SUCCESS != magma_malloc( &da, ((n)*ldda + nb*lddwork )*sizeof(magmaFloatComplex))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } da_offset = 0; dwork = da; dwork_offset = da_offset + (n)*ldda; /* Allocate CPU work space */ lwork = n * nb; work = (magmaFloatComplex *)malloc(lwork*sizeof(magmaFloatComplex)); if( work == NULL ) { magma_free( da ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } if ( (nb > 1) && (nb < k) ) { /* Use blocked code after the last block. The first kk columns are handled by the block method. */ ki = (k - nb - 1) / nb * nb; kk = min(k, ki + nb); /* Set A(1:kk,kk+1:n) to zero. */ magmablas_claset(MagmaFull, kk, n-kk, da_ref(0,kk), ldda, queue); } else kk = 0; /* Use unblocked code for the last or only block. */ if (kk < n) { i__1 = m - kk; i__2 = n - kk; i__3 = k - kk; lapackf77_cungqr(&i__1, &i__2, &i__3, a_ref(kk, kk), &lda, &tau[kk], work, &lwork, &iinfo); magma_csetmatrix(i__1, i__2, a_ref(kk, kk), 0, lda, da_ref(kk, kk), ldda, queue); } if (kk > 0) { /* Use blocked code */ for (i = ki; i >= 0; i-=nb) { ib = min(nb, k - i); /* Send the current panel to the GPU */ i__2 = m - i; cpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work); magma_csetmatrix(i__2, ib, a_ref(i, i), 0, lda, da_ref(i, i), ldda, queue); if (i + ib < n) { /* Apply H to A(i:m,i+ib:n) from the left */ i__3 = n - i - ib; magma_clarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, i__2, i__3, ib, da_ref(i, i ), ldda, t_ref(i), nb, da_ref(i, i+ib), ldda, dwork, dwork_offset, lddwork, queue); } /* Apply H to rows i:m of current block on the CPU */ lapackf77_cungqr(&i__2, &ib, &ib, a_ref(i, i), &lda, &tau[i], work, &lwork, &iinfo); magma_csetmatrix_async( i__2, ib, a_ref(i,i), 0, lda, da_ref(i,i), ldda, queue, &event ); /* Set rows 1:i-1 of current block to zero */ i__2 = i + ib; magmablas_claset(MagmaFull, i, i__2 - i, da_ref(0,i), ldda, queue); } } magma_cgetmatrix(m, n, da_ref(0, 0), ldda, a_ref(0, 0), 0, lda, queue); //cudaStreamDestroy(stream); magma_free( da ); free(work); return *info; } /* magma_cungqr */
/** Purpose ------- CPOTRS solves a system of linear equations A*X = B with a Hermitian positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by CPOTRF. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrs_batched( magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **dB_array, magma_int_t lddb, magma_int_t batchCount, magma_queue_t queue) { magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) info = -1; if ( n < 0 ) info = -2; if ( nrhs < 0) info = -3; if ( ldda < max(1, n) ) info = -5; if ( lddb < max(1, n) ) info = -7; if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return info; } magmaFloatComplex **dW1_displ = NULL; magmaFloatComplex **dW2_displ = NULL; magmaFloatComplex **dW3_displ = NULL; magmaFloatComplex **dW4_displ = NULL; magmaFloatComplex **dinvA_array = NULL; magmaFloatComplex **dwork_array = NULL; magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t dwork_msize = n*nrhs; magmaFloatComplex* dinvA = NULL; magmaFloatComplex* dwork = NULL;// dinvA and dwork are workspace in ctrsm magma_cmalloc( &dinvA, invA_msize * batchCount); magma_cmalloc( &dwork, dwork_msize * batchCount ); /* check allocation */ if ( dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL ) { magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_claset_q(MagmaFull, invA_msize, batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dinvA, invA_msize, queue); magmablas_claset_q(MagmaFull, dwork_msize, batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dwork, dwork_msize, queue); cset_pointer(dwork_array, dwork, n, 0, 0, dwork_msize, batchCount, queue); cset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); if ( uplo == MagmaUpper) { // A = U^T U // solve U^{T}X = B ==> dworkX = U^-T * B magmablas_ctrsm_outofplace_batched(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); // solve U X = dwork ==> X = U^-1 * dwork magmablas_ctrsm_outofplace_batched(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); } else{ // A = L L^T // solve LX= B ==> dwork = L^{-1} B magmablas_ctrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); // solve L^{T}X= dwork ==> X = L^{-T} dwork magmablas_ctrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, 1, n, nrhs, c_one, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); } magma_queue_sync(cstream); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); return info; }
extern "C" magma_int_t magma_zgeqrf_batched( magma_int_t m, magma_int_t n, magmaDoubleComplex **dA_array, magma_int_t ldda, magmaDoubleComplex **tau_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) // A(i, j) means at i row, j column magma_int_t min_mn = min(m, n); cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); /* Check arguments */ magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if(min_mn == 0 ) return arginfo; if( m > 2048 || n > 2048 ) { printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t nb = 32; magma_int_t nnb = 8; magma_int_t i, k, ib=nb, jb=nnb; magma_int_t ldw, ldt, ldr, offset; cublasHandle_t myhandle; cublasCreate_v2(&myhandle); magmaDoubleComplex **dW0_displ = NULL; magmaDoubleComplex **dW1_displ = NULL; magmaDoubleComplex **dW2_displ = NULL; magmaDoubleComplex **dW3_displ = NULL; magmaDoubleComplex **dW4_displ = NULL; magmaDoubleComplex **dW5_displ = NULL; magmaDoubleComplex *dwork = NULL; magmaDoubleComplex *dT = NULL; magmaDoubleComplex *dR = NULL; magmaDoubleComplex **dR_array = NULL; magmaDoubleComplex **dT_array = NULL; magmaDoubleComplex **cpuAarray = NULL; magmaDoubleComplex **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); // used in zlarfb magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array)); magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array)); ldt = ldr = min(nb, min_mn); magma_zmalloc(&dwork, (2 * nb * n) * batchCount); magma_zmalloc(&dR, ldr * n * batchCount); magma_zmalloc(&dT, ldt * ldt * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaDoubleComplex*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(magmaDoubleComplex*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_free(dwork); free(cpuAarray); free(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_zlaset_q(MagmaFull, ldr, n*batchCount , MAGMA_Z_ZERO, MAGMA_Z_ZERO, dR, ldr, queue); magmablas_zlaset_q(MagmaFull, ldt, ldt*batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dT, ldt, queue); zset_pointer(dR_array, dR, 1, 0, 0, ldr*min(nb, min_mn), batchCount, queue); zset_pointer(dT_array, dT, 1, 0, 0, ldt*min(nb, min_mn), batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(i=0; i<nbstreams; i++) { magma_queue_create( &stream[i] ); } magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dA_array, 1, cpuAarray, 1); magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dT_array, 1, cpuTarray, 1); magmablasSetKernelStream(NULL); for(i=0; i<min_mn; i+=nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_zdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_zdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue); //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_zgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_array, ldt, dR_array, ldr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, myhandle, queue); //=============================================== // end of panel //=============================================== //direct panel matrix V in dW0_displ, magma_zdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); // copy the upper part of V into dR zgeqrf_copy_upper_batched(ib, jb, dW0_displ, ldda, dR_array, ldr, batchCount, queue); //=============================================== // update trailing matrix //=============================================== //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; zset_pointer(dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; zset_pointer(dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); if( (n-ib-i) > 0) { // set the diagonal of v as one and the upper triangular part as zero magmablas_zlaset_batched(MagmaUpper, ib, ib, MAGMA_Z_ZERO, MAGMA_Z_ONE, dW0_displ, ldda, batchCount, queue); magma_zdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_zlarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_array, ldt, dW4_displ, nb*ldt, batchCount, myhandle, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- if( (m-i) > 100 && (n-i-ib) > 100) { // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_device_sync(); for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); // the stream gemm must take cpu pointer magma_zlarfb_gpu_gemm(MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, cpuAarray[k] + i + i * ldda, ldda, cpuTarray[k], ldt, cpuAarray[k] + i + (i+ib) * ldda, ldda, dwork + nb * n * k, -1, dwork + nb * n * batchCount + nb * n * k, -1); } // need to synchronise to be sure that panel does not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); magmablasSetKernelStream(NULL); } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { //direct trailing matrix in dW1_displ magma_zdisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_zlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, (const magmaDoubleComplex**)dW0_displ, ldda, (const magmaDoubleComplex**)dT_array, ldt, dW1_displ, ldda, dW4_displ, ldw, dW5_displ, ldw, batchCount, myhandle, queue); } }// update the trailing matrix //=============================================== // copy dR back to V after the trailing matrix update magmablas_zlacpy_batched(MagmaUpper, ib, ib, dR_array, ldr, dW0_displ, ldda, batchCount, queue); } for(k=0; k<nbstreams; k++) { magma_queue_destroy( stream[k] ); } magmablasSetKernelStream(cstream); cublasDestroy_v2(myhandle); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_free(dwork); free(cpuAarray); free(cpuTarray); return arginfo; }
extern "C" magma_int_t magma_cgeqrf_batched( magma_int_t m, magma_int_t n, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **dtau_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) // A(i, j) means at i row, j column /* Local Parameter */ magma_int_t nb = magma_get_cgeqrf_batched_nb(m); magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) return arginfo; magmaFloatComplex *dT = NULL; magmaFloatComplex *dR = NULL; magmaFloatComplex **dR_array = NULL; magmaFloatComplex **dT_array = NULL; magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array)); magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array)); magma_int_t lddt = min(nb, min_mn); magma_int_t lddr = min(nb, min_mn); magma_cmalloc(&dR, lddr * lddr * batchCount); magma_cmalloc(&dT, lddt * lddt * batchCount); /* check allocation */ if ( dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL ) { magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_cset_pointer( dR_array, dR, lddr, 0, 0, lddr*min(nb, min_mn), batchCount, queue ); magma_cset_pointer( dT_array, dT, lddt, 0, 0, lddt*min(nb, min_mn), batchCount, queue ); arginfo = magma_cgeqrf_expert_batched(m, n, dA_array, ldda, dR_array, lddr, dT_array, lddt, dtau_array, 0, info_array, batchCount, queue); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); return arginfo; }
extern "C" magma_int_t magma_slobpcg( magma_s_sparse_matrix A, magma_s_solver_par *solver_par ) { #define residualNorms(i,iter) ( residualNorms + (i) + (iter)*n ) #define magmablas_swap(x, y) { pointer = x; x = y; y = pointer; } #define hresidualNorms(i,iter) (hresidualNorms + (i) + (iter)*n ) #define gramA( m, n) (gramA + (m) + (n)*ldgram) #define gramB( m, n) (gramB + (m) + (n)*ldgram) #define gevectors(m, n) (gevectors + (m) + (n)*ldgram) #define h_gramB( m, n) (h_gramB + (m) + (n)*ldgram) #define magma_s_bspmv_tuned(m, n, alpha, A, X, beta, AX) { \ magmablas_stranspose( m, n, X, m, blockW, n ); \ magma_s_vector x, ax; \ x.memory_location = Magma_DEV; x.num_rows = m*n; x.nnz = m*n; x.val = blockW; \ ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX; \ magma_s_spmv(alpha, A, x, beta, ax ); \ magmablas_stranspose( n, m, blockW, n, X, m ); \ } //************************************************************** // Memory allocation for the eigenvectors, eigenvalues, and workspace solver_par->solver = Magma_LOBPCG; magma_int_t m = A.num_rows; magma_int_t n =(solver_par->num_eigenvalues); float *blockX = solver_par->eigenvectors; float *evalues = solver_par->eigenvalues; float *dwork, *hwork; float *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW; float *gramA, *gramB, *gramM; float *gevectors, *h_gramB; float *pointer, *origX = blockX; float *eval_gpu; magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n), 1 + 6*3*n + 2* 3*n* 3*n); magma_smalloc_pinned( &hwork , lwork ); magma_smalloc( &blockAX , m*n ); magma_smalloc( &blockAR , m*n ); magma_smalloc( &blockAP , m*n ); magma_smalloc( &blockR , m*n ); magma_smalloc( &blockP , m*n ); magma_smalloc( &blockW , m*n ); magma_smalloc( &dwork , m*n ); magma_smalloc( &eval_gpu , 3*n ); //**********************************************************+ magma_int_t verbosity = 1; magma_int_t *iwork, liwork = 15*n+9; // === Set solver parameters === float residualTolerance = solver_par->epsilon; magma_int_t maxIterations = solver_par->maxiter; // === Set some constants & defaults === float c_one = MAGMA_S_ONE, c_zero = MAGMA_S_ZERO; float *residualNorms, *condestGhistory, condestG; float *gevalues; magma_int_t *activeMask; // === Check some parameters for possible quick exit === solver_par->info = 0; if (m < 2) solver_par->info = -1; else if (n > m) solver_par->info = -2; if (solver_par->info != 0) { magma_xerbla( __func__, -(solver_par->info) ); return solver_par->info; } magma_int_t *info = &(solver_par->info); // local info variable; // === Allocate GPU memory for the residual norms' history === magma_smalloc(&residualNorms, (maxIterations+1) * n); magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) ); // === Allocate CPU work space === magma_smalloc_cpu(&condestGhistory, maxIterations+1); magma_smalloc_cpu(&gevalues, 3 * n); magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t)); float *hW; magma_smalloc_pinned(&hW, n*n); magma_smalloc_pinned(&gevectors, 9*n*n); magma_smalloc_pinned(&h_gramB , 9*n*n); // === Allocate GPU workspace === magma_smalloc(&gramM, n * n); magma_smalloc(&gramA, 9 * n * n); magma_smalloc(&gramB, 9 * n * n); #if defined(PRECISION_z) || defined(PRECISION_c) float *rwork; magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n); magma_smalloc_cpu(&rwork, lrwork); #endif // === Set activemask to one === for(int k =0; k<n; k++) iwork[k]=1; magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n); magma_int_t gramDim, ldgram = 3*n, ikind = 4; // === Make the initial vectors orthonormal === magma_sgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info ); //magma_sorthomgs( m, n, blockX ); magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); // === Compute the Gram matrix = (X, AX) & its eigenstates === magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_ssyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, evalues, hW, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); // === Update X = X * evectors === magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockX); // === Update AX = AX * evectors === magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramM, n, c_zero, blockW, m); magmablas_swap(blockW, blockAX); condestGhistory[1] = 7.82; magma_int_t iterationNumber, cBlockSize, restart = 1, iter; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); // === Main LOBPCG loop ============================================================ for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++) { // === compute the residuals (R = Ax - x evalues ) magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); /* for(int i=0; i<n; i++){ magma_saxpy(m, MAGMA_S_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1); } */ #if defined(PRECISION_z) || defined(PRECISION_d) magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #else magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n ); #endif magma_slobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu); magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === remove the residuals corresponding to already converged evectors magma_scompact(m, n, blockR, m, residualNorms(0, iterationNumber), residualTolerance, activeMask, &cBlockSize); if (cBlockSize == 0) break; // === apply a preconditioner P to the active residulas: R_new = P R_old // === for now set P to be identity (no preconditioner => nothing to be done ) // magmablas_slacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m); /* // === make the preconditioned residuals orthogonal to X magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m); */ // === make the active preconditioned residuals orthonormal magma_sgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info ); //magma_sorthomgs( m, cBlockSize, blockR ); // === compute AR magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR ); if (!restart) { // === compact P & AP as well magma_scompactActive(m, n, blockP, m, activeMask); magma_scompactActive(m, n, blockAP, m, activeMask); /* // === make P orthogonal to X ? magma_sgemm(MagmaTrans, MagmaNoTrans, n, cBlockSize, m, c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n, c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m); // === make P orthogonal to R ? magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize, c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m); */ // === Make P orthonormal & properly change AP (without multiplication by A) magma_sgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info ); //magma_sorthomgs( m, cBlockSize, blockP ); //magma_s_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP ); magma_ssetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize); // magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, // m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); // replacement according to Stan #if defined(PRECISION_s) || defined(PRECISION_d) magmablas_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #else magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m); #endif } iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize))); float condestGmean = 0.; for(int i = 0; i<iterationNumber-iter+1; i++) condestGmean += condestGhistory[i]; condestGmean = condestGmean / (iterationNumber-iter+1); if (restart) gramDim = n+cBlockSize; else gramDim = n+2*cBlockSize; /* --- The Raileight-Ritz method for [X R P] ----------------------- [ X R P ]' [AX AR AP] y = evalues [ X R P ]' [ X R P ], i.e., GramA GramB / X'AX X'AR X'AP \ / X'X X'R X'P \ | R'AX R'AR R'AP | y = evalues | R'X R'R R'P | \ P'AX P'AR P'AP / \ P'X P'R P'P / ----------------------------------------------------------------- */ // === assemble GramB; first, set it to I magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram); // identity if (!restart) { magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram); } magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram); // === get GramB from the GPU to the CPU and compute its eigenvalues only magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); lapackf77_ssyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, #endif info); // === check stability criteria if we need to restart condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.; if ((condestG/condestGmean>2 && condestG>2) || condestG>8) { // Steepest descent restart for stability restart=1; printf("restart at step #%d\n", (int) iterationNumber); } // === assemble GramA; first, set it to I magmablas_slaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram); // identity magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram); if (!restart) { magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, n, m, c_one, blockP, m, blockAX, m, c_zero, gramA(n+cBlockSize,0), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAR, m, c_zero, gramA(n+cBlockSize,n), ldgram); magma_sgemm(MagmaTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, c_one, blockP, m, blockAP, m, c_zero, gramA(n+cBlockSize,n+cBlockSize), ldgram); } /* // === Compute X' AX or just use the eigenvalues below ? magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramA(0,0), ldgram); */ if (restart==0) { magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } else { gramDim = n+cBlockSize; magma_sgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram); } for(int k=0; k<n; k++) *gevectors(k,k) = MAGMA_S_MAKE(evalues[k], 0); // === the previous eigensolver destroyed what is in h_gramB => must copy it again magma_sgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram); magma_int_t itype = 1; lapackf77_ssygvd(&itype, "V", "L", &gramDim, gevectors, &ldgram, h_gramB, &ldgram, gevalues, hwork, &lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, &lrwork, #endif iwork, &liwork, info); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === copy back the result to gramA on the GPU and use it for the updates magma_ssetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram); if (restart == 0) { // === contribution from P to the new X (in new search direction P) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockP); // === contribution from R to the new X (in new search direction P) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m); // === corresponding contribution from AP to the new AX (in AP) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAP); // === corresponding contribution from AR to the new AX (in AP) magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m); } else { // === contribution from R (only) to the new X magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m); // === corresponding contribution from AR (only) to the new AX magma_sgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize, c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m); } // === contribution from old X to the new X + the new search direction P magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockX); //magma_saxpy(m*n, c_one, blockP, 1, blockX, 1); magma_slobpcg_maxpy( m, n, blockP, blockX ); // === corresponding contribution from old AX to new AX + AP magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m); magmablas_swap(dwork, blockAX); //magma_saxpy(m*n, c_one, blockAP, 1, blockAX, 1); magma_slobpcg_maxpy( m, n, blockAP, blockAX ); condestGhistory[iterationNumber+1]=condestG; if (verbosity==1) { // float res; // magma_sgetmatrix(1, 1, // (float*)residualNorms(0, iterationNumber), 1, // (float*)&res, 1); // // printf("Iteration %4d, CBS %4d, Residual: %10.7f\n", // iterationNumber, cBlockSize, res); printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1); } restart = 0; } // === end for iterationNumber = 1,maxIterations ======================= // fill solver info magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; solver_par->numiter = iterationNumber; if( solver_par->numiter < solver_par->maxiter) { solver_par->info = 0; } else if( solver_par->init_res > solver_par->final_res ) solver_par->info = -2; else solver_par->info = -1; // ============================================================================= // === postprocessing; // ============================================================================= // === compute the real AX and corresponding eigenvalues magma_s_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX ); magma_sgemm(MagmaTrans, MagmaNoTrans, n, n, m, c_one, blockX, m, blockAX, m, c_zero, gramM, n); magma_ssyevd_gpu( MagmaVec, MagmaUpper, n, gramM, n, gevalues, dwork, n, hwork, lwork, #if defined(PRECISION_z) || defined(PRECISION_c) rwork, lrwork, #endif iwork, liwork, info ); for(int k =0; k<n; k++) evalues[k] = gevalues[k]; // === update X = X * evectors magmablas_swap(blockX, dwork); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockX, m); // === update AX = AX * evectors to compute the final residual magmablas_swap(blockAX, dwork); magma_sgemm(MagmaNoTrans, MagmaNoTrans, m, n, n, c_one, dwork, m, gramM, n, c_zero, blockAX, m); // === compute R = AX - evalues X magmablas_slacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m); for(int i=0; i<n; i++) magma_saxpy(m, MAGMA_S_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1); // === residualNorms[iterationNumber] = || R || magmablas_snrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber)); // === restore blockX if needed if (blockX != origX) magmablas_slacpy( MagmaUpperLower, m, n, blockX, m, origX, m); printf("Eigenvalues:\n"); for(int i =0; i<n; i++) printf("%e ", evalues[i]); printf("\n\n"); printf("Final residuals:\n"); magma_sprint_gpu(1, n, residualNorms(0, iterationNumber), 1); printf("\n\n"); //=== Print residual history in a file for plotting ==== float *hresidualNorms; magma_smalloc_cpu(&hresidualNorms, (iterationNumber+1) * n); magma_sgetmatrix(n, iterationNumber, (float*)residualNorms, n, (float*)hresidualNorms, n); printf("Residuals are stored in file residualNorms\n"); printf("Plot the residuals using: myplot \n"); FILE *residuals_file; residuals_file = fopen("residualNorms", "w"); for(int i =1; i<iterationNumber; i++) { for(int j = 0; j<n; j++) fprintf(residuals_file, "%f ", *hresidualNorms(j,i)); fprintf(residuals_file, "\n"); } fclose(residuals_file); magma_free_cpu(hresidualNorms); // === free work space magma_free( residualNorms ); magma_free_cpu( condestGhistory ); magma_free_cpu( gevalues ); magma_free_cpu( iwork ); magma_free_pinned( hW ); magma_free_pinned( gevectors ); magma_free_pinned( h_gramB ); magma_free( gramM ); magma_free( gramA ); magma_free( gramB ); magma_free( activeMask ); magma_free( blockAX ); magma_free( blockAR ); magma_free( blockAP ); magma_free( blockR ); magma_free( blockP ); magma_free( blockW ); magma_free( dwork ); magma_free( eval_gpu ); magma_free_pinned( hwork ); #if defined(PRECISION_z) || defined(PRECISION_c) magma_free_cpu( rwork ); #endif return MAGMA_SUCCESS; }
extern "C" magma_int_t magma_sgetrf_recpanel_batched_q( magma_int_t m, magma_int_t n, magma_int_t min_recpnb, float** dA_array, magma_int_t ldda, magma_int_t** dipiv_array, magma_int_t** dpivinfo_array, float** dX_array, magma_int_t dX_length, float** dinvA_array, magma_int_t dinvA_length, float** dW1_displ, float** dW2_displ, float** dW3_displ, float** dW4_displ, float** dW5_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t stream, cublasHandle_t myhandle) { //magma_int_t DEBUG = 3; // Quick return if possible if (m ==0 || n == 0) { return 0; } float **dA_displ = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_int_t **dipiv_displ = NULL; magma_malloc((void**)&dipiv_displ, batchCount * sizeof(*dipiv_displ)); magma_int_t panel_nb = n; if(panel_nb <= min_recpnb){ //if(DEBUG>0)printf("calling bottom panel recursive with m=%d nb=%d\n",m,n); // panel factorization //magma_sdisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount); magma_sgetf2_batched( m, panel_nb, dA_array, ldda, dW1_displ, dW2_displ, dW3_displ, dipiv_array, info_array, gbstep, batchCount, myhandle); } else{ // split A over two [A A2] // panel on A1, update on A2 then panel on A1 magma_int_t n1 = n/2; magma_int_t n2 = n-n1; magma_int_t m1 = m; magma_int_t m2 = m-n1; magma_int_t p1 = 0; magma_int_t p2 = n1; // panel on A1 //if(DEBUG>0)printf("calling recursive panel on A1 with m=%d nb=%d min_recpnb %d\n",m1,n1,min_recpnb); magma_sdisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount); magma_idisplace_pointers(dipiv_displ, dipiv_array, 1, p1, 0, batchCount); magma_sgetrf_recpanel_batched_q( m1, n1, min_recpnb, dA_displ, ldda, dipiv_displ, dpivinfo_array, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep, batchCount, stream, myhandle); // update A2 //if(DEBUG>0)printf("calling TRSM with m=%d n=%d \n",m1,n2); // setup pivinfo setup_pivinfo_batched_q(dpivinfo_array, dipiv_displ, m1, n1, stream, batchCount); magma_sdisplace_pointers(dW5_displ, dA_array, ldda, p1, p2, batchCount); magma_slaswp_rowparallel_batched( n2, dW5_displ, ldda, dX_array, n1, 0, n1, dpivinfo_array, batchCount); magmablas_strsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1, n1, n2, MAGMA_S_ONE, dA_displ, ldda, // dA dX_array, n1, // dB dW5_displ, ldda, // dX dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 0, batchCount); magma_sdisplace_pointers(dW1_displ, dA_array, ldda, p2, 0, batchCount); magma_sdisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount); //if(DEBUG>0)printf("calling update A2(%d,%d) -= A(%d,%d)*A(%d,%d) with m=%d n=%d k=%d ldda %d\n",p2,p2,p2,0,p1,p2,m2,n2,n1,ldda); #if 0 float neg_one = MAGMA_S_NEG_ONE; float one = MAGMA_S_ONE; cublasSgemmBatched(myhandle, CUBLAS_OP_N, CUBLAS_OP_N, m2, n2, n1, &neg_one, (const float**) dW1_displ, ldda, (const float**) dW5_displ, ldda, &one, dA_displ, ldda, batchCount ); #else magmablas_sgemm_batched( MagmaNoTrans, MagmaNoTrans, m2, n2, n1, MAGMA_S_NEG_ONE, dW1_displ, ldda, dW5_displ, ldda, MAGMA_S_ONE, dA_displ, ldda, batchCount); #endif // panel on A2 //if(DEBUG>0)printf("calling recursive panel on A2 with m=%d nb=%d min_recpnb %d\n",m2,n2,min_recpnb); magma_idisplace_pointers(dipiv_displ, dipiv_array, 1, p2, 0, batchCount); magma_sgetrf_recpanel_batched_q( m2, n2, min_recpnb, dA_displ, ldda, dipiv_displ, dpivinfo_array, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep+p2, batchCount, stream, myhandle); // setup pivinfo setup_pivinfo_batched_q(dpivinfo_array, dipiv_displ, m2, n2, stream, batchCount); adjust_ipiv_batched_q(dipiv_displ, n2, n1, magma_stream, batchCount); magma_sdisplace_pointers(dW1_displ, dA_array, ldda, p2, 0, batchCount); // no need since it is above magma_slaswp_rowparallel_batched( n1, dW1_displ, ldda, dW1_displ, ldda, n1, n, dpivinfo_array, batchCount); } magma_free(dA_displ); magma_free(dipiv_displ); return 0; }
extern "C" magma_int_t magma_zgeev(magma_vec_t jobvl, magma_vec_t jobvr, magma_int_t n, magmaDoubleComplex *a, magma_int_t lda, magmaDoubleComplex *geev_w_array, magmaDoubleComplex *vl, magma_int_t ldvl, magmaDoubleComplex *vr, magma_int_t ldvr, magmaDoubleComplex *work, magma_int_t lwork, double *rwork, magma_int_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver September 2012 Purpose ======= ZGEEV computes for an N-by-N complex nonsymmetric matrix A, the eigenvalues and, optionally, the left and/or right eigenvectors. The right eigenvector v(j) of A satisfies A * v(j) = lambda(j) * v(j) where lambda(j) is its eigenvalue. The left eigenvector u(j) of A satisfies u(j)**H * A = lambda(j) * u(j)**H where u(j)**H denotes the conjugate transpose of u(j). The computed eigenvectors are normalized to have Euclidean norm equal to 1 and largest component real. Arguments ========= JOBVL (input) CHARACTER*1 = 'N': left eigenvectors of A are not computed; = 'V': left eigenvectors of are computed. JOBVR (input) CHARACTER*1 = 'N': right eigenvectors of A are not computed; = 'V': right eigenvectors of A are computed. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the N-by-N matrix A. On exit, A has been overwritten. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). W (output) COMPLEX*16 array, dimension (N) W contains the computed eigenvalues. VL (output) COMPLEX*16 array, dimension (LDVL,N) If JOBVL = 'V', the left eigenvectors u(j) are stored one after another in the columns of VL, in the same order as their eigenvalues. If JOBVL = 'N', VL is not referenced. u(j) = VL(:,j), the j-th column of VL. LDVL (input) INTEGER The leading dimension of the array VL. LDVL >= 1; if JOBVL = 'V', LDVL >= N. VR (output) COMPLEX*16 array, dimension (LDVR,N) If JOBVR = 'V', the right eigenvectors v(j) are stored one after another in the columns of VR, in the same order as their eigenvalues. If JOBVR = 'N', VR is not referenced. v(j) = VR(:,j), the j-th column of VR. LDVR (input) INTEGER The leading dimension of the array VR. LDVR >= 1; if JOBVR = 'V', LDVR >= N. WORK (workspace/output) COMPLEX*16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= (1+nb)*N. If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. RWORK (workspace) DOUBLE PRECISION array, dimension (2*N) INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value. > 0: if INFO = i, the QR algorithm failed to compute all the eigenvalues, and no eigenvectors have been computed; elements and i+1:N of W contain eigenvalues which have converged. ===================================================================== */ magma_int_t c__1 = 1; magma_int_t c__0 = 0; magma_int_t a_dim1, a_offset, vl_dim1, vl_offset, vr_dim1, vr_offset, i__1, i__2, i__3; double d__1, d__2; magmaDoubleComplex z__1, z__2; magma_int_t i__, k, ihi; double scl; magma_int_t ilo; double dum[1], eps; magmaDoubleComplex tmp; magma_int_t ibal; double anrm; magma_int_t ierr, itau, iwrk, nout; magma_int_t scalea; double cscale; magma_int_t select[1]; double bignum; magma_int_t minwrk; magma_int_t wantvl; double smlnum; magma_int_t irwork; magma_int_t lquery, wantvr; magma_int_t nb = 0; magmaDoubleComplex_ptr dT; //magma_timestr_t start, end; char side[2] = {0, 0}; magma_vec_t jobvl_ = jobvl; magma_vec_t jobvr_ = jobvr; *info = 0; lquery = lwork == -1; wantvl = lapackf77_lsame(lapack_const(jobvl_), "V"); wantvr = lapackf77_lsame(lapack_const(jobvr_), "V"); if (! wantvl && ! lapackf77_lsame(lapack_const(jobvl_), "N")) { *info = -1; } else if (! wantvr && ! lapackf77_lsame(lapack_const(jobvr_), "N")) { *info = -2; } else if (n < 0) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if ( (ldvl < 1) || (wantvl && (ldvl < n))) { *info = -8; } else if ( (ldvr < 1) || (wantvr && (ldvr < n))) { *info = -10; } /* Compute workspace */ if (*info == 0) { nb = magma_get_zgehrd_nb(n); minwrk = (1+nb)*n; work[0] = MAGMA_Z_MAKE((double) minwrk, 0.); if (lwork < minwrk && ! lquery) { *info = -12; } } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } // if eigenvectors are needed #if defined(VERSION3) if (MAGMA_SUCCESS != magma_malloc(&dT, nb*n*sizeof(magmaDoubleComplex) )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } #endif a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; vl_dim1 = ldvl; vl_offset = 1 + vl_dim1; vl -= vl_offset; vr_dim1 = ldvr; vr_offset = 1 + vr_dim1; vr -= vr_offset; --work; --rwork; /* Get machine constants */ eps = lapackf77_dlamch("P"); smlnum = lapackf77_dlamch("S"); bignum = 1. / smlnum; lapackf77_dlabad(&smlnum, &bignum); smlnum = magma_dsqrt(smlnum) / eps; bignum = 1. / smlnum; /* Scale A if max element outside range [SMLNUM,BIGNUM] */ anrm = lapackf77_zlange("M", &n, &n, &a[a_offset], &lda, dum); scalea = 0; if (anrm > 0. && anrm < smlnum) { scalea = 1; cscale = smlnum; } else if (anrm > bignum) { scalea = 1; cscale = bignum; } if (scalea) { lapackf77_zlascl("G", &c__0, &c__0, &anrm, &cscale, &n, &n, &a[a_offset], &lda, & ierr); } /* Balance the matrix (CWorkspace: none) (RWorkspace: need N) */ ibal = 1; lapackf77_zgebal("B", &n, &a[a_offset], &lda, &ilo, &ihi, &rwork[ibal], &ierr); /* Reduce to upper Hessenberg form (CWorkspace: need 2*N, prefer N+N*NB) (RWorkspace: none) */ itau = 1; iwrk = itau + n; i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) /* * Version 1 - LAPACK */ lapackf77_zgehrd(&n, &ilo, &ihi, &a[a_offset], &lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION2) /* * Version 2 - LAPACK consistent HRD */ magma_zgehrd2(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored, */ magma_zgehrd(n, ilo, ihi, &a[a_offset], lda, &work[itau], &work[iwrk], i__1, dT, 0, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for zgehrd = %5.2f sec\n", GetTimerValue(start,end)/1000.); if (wantvl) { /* Want left eigenvectors Copy Householder vectors to VL */ side[0] = 'L'; lapackf77_zlacpy(MagmaLowerStr, &n, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl); /* Generate unitary matrix in VL (CWorkspace: need 2*N-1, prefer N+(N-1)*NB) (RWorkspace: none) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_zunghr(&n, &ilo, &ihi, &vl[vl_offset], &ldvl, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_zunghr(n, ilo, ihi, &vl[vl_offset], ldvl, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* Perform QR iteration, accumulating Schur vectors in VL (CWorkspace: need 1, prefer HSWORK (see comments) ) (RWorkspace: none) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, &vl[vl_offset], &ldvl, &work[iwrk], &i__1, info); if (wantvr) { /* Want left and right eigenvectors Copy Schur vectors to VR */ side[0] = 'B'; lapackf77_zlacpy("F", &n, &n, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr); } } else if (wantvr) { /* Want right eigenvectors Copy Householder vectors to VR */ side[0] = 'R'; lapackf77_zlacpy("L", &n, &n, &a[a_offset], &lda, &vr[vr_offset], &ldvr); /* Generate unitary matrix in VR (CWorkspace: need 2*N-1, prefer N+(N-1)*NB) (RWorkspace: none) */ i__1 = lwork - iwrk + 1; //start = get_current_time(); #if defined(VERSION1) || defined(VERSION2) /* * Version 1 & 2 - LAPACK */ lapackf77_zunghr(&n, &ilo, &ihi, &vr[vr_offset], &ldvr, &work[itau], &work[iwrk], &i__1, &ierr); #elif defined(VERSION3) /* * Version 3 - LAPACK consistent MAGMA HRD + matrices T stored */ magma_zunghr(n, ilo, ihi, &vr[vr_offset], ldvr, &work[itau], dT, 0, nb, &ierr, queue); #endif //end = get_current_time(); //printf(" Time for zunghr = %5.2f sec\n", GetTimerValue(start,end)/1000.); /* Perform QR iteration, accumulating Schur vectors in VR (CWorkspace: need 1, prefer HSWORK (see comments) ) (RWorkspace: none) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_zhseqr("S", "V", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } else { /* Compute eigenvalues only (CWorkspace: need 1, prefer HSWORK (see comments) ) (RWorkspace: none) */ iwrk = itau; i__1 = lwork - iwrk + 1; lapackf77_zhseqr("E", "N", &n, &ilo, &ihi, &a[a_offset], &lda, geev_w_array, &vr[vr_offset], &ldvr, &work[iwrk], &i__1, info); } /* If INFO > 0 from ZHSEQR, then quit */ if (*info > 0) { goto L50; } if (wantvl || wantvr) { /* Compute left and/or right eigenvectors (CWorkspace: need 2*N) (RWorkspace: need 2*N) */ irwork = ibal + n; lapackf77_ztrevc(side, "B", select, &n, &a[a_offset], &lda, &vl[vl_offset], &ldvl, &vr[vr_offset], &ldvr, &n, &nout, &work[iwrk], &rwork[irwork], &ierr); } if (wantvl) { /* Undo balancing of left eigenvectors (CWorkspace: none) (RWorkspace: need N) */ lapackf77_zgebak("B", "L", &n, &ilo, &ihi, &rwork[ibal], &n, &vl[vl_offset], &ldvl, &ierr); /* Normalize left eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { scl = 1. / cblas_dznrm2(n, &vl[i__ * vl_dim1 + 1], 1); cblas_zdscal(n, scl, &vl[i__ * vl_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { i__3 = k + i__ * vl_dim1; /* Computing 2nd power */ d__1 = MAGMA_Z_REAL(vl[i__3]); /* Computing 2nd power */ d__2 = MAGMA_Z_IMAG(vl[k + i__ * vl_dim1]); rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_idamax */ k = cblas_idamax(n, &rwork[irwork], 1)+1; z__2 = MAGMA_Z_CNJG(vl[k + i__ * vl_dim1]); d__1 = magma_dsqrt(rwork[irwork + k - 1]); MAGMA_Z_DSCALE(z__1, z__2, d__1); tmp = z__1; cblas_zscal(n, CBLAS_SADDR(tmp), &vl[i__ * vl_dim1 + 1], 1); i__2 = k + i__ * vl_dim1; i__3 = k + i__ * vl_dim1; d__1 = MAGMA_Z_REAL(vl[i__3]); MAGMA_Z_SET2REAL(z__1, d__1); vl[i__2] = z__1; } } if (wantvr) { /* Undo balancing of right eigenvectors (CWorkspace: none) (RWorkspace: need N) */ lapackf77_zgebak("B", "R", &n, &ilo, &ihi, &rwork[ibal], &n, &vr[vr_offset], &ldvr, &ierr); /* Normalize right eigenvectors and make largest component real */ for (i__ = 1; i__ <= n; ++i__) { scl = 1. / cblas_dznrm2(n, &vr[i__ * vr_dim1 + 1], 1); cblas_zdscal(n, scl, &vr[i__ * vr_dim1 + 1], 1); i__2 = n; for (k = 1; k <= i__2; ++k) { i__3 = k + i__ * vr_dim1; /* Computing 2nd power */ d__1 = MAGMA_Z_REAL(vr[i__3]); /* Computing 2nd power */ d__2 = MAGMA_Z_IMAG(vr[k + i__ * vr_dim1]); rwork[irwork + k - 1] = d__1 * d__1 + d__2 * d__2; } /* Comment: Fortran BLAS does not have to add 1 C BLAS must add one to cblas_idamax */ k = cblas_idamax(n, &rwork[irwork], 1)+1; z__2 = MAGMA_Z_CNJG(vr[k + i__ * vr_dim1]); d__1 = magma_dsqrt(rwork[irwork + k - 1]); MAGMA_Z_DSCALE(z__1, z__2, d__1); tmp = z__1; cblas_zscal(n, CBLAS_SADDR(tmp), &vr[i__ * vr_dim1 + 1], 1); i__2 = k + i__ * vr_dim1; i__3 = k + i__ * vr_dim1; d__1 = MAGMA_Z_REAL(vr[i__3]); MAGMA_Z_SET2REAL(z__1, d__1); vr[i__2] = z__1; } } /* Undo scaling if necessary */ L50: if (scalea) { i__1 = n - *info; /* Computing MAX */ i__3 = n - *info; i__2 = max(i__3,1); lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, geev_w_array + *info, &i__2, &ierr); if (*info > 0) { i__1 = ilo - 1; lapackf77_zlascl("G", &c__0, &c__0, &cscale, &anrm, &i__1, &c__1, geev_w_array, &n, &ierr); } } #if defined(VERSION3) magma_free( dT ); #endif return *info; } /* magma_zgeev */
//=================================================================================================================== //=================================================================================================================== //=================================================================================================================== extern "C" magma_int_t magma_zlarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, magmaDoubleComplex **v_array, magma_int_t ldv, magmaDoubleComplex **tau_array, magmaDoubleComplex **T_array, magma_int_t ldt, magmaDoubleComplex **work_array, magma_int_t lwork, magma_int_t batchCount, magma_queue_t queue) { magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; if ( k <= 0) return 0; if ( stair_T > 0 && k <= stair_T) return 0; magma_int_t maxnb = max_shared_bsiz; if ( lwork < k*ldt) { magma_xerbla( __func__, -(10) ); return -10; } if ( stair_T > 0 && stair_T > maxnb) { magma_xerbla( __func__, -(3) ); return -3; } magma_int_t DEBUG=0; magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T; magma_int_t i, j, prev_n, mycol, rows; magmaDoubleComplex **dW1_displ = NULL; magmaDoubleComplex **dW2_displ = NULL; magmaDoubleComplex **dW3_displ = NULL; magmaDoubleComplex **dTstep_array = NULL; magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array)); //magmaDoubleComplex *Tstep = k > nb ? work : T; if (k > nb) { magma_zdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue); } else { magma_zdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue); } //magma_int_t ldtstep = k > nb ? k : ldt; magma_int_t ldtstep = ldt; //a enlever // stair_T = 0 meaning all T // stair_T > 0 meaning the triangular portion of T has been computed. // the value of stair_T is the nb of these triangulars //GEMV compute the whole triangular upper portion of T (phase 1) // TODO addcublas to check perf magma_zgemm_batched( MagmaConjTrans, MagmaNoTrans, k, k, n, c_one, v_array, ldv, v_array, ldv, c_zero, dTstep_array, ldtstep, batchCount, queue ); magmablas_zlaset_batched( MagmaLower, k, k, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dTstep_array, ldtstep, batchCount, queue ); // no need for it as T is expected to be lower zero //if (k > nb) magmablas_zlaset_batched( MagmaLower, k, k, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dTstep_array, ldtstep, batchCount, queue ); //TRMV //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k] // TRMV is split over block of column of size nb // the update should be done from top to bottom so: // 1- a gemm using the previous computed columns // of T to update rectangular upper protion above // the triangle of my columns // 2- the columns need to be updated by a serial // loop over of gemv over itself. since we limit the // shared memory to nb, this nb column // are split vertically by chunk of nb rows dim3 grid(1, 1, batchCount); for (j=0; j < k; j += nb) { prev_n = j; mycol = min(nb, k-j); // note that myrow = prev_n + mycol; if (prev_n > 0 && mycol > 0) { if (DEBUG == 3) { printf("doing gemm on the rectangular portion of size %d %d of T(%d,%d)\n", (int) prev_n, (int) mycol, 0, (int) j ); } magma_zdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue); magma_zdisplace_pointers(dW2_displ, T_array, ldt, 0, j, batchCount, queue); magma_zgemm_batched( MagmaNoTrans, MagmaNoTrans, prev_n, mycol, prev_n, c_one, T_array, ldt, dW1_displ, ldtstep, c_zero, dW2_displ, ldt, batchCount, queue ); // update my rectangular portion (prev_n,mycol) using sequence of gemv magma_zdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); magma_zdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); for (i=0; i < prev_n; i += nb) { rows = min(nb,prev_n-i); if (DEBUG == 3) { printf(" doing recztrmv on the rectangular portion of size %d %d of T(%d,%d)\n", (int) rows, (int) mycol, (int) i, (int) j ); } if (rows > 0 && mycol > 0) { magma_zdisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue); magmablas_zlarft_recztrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue); } } } // the upper rectangular protion is updated, now if needed update the triangular portion if (stair_T == 0) { if (DEBUG == 3) { printf("doing ztrmv on the triangular portion of size %d %d of T(%d,%d)\n", (int) mycol, (int) mycol, (int) j, (int) j ); } if (mycol > 0) { magma_zdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); magma_zdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); magma_zdisplace_pointers(dW2_displ, T_array, ldt, j, j, batchCount, queue); magmablas_zlarft_ztrmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue); } } }// end of j magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dTstep_array); return 0; }
/** \n This is an internal routine. ********************************************************************/ extern "C" magma_int_t magma_dpotrf_recpanel_batched( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t min_recpnb, double** dA_array, magma_int_t ldda, double** dX_array, magma_int_t dX_length, double** dinvA_array, magma_int_t dinvA_length, double** dW0_displ, double** dW1_displ, double** dW2_displ, double** dW3_displ, double** dW4_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue) { magma_int_t arginfo = 0; // Quick return if possible if (m == 0 || n == 0) { return arginfo; } if (uplo == MagmaUpper) { printf("Upper side is unavailable \n"); arginfo = -1; magma_xerbla( __func__, -(arginfo) ); return arginfo; } if (m < n) { printf("error m < n %d < %d \n", (int) m, (int) n); arginfo = -101; magma_xerbla( __func__, -(arginfo) ); return arginfo; } double **dA_displ = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); double alpha = MAGMA_D_NEG_ONE; double beta = MAGMA_D_ONE; magma_int_t panel_nb = n; if (panel_nb <= min_recpnb) { //printf("calling bottom panel recursive with m=%d nb=%d\n",m,n); // panel factorization magma_ddisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount, queue); //magma_dpotrf_rectile_batched(uplo, m, panel_nb, 16, arginfo = magma_dpotrf_panel_batched(uplo, m, panel_nb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); } else { // split A over two [A A2] // panel on A1, update on A2 then panel on A1 magma_int_t n1 = n/2; magma_int_t n2 = n-n1; magma_int_t m1 = m; magma_int_t m2 = m-n1; magma_int_t p1 = 0; magma_int_t p2 = n1; // panel on A1 //printf("calling recursive panel on A1 with m=%d nb=%d min_recpnb %d\n",m1,n1,min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue); arginfo = magma_dpotrf_recpanel_batched( uplo, m1, n1, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); if (arginfo != 0) { magma_free(dA_displ); return arginfo; } // update A2 //printf("calling update A2 with m=%d n=%d k=%d\n",m2,n2,n1); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p1+n1, p1, batchCount, queue); magma_ddisplace_pointers(dW0_displ, dA_array, ldda, p1+n1, p2, batchCount, queue); magma_dgemm_batched( MagmaNoTrans, MagmaConjTrans, m2, n2, n1, alpha, dA_displ, ldda, dA_displ, ldda, beta, dW0_displ, ldda, batchCount, queue ); // panel on A2 //printf("calling recursive panel on A2 with m=%d nb=%d min_recpnb %d\n",m2,n2,min_recpnb); magma_ddisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount, queue); arginfo = magma_dpotrf_recpanel_batched( uplo, m2, n2, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW0_displ, dW1_displ, dW2_displ, dW3_displ, dW4_displ, info_array, gbstep, batchCount, queue); } magma_free(dA_displ); return arginfo; }
extern "C" magma_int_t magma_zgetrf_recpanel_nopiv_batched( magma_int_t m, magma_int_t n, magma_int_t min_recpnb, magmaDoubleComplex** dA_array, magma_int_t ldda, magmaDoubleComplex** dX_array, magma_int_t dX_length, magmaDoubleComplex** dinvA_array, magma_int_t dinvA_length, magmaDoubleComplex** dW1_displ, magmaDoubleComplex** dW2_displ, magmaDoubleComplex** dW3_displ, magmaDoubleComplex** dW4_displ, magmaDoubleComplex** dW5_displ, magma_int_t *info_array, magma_int_t gbstep, magma_int_t batchCount, magma_queue_t queue ) { // Quick return if possible if (m == 0 || n == 0) { return 0; } magma_int_t arginfo = 0; magmaDoubleComplex **dA_displ = NULL; magma_malloc((void**)&dA_displ, batchCount * sizeof(*dA_displ)); magma_int_t panel_nb = n; if (panel_nb <= min_recpnb) { // if (DEBUG > 0)printf("calling bottom panel recursive with m=%d nb=%d\n",m,n); // panel factorization //magma_zdisplace_pointers(dA_displ, dA_array, ldda, 0, 0, batchCount); arginfo = magma_zgetrf_panel_nopiv_batched( m, panel_nb, dA_array, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep, batchCount, queue); if (arginfo != 0) return arginfo; } else { // split A over two [A A2] // panel on A1, update on A2 then panel on A1 magma_int_t n1 = n/2; magma_int_t n2 = n-n1; magma_int_t m1 = m; magma_int_t m2 = m-n1; magma_int_t p1 = 0; magma_int_t p2 = n1; // panel on A1 //printf("calling recursive panel on A1 with m=%d nb=%d min_recpnb %d\n",m1,n1,min_recpnb); magma_zdisplace_pointers(dA_displ, dA_array, ldda, p1, p1, batchCount, queue); arginfo = magma_zgetrf_recpanel_nopiv_batched( m1, n1, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep, batchCount, queue); if (arginfo != 0) return arginfo; // update A2 //printf("calling update A2 with m=%d n=%d k=%d\n",m2,n2,n1); magma_zdisplace_pointers(dW5_displ, dA_array, ldda, p1, p2, batchCount, queue); magmablas_ztrsm_work_batched( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1, n1, n2, MAGMA_Z_ONE, dA_displ, ldda, // dA dW5_displ, ldda, // dB dX_array, n1, // dX dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue ); magma_zdisplace_pointers(dW1_displ, dA_array, ldda, p2, 0, batchCount, queue); magma_zdisplace_pointers(dA_displ, dA_array, ldda, p2, p2, batchCount, queue); magma_zgemm_batched( MagmaNoTrans, MagmaNoTrans, m2, n2, n1, MAGMA_Z_NEG_ONE, dW1_displ, ldda, dW5_displ, ldda, MAGMA_Z_ONE, dA_displ, ldda, batchCount, queue ); // panel on A2 //printf("calling recursive panel on A2 with m=%d nb=%d min_recpnb %d\n",m2,n2,min_recpnb); arginfo = magma_zgetrf_recpanel_nopiv_batched( m2, n2, min_recpnb, dA_displ, ldda, dX_array, dX_length, dinvA_array, dinvA_length, dW1_displ, dW2_displ, dW3_displ, dW4_displ, dW5_displ, info_array, gbstep+p2, batchCount, queue); if (arginfo != 0) return arginfo; } magma_free(dA_displ); return 0; }
/***************************************************************************//** Purpose ------- DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[in,out] dR_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDR, N/NB) dR should be of size (LDDR, N) when provide_RT > 0 and of size (LDDT, NB) otherwise. NB is the local blocking size. On exit, the elements of R are stored in dR only when provide_RT > 0. @param[in] lddr INTEGER The leading dimension of the array dR. LDDR >= min(M,N) when provide_RT == 1 otherwise LDDR >= min(NB, min(M,N)). NB is the local blocking size. To benefit from coalescent memory accesses LDDR must be divisible by 16. @param[in,out] dT_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array on the GPU, dimension (LDDT, N/NB) dT should be of size (LDDT, N) when provide_RT > 0 and of size (LDDT, NB) otherwise. NB is the local blocking size. On exit, the elements of T are stored in dT only when provide_RT > 0. @param[in] lddt INTEGER The leading dimension of the array dT. LDDT >= min(NB,min(M,N)). NB is the local blocking size. To benefit from coalescent memory accesses LDDR must be divisible by 16. @param[out] dtau_array Array of pointers, dimension (batchCount). Each is a DOUBLE PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[in] provide_RT INTEGER provide_RT = 0 no R and no T in output. dR and dT are used as local workspace to store the R and T of each step. provide_RT = 1 the whole R of size (min(M,N), N) and the nbxnb block of T are provided in output. provide_RT = 2 the nbxnb diag block of R and of T are provided in output. @param[out] info_array Array of INTEGERs, dimension (batchCount), for corresponding matrices. - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. @param[in] batchCount INTEGER The number of matrices to operate on. @param[in] queue magma_queue_t Queue to execute in. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_geqrf_batched *******************************************************************************/ extern "C" magma_int_t magma_dgeqrf_expert_batched( magma_int_t m, magma_int_t n, double **dA_array, magma_int_t ldda, double **dR_array, magma_int_t lddr, double **dT_array, magma_int_t lddt, double **dtau_array, magma_int_t provide_RT, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) /* Local Parameter */ magma_int_t nb = magma_get_dgeqrf_batched_nb(m); magma_int_t nnb = 8; magma_int_t min_mn = min(m, n); /* Check arguments */ cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; else if (lddr < min_mn && provide_RT == 1) arginfo = -6; else if (lddr < min(min_mn, nb)) arginfo = -6; else if (lddt < min(min_mn, nb)) arginfo = -8; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if (min_mn == 0 ) return arginfo; if ( m > 2048 || n > 2048 ) { printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t i, k, ib=nb, jb=nnb, offset_RT=0, use_stream; magma_int_t ldw, offset; double **dW0_displ = NULL; double **dW1_displ = NULL; double **dW2_displ = NULL; double **dW3_displ = NULL; double **dW4_displ = NULL; double **dW5_displ = NULL; double **dR_displ = NULL; double **dT_displ = NULL; double *dwork = NULL; double **cpuAarray = NULL; double **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_displ, batchCount * sizeof(*dR_displ)); magma_malloc((void**)&dT_displ, batchCount * sizeof(*dT_displ)); magma_dmalloc(&dwork, (2 * nb * n) * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(double*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_displ == NULL || dT_displ == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magma_ddisplace_pointers(dR_displ, dR_array, lddr, 0, 0, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, 0, batchCount, queue); // set dwork to zero because our GEMM routine does propagate NAN when C=betaC+alphaA*B and beta=0 magmablas_dlaset_q( MagmaFull, 2*nb, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, 2*nb, queue ); // set dR and dT to zero. if provide_RT == 0 only a tile of size nbxnb is used and overwritten at each step magmablas_dlaset_batched( MagmaFull, lddr, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dR_displ, lddr, batchCount, queue ); magmablas_dlaset_batched( MagmaFull, lddt, (provide_RT > 0 ? n:min(min_mn,nb)), MAGMA_D_ZERO, MAGMA_D_ZERO, dT_displ, lddt, batchCount, queue ); /* if ( provide_RT > 0 ) { magmablas_dlaset_q( MagmaFull, lddr, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, n*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } else { magmablas_dlaset_q( MagmaFull, lddr, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dR, lddr, queue ); magmablas_dlaset_q( MagmaFull, lddt, nb*batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dT, lddt, queue ); } */ magma_int_t streamid; const magma_int_t nbstreams=10; magma_queue_t queues[nbstreams]; for (i=0; i < nbstreams; i++) { magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[i] ); } magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1, queue); magma_getvector( batchCount, sizeof(double*), dT_array, 1, cpuTarray, 1, queue); for (i=0; i < min_mn; i += nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); if ( provide_RT > 0 ) { offset_RT = i; magma_ddisplace_pointers(dR_displ, dR_array, lddr, (provide_RT == 1 ? offset_RT:0), offset_RT, batchCount, queue); magma_ddisplace_pointers(dT_displ, dT_array, lddt, 0, offset_RT, batchCount, queue); } //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_dgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dR_displ, lddr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, queue); //=============================================== // end of panel //=============================================== //=============================================== // update trailing matrix //=============================================== if ( (n-ib-i) > 0) { //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; magma_dset_pointer( dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; magma_dset_pointer( dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); // set the diagonal of v as one and the upper triangular part as zero already set inside geqrf_panel //magmablas_dlaset_batched( MagmaUpper, ib, ib, MAGMA_D_ZERO, MAGMA_D_ONE, dW0_displ, ldda, batchCount, queue ); //magma_ddisplace_pointers(dW2_displ, dtau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_dlarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_displ, lddt, dW4_displ, nb*lddt, batchCount, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- use_stream = magma_drecommend_cublas_gemm_stream(MagmaNoTrans, MagmaNoTrans, m-i-ib, n-i-ib, ib); if ( use_stream ) { magma_queue_sync(queue); for (k=0; k < batchCount; k++) { streamid = k%nbstreams; // the queue gemm must take cpu pointer magma_dlarfb_gpu_gemm( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, cpuAarray[k] + i + i * ldda, ldda, cpuTarray[k] + offset_RT*lddt, lddt, cpuAarray[k] + i + (i+ib) * ldda, ldda, dwork + nb * n * k, -1, dwork + nb * n * batchCount + nb * n * k, -1, queues[streamid] ); } // need to synchronise to be sure that panel does not start before // finishing the update at least of the next panel // if queue is NULL, no need to sync if ( queue != NULL ) { for (magma_int_t s=0; s < nbstreams; s++) magma_queue_sync(queues[s]); } } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { //direct trailing matrix in dW1_displ magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_dlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, (const double**)dW0_displ, ldda, (const double**)dT_displ, lddt, dW1_displ, ldda, dW4_displ, ldw, dW5_displ, ldw, batchCount, queue ); } }// update the trailing matrix //=============================================== // copy dR back to V after the trailing matrix update, // only when provide_RT=0 otherwise the nbxnb block of V is set to diag=1/0 // The upper portion of V could be set totaly to 0 here if ( provide_RT == 0 ) { magmablas_dlacpy_batched( MagmaUpper, ib, ib, dR_displ, lddr, dW0_displ, ldda, batchCount, queue ); } } magma_queue_sync(queue); for (k=0; k < nbstreams; k++) { magma_queue_destroy( queues[k] ); } magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_displ); magma_free(dT_displ); magma_free(dwork); magma_free_cpu(cpuAarray); magma_free_cpu(cpuTarray); return arginfo; }
/** Purpose ------- Solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by SGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA REAL array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by SGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from SGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB REAL array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrs_batched( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, float **dA_array, magma_int_t ldda, magma_int_t **dipiv_array, float **dB_array, magma_int_t lddb, magma_int_t batchCount, magma_queue_t queue) { magma_int_t notran = (trans == MagmaNoTrans); magma_int_t info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { info = -1; } else if (n < 0) { info = -2; } else if (nrhs < 0) { info = -3; } else if (ldda < max(1,n)) { info = -5; } else if (lddb < max(1,n)) { info = -8; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return info; } float **dW1_displ = NULL; float **dW2_displ = NULL; float **dW3_displ = NULL; float **dW4_displ = NULL; float **dinvA_array = NULL; float **dwork_array = NULL; magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array)); magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array)); magma_int_t invA_msize = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB; magma_int_t dwork_msize = n*nrhs; float* dinvA = NULL; float* dwork = NULL;// dinvA and dwork are workspace in strsm magma_smalloc( &dinvA, invA_msize * batchCount); magma_smalloc( &dwork, dwork_msize * batchCount ); /* check allocation */ if ( dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dinvA_array == NULL || dwork_array == NULL || dinvA == NULL || dwork == NULL ) { magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_slaset_q(MagmaFull, invA_msize, batchCount, MAGMA_S_ZERO, MAGMA_S_ZERO, dinvA, invA_msize, queue); magmablas_slaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_S_ZERO, MAGMA_S_ZERO, dwork, dwork_msize, queue); sset_pointer(dwork_array, dwork, n, 0, 0, dwork_msize, batchCount, queue); sset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize, batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); if (notran) { magma_slaswp_rowserial_batched(nrhs, dB_array, lddb, 1, n, dipiv_array, batchCount, queue); // solve dwork = L^-1 * NRHS magmablas_strsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1, n, nrhs, MAGMA_S_ONE, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); // solve X = U^-1 * dwork magmablas_strsm_outofplace_batched(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 1, n, nrhs, MAGMA_S_ONE, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); } else{ /* Solve A**T * X = B or A**H * X = B. */ // solve magmablas_strsm_outofplace_batched(MagmaLeft, MagmaUpper, trans, MagmaUnit, 1, n, nrhs, MAGMA_S_ONE, dA_array, ldda, // dA dB_array, lddb, // dB dwork_array, n, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); // solve magmablas_strsm_outofplace_batched(MagmaLeft, MagmaLower, trans, MagmaNonUnit, 1, n, nrhs, MAGMA_S_ONE, dA_array, ldda, // dA dwork_array, n, // dB dB_array, lddb, // dX //output dinvA_array, invA_msize, dW1_displ, dW2_displ, dW3_displ, dW4_displ, 1, batchCount, queue); magma_slaswp_rowserial_batched(nrhs, dB_array, lddb, 1, n, dipiv_array, batchCount, queue); } magma_queue_sync(cstream); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dinvA_array); magma_free(dwork_array); magma_free( dinvA ); magma_free( dwork ); return info; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgemm_batched */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cpu_perf, cpu_time; float magma_error, magma_err, Ynorm, work[1]; magma_int_t M, N, Xm, Ym, lda, ldda; magma_int_t sizeA, sizeX, sizeY; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_int_t batchCount; magmaFloatComplex *h_A, *h_X, *h_Y, *h_Ymagma; magmaFloatComplex *d_A, *d_X, *d_Y; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.29, -0.86 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.48, 0.38 ); magmaFloatComplex **A_array = NULL; magmaFloatComplex **X_array = NULL; magmaFloatComplex **Y_array = NULL; magma_opts opts; parse_opts( argc, argv, &opts ); batchCount = opts.batchcount; opts.lapack |= opts.check; //float tol = opts.tolerance * lapackf77_slamch("E"); printf("trans = %s\n", lapack_trans_const(opts.transA) ); printf("BatchCount M N MAGMA Gflop/s (ms) CPU Gflop/s (ms) MAGMA error\n"); printf("===================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = ((M+31)/32)*32; gflops = FLOPS_CGEMV( M, N ) / 1e9 * batchCount; if ( opts.transA == MagmaNoTrans ) { Xm = N; Ym = M; } else { Xm = M; Ym = N; } sizeA = lda*N*batchCount; sizeX = incx*Xm*batchCount; sizeY = incy*Ym*batchCount; ldda = ((lda+31)/32)*32; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, sizeA ); TESTING_MALLOC_CPU( h_X, magmaFloatComplex, sizeX ); TESTING_MALLOC_CPU( h_Y, magmaFloatComplex, sizeY ); TESTING_MALLOC_CPU( h_Ymagma, magmaFloatComplex, sizeY ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N*batchCount ); TESTING_MALLOC_DEV( d_X, magmaFloatComplex, sizeX ); TESTING_MALLOC_DEV( d_Y, magmaFloatComplex, sizeY ); magma_malloc((void**)&A_array, batchCount * sizeof(*A_array)); magma_malloc((void**)&X_array, batchCount * sizeof(*X_array)); magma_malloc((void**)&Y_array, batchCount * sizeof(*Y_array)); /* Initialize the matrices */ lapackf77_clarnv( &ione, ISEED, &sizeA, h_A ); lapackf77_clarnv( &ione, ISEED, &sizeX, h_X ); lapackf77_clarnv( &ione, ISEED, &sizeY, h_Y ); /* ===================================================================== Performs operation using MAGMABLAS =================================================================== */ magma_csetmatrix( M, N*batchCount, h_A, lda, d_A, ldda ); magma_csetvector( Xm*batchCount, h_X, incx, d_X, incx ); magma_csetvector( Ym*batchCount, h_Y, incy, d_Y, incy ); cset_pointer(A_array, d_A, ldda, 0, 0, ldda*N, batchCount, magma_stream); cset_pointer(X_array, d_X, 1, 0, 0, incx*Xm, batchCount, magma_stream); cset_pointer(Y_array, d_Y, 1, 0, 0, incy*Ym, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ); magmablas_cgemv_batched(opts.transA, M, N, alpha, A_array, ldda, X_array, incx, beta, Y_array, incy, batchCount, magma_stream); magma_time = magma_sync_wtime( NULL ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( Ym*batchCount, d_Y, incy, h_Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ if ( opts.lapack ) { cpu_time = magma_wtime(); for(int i=0; i<batchCount; i++) { blasf77_cgemv( lapack_trans_const(opts.transA), &M, &N, &alpha, h_A + i*lda*N, &lda, h_X + i*Xm, &incx, &beta, h_Y + i*Ym, &incy ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; } /* ===================================================================== Check the result =================================================================== */ if ( opts.lapack ) { // compute relative error for both magma relative to lapack, // |C_magma - C_lapack| / |C_lapack| magma_error = 0.0; for(int s=0; s<batchCount; s++) { Ynorm = lapackf77_clange( "M", &M, &ione, h_Y + s*Ym, &incy, work ); blasf77_caxpy( &Ym, &c_neg_one, h_Y + s*Ym, &ione, h_Ymagma + s*Ym, &ione ); magma_err = lapackf77_clange( "M", &M, &ione, h_Ymagma + s*Ym, &incy, work ) / Ynorm; if ( isnan(magma_err) || isinf(magma_err) ) { magma_error = magma_err; break; } magma_error = max(fabs(magma_err), magma_error); } printf("%10d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %8.2e \n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time, cpu_perf, 1000.*cpu_time, magma_error); } else { printf("%10d %5d %5d %7.2f (%7.2f) --- ( --- ) ---\n", (int) batchCount, (int) M, (int) N, magma_perf, 1000.*magma_time); } TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_X ); TESTING_FREE_CPU( h_Y ); TESTING_FREE_CPU( h_Ymagma ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_X ); TESTING_FREE_DEV( d_Y ); TESTING_FREE_DEV( A_array ); TESTING_FREE_DEV( X_array ); TESTING_FREE_DEV( Y_array ); fflush( stdout); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
magma_err_t magma_sgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, size_t dA_offset, magma_int_t ldda, float *tau, magma_err_t *info, magma_queue_t queue) { /* -- clMAGMA (version 1.0.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver April 2012 Purpose ======= SGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -9, internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magmaFloat_ptr dwork; float *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_sgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_malloc( &dwork, n*nb*sizeof(float))) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( MAGMA_SUCCESS != magma_malloc_host((void**)&work, lwork*sizeof(float)) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } magma_event_t event[2] = {NULL, NULL}; nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_queue_sync( queue ); magma_sgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queue, &event[0]); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue); magma_ssetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork, dA(old_i, old_i), ldda, queue, &event[1]); } magma_event_sync(event[0]); lapackf77_sgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); if (i > 0) { magma_event_sync(event[1]); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); spanel_to_q( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); magma_ssetmatrix(rows, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue); sq_to_panel( MagmaUpper, ib, work_ref(i), ldwork, hwork+ib*ib ); if (i + ib < n) { magma_ssetmatrix(ib, ib, hwork, 0, ib, dwork, 0, lddwork, queue); if (i+nb < k-nx) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue); else { magma_slarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue); magma_ssetmatrix(ib, ib, work_ref(i), 0, ldwork, dA(i,i), ldda, queue); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_sgetmatrix(rows, ib, dA(i, i), ldda, work, 0, rows, queue); lhwork = lwork - rows*ib; lapackf77_sgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_ssetmatrix(rows, ib, work, 0, rows, dA(i, i), ldda, queue); } magma_free_host(work); return *info; } /* magma_sgeqrf2_gpu */