/** Purpose ------- DSYGVD computes all the eigenvalues, and optionally, the eigenvectors of a real generalized symmetric-definite eigenproblem, of the form A*x=(lambda)*B*x, A*Bx=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be symmetric and B is also positive definite. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA, N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the matrix Z of eigenvectors. The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**T * B * Z = I; if ITYPE = 3, Z**T * inv(B) * Z = I. If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper) or the lower triangle (if UPLO=MagmaLower) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B DOUBLE PRECISION array, dimension (LDB, N) On entry, the symmetric matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**T * U or B = L * L**T. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[out] w DOUBLE PRECISION array, dimension (N) If INFO = 0, the eigenvalues in ascending order. @param[out] work (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ). NB can be obtained through magma_get_dsytrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. 
If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK and IWORK arrays, returns these values as the first entries of the WORK and IWORK arrays, and no error message related to LWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: DPOTRF or DSYEVD returned an error code: <= N: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1); > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA Modified so that no backsubstitution is performed if DSYEVD fails to converge (NEIG in old code could be greater than N causing out of bounds reference to A - reported by Ralf Meyer). Also corrected the description of INFO and the test on ITYPE. Sven, 16 Feb 05. @ingroup magma_dsygv_driver ********************************************************************/ extern "C" magma_int_t magma_dsygvd( magma_int_t itype, magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, double *B, magma_int_t ldb, double *w, double *work, magma_int_t lwork, #ifdef COMPLEX double *rwork, magma_int_t lrwork, #endif magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); double d_one = MAGMA_D_ONE; double *dA=NULL, *dB=NULL; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lower; magma_trans_t trans; magma_int_t wantz, lquery; magma_int_t lwmin, liwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); lquery = (lwork == -1 || liwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -2; } else if (! (lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (lda < max(1,n)) { *info = -6; } else if (ldb < max(1,n)) { *info = -8; } magma_int_t nb = magma_get_dsytrd_nb( n ); if ( n <= 1 ) { lwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( 2*n + n*nb, 1 + 6*n + 2*n*n ); liwmin = 3 + 5*n; } else { lwmin = 2*n + n*nb; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon"); work[0] = lwmin * one_eps; iwork[0] = liwmin; if (lwork < lwmin && ! lquery) { *info = -11; } else if (liwork < liwmin && ! 
lquery) { *info = -13; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif lapackf77_dsygvd(&itype, jobz_, uplo_, &n, A, &lda, B, &ldb, w, work, &lwork, iwork, &liwork, info); return *info; } if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) { magma_free( dA ); magma_free( dB ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_dsetmatrix( n, n, B, ldb, dB, lddb ); magma_dsetmatrix_async( n, n, A, lda, dA, ldda, stream ); magma_timer_t time=0; timer_start( time ); magma_dpotrf_gpu(uplo, n, dB, lddb, info); if (*info != 0) { *info = n + *info; return *info; } timer_stop( time ); timer_printf( "time dpotrf_gpu = %6.2f\n", time ); magma_queue_sync( stream ); magma_dgetmatrix_async( n, n, dB, lddb, B, ldb, stream ); timer_start( time ); /* Transform problem to standard eigenvalue problem and solve. */ magma_dsygst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info); timer_stop( time ); timer_printf( "time dsygst_gpu = %6.2f\n", time ); /* simple fix to be able to run bigger size. * need to have a dwork here that will be used * as dB and then passed to dsyevd. * */ if (n > 5000) { magma_queue_sync( stream ); magma_free( dB ); } timer_start( time ); magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, A, lda, work, lwork, iwork, liwork, info); timer_stop( time ); timer_printf( "time dsyevd_gpu = %6.2f\n", time ); if (wantz && *info == 0) { timer_start( time ); /* allocate and copy dB back */ if (n > 5000) { if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_dsetmatrix( n, n, B, ldb, dB, lddb ); } /* Backtransform eigenvectors to the original problem. */ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaTrans; } else { trans = MagmaNoTrans; } magma_dtrsm(MagmaLeft, uplo, trans, MagmaNonUnit, n, n, d_one, dB, lddb, dA, ldda); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaTrans; } magma_dtrmm(MagmaLeft, uplo, trans, MagmaNonUnit, n, n, d_one, dB, lddb, dA, ldda); } magma_dgetmatrix( n, n, dA, ldda, A, lda ); /* free dB */ if (n > 5000) { magma_free( dB ); } timer_stop( time ); timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); work[0] = lwmin * one_eps; // round up iwork[0] = liwmin; magma_free( dA ); if (n <= 5000) { magma_free( dB ); } return *info; } /* magma_dsygvd */
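/*
   Illustrative sketch (not part of the original file): a typical calling
   sequence for magma_dsygvd on A*x = lambda*B*x (itype = 1), including the
   LWORK/LIWORK query described above.  The real-precision build compiles the
   rwork arguments out, as in this routine's prototype.  Matrix setup and
   error checks are elided; magma_dmalloc_cpu / magma_imalloc_cpu are assumed
   available as elsewhere in MAGMA.
*/
static magma_int_t example_dsygvd_call( magma_int_t n, double *A, magma_int_t lda,
                                        double *B, magma_int_t ldb, double *w )
{
    magma_int_t info = 0;
    double      work_query[1];
    magma_int_t iwork_query[1];

    /* workspace query: lwork = liwork = -1 returns the optimal sizes in entry 0 */
    magma_dsygvd( 1, MagmaVec, MagmaLower, n, A, lda, B, ldb, w,
                  work_query, -1, iwork_query, -1, &info );

    magma_int_t lwork  = (magma_int_t) work_query[0];
    magma_int_t liwork = iwork_query[0];
    double      *work  = NULL;
    magma_int_t *iwork = NULL;
    magma_dmalloc_cpu( &work,  lwork  );
    magma_imalloc_cpu( &iwork, liwork );

    /* actual solve: eigenvalues in w; if info == 0, eigenvectors overwrite A */
    magma_dsygvd( 1, MagmaVec, MagmaLower, n, A, lda, B, ldb, w,
                  work, lwork, iwork, liwork, &info );

    magma_free_cpu( work );
    magma_free_cpu( iwork );
    return info;
}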
/** Purpose ------- SGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of the big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel. Arguments --------- @param[in] num_gpus INTEGER The number of GPUs. num_gpus > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as a failed memory allocation. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
@ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf_m(magma_int_t num_gpus, magma_int_t m, magma_int_t n, float *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; float *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus0 = num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_sgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(float); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if ( num_gpus*NB >= n ) { #ifdef CHECK_SGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_SGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_SGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_sgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/num_gpus; if ( NB%(nb*num_gpus) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_smalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d < num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_ssetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_ssetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_stranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_spermute_long3( dAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_ssetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_stranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), 
ldn_local ); if ( M > ii+nb ) { magma_sgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { //magma_sgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, // (magma_queue_t **)stream, &iinfo); magma_sgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_sgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_SGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_sgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_sgetrf_m */
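/*
   Usage sketch for magma_sgetrf_m (illustrative, with abbreviated error
   handling): the matrix stays in host memory, preferably pinned as suggested
   in the documentation above, and the routine manages its own GPU workspace,
   so the m-by-n problem may exceed the memory of the selected GPUs.
*/
static magma_int_t example_sgetrf_m_call( magma_int_t num_gpus, magma_int_t m, magma_int_t n )
{
    magma_int_t info = 0, lda = m;
    float       *A    = NULL;
    magma_int_t *ipiv = NULL;

    magma_smalloc_pinned( &A, lda*n );      /* pinned host storage for A */
    magma_imalloc_cpu( &ipiv, min(m,n) );   /* pivot indices             */

    /* ... fill A with the matrix to be factored ... */

    magma_sgetrf_m( num_gpus, m, n, A, lda, ipiv, &info );
    if ( info != 0 )
        printf( "magma_sgetrf_m returned info = %lld\n", (long long) info );

    magma_free_pinned( A );
    magma_free_cpu( ipiv );
    return info;
}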
/** Purpose ------- CLAUUM computes the product U * U' or L' * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array dA. If UPLO = MagmaUpper then the upper triangle of the result is stored, overwriting the factor U in dA. If UPLO = MagmaLower then the lower triangle of the result is stored, overwriting the factor L in dA. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the triangular factor stored in the array dA is upper or lower triangular: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the triangular factor U or L. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the triangular factor U or L. On exit, if UPLO = MagmaUpper, the upper triangle of dA is overwritten with the upper triangle of the product U * U'; if UPLO = MagmaLower, the lower triangle of dA is overwritten with the lower triangle of the product L' * L. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value @ingroup magma_cposv_aux ***************************************************************************/ extern "C" magma_int_t magma_clauum_gpu(magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i, j) (dA + (j)*ldda + (i)) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nb, i, ib; float d_one = MAGMA_D_ONE; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex *work; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_cpotrf_nb(n); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if (nb <= 1 || nb >= n) { magma_cgetmatrix( n, n, dA, ldda, work, n ); lapackf77_clauum(uplo_, &n, work, &n, info); magma_csetmatrix( n, n, work, n, dA, ldda ); } else { if (upper) { /* Form the product U * U', block column by block column. */ for (i=0; i < n; i += nb) { ib = min(nb, (n-i)); /* Compute the product U * U'. */ magma_ctrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0, i),ldda); magma_cgetmatrix( ib, ib, dA(i, i), ldda, work, ib ); lapackf77_clauum(MagmaUpperStr, &ib, work, &ib, info); magma_csetmatrix( ib, ib, work, ib, dA(i, i), ldda ); if (i+ib < n) { magma_cgemm( MagmaNoTrans, MagmaConjTrans, i, ib, (n-i-ib), c_one, dA(0,i+ib), ldda, dA(i, i+ib), ldda, c_one, dA(0,i), ldda); magma_cherk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib), d_one, dA(i, i+ib), ldda, d_one, dA(i, i), ldda); } } } else { /* Compute the product L' * L. 
*/ for (i=0; i < n; i += nb) { ib=min(nb,(n-i)); magma_ctrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i, 0),ldda); magma_cgetmatrix( ib, ib, dA(i, i), ldda, work, ib ); lapackf77_clauum(MagmaLowerStr, &ib, work, &ib, info); magma_csetmatrix( ib, ib, work, ib, dA(i, i), ldda ); if (i+ib < n) { magma_cgemm( MagmaConjTrans, MagmaNoTrans, ib, i, (n-i-ib), c_one, dA( i+ib,i), ldda, dA(i+ib, 0),ldda, c_one, dA(i,0), ldda); magma_cherk( MagmaLower, MagmaConjTrans, ib, (n-i-ib), d_one, dA(i+ib, i), ldda, d_one, dA(i, i), ldda); } } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; }
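/*
   Context sketch: lauum is normally the last step of a Cholesky-based
   inversion of a Hermitian positive definite matrix (cf. LAPACK *POTRI).
   Assuming magma_cpotrf_gpu and magma_ctrtri_gpu with their usual
   (uplo, [diag,] n, dA, ldda, info) interfaces, the lower-triangular variant
   reads as follows; after the three calls the lower triangle of dA holds
   inv(A) = inv(L)' * inv(L).
*/
static magma_int_t example_cpotri_sequence( magma_int_t n,
                                            magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t info = 0;

    magma_cpotrf_gpu( MagmaLower, n, dA, ldda, &info );                   /* A = L * L'      */
    if ( info == 0 )
        magma_ctrtri_gpu( MagmaLower, MagmaNonUnit, n, dA, ldda, &info ); /* L := inv(L)     */
    if ( info == 0 )
        magma_clauum_gpu( MagmaLower, n, dA, ldda, &info );               /* inv(L)'*inv(L)  */
    return info;
}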
/***************************************************************************//** Purpose ------- CGETRS solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by CGETRF_NOPIV_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The factors L and U from the factorization A = P*L*U as computed by CGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_getrs_nopiv *******************************************************************************/ extern "C" magma_int_t magma_cgetrs_nopiv_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t *info) { // Constants const magmaFloatComplex c_one = MAGMA_C_ONE; // Local variables bool notran = (trans == MagmaNoTrans); *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if (notran) { /* Solve A * X = B. */ if ( nrhs == 1) { magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1, queue ); magma_ctrsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); } else { magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } else { /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_ctrsv( MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); magma_ctrsv( MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1, queue ); } else { magma_ctrsm( MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_ctrsm( MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } magma_queue_destroy( queue ); return *info; }
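/*
   Pairing sketch: this solver expects the factors produced by the
   no-pivoting LU factorization.  Assuming magma_cgetrf_nopiv_gpu with the
   usual (m, n, dA, ldda, info) interface, a factor-then-solve sequence for
   right-hand sides already resident on the GPU is:
*/
static magma_int_t example_nopiv_solve( magma_int_t n, magma_int_t nrhs,
                                        magmaFloatComplex_ptr dA, magma_int_t ldda,
                                        magmaFloatComplex_ptr dB, magma_int_t lddb )
{
    magma_int_t info = 0;

    magma_cgetrf_nopiv_gpu( n, n, dA, ldda, &info );            /* A = L * U, no pivoting */
    if ( info == 0 )
        magma_cgetrs_nopiv_gpu( MagmaNoTrans, n, nrhs,
                                dA, ldda, dB, lddb, &info );    /* dB := A \ dB           */
    return info;
}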
/** Purpose ------- ZTRTRI computes the inverse of a complex upper or lower triangular matrix A. This is the Level 3 BLAS version of the algorithm. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: A is upper triangular; - = MagmaLower: A is lower triangular. @param[in] diag magma_diag_t - = MagmaNonUnit: A is non-unit triangular; - = MagmaUnit: A is unit triangular. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the triangular matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = MagmaUnit, the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, A(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. @ingroup magma_zgesv_aux ********************************************************************/ extern "C" magma_int_t magma_ztrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) ( A + (i) + (j)*lda ) #define dA(i, j) (dA + (i) + (j)*ldda) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); const char* diag_ = lapack_diag_const( diag ); magma_int_t ldda, nb, nn, j, jb; magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dA; int upper = (uplo == MagmaUpper); int nounit = (diag == MagmaNonUnit); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (!
nounit && diag != MagmaUnit) *info = -2; else if (n < 0) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; /* Check for singularity if non-unit */ if (nounit) { for (j=0; j < n; ++j) { if ( MAGMA_Z_EQUAL( *A(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } /* Determine the block size for this environment */ nb = magma_get_zpotrf_nb(n); ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if (nb <= 1 || nb >= n) lapackf77_ztrtri(uplo_, diag_, &n, A, &lda, info); else { if (upper) { /* Compute inverse of upper triangular matrix */ for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); magma_zsetmatrix( jb, (n-j), A(j, j), lda, dA(j, j), ldda ); /* Compute rows 1:j-1 of current block column */ magma_ztrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j),ldda); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j),ldda); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, stream[1] ); magma_zgetmatrix_async( j, jb, dA(0, j), ldda, A(0, j), lda, stream[0] ); magma_queue_sync( stream[1] ); /* Compute inverse of current diagonal block */ lapackf77_ztrtri(MagmaUpperStr, diag_, &jb, A(j,j), &lda, info); magma_zsetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } else { /* Compute inverse of lower triangular matrix */ nn=((n-1)/nb)*nb+1; for (j=nn-1; j >= 0; j -= nb) { jb=min(nb,(n-j)); if ((j+jb) < n) { magma_zsetmatrix( (n-j), jb, A(j, j), lda, dA(j, j), ldda ); /* Compute rows j+jb:n of current block column */ magma_ztrmm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda ); magma_ztrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda ); magma_zgetmatrix_async( n-j-jb, jb, dA(j+jb, j), ldda, A(j+jb, j), lda, stream[1] ); magma_zgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, stream[0] ); magma_queue_sync( stream[0] ); } /* Compute inverse of current diagonal block */ lapackf77_ztrtri(MagmaLowerStr, diag_, &jb, A(j,j), &lda, info); magma_zsetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); return *info; }
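/*
   Minimal usage sketch for magma_ztrtri: invert a non-unit upper triangular
   matrix held in host memory and report the singularity flag.
*/
static magma_int_t example_ztrtri_call( magma_int_t n, magmaDoubleComplex *A, magma_int_t lda )
{
    magma_int_t info = 0;
    magma_ztrtri( MagmaUpper, MagmaNonUnit, n, A, lda, &info );
    if ( info > 0 )
        printf( "A(%lld,%lld) is exactly zero; the matrix is singular\n",
                (long long) info, (long long) info );
    return info;
}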
/** Purpose ------- DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_gpu(magma_uplo_t uplo, magma_int_t n, double *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i, j) (dA + (j)*ldda + (i)) magma_int_t j, jb, nb; const char* uplo_ = lapack_uplo_const( uplo ); double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double *work; double d_one = 1.0; double d_neg_one = -1.0; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_dpotrf_nb(n); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_dgetmatrix_async( n, n, dA, ldda, work, n, stream[1] ); magma_queue_sync( stream[1] ); lapackf77_dpotrf(uplo_, &n, work, &n, info); magma_dsetmatrix_async( n, n, work, n, dA, ldda, stream[1] ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_dsyrk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { /* Compute the current block row. 
*/ magma_dgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_dsyrk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_dgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { magma_dgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[0] ); lapackf77_dpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_dsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_dtrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dpotrf_gpu */
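/*
   Usage sketch: factor a symmetric positive definite matrix on the GPU with
   magma_dpotrf_gpu and solve A * X = B with the factor.  magma_dpotrs_gpu is
   assumed to have its standard (uplo, n, nrhs, dA, ldda, dB, lddb, info)
   interface; error handling is abbreviated.
*/
static magma_int_t example_dposv_gpu( magma_int_t n, magma_int_t nrhs,
                                      double *dA, magma_int_t ldda,
                                      double *dB, magma_int_t lddb )
{
    magma_int_t info = 0;

    magma_dpotrf_gpu( MagmaLower, n, dA, ldda, &info );                 /* A = L * L' */
    if ( info > 0 ) {
        printf( "leading minor of order %lld is not positive definite\n",
                (long long) info );
        return info;
    }
    magma_dpotrs_gpu( MagmaLower, n, nrhs, dA, ldda, dB, lddb, &info ); /* X = A \ B  */
    return info;
}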
/* //////////////////////////////////////////////////////////////////////////// -- testing any solver */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_copts zopts; magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); real_Double_t res; magma_c_matrix A={Magma_CSR}, A2={Magma_CSR}, A3={Magma_CSR}, A4={Magma_CSR}, A5={Magma_CSR}; int i=1; TESTING_CHECK( magma_cparse_opts( argc, argv, &zopts, &i, queue )); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_cm_5stencil( laplace_size, &A, queue )); } else { // file-matrix test TESTING_CHECK( magma_c_csr_mtx( &A, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) A.num_rows, (long long) A.num_cols, (long long) A.nnz ); // filename for temporary matrix storage const char *filename = "testmatrix.mtx"; // write to file TESTING_CHECK( magma_cwrite_csrtomtx( A, filename, queue )); // read from file TESTING_CHECK( magma_c_csr_mtx( &A2, filename, queue )); // delete temporary matrix unlink( filename ); //visualize printf("A2:\n"); TESTING_CHECK( magma_cprint_matrix( A2, queue )); //visualize TESTING_CHECK( magma_cmconvert(A2, &A4, Magma_CSR, Magma_CSRL, queue )); printf("A4:\n"); TESTING_CHECK( magma_cprint_matrix( A4, queue )); TESTING_CHECK( magma_cmconvert(A4, &A5, Magma_CSR, Magma_ELL, queue )); printf("A5:\n"); TESTING_CHECK( magma_cprint_matrix( A5, queue )); // pass it to another application and back magma_int_t m, n; magma_index_t *row, *col; magmaFloatComplex *val=NULL; TESTING_CHECK( magma_ccsrget( A2, &m, &n, &row, &col, &val, queue )); TESTING_CHECK( magma_ccsrset( m, n, row, col, val, &A3, queue )); TESTING_CHECK( magma_cmdiff( A, A2, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester IO: ok\n"); else printf("%% tester IO: failed\n"); TESTING_CHECK( magma_cmdiff( A, A3, &res, queue )); printf("%% ||A-B||_F = %8.2e\n", res); if ( res < .000001 ) printf("%% tester matrix interface: ok\n"); else printf("%% tester matrix interface: failed\n"); magma_cmfree(&A, queue ); magma_cmfree(&A2, queue ); magma_cmfree(&A4, queue ); magma_cmfree(&A5, queue ); i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
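/*
   Sketch of the CSR hand-off interface exercised by the tester above:
   magma_ccsrset wraps user-owned CSR arrays in a magma_c_matrix (the
   pointers are referenced rather than copied, so the arrays must remain
   valid while the matrix is in use), and magma_ccsrget returns them.
   The 2-by-2 example matrix is [[1, 2], [0, 3]].
*/
static magma_int_t example_ccsrset( magma_queue_t queue, magma_c_matrix *A )
{
    static magma_index_t row[3] = { 0, 2, 3 };   /* row pointers    */
    static magma_index_t col[3] = { 0, 1, 1 };   /* column indices  */
    static magmaFloatComplex val[3];             /* nonzero values  */
    val[0] = MAGMA_C_MAKE( 1.0f, 0.0f );
    val[1] = MAGMA_C_MAKE( 2.0f, 0.0f );
    val[2] = MAGMA_C_MAKE( 3.0f, 0.0f );

    return magma_ccsrset( 2, 2, row, col, val, A, queue );
}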
extern "C" magma_int_t magma_cgetrf_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magmaFloatComplex **d_lA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb) magma_int_t nb, n_local[MagmaMaxGPUs]; magma_int_t maxm, mindim; magma_int_t i, j, d, lddat, lddwork; magmaFloatComplex *d_lAT[MagmaMaxGPUs]; magmaFloatComplex *d_panel[MagmaMaxGPUs], *work; magma_queue_t streaml[MagmaMaxGPUs][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_cgetrf_nb(m); if (nb <= 1 || nb >= n) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, d_lA[0], ldda, work, m ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, d_lA[0], ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. 
*/ maxm = ((m + 31)/32)*32; if( num_gpus > ceil((float)n/nb) ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus ); *info = -1; return *info; } /* allocate workspace for each GPU */ lddat = ((((((n+nb-1)/nb)/num_gpus)*nb)+31)/32)*32; lddat = (n+nb-1)/nb; /* number of block columns */ lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */ lddat = nb*lddat; /* number of columns per GPU */ lddat = ((lddat+31)/32)*32; /* make it a multiple of 32 */ for(i=0; i<num_gpus; i++){ magma_setdevice(i); /* local-n and local-ld */ n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ if (MAGMA_SUCCESS != magma_cmalloc( &d_panel[i], (3+num_gpus)*nb*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* local-matrix storage */ if (MAGMA_SUCCESS != magma_cmalloc( &d_lAT[i], lddat*maxm )) { for( j=0; j<=i; j++ ) { magma_setdevice(j); magma_free( d_panel[j] ); } for( j=0; j<i; j++ ) { magma_setdevice(j); magma_free( d_lAT[j] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* create the streams */ magma_queue_create( &streaml[i][0] ); magma_queue_create( &streaml[i][1] ); magmablasSetKernelStream(streaml[i][1]); magmablas_ctranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] ); } for(i=0; i<num_gpus; i++){ magma_setdevice(i); cudaStreamSynchronize(streaml[i][0]); magmablasSetKernelStream(NULL); } magma_setdevice(0); /* cpu workspace */ lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, lddwork*nb*num_gpus )) { for(i=0; i<num_gpus; i++ ) { magma_setdevice(i); magma_free( d_panel[i] ); magma_free( d_lAT[i] ); } *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* calling multi-gpu interface with allocated workspaces and streams */ //magma_cgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, // (magma_queue_t **)streaml, info ); magma_cgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm, streaml, info); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* save on output */ magmablas_ctranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m ); magma_device_sync(); magma_free( d_lAT[d] ); magma_free( d_panel[d] ); magma_queue_destroy( streaml[d][0] ); magma_queue_destroy( streaml[d][1] ); magmablasSetKernelStream(NULL); } /* end of for d=1,..,num_gpus */ magma_setdevice(0); magma_free_pinned( work ); } return *info; }
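/*
   Input-layout sketch: magma_cgetrf_mgpu expects the columns of A to be
   distributed over the GPUs in a 1-D block-cyclic fashion with block size
   nb = magma_get_cgetrf_nb(m), i.e. GPU d owns column blocks d, d+num_gpus,
   d+2*num_gpus, ...  The helper below only allocates the local pieces
   d_lA[d], using the same local-width formula as the routine itself;
   copying the host matrix into this layout (for instance with the 1-D
   block-cyclic set-matrix helpers, where available) precedes the
   factorization call.
*/
static magma_int_t example_alloc_1d_bcyclic( magma_int_t num_gpus, magma_int_t m, magma_int_t n,
                                             magma_int_t ldda, magmaFloatComplex *d_lA[] )
{
    magma_int_t nb = magma_get_cgetrf_nb( m );

    for( magma_int_t d = 0; d < num_gpus; ++d ) {
        /* number of columns owned by GPU d */
        magma_int_t n_local = ((n/nb)/num_gpus)*nb;
        if ( d < (n/nb)%num_gpus )
            n_local += nb;
        else if ( d == (n/nb)%num_gpus )
            n_local += n%nb;

        magma_setdevice( d );
        if ( MAGMA_SUCCESS != magma_cmalloc( &d_lA[d], ldda*n_local ) )
            return MAGMA_ERR_DEVICE_ALLOC;
    }
    return MAGMA_SUCCESS;
}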
extern "C" magma_int_t magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, double *a, magma_int_t lda, double *da, magma_int_t ldda, double *d, double *e, double *tauq, double *taup, double *x, magma_int_t ldx, double *dx, magma_int_t lddx, double *y, magma_int_t ldy, double *dy, magma_int_t lddy) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DLABRD reduces the first NB rows and columns of a real general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by SGEBRD Arguments ========= M (input) INTEGER The number of rows in the matrix A. N (input) INTEGER The number of columns in the matrix A. NB (input) INTEGER The number of leading rows and columns of A to be reduced. A (input/output) DOUBLE_PRECISION array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). D (output) DOUBLE_PRECISION array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). E (output) DOUBLE_PRECISION array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. TAUQ (output) DOUBLE_PRECISION array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. TAUP (output) DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. X (output) DOUBLE_PRECISION array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. LDX (input) INTEGER The leading dimension of the array X. LDX >= M. Y (output) DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. LDY (input) INTEGER The leading dimension of the array Y. LDY >= N. Further Details =============== The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). 
If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). ===================================================================== */ /* Table of constant values */ double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t c__1 = 1; /* System generated locals */ magma_int_t a_dim1, a_offset, x_dim1, x_offset, y_dim1, y_offset, i__2, i__3; /* Local variables */ magma_int_t i__; double alpha; a_dim1 = lda; a_offset = 1 + a_dim1; a -= a_offset; --d; --e; --tauq; --taup; x_dim1 = ldx; x_offset = 1 + x_dim1; x -= x_offset; dx-= 1 + lddx; y_dim1 = ldy; y_offset = 1 + y_dim1; y -= y_offset; dy-= 1 + lddy; /* Function Body */ if (m <= 0 || n <= 0) { return 0; } double *f; magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, max(n,m) ); assert( f != NULL ); // TODO return error, or allocate outside dlatrd if (m >= n) { /* Reduce to upper bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i:m,i) */ i__2 = m - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + a_dim1], &lda, &y[i__+y_dim1], &ldy, &c_one, &a[i__ + i__ * a_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, &y[i__+y_dim1], &ldy ); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + x_dim1], &ldx, &a[i__*a_dim1+1], &c__1, &c_one, &a[i__+i__*a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = a[i__ + i__ * a_dim1]; i__2 = m - i__ + 1; i__3 = i__ + 1; lapackf77_dlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); d[i__] = MAGMA_D_REAL( alpha ); if (i__ < n) { a[i__ + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__ + 1; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, a + i__ + i__ * a_dim1, 1, da+(i__-1)+(i__-1)* (ldda), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv(MagmaTrans, i__2, i__3, c_one, da + (i__-1) + ((i__-1) + 1) * (ldda), ldda, da + (i__-1) + (i__-1) * (ldda), c__1, c_zero, dy + i__ + 1 + i__ * y_dim1, c__1); // 3. 
Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dy+i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, stream ); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + a_dim1], &lda, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_dgemv("N", &i__2, &i__3, &c_neg_one, &y[i__ + 1 +y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &x[i__ + x_dim1], &ldx, &a[i__ + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3!=0){ i__2 = n - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = i__ - 1; i__3 = n - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); /* Update A(i,i+1:n) */ i__2 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); lapackf77_dlacgv( &i__, &a[i__+a_dim1], &lda ); #endif blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + ( i__ + 1) * a_dim1], &lda); i__2 = i__ - 1; i__3 = n - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__, &a[i__+a_dim1], &lda ); lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[ i__ + (i__ + 1) * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, &x[i__+x_dim1], &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i__; /* Computing MIN */ i__3 = i__ + 2; alpha = a[i__ + (i__ + 1) * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min( i__3,n) * a_dim1], &lda, &taup[i__]); e[i__] = MAGMA_D_REAL( alpha ); a[i__ + (i__ + 1) * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__3, a + i__ + (i__ +1)* a_dim1, lda, da+(i__-1)+((i__-1)+1)*(ldda), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy(i__3, da+(i__-1)+((i__-1)+1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_dgemv(MagmaNoTrans, i__2, i__3, c_one, da + (i__-1)+1+ ((i__-1)+1) * (ldda), ldda, da + (i__-1) + ((i__-1)+1) * (ldda), ldda, //dy + 1 + lddy, 1, c_zero, dx + i__ + 1 + i__ * x_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dx+i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, stream ); i__2 = n - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &y[i__ + 1 + y_dim1], &ldy, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[ i__ * x_dim1 + 1], &c__1); i__2 = m - i__; blasf77_dgemv("N", &i__2, &i__, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__; blasf77_dgemv("N", &i__2, &i__3, &c_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &a[i__ + (i__ + 1) * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. 
Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__!=0){ i__2 = m - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[ i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i__; lapackf77_dlacgv( &i__2, &a[i__+(i__+1)*a_dim1], &lda ); // 4. Send the block reflector A(i+1:m,i) to the GPU after DLACGV() magma_dsetvector( i__2, a + i__ + (i__ +1)* a_dim1, lda, da+(i__-1)+((i__-1)+1)*(ldda), ldda ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i__ = 1; i__ <= nb; ++i__) { /* Update A(i,i:n) */ i__2 = n - i__ + 1; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + y_dim1], &ldy, &a[i__ + a_dim1], &lda, &c_one, &a[i__ + i__ * a_dim1], &lda); i__2 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &a[i__ + a_dim1], &lda); lapackf77_dlacgv(&i__3, &x[i__ + x_dim1], &ldx); #endif i__3 = n - i__ + 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_neg_one, &a[i__ * a_dim1 + 1], &lda, &x[i__ + x_dim1], &ldx, &c_one, &a[i__ + i__ * a_dim1], &lda); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &x[i__ + x_dim1], &ldx); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i__ + 1; /* Computing MIN */ i__3 = i__ + 1; alpha = a[i__ + i__ * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[i__ + min(i__3,n) * a_dim1], &lda, &taup[i__]); d[i__] = MAGMA_D_REAL( alpha ); if (i__ < m) { a[i__ + i__ * a_dim1] = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i__; i__3 = n - i__ + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_dsetvector( i__3, a + i__ + i__ * a_dim1, lda, da+(i__-1)+(i__-1)* (ldda), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy(i__3, da+(i__-1)+(i__-1)*(ldda), ldda, // dy + 1 + lddy, 1); magma_dgemv(MagmaNoTrans, i__2, i__3, c_one, da + (i__-1)+1 + (i__-1) * ldda, ldda, da + (i__-1) + (i__-1) * ldda, ldda, // dy + 1 + lddy, 1, c_zero, dx + i__ + 1 + i__ * x_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dx+i__+1+i__*x_dim1, x_dim1, x+i__+1+i__*x_dim1, x_dim1, stream ); i__2 = n - i__ + 1; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &y[i__ + y_dim1], &ldy, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &x[i__ * x_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = i__ - 1; i__3 = n - i__ + 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_one, &a[i__ * a_dim1 + 1], &lda, &a[i__ + i__ * a_dim1], &lda, &c_zero, &x[i__ * x_dim1 + 1], &c__1); // 4. 
Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__2!=0){ i__3 = m - i__; blasf77_daxpy(&i__3, &c_one, f,&c__1, &x[i__+1+i__*x_dim1],&c__1); } i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &x[i__ * x_dim1 + 1], &c__1, &c_one, &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = m - i__; blasf77_dscal(&i__2, &taup[i__], &x[i__ + 1 + i__ * x_dim1], &c__1); i__2 = n - i__ + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_dsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da+(i__-1)+ (i__-1)*(ldda), ldda ); #endif /* Update A(i+1:m,i) */ i__2 = m - i__; i__3 = i__ - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &a[i__ + 1 + a_dim1], &lda, &y[i__ + y_dim1], &ldy, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); i__2 = m - i__; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv(&i__3, &y[i__ + y_dim1], &ldy); #endif blasf77_dgemv("No transpose", &i__2, &i__, &c_neg_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ * a_dim1 + 1], &c__1, &c_one, &a[i__ + 1 + i__ * a_dim1], &c__1); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i__; i__3 = i__ + 2; alpha = a[i__ + 1 + i__ * a_dim1]; lapackf77_dlarfg(&i__2, &alpha, &a[min(i__3,m) + i__ * a_dim1], &c__1, &tauq[i__]); e[i__] = MAGMA_D_REAL( alpha ); a[i__ + 1 + i__ * a_dim1] = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i__; i__3 = n - i__; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, a + i__ +1+ i__ * a_dim1, 1, da+(i__-1)+1+ (i__-1)*(ldda), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv(MagmaTrans, i__2, i__3, c_one, da + (i__-1)+1+ ((i__-1)+1) * ldda, ldda, da + (i__-1)+1+ (i__-1) * ldda, c__1, c_zero, dy + i__ + 1 + i__ * y_dim1, c__1); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dy+i__+1+i__*y_dim1, y_dim1, y+i__+1+i__*y_dim1, y_dim1, stream ); i__2 = m - i__; i__3 = i__ - 1; blasf77_dgemv(MagmaTransStr, &i__2, &i__3, &c_one, &a[i__ + 1 + a_dim1], &lda, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[ i__ * y_dim1 + 1], &c__1); i__2 = n - i__; i__3 = i__ - 1; blasf77_dgemv("No transpose", &i__2, &i__3, &c_neg_one, &y[i__ + 1 + y_dim1], &ldy, &y[i__ * y_dim1 + 1], &c__1, &c_zero, f, &c__1); i__2 = m - i__; blasf77_dgemv(MagmaTransStr, &i__2, &i__, &c_one, &x[i__ + 1 + x_dim1], &ldx, &a[i__ + 1 + i__ * a_dim1], &c__1, &c_zero, &y[i__ * y_dim1 + 1], &c__1); // 4. Synch to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3!=0){ i__2 = n - i__; blasf77_daxpy(&i__2, &c_one, f,&c__1, &y[i__+1+i__*y_dim1],&c__1); } i__2 = n - i__; blasf77_dgemv(MagmaTransStr, &i__, &i__2, &c_neg_one, &a[(i__ + 1) * a_dim1 + 1], &lda, &y[i__ * y_dim1 + 1], &c__1, &c_one, &y[i__ + 1 + i__ * y_dim1], &c__1); i__2 = n - i__; blasf77_dscal(&i__2, &tauq[i__], &y[i__ + 1 + i__ * y_dim1], &c__1); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i__ + 1; lapackf77_dlacgv(&i__2, &a[i__ + i__ * a_dim1], &lda); magma_dsetvector( i__2, a + i__ + (i__ )* a_dim1, lda, da+(i__-1)+ (i__-1)*(ldda), ldda ); } #endif } } magma_queue_destroy( stream ); magma_free_cpu(f); return MAGMA_SUCCESS; } /* dlabrd */
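/*
   Illustration (not part of the original source): the panel loop above hides
   the large trailing-matrix GEMV on the GPU behind the small CPU-side GEMVs.
   The sketch below shows only that send / launch / async-copy-back / sync
   ordering, using the same MAGMA calls that dlabrd uses; the names (m, n, hv,
   dA, dv, dy, y_host) and the single dense GEMV are simplified placeholders,
   not the routine's actual indexing.
*/
static void
dlabrd_overlap_sketch( magma_int_t m, magma_int_t n,
                       double *hv,                    // current reflector on the host
                       double *dA, magma_int_t ldda,  // trailing matrix on the GPU
                       double *dv, double *dy,        // device vectors of length m and n
                       double *y_host,                // host buffer receiving dy
                       magma_queue_t stream )
{
    double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO;

    // 1. send the reflector to the GPU
    magma_dsetvector( m, hv, 1, dv, 1 );
    // 2. large matrix-vector product on the device
    magma_dgemv( MagmaTrans, m, n, c_one, dA, ldda, dv, 1, c_zero, dy, 1 );
    // 3. start copying the result back; the CPU is free to do other work
    magma_dgetmatrix_async( n, 1, dy, n, y_host, n, stream );
    //    ... small blasf77_dgemv updates run here on the CPU ...
    // 4. wait for the transfer before the two partial results are combined
    magma_queue_sync( stream );
}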
extern "C" magma_int_t magma_cbicgstab_merge( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x, magma_c_solver_par *solver_par, magma_queue_t queue ) { // set queue for old dense routines magma_queue_t orig_queue; magmablasGetKernelStream( &orig_queue ); // prepare solver feedback solver_par->solver = Magma_BICGSTABMERGE; solver_par->numiter = 0; solver_par->info = MAGMA_SUCCESS; // some useful variables magmaFloatComplex c_zero = MAGMA_C_ZERO, c_one = MAGMA_C_ONE; magma_int_t dofs = A.num_rows; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); // workspace magma_c_vector q, r,rr,p,v,s,t; magmaFloatComplex *d1, *d2, *skp; d1 = NULL; d2 = NULL; skp = NULL; magma_int_t stat_dev = 0, stat_cpu = 0; stat_dev += magma_cmalloc( &d1, dofs*(2) ); stat_dev += magma_cmalloc( &d2, dofs*(2) ); // array for the parameters stat_dev += magma_cmalloc( &skp, 8 ); if( stat_dev != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); printf("error: memory allocation.\n"); return MAGMA_ERR_DEVICE_ALLOC; } // skp = [alpha|beta|omega|rho_old|rho|nom|tmp1|tmp2] magma_c_vinit( &q, Magma_DEV, dofs*6, c_zero, queue ); // q = rr|r|p|v|s|t rr.memory_location = Magma_DEV; rr.dval = NULL; rr.num_rows = rr.nnz = dofs; rr.num_cols = 1; r.memory_location = Magma_DEV; r.dval = NULL; r.num_rows = r.nnz = dofs; r.num_cols = 1; p.memory_location = Magma_DEV; p.dval = NULL; p.num_rows = p.nnz = dofs; p.num_cols = 1; v.memory_location = Magma_DEV; v.dval = NULL; v.num_rows = v.nnz = dofs; v.num_cols = 1; s.memory_location = Magma_DEV; s.dval = NULL; s.num_rows = s.nnz = dofs; s.num_cols = 1; t.memory_location = Magma_DEV; t.dval = NULL; t.num_rows = t.nnz = dofs; t.num_cols = 1; rr.dval = q(0); r.dval = q(1); p.dval = q(2); v.dval = q(3); s.dval = q(4); t.dval = q(5); // solver variables magmaFloatComplex alpha, beta, omega, rho_old, rho_new, *skp_h; float nom, nom0, betanom, r0, den; // solver setup magma_cscal( dofs, c_zero, x->dval, 1) ; // x = 0 magma_ccopy( dofs, b.dval, 1, q(0), 1 ); // rr = b magma_ccopy( dofs, b.dval, 1, q(1), 1 ); // r = b rho_new = magma_cdotc( dofs, r.dval, 1, r.dval, 1 ); // rho=<rr,r> nom = MAGMA_C_REAL(magma_cdotc( dofs, r.dval, 1, r.dval, 1 )); nom0 = betanom = sqrt(nom); // nom = || r || rho_old = omega = alpha = MAGMA_C_MAKE( 1.0, 0. ); beta = rho_new; solver_par->init_res = nom0; // array on host for the parameters stat_cpu = magma_cmalloc_cpu( &skp_h, 8 ); if( stat_cpu != 0 ){ magma_free( d1 ); magma_free( d2 ); magma_free( skp ); magma_free_cpu( skp_h ); printf("error: memory allocation.\n"); return MAGMA_ERR_HOST_ALLOC; } skp_h[0]=alpha; skp_h[1]=beta; skp_h[2]=omega; skp_h[3]=rho_old; skp_h[4]=rho_new; skp_h[5]=MAGMA_C_MAKE(nom, 0.0); magma_csetvector( 8, skp_h, 1, skp, 1 ); magma_c_spmv( c_one, A, r, c_zero, v, queue ); // z = A r den = MAGMA_C_REAL( magma_cdotc(dofs, v.dval, 1, r.dval, 1) );// den = z dot r if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) { magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } // check positive definite if (den <= 0.0) { printf("Operator A is not postive definite. 
(Ar,r) = %f\n", den); magmablasSetKernelStream( orig_queue ); return MAGMA_NONSPD; solver_par->info = MAGMA_NONSPD;; } //Chronometry real_Double_t tempo1, tempo2; tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ) { magmablasSetKernelStream(stream[0]); // computes p=r+beta*(p-omega*v) magma_cbicgmerge1( dofs, skp, v.dval, r.dval, p.dval, queue ); magma_c_spmv( c_one, A, p, c_zero, v, queue ); // v = Ap magma_cmdotc( dofs, 1, q.dval, v.dval, d1, d2, skp, queue ); magma_cbicgmerge4( 1, skp, queue ); magma_cbicgmerge2( dofs, skp, r.dval, v.dval, s.dval, queue ); // s=r-alpha*v magma_c_spmv( c_one, A, s, c_zero, t, queue ); // t=As magma_cmdotc( dofs, 2, q.dval+4*dofs, t.dval, d1, d2, skp+6, queue ); magma_cbicgmerge4( 2, skp, queue ); magma_cbicgmerge_xrbeta( dofs, d1, d2, q.dval, r.dval, p.dval, s.dval, t.dval, x->dval, skp, queue ); // check stopping criterion (asynchronous copy) magma_cgetvector_async( 1 , skp+5, 1, skp_h+5, 1, stream[1] ); betanom = sqrt(MAGMA_C_REAL(skp_h[5])); if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } tempo2 = magma_sync_wtime( queue ); solver_par->runtime = (real_Double_t) tempo2-tempo1; float residual; magma_cresidual( A, b, *x, &residual, queue ); solver_par->iter_res = betanom; solver_par->final_res = residual; if ( solver_par->numiter < solver_par->maxiter) { solver_par->info = MAGMA_SUCCESS; } else if ( solver_par->init_res > solver_par->final_res ) { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_SLOW_CONVERGENCE; } else { if ( solver_par->verbose > 0 ) { if ( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = MAGMA_DIVERGENCE; } magma_c_vfree(&q, queue ); // frees all vectors magma_free(d1); magma_free(d2); magma_free( skp ); magma_free_cpu( skp_h ); magmablasSetKernelStream( orig_queue ); return MAGMA_SUCCESS; } /* cbicgstab_merge */
/** Purpose ------- SORGQR generates an M-by-N REAL matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by SGEQRF_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix Q. M >= 0. @param[in] n INTEGER The number of columns of the matrix Q. M >= N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. @param[in,out] dA REAL array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. @param[in] ldda INTEGER The first dimension of the array A. LDDA >= max(1,M). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF_GPU. @param[in] dT (workspace) REAL work space array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB. This must be the 6th argument of magma_sgeqrf_gpu [ note that if N here is bigger than N in magma_sgeqrf_gpu, the workspace requirement DT in magma_sgeqrf_gpu must be as specified in this routine ]. @param[in] nb INTEGER This is the block size used in SGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument has an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sorgqr_gpu(magma_int_t m, magma_int_t n, magma_int_t k, float *dA, magma_int_t ldda, float *tau, float *dT, magma_int_t nb, magma_int_t *info) { #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; float *dV, *dW; float *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for sorgqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_smalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_smalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // ((n+31)/32*32 )*nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_sgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk ); lapackf77_sorgqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_ssetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_slaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda ); } if (kk > 0) { // Use blocked code // stream: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_scopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, stream ); // set panel to identity magmablas_slaset( MagmaFull, i, ib, c_zero, c_zero, dA(0, i), ldda ); magmablas_slaset( MagmaFull, mi, ib, c_zero, c_one, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_slarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } } magma_queue_sync( stream ); magmablasSetKernelStream( NULL ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( stream ); return *info; } /* magma_sorgqr_gpu */
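/*
   Illustration (not part of the original source): typical use of
   magma_sorgqr_gpu is to factor with magma_sgeqrf_gpu, which fills tau and
   the dT workspace documented above, and then expand the first n columns of
   Q in place. The sizes follow the dT description above; the nb query is
   assumed to use the one-argument form used elsewhere in this file. The
   host-to-device copy of A, error checking, and deallocation are omitted.
*/
static void
sorgqr_gpu_usage_sketch( void )
{
    magma_int_t m = 1024, n = 512, info;            // example dimensions
    magma_int_t minmn = min( m, n );
    magma_int_t ldda  = magma_roundup( m, 32 );
    magma_int_t nb    = magma_get_sgeqrf_nb( m );   // block size shared by geqrf and orgqr
    float *dA, *dT, *tau;

    magma_smalloc( &dA, ldda*n );
    magma_smalloc( &dT, (2*minmn + magma_roundup( n, 32 ))*nb );  // (2*MIN(M,N) + (N+31)/32*32)*NB
    magma_smalloc_cpu( &tau, minmn );
    // ... copy the host matrix A into dA (magma_ssetmatrix) ...

    magma_sgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );            // dA = Q*R, dT filled
    magma_sorgqr_gpu( m, n, minmn, dA, ldda, tau, dT, nb, &info ); // dA := explicit Q
}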
/***************************************************************************//** Purpose ------- DLAUUM computes the product U * U^H or L^H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. If UPLO = MagmaUpper then the upper triangle of the result is stored, overwriting the factor U in A. If UPLO = MagmaLower then the lower triangle of the result is stored, overwriting the factor L in A. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the triangular factor stored in the array A is upper or lower triangular: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the triangular factor U or L. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the triangular factor U or L. On exit, if UPLO = MagmaUpper, the upper triangle of A is overwritten with the upper triangle of the product U * U^H; if UPLO = MagmaLower, the lower triangle of A is overwritten with the lower triangle of the product L^H * L. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value @ingroup magma_lauum *******************************************************************************/ extern "C" magma_int_t magma_dlauum( magma_uplo_t uplo, magma_int_t n, double *A, magma_int_t lda, magma_int_t *info) { #define A(i_, j_) ( A + (i_) + (j_)*lda ) #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const double c_one = MAGMA_D_ONE; const double d_one = MAGMA_D_ONE; const char* uplo_ = lapack_uplo_const( uplo ); /* Local variables */ magma_int_t i, ib, ldda, nb; magmaDouble_ptr dA; bool upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if (n == 0) return *info; nb = magma_get_dpotrf_nb( n ); ldda = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= n) { lapackf77_dlauum( uplo_, &n, A, &lda, info ); } else if (upper) { /* Compute the product U * U^H. */ // Computing 2nd block column (diagonal & above): // [ u11 u12 u13 ] [ u11^H ] [ ... u12*u22^H + u13*u23^H ... ] // [ u22 u23 ] * [ u12^H u22^H ] = [ ... u22*u22^H + u23*u23^H ... ] // [ u33 ] [ u13^H u23^H u33^H ] [ ... ... ... 
] for (i=0; i < n; i += nb) { ib = min( nb, n-i ); // Send diagonl block, u22 // This must finish before lauum below magma_dsetmatrix( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); // Send right of diagonl block, u23 magma_dsetmatrix_async( ib, n-i-ib, A(i,i+ib), lda, dA(i,i+ib), ldda, queues[1] ); // u12 = u12 * u22^H magma_dtrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0,i), ldda, queues[0] ); // u22 = u22 * u22^H lapackf77_dlauum( MagmaUpperStr, &ib, A(i,i), &lda, info ); magma_dsetmatrix_async( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); if (i+ib < n) { // wait for u23 magma_queue_sync( queues[1] ); // u12 += u13 * u23^H magma_dgemm( MagmaNoTrans, MagmaConjTrans, i, ib, n-i-ib, c_one, dA(0,i+ib), ldda, dA(i,i+ib), ldda, c_one, dA(0,i), ldda, queues[0] ); // u22 += u23 * u23^H magma_dsyrk( MagmaUpper, MagmaNoTrans, ib, n-i-ib, d_one, dA(i,i+ib), ldda, d_one, dA(i,i), ldda, queues[0] ); } // Get diagonal block & above of current column from device // This could be on a different queue -- not needed until return magma_dgetmatrix_async( i+ib, ib, dA(0,i), ldda, A(0,i), lda, queues[0] ); } } else { /* Compute the product L^H * L. */ for (i=0; i < n; i += nb) { ib = min( nb, n-i ); magma_dsetmatrix( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); magma_dsetmatrix_async( n-i-ib, ib, A(i+ib,i), lda, dA(i+ib,i), ldda, queues[1] ); magma_dtrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i,0), ldda, queues[0] ); lapackf77_dlauum( MagmaLowerStr, &ib, A(i,i), &lda, info ); magma_dsetmatrix_async( ib, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); if (i+ib < n) { magma_queue_sync( queues[1] ); magma_dgemm( MagmaConjTrans, MagmaNoTrans, ib, i, n-i-ib, c_one, dA(i+ib,i), ldda, dA(i+ib,0), ldda, c_one, dA(i,0), ldda, queues[0] ); magma_dsyrk( MagmaLower, MagmaConjTrans, ib, n-i-ib, d_one, dA(i+ib,i), ldda, d_one, dA(i,i), ldda, queues[0] ); } magma_dgetmatrix_async( ib, i+ib, dA(i,0), ldda, A(i,0), lda, queues[0] ); } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; }
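/*
   Illustration (not part of the original source): dlauum is the second half
   of a symmetric positive definite inverse. After a Cholesky factorization
   and a triangular inversion the array holds U^{-1}, and lauum overwrites it
   with U^{-1} * U^{-1}^T, i.e. the upper triangle of A^{-1}. This is a hedged
   sketch of that potrf -> trtri -> lauum chain using the standard LAPACK
   wrappers; only the magma_dlauum call is taken from the routine above.
*/
static magma_int_t
spd_inverse_upper_sketch( magma_int_t n, double *A, magma_int_t lda )
{
    magma_int_t info = 0;
    lapackf77_dpotrf( MagmaUpperStr, &n, A, &lda, &info );                        // A = U^T * U
    if (info == 0)
        lapackf77_dtrtri( MagmaUpperStr, MagmaNonUnitStr, &n, A, &lda, &info );   // A := U^{-1}
    if (info == 0)
        magma_dlauum( MagmaUpper, n, A, lda, &info );                             // A := U^{-1} * U^{-1}^T
    return info;
}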
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { TESTING_INIT(); magma_queue_t queue; magma_queue_create( /*devices[ opts->device ],*/ &queue ); magma_c_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; magma_int_t *pntre; magmaFloatComplex c_one = MAGMA_C_MAKE(1.0, 0.0); magmaFloatComplex c_zero = MAGMA_C_MAKE(0.0, 0.0); magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf( "\n# usage: ./run_cspmv" " [ --blocksize %d --alignment %d (for SELLP) ]" " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); magma_cm_5stencil( laplace_size, &hA, queue ); } else { // file-matrix test magma_c_csr_mtx( &hA, argv[i], queue ); } printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n", (int) hA.num_rows,(int) hA.num_cols,(int) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; magma_c_vector hx, hy, dx, dy, hrefvec, hcheck; // init CPU vectors magma_c_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue ); magma_c_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue ); // init DEV vectors magma_c_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); #ifdef MAGMA_WITH_MKL // calling MKL with CSR pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) ); pntre[0] = 0; for (j=0; j<hA.num_rows; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT *col; TESTING_MALLOC_CPU( col, MKL_INT, nnz ); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_MALLOC_CPU( row, MKL_INT, num_rows ); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } start = magma_wtime(); for (j=0; j<10; j++ ) { mkl_ccsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); TESTING_FREE_CPU( row ); TESTING_FREE_CPU( col ); free(pntre); #endif // MAGMA_WITH_MKL // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); // SpMV on GPU (CSR) -- this is the reference! 
start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA, queue ); magma_c_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue ); // convert to ELL and copy to GPU magma_c_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue ); magma_c_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_ELL, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (ELL) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_ELL, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard ELL).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_mfree(&dA_ELL, queue ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); if ( res < .000001 ) printf("# tester spmv ELL: ok\n"); else printf("# tester spmv ELL: failed\n"); magma_c_vfree( &hcheck, queue ); // convert to SELLP and copy to GPU magma_c_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ); magma_c_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ); magma_c_mfree(&hA_SELLP, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j<10; j++) magma_c_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ); end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv SELL-P: ok\n"); else printf("# tester spmv SELL-P: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_mfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // cusparseHandle_t cusparseHandle = 0; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); cusparseMatDescr_t descr = 0; cusparseStatus = cusparseCreateMatDescr(&descr); cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); magmaFloatComplex alpha = c_one; magmaFloatComplex beta = c_zero; magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); // copy matrix to GPU magma_c_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseCcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE CSR\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); cusparseMatDescr_t descrA; cusparseStatus = cusparseCreateMatDescr(&descrA); if (cusparseStatus != 0) printf("error\n"); cusparseHybMat_t hybA; cusparseStatus = cusparseCreateHybMat( &hybA ); if (cusparseStatus != 0) printf("error\n"); magma_c_vtransfer( dy, &hcheck , 
Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE CSR: ok\n"); else printf("# tester spmv cuSPARSE CSR: failed\n"); magma_c_vfree( &hcheck, queue ); magma_c_vfree( &dy, queue ); magma_c_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue ); cusparseCcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols, descrA, dA.dval, dA.drow, dA.dcol, hybA, 0, CUSPARSE_HYB_PARTITION_AUTO); start = magma_sync_wtime( queue ); for (j=0; j<10; j++) cusparseStatus = cusparseChybmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, descrA, hybA, dx.dval, &beta, dy.dval); end = magma_sync_wtime( queue ); if (cusparseStatus != 0) printf("error in cuSPARSE HYB\n"); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (HYB).\n", (end-start)/10, FLOPS*10/(end-start) ); magma_c_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ); res = 0.0; for(magma_int_t k=0; k<hA.num_rows; k++ ) res=res + MAGMA_C_REAL(hcheck.val[k]) - MAGMA_C_REAL(hrefvec.val[k]); printf("# |x-y|_F = %8.2e\n", res); if ( res < .000001 ) printf("# tester spmv cuSPARSE HYB: ok\n"); else printf("# tester spmv cuSPARSE HYB: failed\n"); magma_c_vfree( &hcheck, queue ); cusparseDestroyMatDescr( descrA ); cusparseDestroyHybMat( hybA ); cusparseDestroy( cusparseHandle ); magma_c_mfree(&dA, queue ); printf("\n\n"); // free CPU memory magma_c_mfree(&hA, queue ); magma_c_vfree(&hx, queue ); magma_c_vfree(&hy, queue ); magma_c_vfree(&hrefvec, queue ); // free GPU memory magma_c_vfree(&dx, queue ); magma_c_vfree(&dy, queue ); i++; } magma_queue_destroy( queue ); TESTING_FINALIZE(); return 0; }
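/*
   Illustration (not part of the original source): the checks above accumulate
   signed differences, so errors of opposite sign can cancel. A hedged
   alternative is to accumulate the 1-norm of the difference between the
   reference vector and the vector under test (same magma_c_vector fields as
   used by the tester; assumes <math.h> for fabs).
*/
static real_Double_t
spmv_abs_diff( const magma_c_vector *ref, const magma_c_vector *chk )
{
    real_Double_t res = 0.0;
    for (magma_int_t k = 0; k < ref->num_rows; k++)
        res += fabs( MAGMA_C_REAL( chk->val[k] ) - MAGMA_C_REAL( ref->val[k] ) );
    return res;
}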
/* //////////////////////////////////////////////////////////////////////////// -- testing sparse matrix vector product */ int main( int argc, char** argv ) { magma_int_t info = 0; TESTING_CHECK( magma_init() ); magma_print_environment(); magma_queue_t queue=NULL; magma_queue_create( 0, &queue ); magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR}, dA={Magma_CSR}, dA_SELLP={Magma_CSR}; magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR}, dy={Magma_CSR}, hrefvec={Magma_CSR}, hcheck={Magma_CSR}; hA_SELLP.blocksize = 8; hA_SELLP.alignment = 8; real_Double_t start, end, res; #ifdef MAGMA_WITH_MKL magma_int_t *pntre=NULL; #endif cusparseHandle_t cusparseHandle = NULL; cusparseMatDescr_t descr = NULL; float c_one = MAGMA_S_MAKE(1.0, 0.0); float c_zero = MAGMA_S_MAKE(0.0, 0.0); float accuracy = 1e-10; #define PRECISION_s #if defined(PRECISION_c) accuracy = 1e-4; #endif #if defined(PRECISION_s) accuracy = 1e-4; #endif magma_int_t i, j; for( i = 1; i < argc; ++i ) { if ( strcmp("--blocksize", argv[i]) == 0 ) { hA_SELLP.blocksize = atoi( argv[++i] ); } else if ( strcmp("--alignment", argv[i]) == 0 ) { hA_SELLP.alignment = atoi( argv[++i] ); } else break; } printf("\n# usage: ./run_sspmm" " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n", (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment ); while( i < argc ) { if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) { // Laplace test i++; magma_int_t laplace_size = atoi( argv[i] ); TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue )); } else { // file-matrix test TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue )); } printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n", (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz ); real_Double_t FLOPS = 2.0*hA.nnz/1e9; // m - number of rows for the sparse matrix // n - number of vectors to be multiplied in the SpMM product magma_int_t m, n; m = hA.num_rows; n = 48; // init CPU vectors TESTING_CHECK( magma_svinit( &hx, Magma_CPU, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue )); // init DEV vectors TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue )); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue )); // calling MKL with CSR #ifdef MAGMA_WITH_MKL TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) ); pntre[0] = 0; for (j=0; j < m; j++ ) { pntre[j] = hA.row[j+1]; } MKL_INT num_rows = hA.num_rows; MKL_INT num_cols = hA.num_cols; MKL_INT nnz = hA.nnz; MKL_INT num_vecs = n; MKL_INT *col; TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.nnz; ++t ) { col[ t ] = hA.col[ t ]; } MKL_INT *row; TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) )); for( magma_int_t t=0; t < hA.num_rows; ++t ) { row[ t ] = hA.col[ t ]; } // === Call MKL with consecutive SpMVs, using mkl_scsrmv === // warmp up mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmv( "N", &num_rows, &num_cols, MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) ); } end = magma_wtime(); printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10/(end-start) ); // === Call MKL with blocked SpMVs, using mkl_scsrmm === char transa = 'n'; MKL_INT ldb = n, ldc=n; char matdescra[6] = {'g', 'l', 
'n', 'c', 'x', 'x'}; // warm up mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); start = magma_wtime(); for (j=0; j < 10; j++ ) { mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols, MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre, MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc ); } end = magma_wtime(); printf( "\n > MKL SpMM : %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); magma_free_cpu( row ); magma_free_cpu( col ); row = NULL; col = NULL; #endif // MAGMA_WITH_MKL // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue )); // SpMV on GPU (CSR) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue )); magma_smfree(&dA, queue ); // convert to SELLP and copy to GPU TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue )); TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue )); magma_smfree(&hA_SELLP, queue ); magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); // SpMV on GPU (SELLP) start = magma_sync_wtime( queue ); for (j=0; j < 10; j++) { TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue )); } end = magma_sync_wtime( queue ); printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n", (end-start)/10, FLOPS*10.*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res=res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res < accuracy ) printf("%% tester spmm SELL-P: ok\n"); else printf("%% tester spmm SELL-P: failed\n"); magma_smfree( &hcheck, queue ); magma_smfree(&dA_SELLP, queue ); // SpMV on GPU (CUSPARSE - CSR) // CUSPARSE context // magma_smfree( &dy, queue ); TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue )); //#ifdef PRECISION_d start = magma_sync_wtime( queue ); TESTING_CHECK( cusparseCreate( &cusparseHandle )); TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) )); TESTING_CHECK( cusparseCreateMatDescr( &descr )); TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); float alpha = c_one; float beta = c_zero; // copy matrix to GPU TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) ); for (j=0; j < 10; j++) { cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr, dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta, dy.dval, dA.num_cols); } end = magma_sync_wtime( queue ); printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n", (end-start)/10, FLOPS*10*n/(end-start) ); TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue )); res = 0.0; for(magma_int_t k=0; k < hA.num_rows; k++ ) { res = res + MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]); } printf("%% |x-y|_F = %8.2e\n", res); if ( res 
< accuracy ) printf("%% tester spmm cuSPARSE: ok\n"); else printf("%% tester spmm cuSPARSE: failed\n"); magma_smfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); descr = NULL; cusparseHandle = NULL; //#endif printf("\n\n"); // free CPU memory magma_smfree( &hA, queue ); magma_smfree( &hx, queue ); magma_smfree( &hy, queue ); magma_smfree( &hrefvec, queue ); // free GPU memory magma_smfree( &dx, queue ); magma_smfree( &dy, queue ); magma_smfree( &dA, queue); #ifdef MAGMA_WITH_MKL magma_free_cpu( pntre ); #endif i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
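/*
   Illustration (not part of the original source): both testers report
   performance with the usual sparse flop count of 2*nnz flops per multiplied
   vector, i.e. GFLOP/s = 2*nnz*reps*n_vecs / (1e9 * seconds) for reps
   repetitions and n_vecs right-hand sides. A hedged helper for that
   conversion:
*/
static double
spmm_gflops( magma_int_t nnz, magma_int_t n_vecs, magma_int_t reps,
             real_Double_t seconds )
{
    double gflop = 2.0 * (double) nnz / 1e9;    // GFLOP per single SpMV
    return gflop * (double) reps * (double) n_vecs / (double) seconds;
}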
extern "C" magma_int_t magma_dgeqrf_gpu( magma_int_t m, magma_int_t n, double *dA, magma_int_t ldda, double *tau, double *dT, magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DGEQRF computes a QR factorization of a real M-by-N matrix A: A = Q * R. This version stores the triangular dT matrices used in the block QR factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Also, the upper triangular matrices for V have 0s in them. The corresponding parts of the upper triangular R are inverted and stored separately in dT. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) DOUBLE_PRECISION array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). dT (workspace/output) DOUBLE_PRECISION array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB, where NB can be obtained through magma_get_dgeqrf_nb(M). It starts with MIN(M,N)*NB block that store the triangular T matrices, followed by the MIN(M,N)*NB block of the diagonal inverses for the R matrix. The rest of the array is used as workspace. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). 
===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define t_ref(a_1) (dT+(a_1)*nb) #define d_ref(a_1) (dT+(minmn+(a_1))*nb) #define dd_ref(a_1) (dT+(2*minmn+(a_1))*nb) #define work_ref(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) magma_int_t i, k, minmn, old_i, old_ib, rows, cols; magma_int_t ib, nb; magma_int_t ldwork, lddwork, lwork, lhwork; double *work, *ut; /* check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = minmn = min(m,n); if (k == 0) return *info; nb = magma_get_dgeqrf_nb(m); lwork = (m + n + nb)*nb; lhwork = lwork - m*nb; if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lwork )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } ut = hwork+nb*(n); memset( ut, 0, nb*nb*sizeof(double)); magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); ldwork = m; lddwork= n; if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); rows = m -i; magma_dgetmatrix_async( rows, ib, a_ref(i,i), ldda, work_ref(i), ldwork, stream[1] ); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ cols = n-old_i-2*old_ib; magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, m-old_i, cols, old_ib, a_ref(old_i, old_i ), ldda, t_ref(old_i), nb, a_ref(old_i, old_i+2*old_ib), ldda, dd_ref(0), lddwork); /* store the diagonal */ magma_dsetmatrix_async( old_ib, old_ib, ut, old_ib, d_ref(old_i), old_ib, stream[0] ); } magma_queue_sync( stream[1] ); lapackf77_dgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &ib); /* Put 0s in the upper triangular part of a panel (and 1s on the diagonal); copy the upper triangular in ut and invert it. */ magma_queue_sync( stream[0] ); dsplit_diag_block(ib, work_ref(i), ldwork, ut); magma_dsetmatrix( rows, ib, work_ref(i), ldwork, a_ref(i,i), ldda ); if (i + ib < n) { /* Send the triangular factor T to the GPU */ magma_dsetmatrix( ib, ib, hwork, ib, t_ref(i), nb ); if (i+nb < k-nb){ /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); } else { cols = n-i-ib; magma_dlarfb_gpu( MagmaLeft, MagmaTrans, MagmaForward, MagmaColumnwise, rows, cols, ib, a_ref(i, i ), ldda, t_ref(i), nb, a_ref(i, i+ib), ldda, dd_ref(0), lddwork); /* Fix the diagonal block */ magma_dsetmatrix( ib, ib, ut, ib, d_ref(i), ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_dgetmatrix( rows, ib, a_ref(i, i), ldda, work, rows ); lhwork = lwork - rows*ib; lapackf77_dgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_dsetmatrix( rows, ib, work, rows, a_ref(i, i), ldda ); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; /* End of MAGMA_DGEQRF */ } /* magma_dgeqrf */
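/*
   Illustration (not part of the original source): the t_ref/d_ref/dd_ref
   macros above split one device buffer into the block-reflector T factors,
   the inverted diagonal R blocks, and the larfb workspace behind them. A
   hedged helper restating the required number of elements exactly as the
   comment block above documents it (magma_roundup(n,32) is used here as
   shorthand for (N+31)/32*32):
*/
static magma_int_t
dgeqrf_gpu_dT_size( magma_int_t m, magma_int_t n )
{
    magma_int_t nb = magma_get_dgeqrf_nb( m );      // same block size the routine uses
    // T factors + inverted diagonal blocks + larfb workspace
    return (2*min( m, n ) + magma_roundup( n, 32 )) * nb;
}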
magma_int_t magma_dpgmres( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x, magma_d_solver_par *solver_par, magma_d_preconditioner *precond_par ){ // prepare solver feedback solver_par->solver = Magma_PGMRES; solver_par->numiter = 0; solver_par->info = 0; // local variables double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE, c_mone = MAGMA_D_NEG_ONE; magma_int_t dofs = A.num_rows; magma_int_t i, j, k, m = 0; magma_int_t restart = min( dofs-1, solver_par->restart ); magma_int_t ldh = restart+1; double nom, rNorm, RNorm, nom0, betanom, r0 = 0.; // CPU workspace magma_setdevice(0); double *H, *HH, *y, *h1; magma_dmalloc_pinned( &H, (ldh+1)*ldh ); magma_dmalloc_pinned( &y, ldh ); magma_dmalloc_pinned( &HH, ldh*ldh ); magma_dmalloc_pinned( &h1, ldh ); // GPU workspace magma_d_vector r, q, q_t, z, z_t, t; magma_d_vinit( &t, Magma_DEV, dofs, c_zero ); magma_d_vinit( &r, Magma_DEV, dofs, c_zero ); magma_d_vinit( &q, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z, Magma_DEV, dofs*(ldh+1), c_zero ); magma_d_vinit( &z_t, Magma_DEV, dofs, c_zero ); q_t.memory_location = Magma_DEV; q_t.val = NULL; q_t.num_rows = q_t.nnz = dofs; double *dy, *dH = NULL; if (MAGMA_SUCCESS != magma_dmalloc( &dy, ldh )) return MAGMA_ERR_DEVICE_ALLOC; if (MAGMA_SUCCESS != magma_dmalloc( &dH, (ldh+1)*ldh )) return MAGMA_ERR_DEVICE_ALLOC; // GPU stream magma_queue_t stream[2]; magma_event_t event[1]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_event_create( &event[0] ); magmablasSetKernelStream(stream[0]); magma_dscal( dofs, c_zero, x->val, 1 ); // x = 0 magma_dcopy( dofs, b.val, 1, r.val, 1 ); // r = b nom0 = betanom = magma_dnrm2( dofs, r.val, 1 ); // nom0= || r|| nom = nom0 * nom0; solver_par->init_res = nom0; H(1,0) = MAGMA_D_MAKE( nom0, 0. ); magma_dsetvector(1, &H(1,0), 1, &dH(1,0), 1); if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) r0 = ATOLERANCE; if ( nom < r0 ) return MAGMA_SUCCESS; //Chronometry real_Double_t tempo1, tempo2; magma_device_sync(); tempo1=magma_wtime(); if( solver_par->verbose > 0 ){ solver_par->res_vec[0] = nom0; solver_par->timing[0] = 0.0; } // start iteration for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; solver_par->numiter++ ){ magma_dcopy(dofs, r.val, 1, q(0), 1); // q[0] = 1.0/H(1,0) r magma_dscal(dofs, 1./H(1,0), q(0), 1); // (to be fused) for(k=1; k<=restart; k++) { q_t.val = q(k-1); magmablasSetKernelStream(stream[0]); // preconditioner // z[k] = M^(-1) q(k) magma_d_applyprecond_left( A, q_t, &t, precond_par ); magma_d_applyprecond_right( A, t, &z_t, precond_par ); magma_dcopy(dofs, z_t.val, 1, z(k-1), 1); // r = A q[k] magma_d_spmv( c_one, A, z_t, c_zero, r ); if (solver_par->ortho == Magma_MGS ) { // modified Gram-Schmidt magmablasSetKernelStream(stream[0]); for (i=1; i<=k; i++) { H(i,k) =magma_ddot(dofs, q(i-1), 1, r.val, 1); // H(i,k) = q[i] . r magma_daxpy(dofs,-H(i,k), q(i-1), 1, r.val, 1); // r = r - H(i,k) q[i] } H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if (k < restart) { magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } } else if (solver_par->ortho == Magma_FUSED_CGS ) { // fusing dgemv with dnrm2 in classical Gram-Schmidt magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // dH(1:k+1,k) = q[0:k] . 
r magmablas_dgemv(MagmaTrans, dofs, k+1, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); // 1) dH(k+1,k) = sqrt( dH(k+1,k) - dH(1:k,k) ) magma_dcopyscale( dofs, k, r.val, q(k), &dH(1,k) ); // 2) q[k] = q[k] / dH(k+1,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); // asynch copy dH(1:(k+1),k) to H(1:(k+1),k) } else { // classical Gram-Schmidt (default) // > explicitly calling magmabls magmablasSetKernelStream(stream[0]); magmablas_dgemv(MagmaTrans, dofs, k, c_one, q(0), dofs, r.val, 1, c_zero, &dH(1,k), 1); // dH(1:k,k) = q[0:k-1] . r #ifndef DNRM2SCALE // start copying dH(1:k,k) to H(1:k,k) magma_event_record( event[0], stream[0] ); magma_queue_wait_event( stream[1], event[0] ); magma_dgetvector_async(k, &dH(1,k), 1, &H(1,k), 1, stream[1]); #endif // r = r - q[0:k-1] dH(1:k,k) magmablas_dgemv(MagmaNoTrans, dofs, k, c_mone, q(0), dofs, &dH(1,k), 1, c_one, r.val, 1); #ifdef DNRM2SCALE magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = r / H(k,k-1) magma_dnrm2scale(dofs, q(k), dofs, &dH(k+1,k) ); // dH(k+1,k) = sqrt(r . r) and r = r / dH(k+1,k) magma_event_record( event[0], stream[0] ); // start sending dH(1:k,k) to H(1:k,k) magma_queue_wait_event( stream[1], event[0] ); // can we keep H(k+1,k) on GPU and combine? magma_dgetvector_async(k+1, &dH(1,k), 1, &H(1,k), 1, stream[1]); #else H(k+1,k) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. ); // H(k+1,k) = sqrt(r . r) if( k<solver_par->restart ){ magmablasSetKernelStream(stream[0]); magma_dcopy(dofs, r.val, 1, q(k), 1); // q[k] = 1.0/H[k][k-1] r magma_dscal(dofs, 1./H(k+1,k), q(k), 1); // (to be fused) } #endif } } magma_queue_sync( stream[1] ); for( k=1; k<=restart; k++ ){ /* Minimization of || b-Ax || in H_k */ for (i=1; i<=k; i++) { #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ddot_sub( i+1, &H(1,k), 1, &H(1,i), 1, &HH(k,i) ); #else HH(k,i) = cblas_ddot(i+1, &H(1,k), 1, &H(1,i), 1); #endif } h1[k] = H(1,k)*H(1,0); if (k != 1) for (i=1; i<k; i++) { for (m=i+1; m<k; m++){ HH(k,m) -= HH(k,i) * HH(m,i); } HH(k,k) -= HH(k,i) * HH(k,i) / HH(i,i); HH(k,i) = HH(k,i)/HH(i,i); h1[k] -= h1[i] * HH(k,i); } y[k] = h1[k]/HH(k,k); if (k != 1) for (i=k-1; i>=1; i--) { y[i] = h1[i]/HH(i,i); for (j=i+1; j<=k; j++) y[i] -= y[j] * HH(j,i); } m = k; rNorm = fabs(MAGMA_D_REAL(H(k+1,k))); } magma_dsetmatrix_async(m, 1, y+1, m, dy, m, stream[0]); magmablasSetKernelStream(stream[0]); magma_dgemv(MagmaNoTrans, dofs, m, c_one, z(0), dofs, dy, 1, c_one, x->val, 1); magma_d_spmv( c_mone, A, *x, c_zero, r ); // r = - A * x magma_daxpy(dofs, c_one, b.val, 1, r.val, 1); // r = r + b H(1,0) = MAGMA_D_MAKE( magma_dnrm2(dofs, r.val, 1), 0. 
); // RNorm = H[1][0] = || r || RNorm = MAGMA_D_REAL( H(1,0) ); betanom = fabs(RNorm); if( solver_par->verbose > 0 ){ magma_device_sync(); tempo2=magma_wtime(); if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } if ( betanom < r0 ) { break; } } magma_device_sync(); tempo2=magma_wtime(); solver_par->runtime = (real_Double_t) tempo2-tempo1; double residual; magma_dresidual( A, b, *x, &residual ); solver_par->iter_res = betanom; solver_par->final_res = residual; if( solver_par->numiter < solver_par->maxiter){ solver_par->info = 0; }else if( solver_par->init_res > solver_par->final_res ){ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -2; } else{ if( solver_par->verbose > 0 ){ if( (solver_par->numiter)%solver_par->verbose==0 ) { solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) betanom; solver_par->timing[(solver_par->numiter)/solver_par->verbose] = (real_Double_t) tempo2-tempo1; } } solver_par->info = -1; } // free pinned memory magma_free_pinned( H ); magma_free_pinned( y ); magma_free_pinned( HH ); magma_free_pinned( h1 ); // free GPU memory magma_free(dy); if (dH != NULL ) magma_free(dH); magma_d_vfree(&t); magma_d_vfree(&r); magma_d_vfree(&q); magma_d_vfree(&z); magma_d_vfree(&z_t); // free GPU streams and events magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_event_destroy( event[0] ); magmablasSetKernelStream(NULL); return MAGMA_SUCCESS; } /* magma_dgmres */
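/*
   Illustration (not part of the original source): all three orthogonalization
   branches above (modified Gram-Schmidt, fused classical Gram-Schmidt, and
   classical Gram-Schmidt) produce the same column of the Hessenberg matrix.
   The plain-loop sketch below spells out one modified Gram-Schmidt step with
   hypothetical host arrays: q holds the k previous basis vectors column by
   column, r is the new vector to be orthogonalized, and H uses the same
   one-based (i,k) indexing as the H(i,k) macro above.
*/
static void
mgs_step_sketch( magma_int_t dofs, magma_int_t k,
                 const double *q,        // dofs x k, column major
                 double *r,              // vector being orthogonalized, length dofs
                 double *H, magma_int_t ldh )
{
    for (magma_int_t i = 1; i <= k; ++i) {
        double hik = 0.0;
        for (magma_int_t row = 0; row < dofs; ++row)     // H(i,k) = q[i-1] . r
            hik += q[(i-1)*dofs + row] * r[row];
        H[i + k*ldh] = hik;
        for (magma_int_t row = 0; row < dofs; ++row)     // r = r - H(i,k) * q[i-1]
            r[row] -= hik * q[(i-1)*dofs + row];
    }
    // the caller then sets H(k+1,k) = ||r|| and normalizes r into q[k]
}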
/** Purpose ------- DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. It uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A DOUBLE PRECISION array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_dgesv_comp ********************************************************************/ extern "C" magma_int_t magma_dgetrf( magma_int_t m, magma_int_t n, double *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_)*nb + (j_)*nb*ldda + dA_offset) #define dAT(i_, j_) dAT, ((i_)*nb*lddat + (j_)*nb + dAT_offset) #define dwork(i_) dwork, (i_) #else #define dA(i_, j_) ( dA + (i_)*nb + (j_)*nb*ldda) #define dAT(i_, j_) ( dAT + (i_)*nb*lddat + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif // Constants const double c_one = MAGMA_D_ONE; const double c_neg_one = MAGMA_D_NEG_ONE; // Local variables double *work; magmaDouble_ptr dA, dAT, dwork; magma_int_t iinfo, nb; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ nb = magma_get_dgetrf_nb( m, n ); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_dgetrf( &m, &n, A, &lda, ipiv, info ); } else { /* Use hybrid blocked code. 
*/ magma_int_t maxm, maxn, ldda, lddat, maxdim; magma_int_t i, j, rows, cols, s = min(m, n)/nb; maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); maxdim = max( maxm, maxn ); lddat = maxn; ldda = maxm; /* set number of GPUs */ magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magma_queue_t queues[2] = { NULL, NULL }; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); /* check the memory requirement */ size_t mem_size = magma_queue_mem_size( queues[0] ); mem_size /= sizeof(double); magma_int_t h = 1+(2+ngpu); magma_int_t ngpu2 = ngpu; magma_int_t NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( ngpu > ceil((double)NB/nb) ) { ngpu2 = (magma_int_t)ceil((double)NB/nb); h = 1+(2+ngpu2); NB = (magma_int_t)(0.8*mem_size/maxm - h*nb); } if ( ngpu2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place // dwork is nb*maxm for panel, and maxdim*maxdim for A if (MAGMA_SUCCESS != magma_dmalloc( &dwork, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; ldda = lddat = maxdim; magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); dAT = dA; magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place // dwork is nb*maxm for panel, and maxm*maxn for A if (MAGMA_SUCCESS != magma_dmalloc( &dwork, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } dA = dwork + nb*maxm; magma_dsetmatrix( m, n, A, lda, dA(0,0), ldda, queues[0] ); if (MAGMA_SUCCESS != magma_dmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dwork ); magma_dgetrf_m( ngpu, m, n, A, lda, ipiv, info ); return *info; } magmablas_dtranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } lapackf77_dgetrf( &m, &nb, work, &lda, ipiv, &iinfo ); for( j = 0; j < s; j++ ) { // get j-th panel from device cols = maxm - j*nb; if (j > 0) { magmablas_dtranspose( nb, cols, dAT(j,j), lddat, dwork(0), cols, queues[0] ); magma_queue_sync( queues[0] ); magma_dgetmatrix_async( m-j*nb, nb, dwork(0), cols, work, lda, queues[1] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); // do the cpu part rows = m - j*nb; magma_queue_sync( queues[1] ); lapackf77_dgetrf( &rows, &nb, work, &lda, ipiv+j*nb, &iinfo ); } if (*info == 0 && iinfo > 0) *info = iinfo + j*nb; // put j-th panel onto device magma_dsetmatrix_async( m-j*nb, nb, work, lda, dwork(0), cols, queues[1] ); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); magma_queue_sync( queues[1] ); magmablas_dtranspose( cols, nb, 
dwork(0), cols, dAT(j,j), lddat, queues[0] ); // do the small non-parallel computations (next panel update) if (s > (j+1)) { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_dgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_dtranspose( nb0, rows, dAT(s,s), lddat, dwork(0), cols, queues[0] ); magma_dgetmatrix_async( rows, nb0, dwork(0), cols, work, lda, queues[0] ); magma_queue_sync( queues[0] ); // do the cpu part lapackf77_dgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo ); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_dlaswp( n, dAT(0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); // put j-th panel onto device magma_dsetmatrix_async( rows, nb0, work, lda, dwork(0), cols, queues[0] ); magmablas_dtranspose( rows, nb0, dwork(0), cols, dAT(s,s), lddat, queues[0] ); magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), lddat, dAT(s, s)+nb0, lddat, queues[0] ); } // undo transpose if (maxdim*maxdim < 2*maxm*maxn) { magmablas_dtranspose_inplace( maxdim, dAT(0,0), lddat, queues[0] ); magma_dgetmatrix( m, n, dAT(0,0), lddat, A, lda, queues[0] ); } else { magmablas_dtranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_dgetmatrix( m, n, dA(0,0), ldda, A, lda, queues[0] ); magma_free( dAT ); } magma_free( dwork ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); } return *info; } /* magma_dgetrf */
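/*
   Illustration (not part of the original source): a hedged usage sketch for
   the hybrid factorization above. magma_dgetrf returns the same {LU, ipiv}
   layout as LAPACK's dgetrf, so the standard lapackf77_dgetrs triangular
   solves can be used for the solve phase. Pinned host memory is used for A
   because the routine's asynchronous copies benefit from it; error handling
   and deallocation are omitted.
*/
static void
dgetrf_usage_sketch( void )
{
    magma_int_t n = 1000, nrhs = 1, lda = n, info;
    double *A, *b;
    magma_int_t *ipiv;

    magma_dmalloc_pinned( &A, lda*n );
    magma_dmalloc_cpu( &b, n );
    magma_imalloc_cpu( &ipiv, n );
    // ... fill A with the system matrix and b with the right-hand side ...

    magma_dgetrf( n, n, A, lda, ipiv, &info );          // A = P*L*U (hybrid CPU/GPU)
    if (info == 0)
        lapackf77_dgetrs( MagmaNoTransStr, &n, &nrhs, A, &lda, ipiv, b, &n, &info );
}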
/***************************************************************************//** Purpose ------- SGEHRD2 reduces a REAL general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to SGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau REAL array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) REAL array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. 
@ingroup magma_gehrd *******************************************************************************/ extern "C" magma_int_t magma_sgehrd2( magma_int_t n, magma_int_t ilo, magma_int_t ihi, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i_,j_) ( A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda*2) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dV(i_,j_) dwork, ((i_) + (j_)*ldda + nb*ldda) #define dwork(i_) dwork, ((i_)) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dV(i_,j_) (dV + (i_) + (j_)*ldda) #define dwork(i_) (dwork + (i_)) #endif // Constants const float c_one = MAGMA_S_ONE; const float c_zero = MAGMA_S_ZERO; // Local variables magma_int_t nb = magma_get_sgehrd_nb( n ); magma_int_t ldda = magma_roundup( n, 32 ); magma_int_t i, nh, iws; magma_int_t iinfo; magma_int_t lquery; *info = 0; iws = n*nb; work[0] = magma_smake_lwork( iws ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb > nh) { // Use unblocked code below i = ilo; } else { // Use blocked code magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); // GPU workspace is: // nb*ldda for dwork for slahru // nb*ldda for dV // n*ldda for dA // nb*nb for dT magmaFloat_ptr dwork; if (MAGMA_SUCCESS != magma_smalloc( &dwork, 2*nb*ldda + n*ldda + nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } float *dV = dwork + nb*ldda; float *dA = dwork + nb*ldda*2; float *dT = dwork + nb*ldda*2 + n*ldda; float *T; magma_smalloc_cpu( &T, nb*nb ); if ( T == NULL ) { magma_free( dwork ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } // zero first block of V, which is lower triangular magmablas_slaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; assert( nb % 4 == 0 ); for (i=0; i < nb*nb; i += 4) T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero; // Copy the matrix to the GPU magma_ssetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue ); for (i = ilo; i < ihi-1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) magma_sgetmatrix( ihi-i, nb, dA(i,i-ilo), ldda, A(i,i), lda, queue ); // add 1 to i for 1-based index magma_slahr2( ihi, i+1, nb, dA(0,i-ilo), ldda, dV(0,0), ldda, A(0,i), lda, &tau[i], T, nb, work, n, queue ); // Copy T from the CPU to dT on the GPU magma_ssetmatrix( nb, nb, T, nb, dT(0,0), nb, queue ); magma_slahru( n, ihi, i, nb, A(0,i), lda, dA(0,i-ilo), ldda, // dA dA(i,i-ilo), ldda, // dY, stored over current panel dV(0,0), ldda, dT(0,0), dwork, queue ); } // Copy remainder to host magma_sgetmatrix( n, n-i, dA(0,i-ilo), ldda, A(0,i), lda, 
queue ); magma_free( dwork ); magma_free_cpu( T ); magma_queue_destroy( queue ); } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_sgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = magma_smake_lwork( iws ); return *info; } /* magma_sgehrd2 */
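/*
    Usage sketch (illustrative only): the LWORK = -1 workspace query described above, followed by the actual
    reduction to upper Hessenberg form. Uses the magma_sgehrd2 prototype defined above with ilo = 1, ihi = n
    (no prior balancing); A and tau are caller-provided and their contents are not shown.
*/
static magma_int_t example_sgehrd2( magma_int_t n, float *A, magma_int_t lda, float *tau )
{
    magma_int_t info  = 0;
    float       query = 0;

    // 1) workspace query: the optimal size is returned in the first entry of work
    magma_sgehrd2( n, 1, n, A, lda, tau, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) query;

    // 2) allocate the workspace and perform the reduction
    float *work = NULL;
    magma_smalloc_cpu( &work, lwork );
    magma_sgehrd2( n, 1, n, A, lda, tau, work, lwork, &info );
    magma_free_cpu( work );
    return info;
}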
/** Purpose ------- SORMQR overwrites the general real M-by-N matrix C with @verbatim SIDE = MagmaLeft SIDE = MagmaRight TRANS = MagmaNoTrans: Q * C C * Q TRANS = MagmaTrans: Q**T * C C * Q**T @endverbatim where Q is a real orthogonal matrix defined as the product of k elementary reflectors Q = H(1) H(2) . . . H(k) as returned by SGEQRF. Q is of order M if SIDE = MagmaLeft and of order N if SIDE = MagmaRight. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] side magma_side_t - = MagmaLeft: apply Q or Q**T from the Left; - = MagmaRight: apply Q or Q**T from the Right. @param[in] trans magma_trans_t - = MagmaNoTrans: No transpose, apply Q; - = MagmaTrans: Transpose, apply Q**T. @param[in] m INTEGER The number of rows of the matrix C. M >= 0. @param[in] n INTEGER The number of columns of the matrix C. N >= 0. @param[in] k INTEGER The number of elementary reflectors whose product defines the matrix Q. If SIDE = MagmaLeft, M >= K >= 0; if SIDE = MagmaRight, N >= K >= 0. @param[in] A REAL array, dimension (LDA,K) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by SGEQRF in the first k columns of its array argument A. @param[in] lda INTEGER The leading dimension of the array A. If SIDE = MagmaLeft, LDA >= max(1,M); if SIDE = MagmaRight, LDA >= max(1,N). @param[in] tau REAL array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by SGEQRF. @param[in,out] C REAL array, dimension (LDC,N) On entry, the M-by-N matrix C. On exit, C is overwritten by Q*C or Q**T*C or C*Q**T or C*Q. @param[in] ldc INTEGER The leading dimension of the array C. LDC >= max(1,M). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK. If SIDE = MagmaLeft, LWORK >= max(1,N); if SIDE = MagmaRight, LWORK >= max(1,M). For optimum performance LWORK >= N*NB if SIDE = MagmaLeft, and LWORK >= M*NB if SIDE = MagmaRight, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. 
@param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sormqr_m( magma_int_t ngpu, magma_side_t side, magma_trans_t trans, magma_int_t m, magma_int_t n, magma_int_t k, float *A, magma_int_t lda, float *tau, float *C, magma_int_t ldc, float *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define C(i, j) (C + (j)*ldc + (i)) #define dC(gpui, i, j) (dw[gpui] + (j)*lddc + (i)) #define dA_c(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddac) #define dA_r(gpui, ind, i, j) (dw[gpui] + maxnlocal*lddc + (ind)*lddar*lddac + (i) + (j)*lddar) #define dT(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + (ind)*((nb+1)*nb)) #define dwork(gpui, ind) (dw[gpui] + maxnlocal*lddc + 2*lddac*lddar + 2*((nb+1)*nb) + (ind)*(lddwork*nb)) float c_zero = MAGMA_S_ZERO; float c_one = MAGMA_S_ONE; const char* side_ = lapack_side_const( side ); const char* trans_ = lapack_trans_const( trans ); // TODO fix memory leak (alloc after argument checks) magma_int_t nb = 128; float *T; magma_smalloc_pinned(&T, nb*nb); //printf("calling sormqr_m with nb=%d\n", (int) nb); float* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; magma_event_t event [MagmaMaxGPUs][2]; magma_int_t ind_c; magma_device_t igpu; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); *info = 0; magma_int_t left = (side == MagmaLeft); magma_int_t notran = (trans == MagmaNoTrans); magma_int_t lquery = (lwork == -1); /* NQ is the order of Q and NW is the minimum dimension of WORK */ magma_int_t nq, nw; if (left) { nq = m; nw = n; } else { nq = n; nw = m; } if (! left && side != MagmaRight) { *info = -1; } else if (! notran && trans != MagmaTrans) { *info = -2; } else if (m < 0) { *info = -3; } else if (n < 0) { *info = -4; } else if (k < 0 || k > nq) { *info = -5; } else if (lda < max(1,nq)) { *info = -7; } else if (ldc < max(1,m)) { *info = -10; } else if (lwork < max(1,nw) && ! lquery) { *info = -12; } magma_int_t lwkopt = max(1,nw) * nb; if (*info == 0) { work[0] = MAGMA_S_MAKE( lwkopt, 0 ); } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (m == 0 || n == 0 || k == 0) { work[0] = c_one; return *info; } if (nb >= k) { /* Use CPU code */ lapackf77_sormqr(side_, trans_, &m, &n, &k, A, &lda, tau, C, &ldc, work, &lwork, info); return *info; } magma_int_t lddc = (m+63)/64*64; magma_int_t lddac = nq; magma_int_t lddar = nb; magma_int_t lddwork = nw; magma_int_t nlocal[ MagmaMaxGPUs ] = { 0 }; magma_int_t nb_l=256; magma_int_t nbl = (n-1)/nb_l+1; // number of blocks magma_int_t maxnlocal = (nbl+ngpu-1)/ngpu*nb_l; ngpu = min(ngpu, (n+nb_l-1)/nb_l); // Don't use GPU that will not have data. 
magma_int_t ldw = maxnlocal*lddc // dC + 2*lddac*lddar // 2*dA + 2*(nb + 1 + lddwork)*nb; // 2*(dT and dwork) for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], ldw )) { *info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(*info) ); return *info; } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); magma_event_create( &event[igpu][0] ); magma_event_create( &event[igpu][1] ); } /* Use hybrid CPU-MGPU code */ if (left) { //copy C to mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_ssetmatrix_async( m, kb, C(0, i*nb_l), ldc, dC(igpu, 0, i/ngpu*nb_l), lddc, stream[igpu][0] ); nlocal[igpu] += kb; } magma_int_t i1, i2, i3; if ( !notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } ind_c = 0; for (magma_int_t i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { // start the copy of A panel magma_int_t kb = min(nb, k - i); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_sync(event[igpu][ind_c]); // check if the new data can be copied magma_ssetmatrix_async(nq-i, kb, A(i, i), lda, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); // set upper triangular part of dA to identity magmablas_slaset_band_q( MagmaUpper, kb, kb, kb, c_zero, c_one, dA_c(igpu, ind_c, i, 0), lddac, stream[igpu][0] ); } /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ magma_int_t nqi = nq - i; lapackf77_slarft("F", "C", &nqi, &kb, A(i, i), &lda, &tau[i], T, &kb); /* H or H' is applied to C(1:m,i:n) */ /* Apply H or H'; First copy T to the GPU */ for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_ssetmatrix_async(kb, kb, T, kb, dT(igpu, ind_c), kb, stream[igpu][0] ); } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][0] ); // check if the data was copied magmablasSetKernelStream(stream[igpu][1]); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, m-i, nlocal[igpu], kb, dA_c(igpu, ind_c, i, 0), lddac, dT(igpu, ind_c), kb, dC(igpu, i, 0), lddc, dwork(igpu, ind_c), lddwork); magma_event_record(event[igpu][ind_c], stream[igpu][1] ); } ind_c = (ind_c+1)%2; } for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_queue_sync( stream[igpu][1] ); } //copy C from mgpus for (magma_int_t i = 0; i < nbl; ++i) { magma_int_t igpu = i%ngpu; magma_setdevice(igpu); magma_int_t kb = min(nb_l, n-i*nb_l); magma_sgetmatrix( m, kb, dC(igpu, 0, i/ngpu*nb_l), lddc, C(0, i*nb_l), ldc ); // magma_sgetmatrix_async( m, kb, // dC(igpu, 0, i/ngpu*nb_l), lddc, // C(0, i*nb_l), ldc, stream[igpu][0] ); } } else { // TODO fix memory leak T, dw, event, stream fprintf(stderr, "The case (side == right) is not implemented\n"); *info = MAGMA_ERR_NOT_IMPLEMENTED; magma_xerbla( __func__, -(*info) ); return *info; /* if ( notran ) { i1 = 0; i2 = k; i3 = nb; } else { i1 = (k - 1) / nb * nb; i2 = 0; i3 = -nb; } mi = m; ic = 0; for (i = i1; (i3 < 0 ? i >= i2 : i < i2); i += i3) { ib = min(nb, k - i); // Form the triangular factor of the block reflector // H = H(i) H(i+1) . . . 
H(i+ib-1) i__4 = nq - i; lapackf77_slarft("F", "C", &i__4, &ib, A(i, i), &lda, &tau[i], T, &ib); // 1) copy the panel from A to the GPU, and // 2) set upper triangular part of dA to identity magma_ssetmatrix( i__4, ib, A(i, i), lda, dA(i, 0), ldda ); magmablas_slaset_band( MagmaUpper, ib, ib, ib, c_zero, c_one, dA(i, 0), ldda ); // H or H' is applied to C(1:m,i:n) ni = n - i; jc = i; // Apply H or H'; First copy T to the GPU magma_ssetmatrix( ib, ib, T, ib, dT, ib ); magma_slarfb_gpu( side, trans, MagmaForward, MagmaColumnwise, mi, ni, ib, dA(i, 0), ldda, dT, ib, dC(ic, jc), lddc, dwork, lddwork); } */ } work[0] = MAGMA_S_MAKE( lwkopt, 0 ); for (igpu = 0; igpu < ngpu; ++igpu) { magma_setdevice(igpu); magma_event_destroy( event[igpu][0] ); magma_event_destroy( event[igpu][1] ); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_sormqr */
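/*
    Usage sketch (illustrative only): apply Q**T from a QR factorization (A and tau as returned by sgeqrf)
    to an m-by-nrhs matrix C on ngpu GPUs, e.g. as the first step of a least-squares solve. Only
    SIDE = MagmaLeft is implemented above; the LWORK = -1 query follows the documented convention.
    All array arguments are caller-provided placeholders.
*/
static magma_int_t example_sormqr_m(
    magma_int_t ngpu, magma_int_t m, magma_int_t nrhs, magma_int_t k,
    float *A, magma_int_t lda, float *tau,
    float *C, magma_int_t ldc )
{
    magma_int_t info = 0;
    float query = 0;

    // workspace query
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, nrhs, k,
                    A, lda, tau, C, ldc, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) query;

    // C := Q**T * C
    float *work = NULL;
    magma_smalloc_cpu( &work, lwork );
    magma_sormqr_m( ngpu, MagmaLeft, MagmaTrans, m, nrhs, k,
                    A, lda, tau, C, ldc, work, lwork, &info );
    magma_free_cpu( work );
    return info;
}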
/** Purpose ------- SGEQRF computes a QR factorization of a REAL M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. This uses 2 queues to overlap communication and computation. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max( N*NB, 2*NB*NB ), where NB can be obtained through magma_get_sgeqrf_nb( M, N ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). @ingroup magma_sgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_sgeqrf( magma_int_t m, magma_int_t n, float *A, magma_int_t lda, float *tau, float *work, magma_int_t lwork, magma_int_t *info ) { #define A(i_,j_) (A + (i_) + (j_)*lda) #ifdef HAVE_clBLAS #define dA(i_,j_) dA, ((i_) + (j_)*ldda + dA_offset) #define dT(i_,j_) dT, ((i_) + (j_)*nb + dT_offset) #define dwork(i_) dwork, ((i_) + dwork_offset) #else #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dT(i_,j_) (dT + (i_) + (j_)*nb) #define dwork(i_) (dwork + (i_)) #endif /* Constants */ const float c_one = MAGMA_S_ONE; /* Local variables */ magmaFloat_ptr dA, dT, dwork; magma_int_t i, ib, min_mn, ldda, lddwork, old_i, old_ib; /* Function Body */ *info = 0; magma_int_t nb = magma_get_sgeqrf_nb( m, n ); // need 2*nb*nb to store T and upper triangle of V simultaneously magma_int_t lwkopt = max( n*nb, 2*nb*nb ); work[0] = magma_smake_lwork( lwkopt ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1, lwkopt) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; min_mn = min( m, n ); if (min_mn == 0) { work[0] = c_one; return *info; } // largest N for larfb is n-nb (trailing matrix lacks 1st panel) lddwork = magma_roundup( n, 32 ) - nb; ldda = magma_roundup( m, 32 ); magma_int_t ngpu = magma_num_gpus(); if ( ngpu > 1 ) { /* call multiple-GPU interface */ return magma_sgeqrf_m( ngpu, m, n, A, lda, tau, work, lwork, info ); } // allocate space for dA, dwork, and dT if (MAGMA_SUCCESS != magma_smalloc( &dA, n*ldda + nb*lddwork + nb*nb )) { /* alloc failed so call non-GPU-resident version */ return magma_sgeqrf_ooc( m, n, A, lda, tau, work, lwork, info ); } dwork = dA + n*ldda; dT = dA + n*ldda + nb*lddwork; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if ( (nb > 1) && (nb < min_mn) ) { /* Use blocked code initially. Asynchronously send the matrix to the GPU except the first panel. */ magma_ssetmatrix_async( m, n-nb, A(0,nb), lda, dA(0,nb), ldda, queues[0] ); old_i = 0; old_ib = nb; for (i = 0; i < min_mn-nb; i += nb) { ib = min( min_mn-i, nb ); if (i > 0) { /* get i-th panel from device */ magma_queue_sync( queues[1] ); magma_sgetmatrix_async( m-i, ib, dA(i,i), ldda, A(i,i), lda, queues[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i), ldda, dT(0,0), nb, dA(old_i, old_i+2*old_ib), ldda, dwork(0), lddwork, queues[1] ); magma_sgetmatrix_async( i, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); magma_queue_sync( queues[0] ); } magma_int_t rows = m-i; lapackf77_sgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_slarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, A(i,i), &lda, tau+i, work, &ib ); magma_spanel_to_q( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); /* put i-th V matrix onto device */ magma_ssetmatrix_async( rows, ib, A(i,i), lda, dA(i,i), ldda, queues[0] ); /* put T matrix onto device */ magma_queue_sync( queues[1] ); magma_ssetmatrix_async( ib, ib, work, ib, dT(0,0), nb, queues[0] ); magma_queue_sync( queues[0] ); if (i + ib < n) { if (i+ib < min_mn-nb) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left (look-ahead) */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_sq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } else { /* After last panel, update whole trailing matrix. */ /* Apply H' to A(i:m,i+ib:n) from the left */ magma_slarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dT(0,0), nb, dA(i, i+ib), ldda, dwork(0), lddwork, queues[1] ); magma_sq_to_panel( MagmaUpper, ib, A(i,i), lda, work+ib*ib ); } old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < min_mn) { ib = n-i; if (i != 0) { magma_sgetmatrix( m, ib, dA(0,i), ldda, A(0,i), lda, queues[1] ); } magma_int_t rows = m-i; lapackf77_sgeqrf( &rows, &ib, A(i,i), &lda, tau+i, work, &lwork, info ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free( dA ); return *info; } /* magma_sgeqrf */
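/*
    Usage sketch (illustrative only): size the workspace from magma_get_sgeqrf_nb as documented above,
    keep A and WORK in pinned host memory as recommended, and compute A = Q*R. Matrix contents are
    placeholders and error checking is omitted.
*/
static magma_int_t example_sgeqrf( magma_int_t m, magma_int_t n )
{
    magma_int_t info = 0, lda = m;
    magma_int_t min_mn = (m < n ? m : n);
    magma_int_t nb     = magma_get_sgeqrf_nb( m, n );
    magma_int_t lwork  = max( n*nb, 2*nb*nb );

    float *A = NULL, *tau = NULL, *work = NULL;
    magma_smalloc_pinned( &A,    (size_t)lda*n );
    magma_smalloc_pinned( &work, lwork );
    magma_smalloc_cpu( &tau, min_mn );

    /* ... fill A ... */

    magma_sgeqrf( m, n, A, lda, tau, work, lwork, &info );
    // on return, R is in the upper triangle of A; Q is stored implicitly as
    // Householder vectors below the diagonal together with tau

    magma_free_pinned( A );
    magma_free_pinned( work );
    magma_free_cpu( tau );
    return info;
}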
/***************************************************************************//** Purpose ------- SGELQF computes an LQ factorization of a REAL M-by-N matrix dA: dA = L * Q. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and below the diagonal of the array contain the m-by-min(m,n) lower trapezoidal matrix L (L is lower triangular if m <= n); the elements above the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors (see Further Details). @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] tau REAL array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). @param[out] work (workspace) REAL array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. \n Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lwork INTEGER The dimension of the array WORK. LWORK >= max(1,M). For optimum performance LWORK >= M*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details --------------- The matrix Q is represented as a product of elementary reflectors Q = H(k) . . . H(2) H(1), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:n) is stored on exit in A(i,i+1:n), and tau in TAU(i). @ingroup magma_gelqf *******************************************************************************/ extern "C" magma_int_t magma_sgelqf_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, float *tau, float *work, magma_int_t lwork, magma_int_t *info) { /* Constants */ const float c_one = MAGMA_S_ONE; const magma_int_t ione = 1; MAGMA_UNUSED( ione ); // used only for real /* Local variables */ magmaFloat_ptr dAT=NULL; magma_int_t min_mn, maxm, maxn, nb; magma_int_t iinfo; *info = 0; nb = magma_get_sgelqf_nb( m, n ); min_mn = min( m, n ); work[0] = magma_smake_lwork( m*nb ); bool lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } else if (lwork < max(1,m) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (min_mn == 0) { work[0] = c_one; return *info; } maxm = magma_roundup( m, 32 ); maxn = magma_roundup( n, 32 ); magma_int_t lddat = maxn; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if ( m == n ) { dAT = dA; lddat = ldda; magmablas_stranspose_inplace( m, dAT, ldda, queue ); } else { if (MAGMA_SUCCESS != magma_smalloc( &dAT, maxm*maxn ) ) { *info = MAGMA_ERR_DEVICE_ALLOC; goto cleanup; } magmablas_stranspose( m, n, dA, ldda, dAT, lddat, queue ); } magma_sgeqrf2_gpu( n, m, dAT, lddat, tau, &iinfo ); assert( iinfo >= 0 ); if ( iinfo > 0 ) { *info = iinfo; } // conjugate tau #ifdef COMPLEX lapackf77_slacgv( &min_mn, tau, &ione ); #endif if ( m == n ) { magmablas_stranspose_inplace( m, dAT, lddat, queue ); } else { magmablas_stranspose( n, m, dAT, lddat, dA, ldda, queue ); magma_free( dAT ); } cleanup: magma_queue_destroy( queue ); return *info; } /* magma_sgelqf_gpu */
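/*
    Usage sketch (illustrative only): LQ-factor a matrix that already resides on the GPU via the interface
    above. The host matrix is copied in with magma_ssetmatrix, ldda is rounded up to a multiple of 32 as done
    elsewhere in this file, and lwork = m*nb is the documented optimum. Queue handling is minimal and error
    checks are omitted.
*/
static magma_int_t example_sgelqf_gpu( magma_int_t m, magma_int_t n, float *hA, magma_int_t lda )
{
    magma_int_t info   = 0;
    magma_int_t ldda   = magma_roundup( m, 32 );
    magma_int_t min_mn = (m < n ? m : n);
    magma_int_t nb     = magma_get_sgelqf_nb( m, n );
    magma_int_t lwork  = m*nb;

    float *tau = NULL, *work = NULL;
    magmaFloat_ptr dA = NULL;
    magma_smalloc_cpu( &tau, min_mn );
    magma_smalloc_pinned( &work, lwork );
    magma_smalloc( &dA, (size_t)ldda*n );

    magma_queue_t queue;
    magma_device_t dev;
    magma_getdevice( &dev );
    magma_queue_create( dev, &queue );

    magma_ssetmatrix( m, n, hA, lda, dA, ldda, queue );
    magma_sgelqf_gpu( m, n, dA, ldda, tau, work, lwork, &info );
    // L now occupies the lower trapezoid of dA; copy back with magma_sgetmatrix if needed

    magma_queue_destroy( queue );
    magma_free( dA );
    magma_free_pinned( work );
    magma_free_cpu( tau );
    return info;
}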
extern "C" magma_int_t magma_zgeqrf(magma_int_t m, magma_int_t n, cuDoubleComplex *a, magma_int_t lda, cuDoubleComplex *tau, cuDoubleComplex *work, magma_int_t lwork, magma_int_t *info ) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A: A = Q * R. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). TAU (output) COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). WORK (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. Higher performance is achieved if WORK is in pinned memory, e.g. allocated using magma_malloc_pinned. LWORK (input) INTEGER The dimension of the array WORK. LWORK >= N*NB, where NB can be obtained through magma_get_zgeqrf_nb(M). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1)) #define da_ref(a_1,a_2) (da+(a_2)*ldda + (a_1)) cuDoubleComplex *da, *dwork; cuDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t i, k, lddwork, old_i, old_ib; magma_int_t ib, ldda; /* Function Body */ *info = 0; magma_int_t nb = magma_get_zgeqrf_nb(min(m, n)); magma_int_t lwkopt = n * nb; work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 ); int lquery = (lwork == -1); if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < max(1,n) && ! 
lquery) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { work[0] = c_one; return *info; } lddwork = ((n+31)/32)*32; ldda = ((m+31)/32)*32; magma_int_t num_gpus = magma_num_gpus(); if( num_gpus > 1 ) { /* call multiple-GPU interface */ return magma_zgeqrf4(num_gpus, m, n, a, lda, tau, work, lwork, info); } if (MAGMA_SUCCESS != magma_zmalloc( &da, (n)*ldda + nb*lddwork )) { /* Switch to the "out-of-core" (out of GPU-memory) version */ return magma_zgeqrf_ooc(m, n, a, lda, tau, work, lwork, info); } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); dwork = da + ldda*(n); if ( (nb > 1) && (nb < k) ) { /* Use blocked code initially */ magma_zsetmatrix_async( (m), (n-nb), a_ref(0,nb), lda, da_ref(0,nb), ldda, stream[0] ); old_i = 0; old_ib = nb; for (i = 0; i < k-nb; i += nb) { ib = min(k-i, nb); if (i>0){ magma_zgetmatrix_async( (m-i), ib, da_ref(i,i), ldda, a_ref(i,i), lda, stream[1] ); magma_zgetmatrix_async( i, ib, da_ref(0,i), ldda, a_ref(0,i), lda, stream[0] ); /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, da_ref(old_i, old_i), ldda, dwork, lddwork, da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork); } magma_queue_sync( stream[1] ); magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, a_ref(i,i), &lda, tau+i, work, &ib); zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib); magma_zsetmatrix( rows, ib, a_ref(i,i), lda, da_ref(i,i), ldda ); zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, work+ib*ib); if (i + ib < n) { magma_zsetmatrix( ib, ib, work, ib, dwork, lddwork ); if (i+ib < k-nb) /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); else magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, da_ref(i, i ), ldda, dwork, lddwork, da_ref(i, i+ib), ldda, dwork+ib, lddwork); old_i = i; old_ib = ib; } } } else { i = 0; } /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; if (i!=0) magma_zgetmatrix( m, ib, da_ref(0,i), ldda, a_ref(0,i), lda ); magma_int_t rows = m-i; lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info); } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( da ); return *info; } /* magma_zgeqrf */
/* //////////////////////////////////////////////////////////////////////////// -- Testing zhetrd_he2hb */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_time, gpu_perf; magmaDoubleComplex *h_A, *h_R, *h_work; magmaDoubleComplex *tau; double *D, *E; magma_int_t N, n2, lda, ldda, lwork, ldt, info, nstream; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; // TODO add these options to parse_opts magma_int_t NE = 0; magma_int_t distblk = 0; magma_opts opts; opts.parse_opts( argc, argv ); magma_int_t WANTZ = (opts.jobz == MagmaVec); double tol = opts.tolerance * lapackf77_dlamch("E"); if (opts.nb == 0) opts.nb = 64; //magma_get_zhetrd_he2hb_nb(N); if (NE < 1) NE = N; //64; //magma_get_zhetrd_he2hb_nb(N); nstream = max(3, opts.ngpu+2); magma_queue_t streams[MagmaMaxGPUs][20]; magmaDoubleComplex_ptr da[MagmaMaxGPUs], dT1[MagmaMaxGPUs]; if ((distblk == 0) || (distblk < opts.nb)) distblk = max(256, opts.nb); printf("%% ngpu %d, distblk %d, NB %d, nstream %d\n", (int) opts.ngpu, (int) distblk, (int) opts.nb, (int) nstream); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); for( int i = 0; i < nstream; ++i ) { magma_queue_create( &streams[dev][i] ); } } magma_setdevice( 0 ); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; lda = N; ldt = N; ldda = magma_roundup( N, opts.align ); // multiple of 32 by default n2 = lda*N; /* We suppose the magma NB is bigger than lapack NB */ lwork = N*opts.nb; //gflops = ....? /* Allocate host memory for the matrix */ TESTING_MALLOC_CPU( tau, magmaDoubleComplex, N-1 ); TESTING_MALLOC_PIN( h_A, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_R, magmaDoubleComplex, lda*N ); TESTING_MALLOC_PIN( h_work, magmaDoubleComplex, lwork ); TESTING_MALLOC_PIN( D, double, N ); TESTING_MALLOC_PIN( E, double, N ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_int_t mlocal = ((N / distblk) / opts.ngpu + 1) * distblk; magma_setdevice( dev ); TESTING_MALLOC_DEV( da[dev], magmaDoubleComplex, ldda*mlocal ); TESTING_MALLOC_DEV( dT1[dev], magmaDoubleComplex, N*opts.nb ); } /* ==================================================================== Initialize the matrix =================================================================== */ lapackf77_zlarnv( &ione, ISEED, &n2, h_A ); magma_zmake_hermitian( N, h_A, lda ); lapackf77_zlacpy( MagmaUpperLowerStr, &N, &N, h_A, &lda, h_R, &lda ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ /* Copy the matrix to the GPU */ magma_zsetmatrix_1D_col_bcyclic( N, N, h_R, lda, da, ldda, opts.ngpu, distblk); //magmaDoubleComplex_ptr dabis; //TESTING_MALLOC_DEV( dabis, magmaDoubleComplex, ldda*N ); //magma_zsetmatrix(N, N, h_R, lda, dabis, ldda); for (int count=0; count < 1; ++count) { magma_setdevice(0); gpu_time = magma_wtime(); if (opts.version == 30) { // see src/obsolete and magmablas/obsolete printf( "magma_zhetrd_he2hb_mgpu_spec not compiled\n" ); //magma_zhetrd_he2hb_mgpu_spec( // opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, // da, ldda, dT1, opts.nb, opts.ngpu, distblk, // streams, nstream, opts.nthread, &info); } else { nstream = 3; magma_zhetrd_he2hb_mgpu( opts.uplo, N, opts.nb, h_R, lda, tau, h_work, lwork, da, ldda, dT1, opts.nb, opts.ngpu, distblk, streams, nstream, opts.nthread, &info); } // magma_zhetrd_he2hb(opts.uplo, N, opts.nb, h_R, 
lda, tau, h_work, lwork, dT1[0], &info); gpu_time = magma_wtime() - gpu_time; printf(" Finish BAND N %d NB %d dist %d ngpu %d version %d timing= %f\n", N, opts.nb, distblk, opts.ngpu, opts.version, gpu_time); } magma_setdevice(0); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice(dev); magma_device_sync(); } magma_setdevice(0); magmablasSetKernelStream( NULL ); // todo neither of these is declared in headers // magma_zhetrd_bhe2trc_v5(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // magma_zhetrd_bhe2trc(opts.nthread, WANTZ, opts.uplo, NE, N, opts.nb, h_R, lda, D, E, dT1[0], ldt); // todo where is this timer started? // gpu_time = magma_wtime() - gpu_time; // todo what are the gflops? gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_zhetrd_he2hb returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Print performance and error. =================================================================== */ #if defined(CHECKEIG) #if defined(PRECISION_z) || defined(PRECISION_d) if ( opts.check ) { printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, gpu_perf, gpu_time ); double nrmI=0.0, nrm1=0.0, nrm2=0.0; int lwork2 = 256*N; magmaDoubleComplex *work2, *AINIT; double *rwork2, *D2; // TODO free this memory ! magma_zmalloc_cpu( &work2, lwork2 ); magma_dmalloc_cpu( &rwork2, N ); magma_dmalloc_cpu( &D2, N ); magma_zmalloc_cpu( &AINIT, N*lda ); memcpy(AINIT, h_A, N*lda*sizeof(magmaDoubleComplex)); /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); int nt = min(12, opts.nthread); magma_set_lapack_numthreads(nt); lapackf77_zheev( "N", "L", &N, h_A, &lda, D2, work2, &lwork2, #ifdef COMPLEX rwork2, #endif &info ); ///* call eigensolver for our resulting tridiag [D E] and for Q */ //dstedc_withZ('V', N, D, E, h_R, lda); ////dsterf_( &N, D, E, &info); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - EIGEN timing= %f threads %d\n", cpu_time, nt); /* compare result */ cmp_vals(N, D2, D, &nrmI, &nrm1, &nrm2); magmaDoubleComplex *WORKAJETER; double *RWORKAJETER, *RESU; // TODO free this memory ! 
magma_zmalloc_cpu( &WORKAJETER, (2* N * N + N) ); magma_dmalloc_cpu( &RWORKAJETER, N ); magma_dmalloc_cpu( &RESU, 10 ); int MATYPE; memset(RESU, 0, 10*sizeof(double)); MATYPE=3; double NOTHING=0.0; cpu_time = magma_wtime(); // check results zcheck_eig_( lapack_vec_const(opts.jobz), &MATYPE, &N, &opts.nb, AINIT, &lda, &NOTHING, &NOTHING, D2, D, h_R, &lda, WORKAJETER, RWORKAJETER, RESU ); cpu_time = magma_wtime() - cpu_time; printf(" Finish CHECK - results timing= %f\n", cpu_time); magma_set_lapack_numthreads(1); printf("\n"); printf(" ================================================================================================================\n"); printf(" ==> INFO voici threads=%d N=%d NB=%d WANTZ=%d\n", (int) opts.nthread, (int) N, (int) opts.nb, (int) WANTZ); printf(" ================================================================================================================\n"); printf(" DSBTRD : %15s \n", "STATblgv9withQ "); printf(" ================================================================================================================\n"); if (WANTZ > 0) printf(" | A - U S U' | / ( |A| n ulp ) : %15.3E \n", RESU[0]); if (WANTZ > 0) printf(" | I - U U' | / ( n ulp ) : %15.3E \n", RESU[1]); printf(" | D1 - EVEIGS | / (|D| ulp) : %15.3E \n", RESU[2]); printf(" max | D1 - EVEIGS | : %15.3E \n", RESU[6]); printf(" ================================================================================================================\n\n\n"); printf(" ****************************************************************************************************************\n"); printf(" * Hello here are the norm Infinite (max)=%8.2e norm one (sum)=%8.2e norm2(sqrt)=%8.2e *\n", nrmI, nrm1, nrm2); printf(" ****************************************************************************************************************\n\n"); } #endif // PRECISION_z || PRECISION_d #endif // CHECKEIG printf(" Total N %5d flops %6.2f timing %6.2f seconds\n", (int) N, 0.0, gpu_time ); printf("%%===========================================================================\n\n\n"); TESTING_FREE_CPU( tau ); TESTING_FREE_PIN( h_A ); TESTING_FREE_PIN( h_R ); TESTING_FREE_PIN( h_work ); TESTING_FREE_PIN( D ); TESTING_FREE_PIN( E ); for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { magma_setdevice( dev ); TESTING_FREE_DEV( da[dev] ); TESTING_FREE_DEV( dT1[dev] ); } magma_setdevice( 0 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } for( magma_int_t dev = 0; dev < opts.ngpu; ++dev ) { for( int i = 0; i < nstream; ++i ) { magma_queue_destroy( streams[dev][i] ); } } TESTING_FINALIZE(); return status; }
/** Purpose ------- SLAUUM computes the product U * U' or L' * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. If UPLO = MagmaUpper then the upper triangle of the result is stored, overwriting the factor U in A. If UPLO = MagmaLower then the lower triangle of the result is stored, overwriting the factor L in A. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the triangular factor stored in the array A is upper or lower triangular: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the triangular factor U or L. N >= 0. @param[in,out] A REAL array, dimension (LDA,N) On entry, the triangular factor U or L. On exit, if UPLO = MagmaUpper, the upper triangle of A is overwritten with the upper triangle of the product U * U'; if UPLO = MagmaLower, the lower triangle of A is overwritten with the lower triangle of the product L' * L. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value @ingroup magma_sposv_aux ***************************************************************************/ extern "C" magma_int_t magma_slauum( magma_uplo_t uplo, magma_int_t n, float *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ldda, nb; magma_int_t i, ib; float c_one = MAGMA_S_ONE; float d_one = MAGMA_D_ONE; float *dA; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_smalloc( &dA, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nb = magma_get_spotrf_nb(n); if (nb <= 1 || nb >= n) lapackf77_slauum(uplo_, &n, A, &lda, info); else { if (upper) { /* Compute the product U * U'. */ for (i=0; i < n; i += nb) { ib=min(nb,n-i); magma_ssetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_ssetmatrix_async( ib, (n-i-ib), A(i,i+ib), lda, dA(i,i+ib), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_strmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0, i),ldda); lapackf77_slauum(MagmaUpperStr, &ib, A(i,i), &lda, info); magma_ssetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_sgemm( MagmaNoTrans, MagmaConjTrans, i, ib, (n-i-ib), c_one, dA(0,i+ib), ldda, dA(i, i+ib),ldda, c_one, dA(0,i), ldda); magma_queue_sync( stream[0] ); magma_ssyrk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib), d_one, dA(i, i+ib), ldda, d_one, dA(i, i), ldda); } magma_sgetmatrix( i+ib, ib, dA(0, i), ldda, A(0, i), lda ); } } else { /* Compute the product L' * L. 
*/ for (i=0; i < n; i += nb) { ib=min(nb,n-i); magma_ssetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_ssetmatrix_async( (n-i-ib), ib, A(i+ib, i), lda, dA(i+ib, i), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_strmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i, 0),ldda); lapackf77_slauum(MagmaLowerStr, &ib, A(i,i), &lda, info); magma_ssetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_sgemm(MagmaConjTrans, MagmaNoTrans, ib, i, (n-i-ib), c_one, dA( i+ib,i), ldda, dA(i+ib, 0),ldda, c_one, dA(i,0), ldda); magma_queue_sync( stream[0] ); magma_ssyrk(MagmaLower, MagmaConjTrans, ib, (n-i-ib), d_one, dA(i+ib, i), ldda, d_one, dA(i, i), ldda); } magma_sgetmatrix( ib, i+ib, dA(i, 0), ldda, A(i, 0), lda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); return *info; }
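/*
    Usage sketch (illustrative only): the classical POTRI use of LAUUM. If the caller has already replaced the
    Cholesky factor U of a symmetric positive definite A with inv(U) (e.g. via strtri), then the product
    inv(U) * inv(U)' formed by the routine above overwrites the upper triangle of A with the upper triangle of
    inv(A). The preceding spotrf/strtri steps are assumed to have been done by the caller.
*/
static magma_int_t example_slauum( magma_int_t n, float *A, magma_int_t lda )
{
    magma_int_t info = 0;
    // upper triangle of A holds a triangular factor (here: inv(U));
    // on return it holds the upper triangle of inv(U) * inv(U)**T = inv(A)
    magma_slauum( MagmaUpper, n, A, lda, &info );
    return info;
}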
/** Purpose ------- DLABRD reduces the first NB rows and columns of a real general m by n matrix A to upper or lower bidiagonal form by an orthogonal transformation Q' * A * P, and returns the matrices X and Y which are needed to apply the transformation to the unreduced part of A. If m >= n, A is reduced to upper bidiagonal form; if m < n, to lower bidiagonal form. This is an auxiliary routine called by DGEBRD. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. @param[in] n INTEGER The number of columns in the matrix A. @param[in] nb INTEGER The number of leading rows and columns of A to be reduced. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the m by n general matrix to be reduced. On exit, the first NB rows and columns of the matrix are overwritten; the rest of the array is unchanged. If m >= n, elements on and below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors; and elements above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. \n If m < n, elements below the diagonal in the first NB columns, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and elements on and above the diagonal in the first NB rows, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] dA DOUBLE_PRECISION array, dimension (LDDA,N) Copy of A on GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] d DOUBLE_PRECISION array, dimension (NB) The diagonal elements of the first NB rows and columns of the reduced matrix. D(i) = A(i,i). @param[out] e DOUBLE_PRECISION array, dimension (NB) The off-diagonal elements of the first NB rows and columns of the reduced matrix. @param[out] tauq DOUBLE_PRECISION array dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup DOUBLE_PRECISION array, dimension (NB) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] X DOUBLE_PRECISION array, dimension (LDX,NB) The m-by-nb matrix X required to update the unreduced part of A. @param[in] ldx INTEGER The leading dimension of the array X. LDX >= M. @param[out] dX DOUBLE_PRECISION array, dimension (LDDX,NB) Copy of X on GPU. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= M. @param[out] Y DOUBLE_PRECISION array, dimension (LDY,NB) The n-by-nb matrix Y required to update the unreduced part of A. @param[in] ldy INTEGER The leading dimension of the array Y. LDY >= N. @param[out] dY DOUBLE_PRECISION array, dimension (LDDY,NB) Copy of Y on GPU. @param[in] lddy INTEGER The leading dimension of the array dY. LDDY >= N. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: Q = H(1) H(2) . . . H(nb) and P = G(1) G(2) . . . G(nb) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are real scalars, and v and u are real vectors. If m >= n, v(1:i-1) = 0, v(i) = 1, and v(i:m) is stored on exit in A(i:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). 
If m < n, v(1:i) = 0, v(i+1) = 1, and v(i+1:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). The elements of the vectors v and u together form the m-by-nb matrix V and the nb-by-n matrix U' which are needed, with X and Y, to apply the transformation to the unreduced part of the matrix, using a block update of the form: A := A - V*Y' - X*U'. The contents of A on exit are illustrated by the following examples with nb = 2: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( 1 1 u1 u1 u1 ) ( 1 u1 u1 u1 u1 u1 ) ( v1 1 1 u2 u2 ) ( 1 1 u2 u2 u2 u2 ) ( v1 v2 a a a ) ( v1 1 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) ( v1 v2 a a a a ) ( v1 v2 a a a ) @endverbatim where a denotes an element of the original matrix which is unchanged, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_dgesvd_aux ********************************************************************/ extern "C" magma_int_t magma_dlabrd_gpu( magma_int_t m, magma_int_t n, magma_int_t nb, double *A, magma_int_t lda, double *dA, magma_int_t ldda, double *d, double *e, double *tauq, double *taup, double *X, magma_int_t ldx, double *dX, magma_int_t lddx, double *Y, magma_int_t ldy, double *dY, magma_int_t lddy) { #define A(i_,j_) (A + (i_) + (j_)*lda) #define X(i_,j_) (X + (i_) + (j_)*ldx) #define Y(i_,j_) (Y + (i_) + (j_)*ldy) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) #define dY(i_,j_) (dY + (i_) + (j_)*lddy) #define dX(i_,j_) (dX + (i_) + (j_)*lddx) double c_neg_one = MAGMA_D_NEG_ONE; double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t ione = 1; magma_int_t i__2, i__3; magma_int_t i; double alpha; A -= 1 + lda; X -= 1 + ldx; dX -= 1 + lddx; Y -= 1 + ldy; dY -= 1 + lddy; --d; --e; --tauq; --taup; /* Quick return if possible */ magma_int_t info = 0; if (m <= 0 || n <= 0) { return info; } double *f; magma_queue_t stream; magma_queue_create( &stream ); magma_dmalloc_cpu( &f, max(n,m) ); if ( f == NULL ) { info = MAGMA_ERR_HOST_ALLOC; return info; } if (m >= n) { /* Reduce to upper bidiagonal form */ for (i = 1; i <= nb; ++i) { /* Update A(i:m,i) */ i__2 = m - i + 1; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i,1), &lda, Y(i,1), &ldy, &c_one, A(i,i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i,1), &ldx, A(1,i), &ione, &c_one, A(i,i), &ione ); /* Generate reflection Q(i) to annihilate A(i+1:m,i) */ alpha = *A(i,i); i__2 = m - i + 1; i__3 = i + 1; lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] ); d[i] = MAGMA_D_REAL( alpha ); if (i < n) { *A(i,i) = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i + 1; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, A(i,i), 1, dA(i-1,i-1), 1 ); // 2. Multiply --------------------------------------------- magma_dgemv( MagmaConjTrans, i__2, i__3, c_one, dA(i-1,i), ldda, dA(i-1,i-1), ione, c_zero, dY(i+1,i), ione ); // 3. 
Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream ); i__2 = m - i + 1; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, A(i,1), &lda, A(i,i), &ione, &c_zero, Y(1,i), &ione ); i__2 = n - i; i__3 = i - 1; blasf77_dgemv( "N", &i__2, &i__3, &c_neg_one, Y(i+1,1), &ldy, Y(1,i), &ione, &c_zero, f, &ione ); i__2 = m - i + 1; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, X(i,1), &ldx, A(i,i), &ione, &c_zero, Y(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3 != 0) { i__2 = n - i; blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione ); } i__2 = i - 1; i__3 = n - i; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i+1), &lda, Y(1,i), &ione, &c_one, Y(i+1,i), &ione ); i__2 = n - i; blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione ); /* Update A(i,i+1:n) */ i__2 = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, A(i,i+1), &lda ); lapackf77_dlacgv( &i, A(i,1), &lda ); #endif blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one, Y(i+1,1), &ldy, A(i,1), &lda, &c_one, A(i,i+1), &lda ); i__2 = i - 1; i__3 = n - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i, A(i,1), &lda ); lapackf77_dlacgv( &i__2, X(i,1), &ldx ); #endif blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i+1), &lda, X(i,1), &ldx, &c_one, A(i,i+1), &lda ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, X(i,1), &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+2:n) */ i__2 = n - i; i__3 = i + 2; alpha = *A(i,i+1); lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] ); e[i] = MAGMA_D_REAL( alpha ); *A(i,i+1) = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__3, A(i,i+1), lda, dA(i-1,i), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy( i__3, dA(i-1,i), ldda, dY(1,1), 1 ); magma_dgemv( MagmaNoTrans, i__2, i__3, c_one, dA(i,i), ldda, dA(i-1,i), ldda, //dY(1,1), 1, c_zero, dX(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream ); i__2 = n - i; blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one, Y(i+1,1), &ldy, A(i,i+1), &lda, &c_zero, X(1,i), &ione ); i__2 = m - i; blasf77_dgemv( "N", &i__2, &i, &c_neg_one, A(i+1,1), &lda, X(1,i), &ione, &c_zero, f, &ione ); i__2 = i - 1; i__3 = n - i; blasf77_dgemv( "N", &i__2, &i__3, &c_one, A(1,i+1), &lda, A(i,i+1), &lda, &c_zero, X(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i != 0) { i__2 = m - i; blasf77_daxpy( &i__2, &c_one, f, &ione, X(i+1,i), &ione ); } i__2 = m - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i+1,1), &ldx, X(1,i), &ione, &c_one, X(i+1,i), &ione ); i__2 = m - i; blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione ); #if defined(PRECISION_z) || defined(PRECISION_c) i__2 = n - i; lapackf77_dlacgv( &i__2, A(i,i+1), &lda ); // 4. 
Send the block reflector A(i+1:m,i) to the GPU after DLACGV() magma_dsetvector( i__2, A(i,i+1), lda, dA(i-1,i), ldda ); #endif } } } else { /* Reduce to lower bidiagonal form */ for (i = 1; i <= nb; ++i) { /* Update A(i,i:n) */ i__2 = n - i + 1; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, A(i,i), &lda ); lapackf77_dlacgv( &i__3, A(i,1), &lda ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, Y(i,1), &ldy, A(i,1), &lda, &c_one, A(i,i), &lda ); i__2 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, A(i,1), &lda ); lapackf77_dlacgv( &i__3, X(i,1), &ldx ); #endif i__3 = n - i + 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_neg_one, A(1,i), &lda, X(i,1), &ldx, &c_one, A(i,i), &lda ); #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, X(i,1), &ldx ); #endif /* Generate reflection P(i) to annihilate A(i,i+1:n) */ i__2 = n - i + 1; i__3 = i + 1; alpha = *A(i,i); lapackf77_dlarfg( &i__2, &alpha, A(i,min(i__3,n)), &lda, &taup[i] ); d[i] = MAGMA_D_REAL( alpha ); if (i < m) { *A(i,i) = c_one; /* Compute X(i+1:m,i) */ i__2 = m - i; i__3 = n - i + 1; // 1. Send the block reflector A(i,i+1:n) to the GPU ------ magma_dsetvector( i__3, A(i,i), lda, dA(i-1,i-1), ldda ); // 2. Multiply --------------------------------------------- //magma_dcopy( i__3, dA(i-1,i-1), ldda, dY(1,1), 1 ); magma_dgemv( MagmaNoTrans, i__2, i__3, c_one, dA(i,i-1), ldda, dA(i-1,i-1), ldda, //dY(1,1), 1, c_zero, dX(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__2, 1, dX(i+1,i), lddx, X(i+1,i), ldx, stream ); i__2 = n - i + 1; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, Y(i,1), &ldy, A(i,i), &lda, &c_zero, X(1,i), &ione ); i__2 = m - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i+1,1), &lda, X(1,i), &ione, &c_zero, f, &ione ); i__2 = i - 1; i__3 = n - i + 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_one, A(1,i), &lda, A(i,i), &lda, &c_zero, X(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__2 != 0) { i__3 = m - i; blasf77_daxpy( &i__3, &c_one, f, &ione, X(i+1,i), &ione ); } i__2 = m - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, X(i+1,1), &ldx, X(1,i), &ione, &c_one, X(i+1,i), &ione ); i__2 = m - i; blasf77_dscal( &i__2, &taup[i], X(i+1,i), &ione ); i__2 = n - i + 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__2, A(i,i), &lda ); magma_dsetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda ); #endif /* Update A(i+1:m,i) */ i__2 = m - i; i__3 = i - 1; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, A(i+1,1), &lda, Y(i,1), &ldy, &c_one, A(i+1,i), &ione ); i__2 = m - i; #if defined(PRECISION_z) || defined(PRECISION_c) lapackf77_dlacgv( &i__3, Y(i,1), &ldy ); #endif blasf77_dgemv( "No transpose", &i__2, &i, &c_neg_one, X(i+1,1), &ldx, A(1,i), &ione, &c_one, A(i+1,i), &ione ); /* Generate reflection Q(i) to annihilate A(i+2:m,i) */ i__2 = m - i; i__3 = i + 2; alpha = *A(i+1,i); lapackf77_dlarfg( &i__2, &alpha, A(min(i__3,m),i), &ione, &tauq[i] ); e[i] = MAGMA_D_REAL( alpha ); *A(i+1,i) = c_one; /* Compute Y(i+1:n,i) */ i__2 = m - i; i__3 = n - i; // 1. Send the block reflector A(i+1:m,i) to the GPU ------ magma_dsetvector( i__2, A(i+1,i), 1, dA(i,i-1), 1 ); // 2. 
Multiply --------------------------------------------- magma_dgemv( MagmaConjTrans, i__2, i__3, c_one, dA(i,i), ldda, dA(i,i-1), ione, c_zero, dY(i+1,i), ione ); // 3. Put the result back ---------------------------------- magma_dgetmatrix_async( i__3, 1, dY(i+1,i), lddy, Y(i+1,i), ldy, stream ); i__2 = m - i; i__3 = i - 1; blasf77_dgemv( MagmaConjTransStr, &i__2, &i__3, &c_one, A(i+1,1), &lda, A(i+1,i), &ione, &c_zero, Y(1,i), &ione ); i__2 = n - i; i__3 = i - 1; blasf77_dgemv( "No transpose", &i__2, &i__3, &c_neg_one, Y(i+1,1), &ldy, Y(1,i), &ione, &c_zero, f, &ione ); i__2 = m - i; blasf77_dgemv( MagmaConjTransStr, &i__2, &i, &c_one, X(i+1,1), &ldx, A(i+1,i), &ione, &c_zero, Y(1,i), &ione ); // 4. Sync to make sure the result is back ---------------- magma_queue_sync( stream ); if (i__3 != 0) { i__2 = n - i; blasf77_daxpy( &i__2, &c_one, f, &ione, Y(i+1,i), &ione ); } i__2 = n - i; blasf77_dgemv( MagmaConjTransStr, &i, &i__2, &c_neg_one, A(1,i+1), &lda, Y(1,i), &ione, &c_one, Y(i+1,i), &ione ); i__2 = n - i; blasf77_dscal( &i__2, &tauq[i], Y(i+1,i), &ione ); } #if defined(PRECISION_z) || defined(PRECISION_c) else { i__2 = n - i + 1; lapackf77_dlacgv( &i__2, A(i,i), &lda ); magma_dsetvector( i__2, A(i,i), lda, dA(i-1,i-1), ldda ); } #endif } } magma_queue_destroy( stream ); magma_free_cpu( f ); return info; } /* magma_dlabrd_gpu */
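/* Illustrative sketch (not part of the library) of the CPU/GPU overlap pattern
   used in the panel loops above: the Householder vector is pushed to the GPU,
   the large matrix-vector product runs there, the result is copied back
   asynchronously while the CPU performs its small corrections, and a queue
   sync joins the two before the pieces are combined.  All names (dM, dv, dy,
   corr, ...) are hypothetical; the usual MAGMA/BLAS headers are assumed to be
   included by this file, and the small CPU-side GEMVs are elided. */
static void example_overlap_gemv(
    magma_int_t m, magma_int_t n,
    const double *v,                      // host copy of the reflector, length m
    double *dM, magma_int_t lddm,         // m-by-n matrix already on the GPU
    double *dv,                           // device buffer, length m
    double *dy,                           // device buffer, length n
    double *y,                            // host result, length n
    double *corr,                         // small CPU-side correction, length n
    magma_queue_t queue )
{
    const double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO;
    const magma_int_t ione = 1;

    // 1. send the reflector to the GPU
    magma_dsetvector( m, v, 1, dv, 1 );
    // 2. large product y = M^T * v on the GPU
    magma_dgemv( MagmaTrans, m, n, c_one, dM, lddm, dv, 1, c_zero, dy, 1 );
    // 3. start copying the result back while the CPU works
    magma_dgetmatrix_async( n, 1, dy, n, y, n, queue );
    //    ... small CPU-side GEMVs producing corr would run here, overlapping
    //        with the transfer ...
    // 4. wait for the transfer, then fold the CPU correction into y
    magma_queue_sync( queue );
    blasf77_daxpy( &n, &c_one, corr, &ione, y, &ione );
}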
/** Purpose ------- CHEEVDX_GPU computes selected eigenvalues and, optionally, eigenvectors of a complex Hermitian matrix A. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. If eigenvectors are desired, it uses a divide and conquer algorithm. The divide and conquer algorithm makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA, N). On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns of A contains the required orthonormal eigenvectors of the matrix A. If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] ldda INTEGER The leading dimension of the array DA. LDDA >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N. If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) If INFO = 0, the required m eigenvalues in ascending order. @param wA (workspace) COMPLEX array, dimension (LDWA, N) @param[in] ldwa INTEGER The leading dimension of the array wA. LDWA >= max(1,N). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. If N <= 1, LWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB. If JOBZ = MagmaVec and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ). NB can be obtained through magma_get_chetrd_nb(N). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. 
@param[out] rwork (workspace) REAL array, dimension (LRWORK) On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK. @param[in] lrwork INTEGER The dimension of the array RWORK. If N <= 1, LRWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LRWORK >= N. If JOBZ = MagmaVec and N > 1, LRWORK >= 1 + 5*N + 2*N**2. \n If LRWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] iwork (workspace) INTEGER array, dimension (MAX(1,LIWORK)) On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK. @param[in] liwork INTEGER The dimension of the array IWORK. If N <= 1, LIWORK >= 1. If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1. If JOBZ = MagmaVec and N > 1, LIWORK >= 3 + 5*N. \n If LIWORK = -1, then a workspace query is assumed; the routine only calculates the optimal sizes of the WORK, RWORK and IWORK arrays, returns these values as the first entries of the WORK, RWORK and IWORK arrays, and no error message related to LWORK or LRWORK or LIWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed to converge; i off-diagonal elements of an intermediate tridiagonal form did not converge to zero; if INFO = i and JOBZ = MagmaVec, then the algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified description of INFO. Sven, 16 Feb 05. @ingroup magma_cheev_driver ********************************************************************/ extern "C" magma_int_t magma_cheevdx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *m, float *w, magmaFloatComplex *wA, magma_int_t ldwa, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t lrwork, magma_int_t *iwork, magma_int_t liwork, magma_int_t *info) { const char* uplo_ = lapack_uplo_const( uplo ); const char* jobz_ = lapack_vec_const( jobz ); magma_int_t ione = 1; float d__1; float eps; magma_int_t inde; float anrm; magma_int_t imax; float rmin, rmax; float sigma; magma_int_t iinfo, lwmin; magma_int_t lower; magma_int_t llrwk; magma_int_t wantz; magma_int_t indwk2, llwrk2; magma_int_t iscale; float safmin; float bignum; magma_int_t indtau; magma_int_t indrwk, indwrk, liwmin; magma_int_t lrwmin, llwork; float smlnum; magma_int_t lquery; magma_int_t alleig, valeig, indeig; float *dwork; magmaFloatComplex *dC; magma_int_t lddc = ldda; wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1 || lrwork == -1 || liwork == -1); *info = 0; if (! (wantz || (jobz == MagmaNoVec))) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! 
(lower || (uplo == MagmaUpper))) { *info = -3; } else if (n < 0) { *info = -4; } else if (ldda < max(1,n)) { *info = -6; } else if (ldwa < max(1,n)) { *info = -14; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -8; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -9; } else if (iu < min(n,il) || iu > n) { *info = -10; } } } magma_int_t nb = magma_get_chetrd_nb( n ); if ( n <= 1 ) { lwmin = 1; lrwmin = 1; liwmin = 1; } else if ( wantz ) { lwmin = max( n + n*nb, 2*n + n*n ); lrwmin = 1 + 5*n + 2*n*n; liwmin = 3 + 5*n; } else { lwmin = n + n*nb; lrwmin = n; liwmin = 1; } // multiply by 1+eps (in Double!) to ensure length gets rounded up, // if it cannot be exactly represented in floating point. real_Double_t one_eps = 1. + lapackf77_slamch("Epsilon"); work[0] = MAGMA_C_MAKE( lwmin * one_eps, 0.); rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; if ((lwork < lwmin) && !lquery) { *info = -16; } else if ((lrwork < lrwmin) && ! lquery) { *info = -18; } else if ((liwork < liwmin) && ! lquery) { *info = -20; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */ if (n <= 128) { #ifdef ENABLE_DEBUG printf("--------------------------------------------------------------\n"); printf(" warning matrix too small N=%d NB=%d, calling lapack on CPU \n", (int) n, (int) nb); printf("--------------------------------------------------------------\n"); #endif magmaFloatComplex *A; magma_cmalloc_cpu( &A, n*n ); magma_cgetmatrix(n, n, dA, ldda, A, n); lapackf77_cheevd(jobz_, uplo_, &n, A, &n, w, work, &lwork, rwork, &lrwork, iwork, &liwork, info); magma_csetmatrix( n, n, A, n, dA, ldda); magma_free_cpu(A); *m=n; return *info; } magma_queue_t stream; magma_queue_create( &stream ); // dC and dwork are never used together, so use one buffer for both; // unfortunately they're different types (complex and float). // (this works better in dsyevd_gpu where they're both float). // n*lddc for chetrd2_gpu, *2 for complex // n for clanhe magma_int_t ldwork = n*lddc*2; if ( wantz ) { // need 3n^2/2 for cstedx ldwork = max( ldwork, 3*n*(n/2 + 1) ); } if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dC = (magmaFloatComplex*) dwork; /* Get machine constants. */ safmin = lapackf77_slamch("Safe minimum"); eps = lapackf77_slamch("Precision"); smlnum = safmin / eps; bignum = 1. / smlnum; rmin = magma_ssqrt(smlnum); rmax = magma_ssqrt(bignum); /* Scale matrix to allowable range, if necessary. */ anrm = magmablas_clanhe(MagmaMaxNorm, uplo, n, dA, ldda, dwork); iscale = 0; sigma = 1; if (anrm > 0. && anrm < rmin) { iscale = 1; sigma = rmin / anrm; } else if (anrm > rmax) { iscale = 1; sigma = rmax / anrm; } if (iscale == 1) { magmablas_clascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info); } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. 
*/ // chetrd rwork: e (n) // cstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2) ==> 1 + 5n + 2n^2 inde = 0; indrwk = inde + n; llrwk = lrwork - indrwk; // chetrd work: tau (n) + llwork (n*nb) ==> n + n*nb // cstedx work: tau (n) + z (n^2) // cunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb) ==> 2n + n^2, or n + n*nb + n^2 indtau = 0; indwrk = indtau + n; indwk2 = indwrk + n*n; llwork = lwork - indwrk; llwrk2 = lwork - indwk2; magma_timer_t time=0; timer_start( time ); #ifdef FAST_HEMV magma_chetrd2_gpu(uplo, n, dA, ldda, w, &rwork[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, dC, n*lddc, &iinfo); #else magma_chetrd_gpu (uplo, n, dA, ldda, w, &rwork[inde], &work[indtau], wA, ldwa, &work[indwrk], llwork, &iinfo); #endif timer_stop( time ); timer_printf( "time chetrd_gpu = %6.2f\n", time ); /* For eigenvalues only, call SSTERF. For eigenvectors, first call CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the tridiagonal matrix, then call CUNMTR to multiply it to the Householder transformations represented as Householder vectors in A. */ if (! wantz) { lapackf77_ssterf(&n, w, &rwork[inde], info); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); } else { timer_start( time ); magma_cstedx(range, n, vl, vu, il, iu, w, &rwork[inde], &work[indwrk], n, &rwork[indrwk], llrwk, iwork, liwork, dwork, info); timer_stop( time ); timer_printf( "time cstedx = %6.2f\n", time ); timer_start( time ); magma_smove_eig(range, n, w, &il, &iu, vl, vu, m); magma_csetmatrix( n, *m, &work[indwrk + n * (il-1) ], n, dC, lddc ); magma_cunmtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau], dC, lddc, wA, ldwa, &iinfo); magma_ccopymatrix( n, *m, dC, lddc, dA, ldda ); timer_stop( time ); timer_printf( "time cunmtr_gpu + copy = %6.2f\n", time ); } /* If matrix was scaled, then rescale eigenvalues appropriately. */ if (iscale == 1) { if (*info == 0) { imax = n; } else { imax = *info - 1; } d__1 = 1. / sigma; blasf77_sscal(&imax, &d__1, w, &ione); } work[0] = MAGMA_C_MAKE( lwmin * one_eps, 0.); // round up rwork[0] = lrwmin * one_eps; iwork[0] = liwmin; magma_queue_destroy( stream ); magma_free( dwork ); return *info; } /* magma_cheevdx_gpu */
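/* Illustrative (non-authoritative) usage sketch for magma_cheevdx_gpu: a
   workspace query with lwork = lrwork = liwork = -1, followed by the actual
   call computing eigenvalues IL..IU of a Hermitian matrix already resident on
   the GPU.  Buffer names and the surrounding setup are hypothetical, error
   handling is stripped to the minimum, and the usual MAGMA headers are
   assumed to be included by this file. */
static magma_int_t example_cheevdx_gpu( magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t il = 1, iu = min( 10, n ), m, info;
    magma_int_t ldwa = n;
    float *w, *rwork;
    magma_int_t *iwork;
    magmaFloatComplex *wA, *work;

    magma_smalloc_cpu( &w,  n );        // eigenvalues
    magma_cmalloc_cpu( &wA, ldwa*n );   // CPU workspace mirroring A

    // workspace query: optimal sizes come back in the first entries
    magmaFloatComplex lwork_opt;
    float lrwork_opt;
    magma_int_t liwork_opt;
    magma_cheevdx_gpu( MagmaVec, MagmaRangeI, MagmaLower, n, dA, ldda,
                       0.f, 0.f, il, iu, &m, w, wA, ldwa,
                       &lwork_opt, -1, &lrwork_opt, -1, &liwork_opt, -1, &info );

    magma_int_t lwork  = (magma_int_t) MAGMA_C_REAL( lwork_opt );
    magma_int_t lrwork = (magma_int_t) lrwork_opt;
    magma_int_t liwork = liwork_opt;
    magma_cmalloc_cpu( &work,  lwork  );
    magma_smalloc_cpu( &rwork, lrwork );
    magma_imalloc_cpu( &iwork, liwork );

    // actual solve: on exit w[0..m-1] holds the selected eigenvalues and,
    // for MagmaVec, the first m columns of dA hold the eigenvectors
    magma_cheevdx_gpu( MagmaVec, MagmaRangeI, MagmaLower, n, dA, ldda,
                       0.f, 0.f, il, iu, &m, w, wA, ldwa,
                       work, lwork, rwork, lrwork, iwork, liwork, &info );

    magma_free_cpu( iwork );  magma_free_cpu( rwork );  magma_free_cpu( work );
    magma_free_cpu( wA );     magma_free_cpu( w );
    return info;
}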
extern "C" magma_int_t magma_slaex0_m(magma_int_t nrgpu, magma_int_t n, float* d, float* e, float* q, magma_int_t ldq, float* work, magma_int_t* iwork, char range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t* info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 .. Scalar Arguments .. CHARACTER RANGE INTEGER IL, IU, INFO, LDQ, N REAL VL, VU .. .. Array Arguments .. INTEGER IWORK( * ) REAL D( * ), E( * ), Q( LDQ, * ), $ WORK( * ) .. Purpose ======= SLAEX0 computes all eigenvalues and the choosen eigenvectors of a symmetric tridiagonal matrix using the divide and conquer method. Arguments ========= N (input) INTEGER The dimension of the symmetric tridiagonal matrix. N >= 0. D (input/output) REAL array, dimension (N) On entry, the main diagonal of the tridiagonal matrix. On exit, its eigenvalues. E (input) REAL array, dimension (N-1) The off-diagonal elements of the tridiagonal matrix. On exit, E has been destroyed. Q (input/output) REAL array, dimension (LDQ, N) On entry, Q will be the identity matrix. On exit, Q contains the eigenvectors of the tridiagonal matrix. LDQ (input) INTEGER The leading dimension of the array Q. If eigenvectors are desired, then LDQ >= max(1,N). In any case, LDQ >= 1. WORK (workspace) REAL array, the dimension of WORK >= 4*N + N**2. IWORK (workspace) INTEGER array, the dimension of IWORK >= 3 + 5*N. RANGE (input) CHARACTER*1 = 'A': all eigenvalues will be found. = 'V': all eigenvalues in the half-open interval (VL,VU] will be found. = 'I': the IL-th through IU-th eigenvalues will be found. VL (input) REAL VU (input) REAL If RANGE='V', the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = 'A' or 'I'. IL (input) INTEGER IU (input) INTEGER If RANGE='I', the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = 'A' or 'V'. INFO (output) INTEGER = 0: successful exit. < 0: if INFO = -i, the i-th argument had an illegal value. > 0: The algorithm failed to compute an eigenvalue while working on the submatrix lying in rows and columns INFO/(N+1) through mod(INFO,N+1). Further Details =============== Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA ===================================================================== */ magma_int_t ione = 1; char range_; magma_int_t curlvl, i, indxq; magma_int_t igpu, j, k, matsiz, msd2, smlsiz; magma_int_t submat, subpbs, tlvls; float* dw[MagmaMaxGPUs]; magma_queue_t stream [MagmaMaxGPUs][2]; int gpu_b; magma_getdevice(&gpu_b); // Test the input parameters. 
*info = 0; if( n < 0 ) *info = -1; else if( ldq < max(1, n) ) *info = -5; if( *info != 0 ){ magma_xerbla( __func__, -*info ); return MAGMA_ERR_ILLEGAL_VALUE; } // Quick return if possible if(n == 0) return MAGMA_SUCCESS; //workspace dimension for nrgpu > 1 size_t tmp = (n-1)/2+1; if (nrgpu > 1){ size_t tmp2 = (tmp-1) / (nrgpu/2) + 1; tmp = tmp * tmp2 + 2 * magma_get_slaex3_m_nb()*(tmp + tmp2); } for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); if(nrgpu==1){ if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], 3*n*(n/2 + 1) )) { *info = -15; return MAGMA_ERR_DEVICE_ALLOC; } } else { if (MAGMA_SUCCESS != magma_smalloc( &dw[igpu], tmp )) { *info = -15; return MAGMA_ERR_DEVICE_ALLOC; } } magma_queue_create( &stream[igpu][0] ); magma_queue_create( &stream[igpu][1] ); } smlsiz = magma_get_smlsize_divideconquer(); // Determine the size and placement of the submatrices, and save in // the leading elements of IWORK. iwork[0] = n; subpbs= 1; tlvls = 0; while (iwork[subpbs - 1] > smlsiz) { for (j = subpbs; j > 0; --j){ iwork[2*j - 1] = (iwork[j-1]+1)/2; iwork[2*j - 2] = iwork[j-1]/2; } ++tlvls; subpbs *= 2; } for (j=1; j<subpbs; ++j) iwork[j] += iwork[j-1]; // Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1 // using rank-1 modifications (cuts). for(i=0; i < subpbs-1; ++i){ submat = iwork[i]; d[submat-1] -= MAGMA_S_ABS(e[submat-1]); d[submat] -= MAGMA_S_ABS(e[submat-1]); } indxq = 4*n + 3; // Solve each submatrix eigenproblem at the bottom of the divide and // conquer tree. char char_I[] = {'I', 0}; #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif for (i = 0; i < subpbs; ++i){ if(i == 0){ submat = 0; matsiz = iwork[0]; } else { submat = iwork[i-1]; matsiz = iwork[i] - iwork[i-1]; } lapackf77_ssteqr(char_I , &matsiz, &d[submat], &e[submat], Q(submat, submat), &ldq, work, info); // change to edc? if(*info != 0){ printf("info: %d\n, submat: %d\n", (int) *info, (int) submat); *info = (submat+1)*(n+1) + submat + matsiz; printf("info: %d\n", (int) *info); return MAGMA_SUCCESS; } k = 1; for(j = submat; j < iwork[i]; ++j){ iwork[indxq+j] = k; ++k; } } #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); printf(" for: ssteqr = %6.2f\n", GetTimerValue(start,end)/1000.); #endif // Successively merge eigensystems of adjacent submatrices // into eigensystem for the corresponding larger matrix. curlvl = 1; while (subpbs > 1){ #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER magma_timestr_t start, end; start = get_current_time(); #endif for (i=0; i<subpbs-1; i+=2){ if(i == 0){ submat = 0; matsiz = iwork[1]; msd2 = iwork[0]; } else { submat = iwork[i-1]; matsiz = iwork[i+1] - iwork[i-1]; msd2 = matsiz / 2; } // Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2) // into an eigensystem of size MATSIZ. // SLAEX1 is used only for the full eigensystem of a tridiagonal // matrix. if (matsiz == n) range_=range; else // We need all the eigenvectors if it is not last step range_='A'; magma_slaex1_m(nrgpu, matsiz, &d[submat], Q(submat, submat), ldq, &iwork[indxq+submat], e[submat+msd2-1], msd2, work, &iwork[subpbs], dw, stream, range_, vl, vu, il, iu, info); if(*info != 0){ *info = (submat+1)*(n+1) + submat + matsiz; return MAGMA_SUCCESS; } iwork[i/2]= iwork[i+1]; } subpbs /= 2; ++curlvl; #ifdef ENABLE_TIMER_DIVIDE_AND_CONQUER end = get_current_time(); //printf("%d: time: %6.2f\n", curlvl, GetTimerValue(start,end)/1000.); #endif } // Re-merge the eigenvalues/vectors which were deflated at the final // merge step. 
for(i = 0; i<n; ++i){ j = iwork[indxq+i] - 1; work[i] = d[j]; blasf77_scopy(&n, Q(0, j), &ione, &work[ n*(i+1) ], &ione); } blasf77_scopy(&n, work, &ione, d, &ione); char char_A[] = {'A',0}; lapackf77_slacpy ( char_A, &n, &n, &work[n], &n, q, &ldq ); for (igpu = 0; igpu < nrgpu; ++igpu){ magma_setdevice(igpu); magma_queue_destroy( stream[igpu][0] ); magma_queue_destroy( stream[igpu][1] ); magma_free( dw[igpu] ); } magma_setdevice(gpu_b); return MAGMA_SUCCESS; } /* magma_slaex0_m */
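/* Illustrative (non-authoritative) usage sketch for magma_slaex0_m: solve a
   symmetric tridiagonal eigenproblem with the multi-GPU divide and conquer
   driver.  Per the documentation above, Q must be the identity on entry,
   WORK needs at least 4*N + N**2 entries and IWORK at least 3 + 5*N.  Names
   and sizes are hypothetical; the usual MAGMA headers are assumed. */
static magma_int_t example_slaex0_m( magma_int_t nrgpu, magma_int_t n, float *d, float *e )
{
    magma_int_t info, ldq = n;
    float *q, *work;
    magma_int_t *iwork;

    magma_smalloc_cpu( &q,     ldq*n     );
    magma_smalloc_cpu( &work,  4*n + n*n );
    magma_imalloc_cpu( &iwork, 3 + 5*n   );

    // Q must be the identity matrix on entry
    float c_zero = 0.f, c_one = 1.f;
    lapackf77_slaset( "Full", &n, &n, &c_zero, &c_one, q, &ldq );

    // range 'A': all eigenvalues; vl, vu, il, iu are then not referenced
    magma_slaex0_m( nrgpu, n, d, e, q, ldq, work, iwork,
                    'A', 0.f, 0.f, 0, 0, &info );

    // on exit d holds the eigenvalues and q the eigenvectors
    magma_free_cpu( iwork );  magma_free_cpu( work );  magma_free_cpu( q );
    return info;
}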
/** Purpose ------- SGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA REAL array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_sgesv_comp ********************************************************************/ extern "C" magma_int_t magma_sgetrf_nopiv_gpu( magma_int_t m, magma_int_t n, magmaFloat_ptr dA, magma_int_t ldda, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, (dA_offset + (i_)*nb + (j_)*nb*ldda) #else #define dA(i_, j_) (dA + (i_)*nb + (j_)*nb*ldda) #endif float c_one = MAGMA_S_ONE; float c_neg_one = MAGMA_S_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, mindim; magma_int_t j, rows, s, ldwork; float *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min( m, n ); nb = magma_get_sgetrf_nb( m, n ); s = mindim / nb; magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_smalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_sgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] ); magma_sgetrf_nopiv( m, n, work, m, info ); magma_ssetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); } else { /* Use hybrid blocked code.
*/ maxm = magma_roundup( m, 32 ); ldwork = maxm; if (MAGMA_SUCCESS != magma_smalloc_pinned( &work, ldwork*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( j=0; j < s; j++ ) { // get j-th panel from device magma_queue_sync( queues[1] ); magma_sgetmatrix_async( m-j*nb, nb, dA(j,j), ldda, work, ldwork, queues[0] ); if ( j > 0 ) { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (j+1)*nb, c_one, dA(j-1,j-1), ldda, dA(j-1,j+1), ldda, queues[1] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-j*nb, n-(j+1)*nb, nb, c_neg_one, dA(j, j-1), ldda, dA(j-1,j+1), ldda, c_one, dA(j, j+1), ldda, queues[1] ); } // do the cpu part rows = m - j*nb; magma_queue_sync( queues[0] ); magma_sgetrf_nopiv( rows, nb, work, ldwork, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; // send j-th panel to device magma_ssetmatrix_async( m-j*nb, nb, work, ldwork, dA(j, j), ldda, queues[0] ); magma_queue_sync( queues[0] ); // do the small non-parallel computations (next panel update) if ( s > j+1 ) { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(j, j ), ldda, dA(j, j+1), ldda, queues[1] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-(j+1)*nb, nb, nb, c_neg_one, dA(j+1, j ), ldda, dA(j, j+1), ldda, c_one, dA(j+1, j+1), ldda, queues[1] ); } else { magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(j, j ), ldda, dA(j, j+1), ldda, queues[1] ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, m-(j+1)*nb, n-(j+1)*nb, nb, c_neg_one, dA(j+1, j ), ldda, dA(j, j+1), ldda, c_one, dA(j+1, j+1), ldda, queues[1] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magma_sgetmatrix( rows, nb0, dA(s,s), ldda, work, ldwork, queues[1] ); // do the cpu part magma_sgetrf_nopiv( rows, nb0, work, ldwork, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; // send j-th panel to device magma_ssetmatrix( rows, nb0, work, ldwork, dA(s,s), ldda, queues[1] ); magma_strsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda, queues[1] ); } magma_free_pinned( work ); } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); return *info; } /* magma_sgetrf_nopiv_gpu */
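/* Illustrative (non-authoritative) usage sketch for magma_sgetrf_nopiv_gpu:
   copy a matrix to the GPU, factor it without pivoting, and check INFO.
   The host matrix hA and its dimensions are hypothetical; the queue-based
   transfer calls follow the style used in this routine and may differ in
   other MAGMA versions.  The usual MAGMA headers are assumed. */
static magma_int_t example_sgetrf_nopiv_gpu( magma_int_t m, magma_int_t n,
                                             const float *hA, magma_int_t lda )
{
    magma_int_t info, ldda = magma_roundup( m, 32 );
    magmaFloat_ptr dA;
    magma_queue_t queue;
    magma_device_t cdev;

    if ( MAGMA_SUCCESS != magma_smalloc( &dA, ldda*n ) )
        return MAGMA_ERR_DEVICE_ALLOC;

    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_ssetmatrix( m, n, hA, lda, dA, ldda, queue );   // host -> device
    magma_sgetrf_nopiv_gpu( m, n, dA, ldda, &info );      // A = L*U, no pivoting
    // info > 0 means U(info,info) is exactly zero; the factors are still
    // returned but must not be used to solve a linear system.

    magma_queue_destroy( queue );
    magma_free( dA );
    return info;
}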
/** Purpose ------- DGEHRD reduces a DOUBLE_PRECISION general matrix A to upper Hessenberg form H by an orthogonal similarity transformation: Q' * A * Q = H . This version stores the triangular matrices used in the factorization so that they can be applied directly (i.e., without being recomputed) later. As a result, the application of Q is much faster. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ilo INTEGER @param[in] ihi INTEGER It is assumed that A is already upper triangular in rows and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally set by a previous call to DGEBAL; otherwise they should be set to 1 and N respectively. See Further Details. 1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0. @param[in,out] A DOUBLE_PRECISION array, dimension (LDA,N) On entry, the N-by-N general matrix to be reduced. On exit, the upper triangle and the first subdiagonal of A are overwritten with the upper Hessenberg matrix H, and the elements below the first subdiagonal, with the array TAU, represent the orthogonal matrix Q as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] tau DOUBLE_PRECISION array, dimension (N-1) The scalar factors of the elementary reflectors (see Further Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to zero. @param[out] work (workspace) DOUBLE_PRECISION array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= max(1,N). For optimum performance LWORK >= N*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] T DOUBLE_PRECISION array, dimension NB*N, where NB is the optimal blocksize. It stores the NB*NB blocks of the triangular T matrices used in the reduction. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrix Q is represented as a product of (ihi-ilo) elementary reflectors Q = H(ilo) H(ilo+1) . . . H(ihi-1). Each H(i) has the form H(i) = I - tau * v * v' where tau is a real scalar, and v is a real vector with v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on exit in A(i+2:ihi,i), and tau in TAU(i). The contents of A are illustrated by the following example, with n = 7, ilo = 2 and ihi = 6: @verbatim on entry, on exit, ( a a a a a a a ) ( a a h h h h a ) ( a a a a a a ) ( a h h h h a ) ( a a a a a a ) ( h h h h h h ) ( a a a a a a ) ( v2 h h h h h ) ( a a a a a a ) ( v2 v3 h h h h ) ( a a a a a a ) ( v2 v3 v4 h h h ) ( a ) ( a ) @endverbatim where a denotes an element of the original matrix A, h denotes a modified element of the upper Hessenberg matrix H, and vi denotes an element of the vector defining H(i). This implementation follows the hybrid algorithm and notations described in S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. This version stores the T matrices, for later use in magma_dorghr. 
@ingroup magma_dgeev_comp ********************************************************************/ extern "C" magma_int_t magma_dgehrd_m( magma_int_t n, magma_int_t ilo, magma_int_t ihi, double *A, magma_int_t lda, double *tau, double *work, magma_int_t lwork, double *T, magma_int_t *info) { #define A( i, j ) (A + (i) + (j)*lda) #define dA( d, i, j ) (data.A[d] + (i) + (j)*ldda) double c_one = MAGMA_D_ONE; double c_zero = MAGMA_D_ZERO; magma_int_t nb = magma_get_dgehrd_nb(n); magma_int_t nh, iws, ldda, min_lblocks, max_lblocks, last_dev, d; magma_int_t dpanel, di, nlocal, i, i2, ib, ldwork; magma_int_t iinfo; magma_int_t lquery; struct dgehrd_data data; int ngpu = magma_num_gpus(); *info = 0; iws = n*(nb + nb*ngpu); work[0] = MAGMA_D_MAKE( iws, 0 ); lquery = (lwork == -1); if (n < 0) { *info = -1; } else if (ilo < 1 || ilo > max(1,n)) { *info = -2; } else if (ihi < min(ilo,n) || ihi > n) { *info = -3; } else if (lda < max(1,n)) { *info = -5; } else if (lwork < max(1,n) && ! lquery) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; // Adjust from 1-based indexing ilo -= 1; // Quick return if possible nh = ihi - ilo; if (nh <= 1) { work[0] = c_one; return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero for (i = 0; i < ilo; ++i) tau[i] = c_zero; for (i = max(0,ihi-1); i < n-1; ++i) tau[i] = c_zero; // set T to zero lapackf77_dlaset( "Full", &nb, &n, &c_zero, &c_zero, T, &nb ); // set to null, to simplify cleanup code for( d = 0; d < ngpu; ++d ) { data.A[d] = NULL; data.streams[d] = NULL; } // If not enough workspace, use unblocked code if ( lwork < iws ) { nb = 1; } if (nb == 1 || nb >= nh) { // Use unblocked code below i = ilo; } else { // Use blocked code // allocate memory on GPUs for A and workspaces ldda = ((n+31)/32)*32; min_lblocks = (n / nb) / ngpu; max_lblocks = ((n-1) / nb) / ngpu + 1; last_dev = (n / nb) % ngpu; // V and Vd need to be padded for copying in mdlahr2 data.ngpu = ngpu; data.ldda = ldda; data.ldv = nb*max_lblocks*ngpu; data.ldvd = nb*max_lblocks; for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); nlocal = min_lblocks*nb; if ( d < last_dev ) { nlocal += nb; } else if ( d == last_dev ) { nlocal += (n % nb); } ldwork = nlocal*ldda // A + nb*data.ldv // V + nb*data.ldvd // Vd + nb*ldda // Y + nb*ldda // W + nb*nb; // Ti if ( MAGMA_SUCCESS != magma_dmalloc( &data.A[d], ldwork )) { *info = MAGMA_ERR_DEVICE_ALLOC; goto CLEANUP; } data.V [d] = data.A [d] + nlocal*ldda; data.Vd[d] = data.V [d] + nb*data.ldv; data.Y [d] = data.Vd[d] + nb*data.ldvd; data.W [d] = data.Y [d] + nb*ldda; data.Ti[d] = data.W [d] + nb*ldda; magma_queue_create( &data.streams[d] ); } // Copy the matrix to GPUs magma_dsetmatrix_1D_col_bcyclic( n, n, A, lda, data.A, ldda, ngpu, nb ); // round ilo down to block boundary ilo = (ilo/nb)*nb; for (i = ilo; i < ihi - 1 - nb; i += nb) { // Reduce columns i:i+nb-1 to Hessenberg form, returning the // matrices V and T of the block reflector H = I - V*T*V' // which performs the reduction, and also the matrix Y = A*V*T // Get the current panel (no need for the 1st iteration) dpanel = (i / nb) % ngpu; di = ((i / nb) / ngpu) * nb; if ( i > ilo ) { magma_setdevice( dpanel ); magma_dgetmatrix( ihi-i, nb, dA(dpanel, i, di), ldda, A(i,i), lda ); } // add 1 to i for 1-based index magma_dlahr2_m( ihi, i+1, nb, A(0,i), lda, &tau[i], &T[i*nb], nb, work, n, &data ); magma_dlahru_m( n, ihi, i, nb, A, lda, &data ); // copy first i rows above panel 
to host magma_setdevice( dpanel ); magma_dgetmatrix_async( i, nb, dA(dpanel, 0, di), ldda, A(0,i), lda, data.streams[dpanel] ); } // Copy remainder to host, block-by-block for( i2 = i; i2 < n; i2 += nb ) { ib = min( nb, n-i2 ); d = (i2 / nb) % ngpu; di = (i2 / nb) / ngpu * nb; magma_setdevice( d ); magma_dgetmatrix( n, ib, dA(d, 0, di), ldda, A(0,i2), lda ); } } // Use unblocked code to reduce the rest of the matrix // add 1 to i for 1-based index i += 1; lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo); work[0] = MAGMA_D_MAKE( iws, 0 ); CLEANUP: for( d = 0; d < ngpu; ++d ) { magma_setdevice( d ); magma_free( data.A[d] ); magma_queue_destroy( data.streams[d] ); } magma_setdevice( orig_dev ); return *info; } /* magma_dgehrd_m */
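/* Illustrative (non-authoritative) usage sketch for magma_dgehrd_m: a
   workspace query, allocation of TAU, WORK and the T storage of size NB*N
   with NB = magma_get_dgehrd_nb(n), then the reduction to Hessenberg form.
   The host matrix A and its dimensions are hypothetical; the usual MAGMA
   headers are assumed to be included by this file. */
static magma_int_t example_dgehrd_m( magma_int_t n, double *A, magma_int_t lda )
{
    magma_int_t ilo = 1, ihi = n, info, lwork;
    magma_int_t nb = magma_get_dgehrd_nb( n );
    double *tau, *work, *T, query;

    magma_dmalloc_cpu( &tau, n-1  );
    magma_dmalloc_cpu( &T,   nb*n );

    // workspace query: lwork = -1 returns the optimal size in query
    magma_dgehrd_m( n, ilo, ihi, A, lda, tau, &query, -1, T, &info );
    lwork = (magma_int_t) query;
    magma_dmalloc_cpu( &work, lwork );

    // reduce A to upper Hessenberg form; T keeps the block-reflector
    // triangular factors for a later magma_dorghr
    magma_dgehrd_m( n, ilo, ihi, A, lda, tau, work, lwork, T, &info );

    magma_free_cpu( work );  magma_free_cpu( T );  magma_free_cpu( tau );
    return info;
}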
extern "C" magma_int_t magma_dorgqr_gpu(magma_int_t m, magma_int_t n, magma_int_t k, double *dA, magma_int_t ldda, double *tau, double *dT, magma_int_t nb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns, which is defined as the first N columns of a product of K elementary reflectors of order M Q = H(1) H(2) . . . H(k) as returned by DGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix Q. M >= 0. N (input) INTEGER The number of columns of the matrix Q. M >= N >= 0. K (input) INTEGER The number of elementary reflectors whose product defines the matrix Q. N >= K >= 0. DA (input/output) DOUBLE_PRECISION array A on the GPU, dimension (LDDA,N). On entry, the i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,k, as returned by DGEQRF_GPU in the first k columns of its array argument A. On exit, the M-by-N matrix Q. LDDA (input) INTEGER The first dimension of the array A. LDDA >= max(1,M). TAU (input) DOUBLE_PRECISION array, dimension (K) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by DGEQRF_GPU. DT (input/workspace) DOUBLE_PRECISION work space array on the GPU, dimension (2*MIN(M, N) + (N+31)/32*32 )*NB. This must be the 6th argument of magma_dgeqrf_gpu [ note that if N here is bigger than N in magma_dgeqrf_gpu, the workspace requirement DT in magma_dgeqrf_gpu must be as specified in this routine ]. NB (input) INTEGER This is the block size used in DGEQRF_GPU, and correspondingly the size of the T matrices, used in the factorization, and stored in DT. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument has an illegal value ===================================================================== */ #define dA(i,j) (dA + (i) + (j)*ldda) #define dT(j) (dT + (j)*nb) magma_int_t m_kk, n_kk, k_kk, mi; magma_int_t lwork, lpanel; magma_int_t i, ib, ki, kk, iinfo; magma_int_t lddwork; double *dV, *dW; double *work, *panel; *info = 0; if (m < 0) { *info = -1; } else if ((n < 0) || (n > m)) { *info = -2; } else if ((k < 0) || (k > n)) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (n <= 0) { return *info; } // first kk columns are handled by blocked method. // ki is start of 2nd-to-last block if ((nb > 1) && (nb < k)) { ki = (k - nb - 1) / nb * nb; kk = min( k, ki+nb ); } else { ki = 0; kk = 0; } // Allocate CPU work space // n*nb for dorgqr workspace // (m - kk)*(n - kk) for last block's panel lwork = n*nb; lpanel = (m - kk)*(n - kk); magma_dmalloc_cpu( &work, lwork + lpanel ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } panel = work + lwork; // Allocate work space on GPU if (MAGMA_SUCCESS != magma_dmalloc( &dV, ldda*nb )) { magma_free_cpu( work ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // dT workspace has: // 2*min(m,n)*nb for T and R^{-1} matrices from geqrf // ((n+31)/32*32 )*nb for dW larfb workspace. lddwork = min(m,n); dW = dT + 2*lddwork*nb; magma_queue_t stream; magma_queue_create( &stream ); // Use unblocked code for the last or only block. 
if (kk < n) { m_kk = m - kk; n_kk = n - kk; k_kk = k - kk; magma_dgetmatrix( m_kk, k_kk, dA(kk, kk), ldda, panel, m_kk ); lapackf77_dorgqr( &m_kk, &n_kk, &k_kk, panel, &m_kk, &tau[kk], work, &lwork, &iinfo ); magma_dsetmatrix( m_kk, n_kk, panel, m_kk, dA(kk, kk), ldda ); // Set A(1:kk,kk+1:n) to zero. magmablas_dlaset( MagmaUpperLower, kk, n - kk, dA(0, kk), ldda ); } if (kk > 0) { // Use blocked code // stream: copy Aii to V --> laset --> laset --> larfb --> [next] // CPU has no computation magmablasSetKernelStream( stream ); for (i = ki; i >= 0; i -= nb) { ib = min( nb, k-i ); mi = m - i; // Copy current panel on the GPU from dA to dV magma_dcopymatrix_async( mi, ib, dA(i,i), ldda, dV, ldda, stream ); // set panel to identity magmablas_dlaset( MagmaUpperLower, i, ib, dA(0, i), ldda ); magmablas_dlaset_identity( mi, ib, dA(i, i), ldda ); if (i < n) { // Apply H to A(i:m,i:n) from the left magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise, mi, n-i, ib, dV, ldda, dT(i), nb, dA(i, i), ldda, dW, lddwork ); } } } magma_queue_sync( stream ); magmablasSetKernelStream( NULL ); magma_free( dV ); magma_free_cpu( work ); magma_queue_destroy( stream ); return *info; } /* magma_dorgqr_gpu */
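/* Illustrative (non-authoritative) sketch of the intended call sequence:
   factor A with magma_dgeqrf_gpu and then form the orthonormal Q with
   magma_dorgqr_gpu, reusing the same dT workspace and block size NB.  The dT
   sizing follows the description above; the buffer names, the block-size
   query and the exact dgeqrf_gpu interface of your MAGMA version are
   assumptions, and the usual MAGMA headers are assumed to be included. */
static magma_int_t example_dorgqr_gpu( magma_int_t m, magma_int_t n,
                                       double *dA, magma_int_t ldda )
{
    magma_int_t info, k = n;                      // use all n reflectors
    magma_int_t nb  = magma_get_dgeqrf_nb( m );   // block size used by geqrf
    magma_int_t ldt = ( 2*min(m,n) + ((n+31)/32)*32 ) * nb;
    double *tau, *dT;

    magma_dmalloc_cpu( &tau, min(m,n) );
    if ( MAGMA_SUCCESS != magma_dmalloc( &dT, ldt ) ) {
        magma_free_cpu( tau );
        return MAGMA_ERR_DEVICE_ALLOC;
    }

    // QR factorization; dT receives the T factors consumed by dorgqr below
    magma_dgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );
    if ( info == 0 ) {
        // overwrite dA with the first n columns of Q
        magma_dorgqr_gpu( m, n, k, dA, ldda, tau, dT, nb, &info );
    }

    magma_free( dT );
    magma_free_cpu( tau );
    return info;
}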