/** Purpose ------- CPOTRS solves a system of linear equations A*X = B with a Hermitian positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by CPOTRF. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cposv_comp ********************************************************************/ extern "C" magma_int_t magma_cpotrs_gpu(magma_uplo_t uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dB, magma_int_t lddb, magma_int_t *info) { magmaFloatComplex c_one = MAGMA_C_ONE; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) *info = -1; if ( n < 0 ) *info = -2; if ( nrhs < 0) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } if ( uplo == MagmaUpper ) { if ( nrhs == 1) { magma_ctrsv(MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } else { if ( nrhs == 1) { magma_ctrsv(MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaLower, MagmaConjTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_ctrsm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } return *info; }
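/* Usage sketch (illustrative, not part of the original source): solving A*X = B for a
   Hermitian positive definite A by pairing magma_cpotrf_gpu with magma_cpotrs_gpu above.
   Assumes magma.h is included; error handling is abbreviated. */
static magma_int_t example_cposv(
    magma_int_t n, magma_int_t nrhs,
    const magmaFloatComplex *A, magma_int_t lda,   /* host input matrix */
    magmaFloatComplex *B, magma_int_t ldb )        /* host RHS, overwritten by X */
{
    magma_int_t info, ldda = ((n+31)/32)*32, lddb = ldda;   /* padded like the routines above */
    magmaFloatComplex *dA, *dB;
    if ( MAGMA_SUCCESS != magma_cmalloc( &dA, (size_t)ldda*n ) ||
         MAGMA_SUCCESS != magma_cmalloc( &dB, (size_t)lddb*nrhs ) )
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_csetmatrix( n, n,    A, lda, dA, ldda );
    magma_csetmatrix( n, nrhs, B, ldb, dB, lddb );
    magma_cpotrf_gpu( MagmaLower, n, dA, ldda, &info );     /* A = L*L**H */
    if ( info == 0 ) {
        magma_cpotrs_gpu( MagmaLower, n, nrhs, dA, ldda, dB, lddb, &info );
        magma_cgetmatrix( n, nrhs, dB, lddb, B, ldb );      /* copy X back to host */
    }
    magma_free( dA );
    magma_free( dB );
    return info;
}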
extern "C" magma_int_t magma_cgetrf_m(magma_int_t num_gpus0, magma_int_t m, magma_int_t n, magmaFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may not fit entirely in the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of big panel is done calling multiple-gpu-interface. Pivots are applied on GPU within the big panel. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
===================================================================== */ #define A(i,j) (a + (j)*lda + (i)) #define inAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define inPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) //#define PROFILE #ifdef PROFILE float flops, time_rmajor = 0, time_rmajor2 = 0, time_rmajor3 = 0, time_mem = 0; magma_timestr_t start, start1, start2, end1, end, start0 = get_current_time(); #endif magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, num_gpus; magma_int_t ii, jj, h, offset, ib, rows, s; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+num_gpus0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); char * ngr_nb_char = getenv("MAGMA_NGR_NB"); if( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if( num_gpus0 > ceil((float)NB/nb) ) { num_gpus = (int)ceil((float)NB/nb); h = 1+(2+num_gpus); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { num_gpus = num_gpus0; } if( num_gpus*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = num_gpus*NB; NB = max(nb,(NB / nb) * nb); /* making sure it's divisible by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if( NB != n ) printf( " * running in out-of-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n",n,NB,nb,(float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for a single tile. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ #ifdef PROFILE start = get_current_time(); #endif n_local[0] = (NB/nb)/num_gpus; if( NB%(nb*num_gpus) != 0 ) n_local[0] ++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); #ifdef PROFILE end = get_current_time(); printf( " memory-allocation time: %e\n",GetTimerValue(start, end)/1000.0 ); start = get_current_time(); #endif for( I=0; I<n; I+=NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ s = min(max(m-I,0),N)/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if( num_gpus0 > ceil((float)N/nb) ) { num_gpus = (int)ceil((float)N/nb); } else { num_gpus = num_gpus0; } for( d=0; d<num_gpus; d++ ) { n_local[d] = ((N/nb)/num_gpus)*nb; if (d < (N/nb)%num_gpus) n_local[d] += nb; else if (d == (N/nb)%num_gpus) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; #ifdef PROFILE start2 = get_current_time(); #endif /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ magmablas_csetmatrix_transpose_mgpu(num_gpus, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE start1 = get_current_time(); #endif /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset<min(m,I); offset+=NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose2( inPT(d,0,0), nb, dA[d], maxm-offset, M-offset, nbi); } /* applying the pivot from the previous big-panel */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][1]); magmablas_cpermute_long3( inAT(d,0,0), ldn_local, ipiv, NBk, offset ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj<NBk; jj+=nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose2( inPT(d,0,(1+jj/nb)%2), nb, dA[d], rows-nb, M-ii-nb, nb); } /* update with the block column */ 
magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, inPT(d,0,(jj/nb)%2), nb, inAT(d,ib,0), ldn_local ); if( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d], M-(ii+nb), nbi, c_neg_one, inAT(d,ib,0), ldn_local, inPT(d,1,(jj/nb)%2), nb, c_one, inAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if( M > I ) { //magma_cgetrf1_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, &a[I*lda], lda, // (magma_queue_t **)stream, &iinfo); magma_cgetrf2_mgpu(num_gpus, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if( iinfo < 0 ) { *info = iinfo; break; } else if( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii<min(I+N,m); ii++ ) ipiv[ii] += I; } #ifdef PROFILE end1 = get_current_time(); time_rmajor += GetTimerValue(start1, end1); time_rmajor3 += GetTimerValue(start2, end1); time_mem += (GetTimerValue(start2, end1)-GetTimerValue(start1, end1))/1000.0; #endif /* download the current big panel to CPU */ magmablas_cgetmatrix_transpose_mgpu(num_gpus, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } #ifdef PROFILE end1 = get_current_time(); time_rmajor2 += GetTimerValue(start1, end1); #endif } /* end of for */ #ifdef PROFILE end = get_current_time(); flops = FLOPS_CGETRF( m, n ) / 1000000; printf(" NB=%d nb=%d\n",NB,nb); printf(" memcopy and transpose %e seconds\n",time_mem ); printf(" total time %e seconds\n",GetTimerValue(start0,end)/1000.0); printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / time_rmajor, time_rmajor /1000.0); printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / time_rmajor3, time_rmajor3/1000.0); printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / time_rmajor2, time_rmajor2/1000.0); printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / GetTimerValue(start, end), GetTimerValue(start,end)/1000.0); #endif for( d=0; d<num_gpus0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); magmablasSetKernelStream(NULL); } magma_setdevice(0); } if( *info >= 0 ) magma_cgetrf_piv(m, n, NB, a, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
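/* Illustration (not from the original source): the big-panel width NB chosen above,
   restated as a standalone helper. freeMem is in elements of magmaFloatComplex, maxm is
   the padded row count, and h panels of width nb are reserved per GPU for workspace;
   max/min are the usual macros from the MAGMA headers. */
static magma_int_t example_big_panel_width(
    size_t freeMem, magma_int_t maxm, magma_int_t nb,
    magma_int_t num_gpus, magma_int_t n )
{
    magma_int_t h  = 1 + (2 + num_gpus);                       /* workspace panels per GPU */
    magma_int_t NB = (magma_int_t)(0.8*freeMem/maxm - h*nb);   /* columns fitting in 80% of free memory */
    if ( num_gpus*NB >= n ) {
        NB = n;                        /* in-core: the whole matrix fits */
    } else {
        NB = num_gpus*NB;              /* out-of-core: aggregate width over all GPUs */
        NB = max( nb, (NB/nb)*nb );    /* round down to a multiple of nb */
    }
    return NB;
}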
/** Purpose ------- CGETRF_NOPIV_GPU computes an LU factorization of a general M-by-N matrix A without any pivoting. The factorization has the form A = L * U where L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_nopiv_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define dA(i,j) (dA + (i)*nb + (j)*nb*ldda) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, mindim; magma_int_t i, rows, s, lddwork; magmaFloatComplex *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_cgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, dA, ldda, work, m ); magma_cgetrf_nopiv( m, n, work, m, info); magma_csetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. 
*/ maxm = ((m + 31)/32)*32; lddwork = maxm; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, maxm*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( i=0; i < s; i++ ) { // download i-th panel magma_queue_sync( stream[1] ); magma_cgetmatrix_async( m-i*nb, nb, dA(i,i), ldda, work, lddwork, stream[0] ); if ( i > 0 ) { magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n - (i+1)*nb, c_one, dA(i-1,i-1), ldda, dA(i-1,i+1), ldda ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-i*nb, n-(i+1)*nb, nb, c_neg_one, dA(i, i-1), ldda, dA(i-1,i+1), ldda, c_one, dA(i, i+1), ldda ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); magma_cgetrf_nopiv( rows, nb, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_csetmatrix_async( m-i*nb, nb, work, lddwork, dA(i, i), ldda, stream[0] ); magma_queue_sync( stream[0] ); // do the small non-parallel computations if ( s > (i+1) ) { magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } else { magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb, n-s*nb, c_one, dA(i, i ), ldda, dA(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, m-(i+1)*nb, n-(i+1)*nb, nb, c_neg_one, dA(i+1, i ), ldda, dA(i, i+1), ldda, c_one, dA(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; magma_cgetmatrix( rows, nb0, dA(s,s), ldda, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part magma_cgetrf_nopiv( rows, nb0, work, lddwork, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; // upload i-th panel magma_csetmatrix( rows, nb0, work, lddwork, dA(s,s), ldda ); magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, nb0, n-s*nb-nb0, c_one, dA(s,s), ldda, dA(s,s)+nb0, ldda); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_cgetrf_nopiv_gpu */
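/* Usage sketch (illustrative, not part of the original source): once CGETRF_NOPIV_GPU has
   computed A = L*U, a system A*X = B can be solved with two triangular solves, in the same
   spirit as magma_cpotrs_gpu above. Factorization without pivoting is numerically safe only
   for special matrices, e.g. diagonally dominant ones. */
static magma_int_t example_cgesv_nopiv(
    magma_int_t n, magma_int_t nrhs,
    magmaFloatComplex *dA, magma_int_t ldda,
    magmaFloatComplex *dB, magma_int_t lddb )
{
    magma_int_t info;
    magmaFloatComplex c_one = MAGMA_C_ONE;
    magma_cgetrf_nopiv_gpu( n, n, dA, ldda, &info );   /* A = L*U in place */
    if ( info != 0 )
        return info;                                   /* info > 0: exact zero pivot U(i,i) */
    /* forward solve L*Y = B (unit lower triangular), then back solve U*X = Y */
    magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit,
                 n, nrhs, c_one, dA, ldda, dB, lddb );
    magma_ctrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit,
                 n, nrhs, c_one, dA, ldda, dB, lddb );
    return info;
}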
/** Purpose ------- CSYTRF_nopiv_gpu computes the LDLt factorization of a complex symmetric matrix A. The factorization has the form A = U^T * D * U, if UPLO = MagmaUpper, or A = L * D * L^T, if UPLO = MagmaLower, where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored; - = MagmaLower: Lower triangle of A is stored. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the factorization A = U^T D U or A = L D L^T. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value; if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_csysv_comp ******************************************************************* */ extern "C" magma_int_t magma_csytrf_nopiv_gpu( magma_uplo_t uplo, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define A(i, j) (A) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Constants */ const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; /* Local variables */ bool upper = (uplo == MagmaUpper); magma_int_t j, k, jb, nb, ib, iinfo; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_chetrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization magma_queue_t queues[2]; magma_event_t event; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); magma_event_create( &event ); trace_init( 1, 1, 2, queues ); // CPU workspace magmaFloatComplex *A; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &A, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } // GPU workspace magmaFloatComplex_ptr dW; if (MAGMA_SUCCESS != magma_cmalloc( &dW, (1+nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. 
// main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_event_sync( event ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j,j), nb, queues[1] ); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync( queues[1] ); trace_cpu_start( 0, "potrf", "potrf" ); magma_csytrf_nopiv_cpu( MagmaUpper, jb, ib, A(j, j), nb, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_csetmatrix_async( jb, jb, A(j, j), nb, dA(j, j), ldda, queues[0] ); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaTrans, MagmaUnit, jb, (n-j-jb), c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0] ); magma_ccopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb, queues[0] ); // update the trailing submatrix with D magmablas_clascl_diag( MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, queues[0], &iinfo ); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_cgemm( MagmaTrans, MagmaNoTrans, kb, n-k, jb, c_neg_one, dWt(0, k), nb, dA(j, k), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) magma_event_record( event, queues[0] ); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // main loop for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); magma_event_sync( event ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j,j), nb, queues[1] ); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync( queues[1] ); trace_cpu_start( 0, "potrf", "potrf" ); magma_csytrf_nopiv_cpu( MagmaLower, jb, ib, A(j, j), nb, info ); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_csetmatrix_async( jb, jb, A(j, j), nb, dA(j, j), ldda, queues[0] ); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0] ); magma_ccopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda, queues[0] ); // update the trailing submatrix with D magmablas_clascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, queues[0], &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k < n; k += nb) { magma_int_t kb = min(nb,n-k); magma_cgemm( MagmaNoTrans, MagmaTrans, n-k, kb, jb, c_neg_one, dA(k, j), ldda, dW(k, 0), ldda, c_one, dA(k, k), ldda, queues[0] ); if (k == j+jb) magma_event_record( event, queues[0] ); } trace_gpu_end( 0, 0 ); } } } trace_finalize( "chetrf.svg","trace.css" ); magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_event_destroy( event ); magma_free( dW ); magma_free_pinned( A ); return *info; } /* magma_csytrf_nopiv */
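/* Usage sketch (illustrative, not part of the original source): factoring a complex
   symmetric matrix held on the host with the routine above. The set/get transfers use the
   queue-based interface that this routine itself uses (magma_queue_create with a device
   argument). */
static magma_int_t example_csytrf_nopiv( magma_int_t n, magmaFloatComplex *A, magma_int_t lda )
{
    magma_int_t info, ldda = ((n+31)/32)*32;
    magmaFloatComplex_ptr dA;
    magma_device_t cdev;
    magma_queue_t queue;
    if ( MAGMA_SUCCESS != magma_cmalloc( &dA, (size_t)ldda*n ) )
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    magma_csetmatrix( n, n, A, lda, dA, ldda, queue );
    magma_csytrf_nopiv_gpu( MagmaLower, n, dA, ldda, &info );  /* A = L*D*L^T */
    if ( info == 0 )
        magma_cgetmatrix( n, n, dA, ldda, A, lda, queue );     /* factors overwrite A */
    magma_queue_destroy( queue );
    magma_free( dA );
    return info;
}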
/** Purpose ------- CGETRF_m computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The matrix may exceed the GPU memory. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Note: The factorization of the big panel is done by calling the multi-GPU interface. Pivots are applied on the GPU within the big panel. Arguments --------- @param[in] ngpu INTEGER Number of GPUs to use. ngpu > 0. @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
@ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_cgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((float)NB/nb) ) { ngpu = (int)ceil((float)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_CGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_CGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's divisible by nb (x64) */ } #ifdef CHECK_CGETRF_OOC if ( NB != n ) printf( " * running in out-of-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (float)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for a single tile. */ lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_cmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((float)N/nb) ) { ngpu = (int)ceil((float)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_csetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_csetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ctranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_claswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_csetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ctranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, 
n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_cgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_cgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_CGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_cgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_cgetrf_m */
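/* Usage sketch (illustrative, not part of the original source): factoring a matrix that may
   exceed GPU memory. A stays in host memory; pinned allocation lets the transfers inside the
   routine overlap with compute, as the documentation above recommends. */
static magma_int_t example_cgetrf_ooc( magma_int_t ngpu, magma_int_t m, magma_int_t n )
{
    magma_int_t info, lda = m;
    magma_int_t *ipiv;
    magmaFloatComplex *A;
    if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &A, (size_t)lda*n ) )
        return MAGMA_ERR_HOST_ALLOC;
    magma_imalloc_cpu( &ipiv, min(m,n) );
    /* ... fill A with the matrix to factor ... */
    magma_cgetrf_m( ngpu, m, n, A, lda, ipiv, &info );  /* A = P*L*U; L and U overwrite A */
    magma_free_cpu( ipiv );
    magma_free_pinned( A );
    return info;
}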
extern "C" magma_int_t magma_cpotrf2_msub( magma_int_t num_subs, magma_int_t num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaFloatComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, magmaFloatComplex_ptr *d_lP, magma_int_t lddp, magmaFloatComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t *queues, magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ int tot_subs = num_subs*num_gpus; magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; magmaFloatComplex_ptr dlpanel; size_t dlpanel_offset; magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel; // initialize trace trace_init(1, num_gpus, 2, queues); *info = 0; if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) { *info = -1; } else if (n < 0) { *info = -2; } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) { *info = -4; } else if ((uplo == MagmaUpper) && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for (d=0; d<tot_subs; d++) { /* local-n and local-ld */ if (uplo == MagmaUpper) { n_local[d] = ((n/nb)/tot_subs)*nb; if (d < (n/nb)%tot_subs) n_local[d] += nb; else if (d == (n/nb)%tot_subs) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/tot_subs)*nb; if (d < (m/nb)%tot_subs) n_local[d] += nb; else if (d == (m/nb)%tot_subs) n_local[d] += m%nb; } } /* Use blocked code. */ if (uplo == MagmaUpper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. 
*/ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%tot_subs; /* Set the local index where the current panel is */ j_local = j/(nb*tot_subs); jb = min(nb, (m-j)); if (j > 0) { // Wait for the column on CPU magma_queue_sync(queues[2*(id%num_gpus)]); /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d != id%num_gpus) { magma_csetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,id%num_gpus), lddp, queues[2*d], trace_gpu_event(d, 0, "set", "set-col") ); } d = (d+1)%num_gpus; } /* Update the current diagonal block */ trace_gpu_start(id%num_gpus, 1, "herk", "herk"); magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda, queues[2*(id%num_gpus)+1]); magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for syrk } /* Send the diagonal to cpu */ magma_cgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get-diag") ); if (j > 0) { /* Compute the local block column of the panel. */ d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { j_local2 = j_local+1; if (d > id) j_local2 --; nb0 = nb*j_local2; if (n_local[d] > nb0) { if (d%num_gpus != id%num_gpus) { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlP_offset(jb, 0, id%num_gpus); ldpanel = lddp; /* Wait for the offdiagonal column */ if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); } else { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(0, nb*j_local); ldpanel = ldda; } /* update the panel */ trace_gpu_start(d%num_gpus, 1, "gemm", "gemm"); magma_cgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } /* factor the diagonal */ magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal trace_cpu_start(0, "potrf", "potrf"); lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end(0); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ((j+jb) < n) { d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, id%num_gpus); ldpanel = lddp; } magma_csetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[2*d], trace_gpu_event(d, 0, "set", "set-diag")); d = (d+1)%num_gpus; } } else { magma_csetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") ); } /* panel-factorize the off-diagonal */ if ((j+jb) < n) { d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { /* next column */ j_local2 = j_local+1; if (d > id) j_local2--; if (d%num_gpus == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlP_offset(0, 0, id%num_gpus); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2); if (dd < num_gpus) magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal if (j+jb < m && d == (j/nb+1)%tot_subs) { /* owns the next column, look-ahead the column */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, 
jb, nb0, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*(d%num_gpus)+1] ); /* send the column to cpu */ magma_queue_sync(queues[2*(d%num_gpus)+1]); // wait for lookahead magma_cgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get-col") ); /* update the remaining blocks */ nb2 = nb2 - nb0; trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, queues[2*(d%num_gpus)+1] ); } else if (nb2 > 0) { /* update the entire trailing matrix */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*(d%num_gpus)+1] ); } d = (d+1)%tot_subs; } } } } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%tot_subs; /* Set the local index where the current panel is */ j_local = j/(nb*tot_subs); jb = min(nb, (n-j)); if (j > 0) { if (num_gpus > 1) { // Wait for the row on CPU to broadcast magma_queue_sync(queues[2*(id%num_gpus)]); } /* broadcast off-diagonal row to all the GPUs */ d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d != id%num_gpus) { /* send it to GPU-d */ magma_csetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,id%num_gpus), nb, queues[2*d], trace_gpu_event(d, 0, "set", "set-row") ); } d = (d+1)%num_gpus; } /* Update the current diagonal block */ trace_gpu_start(id%num_gpus, 1, "herk", "herk"); magma_cherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda, queues[2*(id%num_gpus)+1]); magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for syrk } /* send the diagonal to cpu */ magma_cgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get") ); /* update the offdiagonal blocks */ if (j > 0) { /* compute the block-rows of the panel */ d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { j_local2 = j_local+1; if (d > id) j_local2 --; nb0 = nb*j_local2; if (nb0 < n_local[d]) { if (d%num_gpus != id%num_gpus) { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlPT_offset(0, jb, id%num_gpus); ldpanel = nb; /* Wait for offdiagonal row */ if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); } else { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, 0); ldpanel = ldda; } /* Update the panel */ trace_gpu_start(d%num_gpus, 1, "gemm", "gemm"); magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, dlpanel_offset, ldpanel, c_one, dlA(d, nb0, j), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } /* factor the diagonal */ magma_queue_sync( queues[2*(id%num_gpus)] ); trace_cpu_start(0, "potrf", "potrf"); lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); trace_cpu_end(0); if (*info != 0) { printf( " cpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",*info,id,j,j_local,jb ); *info = *info + j; break; } /* send the diagonal to gpus */ if ((j+jb) < m) { d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d == id%num_gpus) { dlpanel = d_lA[id]; 
dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, id%num_gpus); ldpanel = nb; } magma_csetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[2*d], trace_gpu_event(d, 0, "set", "set-diag") ); d = (d+1)%num_gpus; } } else { magma_csetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") ); } /* factorize off-diagonal blocks */ if ((j+jb) < m) { d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { /* next column */ j_local2 = j_local+1; if (d > id) j_local2--; if (d%num_gpus == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlPT_offset(0, 0, id%num_gpus); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); // wait for the diagonal if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); if (j+jb < n && d == (j/nb+1)%tot_subs) { /* owns the next column, look-ahead the column */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[2*(d%num_gpus)+1]); /* send the column to cpu */ magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead magma_cgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get") ); /* update the remaining blocks */ nb2 = nb2 - nb0; trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, queues[2*(d%num_gpus)+1]); } else if (nb2 > 0) { /* update the entire trailing matrix */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } } } /* end of else not upper */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); } trace_finalize("cpotrf_msub.svg", "trace.css"); return *info; } /* magma_cpotrf2_msub */
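/* Illustration (not from the original source): the 1-D block-cyclic layout assumed above.
   With tot_subs = num_subs*num_gpus sub-matrices and block size nb, block column j resides
   on sub-matrix (j/nb) % tot_subs, and the local column counts n_local[] computed at the
   top of the routine come out as: */
static void example_block_cyclic_counts(
    magma_int_t n, magma_int_t nb, magma_int_t tot_subs, magma_int_t *n_local )
{
    for (magma_int_t d = 0; d < tot_subs; d++) {
        n_local[d] = ((n/nb)/tot_subs)*nb;       /* complete rounds of blocks */
        if (d < (n/nb)%tot_subs)
            n_local[d] += nb;                    /* one more full block */
        else if (d == (n/nb)%tot_subs)
            n_local[d] += n%nb;                  /* the trailing partial block */
    }
}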
extern "C" magma_int_t magma_cpotrs_gpu(char uplo, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dB, magma_int_t lddb, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRS solves a system of linear equations A*X = B with a Hermitian positive definite matrix A using the Cholesky factorization A = U**H*U or A = L*L**H computed by CPOTRF. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of A is stored; = 'L': Lower triangle of A is stored. N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. dA (input) COMPLEX array on the GPU, dimension (LDDA,N) The triangular factor U or L from the Cholesky factorization A = U**H*U or A = L*L**H, as computed by CPOTRF. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,N). dB (input/output) COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDDB (input) INTEGER The leading dimension of the array B. LDDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ magmaFloatComplex c_one = MAGMA_C_ONE; *info = 0 ; if( (uplo != 'U') && (uplo != 'u') && (uplo != 'L') && (uplo != 'l') ) *info = -1; if( n < 0 ) *info = -2; if( nrhs < 0) *info = -3; if ( ldda < max(1, n) ) *info = -5; if ( lddb < max(1, n) ) *info = -7; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( (n == 0) || (nrhs == 0) ) { return *info; } if( (uplo=='U') || (uplo=='u') ){ if ( nrhs == 1) { magma_ctrsv(MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } else{ if ( nrhs == 1) { magma_ctrsv(MagmaLower, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaLower, MagmaConjTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); magma_ctrsm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb); } } return *info; }
/** Purpose ------- CGEGQR orthogonalizes the N vectors given by a complex M-by-N matrix A: A = Q * R. On exit, if successful, the orthogonal vectors Q overwrite A and R is given in work (on the CPU memory). The routine is designed for tall-and-skinny matrices: M >> N, N <= 128. This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. Arguments --------- @param[in] ikind INTEGER Several versions are implemented, indicated by the ikind value: 1: This version uses normal equations and SVD in an iterative process that makes the computation numerically accurate. 2: This version uses a standard LAPACK-based orthogonalization through MAGMA's QR panel factorization (magma_cgeqr2x3_gpu) and magma_cungqr. 3: MGS. 4: Cholesky QR. @param[in] m INTEGER The number of rows of the matrix A. m >= n >= 0. @param[in] n INTEGER The number of columns of the matrix A. 128 >= n >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (ldda,n) On entry, the m-by-n matrix A. On exit, the m-by-n matrix Q with orthogonal columns. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,m). To benefit from coalesced memory accesses LDDA must be divisible by 16. @param dwork (GPU workspace) COMPLEX array, dimension: n^2 for ikind = 1 3 n^2 + min(m, n) for ikind = 2 0 (not used) for ikind = 3 n^2 for ikind = 4 @param[out] work (CPU workspace) COMPLEX array, dimension 3 n^2. On exit, work(1:n^2) holds the rectangular matrix R. Preferably, for higher performance, work should be in pinned memory. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. @ingroup magma_cgeqrf_comp ********************************************************************/ extern "C" magma_int_t magma_cgegqr_gpu( magma_int_t ikind, magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dwork, magmaFloatComplex *work, magma_int_t *info ) { #define work(i_,j_) (work + (i_) + (j_)*n) #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magma_int_t i = 0, j, k, n2 = n*n; magma_int_t ione = 1; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; float cn = 200., mins, maxs; /* check arguments */ *info = 0; if (ikind < 1 || ikind > 4) { *info = -1; } else if (m < 0 || m < n) { *info = -2; } else if (n < 0 || n > 128) { *info = -3; } else if (ldda < max(1,m)) { *info = -5; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (ikind == 1) { // === Iterative, based on SVD ============================================================ magmaFloatComplex *U, *VT, *vt, *R, *G, *hwork, *tau; float *S; R = work; // Size n * n G = R + n*n; // Size n * n VT = G + n*n; // Size n * n magma_cmalloc_cpu( &hwork, 32 + 2*n*n + 2*n); if ( hwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_int_t lwork=n*n+32; // First part of hwork; used as workspace in svd U = hwork + n*n + 32; // Size n*n S = (float *)(U+n*n); // Size n tau = U + n*n + n; // Size n #if defined(PRECISION_c) || defined(PRECISION_z) float *rwork; magma_smalloc_cpu( &rwork, 5*n); if ( rwork == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif do { i++; magma_cgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_cgetmatrix(n, n, dwork, n, G, n); #if defined(PRECISION_s) || defined(PRECISION_d) lapackf77_cgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, info); 
#else lapackf77_cgesvd("n", "a", &n, &n, G, &n, S, U, &n, VT, &n, hwork, &lwork, rwork, info); #endif mins = 100.f, maxs = 0.f; for (k=0; k < n; k++) { S[k] = magma_ssqrt( S[k] ); if (S[k] < mins) mins = S[k]; if (S[k] > maxs) maxs = S[k]; } for (k=0; k < n; k++) { vt = VT + k*n; for (j=0; j < n; j++) vt[j] *= S[j]; } lapackf77_cgeqrf(&n, &n, VT, &n, tau, hwork, &lwork, info); if (i == 1) blasf77_ccopy(&n2, VT, &ione, R, &ione); else blasf77_ctrmm("l", "u", "n", "n", &n, &n, &c_one, VT, &n, R, &n); magma_csetmatrix(n, n, VT, n, dwork, n); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); if (mins > 0.00001f) cn = maxs/mins; //fprintf(stderr, "Iteration %d, cond num = %f \n", i, cn); } while (cn > 10.f); magma_free_cpu( hwork ); #if defined(PRECISION_c) || defined(PRECISION_z) magma_free_cpu( rwork ); #endif // ================== end of ikind == 1 =================================================== } else if (ikind == 2) { // ================== LAPACK based =================================================== magma_int_t min_mn = min(m, n); magma_int_t nb = n; magmaFloatComplex *dtau = dwork + 2*n*n, *d_T = dwork, *ddA = dwork + n*n; magmaFloatComplex *tau = work+n*n; magmablas_claset( MagmaFull, n, n, c_zero, c_zero, d_T, n ); magma_cgeqr2x3_gpu(&m, &n, dA, &ldda, dtau, d_T, ddA, (float *)(dwork+min_mn+2*n*n), info); magma_cgetmatrix( min_mn, 1, dtau, min_mn, tau, min_mn); magma_cgetmatrix( n, n, ddA, n, work, n); magma_cungqr_gpu( m, n, n, dA, ldda, tau, d_T, nb, info ); // ================== end of ikind == 2 =================================================== } else if (ikind == 3) { // ================== MGS =================================================== for(magma_int_t j = 0; j<n; j++){ for(magma_int_t i = 0; i<j; i++){ *work(i, j) = magma_cdotc(m, dA(0,i), 1, dA(0,j), 1); magma_caxpy(m, -(*work(i,j)), dA(0,i), 1, dA(0,j), 1); } for(magma_int_t i = j; i<n; i++) *work(i, j) = MAGMA_C_ZERO; //*work(j,j) = MAGMA_C_MAKE( magma_scnrm2(m, dA(0,j), 1), 0. ); *work(j,j) = magma_cdotc(m, dA(0,j), 1, dA(0,j), 1); *work(j,j) = MAGMA_C_MAKE( sqrt(MAGMA_C_REAL( *work(j,j) )), 0.); magma_cscal(m, 1./ *work(j,j), dA(0,j), 1); } // ================== end of ikind == 3 =================================================== } else if (ikind == 4) { // ================== Cholesky QR =================================================== magma_cgemm(MagmaConjTrans, MagmaNoTrans, n, n, m, c_one, dA, ldda, dA, ldda, c_zero, dwork, n ); magma_cgetmatrix(n, n, dwork, n, work, n); lapackf77_cpotrf("u", &n, work, &n, info); magma_csetmatrix(n, n, work, n, dwork, n); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, n, c_one, dwork, n, dA, ldda); // ================== end of ikind == 4 =================================================== } return *info; } /* magma_cgegqr_gpu */
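/* Usage sketch (illustrative, not part of the original source): orthogonalizing a
   tall-and-skinny panel with the Cholesky-QR variant (ikind = 4). Workspace sizes follow
   the documentation above: n^2 on the GPU and 3*n^2 on the CPU, with R returned in
   work(1:n^2). */
static magma_int_t example_cgegqr(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t info;
    magmaFloatComplex *dwork, *work;
    if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, (size_t)n*n ) )
        return MAGMA_ERR_DEVICE_ALLOC;
    if ( MAGMA_SUCCESS != magma_cmalloc_pinned( &work, (size_t)3*n*n ) ) {
        magma_free( dwork );
        return MAGMA_ERR_HOST_ALLOC;
    }
    magma_cgegqr_gpu( 4, m, n, dA, ldda, dwork, work, &info );  /* dA <- Q, work <- R */
    magma_free( dwork );
    magma_free_pinned( work );
    return info;
}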
extern "C" magma_int_t magma_ctrtri_gpu( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_queue_t queues[2], magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= CTRTRI computes the inverse of a real upper or lower triangular matrix dA. This is the Level 3 BLAS version of the algorithm. Arguments ========= UPLO (input) CHARACTER*1 = 'U': A is upper triangular; = 'L': A is lower triangular. DIAG (input) CHARACTER*1 = 'N': A is non-unit triangular; = 'U': A is unit triangular. N (input) INTEGER The order of the matrix A. N >= 0. dA (input/output) COMPLEX array ON THE GPU, dimension (LDDA,N) On entry, the triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of the array dA contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of the array dA contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, dA(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. (Singularity check is currently disabled.) ===================================================================== */ /* Local variables */ magma_int_t nb, nn, j, jb; //magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *work; int upper = (uplo == MagmaUpper); int nounit = (diag == MagmaNonUnit); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (! 
nounit && diag != MagmaUnit) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Check for singularity if non-unit */ /* cannot do here with matrix dA on GPU -- need kernel */ /* if (nounit) { for (j=0; j < n; ++j) { if ( MAGMA_C_EQUAL( *dA(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } */ /* Determine the block size for this environment */ nb = magma_get_cpotrf_nb(n); /* Create Queues */ //magma_queue_t queues[2]; //magma_device_t device[MagmaMaxGPUs]; //magma_int_t num = 0; //magma_int_t err; // //err = magma_getdevices( device, MagmaMaxGPUs, &num ); //if ( err != 0 || num < 1 ) { // fprintf( stderr, "magma_getdevices failed: %d\n", err ); // exit(-1); //} //err = magma_queue_create( device[0], &queues[0] ); //if ( err != 0 ) { // fprintf( stderr, "magma_queue_create 0 failed: %d\n", err ); // exit(-1); //} //err = magma_queue_create( device[0], &queues[1] ); //if ( err != 0 ) { // fprintf( stderr, "magma_queue_create 1 failed: %d\n", err ); // exit(-1); //} if (MAGMA_SUCCESS != magma_cmalloc_cpu( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (nb <= 1 || nb >= n) { magma_cgetmatrix( n, n, dA, dA_offset, ldda, work, n, queues[0] ); lapackf77_ctrtri( lapack_const(uplo), lapack_const(diag), &n, work, &n, info ); magma_csetmatrix( n, n, work, n, dA, dA_offset, ldda, queues[0] ); } else { if (upper) { /* Compute inverse of upper triangular matrix */ for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); /* Compute rows 1:j-1 of current block column */ magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j), ldda, queues[0] ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j), ldda, queues[0] ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[1], NULL ); magma_queue_sync( queues[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri( MagmaUpperStr, lapack_const(diag), &jb, work, &jb, info ); /* magma_csetmatrix_async( jb, jb, work, 0, jb, dA(j, j), ldda, queues[0], NULL ); */ magma_csetmatrix( jb, jb, work, jb, dA(j, j), ldda, queues[0] ); } } else { /* Compute inverse of lower triangular matrix */ nn = ((n-1)/nb)*nb+1; for(j=nn-1; j >= 0; j -= nb) { jb = min(nb,(n-j)); if((j+jb) < n) { /* Compute rows j+jb:n of current block column */ magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda, queues[0] ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda, queues[0] ); } magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[1], NULL ); magma_queue_sync( queues[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri( MagmaLowerStr, lapack_const(diag), &jb, work, &jb, info ); /* magma_csetmatrix_async( jb, jb, work, 0, jb, dA(j, j), ldda, queues[0], NULL ); */ magma_csetmatrix( jb, jb, work, jb, dA(j, j), ldda, queues[0] ); } } } //magma_queue_destroy( queues[0] ); //magma_queue_destroy( queues[1] ); magma_free_cpu( work ); return *info; }
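/*
   Illustrative setup sketch (not part of the library): the routine above
   expects two ready queues on one device, and its commented-out code shows the
   intended clMAGMA setup. This mirrors that setup in a callable form; the
   function name and error handling are hypothetical.
*/
static magma_int_t example_make_queues( magma_queue_t queues[2] )
{
    magma_device_t device[ MagmaMaxGPUs ];
    magma_int_t num = 0;
    // enumerate devices; fail if none is available
    magma_int_t err = magma_getdevices( device, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 )
        return err;
    // queue 0 drives the compute kernels
    err = magma_queue_create( device[0], &queues[0] );
    if ( err != 0 )
        return err;
    // queue 1 is used for the asynchronous diagonal-block copies
    err = magma_queue_create( device[0], &queues[1] );
    return err;
}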
/** Purpose ------- Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by CGEQRF3_GPU. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of columns of the matrix C. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by CGEQRF3_GPU in the first n columns of its array argument A. @param[in] ldda INTEGER The leading dimension of the array A, LDDA >= M. @param[in] tau COMPLEX array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_CGEQRF_GPU. @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. @param[in] dT COMPLEX array that is the output (the 6th argument) of magma_cgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block matrices for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= M. @param[out] hwork (workspace) COMPLEX array, dimension (LWORK) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_cgeqrf_nb( M ). \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgels_comp ********************************************************************/ extern "C" magma_int_t magma_cgeqrs3_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *tau, magmaFloatComplex *dT, magmaFloatComplex *dB, magma_int_t lddb, magmaFloatComplex *hwork, magma_int_t lwork, magma_int_t *info) { #define dA(a_1,a_2) (dA + (a_2)*(ldda) + (a_1)) #define dT(a_1) (dT + (lddwork+(a_1))*nb) magmaFloatComplex c_one = MAGMA_C_ONE; magma_int_t k, lddwork; magma_int_t nb = magma_get_cgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_C_MAKE( (float)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -8; else if (lwork < lwkopt && ! lquery) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } lddwork = k; /* B := Q' * B */ magma_cunmqr_gpu( MagmaLeft, Magma_ConjTrans, m, nrhs, n, dA(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) 1. Move the (k-1)/nb block diagonal submatrices from dT to R 2. Solve 3. 
Restore the data format moving data from R back to dT */ magmablas_cswapdblk(k-1, nb, dA(0,0), ldda, 1, dT(0), nb, 0); if ( nrhs == 1 ) { magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA(0,0), ldda, dB, 1); } else { magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA(0,0), ldda, dB, lddb); } magmablas_cswapdblk(k-1, nb, dT(0), nb, 0, dA(0,0), ldda, 1); return *info; }
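/*
   Illustrative usage sketch (not part of the library): chaining a GPU QR
   factorization with the solver above. It assumes tau (size min(m,n)) and dT
   (sized by the documented formula) were produced by magma_cgeqrf3_gpu, and
   sizes hwork by the documented minimum. All names are hypothetical.
*/
static magma_int_t example_qr_least_squares( magma_int_t m, magma_int_t n, magma_int_t nrhs,
                                             magmaFloatComplex *dA, magma_int_t ldda,
                                             magmaFloatComplex *tau, magmaFloatComplex *dT,
                                             magmaFloatComplex *dB, magma_int_t lddb )
{
    magma_int_t info;
    magma_int_t nb    = magma_get_cgeqrf_nb( m );
    // documented minimum: LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB
    magma_int_t lwork = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    magmaFloatComplex *hwork;
    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &hwork, lwork ))
        return MAGMA_ERR_HOST_ALLOC;
    // overwrites dB(1:n,1:nrhs) with the least-squares solution X
    magma_cgeqrs3_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb,
                       hwork, lwork, &info );
    magma_free_pinned( hwork );
    return info;
}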
/** Purpose ------- ZCGETRS solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by MAGMA_CGETRF_GPU. B and X are in COMPLEX_16, and A is in COMPLEX. This routine is used in the mixed precision iterative solver magma_zcgesv. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The factors L and U from the factorization A = P*L*U as computed by CGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[in] dipiv INTEGER array on the GPU, dimension (N) The pivot indices; for 1 <= i <= N, after permuting, row i of the matrix was moved to row dIPIV(i). Note this is different from IPIV from CGETRF, where interchanges are applied one after another. @param[in] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. @param[in] lddb INTEGER The leading dimension of the arrays X and B. LDDB >= max(1,N). @param[out] dX COMPLEX_16 array on the GPU, dimension (LDDX, NRHS) On exit, the solution matrix dX. @param[in] lddx INTEGER The leading dimension of the array dX, LDDX >= max(1,N). @param dSX (workspace) COMPLEX array on the GPU used as workspace, dimension (N, NRHS) @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zcgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaInt_ptr dipiv, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex_ptr dX, magma_int_t lddx, magmaFloatComplex_ptr dSX, magma_int_t *info) { /* Constants */ magmaFloatComplex c_one = MAGMA_C_ONE; /* Local variables */ bool notran = (trans == MagmaNoTrans); magma_int_t inc; magma_int_t lddsx = n; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < n) { *info = -5; } else if (lddb < n) { *info = -8; } else if (lddx < n) { *info = -10; } // I think this is resolved, but it is unclear what the issue ever was. //else if (lddx != lddb) { /* TODO: remove it when zclaswp will have the correct interface */ // *info = -10; //} if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if (notran) { inc = 1; /* Get X by row applying interchanges to B and cast to single */ /* * TODO: clean zclaswp interface to have interface closer to zlaswp */ magmablas_zclaswp( nrhs, dB, lddb, dSX, lddsx, n, dipiv, inc, queue ); /* Solve L*X = B, overwriting B with SX. */ magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dSX, lddsx, queue ); /* Solve U*X = B, overwriting B with X. 
*/ magma_ctrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dSX, lddsx, queue ); magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, queue, info ); } else { inc = -1; /* Cast the COMPLEX_16 RHS to COMPLEX */ magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, queue, info ); /* Solve A**T * X = B, or A**H * X = B */ magma_ctrsm( MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dSX, lddsx, queue ); magma_ctrsm( MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dSX, lddsx, queue ); magmablas_zclaswp( nrhs, dX, lddx, dSX, lddsx, n, dipiv, inc, queue ); } magma_queue_destroy( queue ); return *info; } /* magma_zcgetrs */
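/*
   Illustrative sketch (not part of the library): the precision conversions
   used above. Demoting COMPLEX_16 data to COMPLEX with magmablas_zlag2c can
   fail when an entry exceeds single-precision range, which it reports through
   info; promoting back with magmablas_clag2z is always exact. Names here are
   hypothetical.
*/
static magma_int_t example_demote_promote( magma_int_t n, magma_int_t nrhs,
                                           magmaDoubleComplex_ptr dB, magma_int_t lddb,
                                           magmaFloatComplex_ptr  dSX, magma_int_t lddsx,
                                           magmaDoubleComplex_ptr dX, magma_int_t lddx )
{
    magma_int_t info = 0;
    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    // COMPLEX_16 -> COMPLEX; info != 0 flags entries too large for single precision
    magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, queue, &info );
    if (info == 0) {
        // COMPLEX -> COMPLEX_16 (exact)
        magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, queue, &info );
    }
    magma_queue_destroy( queue );
    return info;
}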
extern "C" magma_int_t magma_cgetrf2_msub( magma_int_t num_subs, magma_int_t ngpu, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, magmaFloatComplex_ptr *d_lAT, size_t dlAT_offset, magma_int_t lddat, magma_int_t *ipiv, magmaFloatComplex_ptr *d_panel, magmaFloatComplex_ptr *d_lAP, size_t dlAP_offset, magmaFloatComplex *w, magma_int_t ldw, magma_queue_t *queues, magma_int_t *info) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define d_lAT(id,i,j) d_lAT[(id)], (((offset)+(i)*nb)*lddat + (j)*nb) #define d_lAT_offset(i, j) (((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w +((j)%(1+ngpu))*nb*ldw) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t tot_subs = num_subs * ngpu; magma_int_t block_size = 32; magma_int_t iinfo, maxm, mindim; magma_int_t i, j, d, dd, rows, cols, s; magma_int_t id, j_local, j_local2, nb0, nb1; /* local submatrix info */ magma_int_t ldpan[MagmaMaxSubs * MagmaMaxGPUs], n_local[MagmaMaxSubs * MagmaMaxGPUs]; size_t dpanel_local_offset[MagmaMaxSubs * MagmaMaxGPUs]; magmaFloatComplex_ptr dpanel_local[MagmaMaxSubs * MagmaMaxGPUs]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (tot_subs*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); if (tot_subs > ceil((float)n/nb)) { *info = -1; return *info; } else { /* Use hybrid blocked code. 
*/ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for (i=0; i < tot_subs; i++) { n_local[i] = ((n/nb)/tot_subs)*nb; if (i < (n/nb)%tot_subs) n_local[i] += nb; else if (i == (n/nb)%tot_subs) n_local[i] += n%nb; } /* start sending the first panel to cpu */ nb0 = min(mindim, nb); magmablas_ctranspose( nb0, maxm, d_lAT(0,0,0), lddat, d_lAP[0], dlAP_offset, maxm, queues[2*0+1] ); magma_cgetmatrix_async( m, nb0, d_lAP[0], dlAP_offset, maxm, W(0), ldw, queues[2*0+1], NULL ); clFlush(queues[2*0+1]); /* ------------------------------------------------------------------------------------- */ s = mindim / nb; for (j=0; j < s; j++) { /* Set the submatrix ID that holds the current panel */ id = j%tot_subs; /* Set the local index where the current panel is */ j_local = j/tot_subs; // cols for gpu panel cols = maxm - j*nb; // rows for cpu panel rows = m - j*nb; /* synchronize j-th panel from id-th gpu into work */ magma_queue_sync( queues[2*(id%ngpu)+1] ); /* j-th panel factorization */ lapackf77_cgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo); if ((*info == 0) && (iinfo > 0)) { *info = iinfo + j*nb; //break; } /* start sending the panel to all the gpus */ d = (j+1)%ngpu; for (dd=0; dd < ngpu; dd++) { magma_csetmatrix_async( rows, nb, W(j), ldw, d_lAP[d], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, queues[2*d+1], NULL ); d = (d+1)%ngpu; } /* apply the pivoting */ for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } d = (j+1)%tot_subs; for (dd=0; dd < tot_subs; dd++) { magmablas_claswp( lddat, d_lAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[2*(d%ngpu)] ); d = (d+1)%tot_subs; } /* update the trailing-matrix/look-ahead */ d = (j+1)%tot_subs; for (dd=0; dd < tot_subs; dd++) { /* storage for panel */ if (d%ngpu == id%ngpu) { /* the panel belongs to this gpu */ dpanel_local[d] = d_lAT[id]; dpanel_local_offset[d] = d_lAT_offset(j, j_local); ldpan[d] = lddat; /* next column */ j_local2 = j_local; if ( d <= id ) j_local2++; } else { /* the panel belongs to another gpu */ dpanel_local[d] = d_panel[d%ngpu]; dpanel_local_offset[d] = (j%(2+ngpu))*nb*maxm; ldpan[d] = nb; /* next column */ j_local2 = j_local; if ( d < id ) j_local2++; } /* the size of the next column */ if (s > (j+1)) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/tot_subs); if (d < s%tot_subs) nb0 -= nb; } if (d == (j+1)%tot_subs) { /* owns the next column, look-ahead the column */ nb1 = nb0; } else { /* update the entire trailing matrix */ nb1 = n_local[d] - j_local2*nb; } /* gpu updating the trailing matrix */ if (d == (j+1)%tot_subs) { /* look-ahead, this is executed first (i.e., dd=0) */ magma_queue_sync(queues[2*(d%ngpu)]); /* pivoting done? (overwrite with panel) */ magmablas_ctranspose( cols, nb, d_lAP[d%ngpu], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, dpanel_local[d], dpanel_local_offset[d], ldpan[d], queues[2*(d%ngpu)+1] ); magma_queue_sync(queues[2*(d%ngpu)+1]); /* panel arrived and transposed for remaining update? */ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], d_lAT(d, j, j_local2), lddat, queues[2*(d%ngpu)+1]); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(j+1)*nb, nb, c_neg_one, d_lAT(d, j, j_local2), lddat, dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, d_lAT(d, j+1, j_local2), lddat, queues[2*(d%ngpu)+1]); } else { /* no look-ahead */ if (dd < ngpu) { /* sync and transpose only the first time */ magma_queue_sync(queues[2*(d%ngpu)+1]); /* panel arrived? 
*/ magmablas_ctranspose( cols, nb, d_lAP[d%ngpu], dlAP_offset+(j%(2+ngpu))*nb*maxm, maxm, dpanel_local[d], dpanel_local_offset[d], ldpan[d], queues[2*(d%ngpu)] ); } magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], d_lAT(d, j, j_local2), lddat, queues[2*(d%ngpu)]); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(j+1)*nb, nb, c_neg_one, d_lAT(d, j, j_local2), lddat, dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, d_lAT(d, j+1, j_local2), lddat, queues[2*(d%ngpu)]); } if (d == (j+1)%tot_subs) { /* Set the local index where the current panel is */ int loff = j+1; int j_local = (j+1)/tot_subs; int ldda = maxm - (j+1)*nb; int cols = m - (j+1)*nb; nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */ if (nb0 > 0) { /* transpose the panel for sending it to cpu */ magmablas_ctranspose( nb0, ldda, d_lAT(d,loff,j_local), lddat, d_lAP[d%ngpu], dlAP_offset + ((j+1)%(2+ngpu))*nb*maxm, ldda, queues[2*(d%ngpu)+1] ); /* send the panel to cpu */ magma_cgetmatrix_async( cols, nb0, d_lAP[d%ngpu], dlAP_offset + ((j+1)%(2+ngpu))*nb*maxm, ldda, W(j+1), ldw, queues[2*(d%ngpu)+1], NULL ); } } else { //trace_gpu_end( d, 0 ); } d = (d+1)%tot_subs; } /* update the remaining matrix by gpu owning the next panel */ if ((j+1) < s) { d = (j+1)%tot_subs; int j_local = (j+1)/tot_subs; int rows = m - (j+1)*nb; magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (j_local+1)*nb, nb, c_one, dpanel_local[d], dpanel_local_offset[d], ldpan[d], d_lAT(d,j,j_local+1), lddat, queues[2*(d%ngpu)] ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(j_local+1)*nb, rows, nb, c_neg_one, d_lAT(d,j,j_local+1), lddat, dpanel_local[d], dpanel_local_offset[d]+nb*ldpan[d], ldpan[d], c_one, d_lAT(d,j+1, j_local+1), lddat, queues[2*(d%ngpu)] ); } } /* end of for j=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%tot_subs; /* Set the local index where the last panel is */ j_local = s/tot_subs; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if (nb0 > 0) { /* wait for the last panel on cpu */ magma_queue_sync( queues[2*(id%ngpu)+1] ); /* factor on cpu */ lapackf77_cgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for (d=0; d < ngpu; d++) { magma_csetmatrix_async( rows, nb0, W(s), ldw, d_lAP[d], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, queues[2*d+1], NULL ); } for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } for (d=0; d < tot_subs; d++) { magmablas_claswp( lddat, d_lAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[2*(d%ngpu)] ); } d = id; for (dd=0; dd < tot_subs; dd++) { /* wait for the pivoting to be done */ if (dd < ngpu) { /* sync only the first time */ magma_queue_sync( queues[2*(d%ngpu)] ); } j_local2 = j_local; if (d%ngpu == id%ngpu) { /* the panel belongs to this gpu */ dpanel_local[d] = d_lAT[id]; dpanel_local_offset[d] = d_lAT_offset(s, j_local); if (dd < ngpu) { magmablas_ctranspose( rows, nb0, d_lAP[d%ngpu], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, dpanel_local[d], dpanel_local_offset[d], lddat, queues[2*(d%ngpu)+1] ); } /* size of the "extra" block */ if (d == id) { /* the last diagonal block belongs to this submatrix */ nb1 = nb0; } else if (d < id) { nb1 = nb; } else { nb1 = 0; } if (n_local[d] > j_local*nb+nb1) { magma_ctrsm( MagmaRight, 
MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (j_local*nb+nb1), nb0, c_one, dpanel_local[d], dpanel_local_offset[d], lddat, d_lAT(d, s, j_local)+nb1, lddat, queues[2*(d%ngpu)+1]); } } else if (n_local[d] > j_local2*nb) { /* the panel belongs to another gpu */ dpanel_local[d] = d_panel[d%ngpu]; dpanel_local_offset[d] = (s%(2+ngpu))*nb*maxm; /* next column */ if (d < ngpu) { /* transpose only the first time */ magmablas_ctranspose( rows, nb0, d_lAP[d%ngpu], dlAP_offset+(s%(2+ngpu))*nb*maxm, cols, dpanel_local[d], dpanel_local_offset[d], nb, queues[2*(d%ngpu)+1] ); } if (d < id) j_local2++; nb1 = n_local[d] - j_local2*nb; magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, dpanel_local[d], dpanel_local_offset[d], nb, d_lAT(d,s,j_local2), lddat, queues[2*(d%ngpu)+1]); } d = (d+1)%tot_subs; } } /* if( nb0 > 0 ) */ /* clean up */ for (d=0; d < ngpu; d++) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); } } return *info; /* End of MAGMA_CGETRF2_MSUB */ }
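/*
   Illustrative sketch (not part of the library): how the routine above splits
   the n columns into nb-wide blocks distributed round-robin over
   tot_subs = num_subs*ngpu submatrices, i.e. the n_local[] initialization.
   Pure host code; the driver function is hypothetical.
*/
static void example_block_cyclic( magma_int_t n, magma_int_t nb,
                                  magma_int_t tot_subs, magma_int_t *n_local )
{
    for (magma_int_t i = 0; i < tot_subs; i++) {
        n_local[i] = ((n/nb)/tot_subs)*nb;       // full rounds of nb-blocks
        if (i < (n/nb)%tot_subs)
            n_local[i] += nb;                    // one extra full block
        else if (i == (n/nb)%tot_subs)
            n_local[i] += n%nb;                  // the ragged last block
    }
}
/* e.g. n = 10, nb = 3, tot_subs = 2 gives n_local = {6, 4}: blocks 0 and 2 land
   on submatrix 0; blocks 1 and 3 (the last one of width 1) on submatrix 1. */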
/** Purpose ------- CTRTRI computes the inverse of a complex upper or lower triangular matrix A. This is the Level 3 BLAS version of the algorithm. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: A is upper triangular; - = MagmaLower: A is lower triangular. @param[in] diag magma_diag_t - = MagmaNonUnit: A is non-unit triangular; - = MagmaUnit: A is unit triangular. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the triangular matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = MagmaUnit, the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, A(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_ctrtri( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i_, j_) ( A + (i_) + (j_)*lda ) #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif // Constants const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; const char* uplo_ = lapack_uplo_const( uplo ); const char* diag_ = lapack_diag_const( diag ); // Local variables magma_int_t ldda, nb, nn, j, jb; magmaFloatComplex_ptr dA; bool upper = (uplo == MagmaUpper); bool nounit = (diag == MagmaNonUnit); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (! 
nounit && diag != MagmaUnit) *info = -2; else if (n < 0) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } // Quick return if ( n == 0 ) return *info; // Check for singularity if non-unit if (nounit) { for (j=0; j < n; ++j) { if ( MAGMA_C_EQUAL( *A(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } // Determine the block size for this environment nb = magma_get_cpotrf_nb( n ); ldda = magma_roundup( n, 32 ); if (MAGMA_SUCCESS != magma_cmalloc( &dA, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); //magma_queue_create( cdev, &queues[1] ); // unused if (nb <= 1 || nb >= n) { lapackf77_ctrtri( uplo_, diag_, &n, A, &lda, info ); } else if (upper) { // Compute inverse of upper triangular matrix for (j=0; j < n; j += nb) { jb = min( nb, n-j ); if (j > 0) { // Send current block column (with diagonal) to device // This must finish before trtri below magma_csetmatrix( j+jb, jb, A(0,j), lda, dA(0,j), ldda, queues[0] ); // Compute rows 0:j of current block column magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, diag, j, jb, c_one, dA(0,0), ldda, dA(0,j), ldda, queues[0] ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, diag, j, jb, c_neg_one, dA(j,j), ldda, dA(0,j), ldda, queues[0] ); // Get above diagonal from device // TODO: could be on another queue, after trmm/trsm finish magma_cgetmatrix_async( j, jb, dA(0,j), ldda, A(0,j), lda, queues[0] ); } // Compute inverse of current diagonal block // TODO: problem if diagonal has not finished sending yet? lapackf77_ctrtri( MagmaUpperStr, diag_, &jb, A(j,j), &lda, info ); if (j+jb < n) { // Send inverted diagonal block to device magma_csetmatrix( jb, jb, A(j,j), lda, dA(j,j), ldda, queues[0] ); } } } else { // Compute inverse of lower triangular matrix nn = ((n-1)/nb)*nb; for (j=nn; j >= 0; j -= nb) { jb = min( nb, n-j ); if (j+jb < n) { // Send current block row (with diagonal) to device // This must finish before trtri below magma_csetmatrix( n-j, jb, A(j,j), lda, dA(j,j), ldda, queues[0] ); // Compute rows j+jb:n of current block column magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, diag, n-j-jb, jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb,j), ldda, queues[0] ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, diag, n-j-jb, jb, c_neg_one, dA(j,j), ldda, dA(j+jb,j), ldda, queues[0] ); // Get below diagonal block from device magma_cgetmatrix_async( n-j-jb, jb, dA(j+jb,j), ldda, A(j+jb,j), lda, queues[0] ); } // Compute inverse of current diagonal block lapackf77_ctrtri( MagmaLowerStr, diag_, &jb, A(j,j), &lda, info ); if (j > 0) { // Send inverted diagonal block to device magma_csetmatrix( jb, jb, A(j,j), lda, dA(j,j), ldda, queues[0] ); } } } magma_queue_destroy( queues[0] ); //magma_queue_destroy( queues[1] ); // unused magma_free( dA ); return *info; }
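/*
   Illustrative usage sketch (not part of the library): inverting the unit
   lower-triangular L factor left in place by an LU factorization, using the
   routine above. Assumes A (lda >= n) already holds the output of
   magma_cgetrf; the function name is hypothetical.
*/
static magma_int_t example_invert_L( magma_int_t n,
                                     magmaFloatComplex *A, magma_int_t lda )
{
    magma_int_t info;
    // L has an implicit unit diagonal, so pass MagmaUnit: the diagonal
    // entries (which hold U) are neither read nor overwritten
    magma_ctrtri( MagmaLower, MagmaUnit, n, A, lda, &info );
    // info > 0 can only occur with MagmaNonUnit, when a diagonal entry is zero
    return info;
}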
extern "C" magma_int_t magma_ctrtri(char uplo, char diag, magma_int_t n, cuFloatComplex *a, magma_int_t lda, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= CTRTRI computes the inverse of a real upper or lower triangular matrix A. This is the Level 3 BLAS version of the algorithm. Arguments ========= UPLO (input) CHARACTER*1 = 'U': A is upper triangular; = 'L': A is lower triangular. DIAG (input) CHARACTER*1 = 'N': A is non-unit triangular; = 'U': A is unit triangular. N (input) INTEGER The order of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the triangular matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of the array A contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of the array A contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = 'U', the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, A(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. ===================================================================== */ /* Local variables */ char uplo_[2] = {uplo, 0}; char diag_[2] = {diag, 0}; magma_int_t ldda, nb, nn, j, jb; cuFloatComplex c_zero = MAGMA_C_ZERO; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; cuFloatComplex *work; int upper = lapackf77_lsame(uplo_, "U"); int nounit = lapackf77_lsame(diag_, "N"); *info = 0; if ((! upper) && (! lapackf77_lsame(uplo_, "L"))) *info = -1; else if ((! nounit) && (! 
lapackf77_lsame(diag_, "U"))) *info = -2; else if (n < 0) *info = -3; else if (lda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; /* Check for singularity if non-unit */ if (nounit) { for ( j=0; j<n; ++j ) { if ( MAGMA_C_EQUAL( *A(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } /* Determine the block size for this environment */ nb = magma_get_cpotrf_nb(n); ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_cmalloc( &work, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } cudaStream_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if (nb <= 1 || nb >= n) lapackf77_ctrtri(uplo_, diag_, &n, a, &lda, info); else { if (upper) { /* Compute inverse of upper triangular matrix */ for (j=0; j<n; j=j+nb) { jb = min(nb, (n-j)); magma_csetmatrix( jb, (n-j), A(j, j), lda, dA(j, j), ldda ); /* Compute rows 1:j-1 of current block column */ magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j),ldda); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j),ldda); //cublasGetMatrix(j ,jb, sizeof( cuFloatComplex), //dA(0, j), ldda, A(0, j), lda); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, A(j, j), lda, stream[1] ); magma_cgetmatrix_async( j, jb, dA(0, j), ldda, A(0, j), lda, stream[0] ); magma_queue_sync( stream[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri(MagmaUpperStr, diag_, &jb, A(j,j), &lda, info); magma_csetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } else { /* Compute inverse of lower triangular matrix */ nn=((n-1)/nb)*nb+1; for(j=nn-1; j>=0; j=j-nb) { jb=min(nb,(n-j)); if((j+jb) < n) { magma_csetmatrix( (n-j), jb, A(j, j), lda, dA(j, j), ldda ); /* Compute rows j+jb:n of current block column */ magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda ); //cublasGetMatrix((n-j), jb, sizeof( cuFloatComplex),dA(j, j), ldda, A(j, j), lda); magma_cgetmatrix_async( n-j-jb, jb, dA(j+jb, j), ldda, A(j+jb, j), lda, stream[1] ); magma_cgetmatrix_async( jb, jb, dA(j,j), ldda, A(j,j), lda, stream[0] ); magma_queue_sync( stream[0] ); } /* Compute inverse of current diagonal block */ lapackf77_ctrtri(MagmaLowerStr, diag_, &jb, A(j,j), &lda, info); magma_csetmatrix( jb, jb, A(j, j), lda, dA(j, j), ldda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( work ); return *info; }
extern "C" magma_int_t magma_cgessm_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuFloatComplex *dL1, magma_int_t lddl1, cuFloatComplex *dL, magma_int_t lddl, cuFloatComplex *dA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= CGESSM applies the factors L computed by CGETRF_INCPIV to a complex M-by-N tile A. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. K (input) INTEGER The number of columns of the matrix L. K >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. IPIV (input) INTEGER array on the cpu. The pivot indices array of size K as returned by CGETRF_INCPIV. dL1 (input) DOUBLE COMPLEX array, dimension(LDDL1, N) The IB-by-K matrix in which is stored L^(-1) as returned by GETRF_INCPIV LDDL1 (input) INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). dL (input) DOUBLE COMPLEX array, dimension(LDDL, N) The M-by-K lower triangular tile on the gpu. LDDL (input) INTEGER The leading dimension of the array L. LDDL >= max(1,M). dA (input/output) DOUBLE COMPLEX array, dimension (LDDA, N) On entry, the M-by-N tile A on the gpu. On exit, updated by the application of L on the gpu. ===================================================================== */ #define AT(i,j) (dAT + (i)*ldda + (j) ) #define L(i,j) (dL + (i) + (j)*lddl ) #define dL1(j) (dL1 + (j)*lddl1) cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; int i, s, sb; cuFloatComplex *dAT; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } s = k / ib; for(i = 0; i < k; i += ib) { sb = min(ib, k-i); magmablas_claswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, dL1(i), lddl1, AT(i, 0), ldda); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, L( i, i), lddl, AT(i, 0), ldda); #endif if ( (i+sb) < m) { magma_cgemm( MagmaNoTrans, MagmaTrans, n, m-(i+sb), sb, c_neg_one, AT(i, 0), ldda, L( i+sb, i), lddl, c_one, AT(i+sb, 0), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } return *info; /* End of MAGMA_CGETRF_GPU */ }
extern "C" magma_int_t magma_cgetrf(magma_int_t m, magma_int_t n, cuFloatComplex *a, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(i,j) (dAT + (i)*nb*ldda + (j)*nb) cuFloatComplex *dAT, *dA, *da, *work; cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; nb = magma_get_cgetrf_nb(m); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, a, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ magma_int_t maxm, maxn, ldda, maxdim; magma_int_t i, rows, cols, s = min(m, n)/nb; magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); ldda = maxn; work = a; if (maxdim*maxdim < 2*maxm*maxn) { if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } da = dA + nb*maxm; ldda = maxdim; magma_csetmatrix( m, n, a, lda, da, ldda ); dAT = da; magmablas_cinplace_transpose( dAT, ldda, ldda ); } else { if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } da = dA + nb*maxm; magma_csetmatrix( m, n, a, lda, da, maxm ); if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dA ); magma_int_t rval = magma_cgetrf_m(num_gpus, m, n, a, lda, ipiv, info); if( *info >= 0 ) magma_cgetrf_piv(num_gpus, m, n, a, lda, ipiv, info); return *info; } magmablas_ctranspose2( dAT, ldda, da, maxm, m, n ); } lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo); for( i = 0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; if (i>0){ magmablas_ctranspose( dA, cols, inAT(i,i), ldda, nb, cols ); magma_cgetmatrix( m-i*nb, nb, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, inAT(i-1,i-1), ldda, inAT(i-1,i+1), ldda ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, inAT(i-1,i+1), ldda, inAT(i, i-1), ldda, c_one, inAT(i, i+1), ldda ); // do the cpu part rows = m - i*nb; lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo); } if (*info == 0 && iinfo > 0) *info = iinfo + i*nb; magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb ); // upload i-th panel magma_csetmatrix( m-i*nb, nb, work, lda, dA, cols ); magmablas_ctranspose( inAT(i,i), ldda, dA, cols, cols, nb); // do the small non-parallel computations if (s > (i+1)){ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, inAT(i, i ), ldda, inAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), ldda, inAT(i+1, i ), ldda, c_one, inAT(i+1, i+1), ldda ); } else{ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, inAT(i, i ), ldda, inAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, inAT(i, i+1), ldda, inAT(i+1, i ), ldda, c_one, inAT(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ctranspose2( dA, cols, inAT(s,s), ldda, nb0, rows); magma_cgetmatrix( rows, nb0, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; magmablas_cpermute_long2( 
ldda, dAT, ldda, ipiv, nb0, s*nb ); magma_csetmatrix( rows, nb0, work, lda, dA, cols ); magmablas_ctranspose2( inAT(s,s), ldda, dA, cols, rows, nb0); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, inAT(s, s), ldda, inAT(s, s)+nb0, ldda); } if (maxdim*maxdim< 2*maxm*maxn){ magmablas_cinplace_transpose( dAT, ldda, ldda ); magma_cgetmatrix( m, n, da, ldda, a, lda ); } else { magmablas_ctranspose2( da, maxm, dAT, ldda, n, m ); magma_cgetmatrix( m, n, da, maxm, a, lda ); magma_free( dAT ); } magma_free( dA ); } return *info; } /* magma_cgetrf */
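/*
   Illustrative usage sketch (not part of the library): per the Purpose notes
   above, magma_cgetrf runs faster when A lives in pinned host memory. The
   driver function below is hypothetical; only the matrix fill is left to the
   caller.
*/
static magma_int_t example_lu_pinned( magma_int_t m, magma_int_t n )
{
    magma_int_t info;
    magma_int_t minmn = (m < n ? m : n);
    cuFloatComplex *A;
    // pinned allocation enables faster host<->device transfers inside cgetrf
    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &A, m*n ))
        return MAGMA_ERR_HOST_ALLOC;
    magma_int_t *ipiv = (magma_int_t*) malloc( minmn * sizeof(magma_int_t) );
    if (ipiv == NULL) {
        magma_free_pinned( A );
        return MAGMA_ERR_HOST_ALLOC;
    }
    /* ... fill the m-by-n matrix A here (column-major, lda = m) ... */
    magma_cgetrf( m, n, A, m, ipiv, &info );
    // info > 0 reports an exactly-zero U(i,i); the factorization still completed
    free( ipiv );
    magma_free_pinned( A );
    return info;
}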
/** Purpose ------- CHEGST_GPU reduces a complex Hermitian-definite generalized eigenproblem to standard form. If ITYPE = 1, the problem is A*x = lambda*B*x, and A is overwritten by inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H) If ITYPE = 2 or 3, the problem is A*B*x = lambda*x or B*A*x = lambda*x, and A is overwritten by U*A*U**H or L**H*A*L. B must have been previously factorized as U**H*U or L*L**H by CPOTRF. Arguments --------- @param[in] itype INTEGER = 1: compute inv(U**H)*A*inv(U) or inv(L)*A*inv(L**H); = 2 or 3: compute U*A*U**H or L**H*A*L. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of A is stored and B is factored as U**H*U; - = MagmaLower: Lower triangle of A is stored and B is factored as L*L**H. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] dA COMPLEX array, dimension (LDDA,N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the transformed matrix, stored in the same format as A. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[in] dB COMPLEX array, dimension (LDDB,N) The triangular factor from the Cholesky factorization of B, as returned by CPOTRF. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cheev_comp ********************************************************************/ extern "C" magma_int_t magma_chegst_gpu(magma_int_t itype, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *dB, magma_int_t lddb, magma_int_t *info) { #define A(i, j) (w + (j)*lda + (i)) #define B(i, j) (w + nb*lda + (j)*ldb + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) #define dB(i, j) (dB + (j)*lddb + (i)) const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t nb; magma_int_t k, kb, kb2; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; magmaFloatComplex c_neg_half = MAGMA_C_NEG_HALF; magmaFloatComplex *w; magma_int_t lda; magma_int_t ldb; float d_one = 1.0; int upper = (uplo == MagmaUpper); /* Test the input parameters. */ *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! 
upper && uplo != MagmaLower) { *info = -2; } else if (n < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; nb = magma_get_chegst_nb(n); lda = nb; ldb = nb; if (MAGMA_SUCCESS != magma_cmalloc_pinned( &w, 2*nb*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[3]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); magma_queue_create( &stream[2] ); /* Use hybrid blocked code */ if (itype == 1) { if (upper) { kb = min(n,nb); /* Compute inv(U')*A*inv(U) */ magma_cgetmatrix_async( kb, kb, dB(0, 0), lddb, B(0, 0), nb, stream[2] ); magma_cgetmatrix_async( kb, kb, dA(0, 0), ldda, A(0, 0), nb, stream[1] ); for (k = 0; k < n; k += nb) { kb = min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the upper triangle of A(k:n,k:n) */ magma_queue_sync( stream[2] ); magma_queue_sync( stream[1] ); lapackf77_chegst( &itype, uplo_, &kb, A(0,0), &lda, B(0,0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { // Start copying the new B block magma_cgetmatrix_async( kb2, kb2, dB(k+kb, k+kb), lddb, B(0, 0), nb, stream[2] ); magma_ctrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k,k), lddb, dA(k,k+kb), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_cher2k(MagmaUpper, MagmaConjTrans, n-k-kb, kb, c_neg_one, dA(k,k+kb), ldda, dB(k,k+kb), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(0, 0), lda, stream[1] ); magma_chemm(MagmaLeft, MagmaUpper, kb, n-k-kb, c_neg_half, dA(k,k), ldda, dB(k,k+kb), lddb, c_one, dA(k, k+kb), ldda); magma_ctrsm(MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, kb, n-k-kb, c_one, dB(k+kb,k+kb), lddb, dA(k,k+kb), ldda); } } magma_queue_sync( stream[0] ); } else { kb = min(n,nb); /* Compute inv(L)*A*inv(L') */ magma_cgetmatrix_async( kb, kb, dB(0, 0), lddb, B(0, 0), nb, stream[2] ); magma_cgetmatrix_async( kb, kb, dA(0, 0), ldda, A(0, 0), nb, stream[1] ); for (k = 0; k < n; k += nb) { kb= min(n-k,nb); kb2= min(n-k-nb,nb); /* Update the lower triangle of A(k:n,k:n) */ magma_queue_sync( stream[2] ); magma_queue_sync( stream[1] ); lapackf77_chegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[0] ); if (k+kb < n) { // Start copying the new B block magma_cgetmatrix_async( kb2, kb2, dB(k+kb, k+kb), lddb, B(0, 0), nb, stream[2] ); magma_ctrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k,k), lddb, dA(k+kb,k), ldda); magma_queue_sync( stream[0] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_cher2k(MagmaLower, MagmaNoTrans, n-k-kb, kb, c_neg_one, dA(k+kb,k), ldda, dB(k+kb,k), lddb, d_one, dA(k+kb,k+kb), ldda); magma_cgetmatrix_async( kb2, kb2, dA(k+kb, k+kb), ldda, A(0, 0), lda, stream[1] ); magma_chemm(MagmaRight, MagmaLower, n-k-kb, kb, c_neg_half, dA(k,k), ldda, dB(k+kb,k), lddb, c_one, dA(k+kb, k), ldda); magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, n-k-kb, kb, c_one, dB(k+kb,k+kb), lddb, dA(k+kb,k), ldda); } } } magma_queue_sync( stream[0] ); } else { if (upper) { /* Compute U*A*U' */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); 
magma_cgetmatrix_async( kb, kb, dB(k, k), lddb, B(0, 0), nb, stream[2] ); /* Update the upper triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, k, kb, c_one, dB(0,0), lddb, dA(0,k), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_queue_sync( stream[1] ); } magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(0, 0), lda, stream[0] ); if (k > 0) { magma_cher2k(MagmaUpper, MagmaNoTrans, k, kb, c_one, dA(0,k), ldda, dB(0,k), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaRight, MagmaUpper, k, kb, c_half, dA(k,k), ldda, dB(0,k), lddb, c_one, dA(0, k), ldda); magma_ctrmm(MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, k, kb, c_one, dB(k,k), lddb, dA(0,k), ldda); } magma_queue_sync( stream[2] ); magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } else { /* Compute L'*A*L */ for (k = 0; k < n; k += nb) { kb= min(n-k,nb); magma_cgetmatrix_async( kb, kb, dB(k, k), lddb, B(0, 0), nb, stream[2] ); /* Update the lower triangle of A(1:k+kb-1,1:k+kb-1) */ if (k > 0) { magma_ctrmm(MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, kb, k, c_one, dB(0,0), lddb, dA(k,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_queue_sync( stream[1] ); } magma_cgetmatrix_async( kb, kb, dA(k, k), ldda, A(0, 0), lda, stream[0] ); if (k > 0) { magma_cher2k(MagmaLower, MagmaConjTrans, k, kb, c_one, dA(k,0), ldda, dB(k,0), lddb, d_one, dA(0,0), ldda); magma_chemm(MagmaLeft, MagmaLower, kb, k, c_half, dA(k,k), ldda, dB(k,0), lddb, c_one, dA(k, 0), ldda); magma_ctrmm(MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, kb, k, c_one, dB(k,k), lddb, dA(k,0), ldda); } magma_queue_sync( stream[2] ); magma_queue_sync( stream[0] ); lapackf77_chegst( &itype, uplo_, &kb, A(0, 0), &lda, B(0, 0), &ldb, info); magma_csetmatrix_async( kb, kb, A(0, 0), lda, dA(k, k), ldda, stream[1] ); } magma_queue_sync( stream[1] ); } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_queue_destroy( stream[2] ); magma_free_pinned( w ); return *info; } /* magma_chegst_gpu */
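/*
   Illustrative usage sketch (not part of the library): the standard itype = 1
   pipeline for A*x = lambda*B*x. B is factored on the GPU by magma_cpotrf_gpu
   (below) and the factor then transforms A in place. Note the two routines in
   this file take different argument vintages (char vs magma_uplo_t), reflected
   here; the driver function and names are hypothetical.
*/
static magma_int_t example_reduce_itype1( magma_int_t n,
                                          magmaFloatComplex *dA, magma_int_t ldda,
                                          magmaFloatComplex *dB, magma_int_t lddb )
{
    magma_int_t info;
    // B = L*L**H; the lower triangle of dB is overwritten with L
    magma_cpotrf_gpu( 'L', n, dB, lddb, &info );
    if (info != 0)
        return info;    // info > 0: B is not positive definite
    // A <- inv(L)*A*inv(L**H), ready for a standard Hermitian eigensolver
    magma_chegst_gpu( 1, MagmaLower, n, dA, ldda, dB, lddb, &info );
    return info;
}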
extern "C" magma_int_t magma_cpotrf_gpu(char uplo, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb; char uplo_[2] = {uplo, 0}; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *work; float d_one = 1.0; float d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); *info = 0; if ( (! upper) && (! lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_cpotrf_nb(n); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(¤t_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_cgetmatrix_async( n, n, dA, ldda, work, n, stream[1] ); magma_queue_sync( stream[1] ); lapackf77_cpotrf(uplo_, &n, work, &n, info); magma_csetmatrix_async( n, n, work, n, dA, ldda, stream[1] ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j<n; j+=nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. 
Computing MIN */ jb = min(nb, (n-j)); magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { /* Compute the current block row. */ magma_cgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[0] ); lapackf77_cpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_csetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j<n; j+=nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_cherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_queue_sync( stream[1] ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[0] ); if ( (j+jb) < n) { magma_cgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[0] ); lapackf77_cpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_csetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[1] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_ctrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } return *info; } /* magma_cpotrf_gpu */
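/*
   Illustrative usage sketch (not part of the library): as the Purpose section
   above notes, the routine installs its own stream only when the current
   kernel stream is NULL; supplying a user stream keeps all work on that
   stream. The driver function is hypothetical and uses only the stream calls
   appearing in this file.
*/
static magma_int_t example_potrf_on_user_stream( magma_int_t n,
                                                 magmaFloatComplex *dA, magma_int_t ldda )
{
    magma_int_t info;
    cudaStream_t stream;
    magma_queue_create( &stream );
    magmablasSetKernelStream( stream );    // cpotrf_gpu will reuse this stream
    magma_cpotrf_gpu( 'U', n, dA, ldda, &info );
    magma_queue_sync( stream );            // wait for the factorization to finish
    magmablasSetKernelStream( NULL );      // restore the default stream
    magma_queue_destroy( stream );
    return info;
}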
extern "C" magma_int_t magma_cpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaFloatComplex *d_lA[], magma_int_t ldda, magmaFloatComplex *d_lP[], magma_int_t lddp, magmaFloatComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. Auxiliary subroutine for cpotrf2_ooc. It is multiple gpu interface to compute Cholesky of a "rectangular" matrix. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; float d_one = 1.0; float d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaFloatComplex *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) /* used by ctrsm_work */ int trsm_nb = 128; int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb); magmaFloatComplex *d_dinvA[MagmaMaxGPUs]; magmaFloatComplex *d_x[MagmaMaxGPUs]; #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n]) #define dx(d,j) &(d_x[(d)][(j)*nb*m]) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if ( (MAGMA_SUCCESS != magma_cmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) || (MAGMA_SUCCESS != magma_cmalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } magma_setdevice(0); #endif *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* initialization */ for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = (n/(nb*num_gpus))*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = (m/(nb*num_gpus))*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } } /* == initialize the trace */ trace_init( 1, num_gpus, 3, (CUstream_st**)stream ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same.. /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_cherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_cgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows) if( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d,0,nb*j_local); ldpanel = ldda; // the GPU owns the row from start, and no need of synch. 
//magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } else { dlpanel = dlP(d,nb,0,buf); ldpanel = lddp; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_cgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); trace_cpu_start( 0, "potrf", "potrf" ); lapackf77_cpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_csetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_csetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; magma_setdevice(d); if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead next block on stream1 */ nb0 = min(nb, nb2); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) magmablas_claset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_claset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); trace_gpu_end( d, stream1 ); } else if( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) magmablas_claset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_claset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%num_gpus;
} /* end of for */ /* ========================================================== */ if( j+jb < m ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end), so we have the Cholesky factor, but only the diagonal submatrix of the big panel, on cpu at the end. */ int d2, buf2; magma_setdevice(d); /* lookahead done */ magma_queue_wait_event( stream[d][stream3], event[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_cgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( event[d][3], stream[d][stream3] ); /* needed on pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3 magma_csetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb,0,buf2), lddp, stream[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } /* =========================== */ /* update the remaining blocks */ nb2 = n_local[d]-(nb*j_local2 + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_claset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_claset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ctrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor magma_ctrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of ctrsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'.
*/ /* ---------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_cherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_cgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream1] ); /* update off-diagonal blocks of the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } else { dlpanel = dlPT(d,0,nb,buf); ldpanel = nb; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } magma_cgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for the panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); lapackf77_cpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_csetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_csetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream1] ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */ if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update magmablasSetKernelStream(stream[d][stream1]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) magmablas_claset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_claset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ctrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire column */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor
magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) magmablas_claset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_claset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ctrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } /* end for d */ /* ========================================================== */ if( j+jb < n ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. */ int d2, buf2; //#define CPOTRF_DEVICE_TO_DEVICE #ifdef CPOTRF_DEVICE_TO_DEVICE // lookahead done /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][4] ); if( d2 != d ) { magma_ccopymatrix_async( nb0, j+jb, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block dlA(d, nb*j_local2, 0), ldda, stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } else { magma_cgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); } } #else // lookahead done magma_setdevice(d); magma_queue_wait_event( stream[d][stream3], event[d][4] ); magma_cgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); magma_event_record( event[d][3], stream[d][stream3] ); /* syn on rows on CPU, seem to be needed on Pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done magma_csetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } #endif /* =================================== */ /* updates remaining blocks on stream2 */ nb2 = n_local[d] - (j_local2*nb + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); /* update the remaining blocks in the column */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_claset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_claset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ctrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received magma_ctrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of 
else not upper */ /* == finalize the trace == */ trace_finalize( "cpotrf.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { magma_queue_sync( stream[d][j] ); } #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(CTRSM_WORK) magma_free( d_dinvA[d] ); magma_free( d_x[d] ); #endif magmablasSetKernelStream(NULL); } magma_setdevice(0); return *info; } /* magma_cpotrf3_mgpu */
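/*
   Editor's note (illustrative sketch of the data layout used above): the
   matrix is distributed over the GPUs in a 1-D block-cyclic fashion with
   block size nb, over block columns in the upper case and block rows in the
   lower case.  The owner and local index of a global block starting at
   index j (a multiple of nb) are computed exactly as in the loop:

       magma_int_t id      = (j/nb) % num_gpus;     // GPU that owns the block
       magma_int_t j_local = j / (nb*num_gpus);     // local block index on GPU id
       // hence the accesses dlA(id, j, nb*j_local) (upper) and
       // dlA(id, nb*j_local, j) (lower), and the n_local[] sizes set above.
*/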
/** Purpose ------- CGETRI computes the inverse of a matrix using the LU factorization computed by CGETRF. This method inverts U and then computes inv(A) by solving the system inv(A)*L = inv(U) for inv(A). Note that it is generally both faster and more accurate to use CGESV, or CGETRF and CGETRS, to solve the system AX = B, rather than inverting the matrix and multiplying to form X = inv(A)*B. Only in special instances should an explicit inverse be computed with this routine. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX array on the GPU, dimension (LDDA,N) On entry, the factors L and U from the factorization A = P*L*U as computed by CGETRF_GPU. On exit, if INFO = 0, the inverse of the original matrix A. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from CGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[out] dwork (workspace) COMPLEX array on the GPU, dimension (MAX(1,LWORK)) @param[in] lwork INTEGER The dimension of the array DWORK. LWORK >= N*NB, where NB is the optimal blocksize returned by magma_get_cgetri_nb(n). \n Unlike LAPACK, this version does not currently support a workspace query, because the workspace is on the GPU. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, U(i,i) is exactly zero; the matrix is singular and its inverse cannot be computed. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetri_gpu( magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magmaFloatComplex *dwork, magma_int_t lwork, magma_int_t *info ) { #define dA(i, j) (dA + (i) + (j)*ldda) #define dL(i, j) (dL + (i) + (j)*lddl) /* Local variables */ magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *dL = dwork; magma_int_t lddl = n; magma_int_t nb = magma_get_cgetri_nb(n); magma_int_t j, jmax, jb, jp; *info = 0; if (n < 0) *info = -1; else if (ldda < max(1,n)) *info = -3; else if ( lwork < n*nb ) *info = -6; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if ( n == 0 ) return *info; /* Invert the triangular factor U */ magma_ctrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, ldda, info ); if ( *info != 0 ) return *info; jmax = ((n-1) / nb)*nb; for( j = jmax; j >= 0; j -= nb ) { jb = min( nb, n-j ); // copy current block column of A to work space dL // (only needs lower trapezoid, but we also copy upper triangle), // then zero the strictly lower trapezoid block column of A. magmablas_clacpy( MagmaFull, n-j, jb, dA(j,j), ldda, dL(j,0), lddl ); magmablas_claset( MagmaLower, n-j-1, jb, c_zero, c_zero, dA(j+1,j), ldda ); // compute current block column of Ainv // Ainv(:, j:j+jb-1) // = ( U(:, j:j+jb-1) - Ainv(:, j+jb:n) L(j+jb:n, j:j+jb-1) ) // * L(j:j+jb-1, j:j+jb-1)^{-1} // where L(:, j:j+jb-1) is stored in dL.
if ( j+jb < n ) { magma_cgemm( MagmaNoTrans, MagmaNoTrans, n, jb, n-j-jb, c_neg_one, dA(0,j+jb), ldda, dL(j+jb,0), lddl, c_one, dA(0,j), ldda ); } // TODO use magmablas work interface magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaUnit, n, jb, c_one, dL(j,0), lddl, dA(0,j), ldda ); } // Apply column interchanges for( j = n-2; j >= 0; --j ) { jp = ipiv[j] - 1; if ( jp != j ) { magmablas_cswap( n, dA(0,j), 1, dA(0,jp), 1 ); } } return *info; }
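/*
   Editor's example (illustrative sketch, not part of the library source):
   forming an explicit inverse with magma_cgetri_gpu after magma_cgetrf_gpu.
   The GPU workspace obeys the LWORK >= N*NB rule documented above; ipiv is
   assumed to be a host array of length n.  Error handling abbreviated.

       magma_int_t info, nb = magma_get_cgetri_nb( n );
       magmaFloatComplex *dwork;
       magma_cmalloc( &dwork, n*nb );                      // device workspace
       magma_cgetrf_gpu( n, n, dA, ldda, ipiv, &info );    // dA <- L and U of P*L*U
       if ( info == 0 )
           magma_cgetri_gpu( n, dA, ldda, ipiv, dwork, n*nb, &info ); // dA <- inv(A)
       magma_free( dwork );
*/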
/** Purpose ------- CGETRF_INCPIV computes an LU factorization of a general M-by-N tile A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] hA COMPLEX array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA COMPLEX array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL COMPLEX array, dimension(LDHL, min(M,N)), on cpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL COMPLEX array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-min(M,N) lower triangular tile, and in the lower part IB-by-min(M,N) the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] ipiv INTEGER array, dimension min(M,N), on the cpu. The pivot indices array. @param[out] dWORK COMPLEX array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations.
@ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf_incpiv_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, magmaFloatComplex *hA, magma_int_t ldha, magmaFloatComplex *dA, magma_int_t ldda, magmaFloatComplex *hL, magma_int_t ldhl, magmaFloatComplex *dL, magma_int_t lddl, magma_int_t *ipiv, magmaFloatComplex *dwork, magma_int_t lddwork, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib) #define hA(i,j) (hA + (i)*ib + (j)*ib*ldha) #define hL(j) (hL + (j)*ib*ldhl ) #define hL2(j) (hL2 + (j)*ib*ldhl ) #define dL(j) (dL + (j)*ib*lddl ) #define dL2(j) (dL2 + (j)*ib*lddl ) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo; magma_int_t maxm, mindim; magma_int_t i, rows, cols, s, ii, sb; magmaFloatComplex *dAT; #ifndef WITHOUTTRTRI magmaFloatComplex *dL2 = dL + ib; magmaFloatComplex *hL2 = hL + ib; #endif /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, hA, &ldha, ipiv, info); #ifndef WITHOUTTRTRI CORE_clacpy(PlasmaUpperLower, mindim, mindim, (PLASMA_Complex32_t*)hA, ldha, (PLASMA_Complex32_t*)hL2, ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex32_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_csetmatrix( mindim, mindim, hL2, ldhl, dL2, lddl ); #endif if ( order == MagmaRowMajor ) { magma_csetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ctranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_csetmatrix( m, n, hA, ldha, dA, ldda ); } } else { /* Use hybrid blocked code. 
*/ maxm = ((m + 31)/32)*32; if ( order == MagmaColMajor ) { magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for( i=0; i < s; i++ ) { ii = i * ib; sb = min(ib, mindim-ii); cols = maxm - ii; if ( i > 0 ) { // download i-th panel magmablas_ctranspose( sb, m, AT(0,i), ldda, dwork, maxm ); magma_cgetmatrix( m, sb, dwork, maxm, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n - (ii+sb), ib, c_one, dL2(i-1), lddl, AT(i-1,i+1), ldda ); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (ii+sb), ib, c_one, AT(i-1,i-1), ldda, AT(i-1,i+1), ldda ); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m-ii, ib, c_neg_one, AT(i-1,i+1), ldda, AT(i, i-1), ldda, c_one, AT(i, i+1), ldda ); } // do the cpu part rows = m - ii; lapackf77_cgetrf( &rows, &sb, hA(i, i), &ldha, ipiv+ii, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; { int j; int fin = ii + sb; for (j=ii; j < fin; j++) { ipiv[j] = ii + ipiv[j]; } } magmablas_claswp( n-ii, AT(0, i), ldda, ii+1, ii+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI CORE_clacpy(PlasmaLower, sb, sb, (PLASMA_Complex32_t*)hA(i, i), ldha, (PLASMA_Complex32_t*)hL2(i), ldhl ); CORE_ctrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex32_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } magma_csetmatrix( sb, sb, hL2(i), ldhl, dL2(i), lddl ); #endif // upload i-th panel magma_csetmatrix( rows, sb, hA(i, i), ldha, dwork, cols ); magmablas_ctranspose( rows, sb, dwork, cols, AT(i,i), ldda ); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, sb, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif magma_cgemm( MagmaNoTrans, MagmaNoTrans, sb, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } else { /* Update of the last panel */ #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, dL2(i), lddl, AT(i, i+1), ldda); #else magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-mindim, sb, c_one, AT(i, i ), ldda, AT(i, i+1), ldda); #endif /* m-(ii+sb) should be always 0 */ magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m-(ii+sb), sb, c_neg_one, AT(i, i+1), ldda, AT(i+1, i ), ldda, c_one, AT(i+1, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
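/*
   Editor's note (illustrative): the pivot fix-up inside the routine above is
   worth spelling out.  lapackf77_cgetrf returns 1-based pivot indices
   relative to the panel, which starts at global row ii, so they are shifted
   before magmablas_claswp consumes them:

       for (j = ii; j < ii + sb; ++j)
           ipiv[j] = ii + ipiv[j];   // panel-relative -> matrix-relative row index
*/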
/* indexing macros for device matrix columns (restored; the dA(i,j)/dB(i,j) accesses below require them) */
#define dA(i,j) (dA + (i) + (j)*ld)
#define dB(i,j) (dB + (i) + (j)*ld)
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaFloatComplex *A, *B, *C, *C2, *LU; magmaFloatComplex_ptr dA, dB, dC1, dC2; magmaFloatComplex alpha = MAGMA_C_MAKE( 0.5, 0.1 ); magmaFloatComplex beta = MAGMA_C_MAKE( 0.7, 0.2 ); float dalpha = 0.6; float dbeta = 0.8; float work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_cmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_cmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_cmalloc( &dA, size ); assert( err == 0 ); err = magma_cmalloc( &dB, size ); assert( err == 0 ); err = magma_cmalloc( &dC1, size ); assert( err == 0 ); err = magma_cmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_clarnv( &ione, ISEED, &size, A ); lapackf77_clarnv( &ione, ISEED, &size, B ); lapackf77_clarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test CSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetmatrix( m, n, A, ld, dB, ld ); magma_cswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_cswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_cgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; printf( "cswap diff %.2g\n", error ); } else { printf( "cswap skipped for n < 3\n" ); } // ----- test ICAMAX // get argmax of column of A magma_csetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_icamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIcamax( opts.handle, m, dA(0,j), 1, &i2 ); // todo need sync here?
assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (float)m * k / 1e9; printf( "icamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test CGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_csetmatrix( m, n, A, ld, dA, ld ); magma_csetvector( maxn, B, 1, dB, 1 ); magma_csetvector( maxn, C, 1, dC1, 1 ); magma_csetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_cgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemv( opts.handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasCaxpy( opts.handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMV( m, n ) / 1e9; printf( "cgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetvector( m, B, 1, dB, 1 ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_chemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemv( opts.handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CHEMV( m ) / 1e9; printf( "chemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test CTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. 
lapackf77_clacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_cgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_csetmatrix( m, m, LU, ld, dA, ld ); magma_csetvector( m, C, 1, dC1, 1 ); magma_csetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ctrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsv( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_clange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ctrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test CGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_csetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_csetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCgemm( opts.handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CGEMM( m, n, k ) / 1e9; printf( "cgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_csetmatrix( m, m, A, ld, dA, ld ); magma_csetmatrix( m, n, B, ld, dB, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_chemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasChemm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, 
work ); total_error += error; gflops = FLOPS_CHEMM( side[is], m, n ) / 1e9; printf( "chemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_csetmatrix( n, k, A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCherk( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHERK( k, n ) / 1e9; printf( "cherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_csetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_csetmatrix( n, n, C, ld, dC1, ld ); magma_csetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_cher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCher2k( opts.handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CHER2K( k, n ) / 1e9; printf( "cher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test CTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. 
t2 = magma_sync_wtime( 0 ); cublasCtrmm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRMM( side[is], m, n ) / 1e9; printf( "ctrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test CTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_csetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_csetmatrix( m, n, C, ld, dC1, ld ); magma_csetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ctrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasCtrsm( opts.handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_cgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_clange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_CTRSM( side[is], m, n ) / 1e9; printf( "ctrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
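/*
   Editor's note (illustrative): every check in the tester above uses the same
   idiom -- subtract the two results on the GPU, bring the difference to the
   host, and take its Frobenius norm, which is 0 exactly when the MAGMA and
   CUBLAS results agree bitwise:

       cublasCaxpy( opts.handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );  // dC2 -= dC1
       magma_cgetmatrix( m, n, dC2, ld, C2, ld );                     // copy to host
       error = lapackf77_clange( "F", &m, &n, C2, &ld, work );        // ||diff||_F
*/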
/***************************************************************************//** Purpose ------- CGETRS solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by CGETRF_NOPIV_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The factors L and U from the factorization A = L*U as computed by CGETRF_NOPIV_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_getrs_nopiv *******************************************************************************/ extern "C" magma_int_t magma_cgetrs_nopiv_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t *info) { // Constants const magmaFloatComplex c_one = MAGMA_C_ONE; // Local variables bool notran = (trans == MagmaNoTrans); *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -7; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); if (notran) { /* Solve A * X = B. */ if ( nrhs == 1) { magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1, queue ); magma_ctrsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); } else { magma_ctrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_ctrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } else { /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_ctrsv( MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); magma_ctrsv( MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1, queue ); } else { magma_ctrsm( MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); magma_ctrsm( MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); } } magma_queue_destroy( queue ); return *info; }
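/*
   Editor's note (illustrative): because CGETRF_NOPIV_GPU produces A = L*U
   with no row interchanges, the no-transpose branch above is nothing more
   than the two triangular solves

       L*Y = B   (MagmaLower, MagmaUnit),   then   U*X = Y   (MagmaUpper, MagmaNonUnit),

   and the transposed branch solves with U**T (or U**H) first and L**T (or
   L**H) second, reversing the order -- exactly the ctrsv/ctrsm sequence in
   the code.
*/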
/** Purpose ------- CGESSM applies the factors L computed by CGETRF_INCPIV to a complex M-by-N tile A. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] k INTEGER The number of columns of the matrix L. K >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by CGETRF_INCPIV. @param[in] dL1 COMPLEX array, dimension(LDDL1, N) The IB-by-K matrix in which L^(-1) is stored, as returned by CGETRF_INCPIV. @param[in] lddl1 INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). @param[in] dL COMPLEX array, dimension(LDDL, N) The M-by-K lower triangular tile on the gpu. @param[in] lddl INTEGER The leading dimension of the array L. LDDL >= max(1,M). @param[in,out] dA COMPLEX array, dimension (LDDA, N) On entry, the M-by-N tile A on the gpu. On exit, updated by the application of L on the gpu. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @ingroup magma_cgesv_tile ********************************************************************/ extern "C" magma_int_t magma_cgessm_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, magmaFloatComplex_ptr dL1, magma_int_t lddl1, magmaFloatComplex_ptr dL, magma_int_t lddl, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define AT(i,j) (dAT + (i)*ldda + (j) ) #define L(i,j) (dL + (i) + (j)*lddl ) #define dL1(j) (dL1 + (j)*lddl1) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; int i, sb; magmaFloatComplex_ptr dAT; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; if ( order == MagmaColMajor ) { magmablas_cgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } for (i = 0; i < k; i += ib) { sb = min(ib, k-i); magmablas_claswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI magma_ctrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, dL1(i), lddl1, AT(i, 0), ldda); #else magma_ctrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, L( i, i), lddl, AT(i, 0), ldda); #endif if ( (i+sb) < m) { magma_cgemm( MagmaNoTrans, MagmaTrans, n, m-(i+sb), sb, c_neg_one, AT(i, 0), ldda, L( i+sb, i), lddl, c_one, AT(i+sb, 0), ldda ); } } if ( order == MagmaColMajor ) { magmablas_cgetmo_out( dA, dAT, ldda, m, n ); } return *info; } /* magma_cgessm_gpu */
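/*
   Editor's note (illustrative): per inner block i of size sb, the loop above
   applies, in the transposed (row-major) storage dAT,

       A <- P_i * A                                         (magmablas_claswp)
       A(i, :)     <- inv(L(i,i)) * A(i, :)                 (ctrmm with dL1, or ctrsm)
       A(i+sb:, :) <- A(i+sb:, :) - L(i+sb:, i) * A(i, :)   (cgemm)

   i.e. a blocked forward substitution using the incremental pivots and the
   pre-inverted diagonal blocks produced by CGETRF_INCPIV.
*/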
extern "C" magma_int_t magma_cgetrf_gpu(magma_int_t m, magma_int_t n, magmaFloatComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with user defined stream to overlap computation with communication. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaFloatComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_cgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_cmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_cgetmatrix( m, n, dA, ldda, work, m ); lapackf77_cgetrf(&m, &n, work, &m, ipiv, info); magma_csetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_cmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( m == n ) { lddat = ldda; magmablas_ctranspose_inplace( m, dAT, ldda ); } else { if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ctranspose2( dAT, lddat, dA, ldda, m, n ); } if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if ( ! 
(m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(&current_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; for( i=0; i<s; i++ ) { // download i-th panel cols = maxm - i*nb; //magmablas_ctranspose( dAP, cols, dAT(i,i), lddat, nb, cols ); magmablas_ctranspose2( dAP, cols, dAT(i,i), lddat, nb, m-i*nb ); // make sure that the transpose has completed magma_queue_sync( stream[1] ); magma_cgetmatrix_async( m-i*nb, nb, dAP, cols, work, lddwork, stream[0]); if ( i>0 ){ magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), lddat, dAT(i-1,i+1), lddat ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), lddat, dAT(i, i-1), lddat, c_one, dAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); lapackf77_cgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; // upload i-th panel magma_csetmatrix_async( m-i*nb, nb, work, lddwork, dAP, maxm, stream[0]); magmablas_cpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); magma_queue_sync( stream[0] ); //magmablas_ctranspose(dAT(i,i), lddat, dAP, maxm, cols, nb); magmablas_ctranspose2(dAT(i,i), lddat, dAP, maxm, m-i*nb, nb); // do the small non-parallel computations (next panel update) if ( s > (i+1) ) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } else { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_ctranspose2( dAP, maxm, dAT(s,s), lddat, nb0, rows); magma_cgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // do the cpu part lapackf77_cgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_cpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_csetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_ctranspose2( dAT(s,s), lddat, dAP, maxm, rows, nb0); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); if ( m == n ) { magmablas_ctranspose_inplace( m, dAT, lddat ); } else { magmablas_ctranspose2( dA, ldda, dAT, lddat, n, m ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* End of MAGMA_CGETRF_GPU */
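/*
   Editor's example (illustrative sketch, not part of the library source):
   the factorization above is normally paired with magma_cgetrs_gpu to solve
   A*X = B.  dA (ldda x n) and dB (lddb x nrhs) are device arrays, ipiv a
   host array of length min(m,n); the trans-argument convention is assumed
   to match the magma_trans_t style used elsewhere in this file.  Error
   handling abbreviated.

       magma_int_t info;
       magma_cgetrf_gpu( n, n, dA, ldda, ipiv, &info );              // dA <- L,U
       if ( info == 0 )
           magma_cgetrs_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv,
                             dB, lddb, &info );                      // dB <- X
       // info > 0 flags an exactly singular U(i,i); the solve is then skipped.
*/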
/** Purpose ------- CHEGVX computes selected eigenvalues, and optionally, eigenvectors of a complex generalized Hermitian-definite eigenproblem, of the form A*x=(lambda)*B*x, A*B*x=(lambda)*x, or B*A*x=(lambda)*x. Here A and B are assumed to be Hermitian and B is also positive definite. Eigenvalues and eigenvectors can be selected by specifying either a range of values or a range of indices for the desired eigenvalues. Arguments --------- @param[in] itype INTEGER Specifies the problem type to be solved: = 1: A*x = (lambda)*B*x = 2: A*B*x = (lambda)*x = 3: B*A*x = (lambda)*x @param[in] jobz magma_vec_t - = MagmaNoVec: Compute eigenvalues only; - = MagmaVec: Compute eigenvalues and eigenvectors. @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangles of A and B are stored; - = MagmaLower: Lower triangles of A and B are stored. @param[in] n INTEGER The order of the matrices A and B. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA, N) On entry, the Hermitian matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A. If UPLO = MagmaLower, the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A. \n On exit, the lower triangle (if UPLO=MagmaLower) or the upper triangle (if UPLO=MagmaUpper) of A, including the diagonal, is destroyed. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] B COMPLEX array, dimension (LDB, N) On entry, the Hermitian matrix B. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of B contains the upper triangular part of the matrix B. If UPLO = MagmaLower, the leading N-by-N lower triangular part of B contains the lower triangular part of the matrix B. \n On exit, if INFO <= N, the part of B containing the matrix is overwritten by the triangular factor U or L from the Cholesky factorization B = U**H*U or B = L*L**H. @param[in] ldb INTEGER The leading dimension of the array B. LDB >= max(1,N). @param[in] vl REAL @param[in] vu REAL If RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER If RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. @param[in] abstol REAL The absolute error tolerance for the eigenvalues. An approximate eigenvalue is accepted as converged when it is determined to lie in an interval [a,b] of width less than or equal to \n ABSTOL + EPS * max( |a|,|b| ), \n where EPS is the machine precision. If ABSTOL is less than or equal to zero, then EPS*|T| will be used in its place, where |T| is the 1-norm of the tridiagonal matrix obtained by reducing A to tridiagonal form. \n Eigenvalues will be computed most accurately when ABSTOL is set to twice the underflow threshold 2*SLAMCH('S'), not zero. If this routine returns with INFO > 0, indicating that some eigenvectors did not converge, try setting ABSTOL to 2*SLAMCH('S'). @param[out] m INTEGER The total number of eigenvalues found. 0 <= M <= N.
If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1. @param[out] w REAL array, dimension (N) The first M elements contain the selected eigenvalues in ascending order. @param[out] Z COMPLEX array, dimension (LDZ, max(1,M)) If JOBZ = MagmaNoVec, then Z is not referenced. If JOBZ = MagmaVec, then if INFO = 0, the first M columns of Z contain the orthonormal eigenvectors of the matrix A corresponding to the selected eigenvalues, with the i-th column of Z holding the eigenvector associated with W(i). The eigenvectors are normalized as follows: if ITYPE = 1 or 2, Z**H*B*Z = I; if ITYPE = 3, Z**H*inv(B)*Z = I. \n If an eigenvector fails to converge, then that column of Z contains the latest approximation to the eigenvector, and the index of the eigenvector is returned in IFAIL. Note: the user must ensure that at least max(1,M) columns are supplied in the array Z; if RANGE = MagmaRangeV, the exact value of M is not known in advance and an upper bound must be used. @param[in] ldz INTEGER The leading dimension of the array Z. LDZ >= 1, and if JOBZ = MagmaVec, LDZ >= max(1,N). @param[out] work (workspace) COMPLEX array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (NB+1)*N, where NB is the blocksize for CHETRD returned by magma_get_chetrd_nb. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param rwork (workspace) REAL array, dimension (7*N) @param iwork (workspace) INTEGER array, dimension (5*N) @param[out] ifail INTEGER array, dimension (N) If JOBZ = MagmaVec, then if INFO = 0, the first M elements of IFAIL are zero. If INFO > 0, then IFAIL contains the indices of the eigenvectors that failed to converge. If JOBZ = MagmaNoVec, then IFAIL is not referenced. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: CPOTRF or CHEEVX returned an error code: <= N: if INFO = i, CHEEVX failed to converge; i eigenvectors failed to converge. Their indices are stored in array IFAIL. > N: if INFO = N + i, for 1 <= i <= N, then the leading minor of order i of B is not positive definite. The factorization of B could not be completed and no eigenvalues or eigenvectors were computed. Further Details --------------- Based on contributions by Mark Fahey, Department of Mathematics, Univ.
of Kentucky, USA @ingroup magma_chegv_driver ********************************************************************/ extern "C" magma_int_t magma_chegvx( magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *B, magma_int_t ldb, float vl, float vu, magma_int_t il, magma_int_t iu, float abstol, magma_int_t *m, float *w, magmaFloatComplex *Z, magma_int_t ldz, magmaFloatComplex *work, magma_int_t lwork, float *rwork, magma_int_t *iwork, magma_int_t *ifail, magma_int_t *info) { magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex *dA; magmaFloatComplex *dB; magmaFloatComplex *dZ; magma_int_t ldda = n; magma_int_t lddb = n; magma_int_t lddz = n; magma_int_t lower; magma_trans_t trans; magma_int_t wantz; magma_int_t lquery; magma_int_t alleig, valeig, indeig; magma_int_t lwmin; magma_queue_t stream; magma_queue_create( &stream ); wantz = (jobz == MagmaVec); lower = (uplo == MagmaLower); alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); lquery = (lwork == -1); *info = 0; if (itype < 1 || itype > 3) { *info = -1; } else if (! (alleig || valeig || indeig)) { *info = -2; } else if (! (wantz || (jobz == MagmaNoVec))) { *info = -3; } else if (! (lower || (uplo == MagmaUpper))) { *info = -4; } else if (n < 0) { *info = -5; } else if (lda < max(1,n)) { *info = -7; } else if (ldb < max(1,n)) { *info = -9; } else if (ldz < 1 || (wantz && ldz < n)) { *info = -18; } else { if (valeig) { if (n > 0 && vu <= vl) { *info = -11; } } else if (indeig) { if (il < 1 || il > max(1,n)) { *info = -12; } else if (iu < min(n,il) || iu > n) { *info = -13; } } } magma_int_t nb = magma_get_chetrd_nb(n); lwmin = n * (nb + 1); work[0] = MAGMA_C_MAKE( lwmin, 0 ); if (lwork < lwmin && ! lquery) { *info = -20; } if (*info != 0) { magma_xerbla( __func__, -(*info)); return *info; } else if (lquery) { return *info; } /* Quick return if possible */ if (n == 0) { return *info; } if (MAGMA_SUCCESS != magma_cmalloc( &dA, n*ldda ) || MAGMA_SUCCESS != magma_cmalloc( &dB, n*lddb ) || MAGMA_SUCCESS != magma_cmalloc( &dZ, n*lddz )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Form a Cholesky factorization of B. */ magma_csetmatrix( n, n, B, ldb, dB, lddb ); magma_csetmatrix_async( n, n, A, lda, dA, ldda, stream ); magma_cpotrf_gpu(uplo, n, dB, lddb, info); if (*info != 0) { *info = n + *info; /* free resources before the early return */ magma_queue_destroy( stream ); magma_free( dA ); magma_free( dB ); magma_free( dZ ); return *info; } magma_queue_sync( stream ); magma_cgetmatrix_async( n, n, dB, lddb, B, ldb, stream ); /* Transform problem to standard eigenvalue problem and solve. */ magma_chegst_gpu(itype, uplo, n, dA, ldda, dB, lddb, info); magma_cheevx_gpu(jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, abstol, m, w, dZ, lddz, A, lda, Z, ldz, work, lwork, rwork, iwork, ifail, info); if (wantz && *info == 0) { /* Backtransform eigenvectors to the original problem.
*/ if (itype == 1 || itype == 2) { /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x; backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */ if (lower) { trans = MagmaConjTrans; } else { trans = MagmaNoTrans; } magma_ctrsm(MagmaLeft, uplo, trans, MagmaNonUnit, n, *m, c_one, dB, lddb, dZ, lddz); } else if (itype == 3) { /* For B*A*x=(lambda)*x; backtransform eigenvectors: x = L*y or U'*y */ if (lower) { trans = MagmaNoTrans; } else { trans = MagmaConjTrans; } magma_ctrmm(MagmaLeft, uplo, trans, MagmaNonUnit, n, *m, c_one, dB, lddb, dZ, lddz); } magma_cgetmatrix( n, *m, dZ, lddz, Z, ldz ); } magma_queue_sync( stream ); magma_queue_destroy( stream ); magma_free( dA ); magma_free( dB ); magma_free( dZ ); return *info; } /* magma_chegvx */
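/*
   Usage sketch (illustrative, not from the MAGMA sources): compute the
   IL-th through IU-th eigenvalues of A*x = (lambda)*B*x with magma_chegvx.
   Workspace sizes follow the documentation above. The driver name
   example_chegvx is hypothetical; hA and hB are caller-supplied Hermitian
   matrices (B positive definite), magma_init() is assumed to have been
   called, and lapackf77_slamch is assumed available via magma_lapack.h.
*/
#include <stdlib.h>
#include "magma.h"
#include "magma_lapack.h"

static magma_int_t example_chegvx( magma_int_t n, magmaFloatComplex *hA, magmaFloatComplex *hB, magma_int_t lda )
{
    magma_int_t info, m, il = 1, iu = (n < 5 ? n : 5);
    magma_int_t nb    = magma_get_chetrd_nb( n );
    magma_int_t lwork = n*(nb + 1);                 /* minimum enforced by the routine above */
    float abstol      = 2 * lapackf77_slamch("S");  /* tolerance recommended by the docs */

    float       *w     = (float*)       malloc(   n * sizeof(float) );
    float       *rwork = (float*)       malloc( 7*n * sizeof(float) );
    magma_int_t *iwork = (magma_int_t*) malloc( 5*n * sizeof(magma_int_t) );
    magma_int_t *ifail = (magma_int_t*) malloc(   n * sizeof(magma_int_t) );
    magmaFloatComplex *Z, *work;
    magma_cmalloc_cpu( &Z,    lda*(iu - il + 1) );  /* max(1,M) columns, M = IU-IL+1 */
    magma_cmalloc_cpu( &work, lwork );

    magma_chegvx( 1, MagmaVec, MagmaRangeI, MagmaLower, n,
                  hA, lda, hB, lda, 0, 0, il, iu, abstol,
                  &m, w, Z, lda, work, lwork, rwork, iwork, ifail, &info );
    /* on info == 0: w[0..m-1] holds the eigenvalues, Z their eigenvectors */

    free( w ); free( rwork ); free( iwork ); free( ifail );
    magma_free_cpu( Z ); magma_free_cpu( work );
    return info;
}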
/** Purpose ------- CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. This version does not require work space on the GPU passed as input. GPU memory is allocated in the routine. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with a user-defined stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] A COMPLEX array, dimension (LDA,N) On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using magma_malloc_pinned. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occurred, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrf(magma_int_t m, magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i,j) (dAT + (i)*nb*ldda + (j)*nb) magmaFloatComplex *dAT, *dA, *da, *work; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magma_int_t iinfo, nb; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; nb = magma_get_cgetrf_nb(m); if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code. */ lapackf77_cgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code.
*/ magma_int_t maxm, maxn, ldda, maxdim; magma_int_t i, rows, cols, s = min(m, n)/nb; maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; maxdim = max(maxm, maxn); /* set number of GPUs */ magma_int_t num_gpus = magma_num_gpus(); if ( num_gpus > 1 ) { /* call multi-GPU non-GPU-resident interface */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } /* explicitly checking the memory requirement */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaFloatComplex); int h = 1+(2+num_gpus), num_gpus2 = num_gpus; int NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); if ( num_gpus > ceil((float)NB/nb) ) { num_gpus2 = (int)ceil((float)NB/nb); h = 1+(2+num_gpus2); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } if ( num_gpus2*NB < n ) { /* require too much memory, so call non-GPU-resident version */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } ldda = maxn; work = A; if (maxdim*maxdim < 2*maxm*maxn) { // if close to square, allocate square matrix and transpose in-place if (MAGMA_SUCCESS != magma_cmalloc( &dA, nb*maxm + maxdim*maxdim )) { /* alloc failed so call non-GPU-resident version */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } da = dA + nb*maxm; ldda = maxdim; magma_csetmatrix( m, n, A, lda, da, ldda ); dAT = da; magmablas_ctranspose_inplace( ldda, dAT, ldda ); } else { // if very rectangular, allocate dA and dAT and transpose out-of-place if (MAGMA_SUCCESS != magma_cmalloc( &dA, (nb + maxn)*maxm )) { /* alloc failed so call non-GPU-resident version */ magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } da = dA + nb*maxm; magma_csetmatrix( m, n, A, lda, da, maxm ); if (MAGMA_SUCCESS != magma_cmalloc( &dAT, maxm*maxn )) { /* alloc failed so call non-GPU-resident version */ magma_free( dA ); magma_cgetrf_m(num_gpus, m, n, A, lda, ipiv, info); return *info; } magmablas_ctranspose( m, n, da, maxm, dAT, ldda ); } lapackf77_cgetrf( &m, &nb, work, &lda, ipiv, &iinfo); /* Define user stream if current stream is NULL */ cudaStream_t stream[2], current_stream; magmablasGetKernelStream(&current_stream); magma_queue_create( &stream[0] ); if (current_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else stream[1] = current_stream; for( i = 0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; if (i > 0) { magmablas_ctranspose( nb, cols, dAT(i,i), ldda, dA, cols ); // make sure that gpu queue is empty magma_device_sync(); magma_cgetmatrix_async( m-i*nb, nb, dA, cols, work, lda, stream[0]); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), ldda, dAT(i-1,i+1), ldda ); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), ldda, dAT(i, i-1), ldda, c_one, dAT(i, i+1), ldda ); // do the cpu part rows = m - i*nb; magma_queue_sync( stream[0] ); lapackf77_cgetrf( &rows, &nb, work, &lda, ipiv+i*nb, &iinfo); } if (*info == 0 && iinfo > 0) *info = iinfo + i*nb; // upload i-th panel magma_csetmatrix_async( m-i*nb, nb, work, lda, dA, cols, stream[0]); magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb, i*nb ); magma_queue_sync( stream[0] ); magmablas_ctranspose( cols, nb, dA, cols, dAT(i,i), ldda ); // do the small non-parallel computations if (s > (i+1)) { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ),
ldda, dAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), ldda, dAT(i+1, i ), ldda, c_one, dAT(i+1, i+1), ldda ); } else { magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), ldda, dAT(i, i+1), ldda); magma_cgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), ldda, dAT(i+1, i ), ldda, c_one, dAT(i+1, i+1), ldda ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ctranspose( nb0, rows, dAT(s,s), ldda, dA, cols ); magma_cgetmatrix( rows, nb0, dA, cols, work, lda ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_cgetrf( &rows, &nb0, work, &lda, ipiv+s*nb, &iinfo); if (*info == 0 && iinfo > 0) *info = iinfo + s*nb; magmablas_cpermute_long2( ldda, dAT, ldda, ipiv, nb0, s*nb ); magma_csetmatrix( rows, nb0, work, lda, dA, cols ); magmablas_ctranspose( rows, nb0, dA, cols, dAT(s,s), ldda ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s, s), ldda, dAT(s, s)+nb0, ldda); } if (maxdim*maxdim < 2*maxm*maxn) { magmablas_ctranspose_inplace( ldda, dAT, ldda ); magma_cgetmatrix( m, n, da, ldda, A, lda ); } else { magmablas_ctranspose( n, m, dAT, ldda, da, maxm ); magma_cgetmatrix( m, n, da, maxm, A, lda ); magma_free( dAT ); } magma_free( dA ); magma_queue_destroy( stream[0] ); if (current_stream == NULL) { magma_queue_destroy( stream[1] ); magmablasSetKernelStream(NULL); } } return *info; } /* magma_cgetrf */
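/*
   Usage sketch (illustrative, not from the MAGMA sources): the CPU
   interface above takes the matrix in host memory and manages the GPU
   internally, so calling it is essentially a drop-in replacement for
   LAPACK's cgetrf. The driver name example_cgetrf is hypothetical;
   magma_init() is assumed to have been called, and pinned memory is used
   since, as documented above, it improves transfer performance.
*/
#include <stdlib.h>
#include "magma.h"

static magma_int_t example_cgetrf( magma_int_t m, magma_int_t n )
{
    magma_int_t info, lda = m;
    magma_int_t minmn = (m < n ? m : n);
    magma_int_t *ipiv = (magma_int_t*) malloc( minmn * sizeof(magma_int_t) );
    magmaFloatComplex *hA;

    if (MAGMA_SUCCESS != magma_cmalloc_pinned( &hA, lda*n )) {  /* pinned host storage */
        free( ipiv );
        return MAGMA_ERR_HOST_ALLOC;
    }

    /* ... fill hA with the m-by-n matrix to factor ... */

    magma_cgetrf( m, n, hA, lda, ipiv, &info );
    /* on info == 0, hA holds the factors L and U of A = P*L*U */

    magma_free_pinned( hA );
    free( ipiv );
    return info;
}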
/** Purpose ------- CTRTRI computes the inverse of a complex upper or lower triangular matrix dA. This is the Level 3 BLAS version of the algorithm. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: A is upper triangular; - = MagmaLower: A is lower triangular. @param[in] diag magma_diag_t - = MagmaNonUnit: A is non-unit triangular; - = MagmaUnit: A is unit triangular. @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX array ON THE GPU, dimension (LDDA,N) On entry, the triangular matrix A. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of the array dA contains the upper triangular matrix, and the strictly lower triangular part of A is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of the array dA contains the lower triangular matrix, and the strictly upper triangular part of A is not referenced. If DIAG = MagmaUnit, the diagonal elements of A are also not referenced and are assumed to be 1. On exit, the (triangular) inverse of the original matrix, in the same storage format. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, dA(i,i) is exactly zero. The triangular matrix is singular and its inverse cannot be computed. (Singularity check is currently disabled.) @ingroup magma_cgesv_aux ********************************************************************/ extern "C" magma_int_t magma_ctrtri_gpu( magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define dA(i, j) (dA+(j)*ldda + (i)) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); const char* diag_ = lapack_diag_const( diag ); magma_int_t nb, nn, j, jb; //magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *work; int upper = (uplo == MagmaUpper); int nounit = (diag == MagmaNonUnit); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (!
nounit && diag != MagmaUnit) *info = -2; else if (n < 0) *info = -3; else if (ldda < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Check for singularity if non-unit */ /* cannot do here with matrix dA on GPU -- need kernel */ /* if (nounit) { for (j=0; j < n; ++j) { if ( MAGMA_C_EQUAL( *dA(j,j), c_zero )) { *info = j+1; // Fortran index return *info; } } } */ /* Determine the block size for this environment */ nb = magma_get_cpotrf_nb(n); if (MAGMA_SUCCESS != magma_cmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if (nb <= 1 || nb >= n) { magma_cgetmatrix( n, n, dA, ldda, work, n ); lapackf77_ctrtri( uplo_, diag_, &n, work, &n, info ); magma_csetmatrix( n, n, work, n, dA, ldda ); } else { if (upper) { /* Compute inverse of upper triangular matrix */ for (j=0; j < n; j += nb) { jb = min(nb, (n-j)); /* Compute rows 1:j-1 of current block column */ magma_ctrmm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_one, dA(0,0), ldda, dA(0, j), ldda ); magma_ctrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, j, jb, c_neg_one, dA(j,j), ldda, dA(0, j), ldda ); magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] ); magma_queue_sync( stream[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri( MagmaUpperStr, diag_, &jb, work, &jb, info ); magma_csetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] ); } } else { /* Compute inverse of lower triangular matrix */ nn = ((n-1)/nb)*nb+1; for (j=nn-1; j >= 0; j -= nb) { jb = min(nb,(n-j)); if ((j+jb) < n) { /* Compute rows j+jb:n of current block column */ magma_ctrmm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j+jb,j+jb), ldda, dA(j+jb, j), ldda ); magma_ctrsm( MagmaRight, MagmaLower, MagmaNoTrans, MagmaNonUnit, (n-j-jb), jb, c_neg_one, dA(j,j), ldda, dA(j+jb, j), ldda ); } magma_cgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] ); magma_queue_sync( stream[1] ); /* Compute inverse of current diagonal block */ lapackf77_ctrtri( MagmaLowerStr, diag_, &jb, work, &jb, info ); magma_csetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; }
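/*
   Usage sketch (illustrative, not from the MAGMA sources): a typical use
   of magma_ctrtri_gpu is inverting the triangular Cholesky factor
   produced by magma_cpotrf_gpu, for example as one step of forming the
   inverse of a Hermitian positive definite matrix. The driver name
   example_ctrtri_gpu is hypothetical; dA is an n-by-n device matrix
   holding A on entry, and magma_init() is assumed to have been called.
*/
#include "magma.h"

static magma_int_t example_ctrtri_gpu( magma_int_t n, magmaFloatComplex_ptr dA, magma_int_t ldda )
{
    magma_int_t info;

    /* Factor A = U**H * U on the GPU ... */
    magma_cpotrf_gpu( MagmaUpper, n, dA, ldda, &info );
    if (info != 0)
        return info;   /* not positive definite or bad argument */

    /* ... then overwrite the upper triangle with inv(U). */
    magma_ctrtri_gpu( MagmaUpper, MagmaNonUnit, n, dA, ldda, &info );
    return info;
}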
/** Purpose ------- Solves a system of linear equations A * X = B, A**T * X = B, or A**H * X = B with a general N-by-N matrix A using the LU factorization computed by CGETRF_GPU. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in] dA COMPLEX array on the GPU, dimension (LDDA,N) The factors L and U from the factorization A = P*L*U as computed by CGETRF_GPU. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,N). @param[in] ipiv INTEGER array, dimension (N) The pivot indices from CGETRF; for 1 <= i <= N, row i of the matrix was interchanged with row IPIV(i). @param[in,out] dB COMPLEX array on the GPU, dimension (LDDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. @param[in] lddb INTEGER The leading dimension of the array B. LDDB >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value @ingroup magma_cgesv_comp ********************************************************************/ extern "C" magma_int_t magma_cgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaFloatComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magmaFloatComplex_ptr dB, magma_int_t lddb, magma_int_t *info) { magmaFloatComplex c_one = MAGMA_C_ONE; magmaFloatComplex *work = NULL; int notran = (trans == MagmaNoTrans); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (trans != MagmaTrans) && (trans != MagmaConjTrans) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_cmalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_cgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_claswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_csetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_ctrsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A**T * X = B or A**H * X = B. */ if ( nrhs == 1) { magma_ctrsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_ctrsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_cgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_claswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_csetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }
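/*
   Usage sketch (illustrative, not from the MAGMA sources): solving
   A * X = B by chaining magma_cgetrf_gpu and magma_cgetrs_gpu, the GPU
   analogue of LAPACK's cgesv. The driver name example_solve_gpu is
   hypothetical; dA and dB are device arrays already holding A and B,
   ipiv has room for n pivots, and magma_init() is assumed to have been
   called. On success dB is overwritten with the solution X.
*/
#include "magma.h"

static magma_int_t example_solve_gpu( magma_int_t n, magma_int_t nrhs,
                                      magmaFloatComplex_ptr dA, magma_int_t ldda,
                                      magmaFloatComplex_ptr dB, magma_int_t lddb,
                                      magma_int_t *ipiv )
{
    magma_int_t info;

    magma_cgetrf_gpu( n, n, dA, ldda, ipiv, &info );   /* A = P*L*U on the GPU */
    if (info != 0)
        return info;                                   /* singular U or bad argument */

    magma_cgetrs_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv, dB, lddb, &info );
    return info;
}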
extern "C" magma_int_t magma_cgetrs_gpu(char trans, magma_int_t n, magma_int_t nrhs, cuFloatComplex *dA, magma_int_t ldda, magma_int_t *ipiv, cuFloatComplex *dB, magma_int_t lddb, magma_int_t *info) { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2012 Purpose ======= Solves a system of linear equations A * X = B or A' * X = B with a general N-by-N matrix A using the LU factorization computed by CGETRF_GPU. Arguments ========= TRANS (input) CHARACTER*1 Specifies the form of the system of equations: = 'N': A * X = B (No transpose) = 'T': A'* X = B (Transpose) = 'C': A'* X = B (Conjugate transpose = Transpose) N (input) INTEGER The order of the matrix A. N >= 0. NRHS (input) INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. A (input) COMPLEX array on the GPU, dimension (LDA,N) The factors L and U from the factorization A = P*L*U as computed by CGETRF_GPU. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,N). IPIV (input) INTEGER array, dimension (N) The pivot indices from CGETRF; for 1<=i<=N, row i of the matrix was interchanged with row IPIV(i). B (input/output) COMPLEX array on the GPU, dimension (LDB,NRHS) On entry, the right hand side matrix B. On exit, the solution matrix X. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,N). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value HWORK (workspace) COMPLEX array, dimension N*NRHS ===================================================================== */ cuFloatComplex c_one = MAGMA_C_ONE; cuFloatComplex *work = NULL; char trans_[2] = {trans, 0}; int notran = lapackf77_lsame(trans_, "N"); magma_int_t i1, i2, inc; *info = 0; if ( (! notran) && (! lapackf77_lsame(trans_, "T")) && (! lapackf77_lsame(trans_, "C")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (nrhs < 0) { *info = -3; } else if (ldda < max(1,n)) { *info = -5; } else if (lddb < max(1,n)) { *info = -8; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (n == 0 || nrhs == 0) { return *info; } magma_cmalloc_cpu( &work, n * nrhs ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } i1 = 1; i2 = n; if (notran) { inc = 1; /* Solve A * X = B. */ magma_cgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_claswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_csetmatrix( n, nrhs, work, n, dB, lddb ); if ( nrhs == 1) { magma_ctrsv(MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_ctrsm(MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } } else { inc = -1; /* Solve A' * X = B. */ if ( nrhs == 1) { magma_ctrsv(MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1 ); magma_ctrsv(MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1 ); } else { magma_ctrsm(MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); magma_ctrsm(MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb ); } magma_cgetmatrix( n, nrhs, dB, lddb, work, n ); lapackf77_claswp(&nrhs, work, &n, &i1, &i2, ipiv, &inc); magma_csetmatrix( n, nrhs, work, n, dB, lddb ); } magma_free_cpu(work); return *info; }