extern "C" magma_int_t magma_zlarfb2_gpu( magma_int_t m, magma_int_t n, magma_int_t k, magmaDoubleComplex_const_ptr dV, magma_int_t lddv, magmaDoubleComplex_const_ptr dT, magma_int_t lddt, magmaDoubleComplex_ptr dC, magma_int_t lddc, magmaDoubleComplex_ptr dwork, magma_int_t ldwork, magma_queue_t queue ) { magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; if (m <= 0 || n <= 0) return MAGMA_SUCCESS; // W = C^H V magma_zgemm( MagmaConjTrans, MagmaNoTrans, //magmablas_zgemm_reduce( n, k, m, c_one, dC, lddc, dV, lddv, c_zero, dwork, ldwork, queue ); // W = W T^H = C^H V T^H magma_ztrmm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, k, c_one, dT, lddt, dwork, ldwork, queue ); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C magma_zgemm( MagmaNoTrans, MagmaConjTrans, m, n, k, c_neg_one, dV, lddv, dwork, ldwork, c_one, dC, lddc, queue ); return MAGMA_SUCCESS; }
void magmablas_zherk_gpu(
    magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magma_int_t nb,
    double alpha, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t a_offset,
    double beta,  magmaDoubleComplex_ptr dC, magma_int_t lddc, magma_int_t c_offset)
{
    #define dA(i_, j_) (dA + (i_) + (j_)*ldda + (a_offset) )
    #define dC(i_, j_) (dC + (i_) + (j_)*lddc)

    magma_trans_t transA;
    magma_trans_t transB;
    magmaDoubleComplex cbeta  = MAGMA_Z_MAKE( beta,  0. );
    magmaDoubleComplex calpha = MAGMA_Z_MAKE( alpha, 0. );

    if (trans == MagmaNoTrans) {
        transA = MagmaNoTrans;
        transB = MagmaConjTrans;
    } else {
        transA = MagmaConjTrans;
        transB = MagmaNoTrans;
    }

    if (uplo == MagmaUpper) {
        fprintf( stderr, "%s: uplo=Upper not supported\n", __func__ );
        return;
    }

    magma_int_t ib, ioff;
    magma_int_t blockoffset = c_offset % nb;

    // loop over all blocks, computing C += alpha * A * A**H block-column by block-column;
    // blockoffset is c_offset within the first block; for subsequent blocks it is 0
    for( magma_int_t i = 0; i < n; i += ib ) {
        ib   = min( nb-blockoffset, n-i );  // block size
        ioff = i + c_offset;                // global index in parent matrix
        // C[i:n,i] += A[i:n,0] * A[i,0]'
        //printf( "zgemm n=%4d, ib=%4d, k=%4d, i=%4d ioff=%4d\n", n-i, ib, k, i, ioff );
        magma_zgemm( transA, transB, n-i, ib, k,
                     calpha, dA(i,0),       ldda,
                             dA(i,0),       ldda,
                     cbeta,  dC(ioff,ioff), lddc );
        blockoffset = 0;
    }
}
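/* Usage sketch (illustrative, not from the original source): a Cholesky-style
   trailing-matrix update C := C - A*A^H on the lower triangle of an n-by-n
   block that starts c_offset rows/columns into its parent GPU allocation.
   Allocation sizes and offsets below are placeholders. */
void example_herk_update( magma_int_t n, magma_int_t k, magma_int_t nb, magma_int_t c_offset )
{
    magmaDoubleComplex_ptr dA, dC;
    magma_int_t ldda = ((n+31)/32)*32;
    magma_int_t lddc = ((n+c_offset+31)/32)*32;
    magma_zmalloc( &dA, ldda*k );               // n-by-k factor panel
    magma_zmalloc( &dC, lddc*(n+c_offset) );    // parent matrix holding C

    /* ... fill dA and dC ... */

    magmablas_zherk_gpu( MagmaLower, MagmaNoTrans, n, k, nb,
                         -1.0, dA, ldda, /*a_offset=*/ 0,
                          1.0, dC, lddc, c_offset );

    magma_free( dA );  magma_free( dC );
}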
void gmm_magma(const Tensor_core<complex<double>,2>& A, const Tensor_core<complex<double>,2>& B,
               Tensor_core<complex<double>,2>& C,
               char TRANSA, char TRANSB, complex<double> alpha, complex<double> beta)
{
    int AL0 = A.rank(0); int AL1 = A.rank(1);
    int BL0 = B.rank(0); int BL1 = B.rank(1);
    int CL0 = C.rank(0); int CL1 = C.rank(1);

    magma_int_t M, N, K, LDA, LDB, LDC;
    magma_trans_t transA = magma_trans_const(TRANSA), transB = magma_trans_const(TRANSB);
    magmaDoubleComplex_ptr d_A, d_B, d_C;

    // Set LDA, LDB, and LDC, rounded up to a multiple of 32 for best GPU performance
    LDA = ((AL0+31)/32)*32;
    LDB = ((BL0+31)/32)*32;
    LDC = ((CL0+31)/32)*32;

    // Allocate memory for the matrices on the GPU
    magma_zmalloc(&d_A, LDA*AL1 );
    magma_zmalloc(&d_B, LDB*BL1 );
    magma_zmalloc(&d_C, LDC*CL1 );

    // Copy data from host (CPU) to device (GPU)
    magma_zsetmatrix( AL0, AL1, (magmaDoubleComplex* ) A.data(), AL0, d_A, LDA );
    magma_zsetmatrix( BL0, BL1, (magmaDoubleComplex* ) B.data(), BL0, d_B, LDB );
    // C only needs to be uploaded when beta is nonzero
    if( abs(beta)>1e-32 ) magma_zsetmatrix( CL0, CL1, (magmaDoubleComplex* ) C.data(), CL0, d_C, LDC );

    // Call magma_zgemm
    M = ( TRANSA=='N' || TRANSA=='n' ) ? AL0 : AL1;
    K = ( TRANSA=='N' || TRANSA=='n' ) ? AL1 : AL0;
    N = ( TRANSB=='N' || TRANSB=='n' ) ? BL1 : BL0;
    magma_zgemm(transA, transB, M, N, K, _cast_Z(alpha), d_A, LDA, d_B, LDB, _cast_Z(beta), d_C, LDC);

    // Copy the result from device (GPU) back to host (CPU)
    magma_zgetmatrix(CL0, CL1, d_C, LDC, (magmaDoubleComplex* ) C.data(), CL0);

    // Free memory on the GPU
    magma_free(d_A); magma_free(d_B); magma_free(d_C);
}
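/* Hypothetical call site for the wrapper above.  The brace-initialized
   Tensor_core constructor and the sizes m, k, n are assumptions about the
   surrounding tensor library, not part of MAGMA; with beta = 0 the host
   contents of C are never uploaded. */
Tensor_core<complex<double>,2> A({m,k}), B({k,n}), C({m,n});  // hypothetical constructor
// ... fill A and B on the host ...
gmm_magma( A, B, C, 'N', 'N', complex<double>(1.0,0.0), complex<double>(0.0,0.0) );
// C now holds A*B; with TRANSA='C' the wrapper would use M = A.rank(1) and K = A.rank(0) instead.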
/**
    Purpose
    -------
    ZGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    This version does not require work space on the GPU passed as input.
    GPU memory is allocated in the routine. The matrix may exceed the
    GPU memory.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of each big panel is done by calling the
    multi-GPU interface. Pivots are applied on the GPU within the big panel.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
    \n
            Higher performance is achieved if A is in pinned memory,
            e.g. allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory allocation.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
@ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv, magma_int_t *info) { #define A(i,j) (A + (j)*lda + (i)) #define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb) #define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm) magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0; timer_start( time_total ); real_Double_t flops; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs]; magma_int_t iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local; magma_int_t N, M, NB, NBk, I, d, ngpu0 = ngpu; magma_int_t ii, jj, h, offset, ib, rows; magma_queue_t stream[MagmaMaxGPUs][2]; magma_event_t event[MagmaMaxGPUs][2]; *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* initialize nb */ nb = magma_get_zgetrf_nb(m); maxm = ((m + 31)/32)*32; /* figure out NB */ size_t freeMem, totalMem; cudaMemGetInfo( &freeMem, &totalMem ); freeMem /= sizeof(magmaDoubleComplex); /* number of columns in the big panel */ h = 1+(2+ngpu0); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); const char* ngr_nb_char = getenv("MAGMA_NGR_NB"); if ( ngr_nb_char != NULL ) NB = max( nb, min( NB, atoi(ngr_nb_char) ) ); //NB = 5*max(nb,32); if ( ngpu0 > ceil((double)NB/nb) ) { ngpu = (int)ceil((double)NB/nb); h = 1+(2+ngpu); NB = (magma_int_t)(0.8*freeMem/maxm-h*nb); } else { ngpu = ngpu0; } if ( ngpu*NB >= n ) { #ifdef CHECK_ZGETRF_OOC printf( " * still fit in GPU memory.\n" ); #endif NB = n; } else { #ifdef CHECK_ZGETRF_OOC printf( " * don't fit in GPU memory.\n" ); #endif NB = ngpu*NB; NB = max( nb, (NB / nb) * nb); /* making sure it's devisable by nb (x64) */ } #ifdef CHECK_ZGETRF_OOC if ( NB != n ) printf( " * running in out-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); else printf( " * running in in-core mode (n=%d, NB=%d, nb=%d, freeMem=%.2e).\n", n, NB, nb, (double)freeMem ); #endif if ( (nb <= 1) || (nb >= min(m,n)) ) { /* Use CPU code for scalar of one tile. */ lapackf77_zgetrf(&m, &n, A, &lda, ipiv, info); } else { /* Use hybrid blocked code. 
*/ /* allocate memory on GPU to store the big panel */ timer_start( time_alloc ); n_local[0] = (NB/nb)/ngpu; if ( NB%(nb*ngpu) != 0 ) n_local[0]++; n_local[0] *= nb; ldn_local = ((n_local[0]+31)/32)*32; for( d=0; d < ngpu; d++ ) { magma_setdevice(d); if (MAGMA_SUCCESS != magma_zmalloc( &dA[d], (ldn_local+h*nb)*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dPT[d] = dA[d] + nb*maxm; /* for storing the previous panel from CPU */ dAT[d] = dA[d] + h*nb*maxm; /* for storing the big panel */ magma_queue_create( &stream[d][0] ); magma_queue_create( &stream[d][1] ); magma_event_create( &event[d][0] ); magma_event_create( &event[d][1] ); } //magma_setdevice(0); timer_stop( time_alloc ); for( I=0; I < n; I += NB ) { M = m; N = min( NB, n-I ); /* number of columns in this big panel */ //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */ maxm = ((M + 31)/32)*32; if ( ngpu0 > ceil((double)N/nb) ) { ngpu = (int)ceil((double)N/nb); } else { ngpu = ngpu0; } for( d=0; d < ngpu; d++ ) { n_local[d] = ((N/nb)/ngpu)*nb; if (d < (N/nb)%ngpu) n_local[d] += nb; else if (d == (N/nb)%ngpu) n_local[d] += N%nb; } ldn_local = ((n_local[0]+31)/32)*32; /* upload the next big panel into GPU, transpose (A->A'), and pivot it */ timer_start( time ); magmablas_zsetmatrix_transpose_mgpu(ngpu, stream, A(0,I), lda, dAT, ldn_local, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_set += timer_stop( time ); timer_start( time ); /* == --------------------------------------------------------------- == */ /* == loop around the previous big-panels to update the new big-panel == */ for( offset = 0; offset < min(m,I); offset += NB ) { NBk = min( m-offset, NB ); /* start sending the first tile from the previous big-panels to gpus */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); nbi = min( nb, NBk ); magma_zsetmatrix_async( (M-offset), nbi, A(offset,offset), lda, dA[d], (maxm-offset), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][0] ); /* transpose */ magmablas_ztranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb ); } /* applying the pivot from the previous big-panel */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magmablas_zlaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, stream[d][1] ); } /* == going through each block-column of previous big-panels == */ for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) { ii = offset+jj; rows = maxm - ii; nbi = min( nb, NBk-jj ); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); /* wait for a block-column on GPU */ magma_queue_sync( stream[d][0] ); /* start sending next column */ if ( jj+nb < NBk ) { magma_zsetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb), A(ii+nb,ii+nb), lda, dA[d], (rows-nb), stream[d][0] ); /* make sure the previous update finished */ magmablasSetKernelStream(stream[d][0]); //magma_queue_sync( stream[d][1] ); magma_queue_wait_event( stream[d][0], event[d][(1+jj/nb)%2] ); /* transpose next column */ magmablas_ztranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb ); } /* update with the block column */ magmablasSetKernelStream(stream[d][1]); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local ); if ( M > ii+nb ) { magma_zgemm( MagmaNoTrans, MagmaNoTrans, 
n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local, dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local ); } magma_event_record( event[d][(jj/nb)%2], stream[d][1] ); } /* end of for each block-columns in a big-panel */ } } /* end of for each previous big-panels */ for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } /* calling magma-gpu interface to panel-factorize the big panel */ if ( M > I ) { magma_zgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda, stream, &iinfo); if ( iinfo < 0 ) { *info = iinfo; break; } else if ( iinfo != 0 ) { *info = iinfo + I * NB; //break; } /* adjust pivots */ for( ii=I; ii < min(I+N,m); ii++ ) ipiv[ii] += I; } time_comp += timer_stop( time ); /* download the current big panel to CPU */ timer_start( time ); magmablas_zgetmatrix_transpose_mgpu(ngpu, stream, dAT, ldn_local, A(0,I), lda, dA, maxm, M, N, nb); for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); } time_get += timer_stop( time ); } /* end of for */ timer_stop( time_total ); flops = FLOPS_ZGETRF( m, n ) / 1e9; timer_printf(" memory-allocation time: %e\n", time_alloc ); timer_printf(" NB=%d nb=%d\n", (int) NB, (int) nb ); timer_printf(" memcopy and transpose %e seconds\n", time_set ); timer_printf(" total time %e seconds\n", time_total ); timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n", flops / (time_comp), time_comp ); timer_printf(" Performance %f GFlop/s, %f seconds with htod\n", flops / (time_comp + time_set), time_comp + time_set ); timer_printf(" Performance %f GFlop/s, %f seconds with dtoh\n", flops / (time_comp + time_get), time_comp + time_get ); timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc ); for( d=0; d < ngpu0; d++ ) { magma_setdevice(d); magma_free( dA[d] ); magma_event_destroy( event[d][0] ); magma_event_destroy( event[d][1] ); magma_queue_destroy( stream[d][0] ); magma_queue_destroy( stream[d][1] ); } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); } if ( *info >= 0 ) magma_zgetrf_piv(m, n, NB, A, lda, ipiv, info); return *info; } /* magma_zgetrf_m */
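/* Usage sketch for the out-of-core factorization above (illustrative; sizes
   are placeholders).  The matrix lives in host memory -- pinned, as the
   documentation above recommends -- and may be larger than GPU memory. */
void example_getrf_m( magma_int_t ngpu, magma_int_t m, magma_int_t n )
{
    magmaDoubleComplex *A;
    magma_int_t *ipiv;
    magma_int_t lda = m, info;
    magma_zmalloc_pinned( &A, lda*n );
    magma_imalloc_cpu( &ipiv, min(m,n) );

    /* ... fill A with the matrix to factor ... */

    magma_zgetrf_m( ngpu, m, n, A, lda, ipiv, &info );
    if ( info < 0 )      { /* illegal argument or allocation failure */ }
    else if ( info > 0 ) { /* U(info,info) is exactly zero */ }

    magma_free_pinned( A );
    magma_free_cpu( ipiv );
}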
/** Purpose ------- ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. If the current stream is NULL, this version replaces it with a new stream to overlap computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i_, j_) (dAT + (i_)*nb*lddat + (j_)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, cols, s, lddat, ldwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { lddat = maxn; // N-by-M if (MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA, ldda, dAT, lddat ); } ldwork = maxm; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, ldwork*nb )) { magma_free( dAP ); if ( ! 
(m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } /* Define user stream if current stream is NULL */ magma_queue_t stream[2]; magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_create( &stream[0] ); if (orig_stream == NULL) { magma_queue_create( &stream[1] ); magmablasSetKernelStream(stream[1]); } else { stream[1] = orig_stream; } for( j=0; j < s; j++ ) { // download j-th panel cols = maxm - j*nb; magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP, cols ); // make sure that the transpose has completed magma_queue_sync( stream[1] ); magma_zgetmatrix_async( m-j*nb, nb, dAP, cols, work, ldwork, stream[0]); if ( j > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat ); } // do the cpu part rows = m - j*nb; magma_queue_sync( stream[0] ); lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work, ldwork, dAP, maxm, stream[0]); for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT, lddat, j*nb + 1, j*nb + nb, ipiv, 1 ); magma_queue_sync( stream[0] ); magmablas_ztranspose( m-j*nb, nb, dAP, maxm, dAT(j,j), lddat ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); if ( nb0 > 0 ) { rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm ); magma_zgetmatrix( rows, nb0, dAP, maxm, work, ldwork ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo); if ( *info == 0 && iinfo > 0 ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT, lddat, s*nb + 1, s*nb + nb0, ipiv, 1 ); // upload j-th panel magma_zsetmatrix( rows, nb0, work, ldwork, dAP, maxm ); magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); } // undo transpose if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose( n, m, dAT, lddat, dA, ldda ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); magma_queue_destroy( stream[0] ); if (orig_stream == NULL) { magma_queue_destroy( stream[1] ); } magmablasSetKernelStream( orig_stream ); } return *info; } /* magma_zgetrf_gpu */
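/* Usage sketch for the GPU-resident factorization above (illustrative): factor
   an n-by-n matrix already on the device, then solve A*X = B with the factors.
   magma_zgetrs_gpu is called with the same argument order used later in this
   file (the zcgesv fallback path); sizes are placeholders. */
void example_getrf_getrs_gpu( magma_int_t n, magma_int_t nrhs )
{
    magma_int_t ldda = ((n+31)/32)*32, lddb = ldda, info;
    magmaDoubleComplex_ptr dA, dB;
    magma_int_t *ipiv;
    magma_zmalloc( &dA, ldda*n );
    magma_zmalloc( &dB, lddb*nrhs );
    magma_imalloc_cpu( &ipiv, n );

    /* ... upload A and B with magma_zsetmatrix ... */

    magma_zgetrf_gpu( n, n, dA, ldda, ipiv, &info );
    if ( info == 0 )
        magma_zgetrs_gpu( MagmaNoTrans, n, nrhs, dA, ldda, ipiv, dB, lddb, &info );

    magma_free( dA );  magma_free( dB );  magma_free_cpu( ipiv );
}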
int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, t1, t2; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t ione = 1; magma_trans_t trans[] = { MagmaNoTrans, MagmaConjTrans, MagmaTrans }; magma_uplo_t uplo [] = { MagmaLower, MagmaUpper }; magma_diag_t diag [] = { MagmaUnit, MagmaNonUnit }; magma_side_t side [] = { MagmaLeft, MagmaRight }; magmaDoubleComplex *A, *B, *C, *C2, *LU; magmaDoubleComplex *dA, *dB, *dC1, *dC2; magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 ); magmaDoubleComplex beta = MAGMA_Z_MAKE( 0.7, 0.2 ); double dalpha = 0.6; double dbeta = 0.8; double work[1], error, total_error; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t m, n, k, size, maxn, ld, info; magma_int_t *piv; magma_int_t err; magma_opts opts; parse_opts( argc, argv, &opts ); printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" ); total_error = 0.; for( int itest = 0; itest < opts.ntest; ++itest ) { m = opts.msize[itest]; n = opts.nsize[itest]; k = opts.ksize[itest]; printf("=========================================================================\n"); printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k ); // allocate matrices // over-allocate so they can be any combination of {m,n,k} x {m,n,k}. maxn = max( max( m, n ), k ); ld = max( 1, maxn ); size = ld*maxn; err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) ); assert( err == 0 ); err = magma_zmalloc_pinned( &A, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &B, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &C2, size ); assert( err == 0 ); err = magma_zmalloc_pinned( &LU, size ); assert( err == 0 ); err = magma_zmalloc( &dA, size ); assert( err == 0 ); err = magma_zmalloc( &dB, size ); assert( err == 0 ); err = magma_zmalloc( &dC1, size ); assert( err == 0 ); err = magma_zmalloc( &dC2, size ); assert( err == 0 ); // initialize matrices size = maxn*maxn; lapackf77_zlarnv( &ione, ISEED, &size, A ); lapackf77_zlarnv( &ione, ISEED, &size, B ); lapackf77_zlarnv( &ione, ISEED, &size, C ); printf( "========== Level 1 BLAS ==========\n" ); // ----- test ZSWAP // swap columns 2 and 3 of dA, then copy to C2 and compare with A if ( n >= 3 ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetmatrix( m, n, A, ld, dB, ld ); magma_zswap( m, dA(0,1), 1, dA(0,2), 1 ); magma_zswap( m, dB(0,1), 1, dB(0,2), 1 ); // check results, storing diff between magma and cuda calls in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dA, 1, dB, 1 ); magma_zgetmatrix( m, n, dB, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &k, C2, &ld, work ); total_error += error; printf( "zswap diff %.2g\n", error ); } else { printf( "zswap skipped for n < 3\n" ); } // ----- test IZAMAX // get argmax of column of A magma_zsetmatrix( m, k, A, ld, dA, ld ); error = 0; for( int j = 0; j < k; ++j ) { magma_int_t i1 = magma_izamax( m, dA(0,j), 1 ); int i2; // NOT magma_int_t, for cublas cublasIzamax( handle, m, dA(0,j), 1, &i2 ); // todo need sync here? 
assert( i1 == i2 ); error += abs( i1 - i2 ); } total_error += error; gflops = (double)m * k / 1e9; printf( "izamax diff %.2g\n", error ); printf( "\n" ); printf( "========== Level 2 BLAS ==========\n" ); // ----- test ZGEMV // c = alpha*A*b + beta*c, with A m*n; b,c m or n-vectors // try no-trans/trans for( int ia = 0; ia < 3; ++ia ) { magma_zsetmatrix( m, n, A, ld, dA, ld ); magma_zsetvector( maxn, B, 1, dB, 1 ); magma_zsetvector( maxn, C, 1, dC1, 1 ); magma_zsetvector( maxn, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemv( handle, cublas_trans_const(trans[ia]), m, n, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 size = (trans[ia] == MagmaNoTrans ? m : n); cublasZaxpy( handle, size, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( size, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMV( m, n ) / 1e9; printf( "zgemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZHEMV // c = alpha*A*b + beta*c, with A m*m symmetric; b,c m-vectors // try upper/lower for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetvector( m, B, 1, dB, 1 ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemv( handle, cublas_uplo_const(uplo[iu]), m, &alpha, dA, ld, dB, 1, &beta, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHEMV( m ) / 1e9; printf( "zhemv( %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); } printf( "\n" ); // ----- test ZTRSV // solve A*c = c, with A m*m triangular; c m-vector // try upper/lower, no-trans/trans, unit/non-unit diag // Factor A into LU to get well-conditioned triangles, else solve yields garbage. // Still can give garbage if solves aren't consistent with LU factors, // e.g., using unit diag for U, so copy lower triangle to upper triangle. // Also used for trsm later. 
lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld ); lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info ); for( int j = 0; j < maxn; ++j ) { for( int i = 0; i < j; ++i ) { *LU(i,j) = *LU(j,i); } } for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { magma_zsetmatrix( m, m, LU, ld, dA, ld ); magma_zsetvector( m, C, 1, dC1, 1 ); magma_zsetvector( m, C, 1, dC2, 1 ); t1 = magma_sync_wtime( 0 ); magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsv( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, dA, ld, dC2, 1 ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, m, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetvector( m, dC2, 1, C2, 1 ); error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9; printf( "ztrsv( %c, %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), lapacke_diag_const(diag[id]), error, gflops/t1, gflops/t2 ); }}} printf( "\n" ); printf( "========== Level 3 BLAS ==========\n" ); // ----- test ZGEMM // C = alpha*A*B + beta*C, with A m*k or k*m; B k*n or n*k; C m*n // try combinations of no-trans/trans for( int ia = 0; ia < 3; ++ia ) { for( int ib = 0; ib < 3; ++ib ) { bool nta = (trans[ia] == MagmaNoTrans); bool ntb = (trans[ib] == MagmaNoTrans); magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA, ld ); magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZgemm( handle, cublas_trans_const(trans[ia]), cublas_trans_const(trans[ib]), m, n, k, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZGEMM( m, n, k ) / 1e9; printf( "zgemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_trans_const(trans[ia]), lapacke_trans_const(trans[ib]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHEMM // C = alpha*A*B + beta*C (left) with A m*m symmetric; B,C m*n; or // C = alpha*B*A + beta*C (right) with A n*n symmetric; B,C m*n // try left/right, upper/lower for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { magma_zsetmatrix( m, m, A, ld, dA, ld ); magma_zsetmatrix( m, n, B, ld, dB, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZhemm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), m, n, &alpha, dA, ld, dB, ld, &beta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &m, &n, C2, &ld, work ); total_error += error; 
gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9; printf( "zhemm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_side_const(side[is]), lapacke_uplo_const(uplo[iu]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHERK // C = alpha*A*A^H + beta*C (no-trans) with A m*k and C m*m symmetric; or // C = alpha*A^H*A + beta*C (trans) with A k*m and C m*m symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { magma_zsetmatrix( n, k, A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZherk( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &dalpha, dA, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHERK( k, n ) / 1e9; printf( "zherk( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZHER2K // C = alpha*A*B^H + ^alpha*B*A^H + beta*C (no-trans) with A,B n*k; C n*n symmetric; or // C = alpha*A^H*B + ^alpha*B^H*A + beta*C (trans) with A,B k*n; C n*n symmetric // try upper/lower, no-trans/trans for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { bool nt = (trans[it] == MagmaNoTrans); magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA, ld ); magma_zsetmatrix( n, n, C, ld, dC1, ld ); magma_zsetmatrix( n, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZher2k( handle, cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), n, k, &alpha, dA, ld, dB, ld, &dbeta, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( n, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZHER2K( k, n ) / 1e9; printf( "zher2k( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }} printf( "\n" ); // ----- test ZTRMM // C = alpha*A*C (left) with A m*m triangular; C m*n; or // C = alpha*C*A (right) with A n*n triangular; C m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; // note cublas does trmm out-of-place (i.e., adds output matrix C), // but allows C=B to do in-place. 
t2 = magma_sync_wtime( 0 ); cublasZtrmm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9; printf( "ztrmm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // ----- test ZTRSM // solve A*X = alpha*B (left) with A m*m triangular; B m*n; or // solve X*A = alpha*B (right) with A n*n triangular; B m*n // try left/right, upper/lower, no-trans/trans, unit/non-unit for( int is = 0; is < 2; ++is ) { for( int iu = 0; iu < 2; ++iu ) { for( int it = 0; it < 3; ++it ) { for( int id = 0; id < 2; ++id ) { bool left = (side[is] == MagmaLeft); magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA, ld ); magma_zsetmatrix( m, n, C, ld, dC1, ld ); magma_zsetmatrix( m, n, C, ld, dC2, ld ); t1 = magma_sync_wtime( 0 ); magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld ); t1 = magma_sync_wtime( 0 ) - t1; t2 = magma_sync_wtime( 0 ); cublasZtrsm( handle, cublas_side_const(side[is]), cublas_uplo_const(uplo[iu]), cublas_trans_const(trans[it]), cublas_diag_const(diag[id]), m, n, &alpha, dA, ld, dC2, ld ); t2 = magma_sync_wtime( 0 ) - t2; // check results, storing diff between magma and cuda call in C2 cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 ); magma_zgetmatrix( m, n, dC2, ld, C2, ld ); error = lapackf77_zlange( "F", &n, &n, C2, &ld, work ); total_error += error; gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9; printf( "ztrsm( %c, %c ) diff %.2g, Gflop/s %7.2f, %7.2f\n", lapacke_uplo_const(uplo[iu]), lapacke_trans_const(trans[it]), error, gflops/t1, gflops/t2 ); }}}} printf( "\n" ); // cleanup magma_free_cpu( piv ); magma_free_pinned( A ); magma_free_pinned( B ); magma_free_pinned( C ); magma_free_pinned( C2 ); magma_free_pinned( LU ); magma_free( dA ); magma_free( dB ); magma_free( dC1 ); magma_free( dC2 ); fflush( stdout ); } if ( total_error != 0. ) { printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n", total_error ); } else { printf( "all tests passed\n" ); } TESTING_FINALIZE(); int status = (total_error != 0.); return status; }
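/* The check used throughout the tester above, distilled into a helper (a
   sketch, not part of the original tester): subtract the MAGMA result dC1 from
   the cuBLAS result dC2 on the device, copy the difference to the host, and
   return its Frobenius norm, which should be exactly zero when both paths
   compute identical results. */
static double diff_norm( cublasHandle_t handle, magma_int_t m, magma_int_t n, magma_int_t ld,
                         magmaDoubleComplex_ptr dC1, magmaDoubleComplex_ptr dC2,
                         magmaDoubleComplex *C2host )
{
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    double work[1];
    cublasZaxpy( handle, ld*n, &c_neg_one, dC1, 1, dC2, 1 );    // dC2 -= dC1
    magma_zgetmatrix( m, n, dC2, ld, C2host, ld );              // bring the diff to the host
    return lapackf77_zlange( "F", &m, &n, C2host, &ld, work );  // || dC2 - dC1 ||_F
}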
extern "C" magma_int_t magma_zpotrf2_mgpu(int num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex **d_lA, magma_int_t ldda, magmaDoubleComplex **d_lP, magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaDoubleComplex *dlpanel; //magma_event_t event0[MagmaMaxGPUs], // syrk // event1[MagmaMaxGPUs], // send off-diagonal // event2[MagmaMaxGPUs], // send diagonal // event3[MagmaMaxGPUs]; // trsm magma_int_t n_local[MagmaMaxGPUs], ldpanel; int stream0 = 0, stream1 = 1; #ifdef ZTRSM_WORK magmaDoubleComplex *d_dinvA[MagmaMaxGPUs][2], *d_x[MagmaMaxGPUs][2]; /* used by ztrsm_work */ #endif *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = ((n/nb)/num_gpus)*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/num_gpus)*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } //magma_setdevice(d); //magma_event_create( &event0[d] ); //magma_event_create( &event1[d] ); //magma_event_create( &event2[d] ); //magma_event_create( &event3[d] ); } magma_setdevice(0); /* == initialize the trace */ trace_init( 1, num_gpus, 3, stream ); /* Use blocked code. */ if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) /* invert the diagonals * Allocate device memory for the inversed diagonal blocks, size=m*NB */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { magma_zmalloc( &d_dinvA[d][j], nb*nb ); magma_zmalloc( &d_x[d][j], n*nb ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex)); cudaMemset(d_x[d][j], 0, n*nb*sizeof(magmaDoubleComplex)); } } magma_setdevice(0); #endif for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); if( j > 0 ) { /* needed on pluto... */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ trace_gpu_start( d, stream0, "comm", "rows to GPUs" ); magma_zsetmatrix_async( j, jb, Aup(0,j), lda, dlP(d,jb,0,buf), lddp, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk trace_gpu_start( id, stream0, "comm", "D to CPU" ); magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); if ( j > 0 ) { /* Compute the local block column of the panel. 
*/ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( n_local[d] > nb0 ) { /* wait for the off-diagonal */ if( d != id ) { //magma_queue_sync( stream[id][3] ); dlpanel = dlP(d, jb, 0, buf); ldpanel = lddp; /* wait for the offdiagonal column */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, 0, nb*j_local); ldpanel = ldda; } /* update the panel */ magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); trace_gpu_start( d, stream1, "gemm", "gemm" ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the diagonal trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_setdevice(d); if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } trace_gpu_start( d, stream0, "comm", "D to GPUs" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream0, "comm", "D to GPUs" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream0] ); trace_gpu_end( id, stream0 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d, 0, 0, buf); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][0], d_x[d][0] ); /*nb2 = n_local[d] - j_local2*nb; magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d], d_x[d] );*/ #else /*nb2 = n_local[d] - j_local2*nb; magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldda, dlA(d, j, nb*j_local2), ldda); */ magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if( j+jb < m ) { trace_gpu_start( d, stream0, "comm", "rows to CPU" ); magma_queue_wait_event( stream[d][stream0], event[d][3] ); // wait for lookahead magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream0] ); trace_gpu_end( d, stream0 ); magma_event_record( 
event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif } else if( nb2 > 0 ) { /* update the entire trailing matrix */ trace_gpu_start( d, stream1, "trsm", "trsm" ); #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream1 ); } d = (d+1)%num_gpus; } } /* end of ztrsm */ } /* end of for j=1, .., n */ } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<2; j++ ) { magma_zmalloc( &d_dinvA[d][j], nb*nb ); magma_zmalloc( &d_x[d][j], nb*m ); cudaMemset(d_dinvA[d][j], 0, nb*nb*sizeof(magmaDoubleComplex)); cudaMemset(d_x[d][j], 0, nb* m*sizeof(magmaDoubleComplex)); } } magma_setdevice(0); #endif for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); if( j > 0 ) { /* needed on pluto... 
*/ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); // wait for the column on CPU /* broadcast offdiagonal row to all gpus */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d != id ) { magma_setdevice(d); /* wait for it on CPU */ magma_queue_wait_event( stream[d][stream0], event[id][1] ); /* send it to GPU */ magma_zsetmatrix_async( jb, j, Alo(j,0), lda, dlPT(d,0,jb,buf), nb, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } d = (d+1)%num_gpus; } } /* Update the current diagonal block */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); magma_event_record( event[id][0], stream[id][stream1] ); } /* send the diagonal to cpu */ magma_queue_wait_event( stream[id][stream0], event[id][0] ); // wait for syrk magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream0] ); /* update the offdiagonal blocks */ if ( j > 0 ) { /* compute the block-rows of the panel */ d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { if( d != id ) { dlpanel = dlPT(d, 0, jb, buf); ldpanel = nb; /* wait for offdiagonal row */ magma_queue_wait_event( stream[d][stream1], event[d][1] ); } else { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); } d = (d+1)%num_gpus; } } /* factor the diagonal */ magma_setdevice(id); magma_queue_sync( stream[id][stream0] ); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magma_setdevice(d); if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream0] ); magma_event_record( event[d][2], stream[d][stream0] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream0] ); } /* factorize off-diagonal blocks */ if ( (j+jb) < m ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); magma_setdevice(d); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for the diagonal if( j+jb < n && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead the column */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][0], d_x[d][0]); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][3], stream[d][stream1] ); /* send the column to cpu */ if( j+jb < n ) { magma_queue_wait_event( stream[d][stream0], 
event[d][3] ); // wait for lookahead magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream0] ); magma_event_record( event[d][1], stream[d][stream0] ); } /* update the remaining blocks */ nb2 = nb2 - nb0; #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } else if( nb2 > 0 ) { /* update the entire trailing matrix */ #if defined(PRECISION_d) && defined(ZTRSM_WORK) magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, d_dinvA[d][1], d_x[d][1] ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } } } } /* end of else not upper */ /* == finalize the trace == */ trace_finalize( "zpotrf.svg","trace.css" ); /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); magma_queue_sync( stream[d][0] ); magma_queue_sync( stream[d][1] ); magmablasSetKernelStream(NULL); //magma_event_destroy( event0[d] ); //magma_event_destroy( event1[d] ); //magma_event_destroy( event2[d] ); //magma_event_destroy( event3[d] ); } magma_setdevice(0); return *info; } /* magma_zpotrf_mgpu */
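/* A small worked sketch (not from the original source) of the 1-D block-cyclic
   layout the multi-GPU Cholesky above relies on: column block b (of width nb)
   lives on GPU b % num_gpus, and the helper below reproduces the per-GPU
   column count n_local[d] computed at the top of the routine. */
static magma_int_t example_columns_on_gpu( magma_int_t n, magma_int_t nb,
                                           magma_int_t num_gpus, magma_int_t d )
{
    magma_int_t n_local = ((n/nb)/num_gpus)*nb;       // complete rounds of nb-wide blocks
    if      (d <  (n/nb)%num_gpus) n_local += nb;     // one extra full block
    else if (d == (n/nb)%num_gpus) n_local += n%nb;   // the trailing partial block
    return n_local;          // e.g. n = 10*nb+5 on 3 GPUs gives 4*nb, 3*nb+5, 3*nb
}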
void magmablas_zher2k_mgpu2( char uplo, char trans, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex *dA[], magma_int_t lda, magma_int_t aoffset, magmaDoubleComplex *dB[], magma_int_t ldb, magma_int_t boffset, double beta, magmaDoubleComplex *dC[], magma_int_t ldc, magma_int_t coffset, magma_int_t ngpu, magma_int_t nb, magma_queue_t streams[][20], magma_int_t nstream ) { #define dA(dev, i, j) (dA[dev] + (i) + (j)*lda + (aoffset) ) #define dB(dev, i, j) (dB[dev] + (i) + (j)*ldb + (boffset) ) #define dC(dev, i, j) (dC[dev] + (i) + (j)*ldc) /* Check arguments */ magma_int_t info = 0; if ( ! (uplo == 'l' || uplo == 'L')) { info = -1; // 'u' not yet handled } else if ( ! (trans == 'n' || trans == 'N')) { info = -2; // 'c' not yet handled } else if ( n < 0 ) { info = -3; } else if ( k < 0 ) { info = -4; } else if ( ((trans == 'n' || trans == 'N') && lda < max(1,n)) || ((trans == 'c' || trans == 'C') && lda < max(1,k)) ) { info = -7; } else if ( aoffset < 0 || aoffset > lda ) { info = -8; } else if ( ((trans == 'n' || trans == 'N') && ldb < max(1,n)) || ((trans == 'c' || trans == 'C') && ldb < max(1,k)) ) { info = -10; } else if ( boffset < 0 || boffset > ldb ) { info = -11; } else if ( ldc < max(1,n) ) { info = -13; } else if ( coffset < 0 || coffset > ldc ) { info = -14; } else if ( ngpu <= 0 ) { info = -15; } else if ( nb <= 0 ) { info = -16; } else if ( nstream <= 0 ) { info = -18; } if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } const magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex cbeta = MAGMA_Z_MAKE( beta, 0. ); magma_int_t ib, ioff, iblock, idev, di, s; magma_device_t cdev; magma_queue_t cqueue; magma_getdevice( &cdev ); magmablasGetKernelStream( &cqueue ); // loop over all blocks // Faster to have two loops: first loop does C_hat = alpha*A*B' + beta*C // blockoffset is offset within first block; for subsequent blocks it is 0 magma_int_t blockoffset = coffset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + coffset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nstream; magmablasSetKernelStream( streams[ idev ][ s ] ); // C[i:n,i] = alpha * A[i:n,0] * B[i,0]' + beta*C[i:n,i] //printf( "zgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n-i, ib, k, alpha, dA(idev,i,0), lda, dB(idev,i,0), ldb, cbeta, dC(idev,ioff,di), ldc ); blockoffset = 0; } // second loop does C = conj(alpha)*B*A' + C_hat alpha = MAGMA_Z_CNJG( alpha ); blockoffset = coffset % nb; for( magma_int_t i = 0; i < n; i += ib ) { ib = min( nb-blockoffset, n-i ); // block size ioff = i + coffset; // global index in parent matrix iblock = (ioff / nb) / ngpu; // local block id idev = (ioff / nb) % ngpu; // device with this block di = iblock*nb + blockoffset; // local index in parent matrix magma_setdevice( idev ); s = iblock % nstream; magmablasSetKernelStream( streams[ idev ][ s ] ); // C[i:n,i] += conj(alpha) * B[i:n,0] * A[i,0]' //printf( "zgemm n=%4d, ib=%4d, k=%4d, i=%4d\n", n-i, ib, k, i ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n-i, ib, k, alpha, dB(idev,i,0), ldb, dA(idev,i,0), lda, c_one, dC(idev,ioff,di), ldc ); blockoffset = 0; } magma_setdevice( cdev ); magmablasSetKernelStream( cqueue ); }
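/* Reference sketch (illustrative, single GPU, no offsets, full product) of
   what the two loops above compute together on the lower triangle:
   C := alpha*A*B^H + conj(alpha)*B*A^H + beta*C.  Note that beta is applied
   only in the first pass, so the second pass must use c_one, exactly as in the
   multi-GPU routine above. */
void example_her2k_reference( magma_int_t n, magma_int_t k,
                              magmaDoubleComplex alpha, double beta,
                              magmaDoubleComplex_ptr dA, magma_int_t lda,
                              magmaDoubleComplex_ptr dB, magma_int_t ldb,
                              magmaDoubleComplex_ptr dC, magma_int_t ldc )
{
    magmaDoubleComplex cbeta = MAGMA_Z_MAKE( beta, 0. );
    magmaDoubleComplex c_one = MAGMA_Z_ONE;
    // pass 1: C_hat = alpha*A*B^H + beta*C
    magma_zgemm( MagmaNoTrans, MagmaConjTrans, n, n, k,
                 alpha, dA, lda, dB, ldb, cbeta, dC, ldc );
    // pass 2: C = conj(alpha)*B*A^H + C_hat
    magma_zgemm( MagmaNoTrans, MagmaConjTrans, n, n, k,
                 MAGMA_Z_CNJG( alpha ), dB, ldb, dA, lda, c_one, dC, ldc );
}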
/** Purpose ------- ZCGESV computes the solution to a complex system of linear equations A * X = B, A**T * X = B, or A**H * X = B, where A is an N-by-N matrix and X and B are N-by-NRHS matrices. ZCGESV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] trans magma_trans_t Specifies the form of the system of equations: - = MagmaNoTrans: A * X = B (No transpose) - = MagmaTrans: A**T * X = B (Transpose) - = MagmaConjTrans: A**H * X = B (Conjugate transpose) @param[in] n INTEGER The number of linear equations, i.e., the order of the matrix A. N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (ldda,N) On entry, the N-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array dA. ldda >= max(1,N). @param[out] ipiv INTEGER array, dimension (N) The pivot indices that define the permutation matrix P; row i of the matrix was interchanged with row IPIV(i). Corresponds either to the single precision factorization (if info.EQ.0 and ITER.GE.0) or the double precision factorization (if info.EQ.0 and ITER.LT.0). @param[out] dipiv INTEGER array on the GPU, dimension (N) The pivot indices; for 1 <= i <= N, after permuting, row i of the matrix was moved to row dIPIV(i). Note this is different than IPIV, where interchanges are applied one-after-another. @param[in] dB COMPLEX_16 array on the GPU, dimension (lddb,NRHS) The N-by-NRHS right hand side matrix B. @param[in] lddb INTEGER The leading dimension of the array dB. lddb >= max(1,N). @param[out] dX COMPLEX_16 array on the GPU, dimension (lddx,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. lddx >= max(1,N). @param dworkd (workspace) COMPLEX_16 array on the GPU, dimension (N*NRHS) This array is used to hold the residual vectors. 
@param dworks (workspace) COMPLEX array on the GPU, dimension (N*(N+NRHS)) This array is used to store the complex single precision matrix and the right-hand sides or solutions in single precision. @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGETRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value - > 0: if info = i, U(i,i) computed in DOUBLE PRECISION is exactly zero. The factorization has been completed, but the factor U is exactly singular, so the solution could not be computed. @ingroup magma_zgesv_driver ********************************************************************/ extern "C" magma_int_t magma_zcgesv_gpu(magma_trans_t trans, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *dipiv, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *dX, magma_int_t lddx, magmaDoubleComplex *dworkd, magmaFloatComplex *dworks, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *dR; magmaFloatComplex *dSA, *dSX; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddr; /* Check arguments */ *iter = 0; *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -2; else if ( ldda < max(1,n)) *info = -4; else if ( lddb < max(1,n)) *info = -8; else if ( lddx < max(1,n)) *info = -10; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( n == 0 || nrhs == 0 ) return *info; lddsa = n; lddr = n; dSA = dworks; dSX = dSA + lddsa*n; dR = dworkd; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange(MagmaInfNorm, n, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ //magmablas_zlag2c( n, nrhs, dB, lddb, dSX, lddsx, info ); // done inside zcgetrs with pivots if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( n, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgetrf_gpu( n, n, dSA, lddsa, ipiv, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Generate parallel pivots { magma_int_t *newipiv; magma_imalloc_cpu( &newipiv, n ); if ( newipiv == NULL ) { *iter = -3; goto FALLBACK; } swp2pswp( trans, n, ipiv, newipiv ); magma_setvector( n, sizeof(magma_int_t), newipiv, 1, dipiv, 1 ); magma_free_cpu( newipiv ); } // solve dSA*dSX = dB in single precision // converts dB to dSX and applies pivots, solves, then converts result back to dX magma_zcgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dB, lddb, dX, lddx, dSX, info ); // residual dR = dB - dA*dX in double precision magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, 
dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; return *info; REFINEMENT: for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX // solve dSA*dSX = R in single precision // convert result back to double precision dR // it's okay that dR is used for both dB input and dX output. magma_zcgetrs_gpu( trans, n, nrhs, dSA, lddsa, dipiv, dR, lddr, dR, lddr, dSX, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dR --and-- // dR = dB // This saves going through dR a second time (if done with one more kernel). // -- not really: first time is read, second time is write. for( j=0; j < nrhs; j++ ) { magmablas_zaxpycp( n, dR(0,j), dX(0,j), dB(0,j) ); } // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( trans, n, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( trans, MagmaNoTrans, n, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( n, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. */ magma_zgetrf_gpu( n, n, dA, ldda, ipiv, info ); if (*info == 0) { magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); magma_zgetrs_gpu( trans, n, nrhs, dA, ldda, ipiv, dX, lddx, info ); } return *info; }
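/*
   A minimal usage sketch for magma_zcgesv_gpu (illustrative only; the helper
   name, sizes, and lack of error handling are assumptions, not part of the
   routine above). dA (n-by-n), dB and dX (n-by-nrhs) are device matrices the
   caller has already filled, e.g. with magma_zsetmatrix. Per the documentation
   above, dworkd needs n*nrhs double-complex elements and dworks needs
   n*(n+nrhs) single-complex elements.
*/
static magma_int_t zcgesv_example(
    magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex *dA, magma_int_t ldda,
    magmaDoubleComplex *dB, magma_int_t lddb,
    magmaDoubleComplex *dX, magma_int_t lddx )
{
    magma_int_t info, iter;
    magma_int_t *ipiv, *dipiv;
    magmaDoubleComplex *dworkd;
    magmaFloatComplex  *dworks;

    magma_imalloc_cpu( &ipiv,  n );           // host pivots
    magma_imalloc    ( &dipiv, n );           // device pivots (parallel format)
    magma_zmalloc    ( &dworkd, n*nrhs );     // residual workspace (double)
    magma_cmalloc    ( &dworks, n*(n+nrhs) ); // matrix + rhs workspace (single)

    magma_zcgesv_gpu( MagmaNoTrans, n, nrhs,
                      dA, ldda, ipiv, dipiv, dB, lddb, dX, lddx,
                      dworkd, dworks, &iter, &info );
    // iter > 0: mixed-precision refinement converged in iter steps;
    // iter < 0: the routine fell back to a double-precision factorization.

    magma_free( dworks );   magma_free( dworkd );
    magma_free( dipiv );    magma_free_cpu( ipiv );
    return info;
}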
/** Purpose ------- ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. This version assumes the computation runs through the NULL stream and therefore is not overlapping computation with communication. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. @param[in] ldda INTEGER The leading dimension of the array A. LDDA >= max(1,M). @param[out] ipiv INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. - > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. @ingroup magma_zgesv_comp ********************************************************************/ extern "C" magma_int_t magma_zgetrf2_gpu(magma_int_t m, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *ipiv, magma_int_t *info) { #define dAT(i,j) (dAT + (i)*nb*lddat + (j)*nb) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, rows, cols, s, lddat, lddwork; magmaDoubleComplex *dAT, *dAP, *work; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ magma_zmalloc_cpu( &work, m * n ); if ( work == NULL ) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA, ldda, work, m ); lapackf77_zgetrf(&m, &n, work, &m, ipiv, info); magma_zsetmatrix( m, n, work, m, dA, ldda ); magma_free_cpu(work); } else { /* Use hybrid blocked code. */ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; lddat = maxn; lddwork = maxm; dAT = dA; if (MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } if ( m == n ) { lddat = ldda; magmablas_ztranspose_inplace( m, dAT, ldda ); } else { if (MAGMA_SUCCESS != magma_zmalloc( &dAT, maxm*maxn )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA, ldda, dAT, lddat ); } if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, maxm*nb )) { magma_free( dAP ); if ( ! 
(m == n)) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } for( i=0; i < s; i++ ) { // download i-th panel cols = maxm - i*nb; //magmablas_ztranspose( nb, cols, dAT(i,i), lddat, dAP, cols ); magmablas_ztranspose( nb, m-i*nb, dAT(i,i), lddat, dAP, cols ); magma_zgetmatrix( m-i*nb, nb, dAP, cols, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); if ( i > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (i+1)*nb, nb, c_one, dAT(i-1,i-1), lddat, dAT(i-1,i+1), lddat ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-i*nb, nb, c_neg_one, dAT(i-1,i+1), lddat, dAT(i, i-1), lddat, c_one, dAT(i, i+1), lddat ); } // do the cpu part rows = m - i*nb; lapackf77_zgetrf( &rows, &nb, work, &lddwork, ipiv+i*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + i*nb; magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb, i*nb ); // upload i-th panel magma_zsetmatrix( m-i*nb, nb, work, lddwork, dAP, maxm ); //magmablas_ztranspose( cols, nb, dAP, maxm, dAT(i,i), lddat ); magmablas_ztranspose( m-i*nb, nb, dAP, maxm, dAT(i,i), lddat ); // do the small non-parallel computations (next panel update) if ( s > (i+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(i, i ), lddat, dAT(i, i+1), lddat); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(i+1)*nb, m-(i+1)*nb, nb, c_neg_one, dAT(i, i+1), lddat, dAT(i+1, i ), lddat, c_one, dAT(i+1, i+1), lddat ); } } magma_int_t nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP, maxm ); magma_zgetmatrix( rows, nb0, dAP, maxm, work, lddwork ); // make sure that gpu queue is empty magma_device_sync(); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &lddwork, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; magmablas_zpermute_long2( n, dAT, lddat, ipiv, nb0, s*nb ); // upload i-th panel magma_zsetmatrix( rows, nb0, work, lddwork, dAP, maxm ); magmablas_ztranspose( rows, nb0, dAP, maxm, dAT(s,s), lddat ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat); if ( m == n ) { magmablas_ztranspose_inplace( m, dAT, lddat ); } else { magmablas_ztranspose( n, m, dAT, lddat, dA, ldda ); magma_free( dAT ); } magma_free( dAP ); magma_free_pinned( work ); } return *info; } /* magma_zgetrf2_gpu */
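// A minimal sketch (helper name and padding choice are assumptions): factor an
// m-by-n host matrix with magma_zgetrf2_gpu by staging it on the GPU with the
// leading dimension rounded up to a multiple of 32, as done elsewhere in MAGMA.
static magma_int_t zgetrf2_gpu_example(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A, magma_int_t lda, magma_int_t *ipiv )
{
    magma_int_t info;
    magma_int_t ldda = ((m+31)/32)*32;
    magmaDoubleComplex *dA;
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_zsetmatrix( m, n, A, lda, dA, ldda );        // host -> device
    magma_zgetrf2_gpu( m, n, dA, ldda, ipiv, &info );  // A = P*L*U
    magma_zgetmatrix( m, n, dA, ldda, A, lda );        // device -> host (L and U)
    magma_free( dA );
    return info;
}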
/** Purpose ------- ZLAUUM computes the product U * U' or L' * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. If UPLO = MagmaUpper then the upper triangle of the result is stored, overwriting the factor U in A. If UPLO = MagmaLower then the lower triangle of the result is stored, overwriting the factor L in A. This is the blocked form of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t Specifies whether the triangular factor stored in the array A is upper or lower triangular: - = MagmaUpper: Upper triangular - = MagmaLower: Lower triangular @param[in] n INTEGER The order of the triangular factor U or L. N >= 0. @param[in,out] A COPLEX_16 array, dimension (LDA,N) On entry, the triangular factor U or L. On exit, if UPLO = MagmaUpper, the upper triangle of A is overwritten with the upper triangle of the product U * U'; if UPLO = MagmaLower, the lower triangle of A is overwritten with the lower triangle of the product L' * L. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -k, the k-th argument had an illegal value @ingroup magma_zposv_aux ***************************************************************************/ extern "C" magma_int_t magma_zlauum( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t ldda, nb; magma_int_t i, ib; magmaDoubleComplex c_one = MAGMA_Z_ONE; double d_one = MAGMA_D_ONE; magmaDoubleComplex *dA; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) *info = -1; else if (n < 0) *info = -2; else if (lda < max(1,n)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ( n == 0 ) return *info; ldda = ((n+31)/32)*32; if (MAGMA_SUCCESS != magma_zmalloc( &dA, (n)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); nb = magma_get_zpotrf_nb(n); if (nb <= 1 || nb >= n) lapackf77_zlauum(uplo_, &n, A, &lda, info); else { if (upper) { /* Compute the product U * U'. */ for (i=0; i < n; i += nb) { ib=min(nb,n-i); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( ib, (n-i-ib), A(i,i+ib), lda, dA(i,i+ib), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaRight, MagmaUpper, MagmaConjTrans, MagmaNonUnit, i, ib, c_one, dA(i,i), ldda, dA(0, i),ldda); lapackf77_zlauum(MagmaUpperStr, &ib, A(i,i), &lda, info); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, i, ib, (n-i-ib), c_one, dA(0,i+ib), ldda, dA(i, i+ib),ldda, c_one, dA(0,i), ldda); magma_queue_sync( stream[0] ); magma_zherk( MagmaUpper, MagmaNoTrans, ib,(n-i-ib), d_one, dA(i, i+ib), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( i+ib, ib, dA(0, i), ldda, A(0, i), lda ); } } else { /* Compute the product L' * L. 
*/ for (i=0; i < n; i += nb) { ib=min(nb,n-i); magma_zsetmatrix_async( ib, ib, A(i,i), lda, dA(i, i), ldda, stream[1] ); magma_zsetmatrix_async( (n-i-ib), ib, A(i+ib, i), lda, dA(i+ib, i), ldda, stream[0] ); magma_queue_sync( stream[1] ); magma_ztrmm( MagmaLeft, MagmaLower, MagmaConjTrans, MagmaNonUnit, ib, i, c_one, dA(i,i), ldda, dA(i, 0),ldda); lapackf77_zlauum(MagmaLowerStr, &ib, A(i,i), &lda, info); magma_zsetmatrix_async( ib, ib, A(i, i), lda, dA(i, i), ldda, stream[0] ); if (i+ib < n) { magma_zgemm(MagmaConjTrans, MagmaNoTrans, ib, i, (n-i-ib), c_one, dA( i+ib,i), ldda, dA(i+ib, 0),ldda, c_one, dA(i,0), ldda); magma_queue_sync( stream[0] ); magma_zherk(MagmaLower, MagmaConjTrans, ib, (n-i-ib), d_one, dA(i+ib, i), ldda, d_one, dA(i, i), ldda); } magma_zgetmatrix( ib, i+ib, dA(i, 0), ldda, A(i, 0), lda ); } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free( dA ); return *info; }
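// A minimal sketch of calling magma_zlauum on a host matrix (the routine
// stages the data on the GPU internally). In LAPACK terms this is the second
// stage of a POTRI-style inverse: given a triangular factor, it forms U*U' or
// L'*L in place. The helper name is hypothetical.
static magma_int_t zlauum_example( magma_int_t n, magmaDoubleComplex *A, magma_int_t lda )
{
    magma_int_t info;
    magma_zlauum( MagmaLower, n, A, lda, &info );  // lower triangle of A := L' * L
    return info;
}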
/** Purpose ======= ZHETRF_nopiv_gpu computes the LDLt factorization of a complex Hermitian matrix A. The factorization has the form A = U^H * D * U , if UPLO = 'U', or A = L * D * L^H, if UPLO = 'L', where U is an upper triangular matrix, L is lower triangular, and D is a diagonal matrix. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] UPLO CHARACTER*1 - = 'U': Upper triangle of A is stored; - = 'L': Lower triangle of A is stored. @param[in] N INTEGER The order of the matrix A. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDA,N) On entry, the Hermitian matrix A. If UPLO = 'U', the leading N-by-N upper triangular part of A contains the upper triangular part of the matrix A, and the strictly lower triangular part of A is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower triangular part of the matrix A, and the strictly upper triangular part of A is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization A = U^H D U or A = L D L^H. \n Higher performance is achieved if A is in pinned memory, e.g. allocated using cudaMallocHost. @param[in] LDA INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[out] INFO INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -6, the GPU memory allocation failed - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zhetrf_comp ******************************************************************* */ extern "C" magma_int_t magma_zhetrf_nopiv_gpu( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *info) { #define A(i, j) (A) #define dA(i, j) (dA +(j)*ldda + (i)) #define dW(i, j) (dW +(j)*ldda + (i)) #define dWt(i, j) (dW +(j)*nb + (i)) /* Local variables */ magmaDoubleComplex zone = MAGMA_Z_ONE; magmaDoubleComplex mzone = MAGMA_Z_NEG_ONE; int upper = (uplo == MagmaUpper); magma_int_t j, k, jb, nb, ib, iinfo; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return MAGMA_ERR_ILLEGAL_VALUE; } /* Quick return */ if ( n == 0 ) return MAGMA_SUCCESS; nb = magma_get_zhetrf_nopiv_nb(n); ib = min(32, nb); // inner-block for diagonal factorization magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); magma_queue_t stream[2]; magma_event_t event; magma_queue_create(&stream[0]); magma_queue_create(&stream[1]); magma_event_create( &event ); trace_init( 1, 1, 2, stream ); // CPU workspace magmaDoubleComplex *A; if (MAGMA_SUCCESS != magma_zmalloc_pinned( &A, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } // GPU workspace magmaDoubleComplex_ptr dW; if (MAGMA_SUCCESS != magma_zmalloc( &dW, (1+nb)*ldda )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* Use hybrid blocked code. */ if (upper) { //========================================================= // Compute the LDLt factorization A = U'*D*U without pivoting. 
// main loop for (j=0; j<n; j += nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[1], event ); magma_event_sync(event); magma_zgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); zhetrf_nopiv_cpu(MagmaUpper, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_zsetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ztrsm(MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaUnit, jb, (n-j-jb), zone, dA(j, j), ldda, dA(j, j+jb), ldda); magma_zcopymatrix( jb, n-j-jb, dA( j, j+jb ), ldda, dWt( 0, j+jb ), nb ); // update the trailing submatrix with D magmablas_zlascl_diag(MagmaUpper, jb, n-j-jb, dA(j, j), ldda, dA(j, j+jb), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with U and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_zgemm(MagmaConjTrans, MagmaNoTrans, kb, n-k, jb, mzone, dWt(0, k), nb, dA(j, k), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } else { //========================================================= // Compute the LDLt factorization A = L*D*L' without pivoting. // main loop for (j=0; j<n; j+=nb) { jb = min(nb, (n-j)); // copy A(j,j) back to CPU trace_gpu_start( 0, 0, "get", "get" ); //magma_queue_wait_event( stream[0], event ); magma_event_sync(event); magma_zgetmatrix_async(jb, jb, dA(j, j), ldda, A(j,j), nb, stream[1]); trace_gpu_end( 0, 0 ); // factorize the diagonal block magma_queue_sync(stream[1]); trace_cpu_start( 0, "potrf", "potrf" ); zhetrf_nopiv_cpu(MagmaLower, jb, ib, A(j, j), nb, info); trace_cpu_end( 0 ); if (*info != 0){ *info = *info + j; break; } // copy A(j,j) back to GPU trace_gpu_start( 0, 0, "set", "set" ); magma_zsetmatrix_async(jb, jb, A(j, j), nb, dA(j, j), ldda, stream[0]); trace_gpu_end( 0, 0 ); if ( (j+jb) < n) { // compute the off-diagonal blocks of current block column magmablasSetKernelStream( stream[0] ); trace_gpu_start( 0, 0, "trsm", "trsm" ); magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaUnit, (n-j-jb), jb, zone, dA(j, j), ldda, dA(j+jb, j), ldda); magma_zcopymatrix( n-j-jb,jb, dA( j+jb, j ), ldda, dW( j+jb, 0 ), ldda ); // update the trailing submatrix with D magmablas_zlascl_diag(MagmaLower, n-j-jb, jb, dA(j, j), ldda, dA(j+jb, j), ldda, &iinfo); trace_gpu_end( 0, 0 ); // update the trailing submatrix with L and W trace_gpu_start( 0, 0, "gemm", "gemm" ); for (k=j+jb; k<n; k+=nb) { magma_int_t kb = min(nb,n-k); magma_zgemm(MagmaNoTrans, MagmaConjTrans, n-k, kb, jb, mzone, dA(k, j), ldda, dW(k, 0), ldda, zone, dA(k, k), ldda); if (k==j+jb) magma_event_record( event, stream[0] ); } trace_gpu_end( 0, 0 ); } } } trace_finalize( "zhetrf.svg","trace.css" ); magma_queue_destroy(stream[0]); magma_queue_destroy(stream[1]); magma_event_destroy( event ); magma_free( dW ); magma_free_pinned( A ); magmablasSetKernelStream( orig_stream ); return MAGMA_SUCCESS; } /* magma_zhetrf_nopiv */
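// A minimal sketch (helper name is hypothetical) of factoring an n-by-n
// Hermitian matrix that already resides on the GPU as A = L*D*L^H without
// pivoting; only the lower triangle of dA is referenced and overwritten.
static magma_int_t zhetrf_nopiv_example(
    magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda )
{
    magma_int_t info;
    magma_zhetrf_nopiv_gpu( MagmaLower, n, dA, ldda, &info );
    return info;  // info > 0 flags a failed diagonal block, see the documentation above
}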
/** Purpose ------- ZSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in] NB INTEGER The blocking size. NB >= 0. @param[in,out] hU COMPLEX_16 array, dimension(LDHU, N), on cpu. On entry, the NB-by-N upper triangular tile hU. On exit, the content is incomplete. Shouldn't be used. @param[in] ldhu INTEGER The leading dimension of the array hU. LDHU >= max(1,NB). @param[in,out] dU COMPLEX_16 array, dimension(LDDU, N), on gpu. On entry, the NB-by-N upper triangular tile dU identical to hU. On exit, the new factor U from the factorization. @param[in] lddu INTEGER The leading dimension of the array dU. LDDU >= max(1,NB). @param[in,out] hA COMPLEX_16 array, dimension(LDHA, N), on cpu. On entry, only the M-by-IB first panel needs to be identical to dA(1..M, 1..IB). On exit, the content is incomplete. Shouldn't be used. @param[in] ldha INTEGER The leading dimension of the array hA. LDHA >= max(1,M). @param[in,out] dA COMPLEX_16 array, dimension(LDDA, N), on gpu. On entry, the M-by-N tile to be factored. On exit, the factor L from the factorization @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] hL COMPLEX_16 array, dimension(LDHL, K), on vpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. @param[in] ldhl INTEGER The leading dimension of the array hL. LDHL >= max(1,2*IB). @param[out] dL COMPLEX_16 array, dimension(LDDL, K), on gpu. On exit, contains in the upper part the IB-by-K lower triangular tile, and in the lower part IB-by-K the inverse of the top part. @param[in] lddl INTEGER The leading dimension of the array dL. LDDL >= max(1,2*IB). @param[out] hWORK COMPLEX_16 array, dimension(LDHWORK, 2*IB), on cpu. Workspace. @param[in] ldhwork INTEGER The leading dimension of the array hWORK. LDHWORK >= max(NB, 1). @param[out] dWORK COMPLEX_16 array, dimension(LDDWORK, 2*IB), on gpu. Workspace. @param[in] lddwork INTEGER The leading dimension of the array dWORK. LDDWORK >= max(NB, 1). @param[out] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by ZTSTRF @param[out] info INTEGER - PLASMA_SUCCESS successful exit - < 0 if INFO = -k, the k-th argument had an illegal value - > 0 if INFO = k, U(k,k) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. 
@ingroup magma_zgesv_tile ********************************************************************/ extern "C" magma_int_t magma_ztstrf_gpu( magma_order_t order, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb, magmaDoubleComplex *hU, magma_int_t ldhu, magmaDoubleComplex_ptr dU, magma_int_t lddu, magmaDoubleComplex *hA, magma_int_t ldha, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex *hL, magma_int_t ldhl, magmaDoubleComplex_ptr dL, magma_int_t lddl, magma_int_t *ipiv, magmaDoubleComplex *hwork, magma_int_t ldhwork, magmaDoubleComplex_ptr dwork, magma_int_t lddwork, magma_int_t *info) { #define UT(i,j) (dUT + (i)*ib*lddu + (j)*ib ) #define AT(i,j) (dAT + (i)*ib*ldda + (j)*ib ) #define L(i) (dL + (i)*ib*lddl ) #define L2(i) (dL2 + (i)*ib*lddl ) #define hU(i,j) (hU + (j)*ib*ldhu + (i)*ib ) #define hA(i,j) (hA + (j)*ib*ldha + (i)*ib ) #define hL(i) (hL + (i)*ib*ldhl ) #define hL2(i) (hL2 + (i)*ib*ldhl ) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; int iinfo = 0; int maxm, mindim; int i, j, im, s, ip, ii, sb, p = 1; magmaDoubleComplex_ptr dAT, dUT; magmaDoubleComplex_ptr dAp, dUp; #ifndef WITHOUTTRTRI magmaDoubleComplex_ptr dL2 = dL + ib; magmaDoubleComplex *hL2 = hL + ib; p = 2; #endif /* Check input arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ib < 0) { *info = -3; } else if ((lddu < max(1,m)) && (m > 0)) { *info = -6; } else if ((ldda < max(1,m)) && (m > 0)) { *info = -8; } else if ((lddl < max(1,ib)) && (ib > 0)) { *info = -10; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* quick return */ if ((m == 0) || (n == 0) || (ib == 0)) return *info; ip = 0; /* Function Body */ mindim = min(m, n); s = mindim / ib; if ( ib >= mindim ) { /* Use CPU code. */ CORE_ztstrf(m, n, ib, nb, (PLASMA_Complex64_t*)hU, ldhu, (PLASMA_Complex64_t*)hA, ldha, (PLASMA_Complex64_t*)hL, ldhl, ipiv, (PLASMA_Complex64_t*)hwork, ldhwork, info); #ifndef WITHOUTTRTRI CORE_zlacpy( PlasmaUpperLower, mindim, mindim, (PLASMA_Complex64_t*)hL, ldhl, (PLASMA_Complex64_t*)hL2, ldhl ); CORE_ztrtri( PlasmaLower, PlasmaUnit, mindim, (PLASMA_Complex64_t*)hL2, ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif if ( order == MagmaRowMajor ) { magma_zsetmatrix( m, n, hU, ldhu, dwork, lddwork ); magmablas_ztranspose( m, n, dwork, lddwork, dU, lddu ); magma_zsetmatrix( m, n, hA, ldha, dwork, lddwork ); magmablas_ztranspose( m, n, dwork, lddwork, dA, ldda ); } else { magma_zsetmatrix( m, n, hU, ldhu, dU, lddu ); magma_zsetmatrix( m, n, hA, ldha, dA, ldda ); } magma_zsetmatrix( p*ib, n, hL, ldhl, dL, lddl ); } else { /* Use hybrid blocked code. 
*/ maxm = magma_roundup( m, 32 ); if ( order == MagmaColMajor ) { magmablas_zgetmo_in( dU, dUT, lddu, m, n ); magmablas_zgetmo_in( dA, dAT, ldda, m, n ); } else { dUT = dU; dAT = dA; } dAp = dwork; dUp = dAp + ib*lddwork; ip = 0; for( i=0; i < s; i++ ) { ii = i * ib; sb = min(mindim-ii, ib); if ( i > 0 ) { // download i-th panel magmablas_ztranspose( sb, ii, UT(0,i), lddu, dUp, lddu ); magmablas_ztranspose( sb, m, AT(0,i), ldda, dAp, ldda ); magma_zgetmatrix( ii, sb, dUp, lddu, hU(0, i), ldhu ); magma_zgetmatrix( m, sb, dAp, ldda, hA(0, i), ldha ); // make sure that gpu queue is empty //magma_device_sync(); #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L2(i-1), lddl, UT(i-1, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-(ii+sb), ib, c_one, L(i-1), lddl, UT(i-1, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(ii+sb), m, ib, c_neg_one, UT(i-1, i+1), lddu, AT(0, i-1), ldda, c_one, AT(0, i+1), ldda ); } // do the cpu part CORE_ztstrf(m, sb, ib, nb, (PLASMA_Complex64_t*)hU(i, i), ldhu, (PLASMA_Complex64_t*)hA(0, i), ldha, (PLASMA_Complex64_t*)hL(i), ldhl, ipiv+ii, (PLASMA_Complex64_t*)hwork, ldhwork, info); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + ii; // Need to swap betw U and A #ifndef NOSWAPBLK magmablas_zswapblk( MagmaRowMajor, n-(ii+sb), UT(i, i+1), lddu, AT(0, i+1), ldda, 1, sb, ipiv+ii, 1, nb ); for (j=0; j < ib; j++) { im = ipiv[ip]-1; if ( im == j ) { ipiv[ip] += ii; } ip++; } #else for (j=0; j < ib; j++) { im = ipiv[ip]-1; if ( im != (j) ) { im = im - nb; assert( (im >= 0) && (im < m) ); magmablas_zswap( n-(ii+sb), UT(i, i+1)+j*lddu, 1, AT(0, i+1)+im*ldda, 1 ); } else { ipiv[ip] += ii; } ip++; } #endif #ifndef WITHOUTTRTRI CORE_zlacpy( PlasmaUpperLower, sb, sb, (PLASMA_Complex64_t*)hL(i), ldhl, (PLASMA_Complex64_t*)hL2(i), ldhl ); CORE_ztrtri( PlasmaLower, PlasmaUnit, sb, (PLASMA_Complex64_t*)hL2(i), ldhl, info ); if (*info != 0 ) { fprintf(stderr, "ERROR, trtri returned with info = %d\n", *info); } #endif // upload i-th panel magma_zsetmatrix( sb, sb, hU(i, i), ldhu, dUp, lddu ); magma_zsetmatrix( m, sb, hA(0, i), ldha, dAp, ldda ); magma_zsetmatrix( p*ib, sb, hL(i), ldhl, L(i), lddl ); magmablas_ztranspose( sb, sb, dUp, lddu, UT(i,i), lddu ); magmablas_ztranspose( m, sb, dAp, ldda, AT(0,i), ldda ); // make sure that gpu queue is empty //magma_device_sync(); // do the small non-parallel computations if ( s > (i+1) ) { #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, sb, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, sb, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } else { #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L2(i), lddl, UT(i, i+1), lddu); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n-mindim, sb, c_one, L(i), lddl, UT(i, i+1), lddu); #endif magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-mindim, m, sb, c_neg_one, UT(i, i+1), lddu, AT(0, i ), ldda, c_one, AT(0, i+1), ldda ); } } if ( order == MagmaColMajor ) { magmablas_zgetmo_out( dU, dUT, lddu, m, n ); magmablas_zgetmo_out( dA, dAT, ldda, m, n ); } } return *info; }
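// For reference, a restatement of the hL workspace layout assumed above when
// WITHOUTTRTRI is not defined (the helper names are hypothetical; they simply
// unfold the hL()/hL2() macros): for tile i, the top ib rows of the i-th
// column block hold the lower-triangular factor and the next ib rows hold its
// unit-lower-triangular inverse, which is why LDHL >= 2*IB is required.
static inline magmaDoubleComplex* tstrf_L_tile(
    magmaDoubleComplex *hL, magma_int_t ldhl, magma_int_t ib, magma_int_t i )
{
    return hL + i*ib*ldhl;        // same as hL(i)
}
static inline magmaDoubleComplex* tstrf_Linv_tile(
    magmaDoubleComplex *hL, magma_int_t ldhl, magma_int_t ib, magma_int_t i )
{
    return hL + ib + i*ib*ldhl;   // same as hL2(i), with hL2 = hL + ib
}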
/** Purpose ------- ZCGEQRSV solves the least squares problem min || A*X - B ||, where A is an M-by-N matrix and X and B are M-by-NRHS matrices. ZCGEQRSV first attempts to factorize the matrix in complex SINGLE PRECISION and use this factorization within an iterative refinement procedure to produce a solution with complex DOUBLE PRECISION norm-wise backward error quality (see below). If the approach fails the method switches to a complex DOUBLE PRECISION factorization and solve. The iterative refinement is not going to be a winning strategy if the ratio complex SINGLE PRECISION performance over complex DOUBLE PRECISION performance is too small. A reasonable strategy should take the number of right-hand sides and the size of the matrix into account. This might be done with a call to ILAENV in the future. Up to now, we always try iterative refinement. The iterative refinement process is stopped if ITER > ITERMAX or for all the RHS we have: RNRM < SQRT(N)*XNRM*ANRM*EPS*BWDMAX where o ITER is the number of the current iteration in the iterative refinement process o RNRM is the infinity-norm of the residual o XNRM is the infinity-norm of the solution o ANRM is the infinity-operator-norm of the matrix A o EPS is the machine epsilon returned by DLAMCH('Epsilon') The value ITERMAX and BWDMAX are fixed to 30 and 1.0D+00 respectively. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. M >= N >= 0. @param[in] nrhs INTEGER The number of right hand sides, i.e., the number of columns of the matrix B. NRHS >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the M-by-N coefficient matrix A. On exit, if iterative refinement has been successfully used (info.EQ.0 and ITER.GE.0, see description below), A is unchanged. If double precision factorization has been used (info.EQ.0 and ITER.LT.0, see description below), then the array dA contains the QR factorization of A as returned by function DGEQRF_GPU. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[in,out] dB COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) The M-by-NRHS right hand side matrix B. May be overwritten (e.g., if refinement fails). @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[out] dX COMPLEX_16 array on the GPU, dimension (LDDX,NRHS) If info = 0, the N-by-NRHS solution matrix X. @param[in] lddx INTEGER The leading dimension of the array dX. LDDX >= max(1,N). @param[out] iter INTEGER - < 0: iterative refinement has failed, double precision factorization has been performed + -1 : the routine fell back to full precision for implementation- or machine-specific reasons + -2 : narrowing the precision induced an overflow, the routine fell back to full precision + -3 : failure of SGEQRF + -31: stop the iterative refinement after the 30th iteration - > 0: iterative refinement has been successfully used. 
Returns the number of iterations @param[out] info INTEGER - = 0: successful exit - < 0: if info = -i, the i-th argument had an illegal value @ingroup magma_zgels_driver ********************************************************************/ extern "C" magma_int_t magma_zcgeqrsv_gpu( magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex_ptr dA, magma_int_t ldda, magmaDoubleComplex_ptr dB, magma_int_t lddb, magmaDoubleComplex_ptr dX, magma_int_t lddx, magma_int_t *iter, magma_int_t *info) { #define dB(i,j) (dB + (i) + (j)*lddb) #define dX(i,j) (dX + (i) + (j)*lddx) #define dR(i,j) (dR + (i) + (j)*lddr) #define dSX(i,j) (dSX + (i) + (j)*lddsx) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_int_t ione = 1; magmaDoubleComplex *hworkd; magmaFloatComplex *hworks; magmaDoubleComplex *tau; magmaFloatComplex *stau; magmaDoubleComplex_ptr dworkd; magmaFloatComplex_ptr dworks; magmaDoubleComplex_ptr dR, dT; magmaFloatComplex_ptr dSA, dSX, dST; magmaDoubleComplex Xnrmv, Rnrmv; double Anrm, Xnrm, Rnrm, cte, eps; magma_int_t i, j, iiter, lddsa, lddsx, lddr, nb, lhwork, minmn, size, ldworkd; /* Check arguments */ *iter = 0; *info = 0; if ( m < 0 ) *info = -1; else if ( n < 0 || n > m ) *info = -2; else if ( nrhs < 0 ) *info = -3; else if ( ldda < max(1,m)) *info = -5; else if ( lddb < max(1,m)) *info = -7; else if ( lddx < max(1,n)) *info = -9; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if ( m == 0 || n == 0 || nrhs == 0 ) return *info; nb = magma_get_cgeqrf_nb(m); minmn= min(m, n); /* dSX contains both B and X, so must be max(m or lddb,n). */ lddsa = ldda; lddsx = max(lddb,n); lddr = lddb; /* * Allocate temporary buffers */ /* dworks(dSA + dSX + dST) */ size = lddsa*n + lddsx*nrhs + ( 2*minmn + ((n+31)/32)*32 )*nb; if (MAGMA_SUCCESS != magma_cmalloc( &dworks, size )) { fprintf(stderr, "Allocation of dworks failed (%d)\n", (int) size); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dSA = dworks; dSX = dSA + lddsa*n; dST = dSX + lddsx*nrhs; /* dworkd(dR) = lddr*nrhs */ ldworkd = lddr*nrhs; if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, ldworkd )) { magma_free( dworks ); fprintf(stderr, "Allocation of dworkd failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dR = dworkd; /* hworks(workspace for cgeqrs + stau) = min(m,n) + lhworks */ lhwork = (m - n + nb)*(nrhs + nb) + nrhs*nb; size = lhwork + minmn; magma_cmalloc_cpu( &hworks, size ); if ( hworks == NULL ) { magma_free( dworks ); magma_free( dworkd ); fprintf(stderr, "Allocation of hworks failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } stau = hworks + lhwork; eps = lapackf77_dlamch("Epsilon"); Anrm = magmablas_zlange(MagmaInfNorm, m, n, dA, ldda, (double*)dworkd ); cte = Anrm * eps * pow((double)n, 0.5) * BWDMAX; /* * Convert to single precision */ magmablas_zlag2c( m, nrhs, dB, lddb, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } magmablas_zlag2c( m, n, dA, ldda, dSA, lddsa, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // factor dSA in single precision magma_cgeqrf_gpu( m, n, dSA, lddsa, stau, dST, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // solve dSA*dSX = dB in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // residual dR = dB - dA*dX in double precision magmablas_clag2z( n, nrhs, dSX, lddsx, dX, lddx, info ); magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); if ( nrhs == 1 ) { magma_zgemv( 
MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto REFINEMENT; } } *iter = 0; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; REFINEMENT: /* TODO: this iterative refinement algorithm works only for compatibile * systems (B in colspan of A). * See Matrix Computations (3rd ed) p. 267 for correct algorithm. */ for( iiter=1; iiter < ITERMAX; ) { *info = 0; // convert residual dR to single precision dSX magmablas_zlag2c( m, nrhs, dR, lddr, dSX, lddsx, info ); if (*info != 0) { *iter = -2; goto FALLBACK; } // solve dSA*dSX = R in single precision magma_cgeqrs_gpu( m, n, nrhs, dSA, lddsa, stau, dST, dSX, lddsx, hworks, lhwork, info ); if (*info != 0) { *iter = -3; goto FALLBACK; } // Add correction and setup residual // dX += dSX [including conversion] --and-- // dR[1:n] = dB[1:n] (only n rows, not whole m rows! -- useless if m > n) for( j=0; j < nrhs; j++ ) { magmablas_zcaxpycp( n, dSX(0,j), dX(0,j), dB(0,j), dR(0,j) ); } // dR = dB (whole m rows) magmablas_zlacpy( MagmaUpperLower, m, nrhs, dB, lddb, dR, lddr ); // residual dR = dB - dA*dX in double precision if ( nrhs == 1 ) { magma_zgemv( MagmaNoTrans, m, n, c_neg_one, dA, ldda, dX, 1, c_one, dR, 1 ); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, nrhs, n, c_neg_one, dA, ldda, dX, lddx, c_one, dR, lddr ); } /* Check whether the nrhs normwise backward errors satisfy the * stopping criterion. If yes, set ITER=IITER > 0 and return. */ for( j=0; j < nrhs; j++ ) { i = magma_izamax( n, dX(0,j), 1) - 1; magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1 ); Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, NULL ); i = magma_izamax ( m, dR(0,j), 1 ) - 1; magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1 ); Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, NULL ); if ( Rnrm > Xnrm*cte ) { goto L20; } } /* If we are here, the nrhs normwise backward errors satisfy * the stopping criterion, we are good to exit. */ *iter = iiter; /* Free workspaces */ magma_free( dworks ); magma_free( dworkd ); magma_free_cpu( hworks ); return *info; L20: iiter++; } /* If we are at this place of the code, this is because we have * performed ITER=ITERMAX iterations and never satisified the * stopping criterion. Set up the ITER flag accordingly and follow * up on double precision routine. */ *iter = -ITERMAX - 1; FALLBACK: /* Single-precision iterative refinement failed to converge to a * satisfactory solution, so we resort to double precision. 
*/ magma_free( dworks ); magma_free_cpu( hworks ); /* * Allocate temporary buffers */ /* dworkd = dT for zgeqrf */ nb = magma_get_zgeqrf_nb( m ); size = (2*min(m, n) + (n+31)/32*32 )*nb; if ( size > ldworkd ) { magma_free( dworkd ); if (MAGMA_SUCCESS != magma_zmalloc( &dworkd, size )) { fprintf(stderr, "Allocation of dworkd2 failed\n"); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } dT = dworkd; /* hworkd(dtau + workspace for zgeqrs) = min(m,n) + lhwork */ size = lhwork + minmn; magma_zmalloc_cpu( &hworkd, size ); if ( hworkd == NULL ) { magma_free( dworkd ); fprintf(stderr, "Allocation of hworkd2 failed\n"); *info = MAGMA_ERR_HOST_ALLOC; return *info; } tau = hworkd + lhwork; magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, info ); if (*info == 0) { // if m > n, then dB won't fit in dX, so solve with dB and copy n rows to dX magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT, dB, lddb, hworkd, lhwork, info ); magmablas_zlacpy( MagmaUpperLower, n, nrhs, dB, lddb, dX, lddx ); } magma_free( dworkd ); magma_free_cpu( hworkd ); return *info; }
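// A minimal usage sketch for magma_zcgeqrsv_gpu (helper name is hypothetical;
// staging dA and dB on the device and checking info are left to the caller).
// The routine allocates its own single/double precision workspaces, so the
// call itself is simple: dA is m-by-n, dB is m-by-nrhs, and dX receives the
// n-by-nrhs least-squares solution.
static magma_int_t zcgeqrsv_example(
    magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magmaDoubleComplex_ptr dB, magma_int_t lddb,
    magmaDoubleComplex_ptr dX, magma_int_t lddx )
{
    magma_int_t info, iter;
    magma_zcgeqrsv_gpu( m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, &iter, &info );
    // iter > 0: single-precision QR plus refinement converged in iter steps;
    // iter < 0: the routine fell back to a double-precision QR solve.
    return info;
}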
/** Purpose ------- ZLAHRU is an auxiliary MAGMA routine that is used in ZGEHRD to update the trailing sub-matrices after the reductions of the corresponding panels. See further details below. Arguments --------- @param[in] n INTEGER The order of the matrix A. N >= 0. @param[in] ihi INTEGER Last row to update. Same as IHI in zgehrd. @param[in] k INTEGER Number of rows of the matrix Am (see details below) @param[in] nb INTEGER Block size @param[out] A COMPLEX_16 array, dimension (LDA,N-K) On entry, the N-by-(N-K) general matrix to be updated. The computation is done on the GPU. After Am is updated on the GPU only Am(1:NB) is transferred to the CPU - to update the corresponding Am matrix. See Further Details below. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,N). @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N-K). On entry, the N-by-(N-K) general matrix to be updated. On exit, the 1st K rows (matrix Am) of A are updated by applying an orthogonal transformation from the right Am = Am (I-V T V'), and sub-matrix Ag is updated by Ag = (I - V T V') Ag (I - V T V(NB+1:)' ) where Q = I - V T V' represent the orthogonal matrix (as a product of elementary reflectors V) used to reduce the current panel of A to upper Hessenberg form. After Am is updated Am(:,1:NB) is sent to the CPU. See Further Details below. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). @param[in,out] dY (workspace) COMPLEX_16 array on the GPU, dimension (LDDY, NB). On entry the (N-K)-by-NB Y = A V. It is used internally as workspace, so its value is changed on exit. @param[in] lddy INTEGER The leading dimension of the array dY. LDDY >= max(1,N). @param[in,out] dV (workspace) COMPLEX_16 array on the GPU, dimension (LDDV, NB). On entry the (N-K)-by-NB matrix V of elementary reflectors used to reduce the current panel of A to upper Hessenberg form. The rest K-by-NB part is used as workspace. V is unchanged on exit. @param[in] lddv INTEGER The leading dimension of the array dV. LDDV >= max(1,N). @param[in] dT COMPLEX_16 array on the GPU, dimension (NB, NB). On entry the NB-by-NB upper trinagular matrix defining the orthogonal Hessenberg reduction transformation matrix for the current panel. The lower triangular part are 0s. @param dwork (workspace) COMPLEX_16 array on the GPU, dimension N*NB. Further Details --------------- This implementation follows the algorithm and notations described in: S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg form through hybrid GPU-based computing," University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219), May 24, 2009. The difference is that here Am is computed on the GPU. M is renamed Am, G is renamed Ag. 
@ingroup magma_zgeev_aux ********************************************************************/ extern "C" magma_int_t magma_zlahru( magma_int_t n, magma_int_t ihi, magma_int_t k, magma_int_t nb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *dY, magma_int_t lddy, magmaDoubleComplex *dV, magma_int_t lddv, magmaDoubleComplex *dT, magmaDoubleComplex *dwork ) { #define dA(i_,j_) (dA + (i_) + (j_)*ldda) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dYm = dV + ihi - k; magma_int_t info = 0; if (n < 0) { info = -1; } else if (ihi < 0 || ihi > n) { info = -2; } else if (k < 0 || k > n) { info = -3; } else if (nb < 1 || nb > n) { info = -4; } else if (lda < max(1,n)) { info = -6; } else if (ldda < max(1,n)) { info = -8; } else if (lddy < max(1,n)) { info = -10; } else if (lddv < max(1,n)) { info = -12; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } // top part of Y, above panel, hasn't been computed yet, so do that now // Ym = Am V = A(0:k-1, 0:ihi-k-1) * V(0:ihi-k-1, 0:nb-1) magma_zgemm( MagmaNoTrans, MagmaNoTrans, k, nb, ihi-k, c_one, dA, ldda, dV, lddv, c_zero, dYm, ldda ); // ----- // on right, A := A Q = A - A V T V' // Update Am = Am - Am V T V' = Am - Ym W', with W = V T' // W = V T' = V(0:ihi-k-1, 0:nb-1) * T(0:nb-1, 0:nb-1)' magma_zgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, nb, nb, c_one, dV, lddv, dT, nb, c_zero, dwork, ldda ); // Am = Am - Ym W' = A(0:k-1, 0:ihi-k-1) - Ym(0:k-1, 0:nb-1) * W(0:ihi-k-1, 0:nb-1)' magma_zgemm( MagmaNoTrans, MagmaConjTrans, k, ihi-k, nb, c_neg_one, dYm, ldda, dwork, ldda, c_one, dA, ldda ); // copy first nb columns of Am, A(0:k-1, 0:nb-1), to host magma_zgetmatrix( k, nb, dA, ldda, A, lda ); // ----- // on right, A := A Q = A - A V T V' // Update Ag = Ag - Ag V T V' = Ag - Y W' // Ag = Ag - Y W' = A(k:ihi-1, nb:ihi-k-1) - Y(0:ihi-k-1, 0:nb-1) * W(nb:ihi-k-1, 0:nb-1)' magma_zgemm( MagmaNoTrans, MagmaConjTrans, ihi-k, ihi-k-nb, nb, c_neg_one, dY, ldda, dwork + nb, ldda, c_one, dA(k,nb), ldda ); // ----- // on left, A := Q' A = A - V T' V' A // Ag2 = Ag2 - V T' V' Ag2 = W Yg, with W = V T' and Yg = V' Ag2 // Note that Ag is A(k:ihi, nb+1:ihi-k) // while Ag2 is A(k:ihi, nb+1: n -k) // Z = V(0:ihi-k-1, 0:nb-1)' * A(k:ihi-1, nb:n-k-1); Z is stored over Y magma_zgemm( MagmaConjTrans, MagmaNoTrans, nb, n-k-nb, ihi-k, c_one, dV, lddv, dA(k,nb), ldda, c_zero, dY, nb ); // Ag2 = Ag2 - W Z = A(k:ihi-1, nb:n-k-1) - W(nb:n-k-1, 0:nb-1) * Z(0:nb-1, nb:n-k-1) magma_zgemm( MagmaNoTrans, MagmaNoTrans, ihi-k, n-k-nb, nb, c_neg_one, dwork, ldda, dY, nb, c_one, dA(k,nb), ldda ); return 0; }
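// For reference, the complete transformation applied above, gathered in one
// place (Q = I - V T V' is the block reflector of the current panel; Am and
// Ag are as defined in Further Details; Y = Ag V is given on entry):
//
//     Ym  = Am * V                           (top part of Y, computed here)
//     W   = V * T'
//     Am := Am - Ym * W'    ( = Am * Q,  update from the right )
//     Ag := Ag - Y  * W'    ( = Ag * Q,  update from the right )
//     Z   = V' * Ag2,   Ag2 := Ag2 - W * Z   ( = Q' * Ag2, update from the left )
//
// where Ag2 extends Ag to the full trailing column range A(k:ihi-1, nb:n-k-1).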
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. This version assumes the computation runs through the NULL stream and therefore is not overlapping some computation with communication. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_gpu(magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) { #define dA(i, j) (dA + (j)*ldda + (i)) magma_int_t j, jb, nb; const char* uplo_ = lapack_uplo_const( uplo ); magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *work; double d_one = 1.0; double d_neg_one = -1.0; int upper = (uplo == MagmaUpper); *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_zpotrf_nb(n); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t stream[2]; magma_queue_create( &stream[0] ); magma_queue_create( &stream[1] ); if ((nb <= 1) || (nb >= n)) { /* Use unblocked code. */ magma_zgetmatrix( n, n, dA, ldda, work, n ); lapackf77_zpotrf(uplo_, &n, work, &n, info); magma_zsetmatrix( n, n, work, n, dA, ldda ); } else { /* Use blocked code. */ if (upper) { /* Compute the Cholesky factorization A = U'*U. */ for (j=0; j < n; j += nb) { /* Update and factorize the current diagonal block and test for non-positive-definiteness. Computing MIN */ jb = min(nb, (n-j)); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] ); if ( (j+jb) < n) { /* Compute the current block row. 
*/ magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, (n-j-jb), j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda); } magma_queue_sync( stream[1] ); lapackf77_zpotrf(MagmaUpperStr, &jb, work, &jb, info); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, (n-j-jb), c_one, dA(j, j ), ldda, dA(j, j+jb), ldda); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // Update and factorize the current diagonal block and test // for non-positive-definiteness. Computing MIN jb = min(nb, (n-j)); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, stream[1] ); if ( (j+jb) < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, (n-j-jb), jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda); } magma_queue_sync( stream[1] ); lapackf77_zpotrf(MagmaLowerStr, &jb, work, &jb, info); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, stream[0] ); if (*info != 0) { *info = *info + j; break; } if ( (j+jb) < n) { magma_ztrsm(MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, (n-j-jb), jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda); } } } } magma_queue_destroy( stream[0] ); magma_queue_destroy( stream[1] ); magma_free_pinned( work ); return *info; } /* magma_zpotrf_gpu */
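// A minimal sketch (helper name and padding choice are assumptions): Cholesky-
// factor an n-by-n Hermitian positive definite host matrix by staging it on
// the GPU. The leading dimension is rounded up to a multiple of 32, which also
// satisfies the divisible-by-16 note in the documentation above.
static magma_int_t zpotrf_gpu_example( magma_int_t n,
                                       magmaDoubleComplex *A, magma_int_t lda )
{
    magma_int_t info;
    magma_int_t ldda = ((n+31)/32)*32;
    magmaDoubleComplex *dA;
    if (MAGMA_SUCCESS != magma_zmalloc( &dA, ldda*n ))
        return MAGMA_ERR_DEVICE_ALLOC;
    magma_zsetmatrix( n, n, A, lda, dA, ldda );
    magma_zpotrf_gpu( MagmaLower, n, dA, ldda, &info );  // dA := L, with A = L*L^H
    magma_zgetmatrix( n, n, dA, ldda, A, lda );
    magma_free( dA );
    return info;
}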
extern "C" magma_int_t magma_zgessm_gpu( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL, magma_int_t lddl, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info) { /* -- MAGMA (version 1.4.1) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver December 2013 Purpose ======= ZGESSM applies the factors L computed by ZGETRF_INCPIV to a complex M-by-N tile A. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. K (input) INTEGER The number of columns of the matrix L. K >= 0. IB (input) INTEGER The inner-blocking size. IB >= 0. IPIV (input) INTEGER array on the cpu. The pivot indices array of size K as returned by ZGETRF_INCPIV. dL1 (input) DOUBLE COMPLEX array, dimension(LDDL1, N) The IB-by-K matrix in which is stored L^(-1) as returned by GETRF_INCPIV LDDL1 (input) INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). dL (input) DOUBLE COMPLEX array, dimension(LDDL, N) The M-by-K lower triangular tile on the gpu. LDDL (input) INTEGER The leading dimension of the array L. LDDL >= max(1,M). dA (input/output) DOUBLE COMPLEX array, dimension (LDDA, N) On entry, the M-by-N tile A on the gpu. On exit, updated by the application of L on the gpu. ===================================================================== */ #define AT(i,j) (dAT + (i)*ldda + (j) ) #define L(i,j) (dL + (i) + (j)*lddl ) #define dL1(j) (dL1 + (j)*lddl1) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; int i, s, sb; magmaDoubleComplex *dAT; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; if ( (storev == 'C') || (storev == 'c') ) { magmablas_zgetmo_in( dA, dAT, ldda, m, n ); } else { dAT = dA; } s = k / ib; for(i = 0; i < k; i += ib) { sb = min(ib, k-i); magmablas_zlaswp( n, dAT, ldda, i+1, i+sb, ipiv, 1 ); #ifndef WITHOUTTRTRI magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, dL1(i), lddl1, AT(i, 0), ldda); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n, sb, c_one, L( i, i), lddl, AT(i, 0), ldda); #endif if ( (i+sb) < m) { magma_zgemm( MagmaNoTrans, MagmaTrans, n, m-(i+sb), sb, c_neg_one, AT(i, 0), ldda, L( i+sb, i), lddl, c_one, AT(i+sb, 0), ldda ); } } if ( (storev == 'C') || (storev == 'c') ) { magmablas_zgetmo_in( dA, dAT, ldda, m, n ); } return *info; /* End of MAGMA_ZGETRF_GPU */ }
// ---------------------------------------------------------------------- extern "C" void magma_zher2k_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k, magmaDoubleComplex alpha, magmaDoubleComplex_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDoubleComplex_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t nqueue, magma_queue_t queues[][10]) { #define dB(id, i, j) (dB[(id)] + (j)*lddb + (i)+b_offset) #define dB1(id, i, j) (dB[(id)] + (j)*lddb + (i)+b_offset) + k*lddb #define dC(id, i, j) (dC[(id)] + (j)*lddc + (i)) magma_int_t i, id, ib, ii, kk, n1; magmaDoubleComplex c_one = MAGMA_Z_ONE; magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* diagonal update */ for( i=0; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = (i/(nb*ngpu))%nqueue; magma_setdevice( id ); magmablasSetKernelStream( queues[id][kk] ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); /* zher2k on diagonal block */ trace_gpu_start( id, kk, "syr2k", "syr2k" ); magma_zher2k( uplo, trans, ib, k, alpha, dB1(id, i, 0 ), lddb, dB(id, i, 0 ), lddb, beta, dC(id, i+c_offset, ii), lddc ); trace_gpu_end( id, kk ); } /* off-diagonal update */ if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = (i/(nb*ngpu))%nqueue; magma_setdevice( id ); magmablasSetKernelStream( queues[id][kk] ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); magma_zgemm( MagmaNoTrans, MagmaConjTrans, i, ib, k, alpha, dB1(id, 0, 0 ), lddb, dB(id, i, 0 ), lddb, c_one, dC(id, 0, ii), lddc ); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = (i/(nb*ngpu))%nqueue; magma_setdevice( id ); magmablasSetKernelStream( queues[id][kk] ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); n1 = n-i-ib; // zgemm on off-diagonal blocks trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n1, ib, k, alpha, dB1(id, i+ib, 0 ), lddb, dB(id, i, 0 ), lddb, c_one, dC(id, i+c_offset+ib, ii), lddc ); trace_gpu_end( id, kk ); } } if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = (i/(nb*ngpu))%nqueue; magma_setdevice( id ); magmablasSetKernelStream( queues[id][kk] ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); magma_zgemm( MagmaNoTrans, MagmaConjTrans, i, ib, k, alpha, dB( id, 0, 0 ), lddb, dB1(id, i, 0 ), lddb, c_one, dC(id, 0, ii), lddc ); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = (i/(nb*ngpu))%nqueue; magma_setdevice( id ); magmablasSetKernelStream( queues[id][kk] ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); n1 = n-i-ib; /* zgemm on off-diagonal blocks */ trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n1, ib, k, alpha, dB(id, i+ib, 0 ), lddb, dB1(id, i, 0 ), lddb, c_one, dC(id, i+c_offset+ib, ii), lddc ); trace_gpu_end( id, kk ); } } for( id=0; id < ngpu; id++ ) { magma_setdevice( id ); for( kk=0; kk < nqueue; kk++ ) { magma_queue_sync( queues[id][kk] ); } } magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); }
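// Note on the dB layout assumed above (an observation from the dB()/dB1()
// macros, not additional API): on each GPU, dB packs the two n-by-k input
// blocks side by side, so it must provide at least 2*k columns; dB1 addresses
// the second block at a column offset of k. The helper name is hypothetical.
static inline magmaDoubleComplex* zher2k_mgpu_second_block(
    magmaDoubleComplex_ptr dB_dev, magma_int_t lddb,
    magma_int_t b_offset, magma_int_t k )
{
    return dB_dev + b_offset + k*lddb;   // same as dB1(id, 0, 0)
}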
/** Purpose ------- ZSSSSM applies the LU factorization update from a complex matrix formed by a lower triangular IB-by-K tile L1 on top of a M2-by-K tile L2 to a second complex matrix formed by a M1-by-N1 tile A1 on top of a M2-by-N2 tile A2 (N1 == N2). This is the right-looking Level 2.5 BLAS version of the algorithm. Arguments --------- @param[in] m1 INTEGER The number of rows of the matrix A1. M1 >= 0. @param[in] n1 INTEGER The number of columns of the matrix A1. N1 >= 0. @param[in] m2 INTEGER The number of rows of the matrix A2. M2 >= 0. @param[in] n2 INTEGER The number of columns of the matrix A2. N2 >= 0. @param[in] k INTEGER The number of columns of the matrix L1 and L2. K >= 0. @param[in] ib INTEGER The inner-blocking size. IB >= 0. @param[in,out] dA1 COMPLEX_16 array, dimension(LDDA1, N), on gpu. On entry, the M1-by-N1 tile dA1. On exit, dA1 is updated by the application of dL (dL1 dL2). @param[in] ldda1 INTEGER The leading dimension of the array dA1. LDDA1 >= max(1,M1). @param[in,out] dA2 COMPLEX_16 array, dimension(LDDA2, N), on gpu. On entry, the M2-by-N2 tile dA2. On exit, dA2 is updated by the application of dL (dL1 dL2). @param[in] ldda2 INTEGER The leading dimension of the array dA2. LDDA2 >= max(1,M2). @param[in] dL1 COMPLEX_16 array, dimension(LDDL1, K), on gpu. The inverse of the IB-by-K lower triangular tile as returned by ZTSTRF. @param[in] lddl1 INTEGER The leading dimension of the array L1. LDDL1 >= max(1,2*IB). @param[in] dL2 COMPLEX_16 array, dimension(LDDL2, K) The M2-by-K tile as returned by ZTSTRF. @param[in] lddl2 INTEGER The leading dimension of the array L2. LDDL2 >= max(1,M2). @param[in] ipiv INTEGER array on the cpu. The pivot indices array of size K as returned by ZTSTRF @ingroup magma_zgesv_tile ********************************************************************/ extern "C" magma_int_t magma_zssssm_gpu( magma_order_t order, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex_ptr dA1, magma_int_t ldda1, magmaDoubleComplex_ptr dA2, magma_int_t ldda2, magmaDoubleComplex_ptr dL1, magma_int_t lddl1, magmaDoubleComplex_ptr dL2, magma_int_t lddl2, magma_int_t *ipiv, magma_int_t *info) { #define A1T(i,j) (dA1T + (i)*ldda1 + (j)) #define A2T(i,j) (dA2T + (i)*ldda2 + (j)) #define L1(i) (dL1 + (i)*lddl1 ) #define L2(i,j) (dL2 + (i)*lddl2i + (j)*lddl2j) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; int ip, ii, sb; magmaDoubleComplex_ptr dA1T, dA2T; magma_trans_t transL; int lddl2i, lddl2j; MAGMA_UNUSED( ip ); // used only if NOSWAPBLK /* Check input arguments */ *info = 0; if (m1 < 0) { *info = -1; } else if (n1 < 0) { *info = -2; } else if (m2 < 0) { *info = -3; } else if (n2 < 0) { *info = -4; } else if (k < 0) { *info = -5; } else if (ib < 0) { *info = -6; } else if (ldda1 < max(1,m1)) { *info = -8; } else if (ldda2 < max(1,m2)) { *info = -10; } else if (lddl1 < max(1,ib)) { *info = -12; } else if (lddl2 < max(1,m2)) { *info = -14; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return */ if ((m1 == 0) || (n1 == 0) || (m2 == 0) || (n2 == 0) || (k == 0) || (ib == 0)) return *info; if ( order == MagmaColMajor ) { magmablas_zgetmo_in( dA1, dA1T, ldda1, m1, n1 ); magmablas_zgetmo_in( dA2, dA2T, ldda2, m2, n2 ); transL = MagmaTrans; lddl2i = 1; lddl2j = lddl2; } else { dA1T = dA1; dA2T = dA2; transL = MagmaNoTrans; lddl2i = lddl2; lddl2j = 1; } ip = 0; for( ii=0; ii < k; ii += ib ) { sb = min( k-ii, ib); #ifndef NOSWAPBLK 
magmablas_zswapblk( MagmaRowMajor, n1, A1T(0, 0), ldda1, A2T(0, 0), ldda2, ii+1, ii+ib, ipiv, 1, m1 ); #else { int im; for (i=0; i < ib; i++) { im = ipiv[ip]-1; if (im != (ii+i)) { im = im - m1; assert( (im >= 0) && (im < m1) && (im < m2) ); magmablas_zswap( n1, A1T(ii+i, 0), 1, A2T(im, 0), 1 ); } ip++; } } #endif #ifndef WITHOUTTRTRI /* Lower, Trans, because L1 is not transposed */ magma_ztrmm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n1, sb, c_one, L1( ii), lddl1, A1T(ii, 0), ldda1); #else /* Lower, Trans, because L1 is not transposed */ magma_ztrsm( MagmaRight, MagmaLower, MagmaTrans, MagmaUnit, n1, sb, c_one, L1( ii), lddl1, A1T(ii, 0), ldda1); #endif /* Second parameter is trans because L2 is not transposed */ magma_zgemm( MagmaNoTrans, transL, n2, m2, sb, c_neg_one, A1T(ii, 0), ldda1, L2( 0, ii), lddl2, c_one, A2T(0, 0 ), ldda2 ); } if ( order == MagmaColMajor ) { magmablas_zgetmo_out( dA1, dA1T, ldda1, m1, n1 ); magmablas_zgetmo_out( dA2, dA2T, ldda2, m2, n2 ); } return *info; }
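/* Illustration only (not part of the MAGMA source): the stride trick behind the
 * L2(i,j) macro above.  By choosing the (row-stride, column-stride) pair, the same
 * addressing formula covers both storage orders, which is how zssssm selects
 * between the MagmaColMajor path (lddl2i = 1, lddl2j = lddl2) and the row-major
 * path (lddl2i = lddl2, lddl2j = 1) without duplicating the BLAS calls. */
static magmaDoubleComplex* example_elem(
    magmaDoubleComplex *base, magma_int_t i, magma_int_t j,
    magma_int_t row_stride, magma_int_t col_stride )
{
    return base + i*row_stride + j*col_stride;
}
/* column-major, leading dimension ld:  example_elem( p, i, j, 1,  ld ) */
/* row-major,    leading dimension ld:  example_elem( p, i, j, ld, 1  ) */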
extern "C" magma_err_t magma_zpotrf2_msub(int num_subs, int num_gpus, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex_ptr *d_lA, size_t d_lA_offset, magma_int_t ldda, magmaDoubleComplex_ptr *d_lP, magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_int_t *info, magma_queue_t *queues ) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ int tot_subs = num_subs*num_gpus; magma_int_t j, jb, nb0, nb2, dd, d, id, j_local, j_local2; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; magmaDoubleComplex_ptr dlpanel; size_t dlpanel_offset; magma_int_t n_local[MagmaMaxSubs * MagmaMaxGPUs], ldpanel; // initialize trace trace_init(1, num_gpus, 2, queues); *info = 0; if ( (uplo != MagmaUpper) && (uplo != MagmaLower) ) { *info = -1; } else if (n < 0) { *info = -2; } else if ((uplo != MagmaUpper) && tot_subs*ldda < max(1,n)) { *info = -4; } else if ((uplo == MagmaUpper) && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } for (d=0; d<tot_subs; d++) { /* local-n and local-ld */ if (uplo == MagmaUpper) { n_local[d] = ((n/nb)/tot_subs)*nb; if (d < (n/nb)%tot_subs) n_local[d] += nb; else if (d == (n/nb)%tot_subs) n_local[d] += n%nb; } else { n_local[d] = ((m/nb)/tot_subs)*nb; if (d < (m/nb)%tot_subs) n_local[d] += nb; else if (d == (m/nb)%tot_subs) n_local[d] += m%nb; } } /* Use blocked code. */ if (uplo == MagmaUpper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. 
*/ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%tot_subs; /* Set the local index where the current panel is */ j_local = j/(nb*tot_subs); jb = min(nb, (m-j)); if (j > 0) { // Wait for the column on CPU magma_queue_sync(queues[2*(id%num_gpus)]); /* broadcast off-diagonal column to all gpus */ d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d != id%num_gpus) { magma_zsetmatrix_async( j, jb, Aup_off(0,j), lda, dlP(d,jb,0,id%num_gpus), lddp, queues[2*d], trace_gpu_event(d, 0, "set", "set-col") ); } d = (d+1)%num_gpus; } /* Update the current diagonal block */ trace_gpu_start(id%num_gpus, 1, "herk", "herk"); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda, queues[2*(id%num_gpus)+1]); magma_queue_sync(queues[2*(id%num_gpus)+1]); // Wait for syrk } /* Send the diagonal to cpu */ magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup_off(j,j), lda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get-diag") ); if (j > 0) { /* Compute the local block column of the panel. */ d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { j_local2 = j_local+1; if (d > id) j_local2 --; nb0 = nb*j_local2; if (n_local[d] > nb0) { if (d%num_gpus != id%num_gpus) { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlP_offset(jb, 0, id%num_gpus); ldpanel = lddp; /* Wait for the offdiagonal column */ if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); } else { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(0, nb*j_local); ldpanel = ldda; } /* update the panel */ trace_gpu_start(d%num_gpus, 1, "gemm", "gemm"); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } /* factor the diagonal */ magma_queue_sync( queues[2*(id%num_gpus)] ); // wait for the diagonal trace_cpu_start(0, "potrf", "potrf"); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end(0); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus */ if ((j+jb) < n) { d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlP_offset(0, 0, id%num_gpus); ldpanel = lddp; } magma_zsetmatrix_async( jb, jb, Aup_off(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[2*d], trace_gpu_event(d, 0, "set", "set-diag")); d = (d+1)%num_gpus; } } else { magma_zsetmatrix_async( jb, jb, Aup_off(j,j), lda, dlA(id, j, nb*j_local), ldda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") ); } /* panel-factorize the off-diagonal */ if ((j+jb) < n) { d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { /* next column */ j_local2 = j_local+1; if (d > id) j_local2--; if (d%num_gpus == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(j, nb*j_local); ldpanel = ldda; } else { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlP_offset(0, 0, id%num_gpus); ldpanel = lddp; } nb2 = n_local[d]-nb*j_local2; nb0 = min(nb, nb2); if (dd < num_gpus) magma_queue_sync( queues[2*(d%num_gpus)] ); // wait for the diagonal if (j+jb < m && d == (j/nb+1)%tot_subs) { /* owns the next column, look-ahead the column */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, 
MagmaNonUnit, jb, nb0, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*(d%num_gpus)+1] ); /* send the column to cpu */ magma_queue_sync(queues[2*(d%num_gpus)+1]); // wait for lookahead magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup_off(0,j+jb), lda, queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get-col") ); /* update the remaining blocks */ nb2 = nb2 - nb0; trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, queues[2*(d%num_gpus)+1] ); } else if (nb2 > 0) { /* update the entire trailing matrix */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, j, nb*j_local2), ldda, queues[2*(d%num_gpus)+1] ); } d = (d+1)%tot_subs; } } } } else { /* -------------------------------------------- */ /* Lower-triangular case */ /* Compute the Cholesky factorization A = L*L'. */ /* -------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%tot_subs; /* Set the local index where the current panel is */ j_local = j/(nb*tot_subs); jb = min(nb, (n-j)); if (j > 0) { if (num_gpus > 1) { // Wait for the row on CPU to broadcast magma_queue_sync(queues[2*(id%num_gpus)]); } /* broadcast off-diagonal row to all the GPUs */ d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d != id%num_gpus) { /* send it to GPU-d */ magma_zsetmatrix_async( jb, j, Alo_off(j,0), lda, dlPT(d,0,jb,id%num_gpus), nb, queues[2*d], trace_gpu_event(d, 0, "set", "set-row") ); } d = (d+1)%num_gpus; } /* Update the current diagonal block */ trace_gpu_start(id%num_gpus, 1, "herk", "herk"); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda, queues[2*(id%num_gpus)+1]); magma_queue_sync(queues[2*(id%num_gpus)+1]); // wait for syrk } /* send the diagonal to cpu */ magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo_off(j,j), lda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "get", "get") ); /* update the offdiagonal blocks */ if (j > 0) { /* compute the block-rows of the panel */ d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { j_local2 = j_local+1; if (d > id) j_local2 --; nb0 = nb*j_local2; if (nb0 < n_local[d]) { if (d%num_gpus != id%num_gpus) { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlPT_offset(0, jb, id%num_gpus); ldpanel = nb; /* Wait for offdiagonal row */ if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); } else { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, 0); ldpanel = ldda; } /* Update the panel */ trace_gpu_start(d%num_gpus, 1, "gemm", "gemm"); magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, dlpanel_offset, ldpanel, c_one, dlA(d, nb0, j), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } /* factor the diagonal */ magma_queue_sync( queues[2*(id%num_gpus)] ); trace_cpu_start(0, "potrf", "potrf"); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); trace_cpu_end(0); if (*info != 0) { printf( " zpotrf returned %d (id=%d,j=%d,j_local=%d,jb=%d)\n",*info,id,j,j_local,jb ); *info = *info + j; break; } /* send the diagonal to gpus */ if ((j+jb) < m) { d = (j/nb+1)%num_gpus; for (dd=0; dd<num_gpus; dd++) { if (d == 
id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { dlpanel = d_lP[d]; dlpanel_offset = dlPT_offset(0, 0, id%num_gpus); ldpanel = nb; } magma_zsetmatrix_async( jb, jb, Alo_off(j,j), lda, dlpanel, dlpanel_offset, ldpanel, queues[2*d], trace_gpu_event(d, 0, "set", "set-diag") ); d = (d+1)%num_gpus; } } else { magma_zsetmatrix_async( jb, jb, Alo_off(j,j), lda, dlA(id, nb*j_local, j), ldda, queues[2*(id%num_gpus)], trace_gpu_event(id%num_gpus, 0, "set", "set-diag") ); } /* factorize off-diagonal blocks */ if ((j+jb) < m) { d = (j/nb+1)%tot_subs; for (dd=0; dd<tot_subs; dd++) { /* next column */ j_local2 = j_local+1; if (d > id) j_local2--; if (d%num_gpus == id%num_gpus) { dlpanel = d_lA[id]; dlpanel_offset = dlA_offset(nb*j_local, j); ldpanel = ldda; } else { dlpanel = d_lP[d%num_gpus]; dlpanel_offset = dlPT_offset(0, 0, id%num_gpus); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2 ); // wait for the diagonal if (dd < num_gpus) magma_queue_sync(queues[2*(d%num_gpus)]); if (j+jb < n && d == (j/nb+1)%tot_subs) { /* owns the next column, look-ahead the column */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[2*(d%num_gpus)+1]); /* send the column to cpu */ magma_queue_sync( queues[2*(d%num_gpus)+1] ); // wait for lookahead magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo_off(j+jb,0), lda, queues[2*(d%num_gpus)], trace_gpu_event(d%num_gpus, 0, "get", "get") ); /* update the remaining blocks */ nb2 = nb2 - nb0; trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, queues[2*(d%num_gpus)+1]); } else if (nb2 > 0) { /* update the entire trailing matrix */ trace_gpu_start(d%num_gpus, 1, "trsm", "trsm"); magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, dlpanel_offset, ldpanel, dlA(d, nb*j_local2, j), ldda, queues[2*(d%num_gpus)+1]); } d = (d+1)%tot_subs; } } } } /* end of else not upper */ /* clean up */ for( d=0; d<num_gpus; d++ ) { magma_queue_sync( queues[2*d] ); magma_queue_sync( queues[2*d+1] ); } trace_finalize("zpotrf_msub.svg", "trace.css"); return *info; } /* magma_zpotrf2_msub */
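/* Illustration only (not part of the MAGMA source): the local column count
 * n_local[d] used by magma_zpotrf2_msub above.  With nb-wide block columns
 * distributed block-cyclically over tot_subs = num_subs*num_gpus pieces, every
 * piece gets floor(n/nb)/tot_subs full blocks, the first (n/nb)%tot_subs pieces
 * get one extra full block, and the piece that owns global block number n/nb also
 * gets the trailing partial block of n%nb columns.  The counts sum to n. */
static void example_potrf_local_widths(
    magma_int_t n, magma_int_t nb, magma_int_t tot_subs, magma_int_t *n_local )
{
    for (magma_int_t d = 0; d < tot_subs; ++d) {
        n_local[d] = ((n/nb) / tot_subs) * nb;
        if      (d <  (n/nb) % tot_subs) n_local[d] += nb;      /* extra full block   */
        else if (d == (n/nb) % tot_subs) n_local[d] += n % nb;  /* trailing remainder */
    }
}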
/** Purpose ------- ZGEBRD reduces a general complex M-by-N matrix A to upper or lower bidiagonal form B by an orthogonal transformation: Q**H * A * P = B. If m >= n, B is upper bidiagonal; if m < n, B is lower bidiagonal. Arguments --------- @param[in] m INTEGER The number of rows in the matrix A. M >= 0. @param[in] n INTEGER The number of columns in the matrix A. N >= 0. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N general matrix to be reduced. On exit, if m >= n, the diagonal and the first superdiagonal are overwritten with the upper bidiagonal matrix B; the elements below the diagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the first superdiagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors; \n if m < n, the diagonal and the first subdiagonal are overwritten with the lower bidiagonal matrix B; the elements below the first subdiagonal, with the array TAUQ, represent the orthogonal matrix Q as a product of elementary reflectors, and the elements above the diagonal, with the array TAUP, represent the orthogonal matrix P as a product of elementary reflectors. See Further Details. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[out] d double precision array, dimension (min(M,N)) The diagonal elements of the bidiagonal matrix B: D(i) = A(i,i). @param[out] e double precision array, dimension (min(M,N)-1) The off-diagonal elements of the bidiagonal matrix B: if m >= n, E(i) = A(i,i+1) for i = 1,2,...,n-1; if m < n, E(i) = A(i+1,i) for i = 1,2,...,m-1. @param[out] tauq COMPLEX_16 array dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix Q. See Further Details. @param[out] taup COMPLEX_16 array, dimension (min(M,N)) The scalar factors of the elementary reflectors which represent the orthogonal matrix P. See Further Details. @param[out] work (workspace) COMPLEX_16 array, dimension (MAX(1,LWORK)) On exit, if INFO = 0, WORK[0] returns the optimal LWORK. @param[in] lwork INTEGER The length of the array WORK. LWORK >= (M+N)*NB, where NB is the optimal blocksize. \n If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the WORK array, returns this value as the first entry of the WORK array, and no error message related to LWORK is issued by XERBLA. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value. Further Details --------------- The matrices Q and P are represented as products of elementary reflectors: If m >= n, Q = H(1) H(2) . . . H(n) and P = G(1) G(2) . . . G(n-1) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors; v(1:i-1) = 0, v(i) = 1, and v(i+1:m) is stored on exit in A(i+1:m,i); u(1:i) = 0, u(i+1) = 1, and u(i+2:n) is stored on exit in A(i,i+2:n); tauq is stored in TAUQ(i) and taup in TAUP(i). If m < n, Q = H(1) H(2) . . . H(m-1) and P = G(1) G(2) . . . G(m) Each H(i) and G(i) has the form: H(i) = I - tauq * v * v' and G(i) = I - taup * u * u' where tauq and taup are complex scalars, and v and u are complex vectors; v(1:i) = 0, v(i+1) = 1, and v(i+2:m) is stored on exit in A(i+2:m,i); u(1:i-1) = 0, u(i) = 1, and u(i+1:n) is stored on exit in A(i,i+1:n); tauq is stored in TAUQ(i) and taup in TAUP(i). 
The contents of A on exit are illustrated by the following examples: @verbatim m = 6 and n = 5 (m > n): m = 5 and n = 6 (m < n): ( d e u1 u1 u1 ) ( d u1 u1 u1 u1 u1 ) ( v1 d e u2 u2 ) ( e d u2 u2 u2 u2 ) ( v1 v2 d e u3 ) ( v1 e d u3 u3 u3 ) ( v1 v2 v3 d e ) ( v1 v2 e d u4 u4 ) ( v1 v2 v3 v4 d ) ( v1 v2 v3 e d u5 ) ( v1 v2 v3 v4 v5 ) @endverbatim where d and e denote diagonal and off-diagonal elements of B, vi denotes an element of the vector defining H(i), and ui an element of the vector defining G(i). @ingroup magma_zgesvd_comp ********************************************************************/ extern "C" magma_int_t magma_zgebrd( magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda, double *d, double *e, magmaDoubleComplex *tauq, magmaDoubleComplex *taup, magmaDoubleComplex *work, magma_int_t lwork, magma_int_t *info) { #define A(i, j) (A + (j)*lda + (i)) #define dA(i, j) (dA + (j)*ldda + (i)) magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex *dA, *dwork; magma_int_t ncol, nrow, jmax, nb, ldda; magma_int_t i, j, nx; magma_int_t iinfo; magma_int_t minmn; magma_int_t ldwrkx, ldwrky, lwkopt; magma_int_t lquery; nb = magma_get_zgebrd_nb( m, n ); ldda = m; lwkopt = (m + n) * nb; work[0] = magma_zmake_lwork( lwkopt ); lquery = (lwork == -1); /* Check arguments */ *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (lda < max(1,m)) { *info = -4; } else if (lwork < lwkopt && (! lquery) ) { *info = -10; } if (*info < 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; /* Quick return if possible */ minmn = min(m,n); if (minmn == 0) { work[0] = c_one; return *info; } magma_queue_t queue = NULL; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magmaDoubleComplex *work2; magma_int_t lwork2 = max(m,n); if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work2, lwork2 )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } if (MAGMA_SUCCESS != magma_zmalloc( &dA, n*ldda + (m + n)*nb )) { magma_free_cpu( work2 ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } dwork = dA + n*ldda; ldwrkx = m; ldwrky = n; /* Set the block/unblock crossover point NX. 
*/ nx = 128; /* Copy the matrix to the GPU */ if (minmn - nx >= 1) { magma_zsetmatrix( m, n, A, lda, dA, ldda, queue ); } for (i=0; i < (minmn - nx); i += nb) { /* Reduce rows and columns i:i+nb-1 to bidiagonal form and return the matrices X and Y which are needed to update the unreduced part of the matrix */ nrow = m - i; ncol = n - i; /* Get the current panel (no need for the 1st iteration) */ if ( i > 0 ) { magma_zgetmatrix( nrow, nb, dA(i, i), ldda, A( i, i), lda, queue ); magma_zgetmatrix( nb, ncol - nb, dA(i, i+nb), ldda, A( i, i+nb), lda, queue ); } magma_zlabrd_gpu(nrow, ncol, nb, A(i, i), lda, dA(i, i), ldda, d+i, e+i, tauq+i, taup+i, work, ldwrkx, dwork, ldwrkx, // x, dx work+(ldwrkx*nb), ldwrky, dwork+(ldwrkx*nb), ldwrky, work2, lwork2, queue ); // y, dy /* Update the trailing submatrix A(i+nb:m,i+nb:n), using an update of the form A := A - V*Y' - X*U' */ nrow = m - i - nb; ncol = n - i - nb; // Send Y back to the GPU magma_zsetmatrix( nrow, nb, work + nb, ldwrkx, dwork + nb, ldwrkx, queue ); magma_zsetmatrix( ncol, nb, work + (ldwrkx+1)*nb, ldwrky, dwork + (ldwrkx+1)*nb, ldwrky, queue ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, nrow, ncol, nb, c_neg_one, dA(i+nb, i ), ldda, dwork+(ldwrkx+1)*nb, ldwrky, c_one, dA(i+nb, i+nb), ldda, queue ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nrow, ncol, nb, c_neg_one, dwork+nb, ldwrkx, dA( i, i+nb ), ldda, c_one, dA( i+nb, i+nb ), ldda, queue ); /* Copy diagonal and off-diagonal elements of B back into A */ if (m >= n) { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_Z_MAKE( d[j], 0. ); *A(j, j+1) = MAGMA_Z_MAKE( e[j], 0. ); } } else { jmax = i + nb; for (j = i; j < jmax; ++j) { *A(j, j ) = MAGMA_Z_MAKE( d[j], 0. ); *A(j+1, j ) = MAGMA_Z_MAKE( e[j], 0. ); } } } /* Use unblocked code to reduce the remainder of the matrix */ nrow = m - i; ncol = n - i; if ( 0 < minmn - nx ) { magma_zgetmatrix( nrow, ncol, dA(i, i), ldda, A( i, i), lda, queue ); } lapackf77_zgebrd( &nrow, &ncol, A(i, i), &lda, d+i, e+i, tauq+i, taup+i, work, &lwork, &iinfo); work[0] = magma_zmake_lwork( lwkopt ); magma_free_cpu( work2 ); magma_free( dA ); magma_queue_destroy( queue ); return *info; } /* magma_zgebrd */
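/* A minimal calling sketch (not part of the MAGMA source): bidiagonalize an
 * m-by-n host matrix with magma_zgebrd, using the lwork = -1 workspace query
 * documented above.  The helper name is hypothetical; it assumes the standard
 * MAGMA macros MAGMA_Z_REAL / MAGMA_SUCCESS / MAGMA_ERR_HOST_ALLOC and the
 * magma_zmalloc_cpu / magma_free_cpu allocators already used in this file. */
static magma_int_t example_bidiagonalize(
    magma_int_t m, magma_int_t n, magmaDoubleComplex *A, magma_int_t lda,
    double *d, double *e, magmaDoubleComplex *tauq, magmaDoubleComplex *taup )
{
    magma_int_t info, lwork;
    magmaDoubleComplex query;
    magmaDoubleComplex *work;

    /* 1. workspace query: only query is written, no factorization happens */
    magma_zgebrd( m, n, A, lda, d, e, tauq, taup, &query, -1, &info );
    if (info != 0)
        return info;
    lwork = (magma_int_t) MAGMA_Z_REAL( query );

    /* 2. allocate the host workspace and run the factorization */
    if (MAGMA_SUCCESS != magma_zmalloc_cpu( &work, lwork ))
        return MAGMA_ERR_HOST_ALLOC;
    magma_zgebrd( m, n, A, lda, d, e, tauq, taup, work, lwork, &info );
    magma_free_cpu( work );
    return info;
}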
extern "C" magma_int_t magma_zlaqps_gpu(magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex *A, magma_int_t lda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex *auxv, magmaDoubleComplex *F, magma_int_t ldf) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0 OFFSET (input) INTEGER The number of rows of A that have been factorized in previous steps. NB (input) INTEGER The number of columns to factorize. KB (output) INTEGER The number of columns actually factorized. A (input/output) COMPLEX*16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). JPVT (input/output) INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. TAU (output) COMPLEX*16 array, dimension (KB) The scalar factors of the elementary reflectors. VN1 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. VN2 (input/output) DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. AUXV (input/output) COMPLEX*16 array, dimension (NB) Auxiliar vector. F (input/output) COMPLEX*16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. LDF (input) INTEGER The leading dimension of the array F. LDF >= max(1,N). ===================================================================== */ #define A(i, j) (A + (i) + (j)*(lda )) #define F(i, j) (F + (i) + (j)*(ldf )) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; //double d__1; magmaDoubleComplex z__1; //magma_int_t j; magma_int_t k, rk; //magmaDoubleComplex Akk; magmaDoubleComplex *Aks; magmaDoubleComplex tauk; magma_int_t pvt; //double temp, temp2; double tol3z; magma_int_t itemp; double lsticc, *lsticcs; magma_int_t lastrk; magma_dmalloc( &lsticcs, 1+256*(n+255)/256 ); lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); lsticc = 0; k = 0; magma_zmalloc( &Aks, nb ); while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // Fortran: pvt, k, idamax are all 1-based; subtract 1 from k. // C: pvt, k, idamax are all 0-based; don't subtract 1. pvt = k - 1 + magma_idamax( n-k, &vn1[k], ione ); if (pvt != k) { /*if (pvt >= nb) { // 1. 
Start copy from GPU magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, stream ); }*/ /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; /*if (pvt < nb){ // no need of transfer if pivot is within the panel blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { // 1. Finish copy from GPU magma_queue_sync( stream ); // 2. Swap as usual on CPU blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); // 3. Restore the GPU magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, stream); }*/ magmablas_zswap( m, A(0, pvt), ione, A(0, k), ione ); //blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); magmablas_zswap( i__1, F(pvt, 0), ldf, F(k, 0), ldf); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; //vn1[pvt] = vn1[k]; //vn2[pvt] = vn2[k]; #if defined(PRECISION_d) || defined(PRECISION_z) //magma_dswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_dswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_dswap( 2, &vn1[pvt], n+offset, &vn1[k], n+offset ); #else //magma_sswap( 1, &vn1[pvt], 1, &vn1[k], 1 ); //magma_sswap( 1, &vn2[pvt], 1, &vn2[k], 1 ); magma_sswap(2, &vn1[pvt], n+offset, &vn1[k], n+offset); #endif } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j){ *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ //#define RIGHT_UPDATE #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(offset+nb, 0), lda, F(k, 0), ldf, c_one, A(offset+nb, k), ione ); #else i__1 = m - rk; i__2 = k; /*blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione );*/ magma_zgemv( MagmaNoTrans, i__1, i__2, c_neg_one, A(rk, 0), lda, F(k, 0), ldf, c_one, A(rk, k), ione ); #endif /*#if (defined(PRECISION_c) || defined(PRECISION_z)) for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CNJG( *F(k,j) ); } #endif*/ } /* Generate elementary reflector H(k). 
*/ magma_zlarfg_gpu(m-rk, A(rk, k), A(rk + 1, k), &tau[k], &vn1[k], &Aks[k]); //Akk = *A(rk, k); //*A(rk, k) = c_one; //magma_zgetvector( 1, &Aks[k], 1, &Akk, 1 ); /* needed to avoid the race condition */ if (k == 0) magma_zsetvector( 1, &c_one, 1, A(rk, k), 1 ); else magma_zcopymatrix( 1, 1, A(offset, 0), 1, A(rk, k), 1 ); /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1 || k > 0) magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ //magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); magma_zgemv( MagmaConjTrans, m-rk, n-k-1, tauk, A( rk, k+1 ), lda, A( rk, k ), 1, c_zero, F( k+1, k ), 1 ); //magma_zscal( m-rk, tau[k], F( k+1, k), 1 ); //magma_int_t i__3 = nb-k-1; //magma_int_t i__4 = i__2 - i__3; //magma_int_t i__5 = nb-k; //magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, // tau[k], dA(rk +i__5, k+1+i__3), ldda, // dA(rk +i__5, k ), ione, // c_zero, dF(k+1+i__3, k ), ione ); //magma_zgetmatrix_async( i__2-i__3, 1, // dF(k + 1 +i__3, k), i__2, // F (k + 1 +i__3, k), i__2, stream ); //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, // &tau[k], A(rk, k+1), &lda, // A(rk, k ), &ione, // &c_zero, F(k+1, k ), &ione ); //magma_queue_sync( stream ); //blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, // &tau[k], A(rk, k+1+i__3), &lda, // A(rk, k ), &ione, // &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. for (j = 0; j <= k; ++j) { magma_zsetvector( 1, &c_zero, 1, F(j, k), 1 ); }*/ /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). F(1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K) := tau(K)(A(RK:M,K+1:N)' - F(1:N,1:K-1)*A(RK:M,1:K-1)') A(RK:M,K) so, F is (updated A)*V */ //if (k > 0 && k<n-1) { if (k > 0) { //magma_zgetvector( 1, &tau[k], 1, &tauk, 1 ); z__1 = MAGMA_Z_NEGATE( tauk ); #ifdef RIGHT_UPDATE i__1 = m - offset - nb; i__2 = k; magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, A(offset+nb, 0), lda, A(offset+nb, k), ione, c_zero, auxv, ione ); i__1 = k; magma_zgemv( MagmaNoTrans, n-k-1, i__1, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #else i__1 = m - rk; i__2 = k; //blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, // &z__1, A(rk, 0), &lda, // A(rk, k), &ione, // &c_zero, auxv, &ione ); magma_zgemv( MagmaConjTrans, i__1, i__2, z__1, A(rk, 0), lda, A(rk, k), ione, c_zero, auxv, ione ); //i__1 = k; //blasf77_zgemv( MagmaNoTransStr, &n, &i__1, // &c_one, F(0,0), &ldf, // auxv, &ione, // &c_one, F(0,k), &ione ); /*magma_zgemv( MagmaNoTrans, n, i__1, c_one, F(0,0), ldf, auxv, ione, c_one, F(0,k), ione );*/ /* I think we only need stricly lower-triangular part :) */ magma_zgemv( MagmaNoTrans, n-k-1, i__2, c_one, F(k+1,0), ldf, auxv, ione, c_one, F(k+1,k), ione ); #endif } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. 
*/ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; //blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, // &c_neg_one, A(rk, 0 ), &lda, // F(k+1,0 ), &ldf, // &c_one, A(rk, k+1), &lda ); #ifdef RIGHT_UPDATE /* right-looking update of rows, */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, nb-k, i__1, ione, c_neg_one, A(rk, k ), lda, F(k+1, k ), ldf, c_one, A(rk, k+1), lda ); #else /* left-looking update of rows, * * since F=A'v with original A, so no right-looking */ magma_zgemm( MagmaNoTrans, MagmaConjTrans, ione, i__1, i__2, c_neg_one, A(rk, 0 ), lda, F(k+1,0 ), ldf, c_one, A(rk, k+1), lda ); #endif } /* Update partial column norms. */ if (rk < min(m, n+offset)-1 ) { magmablas_dznrm2_row_check_adjust(n-k-1, tol3z, &vn1[k+1], &vn2[k+1], A(rk,k+1), lda, lsticcs); magma_device_sync(); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #else magma_sgetvector( 1, &lsticcs[0], 1, &lsticc, 1 ); #endif } /*if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) { // NOTE: The following 4 lines follow from the analysis in // Lapack Working Note 176. temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } }*/ //*A(rk, k) = Akk; //magma_zsetvector( 1, &Akk, 1, A(rk, k), 1 ); //magma_zswap( 1, &Aks[k], 1, A(rk, k), 1 ); ++k; } magma_zcopymatrix( 1, k, Aks, 1, A(offset, 0), lda+1 ); // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2 );*/ magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, A(rk+1, 0 ), lda, F(*kb, 0 ), ldf, c_one, A(rk+1, *kb), lda ); } /* Recomputation of difficult columns. */ if( lsticc > 0 ) { printf( " -- recompute dnorms --\n" ); magmablas_dznrm2_check(m-rk-1, n-*kb, A(rk+1,*kb), lda, &vn1[*kb], lsticcs); #if defined(PRECISION_d) || defined(PRECISION_z) magma_dcopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #else magma_scopymatrix( n-*kb, 1, &vn1[*kb], *kb, &vn2[*kb], *kb); #endif /*while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) vn1[lsticc] = cblas_dznrm2(i__1, A(rk + 1, lsticc), ione); else { // Where is the data, CPU or GPU ? double r1, r2; r1 = cblas_dznrm2(nb-k, A(rk + 1, lsticc), ione); r2 = magma_dznrm2(m-offset-nb, dA(offset + nb + 1, lsticc), ione); vn1[lsticc] = magma_dsqrt(r1*r1+r2*r2); } // NOTE: The computation of VN1( LSTICC ) relies on the fact that // SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) vn2[lsticc] = vn1[lsticc]; lsticc = itemp;*/ } magma_free(Aks); magma_free(lsticcs); return MAGMA_SUCCESS; } /* magma_zlaqps */
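/* Illustration only (not part of the MAGMA source): the scalar norm-downdating
 * test from LAPACK Working Note 176 that magmablas_dznrm2_row_check_adjust applies
 * on the GPU above, with tol3z = sqrt(eps).  vn1_j holds the running estimate of
 * column j's norm, vn2_j the norm at the last exact recomputation, and abs_a_rkj
 * the magnitude of the entry eliminated from that column this step.  Returns 1
 * when the estimate is no longer trustworthy and the column must be flagged
 * (lsticc) for exact recomputation, 0 when vn1_j is safely downdated.  Assumes
 * <math.h> for fmax and sqrt. */
static int example_downdate_colnorm( double *vn1_j, double vn2_j,
                                     double abs_a_rkj, double tol3z )
{
    double temp  = abs_a_rkj / *vn1_j;
    temp = fmax( 0.0, (1.0 + temp) * (1.0 - temp) );
    double ratio = *vn1_j / vn2_j;
    if (temp * ratio * ratio <= tol3z)
        return 1;                       /* cancellation: recompute exactly */
    *vn1_j *= sqrt( temp );             /* safe downdate of the estimate   */
    return 0;
}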
extern "C" magma_int_t magma_zpotrf3_mgpu(magma_int_t num_gpus, char uplo, magma_int_t m, magma_int_t n, magma_int_t off_i, magma_int_t off_j, magma_int_t nb, magmaDoubleComplex *d_lA[], magma_int_t ldda, magmaDoubleComplex *d_lP[], magma_int_t lddp, magmaDoubleComplex *a, magma_int_t lda, magma_int_t h, magma_queue_t stream[][3], magma_event_t event[][5], magma_int_t *info ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. Auxiliary subroutine for zpotrf2_ooc. It is multiple gpu interface to compute Cholesky of a "rectangular" matrix. The factorization has the form dA = U**H * U, if UPLO = 'U', or dA = L * L**H, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ magma_int_t j, jb, nb0, nb2, d, dd, id, j_local, j_local2, buf; char uplo_[2] = {uplo, 0}; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; int upper = lapackf77_lsame(uplo_, "U"); magmaDoubleComplex *dlpanel; magma_int_t n_local[MagmaMaxGPUs], ldpanel; const magma_int_t stream1 = 0, stream2 = 1, stream3 = 2; #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) /* used by ztrsm_work */ int trsm_nb = 128; int trsm_n = trsm_nb*((nb+trsm_nb-1)/trsm_nb); magmaDoubleComplex *d_dinvA[MagmaMaxGPUs]; magmaDoubleComplex *d_x[MagmaMaxGPUs]; #define dinvA(d,j) &(d_dinvA[(d)][(j)*trsm_nb*trsm_n]) #define dx(d,j) &(d_x[(d)][(j)*nb*m]) /* * Allocate device memory for the inversed diagonal blocks, size=N*BLOCK_SIZE */ for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); if ( (MAGMA_SUCCESS != magma_zmalloc( &d_dinvA[d], 2*trsm_nb*trsm_n )) || (MAGMA_SUCCESS != magma_zmalloc( &d_x[d], 2*nb*(upper ? n : m) )) ) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } magma_setdevice(0); #endif *info = 0; if ( (! upper) && (! 
lapackf77_lsame(uplo_, "L")) ) { *info = -1; } else if (n < 0) { *info = -2; } else if (!upper && num_gpus*ldda < max(1,n)) { *info = -4; } else if (upper && ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* initialization */ for( d=0; d<num_gpus; d++ ) { /* local-n and local-ld */ if (upper) { n_local[d] = (n/(nb*num_gpus))*nb; if (d < (n/nb)%num_gpus) n_local[d] += nb; else if (d == (n/nb)%num_gpus) n_local[d] += n%nb; } else { n_local[d] = (m/(nb*num_gpus))*nb; if (d < (m/nb)%num_gpus) n_local[d] += nb; else if (d == (m/nb)%num_gpus) n_local[d] += m%nb; } } /* == initialize the trace */ trace_init( 1, num_gpus, 3, (CUstream_st**)stream ); if (upper) { /* ---------------------------------------------- */ /* Upper-triangular case */ /* > Compute the Cholesky factorization A = U'*U. */ /* ---------------------------------------------- */ for (j=0; j<m; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; // right now, we have num_gpu buffers, so id and buf are the same.. /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (m-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); trace_gpu_start( id, stream1, "syrk", "syrk" ); magma_zherk(MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dlA(id, 0, nb*j_local), ldda, d_one, dlA(id, j, nb*j_local), ldda); trace_gpu_end( id, stream1 ); } /* send the diagonal to cpu on stream1 */ trace_gpu_start( id, stream1, "comm", "D to CPU" ); magma_zgetmatrix_async( jb, jb, dlA(id, j, nb*j_local), ldda, Aup(j,j), lda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); /* update off-diagonal blocks in the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; // number of local columns in the panel, while jb is panel-size (number of rows) if( n_local[d] > nb0 ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d,0,nb*j_local); ldpanel = ldda; // the GPU owns the row from start, and no need of synch. 
//magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } else { dlpanel = dlP(d,nb,0,buf); ldpanel = lddp; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } trace_gpu_start( d, stream2, "gemm", "gemm" ); magma_zgemm(MagmaConjTrans, MagmaNoTrans, jb, n_local[d]-nb0, j, c_neg_one, dlpanel, ldpanel, dlA(d, 0, nb0), ldda, c_one, dlA(d, j, nb0), ldda); trace_gpu_end( d, stream2 ); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for panel and factorize it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); trace_cpu_start( 0, "getrf", "getrf" ); lapackf77_zpotrf(MagmaUpperStr, &jb, Aup(j,j), &lda, info); trace_cpu_end( 0 ); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); trace_gpu_start( d, stream1, "comm", "comm" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); trace_gpu_end( d, stream1 ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); trace_gpu_start( id, stream1, "comm", "comm" ); magma_zsetmatrix_async( jb, jb, Aup(j,j), lda, dlA(id, j, nb*j_local), ldda, stream[id][stream1] ); trace_gpu_end( id, stream1 ); } /* panel-factorize the off-diagonal */ if ( (j+jb) < n) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d,j,nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } nb2 = n_local[d] - j_local2*nb; magma_setdevice(d); if( j+jb < m && d == (j/nb+1)%num_gpus ) { /* owns the next column, look-ahead next block on stream1 */ nb0 = min(nb, nb2); magmablasSetKernelStream(stream[d][stream1]); magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update trace_gpu_start( d, stream1, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb0, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); trace_gpu_end( d, stream1 ); } else if( nb2 > 0 ) { /* update all the blocks on stream2 */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor trace_gpu_start( d, stream2, "trsm", "trsm" ); magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2), ldda); #endif trace_gpu_end( d, stream2 ); } d = (d+1)%num_gpus; 
} /* end of for */ /* ========================================================== */ if( j+jb < m ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, off-diagonals are copied to cpu (synchronize at the end). * * so we have the Cholesky factor, but only diagonal submatrix of the big panel, * * on cpu at the end. */ int d2, buf2; magma_setdevice(d); /* lookahead done */ magma_queue_wait_event( stream[d][stream3], event[d][4] ); trace_gpu_start( d, stream3, "comm", "row to CPU" ); magma_zgetmatrix_async( (j+jb), nb0, dlA(d, 0, nb*j_local2), ldda, Aup(0,j+jb), lda, stream[d][stream3] ); trace_gpu_end( d, stream3 ); magma_event_record( event[d][3], stream[d][stream3] ); /* needed on pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast rows to gpus on stream2 */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); trace_gpu_start( d2, stream3, "comm", "row to GPUs" ); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // rows arrived at cpu on stream3 magma_zsetmatrix_async( j+jb, nb0, Aup(0,j+jb), lda, dlP(d2,nb,0,buf2), lddp, stream[d2][stream3] ); trace_gpu_end( d2, stream3 ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } /* =========================== */ /* update the remaining blocks */ nb2 = n_local[d]-(nb*j_local2 + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, j, nb*j_local); ldpanel = ldda; } else { dlpanel = dlP(d,0,0,buf); ldpanel = lddp; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); trace_gpu_start( d, stream2, "trsm", "trsm" ); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ztrsm_work( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for cholesky factor magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, nb2, c_one, dlpanel, ldpanel, dlA(d, j, nb*j_local2+nb0), ldda); #endif trace_gpu_end( d, stream2 ); } } } /* end of ztrsm */ } /* end of for j=1, .., n */ } else { /* ---------------------------------------------- */ /* Lower-triangular case */ /* > Compute the Cholesky factorization A = L*L'. 
*/ /* ---------------------------------------------- */ for (j=0; j<n; j+=nb) { /* Set the GPU number that holds the current panel */ id = (j/nb)%num_gpus; buf = (j/nb)%num_gpus; /* Set the local index where the current panel is */ j_local = j/(nb*num_gpus); jb = min(nb, (n-j)); /* Update the current diagonal block on stream1 */ magma_setdevice(id); if( j > 0 ) { magmablasSetKernelStream(stream[id][stream1]); magma_zherk(MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dlA(id, nb*j_local, 0), ldda, d_one, dlA(id, nb*j_local, j), ldda); } /* send the diagonal to cpu on stream1 */ magma_zgetmatrix_async( jb, jb, dlA(id, nb*j_local, j), ldda, Alo(j,j), lda, stream[id][stream1] ); /* update off-diagonal blocks of the panel */ if( j > 0 ) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { j_local2 = j_local+1; if( d > id ) j_local2 --; nb0 = nb*j_local2; if( nb0 < n_local[d] ) { magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); if( d == id ) { dlpanel = dlA(d, nb*j_local, 0); ldpanel = ldda; } else { dlpanel = dlPT(d,0,nb,buf); ldpanel = nb; magma_queue_wait_event( stream[d][stream2], event[d][0] ); // rows arrived at gpu } magma_zgemm( MagmaNoTrans, MagmaConjTrans, n_local[d]-nb0, jb, j, c_neg_one, dlA(d, nb0, 0), ldda, dlpanel, ldpanel, c_one, dlA(d, nb0, j), ldda); magma_event_record( event[d][2], stream[d][stream2] ); } d = (d+1)%num_gpus; } } /* wait for the panel and factorized it on cpu */ magma_setdevice(id); magma_queue_sync( stream[id][stream1] ); lapackf77_zpotrf(MagmaLowerStr, &jb, Alo(j,j), &lda, info); if (*info != 0) { *info = *info + j; break; } /* send the diagonal to gpus on stream1 */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } magma_setdevice(d); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlpanel, ldpanel, stream[d][stream1] ); magma_event_record( event[d][1], stream[d][stream1] ); d = (d+1)%num_gpus; } } else { magma_setdevice(id); magma_zsetmatrix_async( jb, jb, Alo(j,j), lda, dlA(id, nb*j_local, j), ldda, stream[id][stream1] ); } /* panel factorize the off-diagonal */ if ( (j+jb) < m) { d = (j/nb+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d, 0, 0, buf); ldpanel = nb; } nb2 = n_local[d] - j_local2*nb; nb0 = min(nb, nb2); magma_setdevice(d); if( j+nb < n && d == (j/nb+1)%num_gpus ) { /* owns next column, look-ahead next block on stream1 */ if ( j > 0 ) magma_queue_wait_event( stream[d][stream1], event[d][2] ); // wait for gemm update magmablasSetKernelStream(stream[d][stream1]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb, trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb0,jb, dx(d,0),nb0 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb0, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif magma_event_record( event[d][4], stream[d][stream1] ); } else if( nb2 > 0 ) { /* other gpus updating all the blocks on stream2 */ /* update the entire column */ magma_queue_wait_event( stream[d][stream2], event[d][1] ); // wait for the cholesky factor 
magmablasSetKernelStream(stream[d][stream2]); #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,0),trsm_nb ); magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,0),nb2 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda, 1, dinvA(d,0), dx(d,0) ); #else magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2, j), ldda); #endif } d = (d+1)%num_gpus; } /* end for d */ /* ========================================================== */ if( j+jb < n ) { d = (j/nb+1)%num_gpus; /* next column */ j_local2 = j_local+1; if( d > id ) j_local2--; nb0 = min(nb, n_local[d]-nb*j_local2 ); /* even on 1 gpu, we copy off-diagonal to cpu (but don't synchronize). */ /* so we have the Cholesky factor on cpu at the end. */ int d2, buf2; //#define ZPOTRF_DEVICE_TO_DEVICE #ifdef ZPOTRF_DEVICE_TO_DEVICE // lookahead done /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][4] ); if( d2 != d ) { magma_zcopymatrix_async( nb0, j+jb, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block dlA(d, nb*j_local2, 0), ldda, stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } else { magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); } } #else // lookahead done magma_setdevice(d); magma_queue_wait_event( stream[d][stream3], event[d][4] ); magma_zgetmatrix_async( nb0, j+jb, dlA(d, nb*j_local2, 0), ldda, Alo(j+jb,0), lda, stream[d][stream3] ); magma_event_record( event[d][3], stream[d][stream3] ); /* syn on rows on CPU, seem to be needed on Pluto */ //magma_queue_sync( stream[d][stream3] ); /* broadcast the rows to gpus */ buf2 = ((j+jb)/nb)%num_gpus; for( d2=0; d2<num_gpus; d2++ ) { if( d2 != d ) { magma_setdevice(d2); magma_queue_wait_event( stream[d2][stream3], event[d][3] ); // getmatrix done magma_zsetmatrix_async( nb0, j+jb, Alo(j+jb,0), lda, dlPT(d2,0,nb,buf2), nb, // first nbxnb reserved for diagonal block stream[d2][stream3] ); magma_event_record( event[d2][0], stream[d2][stream3] ); } } #endif /* =================================== */ /* updates remaining blocks on stream2 */ nb2 = n_local[d] - (j_local2*nb + nb0); if( nb2 > 0 ) { if( d == id ) { dlpanel = dlA(d, nb*j_local, j); ldpanel = ldda; } else { dlpanel = dlPT(d,0,0,buf); ldpanel = nb; } magma_setdevice(d); magmablasSetKernelStream(stream[d][stream2]); /* update the remaining blocks in the column */ #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) int flag = 0; if (flag == 0) { magma_queue_wait_event( stream[d][stream2], event[d][4] ); // lookahead -> diagonal inversion } else { magmablas_zlaset( MagmaUpperLower, trsm_nb,trsm_n, dinvA(d,flag),trsm_nb ); magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received } magmablas_zlaset( MagmaUpperLower, nb2,jb, dx(d,1),nb2 ); magmablas_ztrsm_work( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda, flag, dinvA(d,flag), dx(d,1) ); #else magma_queue_wait_event( stream[d][stream2], event[d][1] ); // panel received magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, nb2, jb, c_one, dlpanel, ldpanel, dlA(d, nb*j_local2+nb0, j), ldda); #endif } } } } } /* end of 
else not upper */ /* == finalize the trace == */ trace_finalize( "zpotrf.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magma_setdevice(d); for( j=0; j<3; j++ ) { magma_queue_sync( stream[d][j] ); } #if (defined(PRECISION_d) || defined(PRECISION_s)) && defined(ZTRSM_WORK) magma_free( d_dinvA[d] ); magma_free( d_x[d] ); #endif magmablasSetKernelStream(NULL); } magma_setdevice(0); return *info; } /* magma_zpotrf_mgpu */
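/* Illustration only (plain CUDA runtime, not MAGMA): the record/wait pattern
 * behind the event[d][*] bookkeeping above, stripped of the MAGMA stream/event
 * wrappers.  The look-ahead TRSM runs on stream1; the trailing-matrix update on
 * stream2 must not start before the panel it consumes is ready, so stream1
 * records an event after the TRSM (the role of event[d][4]) and stream2 waits on
 * it without blocking the host.  Assumes <cuda_runtime.h>. */
static void example_lookahead_dependency( void )
{
    cudaStream_t stream1, stream2;      /* panel stream and update stream      */
    cudaEvent_t  panel_ready;

    cudaStreamCreate( &stream1 );
    cudaStreamCreate( &stream2 );
    cudaEventCreateWithFlags( &panel_ready, cudaEventDisableTiming );

    /* ... enqueue the look-ahead TRSM for the next panel on stream1 ... */
    cudaEventRecord( panel_ready, stream1 );        /* "lookahead done" marker */

    /* stream2 queues a wait: its trailing TRSM/GEMM will not start until the
     * event recorded on stream1 has completed, yet the host never blocks.     */
    cudaStreamWaitEvent( stream2, panel_ready, 0 );
    /* ... enqueue the remaining trailing-matrix updates on stream2 ... */

    cudaEventDestroy( panel_ready );
    cudaStreamDestroy( stream1 );
    cudaStreamDestroy( stream2 );
}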
/** Purpose ------- ZLAQPS computes a step of QR factorization with column pivoting of a complex M-by-N matrix A by using Blas-3. It tries to factorize NB columns from A starting from the row OFFSET+1, and updates all of the matrix with Blas-3 xGEMM. In some cases, due to catastrophic cancellations, it cannot factorize NB columns. Hence, the actual number of factorized columns is returned in KB. Block A(1:OFFSET,1:N) is accordingly pivoted, but not factorized. Arguments --------- @param[in] m INTEGER The number of rows of the matrix A. M >= 0. @param[in] n INTEGER The number of columns of the matrix A. N >= 0 @param[in] offset INTEGER The number of rows of A that have been factorized in previous steps. @param[in] nb INTEGER The number of columns to factorize. @param[out] kb INTEGER The number of columns actually factorized. @param[in,out] A COMPLEX_16 array, dimension (LDA,N) On entry, the M-by-N matrix A. On exit, block A(OFFSET+1:M,1:KB) is the triangular factor obtained and block A(1:OFFSET,1:N) has been accordingly pivoted, but no factorized. The rest of the matrix, block A(OFFSET+1:M,KB+1:N) has been updated. @param[in] lda INTEGER The leading dimension of the array A. LDA >= max(1,M). @param[in,out] jpvt INTEGER array, dimension (N) JPVT(I) = K <==> Column K of the full matrix A has been permuted into position I in AP. @param[out] tau COMPLEX_16 array, dimension (KB) The scalar factors of the elementary reflectors. @param[in,out] vn1 DOUBLE PRECISION array, dimension (N) The vector with the partial column norms. @param[in,out] vn2 DOUBLE PRECISION array, dimension (N) The vector with the exact column norms. @param[in,out] auxv COMPLEX_16 array, dimension (NB) Auxiliar vector. @param[in,out] F COMPLEX_16 array, dimension (LDF,NB) Matrix F' = L*Y'*A. @param[in] ldf INTEGER The leading dimension of the array F. LDF >= max(1,N). @ingroup magma_zgeqp3_aux ********************************************************************/ extern "C" magma_int_t magma_zlaqps( magma_int_t m, magma_int_t n, magma_int_t offset, magma_int_t nb, magma_int_t *kb, magmaDoubleComplex *A, magma_int_t lda, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *jpvt, magmaDoubleComplex *tau, double *vn1, double *vn2, magmaDoubleComplex *auxv, magmaDoubleComplex *F, magma_int_t ldf, magmaDoubleComplex_ptr dF, magma_int_t lddf) { #define A(i, j) (A + (i) + (j)*(lda )) #define dA(i, j) (dA + (i) + (j)*(ldda)) #define F(i, j) (F + (i) + (j)*(ldf )) #define dF(i, j) (dF + (i) + (j)*(lddf)) magmaDoubleComplex c_zero = MAGMA_Z_MAKE( 0.,0.); magmaDoubleComplex c_one = MAGMA_Z_MAKE( 1.,0.); magmaDoubleComplex c_neg_one = MAGMA_Z_MAKE(-1.,0.); magma_int_t ione = 1; magma_int_t i__1, i__2; double d__1; magmaDoubleComplex z__1; magma_int_t j, k, rk; magmaDoubleComplex Akk; magma_int_t pvt; double temp, temp2, tol3z; magma_int_t itemp; magma_int_t lsticc; magma_int_t lastrk; lastrk = min( m, n + offset ); tol3z = magma_dsqrt( lapackf77_dlamch("Epsilon")); magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); lsticc = 0; k = 0; while( k < nb && lsticc == 0 ) { rk = offset + k; /* Determine ith pivot column and swap if necessary */ // subtract 1 from Fortran idamax; pvt, k are 0-based. i__1 = n-k; pvt = k + blasf77_idamax( &i__1, &vn1[k], &ione ) - 1; if (pvt != k) { if (pvt >= nb) { /* 1. 
Start copy from GPU */ magma_zgetmatrix_async( m - offset - nb, 1, dA(offset + nb, pvt), ldda, A (offset + nb, pvt), lda, queue ); } /* F gets swapped so F must be sent at the end to GPU */ i__1 = k; blasf77_zswap( &i__1, F(pvt,0), &ldf, F(k,0), &ldf ); itemp = jpvt[pvt]; jpvt[pvt] = jpvt[k]; jpvt[k] = itemp; vn1[pvt] = vn1[k]; vn2[pvt] = vn2[k]; if (pvt < nb) { /* no need of transfer if pivot is within the panel */ blasf77_zswap( &m, A(0, pvt), &ione, A(0, k), &ione ); } else { /* 1. Finish copy from GPU */ magma_queue_sync( queue ); /* 2. Swap as usual on CPU */ blasf77_zswap(&m, A(0, pvt), &ione, A(0, k), &ione); /* 3. Restore the GPU */ magma_zsetmatrix_async( m - offset - nb, 1, A (offset + nb, pvt), lda, dA(offset + nb, pvt), ldda, queue ); } } /* Apply previous Householder reflectors to column K: A(RK:M,K) := A(RK:M,K) - A(RK:M,1:K-1)*F(K,1:K-1)'. Optimization: multiply with beta=0; wait for vector and subtract */ if (k > 0) { #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CONJ( *F(k,j) ); } #endif i__1 = m - rk; i__2 = k; blasf77_zgemv( MagmaNoTransStr, &i__1, &i__2, &c_neg_one, A(rk, 0), &lda, F(k, 0), &ldf, &c_one, A(rk, k), &ione ); #ifdef COMPLEX for (j = 0; j < k; ++j) { *F(k,j) = MAGMA_Z_CONJ( *F(k,j) ); } #endif } /* Generate elementary reflector H(k). */ if (rk < m-1) { i__1 = m - rk; lapackf77_zlarfg( &i__1, A(rk, k), A(rk + 1, k), &ione, &tau[k] ); } else { lapackf77_zlarfg( &ione, A(rk, k), A(rk, k), &ione, &tau[k] ); } Akk = *A(rk, k); *A(rk, k) = c_one; /* Compute Kth column of F: Compute F(K+1:N,K) := tau(K)*A(RK:M,K+1:N)'*A(RK:M,K) on the GPU */ if (k < n-1) { i__1 = m - rk; i__2 = n - k - 1; /* Send the vector to the GPU */ magma_zsetmatrix( i__1, 1, A(rk, k), lda, dA(rk,k), ldda, queue ); /* Multiply on GPU */ // was CALL ZGEMV( 'Conjugate transpose', M-RK+1, N-K, // TAU( K ), A( RK, K+1 ), LDA, // A( RK, K ), 1, // CZERO, F( K+1, K ), 1 ) magma_int_t i__3 = nb-k-1; magma_int_t i__4 = i__2 - i__3; magma_int_t i__5 = nb-k; magma_zgemv( MagmaConjTrans, i__1 - i__5, i__2 - i__3, tau[k], dA(rk +i__5, k+1+i__3), ldda, dA(rk +i__5, k ), ione, c_zero, dF(k+1+i__3, k ), ione, queue ); magma_zgetmatrix_async( i__2-i__3, 1, dF(k + 1 +i__3, k), i__2, F (k + 1 +i__3, k), i__2, queue ); blasf77_zgemv( MagmaConjTransStr, &i__1, &i__3, &tau[k], A(rk, k+1), &lda, A(rk, k ), &ione, &c_zero, F(k+1, k ), &ione ); magma_queue_sync( queue ); blasf77_zgemv( MagmaConjTransStr, &i__5, &i__4, &tau[k], A(rk, k+1+i__3), &lda, A(rk, k ), &ione, &c_one, F(k+1+i__3, k ), &ione ); } /* Padding F(1:K,K) with zeros. */ for (j = 0; j < k; ++j) { *F(j, k) = c_zero; } /* Incremental updating of F: F(1:N,K) := F(1:N,K) - tau(K)*F(1:N,1:K-1)*A(RK:M,1:K-1)'*A(RK:M,K). */ if (k > 0) { i__1 = m - rk; i__2 = k; z__1 = MAGMA_Z_NEGATE( tau[k] ); blasf77_zgemv( MagmaConjTransStr, &i__1, &i__2, &z__1, A(rk, 0), &lda, A(rk, k), &ione, &c_zero, auxv, &ione ); i__1 = k; blasf77_zgemv( MagmaNoTransStr, &n, &i__1, &c_one, F(0,0), &ldf, auxv, &ione, &c_one, F(0,k), &ione ); } /* Optimization: On the last iteration start sending F back to the GPU */ /* Update the current row of A: A(RK,K+1:N) := A(RK,K+1:N) - A(RK,1:K)*F(K+1:N,1:K)'. */ if (k < n-1) { i__1 = n - k - 1; i__2 = k + 1; blasf77_zgemm( MagmaNoTransStr, MagmaConjTransStr, &ione, &i__1, &i__2, &c_neg_one, A(rk, 0 ), &lda, F(k+1,0 ), &ldf, &c_one, A(rk, k+1), &lda ); } /* Update partial column norms. */ if (rk < lastrk) { for (j = k + 1; j < n; ++j) { if (vn1[j] != 0.) 
{ /* NOTE: The following 4 lines follow from the analysis in Lapack Working Note 176. */ temp = MAGMA_Z_ABS( *A(rk,j) ) / vn1[j]; temp = max( 0., ((1. + temp) * (1. - temp)) ); d__1 = vn1[j] / vn2[j]; temp2 = temp * (d__1 * d__1); if (temp2 <= tol3z) { vn2[j] = (double) lsticc; lsticc = j; } else { vn1[j] *= magma_dsqrt(temp); } } } } *A(rk, k) = Akk; ++k; } // leave k as the last column done --k; *kb = k + 1; rk = offset + *kb - 1; /* Apply the block reflector to the rest of the matrix: A(OFFSET+KB+1:M,KB+1:N) := A(OFFSET+KB+1:M,KB+1:N) - A(OFFSET+KB+1:M,1:KB)*F(KB+1:N,1:KB)' */ if (*kb < min(n, m - offset)) { i__1 = m - rk - 1; i__2 = n - *kb; /* Send F to the GPU */ magma_zsetmatrix( i__2, *kb, F (*kb, 0), ldf, dF(*kb, 0), i__2, queue ); magma_zgemm( MagmaNoTrans, MagmaConjTrans, i__1, i__2, *kb, c_neg_one, dA(rk+1, 0 ), ldda, dF(*kb, 0 ), i__2, c_one, dA(rk+1, *kb), ldda, queue ); } /* Recomputation of difficult columns. */ while( lsticc > 0 ) { itemp = (magma_int_t)(vn2[lsticc] >= 0. ? floor(vn2[lsticc] + .5) : -floor(.5 - vn2[lsticc])); i__1 = m - rk - 1; if (lsticc <= nb) { vn1[lsticc] = magma_cblas_dznrm2( i__1, A(rk+1,lsticc), ione ); } else { /* Where is the data, CPU or GPU ? */ double r1, r2; r1 = magma_cblas_dznrm2( nb-k, A(rk+1,lsticc), ione ); r2 = magma_dznrm2( m-offset-nb, dA(offset + nb + 1, lsticc), ione, queue ); //vn1[lsticc] = magma_dznrm2( i__1, dA(rk + 1, lsticc), ione, queue ); vn1[lsticc] = magma_dsqrt(r1*r1 + r2*r2); } /* NOTE: The computation of VN1( LSTICC ) relies on the fact that SNRM2 does not fail on vectors with norm below the value of SQRT(DLAMCH('S')) */ vn2[lsticc] = vn1[lsticc]; lsticc = itemp; } magma_queue_destroy( queue ); return MAGMA_SUCCESS; } /* magma_zlaqps */
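/*
   Example (hedged sketch): how a zgeqp3-style driver could call magma_zlaqps panel by
   panel. Only the argument order matches the routine above; the helper name, the column
   slicing of the host/device arrays, and the reuse of the work arrays are illustrative
   assumptions, not taken from this file.
*/
static void example_zlaqps_driver(
    magma_int_t m, magma_int_t n, magma_int_t nb,
    magmaDoubleComplex *A,  magma_int_t lda,
    magmaDoubleComplex_ptr dA, magma_int_t ldda,
    magma_int_t *jpvt, magmaDoubleComplex *tau,
    double *vn1, double *vn2,
    magmaDoubleComplex *auxv,
    magmaDoubleComplex *F,  magma_int_t ldf,
    magmaDoubleComplex_ptr dF, magma_int_t lddf )
{
    magma_int_t minmn = min( m, n );
    magma_int_t j = 0, kb;
    while (j < minmn) {
        magma_int_t jb = min( nb, minmn - j );
        // Try to factor jb columns starting at column j; kb reports how many
        // columns were actually factorized (kb <= jb if cancellation occurred).
        magma_zlaqps( m, n - j, j /*offset*/, jb, &kb,
                      A  + j*lda,  lda,
                      dA + j*ldda, ldda,
                      jpvt + j, tau + j, vn1 + j, vn2 + j,
                      auxv, F, ldf, dF, lddf );
        j += kb;
    }
}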
extern "C" magma_int_t magma_zgeqrs_gpu(magma_int_t m, magma_int_t n, magma_int_t nrhs, magmaDoubleComplex *dA, magma_int_t ldda, magmaDoubleComplex *tau, magmaDoubleComplex *dT, magmaDoubleComplex *dB, magma_int_t lddb, magmaDoubleComplex *hwork, magma_int_t lwork, magma_int_t *info) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver August 2013 Purpose ======= Solves the least squares problem min || A*X - C || using the QR factorization A = Q*R computed by ZGEQRF_GPU. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. M >= N >= 0. NRHS (input) INTEGER The number of columns of the matrix C. NRHS >= 0. A (input) COMPLEX_16 array on the GPU, dimension (LDDA,N) The i-th column must contain the vector which defines the elementary reflector H(i), for i = 1,2,...,n, as returned by ZGEQRF_GPU in the first n columns of its array argument A. LDDA (input) INTEGER The leading dimension of the array A, LDDA >= M. TAU (input) COMPLEX_16 array, dimension (N) TAU(i) must contain the scalar factor of the elementary reflector H(i), as returned by MAGMA_ZGEQRF_GPU. DB (input/output) COMPLEX_16 array on the GPU, dimension (LDDB,NRHS) On entry, the M-by-NRHS matrix C. On exit, the N-by-NRHS solution matrix X. DT (input) COMPLEX_16 array that is the output (the 6th argument) of magma_zgeqrf_gpu of size 2*MIN(M, N)*NB + ((N+31)/32*32 )* MAX(NB, NRHS). The array starts with a block of size MIN(M,N)*NB that stores the triangular T matrices used in the QR factorization, followed by MIN(M,N)*NB block storing the diagonal block inverses for the R matrix, followed by work space of size ((N+31)/32*32 )* MAX(NB, NRHS). LDDB (input) INTEGER The leading dimension of the array DB. LDDB >= M. HWORK (workspace/output) COMPLEX_16 array, dimension (LWORK) On exit, if INFO = 0, WORK(1) returns the optimal LWORK. LWORK (input) INTEGER The dimension of the array WORK, LWORK >= (M - N + NB)*(NRHS + NB) + NRHS*NB, where NB is the blocksize given by magma_get_zgeqrf_nb( M ). If LWORK = -1, then a workspace query is assumed; the routine only calculates the optimal size of the HWORK array, returns this value as the first entry of the WORK array. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value ===================================================================== */ #define a_ref(a_1,a_2) (dA+(a_2)*(ldda) + (a_1)) #define d_ref(a_1) (dT+(lddwork+(a_1))*nb) magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magmaDoubleComplex *dwork; magma_int_t i, k, lddwork, rows, ib; magma_int_t ione = 1; magma_int_t nb = magma_get_zgeqrf_nb(m); magma_int_t lwkopt = (m - n + nb)*(nrhs + nb) + nrhs*nb; int lquery = (lwork == -1); hwork[0] = MAGMA_Z_MAKE( (double)lwkopt, 0. ); *info = 0; if (m < 0) *info = -1; else if (n < 0 || m < n) *info = -2; else if (nrhs < 0) *info = -3; else if (ldda < max(1,m)) *info = -5; else if (lddb < max(1,m)) *info = -9; else if (lwork < lwkopt && ! 
lquery) *info = -11; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } else if (lquery) return *info; k = min(m,n); if (k == 0) { hwork[0] = c_one; return *info; } /* B := Q' * B */ magma_zunmqr_gpu( MagmaLeft, MagmaConjTrans, m, nrhs, n, a_ref(0,0), ldda, tau, dB, lddb, hwork, lwork, dT, nb, info ); if ( *info != 0 ) { return *info; } /* Solve R*X = B(1:n,:) */ lddwork= k; if (nb < k) dwork = dT+2*lddwork*nb; else dwork = dT; // To do: Why did we have this line originally; seems to be a bug (Stan)? // dwork = dT; i = (k-1)/nb * nb; ib = n-i; rows = m-i; // TODO: this assumes that, on exit from magma_zunmqr_gpu, hwork contains // the last block of A and B (i.e., C in zunmqr). This should be fixed. // Seems this data should already be on the GPU, so could switch to // magma_ztrsm and drop the zsetmatrix. if ( nrhs == 1 ) { blasf77_ztrsv( MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, hwork, &rows, hwork+rows*ib, &ione); } else { blasf77_ztrsm( MagmaLeftStr, MagmaUpperStr, MagmaNoTransStr, MagmaNonUnitStr, &ib, &nrhs, &c_one, hwork, &rows, hwork+rows*ib, &rows); } // update the solution vector magma_zsetmatrix( ib, nrhs, hwork+rows*ib, rows, dwork+i, lddwork ); // update c if (nrhs == 1) magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, 1, c_one, dB, 1); else magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); int start = i-nb; if (nb < k) { for (i = start; i >=0; i -= nb) { ib = min(k-i, nb); rows = m -i; if (i + ib < n) { if (nrhs == 1) { magma_zgemv( MagmaNoTrans, ib, ib, c_one, d_ref(i), ib, dB+i, 1, c_zero, dwork+i, 1); magma_zgemv( MagmaNoTrans, i, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, 1, c_one, dB, 1); } else { magma_zgemm( MagmaNoTrans, MagmaNoTrans, ib, nrhs, ib, c_one, d_ref(i), ib, dB+i, lddb, c_zero, dwork+i, lddwork); magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, nrhs, ib, c_neg_one, a_ref(0, i), ldda, dwork + i, lddwork, c_one, dB, lddb); } } } } magma_zcopymatrix( (n), nrhs, dwork, lddwork, dB, lddb ); return *info; }
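/*
   Example (hedged sketch): solving min || A*X - B || with the routine above. It assumes
   the companion magma_zgeqrf_gpu of the same release, whose sixth argument produces the
   dT workspace described in the comments; the allocation sizes follow the DT and LWORK
   descriptions above, and all other names are illustrative.
*/
static magma_int_t example_zgeqrs(
    magma_int_t m, magma_int_t n, magma_int_t nrhs,
    magmaDoubleComplex *dA, magma_int_t ldda,     // already holds A on the GPU
    magmaDoubleComplex *dB, magma_int_t lddb )    // holds B; X is returned in its first n rows
{
    magma_int_t info, nb = magma_get_zgeqrf_nb( m );
    magma_int_t lwork   = (m - n + nb)*(nrhs + nb) + nrhs*nb;
    magma_int_t dT_size = 2*min(m,n)*nb + ((n+31)/32)*32 * max(nb, nrhs);

    magmaDoubleComplex *tau, *hwork, *dT;
    magma_zmalloc_cpu( &tau, n );
    magma_zmalloc_pinned( &hwork, lwork );
    magma_zmalloc( &dT, dT_size );

    magma_zgeqrf_gpu( m, n, dA, ldda, tau, dT, &info );          // A = Q*R
    if (info == 0)
        magma_zgeqrs_gpu( m, n, nrhs, dA, ldda, tau, dT,
                          dB, lddb, hwork, lwork, &info );        // X = R^{-1} * Q^H * B

    magma_free( dT );
    magma_free_pinned( hwork );
    magma_free_cpu( tau );
    return info;
}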
extern "C" magma_int_t magma_zgetrf2_gpu( magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magma_int_t *ipiv, magma_queue_t queues[2], magma_int_t *info ) { /* -- clMAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value or another error occured, such as memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define dA(i_, j_) dA, dA_offset + (i_)*nb + (j_)*nb*ldda #define dAT(i_, j_) dAT, dAT_offset + (i_)*nb*lddat + (j_)*nb #define dAP(i_, j_) dAP, (i_) + (j_)*maxm #define work(i_) (work + (i_)) magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t iinfo, nb; magma_int_t maxm, maxn, mindim; magma_int_t i, j, rows, s, lddat, ldwork; magmaDoubleComplex_ptr dAT, dAP; magmaDoubleComplex *work; size_t dAT_offset; /* Check arguments */ *info = 0; if (m < 0) *info = -1; else if (n < 0) *info = -2; else if (ldda < max(1,m)) *info = -4; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); nb = magma_get_zgetrf_nb(m); s = mindim / nb; if (nb <= 1 || nb >= min(m,n)) { /* Use CPU code. */ if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, m*n )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_zgetmatrix( m, n, dA(0,0), ldda, work(0), m, queues[0] ); lapackf77_zgetrf( &m, &n, work, &m, ipiv, info ); magma_zsetmatrix( m, n, work(0), m, dA(0,0), ldda, queues[0] ); magma_free_cpu( work ); } else { /* Use hybrid blocked code. 
*/ maxm = ((m + 31)/32)*32; maxn = ((n + 31)/32)*32; if ( MAGMA_SUCCESS != magma_zmalloc( &dAP, nb*maxm )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } // square matrices can be done in place; // rectangular requires copy to transpose if ( m == n ) { dAT = dA; dAT_offset = dA_offset; lddat = ldda; magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { lddat = maxn; // N-by-M dAT_offset = 0; if ( MAGMA_SUCCESS != magma_zmalloc( &dAT, lddat*maxm )) { magma_free( dAP ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } magmablas_ztranspose( m, n, dA(0,0), ldda, dAT(0,0), lddat, queues[0] ); } ldwork = maxm; /* if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, ldwork*nb ) ) { magma_free( dAP ); if ( dA != dAT ) magma_free( dAT ); *info = MAGMA_ERR_HOST_ALLOC; return *info; } */ cl_mem work_mapped = clCreateBuffer( gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, ldwork*nb * sizeof(magmaDoubleComplex), NULL, NULL ); work = (magmaDoubleComplex*) clEnqueueMapBuffer( queues[0], work_mapped, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, ldwork*nb * sizeof(magmaDoubleComplex), 0, NULL, NULL, NULL ); for( j=0; j < s; j++ ) { // download j-th panel magmablas_ztranspose( nb, m-j*nb, dAT(j,j), lddat, dAP(0,0), maxm, queues[0] ); clFlush( queues[0] ); magma_queue_sync( queues[0] ); magma_zgetmatrix_async( m-j*nb, nb, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL ); clFlush( queues[1] ); if ( j > 0 ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n - (j+1)*nb, nb, c_one, dAT(j-1,j-1), lddat, dAT(j-1,j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-j*nb, nb, c_neg_one, dAT(j-1,j+1), lddat, dAT(j, j-1), lddat, c_one, dAT(j, j+1), lddat, queues[0] ); } magma_queue_sync( queues[1] ); // do the cpu part rows = m - j*nb; lapackf77_zgetrf( &rows, &nb, work, &ldwork, ipiv+j*nb, &iinfo ); if ( *info == 0 && iinfo > 0 ) *info = iinfo + j*nb; for( i=j*nb; i < j*nb + nb; ++i ) { ipiv[i] += j*nb; } magmablas_zlaswp( n, dAT(0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[0] ); clFlush( queues[0] ); // upload j-th panel magma_zsetmatrix_async( m-j*nb, nb, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL ); magma_queue_sync( queues[1] ); magmablas_ztranspose( m-j*nb, nb, dAP(0,0), maxm, dAT(j,j), lddat, queues[0] ); clFlush( queues[0] ); // do the small non-parallel computations (next panel update) if ( s > (j+1) ) { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } else { magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb, nb, c_one, dAT(j, j ), lddat, dAT(j, j+1), lddat, queues[0] ); magma_zgemm( MagmaNoTrans, MagmaNoTrans, n-(j+1)*nb, m-(j+1)*nb, nb, c_neg_one, dAT(j, j+1), lddat, dAT(j+1, j ), lddat, c_one, dAT(j+1, j+1), lddat, queues[0] ); } } magma_int_t nb0 = min( m - s*nb, n - s*nb ); if ( nb0 > 0 ) { rows = m - s*nb; magmablas_ztranspose( nb0, rows, dAT(s,s), lddat, dAP(0,0), maxm, queues[0] ); clFlush( queues[0] ); magma_queue_sync( queues[0] ); magma_zgetmatrix_async( rows, nb0, dAP(0,0), maxm, work(0), ldwork, queues[1], NULL ); magma_queue_sync( queues[1] ); // do the cpu part lapackf77_zgetrf( &rows, &nb0, work, &ldwork, ipiv+s*nb, &iinfo ); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; for( i=s*nb; i < s*nb + nb0; ++i ) { ipiv[i] += s*nb; } magmablas_zlaswp( n, dAT(0,0), 
lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[0] ); clFlush( queues[0] ); // upload j-th panel magma_zsetmatrix_async( rows, nb0, work(0), ldwork, dAP(0,0), maxm, queues[1], NULL ); magma_queue_sync( queues[1] ); magmablas_ztranspose( rows, nb0, dAP(0,0), maxm, dAT(s,s), lddat, queues[0] ); clFlush( queues[0] ); magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n-s*nb-nb0, nb0, c_one, dAT(s,s), lddat, dAT(s,s)+nb0, lddat, queues[0] ); } // undo transpose if ( dA == dAT ) { magmablas_ztranspose_inplace( m, dAT(0,0), lddat, queues[0] ); } else { magmablas_ztranspose( n, m, dAT(0,0), lddat, dA(0,0), ldda, queues[0] ); magma_free( dAT ); } magma_queue_sync( queues[0] ); magma_queue_sync( queues[1] ); magma_free( dAP ); // magma_free_cpu( work ); clEnqueueUnmapMemObject( queues[0], work_mapped, work, 0, NULL, NULL ); clReleaseMemObject( work_mapped ); } return *info; } /* magma_zgetrf_gpu */
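/*
   Example (hedged sketch): applying the IPIV output of the LU factorization to a host
   copy of the matrix, following the convention documented above ("row i of the matrix
   was interchanged with row IPIV(i)", 1-based). The helper name and the host matrix
   A_host are illustrative.
*/
static void example_apply_ipiv(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex *A_host, magma_int_t lda,
    const magma_int_t *ipiv )
{
    for (magma_int_t i = 0; i < min(m, n); ++i) {
        magma_int_t p = ipiv[i] - 1;   // convert 1-based pivot index to 0-based row
        if (p != i) {
            // swap rows i and p across all n columns (column-major => stride lda)
            blasf77_zswap( &n, A_host + i, &lda, A_host + p, &lda );
        }
    }
}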
extern "C" magma_int_t magma_zgetrf2_mgpu(magma_int_t num_gpus, magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset, cuDoubleComplex **d_lAT, magma_int_t lddat, magma_int_t *ipiv, cuDoubleComplex **d_lAP, cuDoubleComplex *w, magma_int_t ldw, cudaStream_t streaml[][2], magma_int_t *info) #endif { /* -- MAGMA (version 1.3.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver November 2010 Purpose ======= ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. The factorization has the form A = P * L * U where P is a permutation matrix, L is lower triangular with unit diagonal elements (lower trapezoidal if m > n), and U is upper triangular (upper trapezoidal if m < n). This is the right-looking Level 3 BLAS version of the algorithm. Use two buffer to send panels.. Arguments ========= NUM_GPUS (input) INTEGER The number of GPUS to be used for the factorization. M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX_16 array on the GPU, dimension (LDDA,N). On entry, the M-by-N matrix to be factored. On exit, the factors L and U from the factorization A = P*L*U; the unit diagonal elements of L are not stored. LDDA (input) INTEGER The leading dimension of the array A. LDDA >= max(1,M). IPIV (output) INTEGER array, dimension (min(M,N)) The pivot indices; for 1 <= i <= min(M,N), row i of the matrix was interchanged with row IPIV(i). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -7, internal GPU memory allocation failed. > 0: if INFO = i, U(i,i) is exactly zero. The factorization has been completed, but the factor U is exactly singular, and division by zero will occur if it is used to solve a system of equations. ===================================================================== */ #define inAT(id,i,j) (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb) #define W(j) (w+((j)%num_gpus)*nb*ldw) cuDoubleComplex c_one = MAGMA_Z_ONE; cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; magma_int_t block_size = 32; magma_int_t iinfo, n_local[4]; magma_int_t maxm, mindim; magma_int_t i, ii, d, dd, rows, cols, s, ldpan[4]; magma_int_t id, i_local, i_local2, nb0, nb1; cuDoubleComplex *d_panel[4], *panel_local[4]; //cudaStream_t streaml[4][2]; /* Check arguments */ *info = 0; if (m < 0) *info = -2; else if (n < 0) *info = -3; else if (num_gpus*lddat < max(1,n)) *info = -5; if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } /* Quick return if possible */ if (m == 0 || n == 0) return *info; /* Function Body */ mindim = min(m, n); //nb = magma_get_zgetrf_nb(m); if( num_gpus > ceil((double)n/nb) ) { *info = -1; return *info; } { /* Use hybrid blocked code. 
*/ maxm = ((m + block_size-1)/block_size)*block_size; /* some initializations */ for(i=0; i<num_gpus; i++){ magmaSetDevice(i); n_local[i] = ((n/nb)/num_gpus)*nb; if (i < (n/nb)%num_gpus) n_local[i] += nb; else if (i == (n/nb)%num_gpus) n_local[i] += n%nb; /* workspaces */ d_panel[i] = &(d_lAP[i][nb*maxm]); /* temporary panel storage */ /* create local streams */ //magma_queue_create(&streaml[i][0]); //magma_queue_create(&streaml[i][1]); } trace_init( 1, num_gpus, 2, (CUstream_st**)streaml ); /* start sending the panel to cpu */ nb0 = min(mindim, nb); magmaSetDevice(0); magmablasSetKernelStream(streaml[0][1]); trace_gpu_start( 0, 1, "comm", "get" ); if( nb0 == nb ) magmablas_ztranspose( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm ); else magmablas_ztranspose2( d_lAP[0], maxm, inAT(0,0,0), lddat, nb0, maxm ); magma_zgetmatrix_async( m, nb0, d_lAP[0], maxm, W(0), ldw, streaml[0][1] ); trace_gpu_end( 0, 1 ); /* ------------------------------------------------------------------------------------- */ #ifdef PROFILE magma_timestr_t start_timer, end_timer; start_timer = get_current_time(); #endif s = mindim / nb; for( i=0; i<s; i++ ) { /* Set the GPU number that holds the current panel */ id = i%num_gpus; magmaSetDevice(id); /* Set the local index where the current panel is */ i_local = i/num_gpus; cols = maxm - i*nb; rows = m - i*nb; /* synchronize i-th panel from id-th gpu into work */ magma_queue_sync( streaml[id][1] ); /* i-th panel factorization */ trace_cpu_start( 0, "getrf", "getrf" ); #ifdef PANEL_FACT_MC cntxt->nb = 12; magma_zgetrf_mc(cntxt, &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); #else lapackf77_zgetrf( &rows, &nb, W(i), &ldw, ipiv+i*nb, &iinfo); #endif if ( (*info == 0) && (iinfo > 0) ) { *info = iinfo + i*nb; //break; } trace_cpu_end( 0 ); /* start sending the panel to all the gpus */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); trace_gpu_start( 0, 1, "comm", "set" ); magma_zsetmatrix_async( rows, nb, W(i), ldw, d_lAP[d], cols, streaml[d][1] ); trace_gpu_end( 0, 1 ); d = (d+1)%num_gpus; } /* apply the pivoting */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 1, "pivot", "pivot" ); if( dd == 0 ) magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb, i*nb ); else magmablas_zpermute_long3( inAT(d,0,0), lddat, ipiv, nb, i*nb ); trace_gpu_end( d, 1 ); d = (d+1)%num_gpus; } /* update the trailing-matrix/look-ahead */ d = (i+1)%num_gpus; for( dd=0; dd<num_gpus; dd++ ) { magmaSetDevice(d); /* storage for panel */ if( d == id ) { /* the panel belongs to this gpu */ panel_local[d] = inAT(d,i,i_local); ldpan[d] = lddat; /* next column */ i_local2 = i_local+1; } else { /* the panel belongs to another gpu */ panel_local[d] = &d_panel[d][(i%2)*nb*maxm]; //panel_local[d] = d_panel[d]; ldpan[d] = nb; /* next column */ i_local2 = i_local; if( d < id ) i_local2 ++; } /* the size of the next column */ if ( s > (i+1) ) { nb0 = nb; } else { nb0 = n_local[d]-nb*(s/num_gpus); if( d < s%num_gpus ) nb0 -= nb; } if( d == (i+1)%num_gpus) { /* owns the next column, so look ahead on that column */ nb1 = nb0; magmablasSetKernelStream(streaml[d][1]); /* make sure all the pivoting has been applied */ magma_queue_sync(streaml[d][0]); trace_gpu_start( d, 1, "gemm", "gemm" ); } else { /* update the entire trailing matrix */ nb1 = n_local[d] - i_local2*nb; magmablasSetKernelStream(streaml[d][0]); /* synchronization to make sure panel arrived on gpu */ magma_queue_sync(streaml[d][1]); trace_gpu_start( d, 
0, "gemm", "gemm" ); } magmablas_ztranspose(panel_local[d], ldpan[d], d_lAP[d], cols, cols, nb); /* gpu updating the trailing matrix */ //magmablas_ztrsm( magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb, c_one, panel_local[d], ldpan[d], inAT(d, i, i_local2), lddat); //cublasZgemm magma_zgemm( MagmaNoTrans, MagmaNoTrans, nb1, m-(i+1)*nb, nb, c_neg_one, inAT(d, i, i_local2), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, inAT(d, i+1, i_local2), lddat ); if( d == (i+1)%num_gpus ) { /* Set the local index where the current panel is */ int loff = i+1; int i_local = (i+1)/num_gpus; int ldda = maxm - (i+1)*nb; int cols = m - (i+1)*nb; nb0 = min(nb, mindim - (i+1)*nb); /* size of the diagonal block */ trace_gpu_end( d, 1 ); if( nb0 > 0 ) { /* transpose the panel for sending it to cpu */ trace_gpu_start( d, 1, "comm", "get" ); if( i+1 < s ) magmablas_ztranspose( d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda ); else magmablas_ztranspose2( d_lAP[d], ldda, inAT(d,loff,i_local), lddat, nb0, ldda ); /* send the panel to cpu */ magma_zgetmatrix_async( cols, nb0, d_lAP[d], ldda, W(i+1), ldw, streaml[d][1] ); trace_gpu_end( d, 1 ); } } else { trace_gpu_end( d, 0 ); } d = (d+1)%num_gpus; } /* update the remaining matrix by gpu owning the next panel */ if( (i+1) < s ) { int i_local = (i+1)/num_gpus; int rows = m - (i+1)*nb; d = (i+1)%num_gpus; magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); trace_gpu_start( d, 0, "gemm", "gemm" ); //magmablas_ztrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, n_local[d] - (i_local+1)*nb, nb, c_one, panel_local[d], ldpan[d], inAT(d,i,i_local+1), lddat ); //cublasZgemm magma_zgemm( MagmaNoTrans, MagmaNoTrans, n_local[d]-(i_local+1)*nb, rows, nb, c_neg_one, inAT(d,i,i_local+1), lddat, &(panel_local[d][nb*ldpan[d]]), ldpan[d], c_one, inAT(d,i+1, i_local+1), lddat ); trace_gpu_end( d, 0 ); } } /* end of for i=1..s */ /* ------------------------------------------------------------------------------ */ /* Set the GPU number that holds the last panel */ id = s%num_gpus; /* Set the local index where the last panel is */ i_local = s/num_gpus; /* size of the last diagonal-block */ nb0 = min(m - s*nb, n - s*nb); rows = m - s*nb; cols = maxm - s*nb; if( nb0 > 0 ) { magmaSetDevice(id); /* wait for the last panel on cpu */ magma_queue_sync( streaml[id][1] ); /* factor on cpu */ lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo); if ( (*info == 0) && (iinfo > 0) ) *info = iinfo + s*nb; /* send the factor to gpus */ for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); i_local2 = i_local; if( d < id ) i_local2 ++; if( d == id || n_local[d] > i_local2*nb ) { magma_zsetmatrix_async( rows, nb0, W(s), ldw, d_lAP[d], cols, streaml[d][1] ); } } for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][0]); if( d == 0 ) magmablas_zpermute_long2( lddat, inAT(d,0,0), lddat, ipiv, nb0, s*nb ); else magmablas_zpermute_long3( inAT(d,0,0), lddat, ipiv, nb0, s*nb ); } for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magmablasSetKernelStream(streaml[d][1]); /* wait for the pivoting to be done */ magma_queue_sync( streaml[d][0] ); i_local2 = i_local; if( d < id ) i_local2++; if( d == id ) { /* the panel belond to this gpu */ panel_local[d] = inAT(d,s,i_local); /* next column */ nb1 = n_local[d] - i_local*nb-nb0; magmablas_ztranspose2( panel_local[d], lddat, d_lAP[d], cols, rows, nb0); if( nb1 > 0 ) //cublasZtrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], 
lddat, inAT(d,s,i_local)+nb0, lddat); } else if( n_local[d] > i_local2*nb ) { /* the panel belongs to another gpu */ panel_local[d] = &d_panel[d][(s%2)*nb*maxm]; //panel_local[d] = d_panel[d]; /* next column */ nb1 = n_local[d] - i_local2*nb; magmablas_ztranspose2( panel_local[d], nb, d_lAP[d], cols, rows, nb0); //cublasZtrsm magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit, nb1, nb0, c_one, panel_local[d], nb, inAT(d,s,i_local2), lddat); } } } /* if( nb0 > 0 ) */ /* clean up */ trace_finalize( "zgetrf_mgpu.svg","trace.css" ); for( d=0; d<num_gpus; d++ ) { magmaSetDevice(d); magma_queue_sync( streaml[d][0] ); magma_queue_sync( streaml[d][1] ); //magma_queue_destroy(streaml[d][0]); //magma_queue_destroy(streaml[d][1]); magmablasSetKernelStream(NULL); } magmaSetDevice(0); #ifdef PROFILE end_timer = get_current_time(); printf("\n Performance %f GFlop/s\n", (2./3.*n*n*n /1000000.) / GetTimerValue(start_timer, end_timer)); #endif } return *info; /* End of MAGMA_ZGETRF2_MGPU */ }
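/*
   Example (hedged sketch): the 1-D block-cyclic column distribution assumed by the
   multi-GPU routine above. Column block j lives on GPU j % num_gpus at local block
   index j / num_gpus, and n_local[d] (computed exactly as in the routine) is the
   number of columns GPU d owns. The helper name is illustrative.
*/
static magma_int_t example_n_local( magma_int_t d, magma_int_t num_gpus,
                                    magma_int_t n, magma_int_t nb )
{
    magma_int_t n_local = ((n/nb)/num_gpus)*nb;     // full block-columns from complete cycles
    if      (d <  (n/nb)%num_gpus) n_local += nb;   // one extra full block-column
    else if (d == (n/nb)%num_gpus) n_local += n%nb; // the ragged last block, if any
    return n_local;
}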
extern "C" void magma_zherk_mgpu( magma_int_t ngpu, magma_uplo_t uplo, magma_trans_t trans, magma_int_t nb, magma_int_t n, magma_int_t k, double alpha, magmaDoubleComplex_ptr dB[], magma_int_t lddb, magma_int_t b_offset, double beta, magmaDoubleComplex_ptr dC[], magma_int_t lddc, magma_int_t c_offset, magma_int_t nqueue, magma_queue_t queues[][10]) { #define dB(id, i, j) (dB[(id)]+(j)*lddb + (i)+b_offset) #define dC(id, i, j) (dC[(id)]+(j)*lddc + (i)) #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0) magma_int_t i, id, ib, ii, kk, n1; magmaDoubleComplex z_alpha = MAGMA_Z_MAKE(alpha,0.0); magmaDoubleComplex z_beta = MAGMA_Z_MAKE(beta, 0.0); magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); /* diagonal update */ for( i=0; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = STREAM_ID( i+c_offset ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); /* zher2k on diagonal block */ magma_setdevice(id); magmablasSetKernelStream( queues[id][kk] ); trace_gpu_start( id, kk, "syr2k", "syr2k" ); magma_zherk(uplo, trans, ib, k, alpha, dB(id, i, 0 ), lddb, beta, dC(id, i+c_offset, ii), lddc); trace_gpu_end( id, kk ); } /* off-diagonal update */ if (uplo == MagmaUpper) { for( i=nb; i < n; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = STREAM_ID( i+c_offset ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); magma_setdevice(id); magmablasSetKernelStream( queues[id][kk] ); magma_zgemm(MagmaNoTrans, MagmaConjTrans, i, ib, k, z_alpha, dB(id, 0, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, 0, ii), lddc); } } else { for( i=0; i < n-nb; i += nb ) { id = ((i+c_offset)/nb)%ngpu; kk = STREAM_ID( i+c_offset ); ib = min(nb, n-i); ii = nb*((i+c_offset)/(nb*ngpu)); n1 = n-i-ib; /* zgemm on off-diagonal blocks */ magma_setdevice(id); magmablasSetKernelStream( queues[id][kk] ); trace_gpu_start( id, kk, "gemm_up", "gemm_up" ); magma_zgemm(MagmaNoTrans, MagmaConjTrans, n1, ib, k, z_alpha, dB(id, i+ib, 0 ), lddb, dB(id, i, 0 ), lddb, z_beta, dC(id, i+c_offset+ib, ii), lddc); trace_gpu_end( id, kk ); } } // TODO why not sync? //for( id=0; id < ngpu; id++ ) { // magma_setdevice(id); // //for( kk=0; kk < nqueue; kk++ ) // // magma_queue_sync( queues[id][kk] ); //} magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); }
/** Purpose ------- ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix dA. The factorization has the form dA = U**H * U, if UPLO = MagmaUpper, or dA = L * L**H, if UPLO = MagmaLower, where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments --------- @param[in] uplo magma_uplo_t - = MagmaUpper: Upper triangle of dA is stored; - = MagmaLower: Lower triangle of dA is stored. @param[in] n INTEGER The order of the matrix dA. N >= 0. @param[in,out] dA COMPLEX_16 array on the GPU, dimension (LDDA,N) On entry, the Hermitian matrix dA. If UPLO = MagmaUpper, the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = MagmaLower, the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. \n On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**H * U or dA = L * L**H. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be divisible by 16. @param[out] info INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value - > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. @ingroup magma_zposv_comp ********************************************************************/ extern "C" magma_int_t magma_zpotrf_gpu( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *info ) { #ifdef HAVE_clBLAS #define dA(i_, j_) dA, ((i_) + (j_)*ldda + dA_offset) #else #define dA(i_, j_) (dA + (i_) + (j_)*ldda) #endif /* Constants */ const magmaDoubleComplex c_one = MAGMA_Z_ONE; const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; const double d_one = 1.0; const double d_neg_one = -1.0; /* Local variables */ const char* uplo_ = lapack_uplo_const( uplo ); bool upper = (uplo == MagmaUpper); magma_int_t j, jb, nb; magmaDoubleComplex *work; *info = 0; if (! upper && uplo != MagmaLower) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } nb = magma_get_zpotrf_nb( n ); if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, nb*nb )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_queue_t queues[2]; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queues[0] ); magma_queue_create( cdev, &queues[1] ); if (nb <= 1 || nb >= n) { /* Use unblocked code. */ magma_zgetmatrix( n, n, dA(0,0), ldda, work, n, queues[0] ); lapackf77_zpotrf( uplo_, &n, work, &n, info ); magma_zsetmatrix( n, n, work, n, dA(0,0), ldda, queues[0] ); } else { /* Use blocked code. */ if (upper) { //========================================================= /* Compute the Cholesky factorization A = U'*U. 
*/ for (j=0; j < n; j += nb) { // apply all previous updates to diagonal block, // then transfer it to CPU jb = min( nb, n-j ); magma_zherk( MagmaUpper, MagmaConjTrans, jb, j, d_neg_one, dA(0, j), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[0] ); // apply all previous updates to block row right of diagonal block if (j+jb < n) { magma_zgemm( MagmaConjTrans, MagmaNoTrans, jb, n-j-jb, j, c_neg_one, dA(0, j ), ldda, dA(0, j+jb), ldda, c_one, dA(j, j+jb), ldda, queues[1] ); } // simultaneous with above zgemm, transfer diagonal block, // factor it on CPU, and test for positive definiteness magma_queue_sync( queues[0] ); lapackf77_zpotrf( MagmaUpperStr, &jb, work, &jb, info ); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, queues[1] ); if (*info != 0) { *info = *info + j; break; } // apply diagonal block to block row right of diagonal block if (j+jb < n) { magma_ztrsm( MagmaLeft, MagmaUpper, MagmaConjTrans, MagmaNonUnit, jb, n-j-jb, c_one, dA(j, j), ldda, dA(j, j+jb), ldda, queues[1] ); } } } else { //========================================================= // Compute the Cholesky factorization A = L*L'. for (j=0; j < n; j += nb) { // apply all previous updates to diagonal block, // then transfer it to CPU jb = min( nb, n-j ); magma_zherk( MagmaLower, MagmaNoTrans, jb, j, d_neg_one, dA(j, 0), ldda, d_one, dA(j, j), ldda, queues[1] ); magma_queue_sync( queues[1] ); magma_zgetmatrix_async( jb, jb, dA(j, j), ldda, work, jb, queues[0] ); // apply all previous updates to block column below diagonal block if (j+jb < n) { magma_zgemm( MagmaNoTrans, MagmaConjTrans, n-j-jb, jb, j, c_neg_one, dA(j+jb, 0), ldda, dA(j, 0), ldda, c_one, dA(j+jb, j), ldda, queues[1] ); } // simultaneous with above zgemm, transfer diagonal block, // factor it on CPU, and test for positive definiteness magma_queue_sync( queues[0] ); lapackf77_zpotrf( MagmaLowerStr, &jb, work, &jb, info ); magma_zsetmatrix_async( jb, jb, work, jb, dA(j, j), ldda, queues[1] ); if (*info != 0) { *info = *info + j; break; } // apply diagonal block to block column below diagonal if (j+jb < n) { magma_ztrsm( MagmaRight, MagmaLower, MagmaConjTrans, MagmaNonUnit, n-j-jb, jb, c_one, dA(j, j), ldda, dA(j+jb, j), ldda, queues[1] ); } } } } magma_queue_destroy( queues[0] ); magma_queue_destroy( queues[1] ); magma_free_pinned( work ); return *info; } /* magma_zpotrf_gpu */
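/*
   Example (hedged): a minimal host-side driver for magma_zpotrf_gpu as declared above.
   The leading-dimension round-up and the queued set/get calls mirror patterns used
   elsewhere in this file; A_host, lda, and the helper name are illustrative.
*/
static magma_int_t example_zpotrf( magma_int_t n,
                                   magmaDoubleComplex *A_host, magma_int_t lda )
{
    magma_int_t info, ldda = ((n + 31)/32)*32;   // pad leading dimension for coalesced accesses
    magmaDoubleComplex_ptr dA;
    magma_queue_t queue;
    magma_device_t cdev;

    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );
    magma_zmalloc( &dA, ldda*n );

    magma_zsetmatrix( n, n, A_host, lda, dA, ldda, queue );      // host -> device
    magma_zpotrf_gpu( MagmaLower, n, dA, ldda, &info );          // dA := L with A = L*L^H
    if (info == 0)
        magma_zgetmatrix( n, n, dA, ldda, A_host, lda, queue );  // device -> host

    magma_free( dA );
    magma_queue_destroy( queue );
    return info;   // info > 0: the leading minor of that order is not positive definite
}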
extern "C" magma_int_t magma_zlarfb_gpu( char side, char trans, char direct, char storev, magma_int_t m, magma_int_t n, magma_int_t k, const magmaDoubleComplex *dV, magma_int_t ldv, const magmaDoubleComplex *dT, magma_int_t ldt, magmaDoubleComplex *dC, magma_int_t ldc, magmaDoubleComplex *dwork, magma_int_t ldwork ) { /* -- MAGMA (version 1.4.0) -- Univ. of Tennessee, Univ. of California Berkeley August 2013 Purpose ======= ZLARFB applies a complex block reflector H or its transpose H^H to a COMPLEX_16 m by n matrix C, from the left. Arguments ========= SIDE (input) CHARACTER = 'L': apply H or H^H from the Left = 'R': apply H or H^H from the Right TRANS (input) CHARACTER = 'N': apply H (No transpose) = 'C': apply H^H (Conjugate transpose) DIRECT (input) CHARACTER Indicates how H is formed from a product of elementary reflectors = 'F': H = H(1) H(2) . . . H(k) (Forward) = 'B': H = H(k) . . . H(2) H(1) (Backward) STOREV (input) CHARACTER Indicates how the vectors which define the elementary reflectors are stored: = 'C': Columnwise = 'R': Rowwise M (input) INTEGER The number of rows of the matrix C. N (input) INTEGER The number of columns of the matrix C. K (input) INTEGER The order of the matrix T (= the number of elementary reflectors whose product defines the block reflector). DV (input) COMPLEX_16 array on the GPU, dimension (LDV,K) if STOREV = 'C' (LDV,M) if STOREV = 'R' and SIDE = 'L' (LDV,N) if STOREV = 'R' and SIDE = 'R' The matrix V. See further details. LDV (input) INTEGER The leading dimension of the array V. If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M); if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N); if STOREV = 'R', LDV >= K. DT (input) COMPLEX_16 array on the GPU, dimension (LDT,K) The triangular k by k matrix T in the representation of the block reflector. LDT (input) INTEGER The leading dimension of the array T. LDT >= K. DC (input/output) COMPLEX_16 array on the GPU, dimension (LDC,N) On entry, the m by n matrix C. On exit, C is overwritten by H*C, or H^H*C, or C*H, or C*H^H. LDC (input) INTEGER The leading dimension of the array C. LDA >= max(1,M). WORK (workspace) COMPLEX_16 array, dimension (LDWORK,K) LDWORK (input) INTEGER The leading dimension of the array WORK. If SIDE == 'L', LDWORK >= max(1,N); if SIDE == 'R', LDWORK >= max(1,M); Further Details =============== The shape of the matrix V and the storage of the vectors which define the H(i) is best illustrated by the following example with n = 5 and k = 3. All elements including 0's and 1's are stored, unlike LAPACK. 
DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': V = ( 1 0 0 ) V = ( 1 v1 v1 v1 v1 ) ( v1 1 0 ) ( 0 1 v2 v2 v2 ) ( v1 v2 1 ) ( 0 0 1 v3 v3 ) ( v1 v2 v3 ) ( v1 v2 v3 ) DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': V = ( v1 v2 v3 ) V = ( v1 v1 1 0 0 ) ( v1 v2 v3 ) ( v2 v2 v2 1 0 ) ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) ( 0 1 v3 ) ( 0 0 1 ) =================================================================== */ magmaDoubleComplex c_zero = MAGMA_Z_ZERO; magmaDoubleComplex c_one = MAGMA_Z_ONE; magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; /* Check input arguments */ magma_int_t info = 0; if (m < 0) { info = -5; } else if (n < 0) { info = -6; } else if (k < 0) { info = -7; } else if ( ((storev == 'C' || storev == 'c') && (side == 'L' || side == 'l') && ldv < max(1,m)) || ((storev == 'C' || storev == 'c') && (side == 'R' || side == 'r') && ldv < max(1,n)) || ((storev == 'R' || storev == 'r') && ldv < k) ) { info = -9; } else if (ldt < k) { info = -11; } else if (ldc < max(1,m)) { info = -13; } else if ( ((side == 'L' || side == 'l') && ldwork < max(1,n)) || ((side == 'R' || side == 'r') && ldwork < max(1,m)) ) { info = -15; } if (info != 0) { magma_xerbla( __func__, -(info) ); return info; } /* Function Body */ if (m <= 0 || n <= 0) { return MAGMA_SUCCESS; } // opposite of trans char transt; if (trans == 'N' || trans == 'n') transt = MagmaConjTrans; else transt = MagmaNoTrans; // whether T is upper or lower triangular char uplo; if (direct == 'F' || direct == 'f') uplo = MagmaUpper; else uplo = MagmaLower; // whether V is stored transposed or not char notransV, transV; if (storev == 'C' || storev == 'c') { notransV = MagmaNoTrans; transV = MagmaConjTrans; } else { notransV = MagmaConjTrans; transV = MagmaNoTrans; } if ( side == 'l' || side == 'L' ) { // Form H C or H^H C // Comments assume H C. When forming H^H C, T gets transposed via transt. // W = C^H V magma_zgemm( MagmaConjTrans, notransV, n, k, m, c_one, dC, ldc, dV, ldv, c_zero, dwork, ldwork); // W = W T^H = C^H V T^H magma_ztrmm( MagmaRight, uplo, transt, MagmaNonUnit, n, k, c_one, dT, ldt, dwork, ldwork); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C magma_zgemm( notransV, MagmaConjTrans, m, n, k, c_neg_one, dV, ldv, dwork, ldwork, c_one, dC, ldc); } else { // Form C H or C H^H // Comments assume C H. When forming C H^H, T gets transposed via trans. // W = C V magma_zgemm( MagmaNoTrans, notransV, m, k, n, c_one, dC, ldc, dV, ldv, c_zero, dwork, ldwork); // W = W T = C V T magma_ztrmm( MagmaRight, uplo, trans, MagmaNonUnit, m, k, c_one, dT, ldt, dwork, ldwork); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H magma_zgemm( MagmaNoTrans, transV, m, n, k, c_neg_one, dwork, ldwork, dV, ldv, c_one, dC, ldc); } return MAGMA_SUCCESS; } /* magma_zlarfb */
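/*
   Example (hedged sketch): forming the triangular factor T on the CPU with LAPACK's
   zlarft and then applying the block reflector on the GPU with the char-based
   magma_zlarfb_gpu above. It assumes the MAGMA 1.4-era interface in which the set/get
   copy routines take no queue argument; V_host, T_host, and the helper name are
   illustrative. V is assumed stored columnwise, in forward order, with the unit
   diagonal and zeros filled in as the comments above require.
*/
static void example_apply_block_reflector(
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaDoubleComplex *V_host, magma_int_t ldv, magmaDoubleComplex *tau,
    magmaDoubleComplex *T_host, magma_int_t ldt,
    magmaDoubleComplex *dV, magmaDoubleComplex *dT,
    magmaDoubleComplex *dC, magma_int_t ldc,
    magmaDoubleComplex *dwork, magma_int_t ldwork )
{
    // T is the k-by-k upper triangular factor of the compact WY representation
    lapackf77_zlarft( "Forward", "Columnwise", &m, &k,
                      V_host, &ldv, tau, T_host, &ldt );
    magma_zsetmatrix( m, k, V_host, ldv, dV, ldv );
    magma_zsetmatrix( k, k, T_host, ldt, dT, ldt );
    // C := H^H * C  (apply the block reflector from the left, conjugate-transposed)
    magma_zlarfb_gpu( 'L', 'C', 'F', 'C', m, n, k,
                      dV, ldv, dT, ldt, dC, ldc, dwork, ldwork );
}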