/***************************************************************************//**
    @fn magma_copyvector_async( n, elemSize, dx_src, incx, dy_dst, incy, queue )

    Copy vector dx_src on GPU device to dy_dst on GPU device.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    With CUDA unified addressing, dx and dy can be on different GPUs.

    This version is asynchronous: it may return before the transfer finishes.
    See magma_copyvector() for a synchronous version.

    @param[in]
    n           Number of elements in vector.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dx_src      Source array of dimension (1 + (n-1)*incx), on GPU device.

    @param[in]
    incx        Increment between elements of dx_src. incx > 0.

    @param[out]
    dy_dst      Destination array of dimension (1 + (n-1)*incy), on GPU device.

    @param[in]
    incy        Increment between elements of dy_dst. incy > 0.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_copyvector
*******************************************************************************/
extern "C" void
magma_copyvector_async_internal(
    magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dx_src, magma_int_t incx,
    magma_ptr dy_dst, magma_int_t incy,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // for backwards compatibility, accepts NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    if ( incx == 1 && incy == 1 ) {
        cudaError_t status;
        status = cudaMemcpyAsync(
            dy_dst, dx_src, int(n*elemSize),
            cudaMemcpyDeviceToDevice, stream );
        check_xerror( status, func, file, line );
    }
    else {
        magma_copymatrix_async_internal(
            1, n, elemSize, dx_src, incx, dy_dst, incy, queue,
            func, file, line );
    }
}
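// ----------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the library): asynchronous
// device-to-device copy of a double vector via the type-safe wrapper
// magma_dcopyvector_async(), which supplies elemSize = sizeof(double).
/*
    magma_int_t n = 1000;
    magma_queue_t queue;
    magma_queue_create( 0, &queue );
    magmaDouble_ptr dx = NULL, dy = NULL;
    magma_dmalloc( &dx, n );
    magma_dmalloc( &dy, n );
    // ... fill dx on the device ...
    magma_dcopyvector_async( n, dx, 1, dy, 1, queue );  // contiguous: one cudaMemcpyAsync
    magma_queue_sync( queue );                          // wait before using dy
    magma_free( dx );
    magma_free( dy );
    magma_queue_destroy( queue );
*/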
/***************************************************************************//**
    @fn magma_copymatrix( m, n, elemSize, dA_src, ldda, dB_dst, lddb, queue )

    Copy all or part of matrix dA_src on GPU device to dB_dst on GPU device.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    With CUDA unified addressing, dA and dB can be on different GPUs.

    This version synchronizes the queue after the transfer.
    See magma_copymatrix_async() for an asynchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dA_src      Source array of dimension (ldda,n), on GPU device.

    @param[in]
    ldda        Leading dimension of matrix A. ldda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_copymatrix
*******************************************************************************/
extern "C" void
magma_copymatrix_q_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    magma_ptr dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cudaError_t status;
    status = cudaMemcpy2DAsync(
        dB_dst, int(lddb*elemSize),
        dA_src, int(ldda*elemSize),
        int(m*elemSize), int(n),
        cudaMemcpyDeviceToDevice, queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
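// ----------------------------------------------------------------------------
// Usage sketch (illustration only): synchronous device-to-device copy of the
// leading m x n block of a matrix via the type-safe wrapper magma_dcopymatrix(),
// given device arrays dA (ldda x n) and dB (lddb x n). Only m rows of each
// column are copied; rows between m and the leading dimensions are untouched.
/*
    magma_dcopymatrix( m, n, dA, ldda, dB, lddb, queue );
    // on return the copy is complete; no further sync is needed
*/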
/***************************************************************************//**
    @fn magma_getmatrix( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue )

    Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version synchronizes the queue after the transfer.
    See magma_getmatrix_async() for an asynchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dA_src      Source array of dimension (ldda,n), on GPU device.

    @param[in]
    ldda        Leading dimension of matrix A. ldda >= m.

    @param[out]
    hB_dst      Destination array of dimension (ldb,n), on CPU host.

    @param[in]
    ldb         Leading dimension of matrix B. ldb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_getmatrix
*******************************************************************************/
extern "C" void
magma_getmatrix_q_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    void* hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        int(m), int(n), int(elemSize),
        dA_src, int(ldda),
        hB_dst, int(ldb), queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
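// ----------------------------------------------------------------------------
// Usage sketch (illustration only): download an m x n device matrix into a
// host buffer with the type-safe wrapper magma_dgetmatrix(). The host array
// needs leading dimension lda >= m but does not need to be pinned, since this
// version synchronizes before returning.
/*
    double *hB = NULL;
    magma_dmalloc_cpu( &hB, lda*n );
    magma_dgetmatrix( m, n, dA, ldda, hB, lda, queue );
    // hB is valid here
    magma_free_cpu( hB );
*/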
// TODO compare performance with cublasZcopy BLAS function.
// But this implementation can handle any element size, not just [sdcz] precisions.
extern "C" void
magma_copyvector_q_internal(
    magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dx_src, magma_int_t incx,
    magma_ptr dy_dst, magma_int_t incy,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    if ( incx == 1 && incy == 1 ) {
        cudaError_t status;
        status = cudaMemcpyAsync(
            dy_dst, dx_src, int(n*elemSize),
            cudaMemcpyDeviceToDevice, queue->cuda_stream() );
        cudaStreamSynchronize( queue->cuda_stream() );
        check_xerror( status, func, file, line );
    }
    else {
        magma_copymatrix_q_internal(
            1, n, elemSize, dx_src, incx, dy_dst, incy, queue,
            func, file, line );
    }
}
/***************************************************************************//**
    @fn magma_getvector( n, elemSize, dx_src, incx, hy_dst, incy, queue )

    Copy vector dx_src on GPU device to hy_dst on CPU host.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version synchronizes the queue after the transfer.
    See magma_getvector_async() for an asynchronous version.

    @param[in]
    n           Number of elements in vector.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dx_src      Source array of dimension (1 + (n-1)*incx), on GPU device.

    @param[in]
    incx        Increment between elements of dx_src. incx > 0.

    @param[out]
    hy_dst      Destination array of dimension (1 + (n-1)*incy), on CPU host.

    @param[in]
    incy        Increment between elements of hy_dst. incy > 0.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_getvector
*******************************************************************************/
extern "C" void
magma_getvector_q_internal(
    magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dx_src, magma_int_t incx,
    void* hy_dst, magma_int_t incy,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    assert( queue != NULL );
    cublasStatus_t status;
    status = cublasGetVectorAsync(
        int(n), int(elemSize),
        dx_src, int(incx),
        hy_dst, int(incy), queue->cuda_stream() );
    cudaStreamSynchronize( queue->cuda_stream() );
    check_xerror( status, func, file, line );
}
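// ----------------------------------------------------------------------------
// Usage sketch (illustration only): download a strided device vector into a
// host array via the type-safe wrapper magma_dgetvector(); the transfer is
// complete when the call returns.
/*
    double *hy = NULL;
    magma_dmalloc_cpu( &hy, n );
    magma_dgetvector( n, dx, incx, hy, 1, queue );
    // hy is valid here
    magma_free_cpu( hy );
*/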
extern "C" magma_int_t magma_dapplycumilu_r_transpose( magma_d_matrix b, magma_d_matrix *x, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrU=NULL; double one = MAGMA_D_MAKE( 1.0, 0.0); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_solve( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->UT.num_rows, b.num_rows*b.num_cols/precond->UT.num_rows, &one, descrU, precond->UT.dval, precond->UT.drow, precond->UT.dcol, precond->cuinfoUT, b.dval, precond->UT.num_rows, x->dval, precond->UT.num_rows )); cleanup: cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
/***************************************************************************//**
    @fn magma_getmatrix_async( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue )

    Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hB_dst is pinned CPU memory.
    See magma_getmatrix() for a synchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dA_src      Source array of dimension (ldda,n), on GPU device.

    @param[in]
    ldda        Leading dimension of matrix A. ldda >= m.

    @param[out]
    hB_dst      Destination array of dimension (ldb,n), on CPU host.

    @param[in]
    ldb         Leading dimension of matrix B. ldb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_getmatrix
*******************************************************************************/
extern "C" void
magma_getmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    void* hB_dst, magma_int_t ldb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // for backwards compatibility, accepts NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasGetMatrixAsync(
        int(m), int(n), int(elemSize),
        dA_src, int(ldda),
        hB_dst, int(ldb), stream );
    check_xerror( status, func, file, line );
}
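// ----------------------------------------------------------------------------
// Usage sketch (illustration only): asynchronous download into pinned host
// memory via the type-safe wrapper magma_dgetmatrix_async(). Pinned memory
// (magma_dmalloc_pinned) is what allows the copy to overlap with CPU work;
// with pageable memory the transfer effectively becomes synchronous.
/*
    double *hB = NULL;
    magma_dmalloc_pinned( &hB, ldb*n );
    magma_dgetmatrix_async( m, n, dA, ldda, hB, ldb, queue );
    // ... overlap independent CPU work here ...
    magma_queue_sync( queue );    // hB is valid only after this
    magma_free_pinned( hB );
*/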
/***************************************************************************//**
    @fn magma_getvector_async( n, elemSize, dx_src, incx, hy_dst, incy, queue )

    Copy vector dx_src on GPU device to hy_dst on CPU host.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hy_dst is pinned CPU memory.
    See magma_getvector() for a synchronous version.

    @param[in]
    n           Number of elements in vector.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dx_src      Source array of dimension (1 + (n-1)*incx), on GPU device.

    @param[in]
    incx        Increment between elements of dx_src. incx > 0.

    @param[out]
    hy_dst      Destination array of dimension (1 + (n-1)*incy), on CPU host.

    @param[in]
    incy        Increment between elements of hy_dst. incy > 0.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_getvector
*******************************************************************************/
extern "C" void
magma_getvector_async_internal(
    magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dx_src, magma_int_t incx,
    void* hy_dst, magma_int_t incy,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // for backwards compatibility, accepts NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasGetVectorAsync(
        int(n), int(elemSize),
        dx_src, int(incx),
        hy_dst, int(incy), stream );
    check_xerror( status, func, file, line );
}
/***************************************************************************//**
    @fn magma_copymatrix_async( m, n, elemSize, dA_src, ldda, dB_dst, lddb, queue )

    Copy all or part of matrix dA_src on GPU device to dB_dst on GPU device.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    With CUDA unified addressing, dA and dB can be on different GPUs.

    This version is asynchronous: it may return before the transfer finishes.
    See magma_copymatrix() for a synchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    dA_src      Source array of dimension (ldda,n), on GPU device.

    @param[in]
    ldda        Leading dimension of matrix A. ldda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_copymatrix
*******************************************************************************/
extern "C" void
magma_copymatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    magma_const_ptr dA_src, magma_int_t ldda,
    magma_ptr dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // for backwards compatibility, accepts NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cudaError_t status;
    status = cudaMemcpy2DAsync(
        dB_dst, int(lddb*elemSize),
        dA_src, int(ldda*elemSize),
        int(m*elemSize), int(n),
        cudaMemcpyDeviceToDevice, stream );
    check_xerror( status, func, file, line );
}
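// ----------------------------------------------------------------------------
// Usage sketch (illustration only): asynchronous device-to-device submatrix
// copy via the type-safe wrapper magma_dcopymatrix_async(); synchronize the
// queue before the destination is read.
/*
    magma_dcopymatrix_async( m, n, dA, ldda, dB, lddb, queue );
    // ... enqueue more work on the same queue; it executes in order ...
    magma_queue_sync( queue );    // dB is valid only after this
*/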
/***************************************************************************//**
    @fn magma_setmatrix_async( m, n, elemSize, hA_src, lda, dB_dst, lddb, queue )

    Copy all or part of matrix hA_src on CPU host to dB_dst on GPU device.
    Elements may be arbitrary size.
    Type-safe versions set elemSize appropriately.

    This version is asynchronous: it may return before the transfer finishes,
    if hA_src is pinned CPU memory.
    See magma_setmatrix() for a synchronous version.

    @param[in]
    m           Number of rows of matrix A. m >= 0.

    @param[in]
    n           Number of columns of matrix A. n >= 0.

    @param[in]
    elemSize    Size of each element, e.g., sizeof(double).

    @param[in]
    hA_src      Source array of dimension (lda,n), on CPU host.

    @param[in]
    lda         Leading dimension of matrix A. lda >= m.

    @param[out]
    dB_dst      Destination array of dimension (lddb,n), on GPU device.

    @param[in]
    lddb        Leading dimension of matrix B. lddb >= m.

    @param[in]
    queue       Queue to execute in.

    @ingroup magma_setmatrix
*******************************************************************************/
extern "C" void
magma_setmatrix_async_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    magma_ptr dB_dst, magma_int_t lddb,
    magma_queue_t queue,
    const char* func, const char* file, int line )
{
    // for backwards compatibility, accepts NULL queue to mean NULL stream.
    cudaStream_t stream = NULL;
    if ( queue != NULL ) {
        stream = queue->cuda_stream();
    }
    else {
        fprintf( stderr, "Warning: %s got NULL queue\n", __func__ );
    }
    cublasStatus_t status;
    status = cublasSetMatrixAsync(
        int(m), int(n), int(elemSize),
        hA_src, int(lda),
        dB_dst, int(lddb), stream );
    check_xerror( status, func, file, line );
}
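// ----------------------------------------------------------------------------
// Usage sketch (illustration only): upload from pinned host memory with the
// type-safe wrapper magma_dsetmatrix_async(), overlapping the transfer with
// host-side work. The host buffer must stay unmodified until the queue is
// synchronized.
/*
    double *hA = NULL;
    magma_dmalloc_pinned( &hA, lda*n );
    // ... fill hA ...
    magma_dsetmatrix_async( m, n, hA, lda, dB, lddb, queue );
    // ... independent CPU work here ...
    magma_queue_sync( queue );    // safe to reuse or free hA after this
    magma_free_pinned( hA );
*/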
extern "C" magma_int_t magma_dcumiccsetup( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; #if CUDA_VERSION >= 7000 csric02Info_t info_M=NULL; void *pBuffer = NULL; #endif magma_d_matrix hA={Magma_CSR}, hACSR={Magma_CSR}, U={Magma_CSR}; CHECK( magma_dmtransfer( A, &hA, A.memory_location, Magma_CPU, queue )); U.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hACSR, hA.storage_type, Magma_CSR, queue )); // in case using fill-in if( precond->levels > 0 ){ magma_d_matrix hAL={Magma_CSR}, hAUt={Magma_CSR}; CHECK( magma_dsymbilu( &hACSR, precond->levels, &hAL, &hAUt, queue )); magma_dmfree(&hAL, queue); magma_dmfree(&hAUt, queue); } CHECK( magma_dmconvert( hACSR, &U, Magma_CSR, Magma_CSRL, queue )); magma_dmfree( &hACSR, queue ); CHECK( magma_dmtransfer(U, &(precond->M), Magma_CPU, Magma_DEV, queue )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &(precond->cuinfo) )); // use kernel to manually check for zeros n the diagonal CHECK( magma_ddiagcheck( precond->M, queue ) ); #if CUDA_VERSION >= 7000 // this version has the bug fixed where a zero on the diagonal causes a crash CHECK_CUSPARSE( cusparseCreateCsric02Info(&info_M) ); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); int buffersize; int structural_zero; int numerical_zero; CHECK_CUSPARSE( cusparseDcsric02_bufferSize( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, &buffersize ) ); CHECK( magma_malloc((void**)&pBuffer, buffersize) ); CHECK_CUSPARSE( cusparseDcsric02_analysis( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer )); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &numerical_zero ) ); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &structural_zero ) ); CHECK_CUSPARSE( cusparseDcsric02( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer) ); #else // this version contains the bug but is needed for backward compability CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrA, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrA, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); CHECK_CUSPARSE( cusparseDcsric0( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); #endif CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, 
CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // copy the matrix to precond->L and (transposed) to precond->U CHECK( magma_dmtransfer(precond->M, &(precond->L), Magma_DEV, Magma_DEV, queue )); CHECK( magma_dmtranspose( precond->L, &(precond->U), queue )); // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); } /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; magma_d_matrix hD, hR, hAt CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: #if CUDA_VERSION >= 7000 magma_free( pBuffer ); cusparseDestroyCsric02Info( info_M ); #endif cusparseDestroySolveAnalysisInfo( precond->cuinfo ); cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroyMatDescr( descrA ); cusparseDestroy( cusparseHandle ); magma_dmfree(&U, queue ); magma_dmfree(&hA, queue ); return info; }
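// ----------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the library): set up the
// incomplete Cholesky preconditioner for an SPD matrix already on the device,
// then apply it inside a solver. The apply calls magma_dapplycumicc_l() and
// magma_dapplycumicc_r() are assumed here to be the matching left/right
// triangular-solve routines; check the installed headers for exact names.
/*
    magma_d_preconditioner precond;
    precond.levels  = 0;     // no additional fill-in
    precond.maxiter = 100;   // >= 50 skips the extra iterative-solve setup
    CHECK( magma_dcumiccsetup( A, &precond, queue ));

    // y = M^{-1} b, applied as two triangular solves L z = b, L^T y = z
    CHECK( magma_dapplycumicc_l( b, &z, &precond, queue ));
    CHECK( magma_dapplycumicc_r( z, &y, &precond, queue ));
*/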
extern "C" magma_int_t magma_dcumilugeneratesolverinfo( magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; magma_d_matrix hA={Magma_CSR}, hL={Magma_CSR}, hU={Magma_CSR}; if (precond->L.memory_location != Magma_DEV ){ CHECK( magma_dmtransfer( precond->M, &hA, precond->M.memory_location, Magma_CPU, queue )); hL.diagorder_type = Magma_UNITY; CHECK( magma_dmconvert( hA, &hL , Magma_CSR, Magma_CSRL, queue )); hU.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hU , Magma_CSR, Magma_CSRU, queue )); CHECK( magma_dmtransfer( hL, &(precond->L), Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hU, &(precond->U), Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hA, queue ); magma_dmfree(&hL, queue ); magma_dmfree(&hU, queue ); } // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->L.num_rows, precond->L.nnz, descrL, precond->L.dval, precond->L.drow, precond->L.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_UPPER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->U.num_rows, precond->U.nnz, descrU, precond->U.dval, precond->U.drow, precond->U.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, precond->U.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, precond->U.num_rows, 1, MAGMA_D_ZERO, queue )); } cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
extern "C" magma_int_t magma_dcumilusetup_transpose( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix Ah1={Magma_CSR}, Ah2={Magma_CSR}; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrLT=NULL; cusparseMatDescr_t descrUT=NULL; // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); // transpose the matrix magma_dmtransfer( precond->L, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_dmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransposeconjugate( Ah2, &Ah1, queue ); magma_dmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_dmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransfer( Ah2, &(precond->LT), Magma_CPU, Magma_DEV, queue ); magma_dmfree(&Ah2, queue ); magma_dmtransfer( precond->U, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_dmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransposeconjugate( Ah2, &Ah1, queue ); magma_dmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_dmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransfer( Ah2, &(precond->UT), Magma_CPU, Magma_DEV, queue ); magma_dmfree(&Ah2, queue ); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrLT )); CHECK_CUSPARSE( cusparseSetMatType( descrLT, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrLT, CUSPARSE_DIAG_TYPE_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrLT, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrLT, CUSPARSE_FILL_MODE_UPPER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoLT )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->LT.num_rows, precond->LT.nnz, descrLT, precond->LT.dval, precond->LT.drow, precond->LT.dcol, precond->cuinfoLT )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrUT )); CHECK_CUSPARSE( cusparseSetMatType( descrUT, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrUT, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrUT, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrUT, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoUT )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->UT.num_rows, precond->UT.nnz, descrUT, precond->UT.dval, precond->UT.drow, precond->UT.dcol, precond->cuinfoUT )); cleanup: cusparseDestroyMatDescr( descrLT ); cusparseDestroyMatDescr( descrUT ); cusparseDestroy( cusparseHandle ); magma_dmfree(&Ah1, queue ); magma_dmfree(&Ah2, queue ); return info; }
extern "C" magma_int_t magma_cmtransposeconjugate( magma_c_matrix A, magma_c_matrix *B, magma_queue_t queue ) { // for symmetric matrices: convert to csc using cusparse magma_int_t info = 0; cusparseHandle_t handle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrB=NULL; magma_c_matrix ACSR={Magma_CSR}, BCSR={Magma_CSR}; magma_c_matrix A_d={Magma_CSR}, B_d={Magma_CSR}; if( A.storage_type == Magma_CSR && A.memory_location == Magma_DEV ) { // fill in information for B B->storage_type = A.storage_type; B->diagorder_type = A.diagorder_type; B->memory_location = Magma_DEV; B->num_rows = A.num_cols; // transposed B->num_cols = A.num_rows; // transposed B->nnz = A.nnz; B->true_nnz = A.true_nnz; if ( A.fill_mode == MagmaFull ) { B->fill_mode = MagmaFull; } else if ( A.fill_mode == MagmaLower ) { B->fill_mode = MagmaUpper; } else if ( A.fill_mode == MagmaUpper ) { B->fill_mode = MagmaLower; } B->dval = NULL; B->drow = NULL; B->dcol = NULL; // memory allocation CHECK( magma_cmalloc( &B->dval, B->nnz )); CHECK( magma_index_malloc( &B->drow, B->num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, B->nnz )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &handle )); CHECK_CUSPARSE( cusparseSetStream( handle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrB )); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrB, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrB, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseCcsr2csc( handle, A.num_rows, A.num_cols, A.nnz, A.dval, A.drow, A.dcol, B->dval, B->dcol, B->drow, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO) ); CHECK( magma_cmconjugate( B, queue )); } else if ( A.memory_location == Magma_CPU ){ CHECK( magma_cmtransfer( A, &A_d, A.memory_location, Magma_DEV, queue )); CHECK( magma_cmtransposeconjugate( A_d, &B_d, queue )); CHECK( magma_cmtransfer( B_d, B, Magma_DEV, A.memory_location, queue )); } else { CHECK( magma_cmconvert( A, &ACSR, A.storage_type, Magma_CSR, queue )); CHECK( magma_cmtransposeconjugate( ACSR, &BCSR, queue )); CHECK( magma_cmconvert( BCSR, B, Magma_CSR, A.storage_type, queue )); } cleanup: cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroy( handle ); magma_cmfree( &A_d, queue ); magma_cmfree( &B_d, queue ); magma_cmfree( &ACSR, queue ); magma_cmfree( &BCSR, queue ); if( info != 0 ){ magma_cmfree( B, queue ); } return info; }
extern "C" magma_int_t magma_zcuspmm( magma_z_matrix A, magma_z_matrix B, magma_z_matrix *AB, magma_queue_t queue ) { magma_int_t info = 0; magma_z_matrix C={Magma_CSR}; C.num_rows = A.num_rows; C.num_cols = B.num_cols; C.storage_type = A.storage_type; C.memory_location = A.memory_location; C.fill_mode = MagmaFull; C.val = NULL; C.col = NULL; C.row = NULL; C.rowidx = NULL; C.blockinfo = NULL; C.diag = NULL; C.dval = NULL; C.dcol = NULL; C.drow = NULL; C.drowidx = NULL; C.ddiag = NULL; magma_index_t base_t, nnz_t, baseC; cusparseHandle_t handle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrB=NULL; cusparseMatDescr_t descrC=NULL; if ( A.memory_location == Magma_DEV && B.memory_location == Magma_DEV && ( A.storage_type == Magma_CSR || A.storage_type == Magma_CSRCOO ) && ( B.storage_type == Magma_CSR || B.storage_type == Magma_CSRCOO ) ) { // CUSPARSE context / CHECK_CUSPARSE( cusparseCreate( &handle )); CHECK_CUSPARSE( cusparseSetStream( handle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrB )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrC )); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrB, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrC, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrB, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrC, CUSPARSE_INDEX_BASE_ZERO )); // nnzTotalDevHostPtr points to host memory magma_index_t *nnzTotalDevHostPtr = (magma_index_t*) &C.nnz; CHECK_CUSPARSE( cusparseSetPointerMode( handle, CUSPARSE_POINTER_MODE_HOST )); CHECK( magma_index_malloc( &C.drow, (A.num_rows + 1) )); CHECK_CUSPARSE( cusparseXcsrgemmNnz( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, B.num_cols, A.num_cols, descrA, A.nnz, A.drow, A.dcol, descrB, B.nnz, B.drow, B.dcol, descrC, C.drow, nnzTotalDevHostPtr )); if (NULL != nnzTotalDevHostPtr) { C.nnz = *nnzTotalDevHostPtr; } else { // workaround as nnz and base C are magma_int_t magma_index_getvector( 1, C.drow+C.num_rows, 1, &nnz_t, 1, queue ); magma_index_getvector( 1, C.drow, 1, &base_t, 1, queue ); C.nnz = (magma_int_t) nnz_t; baseC = (magma_int_t) base_t; C.nnz -= baseC; } CHECK( magma_index_malloc( &C.dcol, C.nnz )); CHECK( magma_zmalloc( &C.dval, C.nnz )); CHECK_CUSPARSE( cusparseZcsrgemm( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, B.num_cols, A.num_cols, descrA, A.nnz, A.dval, A.drow, A.dcol, descrB, B.nnz, B.dval, B.drow, B.dcol, descrC, C.dval, C.drow, C.dcol )); // end CUSPARSE context // //magma_device_sync(); magma_queue_sync( queue ); CHECK( magma_zmtransfer( C, AB, Magma_DEV, Magma_DEV, queue )); } else { info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroyMatDescr( descrC ); cusparseDestroy( handle ); magma_zmfree( &C, queue ); return info; }
extern "C" magma_int_t magma_dcumicgeneratesolverinfo( magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
extern "C" magma_int_t magma_d_spmv( double alpha, magma_d_matrix A, magma_d_matrix x, double beta, magma_d_matrix y, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix x2={Magma_CSR}; cusparseHandle_t cusparseHandle = 0; cusparseMatDescr_t descr = 0; // make sure RHS is a dense matrix if ( x.storage_type != Magma_DENSE ) { printf("error: only dense vectors are supported for SpMV.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } if ( A.memory_location != x.memory_location || x.memory_location != y.memory_location ) { printf("error: linear algebra objects are not located in same memory!\n"); printf("memory locations are: %d %d %d\n", A.memory_location, x.memory_location, y.memory_location ); info = MAGMA_ERR_INVALID_PTR; goto cleanup; } // DEV case if ( A.memory_location == Magma_DEV ) { if ( A.num_cols == x.num_rows && x.num_cols == 1 ) { if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); cusparseDcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, &beta, y.dval ); } else if ( A.storage_type == Magma_ELL ) { //printf("using ELLPACKT kernel for SpMV: "); CHECK( magma_dgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLPACKT ) { //printf("using ELL kernel for SpMV: "); CHECK( magma_dgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLRT ) { //printf("using ELLRT kernel for SpMV: "); CHECK( magma_dgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, A.alignment, A.blocksize, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_SELLP ) { //printf("using SELLP kernel for SpMV: "); CHECK( magma_dgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_SPMVFUNCTION ) { //printf("using DENSE kernel for SpMV: "); CHECK( magma_dcustomspmv( alpha, x, beta, y, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_BCSR ) { //printf("using CUSPARSE BCSR kernel for SpMV: "); // CUSPARSE context // cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; int mb = magma_ceildiv( A.num_rows, A.blocksize ); int nb = magma_ceildiv( A.num_cols, A.blocksize ); CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); cusparseDbsrmv( cusparseHandle, dirA, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks, &alpha, descr, A.dval, A.drow, A.dcol, A.blocksize, x.dval, &beta, y.dval ); } 
else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } else if ( A.num_cols < x.num_rows || x.num_cols > 1 ) { magma_int_t num_vecs = x.num_rows / A.num_cols * x.num_cols; if ( A.storage_type == Magma_CSR ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); if ( x.major == MagmaColMajor) { cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); } else if ( x.major == MagmaRowMajor) { /*cusparseDcsrmm2(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); */ } } else if ( A.storage_type == Magma_SELLP ) { if ( x.major == MagmaRowMajor) { CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); } else if ( x.major == MagmaColMajor) { // transpose first to row major CHECK( magma_dvtranspose( x, &x2, queue )); CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x2.dval, beta, y.dval, queue )); } } /*if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dmgemv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1 ); //printf("done.\n"); }*/ else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } } // CPU case missing! else { printf("error: CPU not yet supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); cusparseHandle = 0; descr = 0; magma_dmfree(&x2, queue ); return info; }
extern "C" magma_int_t magma_cpidr_strms( magma_c_matrix A, magma_c_matrix b, magma_c_matrix *x, magma_c_solver_par *solver_par, magma_c_preconditioner *precond_par, magma_queue_t queue ) { magma_int_t info = MAGMA_NOTCONVERGED; // prepare solver feedback solver_par->solver = Magma_PIDRMERGE; solver_par->numiter = 0; solver_par->spmv_count = 0; solver_par->init_res = 0.0; solver_par->final_res = 0.0; solver_par->iter_res = 0.0; solver_par->runtime = 0.0; // constants const magmaFloatComplex c_zero = MAGMA_C_ZERO; const magmaFloatComplex c_one = MAGMA_C_ONE; const magmaFloatComplex c_n_one = MAGMA_C_NEG_ONE; // internal user options const magma_int_t smoothing = 1; // 0 = disable, 1 = enable const float angle = 0.7; // [0-1] // local variables magma_int_t iseed[4] = {0, 0, 0, 1}; magma_int_t dof; magma_int_t s; magma_int_t distr; magma_int_t k, i, sk; magma_int_t innerflag; magma_int_t ldd; magma_int_t q; float residual; float nrm; float nrmb; float nrmr; float nrmt; float rho; magmaFloatComplex om; magmaFloatComplex gamma; // matrices and vectors magma_c_matrix dxs = {Magma_CSR}; magma_c_matrix dr = {Magma_CSR}, drs = {Magma_CSR}; magma_c_matrix dP = {Magma_CSR}, dP1 = {Magma_CSR}; magma_c_matrix dG = {Magma_CSR}, dGcol = {Magma_CSR}; magma_c_matrix dU = {Magma_CSR}; magma_c_matrix dM = {Magma_CSR}; magma_c_matrix df = {Magma_CSR}; magma_c_matrix dt = {Magma_CSR}, dtt = {Magma_CSR}; magma_c_matrix dc = {Magma_CSR}; magma_c_matrix dv = {Magma_CSR}; magma_c_matrix dlu = {Magma_CSR}; magma_c_matrix dskp = {Magma_CSR}; magma_c_matrix dalpha = {Magma_CSR}; magma_c_matrix dbeta = {Magma_CSR}; magmaFloatComplex *hMdiag = NULL; magmaFloatComplex *hskp = NULL; magmaFloatComplex *halpha = NULL; magmaFloatComplex *hbeta = NULL; magmaFloatComplex *d1 = NULL, *d2 = NULL; // queue variables const magma_int_t nqueues = 3; // number of queues magma_queue_t queues[nqueues]; // chronometry real_Double_t tempo1, tempo2; // create additional queues queues[0] = queue; for ( q = 1; q < nqueues; q++ ) { magma_queue_create( queue->device(), &(queues[q]) ); } // initial s space // TODO: add option for 's' (shadow space number) // Hack: uses '--restart' option as the shadow space number. // This is not a good idea because the default value of restart option is used to detect // if the user provided a custom restart. This means that if the default restart value // is changed then the code will think it was the user (unless the default value is // also updated in the 'if' statement below. 
s = 1; if ( solver_par->restart != 50 ) { if ( solver_par->restart > A.num_cols ) { s = A.num_cols; } else { s = solver_par->restart; } } solver_par->restart = s; // set max iterations solver_par->maxiter = min( 2 * A.num_cols, solver_par->maxiter ); // check if matrix A is square if ( A.num_rows != A.num_cols ) { //printf("Matrix A is not square.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } // |b| nrmb = magma_scnrm2( b.num_rows, b.dval, 1, queue ); if ( nrmb == 0.0 ) { magma_cscal( x->num_rows, MAGMA_C_ZERO, x->dval, 1, queue ); info = MAGMA_SUCCESS; goto cleanup; } // t = 0 // make t twice as large to contain both, dt and dr ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_cvinit( &dt, Magma_DEV, ldd, 2, c_zero, queue )); dt.num_rows = b.num_rows; dt.num_cols = 1; dt.nnz = dt.num_rows; // redirect the dr.dval to the second part of dt CHECK( magma_cvinit( &dr, Magma_DEV, b.num_rows, 1, c_zero, queue )); magma_free( dr.dval ); dr.dval = dt.dval + ldd; // r = b - A x CHECK( magma_cresidualvec( A, b, *x, &dr, &nrmr, queue )); // |r| solver_par->init_res = nrmr; solver_par->final_res = solver_par->init_res; solver_par->iter_res = solver_par->init_res; if ( solver_par->verbose > 0 ) { solver_par->res_vec[0] = (real_Double_t)nrmr; } // check if initial is guess good enough if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; goto cleanup; } // P = randn(n, s) // P = ortho(P) //--------------------------------------- // P = 0.0 CHECK( magma_cvinit( &dP, Magma_CPU, A.num_cols, s, c_zero, queue )); // P = randn(n, s) distr = 3; // 1 = unif (0,1), 2 = unif (-1,1), 3 = normal (0,1) dof = dP.num_rows * dP.num_cols; lapackf77_clarnv( &distr, iseed, &dof, dP.val ); // transfer P to device CHECK( magma_cmtransfer( dP, &dP1, Magma_CPU, Magma_DEV, queue )); magma_cmfree( &dP, queue ); // P = ortho(P1) if ( dP1.num_cols > 1 ) { // P = magma_cqr(P1), QR factorization CHECK( magma_cqr( dP1.num_rows, dP1.num_cols, dP1, dP1.ld, &dP, NULL, queue )); } else { // P = P1 / |P1| nrm = magma_scnrm2( dof, dP1.dval, 1, queue ); nrm = 1.0 / nrm; magma_csscal( dof, nrm, dP1.dval, 1, queue ); CHECK( magma_cmtransfer( dP1, &dP, Magma_DEV, Magma_DEV, queue )); } magma_cmfree( &dP1, queue ); //--------------------------------------- // allocate memory for the scalar products CHECK( magma_cmalloc_pinned( &hskp, 5 )); CHECK( magma_cvinit( &dskp, Magma_DEV, 4, 1, c_zero, queue )); CHECK( magma_cmalloc_pinned( &halpha, s )); CHECK( magma_cvinit( &dalpha, Magma_DEV, s, 1, c_zero, queue )); CHECK( magma_cmalloc_pinned( &hbeta, s )); CHECK( magma_cvinit( &dbeta, Magma_DEV, s, 1, c_zero, queue )); // workspace for merged dot product CHECK( magma_cmalloc( &d1, max(2, s) * b.num_rows )); CHECK( magma_cmalloc( &d2, max(2, s) * b.num_rows )); // smoothing enabled if ( smoothing > 0 ) { // set smoothing solution vector CHECK( magma_cmtransfer( *x, &dxs, Magma_DEV, Magma_DEV, queue )); // tt = 0 // make tt twice as large to contain both, dtt and drs ldd = magma_roundup( b.num_rows, 32 ); CHECK( magma_cvinit( &dtt, Magma_DEV, ldd, 2, c_zero, queue )); dtt.num_rows = dr.num_rows; dtt.num_cols = 1; dtt.nnz = dtt.num_rows; // redirect the drs.dval to the second part of dtt CHECK( magma_cvinit( &drs, Magma_DEV, dr.num_rows, 1, c_zero, queue )); magma_free( drs.dval ); drs.dval = dtt.dval + ldd; // set smoothing residual vector magma_ccopyvector( dr.num_rows, dr.dval, 1, drs.dval, 1, queue ); } // G(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_rows, 32 ); CHECK( magma_cvinit( &dG, 
Magma_DEV, ldd, s, c_zero, queue )); dG.num_rows = A.num_rows; } else { CHECK( magma_cvinit( &dG, Magma_DEV, A.num_rows, s, c_zero, queue )); } // dGcol represents a single column of dG, array pointer is set inside loop CHECK( magma_cvinit( &dGcol, Magma_DEV, dG.num_rows, 1, c_zero, queue )); magma_free( dGcol.dval ); // U(n,s) = 0 if ( s > 1 ) { ldd = magma_roundup( A.num_cols, 32 ); CHECK( magma_cvinit( &dU, Magma_DEV, ldd, s, c_zero, queue )); dU.num_rows = A.num_cols; } else { CHECK( magma_cvinit( &dU, Magma_DEV, A.num_cols, s, c_zero, queue )); } // M(s,s) = I CHECK( magma_cvinit( &dM, Magma_DEV, s, s, c_zero, queue )); CHECK( magma_cmalloc_pinned( &hMdiag, s )); magmablas_claset( MagmaFull, dM.num_rows, dM.num_cols, c_zero, c_one, dM.dval, dM.ld, queue ); // f = 0 CHECK( magma_cvinit( &df, Magma_DEV, dP.num_cols, 1, c_zero, queue )); // c = 0 CHECK( magma_cvinit( &dc, Magma_DEV, dM.num_cols, 1, c_zero, queue )); // v = r CHECK( magma_cmtransfer( dr, &dv, Magma_DEV, Magma_DEV, queue )); // lu = 0 CHECK( magma_cvinit( &dlu, Magma_DEV, dr.num_rows, 1, c_zero, queue )); //--------------START TIME--------------- // chronometry tempo1 = magma_sync_wtime( queue ); if ( solver_par->verbose > 0 ) { solver_par->timing[0] = 0.0; } om = MAGMA_C_ONE; gamma = MAGMA_C_ZERO; innerflag = 0; // start iteration do { solver_par->numiter++; // new RHS for small systems // f = P' r // Q1 magma_cgemvmdot_shfl( dP.num_rows, dP.num_cols, dP.dval, dr.dval, d1, d2, df.dval, queues[1] ); // skp[4] = f(k) // Q1 magma_cgetvector_async( 1, df.dval, 1, &hskp[4], 1, queues[1] ); // c(k:s) = f(k:s) // Q1 magma_ccopyvector_async( s, df.dval, 1, dc.dval, 1, queues[1] ); // c(k:s) = M(k:s,k:s) \ f(k:s) // Q1 magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, s, dM.dval, dM.ld, dc.dval, 1, queues[1] ); // shadow space loop for ( k = 0; k < s; ++k ) { sk = s - k; dGcol.dval = dG.dval + k * dG.ld; // v = r - G(:,k:s) c(k:s) // Q1 magmablas_cgemv( MagmaNoTrans, dG.num_rows, sk, c_n_one, dGcol.dval, dG.ld, &dc.dval[k], 1, c_one, dv.dval, 1, queues[1] ); // preconditioning operation // v = L \ v; // v = U \ v; // Q1 CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queues[1] )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queues[1] )); // sync Q0 --> U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) magma_queue_sync( queues[0] ); // U(:,k) = om * v + U(:,k:s) c(k:s) // Q1 magmablas_cgemv( MagmaNoTrans, dU.num_rows, sk, c_one, &dU.dval[k*dU.ld], dU.ld, &dc.dval[k], 1, om, dv.dval, 1, queues[1] ); // G(:,k) = A U(:,k) // Q1 CHECK( magma_c_spmv( c_one, A, dv, c_zero, dGcol, queues[1] )); solver_par->spmv_count++; // bi-orthogonalize the new basis vectors for ( i = 0; i < k; ++i ) { // alpha = P(:,i)' G(:,k) // Q1 halpha[i] = magma_cdotc( dP.num_rows, &dP.dval[i*dP.ld], 1, dGcol.dval, 1, queues[1] ); // implicit sync Q1 --> alpha = P(:,i)' G(:,k) // alpha = alpha / M(i,i) halpha[i] = halpha[i] / hMdiag[i]; // G(:,k) = G(:,k) - alpha * G(:,i) // Q1 magma_caxpy( dG.num_rows, -halpha[i], &dG.dval[i*dG.ld], 1, dGcol.dval, 1, queues[1] ); } // sync Q1 --> compute new G, skp[4] = f(k magma_queue_sync( queues[1] ); // new column of M = P'G, first k-1 entries are zero // M(k:s,k) = P(:,k:s)' G(:,k) // Q2 magma_cgemvmdot_shfl( dP.num_rows, sk, &dP.dval[k*dP.ld], dGcol.dval, d1, d2, &dM.dval[k*dM.ld+k], queues[2] ); // U(:,k) = v // Q0 magma_ccopyvector_async( dU.num_rows, dv.dval, 1, &dU.dval[k*dU.ld], 1, queues[0] ); // non-first s iteration if ( k > 0 ) { // alpha = dalpha // Q0 
magma_csetvector_async( k, halpha, 1, dalpha.dval, 1, queues[0] ); // U update outside of loop using GEMV // U(:,k) = U(:,k) - U(:,1:k) * alpha(1:k) // Q0 magmablas_cgemv( MagmaNoTrans, dU.num_rows, k, c_n_one, dU.dval, dU.ld, dalpha.dval, 1, c_one, &dU.dval[k*dU.ld], 1, queues[0] ); } // Mdiag(k) = M(k,k) // Q2 magma_cgetvector( 1, &dM.dval[k*dM.ld+k], 1, &hMdiag[k], 1, queues[2] ); // implicit sync Q2 --> Mdiag(k) = M(k,k) // check M(k,k) == 0 if ( MAGMA_C_EQUAL(hMdiag[k], MAGMA_C_ZERO) ) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // beta = f(k) / M(k,k) hbeta[k] = hskp[4] / hMdiag[k]; // check for nan if ( magma_c_isnan( hbeta[k] ) || magma_c_isinf( hbeta[k] )) { innerflag = 1; info = MAGMA_DIVERGENCE; break; } // non-last s iteration if ( (k + 1) < s ) { // f(k+1:s) = f(k+1:s) - beta * M(k+1:s,k) // Q1 magma_caxpy( sk-1, -hbeta[k], &dM.dval[k*dM.ld+(k+1)], 1, &df.dval[k+1], 1, queues[1] ); // c(k+1:s) = f(k+1:s) // Q1 magma_ccopyvector_async( sk-1, &df.dval[k+1], 1, &dc.dval[k+1], 1, queues[1] ); // c(k+1:s) = M(k+1:s,k+1:s) \ f(k+1:s) // Q1 magma_ctrsv( MagmaLower, MagmaNoTrans, MagmaNonUnit, sk-1, &dM.dval[(k+1)*dM.ld+(k+1)], dM.ld, &dc.dval[k+1], 1, queues[1] ); // skp[4] = f(k+1) // Q1 magma_cgetvector_async( 1, &df.dval[k+1], 1, &hskp[4], 1, queues[1] ); } // r = r - beta * G(:,k) // Q2 magma_caxpy( dr.num_rows, -hbeta[k], dGcol.dval, 1, dr.dval, 1, queues[2] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_scnrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // smoothing enabled } else { // x = x + beta * U(:,k) // Q0 magma_caxpy( x->num_rows, hbeta[k], &dU.dval[k*dU.ld], 1, x->dval, 1, queues[0] ); // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_cidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_cgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_cgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // xs = xs - gamma * (xs - x) // Q0 magma_cidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // rs = rs - gamma * t // Q2 magma_caxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // |rs| // Q2 nrmr = magma_scnrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { s = k + 1; // for the x-update outside the loop innerflag = 2; info = MAGMA_SUCCESS; break; } } // smoothing disabled if ( smoothing <= 0 && innerflag != 1 ) { // dbeta(1:s) = beta(1:s) // Q0 magma_csetvector_async( s, hbeta, 1, dbeta.dval, 1, queues[0] ); // x = x + U(:,1:s) * beta(1:s) // Q0 magmablas_cgemv( MagmaNoTrans, dU.num_rows, s, c_one, dU.dval, 
dU.ld, dbeta.dval, 1, c_one, x->dval, 1, queues[0] ); } // check convergence or iteration limit or invalid result of inner loop if ( innerflag > 0 ) { break; } // preconditioning operation // v = L \ v; // v = U \ v; // Q2 CHECK( magma_c_applyprecond_left( MagmaNoTrans, A, dv, &dlu, precond_par, queues[2] )); CHECK( magma_c_applyprecond_right( MagmaNoTrans, A, dlu, &dv, precond_par, queues[2] )); // t = A v // Q2 CHECK( magma_c_spmv( c_one, A, dv, c_zero, dt, queues[2] )); solver_par->spmv_count++; // computation of a new omega //--------------------------------------- // t't // t'r // Q2 CHECK( magma_cgemvmdot_shfl( dt.ld, 2, dt.dval, dt.dval, d1, d2, dskp.dval, queues[2] )); // skp[0-2] = dskp[0-2] // Q2 magma_cgetvector( 2, dskp.dval, 1, hskp, 1, queues[2] ); // implicit sync Q2 --> skp = dskp // |t| nrmt = magma_ssqrt( MAGMA_C_REAL(hskp[0]) ); // rho = abs((t' * r) / (|t| * |r|)) rho = MAGMA_D_ABS( MAGMA_C_REAL(hskp[1]) / (nrmt * nrmr) ); // om = (t' * r) / (|t| * |t|) om = hskp[1] / hskp[0]; if ( rho < angle ) { om = (om * angle) / rho; } //--------------------------------------- if ( MAGMA_C_EQUAL(om, MAGMA_C_ZERO) ) { info = MAGMA_DIVERGENCE; break; } // sync Q1 --> v = r magma_queue_sync( queues[1] ); // r = r - om * t // Q2 magma_caxpy( dr.num_rows, -om, dt.dval, 1, dr.dval, 1, queues[2] ); // x = x + om * v // Q0 magma_caxpy( x->num_rows, om, dv.dval, 1, x->dval, 1, queues[0] ); // smoothing disabled if ( smoothing <= 0 ) { // |r| // Q2 nrmr = magma_scnrm2( dr.num_rows, dr.dval, 1, queues[2] ); // implicit sync Q2 --> |r| // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // smoothing enabled } else { // smoothing operation //--------------------------------------- // t = rs - r // Q2 magma_cidr_smoothing_1( drs.num_rows, drs.num_cols, drs.dval, dr.dval, dtt.dval, queues[2] ); // t't // t'rs // Q2 CHECK( magma_cgemvmdot_shfl( dt.ld, 2, dtt.dval, dtt.dval, d1, d2, &dskp.dval[2], queues[2] )); // skp[2-3] = dskp[2-3] // Q2 magma_cgetvector( 2, &dskp.dval[2], 1, &hskp[2], 1, queues[2] ); // implicit sync Q2 --> skp = dskp // gamma = (t' * rs) / (t' * t) gamma = hskp[3] / hskp[2]; // xs = xs - gamma * (xs - x) // Q0 magma_cidr_smoothing_2( dxs.num_rows, dxs.num_cols, -gamma, x->dval, dxs.dval, queues[0] ); // v = r // Q1 magma_ccopyvector_async( dr.num_rows, dr.dval, 1, dv.dval, 1, queues[1] ); // rs = rs - gamma * (rs - r) // Q2 magma_caxpy( drs.num_rows, -gamma, dtt.dval, 1, drs.dval, 1, queues[2] ); // |rs| // Q2 nrmr = magma_scnrm2( drs.num_rows, drs.dval, 1, queues[2] ); // implicit sync Q2 --> |r| //--------------------------------------- } // store current timing and residual if ( solver_par->verbose > 0 ) { tempo2 = magma_sync_wtime( queue ); magma_queue_sync( queue ); if ( (solver_par->numiter) % solver_par->verbose == 0 ) { solver_par->res_vec[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)nrmr; solver_par->timing[(solver_par->numiter) / solver_par->verbose] = (real_Double_t)tempo2 - tempo1; } } // check convergence or iteration limit if ( nrmr <= solver_par->atol || nrmr/nrmb <= solver_par->rtol ) { info = MAGMA_SUCCESS; break; } } while ( solver_par->numiter + 1 <= solver_par->maxiter ); // sync all queues for ( q = 0; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { // x = xs magma_ccopyvector_async( x->num_rows, dxs.dval, 1, x->dval, 1, queue ); // r = rs magma_ccopyvector_async( dr.num_rows, drs.dval, 1, dr.dval, 1, queue ); } // get last iteration timing tempo2 = 
magma_sync_wtime( queue ); magma_queue_sync( queue ); solver_par->runtime = (real_Double_t)tempo2 - tempo1; //--------------STOP TIME---------------- // get final stats solver_par->iter_res = nrmr; CHECK( magma_cresidualvec( A, b, *x, &dr, &residual, queue )); solver_par->final_res = residual; // set solver conclusion if ( info != MAGMA_SUCCESS && info != MAGMA_DIVERGENCE ) { if ( solver_par->init_res > solver_par->final_res ) { info = MAGMA_SLOW_CONVERGENCE; } } cleanup: // free resources // sync all queues, destory additional queues magma_queue_sync( queues[0] ); for ( q = 1; q < nqueues; q++ ) { magma_queue_sync( queues[q] ); magma_queue_destroy( queues[q] ); } // smoothing enabled if ( smoothing > 0 ) { drs.dval = NULL; // needed because its pointer is redirected to dtt magma_cmfree( &dxs, queue ); magma_cmfree( &drs, queue ); magma_cmfree( &dtt, queue ); } dr.dval = NULL; // needed because its pointer is redirected to dt dGcol.dval = NULL; // needed because its pointer is redirected to dG magma_cmfree( &dr, queue ); magma_cmfree( &dP, queue ); magma_cmfree( &dP1, queue ); magma_cmfree( &dG, queue ); magma_cmfree( &dGcol, queue ); magma_cmfree( &dU, queue ); magma_cmfree( &dM, queue ); magma_cmfree( &df, queue ); magma_cmfree( &dt, queue ); magma_cmfree( &dc, queue ); magma_cmfree( &dv, queue ); magma_cmfree( &dlu, queue ); magma_cmfree( &dskp, queue ); magma_cmfree( &dalpha, queue ); magma_cmfree( &dbeta, queue ); magma_free_pinned( hMdiag ); magma_free_pinned( hskp ); magma_free_pinned( halpha ); magma_free_pinned( hbeta ); magma_free( d1 ); magma_free( d2 ); solver_par->info = info; return info; /* magma_cpidr_strms */ }
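// ----------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the library): run preconditioned
// IDR(s) with multiple streams for a system A x = b already on the device.
// magma_csolverinfo_init() is assumed here to allocate the res_vec/timing
// arrays for the requested verbosity; check the installed headers for the
// exact helper names.
/*
    magma_c_solver_par solver_par;
    magma_c_preconditioner precond_par;
    solver_par.atol    = 1e-10;
    solver_par.rtol    = 1e-6;
    solver_par.maxiter = 1000;
    solver_par.verbose = 0;
    solver_par.restart = 8;   // reused here as the shadow-space dimension s
    CHECK( magma_csolverinfo_init( &solver_par, &precond_par, queue ));
    CHECK( magma_cpidr_strms( A, b, &x, &solver_par, &precond_par, queue ));
    printf( "iterations: %lld, final residual: %e\n",
            (long long) solver_par.numiter, solver_par.final_res );
*/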