/***************************************************************************//** @fn magma_copyvector_async( n, elemSize, dx_src, incx, dy_dst, incy, queue ) Copy vector dx_src on GPU device to dy_dst on GPU device. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. With CUDA unified addressing, dx and dy can be on different GPUs. This version is asynchronous: it may return before the transfer finishes. See magma_copyvector() for a synchronous version. @param[in] n Number of elements in vector. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dx_src Source array of dimension (1 + (n-1))*incx, on GPU device. @param[in] incx Increment between elements of hx_src. incx > 0. @param[out] dy_dst Destination array of dimension (1 + (n-1))*incy, on GPU device. @param[in] incy Increment between elements of dy_dst. incy > 0. @param[in] queue Queue to execute in. @ingroup magma_copyvector *******************************************************************************/ extern "C" void magma_copyvector_async_internal( magma_int_t n, magma_int_t elemSize, magma_const_ptr dx_src, magma_int_t incx, magma_ptr dy_dst, magma_int_t incy, magma_queue_t queue, const char* func, const char* file, int line ) { // for backwards compatability, accepts NULL queue to mean NULL stream. cudaStream_t stream = NULL; if ( queue != NULL ) { stream = queue->cuda_stream(); } else { fprintf( stderr, "Warning: %s got NULL queue\n", __func__ ); } if ( incx == 1 && incy == 1 ) { cudaError_t status; status = cudaMemcpyAsync( dy_dst, dx_src, int(n*elemSize), cudaMemcpyDeviceToDevice, stream ); check_xerror( status, func, file, line ); } else { magma_copymatrix_async_internal( 1, n, elemSize, dx_src, incx, dy_dst, incy, queue, func, file, line ); } }
/***************************************************************************//** @fn magma_copymatrix( m, n, elemSize, dA_src, ldda, dB_dst, lddb, queue ) Copy all or part of matrix dA_src on GPU device to dB_dst on GPU device. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. With CUDA unified addressing, dA and dB can be on different GPUs. This version synchronizes the queue after the transfer. See magma_copymatrix_async() for an asynchronous version. @param[in] m Number of rows of matrix A. m >= 0. @param[in] n Number of columns of matrix A. n >= 0. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dA_src Source array of dimension (ldda,n). @param[in] ldda Leading dimension of matrix A. ldda >= m. @param[out] dB_dst Destination array of dimension (lddb,n), on GPU device. @param[in] lddb Leading dimension of matrix B. lddb >= m. @param[in] queue Queue to execute in. @ingroup magma_copymatrix *******************************************************************************/ extern "C" void magma_copymatrix_q_internal( magma_int_t m, magma_int_t n, magma_int_t elemSize, magma_const_ptr dA_src, magma_int_t ldda, magma_ptr dB_dst, magma_int_t lddb, magma_queue_t queue, const char* func, const char* file, int line ) { assert( queue != NULL ); cudaError_t status; status = cudaMemcpy2DAsync( dB_dst, int(lddb*elemSize), dA_src, int(ldda*elemSize), int(m*elemSize), int(n), cudaMemcpyDeviceToDevice, queue->cuda_stream() ); cudaStreamSynchronize( queue->cuda_stream() ); check_xerror( status, func, file, line ); }
/***************************************************************************//** @fn magma_getmatrix( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue ) Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. This version synchronizes the queue after the transfer. See magma_getmatrix_async() for an asynchronous version. @param[in] m Number of rows of matrix A. m >= 0. @param[in] n Number of columns of matrix A. n >= 0. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dA_src Source array of dimension (ldda,n), on GPU device. @param[in] ldda Leading dimension of matrix A. ldda >= m. @param[out] hB_dst Destination array of dimension (ldb,n), on CPU host. @param[in] ldb Leading dimension of matrix B. ldb >= m. @param[in] queue Queue to execute in. @ingroup magma_getmatrix *******************************************************************************/ extern "C" void magma_getmatrix_q_internal( magma_int_t m, magma_int_t n, magma_int_t elemSize, magma_const_ptr dA_src, magma_int_t ldda, void* hB_dst, magma_int_t ldb, magma_queue_t queue, const char* func, const char* file, int line ) { assert( queue != NULL ); cublasStatus_t status; status = cublasGetMatrixAsync( int(m), int(n), int(elemSize), dA_src, int(ldda), hB_dst, int(ldb), queue->cuda_stream() ); cudaStreamSynchronize( queue->cuda_stream() ); check_xerror( status, func, file, line ); }
// TODO compare performance with cublasZcopy BLAS function. // But this implementation can handle any element size, not just [sdcz] precisions. extern "C" void magma_copyvector_q_internal( magma_int_t n, magma_int_t elemSize, magma_const_ptr dx_src, magma_int_t incx, magma_ptr dy_dst, magma_int_t incy, magma_queue_t queue, const char* func, const char* file, int line ) { assert( queue != NULL ); if ( incx == 1 && incy == 1 ) { cudaError_t status; status = cudaMemcpyAsync( dy_dst, dx_src, int(n*elemSize), cudaMemcpyDeviceToDevice, queue->cuda_stream() ); cudaStreamSynchronize( queue->cuda_stream() ); check_xerror( status, func, file, line ); } else { magma_copymatrix_q_internal( 1, n, elemSize, dx_src, incx, dy_dst, incy, queue, func, file, line ); } }
/***************************************************************************//** @fn magma_getvector( n, elemSize, dx_src, incx, hy_dst, incy, queue ) Copy vector dx_src on GPU device to hy_dst on CPU host. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. This version synchronizes the queue after the transfer. See magma_getvector_async() for an asynchronous version. @param[in] n Number of elements in vector. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dx_src Source array of dimension (1 + (n-1))*incx, on GPU device. @param[in] incx Increment between elements of hx_src. incx > 0. @param[out] hy_dst Destination array of dimension (1 + (n-1))*incy, on CPU host. @param[in] incy Increment between elements of dy_dst. incy > 0. @param[in] queue Queue to execute in. @ingroup magma_getvector *******************************************************************************/ extern "C" void magma_getvector_q_internal( magma_int_t n, magma_int_t elemSize, magma_const_ptr dx_src, magma_int_t incx, void* hy_dst, magma_int_t incy, magma_queue_t queue, const char* func, const char* file, int line ) { cublasStatus_t status; status = cublasGetVectorAsync( int(n), int(elemSize), dx_src, int(incx), hy_dst, int(incy), queue->cuda_stream() ); cudaStreamSynchronize( queue->cuda_stream() ); check_xerror( status, func, file, line ); }
extern "C" magma_int_t magma_dapplycumilu_r_transpose( magma_d_matrix b, magma_d_matrix *x, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrU=NULL; double one = MAGMA_D_MAKE( 1.0, 0.0); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_solve( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->UT.num_rows, b.num_rows*b.num_cols/precond->UT.num_rows, &one, descrU, precond->UT.dval, precond->UT.drow, precond->UT.dcol, precond->cuinfoUT, b.dval, precond->UT.num_rows, x->dval, precond->UT.num_rows )); cleanup: cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
/***************************************************************************//** @fn magma_getmatrix_async( m, n, elemSize, dA_src, ldda, hB_dst, ldb, queue ) Copy all or part of matrix dA_src on GPU device to hB_dst on CPU host. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. This version is asynchronous: it may return before the transfer finishes, if hB_dst is pinned CPU memory. See magma_getmatrix() for a synchronous version. @param[in] m Number of rows of matrix A. m >= 0. @param[in] n Number of columns of matrix A. n >= 0. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dA_src Source array of dimension (ldda,n), on GPU device. @param[in] ldda Leading dimension of matrix A. ldda >= m. @param[out] hB_dst Destination array of dimension (ldb,n), on CPU host. @param[in] ldb Leading dimension of matrix B. ldb >= m. @param[in] queue Queue to execute in. @ingroup magma_getmatrix *******************************************************************************/ extern "C" void magma_getmatrix_async_internal( magma_int_t m, magma_int_t n, magma_int_t elemSize, magma_const_ptr dA_src, magma_int_t ldda, void* hB_dst, magma_int_t ldb, magma_queue_t queue, const char* func, const char* file, int line ) { cudaStream_t stream = NULL; if ( queue != NULL ) { stream = queue->cuda_stream(); } else { fprintf( stderr, "Warning: %s got NULL queue\n", __func__ ); } cublasStatus_t status; status = cublasGetMatrixAsync( int(m), int(n), int(elemSize), dA_src, int(ldda), hB_dst, int(ldb), stream ); check_xerror( status, func, file, line ); }
/***************************************************************************//** @fn magma_getvector_async( n, elemSize, dx_src, incx, hy_dst, incy, queue ) Copy vector dx_src on GPU device to hy_dst on CPU host. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. This version is asynchronous: it may return before the transfer finishes, if hy_dst is pinned CPU memory. See magma_getvector() for a synchronous version. @param[in] n Number of elements in vector. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dx_src Source array of dimension (1 + (n-1))*incx, on GPU device. @param[in] incx Increment between elements of hx_src. incx > 0. @param[out] hy_dst Destination array of dimension (1 + (n-1))*incy, on CPU host. @param[in] incy Increment between elements of dy_dst. incy > 0. @param[in] queue Queue to execute in. @ingroup magma_getvector *******************************************************************************/ extern "C" void magma_getvector_async_internal( magma_int_t n, magma_int_t elemSize, magma_const_ptr dx_src, magma_int_t incx, void* hy_dst, magma_int_t incy, magma_queue_t queue, const char* func, const char* file, int line ) { // for backwards compatability, accepts NULL queue to mean NULL stream. cudaStream_t stream = NULL; if ( queue != NULL ) { stream = queue->cuda_stream(); } else { fprintf( stderr, "Warning: %s got NULL queue\n", __func__ ); } cublasStatus_t status; status = cublasGetVectorAsync( int(n), int(elemSize), dx_src, int(incx), hy_dst, int(incy), stream ); check_xerror( status, func, file, line ); }
/***************************************************************************//** @fn magma_setmatrix_async( m, n, elemSize, hA_src, lda, dB_dst, lddb, queue ) Copy all or part of matrix hA_src on CPU host to dB_dst on GPU device. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. This version is asynchronous: it may return before the transfer finishes, if hA_src is pinned CPU memory. See magma_setmatrix() for a synchronous version. @param[in] m Number of rows of matrix A. m >= 0. @param[in] n Number of columns of matrix A. n >= 0. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] hA_src Source array of dimension (lda,n), on CPU host. @param[in] lda Leading dimension of matrix A. lda >= m. @param[out] dB_dst Destination array of dimension (lddb,n), on GPU device. @param[in] lddb Leading dimension of matrix B. lddb >= m. @param[in] queue Queue to execute in. @ingroup magma_setmatrix *******************************************************************************/ extern "C" void magma_setmatrix_async_internal( magma_int_t m, magma_int_t n, magma_int_t elemSize, void const* hA_src, magma_int_t lda, magma_ptr dB_dst, magma_int_t lddb, magma_queue_t queue, const char* func, const char* file, int line ) { // for backwards compatability, accepts NULL queue to mean NULL stream. cudaStream_t stream = NULL; if ( queue != NULL ) { stream = queue->cuda_stream(); } else { fprintf( stderr, "Warning: %s got NULL queue\n", __func__ ); } cublasStatus_t status; status = cublasSetMatrixAsync( int(m), int(n), int(elemSize), hA_src, int(lda), dB_dst, int(lddb), stream ); check_xerror( status, func, file, line ); }
/***************************************************************************//** @fn magma_copymatrix_async( m, n, elemSize, dA_src, ldda, dB_dst, lddb, queue ) Copy all or part of matrix dA_src on GPU device to dB_dst on GPU device. Elements may be arbitrary size. Type-safe versions set elemSize appropriately. With CUDA unified addressing, dA and dB can be on different GPUs. This version is asynchronous: it may return before the transfer finishes. See magma_copyvector() for a synchronous version. @param[in] m Number of rows of matrix A. m >= 0. @param[in] n Number of columns of matrix A. n >= 0. @param[in] elemSize Size of each element, e.g., sizeof(double). @param[in] dA_src Source array of dimension (ldda,n), on GPU device. @param[in] ldda Leading dimension of matrix A. ldda >= m. @param[out] dB_dst Destination array of dimension (lddb,n), on GPU device. @param[in] lddb Leading dimension of matrix B. lddb >= m. @param[in] queue Queue to execute in. @ingroup magma_copymatrix *******************************************************************************/ extern "C" void magma_copymatrix_async_internal( magma_int_t m, magma_int_t n, magma_int_t elemSize, magma_const_ptr dA_src, magma_int_t ldda, magma_ptr dB_dst, magma_int_t lddb, magma_queue_t queue, const char* func, const char* file, int line ) { // for backwards compatability, accepts NULL queue to mean NULL stream. cudaStream_t stream = NULL; if ( queue != NULL ) { stream = queue->cuda_stream(); } else { fprintf( stderr, "Warning: %s got NULL queue\n", __func__ ); } cudaError_t status; status = cudaMemcpy2DAsync( dB_dst, int(lddb*elemSize), dA_src, int(ldda*elemSize), int(m*elemSize), int(n), cudaMemcpyDeviceToDevice, stream ); check_xerror( status, func, file, line ); }
extern "C" magma_int_t magma_dcumicgeneratesolverinfo( magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
extern "C" magma_int_t magma_d_spmv( double alpha, magma_d_matrix A, magma_d_matrix x, double beta, magma_d_matrix y, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix x2={Magma_CSR}; cusparseHandle_t cusparseHandle = 0; cusparseMatDescr_t descr = 0; // make sure RHS is a dense matrix if ( x.storage_type != Magma_DENSE ) { printf("error: only dense vectors are supported for SpMV.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } if ( A.memory_location != x.memory_location || x.memory_location != y.memory_location ) { printf("error: linear algebra objects are not located in same memory!\n"); printf("memory locations are: %d %d %d\n", A.memory_location, x.memory_location, y.memory_location ); info = MAGMA_ERR_INVALID_PTR; goto cleanup; } // DEV case if ( A.memory_location == Magma_DEV ) { if ( A.num_cols == x.num_rows && x.num_cols == 1 ) { if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); cusparseDcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, &beta, y.dval ); } else if ( A.storage_type == Magma_ELL ) { //printf("using ELLPACKT kernel for SpMV: "); CHECK( magma_dgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLPACKT ) { //printf("using ELL kernel for SpMV: "); CHECK( magma_dgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLRT ) { //printf("using ELLRT kernel for SpMV: "); CHECK( magma_dgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, A.alignment, A.blocksize, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_SELLP ) { //printf("using SELLP kernel for SpMV: "); CHECK( magma_dgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_SPMVFUNCTION ) { //printf("using DENSE kernel for SpMV: "); CHECK( magma_dcustomspmv( alpha, x, beta, y, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_BCSR ) { //printf("using CUSPARSE BCSR kernel for SpMV: "); // CUSPARSE context // cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; int mb = magma_ceildiv( A.num_rows, A.blocksize ); int nb = magma_ceildiv( A.num_cols, A.blocksize ); CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); cusparseDbsrmv( cusparseHandle, dirA, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks, &alpha, descr, A.dval, A.drow, A.dcol, A.blocksize, x.dval, &beta, y.dval ); } else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } else if ( A.num_cols < x.num_rows || x.num_cols > 1 ) { magma_int_t num_vecs = x.num_rows / A.num_cols * x.num_cols; if ( A.storage_type == Magma_CSR ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); if ( x.major == MagmaColMajor) { cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); } else if ( x.major == MagmaRowMajor) { /*cusparseDcsrmm2(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); */ } } else if ( A.storage_type == Magma_SELLP ) { if ( x.major == MagmaRowMajor) { CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); } else if ( x.major == MagmaColMajor) { // transpose first to row major CHECK( magma_dvtranspose( x, &x2, queue )); CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x2.dval, beta, y.dval, queue )); } } /*if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dmgemv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1 ); //printf("done.\n"); }*/ else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } } // CPU case missing! else { printf("error: CPU not yet supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); cusparseHandle = 0; descr = 0; magma_dmfree(&x2, queue ); return info; }
extern "C" magma_int_t magma_dcumiccsetup( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; #if CUDA_VERSION >= 7000 csric02Info_t info_M=NULL; void *pBuffer = NULL; #endif magma_d_matrix hA={Magma_CSR}, hACSR={Magma_CSR}, U={Magma_CSR}; CHECK( magma_dmtransfer( A, &hA, A.memory_location, Magma_CPU, queue )); U.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hACSR, hA.storage_type, Magma_CSR, queue )); // in case using fill-in if( precond->levels > 0 ){ magma_d_matrix hAL={Magma_CSR}, hAUt={Magma_CSR}; CHECK( magma_dsymbilu( &hACSR, precond->levels, &hAL, &hAUt, queue )); magma_dmfree(&hAL, queue); magma_dmfree(&hAUt, queue); } CHECK( magma_dmconvert( hACSR, &U, Magma_CSR, Magma_CSRL, queue )); magma_dmfree( &hACSR, queue ); CHECK( magma_dmtransfer(U, &(precond->M), Magma_CPU, Magma_DEV, queue )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &(precond->cuinfo) )); // use kernel to manually check for zeros n the diagonal CHECK( magma_ddiagcheck( precond->M, queue ) ); #if CUDA_VERSION >= 7000 // this version has the bug fixed where a zero on the diagonal causes a crash CHECK_CUSPARSE( cusparseCreateCsric02Info(&info_M) ); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); int buffersize; int structural_zero; int numerical_zero; CHECK_CUSPARSE( cusparseDcsric02_bufferSize( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, &buffersize ) ); CHECK( magma_malloc((void**)&pBuffer, buffersize) ); CHECK_CUSPARSE( cusparseDcsric02_analysis( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer )); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &numerical_zero ) ); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &structural_zero ) ); CHECK_CUSPARSE( cusparseDcsric02( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer) ); #else // this version contains the bug but is needed for backward compability CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrA, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrA, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); CHECK_CUSPARSE( cusparseDcsric0( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); #endif CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // copy the matrix to precond->L and (transposed) to precond->U CHECK( magma_dmtransfer(precond->M, &(precond->L), Magma_DEV, Magma_DEV, queue )); CHECK( magma_dmtranspose( precond->L, &(precond->U), queue )); // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); } /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; magma_d_matrix hD, hR, hAt CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: #if CUDA_VERSION >= 7000 magma_free( pBuffer ); cusparseDestroyCsric02Info( info_M ); #endif cusparseDestroySolveAnalysisInfo( precond->cuinfo ); cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroyMatDescr( descrA ); cusparseDestroy( cusparseHandle ); magma_dmfree(&U, queue ); magma_dmfree(&hA, queue ); return info; }
extern "C" magma_int_t magma_zcuspmm( magma_z_matrix A, magma_z_matrix B, magma_z_matrix *AB, magma_queue_t queue ) { magma_int_t info = 0; magma_z_matrix C={Magma_CSR}; C.num_rows = A.num_rows; C.num_cols = B.num_cols; C.storage_type = A.storage_type; C.memory_location = A.memory_location; C.fill_mode = MagmaFull; C.val = NULL; C.col = NULL; C.row = NULL; C.rowidx = NULL; C.blockinfo = NULL; C.diag = NULL; C.dval = NULL; C.dcol = NULL; C.drow = NULL; C.drowidx = NULL; C.ddiag = NULL; magma_index_t base_t, nnz_t, baseC; cusparseHandle_t handle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrB=NULL; cusparseMatDescr_t descrC=NULL; if ( A.memory_location == Magma_DEV && B.memory_location == Magma_DEV && ( A.storage_type == Magma_CSR || A.storage_type == Magma_CSRCOO ) && ( B.storage_type == Magma_CSR || B.storage_type == Magma_CSRCOO ) ) { // CUSPARSE context / CHECK_CUSPARSE( cusparseCreate( &handle )); CHECK_CUSPARSE( cusparseSetStream( handle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrB )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrC )); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrB, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrC, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrB, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrC, CUSPARSE_INDEX_BASE_ZERO )); // nnzTotalDevHostPtr points to host memory magma_index_t *nnzTotalDevHostPtr = (magma_index_t*) &C.nnz; CHECK_CUSPARSE( cusparseSetPointerMode( handle, CUSPARSE_POINTER_MODE_HOST )); CHECK( magma_index_malloc( &C.drow, (A.num_rows + 1) )); CHECK_CUSPARSE( cusparseXcsrgemmNnz( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, B.num_cols, A.num_cols, descrA, A.nnz, A.drow, A.dcol, descrB, B.nnz, B.drow, B.dcol, descrC, C.drow, nnzTotalDevHostPtr )); if (NULL != nnzTotalDevHostPtr) { C.nnz = *nnzTotalDevHostPtr; } else { // workaround as nnz and base C are magma_int_t magma_index_getvector( 1, C.drow+C.num_rows, 1, &nnz_t, 1, queue ); magma_index_getvector( 1, C.drow, 1, &base_t, 1, queue ); C.nnz = (magma_int_t) nnz_t; baseC = (magma_int_t) base_t; C.nnz -= baseC; } CHECK( magma_index_malloc( &C.dcol, C.nnz )); CHECK( magma_zmalloc( &C.dval, C.nnz )); CHECK_CUSPARSE( cusparseZcsrgemm( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, B.num_cols, A.num_cols, descrA, A.nnz, A.dval, A.drow, A.dcol, descrB, B.nnz, B.dval, B.drow, B.dcol, descrC, C.dval, C.drow, C.dcol )); // end CUSPARSE context // //magma_device_sync(); magma_queue_sync( queue ); CHECK( magma_zmtransfer( C, AB, Magma_DEV, Magma_DEV, queue )); } else { info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroyMatDescr( descrC ); cusparseDestroy( handle ); magma_zmfree( &C, queue ); return info; }
extern "C" magma_int_t magma_dcumilugeneratesolverinfo( magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; magma_d_matrix hA={Magma_CSR}, hL={Magma_CSR}, hU={Magma_CSR}; if (precond->L.memory_location != Magma_DEV ){ CHECK( magma_dmtransfer( precond->M, &hA, precond->M.memory_location, Magma_CPU, queue )); hL.diagorder_type = Magma_UNITY; CHECK( magma_dmconvert( hA, &hL , Magma_CSR, Magma_CSRL, queue )); hU.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hU , Magma_CSR, Magma_CSRU, queue )); CHECK( magma_dmtransfer( hL, &(precond->L), Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hU, &(precond->U), Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hA, queue ); magma_dmfree(&hL, queue ); magma_dmfree(&hU, queue ); } // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->L.num_rows, precond->L.nnz, descrL, precond->L.dval, precond->L.drow, precond->L.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_UPPER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->U.num_rows, precond->U.nnz, descrU, precond->U.dval, precond->U.drow, precond->U.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, precond->U.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, precond->U.num_rows, 1, MAGMA_D_ZERO, queue )); } cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
extern "C" magma_int_t magma_dcumilusetup_transpose( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix Ah1={Magma_CSR}, Ah2={Magma_CSR}; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrLT=NULL; cusparseMatDescr_t descrUT=NULL; // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); // transpose the matrix magma_dmtransfer( precond->L, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_dmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransposeconjugate( Ah2, &Ah1, queue ); magma_dmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_dmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransfer( Ah2, &(precond->LT), Magma_CPU, Magma_DEV, queue ); magma_dmfree(&Ah2, queue ); magma_dmtransfer( precond->U, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_dmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransposeconjugate( Ah2, &Ah1, queue ); magma_dmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_dmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransfer( Ah2, &(precond->UT), Magma_CPU, Magma_DEV, queue ); magma_dmfree(&Ah2, queue ); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrLT )); CHECK_CUSPARSE( cusparseSetMatType( descrLT, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrLT, CUSPARSE_DIAG_TYPE_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrLT, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrLT, CUSPARSE_FILL_MODE_UPPER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoLT )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->LT.num_rows, precond->LT.nnz, descrLT, precond->LT.dval, precond->LT.drow, precond->LT.dcol, precond->cuinfoLT )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrUT )); CHECK_CUSPARSE( cusparseSetMatType( descrUT, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrUT, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrUT, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrUT, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoUT )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->UT.num_rows, precond->UT.nnz, descrUT, precond->UT.dval, precond->UT.drow, precond->UT.dcol, precond->cuinfoUT )); cleanup: cusparseDestroyMatDescr( descrLT ); cusparseDestroyMatDescr( descrUT ); cusparseDestroy( cusparseHandle ); magma_dmfree(&Ah1, queue ); magma_dmfree(&Ah2, queue ); return info; }
extern "C" magma_int_t magma_cmtransposeconjugate( magma_c_matrix A, magma_c_matrix *B, magma_queue_t queue ) { // for symmetric matrices: convert to csc using cusparse magma_int_t info = 0; cusparseHandle_t handle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrB=NULL; magma_c_matrix ACSR={Magma_CSR}, BCSR={Magma_CSR}; magma_c_matrix A_d={Magma_CSR}, B_d={Magma_CSR}; if( A.storage_type == Magma_CSR && A.memory_location == Magma_DEV ) { // fill in information for B B->storage_type = A.storage_type; B->diagorder_type = A.diagorder_type; B->memory_location = Magma_DEV; B->num_rows = A.num_cols; // transposed B->num_cols = A.num_rows; // transposed B->nnz = A.nnz; B->true_nnz = A.true_nnz; if ( A.fill_mode == MagmaFull ) { B->fill_mode = MagmaFull; } else if ( A.fill_mode == MagmaLower ) { B->fill_mode = MagmaUpper; } else if ( A.fill_mode == MagmaUpper ) { B->fill_mode = MagmaLower; } B->dval = NULL; B->drow = NULL; B->dcol = NULL; // memory allocation CHECK( magma_cmalloc( &B->dval, B->nnz )); CHECK( magma_index_malloc( &B->drow, B->num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, B->nnz )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &handle )); CHECK_CUSPARSE( cusparseSetStream( handle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrB )); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrB, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrB, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseCcsr2csc( handle, A.num_rows, A.num_cols, A.nnz, A.dval, A.drow, A.dcol, B->dval, B->dcol, B->drow, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO) ); CHECK( magma_cmconjugate( B, queue )); } else if ( A.memory_location == Magma_CPU ){ CHECK( magma_cmtransfer( A, &A_d, A.memory_location, Magma_DEV, queue )); CHECK( magma_cmtransposeconjugate( A_d, &B_d, queue )); CHECK( magma_cmtransfer( B_d, B, Magma_DEV, A.memory_location, queue )); } else { CHECK( magma_cmconvert( A, &ACSR, A.storage_type, Magma_CSR, queue )); CHECK( magma_cmtransposeconjugate( ACSR, &BCSR, queue )); CHECK( magma_cmconvert( BCSR, B, Magma_CSR, A.storage_type, queue )); } cleanup: cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroy( handle ); magma_cmfree( &A_d, queue ); magma_cmfree( &B_d, queue ); magma_cmfree( &ACSR, queue ); magma_cmfree( &BCSR, queue ); if( info != 0 ){ magma_cmfree( B, queue ); } return info; }