Caffe::Caffe()
    : cublas_handle_(NULL), cusparse_handle_(NULL), cusparse_descr_(NULL),
      curand_generator_(NULL), random_generator_(), mode_(Caffe::CPU),
      solver_count_(1), root_solver_(true) {
  // Try to create a cublas handler, and report an error if failed (but we will
  // keep the program running as one might just want to run CPU code).
  LOG(INFO) << "Caffe init.";
  if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create cuBLAS handle. cuBLAS won't be available.";
  }
  // Also create a cusparse handle and matrix descriptor.
  if (cusparseCreate(&cusparse_handle_) != CUSPARSE_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create cuSPARSE handle. cuSPARSE won't be available.";
  }
  if (cusparseCreateMatDescr(&cusparse_descr_) != CUSPARSE_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create cuSPARSE descriptor; it won't be available.";
  } else {
    cusparseSetMatType(cusparse_descr_, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(cusparse_descr_, CUSPARSE_INDEX_BASE_ZERO);
    LOG(INFO) << "Initialized cuSPARSE matrix descriptor.";
  }
  // Try to create a curand handler.
  if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)
          != CURAND_STATUS_SUCCESS ||
      curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())
          != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "Cannot create Curand generator. Curand won't be available.";
  }
  LOG(INFO) << "Caffe init finished.";
}
CudaSparseSingleton() {
  cusparseCreate(&handle);
  cusparseCreateMatDescr(&descra);
  cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
}
// initialize CUDA
ssp_cuda *ssp_init_cuda() {
  ssp_cuda *cudaHandle = (ssp_cuda *)malloc(sizeof(ssp_cuda));
  if (!cudaHandle) {
    fprintf(stderr, "ssp_init_cuda: cudaHandle memory allocation failed.\n");
    return NULL;
  }
  cudaHandle->cusparse_handle = 0;
  cudaHandle->cusparse_matDescr = 0;
  cusparseStatus_t status = cusparseCreate(&cudaHandle->cusparse_handle);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    ssp_finalize_cuda(cudaHandle);
    fprintf(stderr, "ssp_init_cuda: cusparse initialization failed.\n");
    return NULL;
  }
  status = cusparseCreateMatDescr(&cudaHandle->cusparse_matDescr);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    ssp_finalize_cuda(cudaHandle);
    fprintf(stderr, "ssp_init_cuda: cusparse matrix setup failed.\n");
    return NULL;
  }
  cusparseSetMatType(cudaHandle->cusparse_matDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(cudaHandle->cusparse_matDescr, CUSPARSE_INDEX_BASE_ZERO);
  return cudaHandle;
}
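The finalizer ssp_finalize_cuda is referenced above but not shown; a minimal sketch of what it might look like, assuming the two fields initialized above are the only resources it owns:

// Hypothetical sketch; the real ssp_finalize_cuda is not part of this snippet.
void ssp_finalize_cuda(ssp_cuda *cudaHandle) {
  if (!cudaHandle) return;
  if (cudaHandle->cusparse_matDescr)
    cusparseDestroyMatDescr(cudaHandle->cusparse_matDescr);  // release descriptor
  if (cudaHandle->cusparse_handle)
    cusparseDestroy(cudaHandle->cusparse_handle);            // release handle
  free(cudaHandle);
}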
xDense2Csr(StatisticalTimer& timer) : cusparseFunc(timer) {
  cusparseStatus_t err = cusparseCreateMatDescr(&descrA);
  CUDA_V_THROW(err, "cusparseCreateMatDescr failed");
  err = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
  CUDA_V_THROW(err, "cusparseSetMatType failed");
  err = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
  CUDA_V_THROW(err, "cusparseSetMatIndexBase failed");
  n_rows = 0;
  n_cols = 0;
  n_vals = 0;
  device_col_indices = nullptr;
  device_row_offsets = nullptr;
  device_values = nullptr;
  device_A = nullptr;
  nnzPerRow = nullptr;
  devRowOffsets = nullptr;
  devColIndices = nullptr;
  devValues = nullptr;
}  // end
void Caffe::SetDevice(const int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
  if (Get().cublas_handle_)
    CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
  if (Get().cusparse_descr_)
    CUSPARSE_CHECK(cusparseDestroyMatDescr(Get().cusparse_descr_));
  if (Get().cusparse_handle_)
    CUSPARSE_CHECK(cusparseDestroy(Get().cusparse_handle_));
  if (Get().curand_generator_) {
    CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
  }
  CUSPARSE_CHECK(cusparseCreate(&Get().cusparse_handle_));
  CUSPARSE_CHECK(cusparseCreateMatDescr(&Get().cusparse_descr_));
  // Note: the recreated descriptor keeps its defaults; the explicit
  // configuration is left commented out here.
  // cusparseSetMatType(cusparse_descr_, CUSPARSE_MATRIX_TYPE_GENERAL);
  // cusparseSetMatIndexBase(cusparse_descr_, CUSPARSE_INDEX_BASE_ZERO);
  LOG(INFO) << "set descr";
  CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
  CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_,
      CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
      cluster_seedgen()));
}
cuSparseHandleType(bool transposeA, bool transposeB) {
  cusparseStatus_t status;
  status = cusparseCreate(&handle);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    std::cerr << "cusparseCreate ERROR" << std::endl;
    return;
  }
  cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);

  if (transposeA) {
    transA = CUSPARSE_OPERATION_TRANSPOSE;
  } else {
    transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  }
  if (transposeB) {
    transB = CUSPARSE_OPERATION_TRANSPOSE;
  } else {
    transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
  }

  status = cusparseCreateMatDescr(&a_descr);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    std::cerr << "cusparseCreateMatDescr a_descr ERROR" << std::endl;
    return;
  }
  cusparseSetMatType(a_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(a_descr, CUSPARSE_INDEX_BASE_ZERO);

  status = cusparseCreateMatDescr(&b_descr);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    std::cerr << "cusparseCreateMatDescr b_descr ERROR" << std::endl;
    return;
  }
  cusparseSetMatType(b_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(b_descr, CUSPARSE_INDEX_BASE_ZERO);

  status = cusparseCreateMatDescr(&c_descr);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    std::cerr << "cusparseCreateMatDescr c_descr ERROR" << std::endl;
    return;
  }
  cusparseSetMatType(c_descr, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(c_descr, CUSPARSE_INDEX_BASE_ZERO);
}
magma_int_t
magma_dapplycuicc_l(
    magma_d_vector b,
    magma_d_vector *x,
    magma_d_preconditioner *precond )
{
    double one = MAGMA_D_MAKE( 1.0, 0.0);

    // CUSPARSE context //
    cusparseHandle_t cusparseHandle;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != 0)    printf("error in Handle.\n");

    cusparseMatDescr_t descrL;
    cusparseStatus = cusparseCreateMatDescr(&descrL);
    if (cusparseStatus != 0)    printf("error in MatrDescr.\n");

    cusparseStatus = cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR);
    if (cusparseStatus != 0)    printf("error in MatrType.\n");

    cusparseStatus = cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_NON_UNIT);
    if (cusparseStatus != 0)    printf("error in DiagType.\n");

    cusparseStatus = cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER);
    if (cusparseStatus != 0)    printf("error in fillmode.\n");

    cusparseStatus = cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO);
    if (cusparseStatus != 0)    printf("error in IndexBase.\n");
    // end CUSPARSE context //

    cusparseStatus = cusparseDcsrsv_solve( cusparseHandle,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        precond->M.num_rows, &one,
        descrL, precond->M.val, precond->M.row, precond->M.col,
        precond->cuinfoL, b.val, x->val );
    if (cusparseStatus != 0)
        printf("error in L triangular solve: %p.\n", precond->cuinfoL );

    cusparseDestroyMatDescr( descrL );
    cusparseDestroy( cusparseHandle );
    magma_device_sync();
    return MAGMA_SUCCESS;
}
void sparse_fully_connected_1x1_layer_tester_cuda::enqueue_forward_propagation(
    cudaStream_t stream_id,
    cuda_linear_buffer_device::ptr output_buffer,
    const std::vector<cuda_linear_buffer_device::const_ptr>& schema_data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data,
    const std::vector<cuda_linear_buffer_device::const_ptr>& data_custom,
    const std::vector<cuda_linear_buffer_device::const_ptr>& input_buffers,
    const std::vector<cuda_linear_buffer_device::const_ptr>& persistent_working_data,
    cuda_linear_buffer_device::ptr temporary_working_fixed_buffer,
    cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
    unsigned int entry_count)
{
    {
        cusparse_safe_call(cusparseSetStream(cuda_config->get_cusparse_handle(), stream_id));
        float alpha = 1.0F;
        float beta = 0.0F;
        // Note: mat_descr is created on every invocation and never destroyed here.
        cusparseMatDescr_t mat_descr;
        cusparse_safe_call(cusparseCreateMatDescr(&mat_descr));
        cusparse_safe_call(cusparseScsrmm(
            cuda_config->get_cusparse_handle(),
            CUSPARSE_OPERATION_NON_TRANSPOSE,
            output_elem_count_per_entry,
            entry_count,
            input_elem_count_per_entry_list[0],
            feature_map_connection_count,
            &alpha,
            mat_descr,
            *data[0],
            *data_custom[1],
            *data_custom[0],
            *input_buffers[0],
            input_elem_count_per_entry_list[0],
            &beta,
            *output_buffer,
            output_elem_count_per_entry));
    }

    // Add bias
    {
        cudnn_safe_call(cudnnSetStream(cuda_config->get_cudnn_handle(), stream_id));
        cudnn_util::set_tensor_descriptor(
            output_data_desc,
            output_configuration_specific,
            entry_count);
        float alpha = 1.0F;
        float beta = 1.0F;
        cudnn_safe_call(cudnnAddTensor(
            cuda_config->get_cudnn_handle(),
            &alpha,
            bias_desc,
            *data[1],
            &beta,
            output_data_desc,
            *output_buffer));
    }
}
Teuchos::RCP<cusparseMatDescr_t> Kokkos::CUSPARSEdetails::createMatDescr() {
  cusparseMatDescr_t *desc = new cusparseMatDescr_t;
  cusparseStatus_t status = cusparseCreateMatDescr(desc);
  TEUCHOS_TEST_FOR_EXCEPTION(status == CUSPARSE_STATUS_ALLOC_FAILED, std::runtime_error,
      "Kokkos::CUSPARSEdetails::createMatDescr(): the resources could not be allocated.");
  TEUCHOS_TEST_FOR_EXCEPTION(status != CUSPARSE_STATUS_SUCCESS, std::runtime_error,
      "Kokkos::CUSPARSEdetails::createMatDescr(): unspecified error.");
  return Teuchos::rcpWithDealloc(desc, CUSPARSEMatDescDestroyer(), true);
}
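The CUSPARSEMatDescDestroyer passed to Teuchos::rcpWithDealloc is not shown in this snippet; a minimal sketch, assuming it follows the usual Teuchos deallocator concept (a free member that releases the owned object):

// Hypothetical sketch; the actual destroyer class is not in this snippet.
class CUSPARSEMatDescDestroyer {
public:
  void free(cusparseMatDescr_t *ptr) {
    if (ptr) {
      cusparseDestroyMatDescr(*ptr);  // release the cuSPARSE descriptor
      delete ptr;                     // release the heap-allocated wrapper
    }
  }
};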
xCsr2Dense(StatisticalTimer& timer, bool read_explicit_zeroes = true)
    : cusparseFunc(timer) {
  cusparseStatus_t err = cusparseCreateMatDescr(&descrA);
  CUDA_V_THROW(err, "cusparseCreateMatDescr failed");
  err = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
  CUDA_V_THROW(err, "cusparseSetMatType failed");
  err = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
  CUDA_V_THROW(err, "cusparseSetMatIndexBase failed");
  n_rows = 0;
  n_cols = 0;
  n_vals = 0;
  explicit_zeroes = read_explicit_zeroes;
}
CudaSparseSingleton() {
  status = cusparseCreate(&handle);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    throw std::runtime_error(
        std::string("ERROR - CUSPARSE Library Initialization failed"));
  }
  status = cusparseCreateMatDescr(&descra);
  if (status != CUSPARSE_STATUS_SUCCESS) {
    throw std::runtime_error(
        std::string("ERROR - CUSPARSE Library Matrix descriptor failed"));
  }
  cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
  cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
}
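No matching teardown appears in the snippet; a minimal sketch of the corresponding destructor, assuming the singleton owns handle and descra:

// Hypothetical sketch; not part of the original singleton.
~CudaSparseSingleton() {
  cusparseDestroyMatDescr(descra);  // release the matrix descriptor
  cusparseDestroy(handle);          // release the cuSPARSE handle
}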
extern "C" magma_int_t magma_dapplycumilu_r_transpose( magma_d_matrix b, magma_d_matrix *x, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrU=NULL; double one = MAGMA_D_MAKE( 1.0, 0.0); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_solve( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->UT.num_rows, b.num_rows*b.num_cols/precond->UT.num_rows, &one, descrU, precond->UT.dval, precond->UT.drow, precond->UT.dcol, precond->cuinfoUT, b.dval, precond->UT.num_rows, x->dval, precond->UT.num_rows )); cleanup: cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
extern "C" magma_int_t magma_capplycumicc_l( magma_c_matrix b, magma_c_matrix *x, magma_c_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; magmaFloatComplex one = MAGMA_C_MAKE( 1.0, 0.0); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseCcsrsm_solve( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, b.num_rows*b.num_cols/precond->M.num_rows, &one, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL, b.dval, precond->M.num_rows, x->dval, precond->M.num_rows )); magma_device_sync(); cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroy( cusparseHandle ); return info; }
sparse_matrix::sparse_matrix(sparse_matrix::descriptor_t descriptor,
                             int rows, int cols, int nonzeros,
                             const double* values, const int* col_ptr,
                             const int* row_ind)
    : descrA(), m(), n(), nnz(), csrValA(), csrRowPtrA(), csrColIndA() {
  // Create descriptor
  assert(cusparseCreateMatDescr(&descrA) == CUSPARSE_STATUS_SUCCESS);
  assert(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO) == CUSPARSE_STATUS_SUCCESS);

  // Set descriptor fields
  switch (descriptor) {
    case non_symmetric:
      assert(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT) == CUSPARSE_STATUS_SUCCESS);
      assert(cusparseSetMatType    (descrA, CUSPARSE_MATRIX_TYPE_GENERAL) == CUSPARSE_STATUS_SUCCESS);
      assert(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER) == CUSPARSE_STATUS_SUCCESS);  // doesn't matter which, presumably
      break;
    case symmetric_lower:
      assert(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT) == CUSPARSE_STATUS_SUCCESS);
      assert(cusparseSetMatType    (descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC) == CUSPARSE_STATUS_SUCCESS);
      assert(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_UPPER) == CUSPARSE_STATUS_SUCCESS);  // upper since we're coming with CSC and storing as CSR
      break;
    case symmetric_upper:
      assert(cusparseSetMatDiagType(descrA, CUSPARSE_DIAG_TYPE_NON_UNIT) == CUSPARSE_STATUS_SUCCESS);
      assert(cusparseSetMatType    (descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC) == CUSPARSE_STATUS_SUCCESS);
      assert(cusparseSetMatFillMode(descrA, CUSPARSE_FILL_MODE_LOWER) == CUSPARSE_STATUS_SUCCESS);  // lower since we're coming with CSC and storing as CSR
      break;
  }

  // Swap rows and cols because we're coming with CSC and storing as CSR
  n = rows;
  m = cols;
  nnz = nonzeros;

  // Allocate memory
  assert(cudaMalloc(reinterpret_cast<void**>(&csrValA), nnz * sizeof(double)) == cudaSuccess);
  assert(cudaMalloc(reinterpret_cast<void**>(&csrRowPtrA), (m+1) * sizeof(int)) == cudaSuccess);
  assert(cudaMalloc(reinterpret_cast<void**>(&csrColIndA), nnz * sizeof(int)) == cudaSuccess);

  // Copy values
  assert(cudaMemcpy(csrValA, values, nnz * sizeof(double), cudaMemcpyHostToDevice) == cudaSuccess);
  assert(cudaMemcpy(csrRowPtrA, col_ptr, (m+1) * sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess);
  assert(cudaMemcpy(csrColIndA, row_ind, nnz * sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess);
}
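The constructor above acquires a descriptor and three device arrays; a minimal sketch of the matching cleanup, assuming the class releases them in its destructor (which is not shown in the original):

// Hypothetical sketch; the real destructor is not part of this snippet.
sparse_matrix::~sparse_matrix() {
  cudaFree(csrValA);                // CSR values
  cudaFree(csrRowPtrA);             // CSR row pointers
  cudaFree(csrColIndA);             // CSR column indices
  cusparseDestroyMatDescr(descrA);  // cuSPARSE descriptor
}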
extern "C" magma_int_t magma_dcumilugeneratesolverinfo( magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; magma_d_matrix hA={Magma_CSR}, hL={Magma_CSR}, hU={Magma_CSR}; if (precond->L.memory_location != Magma_DEV ){ CHECK( magma_dmtransfer( precond->M, &hA, precond->M.memory_location, Magma_CPU, queue )); hL.diagorder_type = Magma_UNITY; CHECK( magma_dmconvert( hA, &hL , Magma_CSR, Magma_CSRL, queue )); hU.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hU , Magma_CSR, Magma_CSRU, queue )); CHECK( magma_dmtransfer( hL, &(precond->L), Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hU, &(precond->U), Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hA, queue ); magma_dmfree(&hL, queue ); magma_dmfree(&hU, queue ); } // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->L.num_rows, precond->L.nnz, descrL, precond->L.dval, precond->L.drow, precond->L.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_UPPER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->U.num_rows, precond->U.nnz, descrU, precond->U.dval, precond->U.drow, precond->U.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, precond->U.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, precond->U.num_rows, 1, MAGMA_D_ZERO, queue )); } cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
extern "C" magma_int_t magma_dcumilusetup_transpose( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix Ah1={Magma_CSR}, Ah2={Magma_CSR}; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrLT=NULL; cusparseMatDescr_t descrUT=NULL; // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); // transpose the matrix magma_dmtransfer( precond->L, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_dmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransposeconjugate( Ah2, &Ah1, queue ); magma_dmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_dmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransfer( Ah2, &(precond->LT), Magma_CPU, Magma_DEV, queue ); magma_dmfree(&Ah2, queue ); magma_dmtransfer( precond->U, &Ah1, Magma_DEV, Magma_CPU, queue ); magma_dmconvert( Ah1, &Ah2, A.storage_type, Magma_CSR, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransposeconjugate( Ah2, &Ah1, queue ); magma_dmfree(&Ah2, queue ); Ah2.blocksize = A.blocksize; Ah2.alignment = A.alignment; magma_dmconvert( Ah1, &Ah2, Magma_CSR, A.storage_type, queue ); magma_dmfree(&Ah1, queue ); magma_dmtransfer( Ah2, &(precond->UT), Magma_CPU, Magma_DEV, queue ); magma_dmfree(&Ah2, queue ); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrLT )); CHECK_CUSPARSE( cusparseSetMatType( descrLT, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrLT, CUSPARSE_DIAG_TYPE_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrLT, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrLT, CUSPARSE_FILL_MODE_UPPER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoLT )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->LT.num_rows, precond->LT.nnz, descrLT, precond->LT.dval, precond->LT.drow, precond->LT.dcol, precond->cuinfoLT )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrUT )); CHECK_CUSPARSE( cusparseSetMatType( descrUT, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrUT, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrUT, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrUT, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoUT )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->UT.num_rows, precond->UT.nnz, descrUT, precond->UT.dval, precond->UT.drow, precond->UT.dcol, precond->cuinfoUT )); cleanup: cusparseDestroyMatDescr( descrLT ); cusparseDestroyMatDescr( descrUT ); cusparseDestroy( cusparseHandle ); magma_dmfree(&Ah1, queue ); magma_dmfree(&Ah2, queue ); return info; }
int TxMatrixOptimizationDataCU::ingestLocalMatrix(SparseMatrix& A) {
  std::vector<local_int_t> i(A.localNumberOfRows + 1, 0);
  // Slight overallocation for these arrays
  std::vector<local_int_t> j;
  j.reserve(A.localNumberOfNonzeros);
  std::vector<double> a;
  a.reserve(A.localNumberOfNonzeros);
  scatterFromHalo.setNumRows(A.localNumberOfRows);
  scatterFromHalo.setNumCols(A.localNumberOfColumns);
  scatterFromHalo.clear();

  // We're splitting the matrix into diagonal and off-diagonal blocks to
  // enable overlapping of computation and communication.
  i[0] = 0;
  for (local_int_t m = 0; m < A.localNumberOfRows; ++m) {
    local_int_t nonzerosInRow = 0;
    for (local_int_t n = 0; n < A.nonzerosInRow[m]; ++n) {
      local_int_t col = A.mtxIndL[m][n];
      if (col < A.localNumberOfRows) {
        j.push_back(col);
        a.push_back(A.matrixValues[m][n]);
        ++nonzerosInRow;
      } else {
        scatterFromHalo.addEntry(m, col, A.matrixValues[m][n]);
      }
    }
    i[m + 1] = i[m] + nonzerosInRow;
  }

  // Set up SpMV data on the device.
  cudaError_t err = cudaSuccess;
  int* i_d;
  err = cudaMalloc((void**)&i_d, i.size() * sizeof(i[0]));
  CHKCUDAERR(err);
  err = cudaMemcpy(i_d, &i[0], i.size() * sizeof(i[0]), cudaMemcpyHostToDevice);
  CHKCUDAERR(err);
  int* j_d;
  err = cudaMalloc((void**)&j_d, j.size() * sizeof(j[0]));
  CHKCUDAERR(err);
  err = cudaMemcpy(j_d, &j[0], j.size() * sizeof(j[0]), cudaMemcpyHostToDevice);
  CHKCUDAERR(err);
  double* a_d;
  err = cudaMalloc((void**)&a_d, a.size() * sizeof(a[0]));
  CHKCUDAERR(err);
  err = cudaMemcpy(a_d, &a[0], a.size() * sizeof(a[0]), cudaMemcpyHostToDevice);
  CHKCUDAERR(err);

  cusparseStatus_t cerr = CUSPARSE_STATUS_SUCCESS;
  cerr = cusparseCreateMatDescr(&matDescr);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseSetMatIndexBase(matDescr, CUSPARSE_INDEX_BASE_ZERO);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseSetMatType(matDescr, CUSPARSE_MATRIX_TYPE_GENERAL);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseCreateHybMat(&localMatrix);
  CHKCUSPARSEERR(cerr);
  cerr = cusparseDcsr2hyb(handle, A.localNumberOfRows, A.localNumberOfColumns,
                          matDescr, a_d, i_d, j_d, localMatrix, 27,
                          CUSPARSE_HYB_PARTITION_USER);
  CHKCUSPARSEERR(cerr);

#ifndef HPCG_NOMPI
  err = cudaMalloc((void**)&elementsToSend,
                   A.totalToBeSent * sizeof(*elementsToSend));
  CHKCUDAERR(err);
  err = cudaMemcpy(elementsToSend, A.elementsToSend,
                   A.totalToBeSent * sizeof(*elementsToSend),
                   cudaMemcpyHostToDevice);
  CHKCUDAERR(err);
  err = cudaMalloc((void**)&sendBuffer_d, A.totalToBeSent * sizeof(double));
  CHKCUDAERR(err);
#endif

  // Set up the GS data.
  gelusStatus_t gerr = GELUS_STATUS_SUCCESS;
  gelusSolveDescription_t solveDescr;
  gerr = gelusCreateSolveDescr(&solveDescr);
  CHKGELUSERR(gerr);
  gerr = gelusSetSolveOperation(solveDescr, GELUS_OPERATION_NON_TRANSPOSE);
  CHKGELUSERR(gerr);
  gerr = gelusSetSolveFillMode(solveDescr, GELUS_FILL_MODE_FULL);
  CHKGELUSERR(gerr);
  gerr = gelusSetSolveStorageFormat(solveDescr, GELUS_STORAGE_FORMAT_HYB);
  CHKGELUSERR(gerr);
  gerr = gelusSetOptimizationLevel(solveDescr, GELUS_OPTIMIZATION_LEVEL_THREE);
  CHKGELUSERR(gerr);
  gerr = cugelusCreateSorIterationData(&gsContext);
  CHKGELUSERR(gerr);

#ifdef HPCG_DEBUG
  std::cout << A.localNumberOfRows << std::endl;
  std::cout << A.localNumberOfColumns << std::endl;
  std::cout << A.localNumberOfNonzeros << std::endl;
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  if (myrank == 0) {
    dumpMatrix(std::cout, i, j, a);
  }
#endif

  gerr = cugelusDcsrsor_iteration_analysis(
      A.localNumberOfRows, solveDescr, GELUS_SOR_SYMMETRIC, 1.0,
      &i[0], &j[0], &a[0], gsContext);
  gerr = gelusDestroySolveDescr(solveDescr);
  CHKGELUSERR(gerr);

  if (A.mgData) {
    err = cudaMalloc((void**)&f2c, A.mgData->rc->localLength * sizeof(local_int_t));
    CHKCUDAERR(err);
    err = cudaMemcpy(f2c, A.mgData->f2cOperator,
                     A.mgData->rc->localLength * sizeof(local_int_t),
                     cudaMemcpyHostToDevice);
    CHKCUDAERR(err);
  }
  err = cudaMalloc((void**)&workvector, A.localNumberOfRows * sizeof(double));
  CHKCUDAERR(err);

  return (int)cerr | (int)gerr | (int)err;
}
/*
 * Solve Ax = b using the conjugate gradient method
 *   a) without any preconditioning,
 *   b) using an Incomplete Cholesky preconditioner, and
 *   c) using an ILU0 preconditioner.
 */
int main(int argc, char **argv)
{
    const int max_iter = 1000;
    int k, M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    int *d_col, *d_row;
    int qatest = 0;
    const float tol = 1e-12f;
    float *x, *rhs;
    float r0, r1, alpha, beta;
    float *d_val, *d_x;
    float *d_zm1, *d_zm2, *d_rm2;
    float *d_r, *d_p, *d_omega, *d_y;
    float *val = NULL;
    float *d_valsILU0;
    float *valsILU0;
    float rsum, diff, err = 0.0;
    float qaerr1, qaerr2 = 0.0;
    float dot, numerator, denominator, nalpha;
    const float floatone = 1.0;
    const float floatzero = 0.0;
    int nErrors = 0;

    printf("conjugateGradientPrecond starting...\n");

    /* QA testing mode */
    if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
    {
        qatest = 1;
    }

    /* This will pick the best possible CUDA capable device */
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    printf("GPU selected Device ID = %d \n", devID);

    if (devID < 0)
    {
        printf("Invalid GPU device %d selected,  exiting...\n", devID);
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    /* Statistics about the GPU device */
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    int version = (deviceProp.major * 0x10 + deviceProp.minor);

    if (version < 0x11)
    {
        printf("%s: requires a minimum CUDA compute 1.1 capability\n", sSDKname);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice. It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    /* Generate a random tridiagonal symmetric matrix in CSR (Compressed Sparse Row) format */
    M = N = 16384;
    nz = 5*N - 4*(int)sqrt((double)N);
    I = (int *)malloc(sizeof(int)*(N+1));      // CSR row pointers for matrix A
    J = (int *)malloc(sizeof(int)*nz);         // CSR column indices for matrix A
    val = (float *)malloc(sizeof(float)*nz);   // CSR values for matrix A
    x = (float *)malloc(sizeof(float)*N);
    rhs = (float *)malloc(sizeof(float)*N);

    for (int i = 0; i < N; i++)
    {
        rhs[i] = 0.0;  // Initialize RHS
        x[i] = 0.0;    // Initial approximation of solution
    }

    genLaplace(I, J, val, M, N, nz, rhs);

    /* Create CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    /* Create CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    /* Description of the A matrix */
    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);

    /* Define the properties of the matrix */
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    /* Allocate required memory */
    checkCudaErrors(cudaMalloc((void **)&d_col, nz*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (N+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_y, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_r, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_p, N*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_omega, N*sizeof(float)));

    cudaMemcpy(d_col, J, nz*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, I, (N+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice);

    /* Conjugate gradient without preconditioning.
       ------------------------------------------
       Follows the description by Golub & Van Loan,
       "Matrix Computations 3rd ed.", Section 10.2.6  */

    printf("Convergence of conjugate gradient without preconditioning: \n");
    k = 0;
    r0 = 0;
    cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    while (r1 > tol*tol && k <= max_iter)
    {
        k++;

        if (k == 1)
        {
            cublasScopy(cublasHandle, N, d_r, 1, d_p, 1);
        }
        else
        {
            beta = r1/r0;
            cublasSscal(cublasHandle, N, &beta, d_p, 1);
            cublasSaxpy(cublasHandle, N, &floatone, d_r, 1, d_p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
                       &floatone, descr, d_val, d_row, d_col, d_p, &floatzero, d_omega);
        cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &dot);
        alpha = r1/dot;
        cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1);
        nalpha = -alpha;
        cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1);
        r0 = r1;
        cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
    }

    printf("  iteration = %3d, residual = %e \n", k, sqrt(r1));

    cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

    /* check result */
    err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    printf("  Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL");
    nErrors += (k > max_iter) ? 1 : 0;
    qaerr1 = err;

    if (0)
    {
        // output result in matlab-style array
        int n = (int)sqrt((double)N);
        printf("a = [  ");

        for (int iy = 0; iy < n; iy++)
        {
            for (int ix = 0; ix < n; ix++)
            {
                printf(" %f ", x[iy*n+ix]);
            }

            if (iy == n-1)
            {
                printf(" ]");
            }

            printf("\n");
        }
    }

    /* Preconditioned Conjugate Gradient using ILU.
       --------------------------------------------
       Follows the description by Golub & Van Loan,
       "Matrix Computations 3rd ed.", Algorithm 10.3.1  */

    printf("\nConvergence of conjugate gradient using incomplete LU preconditioning: \n");

    int nzILU0 = 2*N - 1;
    valsILU0 = (float *)malloc(nz*sizeof(float));

    checkCudaErrors(cudaMalloc((void **)&d_valsILU0, nz*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_zm1, (N)*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_zm2, (N)*sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&d_rm2, (N)*sizeof(float)));

    /* create the analysis info object for the A matrix */
    cusparseSolveAnalysisInfo_t infoA = 0;
    cusparseStatus = cusparseCreateSolveAnalysisInfo(&infoA);
    checkCudaErrors(cusparseStatus);

    /* Perform the analysis for the Non-Transpose case */
    cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                             N, nz, descr, d_val, d_row, d_col, infoA);
    checkCudaErrors(cusparseStatus);

    /* Copy A data to ILU0 vals as input */
    cudaMemcpy(d_valsILU0, d_val, nz*sizeof(float), cudaMemcpyDeviceToDevice);

    /* generate the Incomplete LU factor H for the matrix A using cusparseScsrilu0 */
    cusparseStatus = cusparseScsrilu0(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N,
                                      descr, d_valsILU0, d_row, d_col, infoA);
    checkCudaErrors(cusparseStatus);

    /* Create info objects for the ILU0 preconditioner */
    cusparseSolveAnalysisInfo_t info_u;
    cusparseCreateSolveAnalysisInfo(&info_u);

    cusparseMatDescr_t descrL = 0;
    cusparseStatus = cusparseCreateMatDescr(&descrL);
    cusparseSetMatType(descrL, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrL, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descrL, CUSPARSE_FILL_MODE_LOWER);
    cusparseSetMatDiagType(descrL, CUSPARSE_DIAG_TYPE_UNIT);

    cusparseMatDescr_t descrU = 0;
    cusparseStatus = cusparseCreateMatDescr(&descrU);
    cusparseSetMatType(descrU, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descrU, CUSPARSE_INDEX_BASE_ZERO);
    cusparseSetMatFillMode(descrU, CUSPARSE_FILL_MODE_UPPER);
    cusparseSetMatDiagType(descrU, CUSPARSE_DIAG_TYPE_NON_UNIT);

    cusparseStatus = cusparseScsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                             N, nz, descrU, d_val, d_row, d_col, info_u);

    /* reset the initial guess of the solution to zero */
    for (int i = 0; i < N; i++)
    {
        x[i] = 0.0;
    }

    checkCudaErrors(cudaMemcpy(d_r, rhs, N*sizeof(float), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice));

    k = 0;
    cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);

    while (r1 > tol*tol && k <= max_iter)
    {
        // Forward Solve; we can re-use infoA since the sparsity pattern of A matches that of L
        cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                              N, &floatone, descrL, d_valsILU0, d_row, d_col,
                                              infoA, d_r, d_y);
        checkCudaErrors(cusparseStatus);

        // Back Substitution
        cusparseStatus = cusparseScsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                              N, &floatone, descrU, d_valsILU0, d_row, d_col,
                                              info_u, d_y, d_zm1);
        checkCudaErrors(cusparseStatus);

        k++;

        if (k == 1)
        {
            cublasScopy(cublasHandle, N, d_zm1, 1, d_p, 1);
        }
        else
        {
            cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator);
            cublasSdot(cublasHandle, N, d_rm2, 1, d_zm2, 1, &denominator);
            beta = numerator/denominator;
            cublasSscal(cublasHandle, N, &beta, d_p, 1);
            cublasSaxpy(cublasHandle, N, &floatone, d_zm1, 1, d_p, 1);
        }

        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nzILU0,
                       &floatone, descrU, d_val, d_row, d_col, d_p, &floatzero, d_omega);
        cublasSdot(cublasHandle, N, d_r, 1, d_zm1, 1, &numerator);
        cublasSdot(cublasHandle, N, d_p, 1, d_omega, 1, &denominator);
        alpha = numerator/denominator;
        cublasSaxpy(cublasHandle, N, &alpha, d_p, 1, d_x, 1);
        cublasScopy(cublasHandle, N, d_r, 1, d_rm2, 1);
        cublasScopy(cublasHandle, N, d_zm1, 1, d_zm2, 1);
        nalpha = -alpha;
        cublasSaxpy(cublasHandle, N, &nalpha, d_omega, 1, d_r, 1);
        cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, &r1);
    }

    printf("  iteration = %3d, residual = %e \n", k, sqrt(r1));

    cudaMemcpy(x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost);

    /* check result */
    err = 0.0;

    for (int i = 0; i < N; i++)
    {
        rsum = 0.0;

        for (int j = I[i]; j < I[i+1]; j++)
        {
            rsum += val[j]*x[J[j]];
        }

        diff = fabs(rsum - rhs[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    printf("  Convergence Test: %s \n", (k <= max_iter) ? "OK" : "FAIL");
    nErrors += (k > max_iter) ? 1 : 0;
    qaerr2 = err;

    /* Destroy parameters */
    cusparseDestroySolveAnalysisInfo(infoA);
    cusparseDestroySolveAnalysisInfo(info_u);

    /* Destroy contexts */
    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    /* Free host and device memory */
    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
    free(valsILU0);
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_omega);
    cudaFree(d_valsILU0);
    cudaFree(d_zm1);
    cudaFree(d_zm2);
    cudaFree(d_rm2);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    printf("  Test Summary:\n");
    printf("     Counted total of %d errors\n", nErrors);
    printf("     qaerr1 = %f qaerr2 = %f\n\n", fabs(qaerr1), fabs(qaerr2));
    exit((nErrors == 0 && fabs(qaerr1) < 1e-5 && fabs(qaerr2) < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}
extern "C" magma_int_t magma_d_spmv( double alpha, magma_d_matrix A, magma_d_matrix x, double beta, magma_d_matrix y, magma_queue_t queue ) { magma_int_t info = 0; magma_d_matrix x2={Magma_CSR}; cusparseHandle_t cusparseHandle = 0; cusparseMatDescr_t descr = 0; // make sure RHS is a dense matrix if ( x.storage_type != Magma_DENSE ) { printf("error: only dense vectors are supported for SpMV.\n"); info = MAGMA_ERR_NOT_SUPPORTED; goto cleanup; } if ( A.memory_location != x.memory_location || x.memory_location != y.memory_location ) { printf("error: linear algebra objects are not located in same memory!\n"); printf("memory locations are: %d %d %d\n", A.memory_location, x.memory_location, y.memory_location ); info = MAGMA_ERR_INVALID_PTR; goto cleanup; } // DEV case if ( A.memory_location == Magma_DEV ) { if ( A.num_cols == x.num_rows && x.num_cols == 1 ) { if ( A.storage_type == Magma_CSR || A.storage_type == Magma_CUCSR || A.storage_type == Magma_CSRL || A.storage_type == Magma_CSRU ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); cusparseDcsrmv( cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, &beta, y.dval ); } else if ( A.storage_type == Magma_ELL ) { //printf("using ELLPACKT kernel for SpMV: "); CHECK( magma_dgeelltmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLPACKT ) { //printf("using ELL kernel for SpMV: "); CHECK( magma_dgeellmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_ELLRT ) { //printf("using ELLRT kernel for SpMV: "); CHECK( magma_dgeellrtmv( MagmaNoTrans, A.num_rows, A.num_cols, A.max_nnz_row, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, A.alignment, A.blocksize, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_SELLP ) { //printf("using SELLP kernel for SpMV: "); CHECK( magma_dgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dgemv( MagmaNoTrans, A.num_rows, A.num_cols, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1, queue ); //printf("done.\n"); } else if ( A.storage_type == Magma_SPMVFUNCTION ) { //printf("using DENSE kernel for SpMV: "); CHECK( magma_dcustomspmv( alpha, x, beta, y, queue )); //printf("done.\n"); } else if ( A.storage_type == Magma_BCSR ) { //printf("using CUSPARSE BCSR kernel for SpMV: "); // CUSPARSE context // cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; int mb = magma_ceildiv( A.num_rows, A.blocksize ); int nb = magma_ceildiv( A.num_cols, A.blocksize ); CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); cusparseDbsrmv( cusparseHandle, dirA, CUSPARSE_OPERATION_NON_TRANSPOSE, mb, nb, A.numblocks, &alpha, descr, A.dval, A.drow, A.dcol, A.blocksize, x.dval, &beta, y.dval ); } 
else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } else if ( A.num_cols < x.num_rows || x.num_cols > 1 ) { magma_int_t num_vecs = x.num_rows / A.num_cols * x.num_cols; if ( A.storage_type == Magma_CSR ) { CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descr )); CHECK_CUSPARSE( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO )); if ( x.major == MagmaColMajor) { cusparseDcsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); } else if ( x.major == MagmaRowMajor) { /*cusparseDcsrmm2(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, A.num_rows, num_vecs, A.num_cols, A.nnz, &alpha, descr, A.dval, A.drow, A.dcol, x.dval, A.num_cols, &beta, y.dval, A.num_cols); */ } } else if ( A.storage_type == Magma_SELLP ) { if ( x.major == MagmaRowMajor) { CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x.dval, beta, y.dval, queue )); } else if ( x.major == MagmaColMajor) { // transpose first to row major CHECK( magma_dvtranspose( x, &x2, queue )); CHECK( magma_dmgesellpmv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, A.blocksize, A.numblocks, A.alignment, alpha, A.dval, A.dcol, A.drow, x2.dval, beta, y.dval, queue )); } } /*if ( A.storage_type == Magma_DENSE ) { //printf("using DENSE kernel for SpMV: "); magmablas_dmgemv( MagmaNoTrans, A.num_rows, A.num_cols, num_vecs, alpha, A.dval, A.num_rows, x.dval, 1, beta, y.dval, 1 ); //printf("done.\n"); }*/ else { printf("error: format not supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } } } // CPU case missing! else { printf("error: CPU not yet supported.\n"); info = MAGMA_ERR_NOT_SUPPORTED; } cleanup: cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); cusparseHandle = 0; descr = 0; magma_dmfree(&x2, queue ); return info; }
/* ////////////////////////////////////////////////////////////////////////////
   -- testing sparse matrix vector product
*/
int main( int argc, char** argv )
{
    TESTING_INIT();
    magma_queue_t queue;
    magma_queue_create( /*devices[ opts->device ],*/ &queue );

    magma_d_sparse_matrix hA, hA_SELLP, hA_ELL, dA, dA_SELLP, dA_ELL;
    hA_SELLP.blocksize = 8;
    hA_SELLP.alignment = 8;
    real_Double_t start, end, res;
    magma_int_t *pntre;

    double c_one  = MAGMA_D_MAKE(1.0, 0.0);
    double c_zero = MAGMA_D_MAKE(0.0, 0.0);

    magma_int_t i, j;
    for( i = 1; i < argc; ++i ) {
        if ( strcmp("--blocksize", argv[i]) == 0 ) {
            hA_SELLP.blocksize = atoi( argv[++i] );
        } else if ( strcmp("--alignment", argv[i]) == 0 ) {
            hA_SELLP.alignment = atoi( argv[++i] );
        } else
            break;
    }
    printf( "\n#  usage: ./run_dspmv"
            " [ --blocksize %d --alignment %d (for SELLP) ]"
            " matrices \n\n", (int) hA_SELLP.blocksize, (int) hA_SELLP.alignment );

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            magma_dm_5stencil( laplace_size, &hA, queue );
        } else {                        // file-matrix test
            magma_d_csr_mtx( &hA, argv[i], queue );
        }

        printf( "\n# matrix info: %d-by-%d with %d nonzeros\n\n",
                (int) hA.num_rows, (int) hA.num_cols, (int) hA.nnz );

        real_Double_t FLOPS = 2.0*hA.nnz/1e9;

        magma_d_vector hx, hy, dx, dy, hrefvec, hcheck;

        // init CPU vectors
        magma_d_vinit( &hx, Magma_CPU, hA.num_rows, c_zero, queue );
        magma_d_vinit( &hy, Magma_CPU, hA.num_rows, c_zero, queue );

        // init DEV vectors
        magma_d_vinit( &dx, Magma_DEV, hA.num_rows, c_one, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        #ifdef MAGMA_WITH_MKL
        // calling MKL with CSR
        pntre = (magma_int_t*)malloc( (hA.num_rows+1)*sizeof(magma_int_t) );
        pntre[0] = 0;
        for (j=0; j < hA.num_rows; j++ ) {
            pntre[j] = hA.row[j+1];
        }

        MKL_INT num_rows = hA.num_rows;
        MKL_INT num_cols = hA.num_cols;
        MKL_INT nnz = hA.nnz;

        MKL_INT *col;
        TESTING_MALLOC_CPU( col, MKL_INT, nnz );
        for( magma_int_t t=0; t < hA.nnz; ++t ) {
            col[ t ] = hA.col[ t ];
        }
        MKL_INT *row;
        TESTING_MALLOC_CPU( row, MKL_INT, num_rows );
        for( magma_int_t t=0; t < hA.num_rows; ++t ) {
            row[ t ] = hA.row[ t ];
        }

        start = magma_wtime();
        for (j=0; j<10; j++ ) {
            mkl_dcsrmv( "N", &num_rows, &num_cols,
                        MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val),
                        col, row, pntre,
                        MKL_ADDR(hx.val),
                        MKL_ADDR(&c_zero), MKL_ADDR(hy.val) );
        }
        end = magma_wtime();
        printf( "\n > MKL  : %.2e seconds %.2e GFLOP/s    (CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        TESTING_FREE_CPU( row );
        TESTING_FREE_CPU( col );
        free(pntre);
        #endif // MAGMA_WITH_MKL

        // copy matrix to GPU
        magma_d_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );
        // SpMV on GPU (CSR) -- this is the reference!
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_d_spmv( c_one, dA, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (standard CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );
        magma_d_mfree(&dA, queue );
        magma_d_vtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue );

        // convert to ELL and copy to GPU
        magma_d_mconvert( hA, &hA_ELL, Magma_CSR, Magma_ELL, queue );
        magma_d_mtransfer( hA_ELL, &dA_ELL, Magma_CPU, Magma_DEV, queue );
        magma_d_mfree(&hA_ELL, queue );
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // SpMV on GPU (ELL)
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_d_spmv( c_one, dA_ELL, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (standard ELL).\n",
                (end-start)/10, FLOPS*10/(end-start) );
        magma_d_mfree(&dA_ELL, queue );
        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res = res + MAGMA_D_REAL(hcheck.val[k]) - MAGMA_D_REAL(hrefvec.val[k]);
        if ( res < .000001 )
            printf("# tester spmv ELL:  ok\n");
        else
            printf("# tester spmv ELL:  failed\n");
        magma_d_vfree( &hcheck, queue );

        // convert to SELLP and copy to GPU
        magma_d_mconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue );
        magma_d_mtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue );
        magma_d_mfree(&hA_SELLP, queue );
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // SpMV on GPU (SELLP)
        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            magma_d_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue );
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s    (SELLP).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res = res + MAGMA_D_REAL(hcheck.val[k]) - MAGMA_D_REAL(hrefvec.val[k]);
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv SELL-P:  ok\n");
        else
            printf("# tester spmv SELL-P:  failed\n");
        magma_d_vfree( &hcheck, queue );
        magma_d_mfree(&dA_SELLP, queue );

        // SpMV on GPU (CUSPARSE - CSR)
        // CUSPARSE context //
        cusparseHandle_t cusparseHandle = 0;
        cusparseStatus_t cusparseStatus;
        cusparseStatus = cusparseCreate(&cusparseHandle);
        cusparseSetStream( cusparseHandle, queue );

        cusparseMatDescr_t descr = 0;
        cusparseStatus = cusparseCreateMatDescr(&descr);

        cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

        double alpha = c_one;
        double beta = c_zero;
        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        // copy matrix to GPU
        magma_d_mtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue );

        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            cusparseStatus = cusparseDcsrmv( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                hA.num_rows, hA.num_cols, hA.nnz, &alpha, descr,
                                dA.dval, dA.drow, dA.dcol, dx.dval, &beta, dy.dval);
        end = magma_sync_wtime( queue );
        if (cusparseStatus != 0)    printf("error in cuSPARSE CSR\n");
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s    (CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        cusparseMatDescr_t descrA;
        cusparseStatus = cusparseCreateMatDescr(&descrA);
        if (cusparseStatus != 0)    printf("error\n");
        cusparseHybMat_t hybA;
        cusparseStatus = cusparseCreateHybMat( &hybA );
        if (cusparseStatus != 0)    printf("error\n");

        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU,
                           queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res = res + MAGMA_D_REAL(hcheck.val[k]) - MAGMA_D_REAL(hrefvec.val[k]);
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv cuSPARSE CSR:  ok\n");
        else
            printf("# tester spmv cuSPARSE CSR:  failed\n");
        magma_d_vfree( &hcheck, queue );

        magma_d_vfree( &dy, queue );
        magma_d_vinit( &dy, Magma_DEV, hA.num_rows, c_zero, queue );

        cusparseDcsr2hyb(cusparseHandle, hA.num_rows, hA.num_cols, descrA,
                         dA.dval, dA.drow, dA.dcol, hybA, 0,
                         CUSPARSE_HYB_PARTITION_AUTO);

        start = magma_sync_wtime( queue );
        for (j=0; j<10; j++)
            cusparseStatus = cusparseDhybmv( cusparseHandle,
                                CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                                descrA, hybA, dx.dval, &beta, dy.dval);
        end = magma_sync_wtime( queue );
        if (cusparseStatus != 0)    printf("error in cuSPARSE HYB\n");
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s    (HYB).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        magma_d_vtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue );
        res = 0.0;
        for(magma_int_t k=0; k<hA.num_rows; k++ )
            res = res + MAGMA_D_REAL(hcheck.val[k]) - MAGMA_D_REAL(hrefvec.val[k]);
        printf("# |x-y|_F = %8.2e\n", res);
        if ( res < .000001 )
            printf("# tester spmv cuSPARSE HYB:  ok\n");
        else
            printf("# tester spmv cuSPARSE HYB:  failed\n");
        magma_d_vfree( &hcheck, queue );

        cusparseDestroyMatDescr( descrA );
        cusparseDestroyHybMat( hybA );
        cusparseDestroy( cusparseHandle );

        magma_d_mfree(&dA, queue );
        printf("\n\n");

        // free CPU memory
        magma_d_mfree(&hA, queue );
        magma_d_vfree(&hx, queue );
        magma_d_vfree(&hy, queue );
        magma_d_vfree(&hrefvec, queue );
        // free GPU memory
        magma_d_vfree(&dx, queue );
        magma_d_vfree(&dy, queue );

        i++;
    }

    magma_queue_destroy( queue );
    TESTING_FINALIZE();
    return 0;
}
int main(int argc, char* argv[])
{
    const int bufsize = 512;
    char buffer[bufsize];
    int m, n, S;
    double time_st, time_end, time_avg;
    //omp_set_num_threads(2);
    //printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads());
    if (argc != 2)
    {
        cout << "Insufficient arguments" << endl;
        return 1;
    }

    graph G;

    cerr << "Start reading ";
    //time_st = dsecnd();
    G.create_graph(argv[1]);
    //time_end = dsecnd();
    //time_avg = (time_end - time_st);
    //cout << "Success " << endl;
    //cerr << "Reading time " << time_avg << endl;

    cerr << "Constructing Matrices ";
    //time_st = dsecnd();
    G.construct_MNA();
    //time_end = dsecnd();
    //time_avg = (time_end - time_st);
    //cerr << "Done " << time_avg << endl;
    //G.construct_sparse_MNA();

    m = G.node_array.size() - 1;
    n = G.voltage_edge_id.size();
    cout << endl;
    cout << "MATRIX STAT:" << endl;
    cout << "Nonzero elements: " << G.nonzero << endl;
    cout << "Number of Rows: " << m + n << endl;
    printf("\n Nonzero = %ld", G.nonzero);
    printf("\n Rows = %d", m + n);
    cout << "MAT val: " << endl;

    int i, j;
    //G.Mat_val[0] += 100;
    /*
    for (i = 0; i < G.nonzero; i++)
        cout << " " << G.Mat_val[i];
    cout << endl;
    for (i = 0; i < G.nonzero; i++)
        cout << " " << G.columns[i];
    cout << endl;
    for (i = 0; i < m + n + 1; i++)
        cout << " " << G.rowIndex[i];
    cout << endl;
    for (i = 0; i < m + n; i++)
    {
        cout << endl;
        int startindex = G.rowIndex[i];
        int endindex = G.rowIndex[i + 1];
        for (j = startindex; j < endindex; j++)
            cout << " " << G.Mat_val[j];
        cout << endl;
    }
    */
    /*
    for (i = 0; i < m + n + 1; i++)
    {
        //cout << endl;
        if (G.rowIndex[i] == G.rowIndex[i + 1])
            break;
        for (j = G.rowIndex[i]; j < G.rowIndex[i + 1]; j++)
        {
            if (G.Mat_val[j] > 10)
                cout << G.Mat_val[j] << "\t";
        }
        //cout << endl;
        /*for (j = G.rowIndex[i]; j < G.rowIndex[i + 1]; j++)
        {
            cout << G.columns[j] << "\t";
        }
        //cout << endl;
    }
    cout << endl;
    */

    // printing the matrix
    printf("\n Fine till here");
    printf("\n");
    //int* rowmIndex = (int*)calloc(m + 1, sizeof(int));
    printf("\n Fine till here");
    printf("\n");
    //int rowmIndex[5] = {1, 2, 3, 4, 5};
    /*
    for (i = 0; i < m + 1; i++)
    {
        rowmIndex[i] = G.rowIndex[i];
        printf(" %d", rowmIndex[i]);
    }
    */

    cerr << "Solving Equations " << endl;

    double r1, b, alpha, alpham1, beta, r0, a, na;
    const double tol = 0.1;
    const int max_iter = 1000000;
    int *d_col, *d_row;
    double *d_val, *d_x, dot;
    double *d_r, *d_p, *d_Ax;
    int k;

    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    checkCudaErrors(cudaMalloc((void **)&d_col, G.nonzero*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (m+n+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, G.nonzero*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_x, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_r, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_p, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, (m+n)*sizeof(double)));

    cudaMemcpy(d_col, G.columns, G.nonzero*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, G.rowIndex, (m+n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, G.Mat_val, G.nonzero*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, G.x, (m+n)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, G.b, (m+n)*sizeof(double),
               cudaMemcpyHostToDevice);

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    printf("\n Data transferred\n");

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n),
                   G.nonzero, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);

    cublasDaxpy(cublasHandle, (m+n), &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);

    k = 1;

    while (r1 > tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasDscal(cublasHandle, (m+n), &b, d_p, 1);
            cublasStatus = cublasDaxpy(cublasHandle, (m+n), &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            cublasStatus = cublasDcopy(cublasHandle, (m+n), d_r, 1, d_p, 1);
        }

        cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n),
                       G.nonzero, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax);
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);
        //cudaThreadSynchronize();
        //printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Iterations = %3d\tTime : %.6f milli-seconds : \n", k, elapsedTime);

    cudaMemcpy(G.x, d_x, (m+n)*sizeof(double), cudaMemcpyDeviceToHost);

    /*
    printf("\n x = \n");
    for (i = 0; i < (m+n); i++)
    {
        printf("\n x[%d] = %.8f", i, G.x[i]);
    }
    */

    float rsum, diff, err = 0.0;

    for (int i = 0; i < (m+n); i++)
    {
        rsum = 0.0;
        for (int j = G.rowIndex[i]; j < G.rowIndex[i+1]; j++)
        {
            rsum += G.Mat_val[j]*G.x[G.columns[j]];
        }
        diff = fabs(rsum - G.b[i]);
        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

    /*
    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
    */
    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    /*
    printf("\n X is:\n");
    for (i = 0; i < (m+n); i++)
    {
        printf("\n X[%d] = %.4f", i, G.x[i]);
    }
    */

    printf("Test Summary:  Error amount = %f\n", err);
    exit((k <= max_iter) ? 0 : 1);

    /* culaSparseHandle handle;
    if (culaSparseCreate(&handle) != culaSparseNoError)
    {
        // this should only fail under extreme conditions
        std::cout << "fatal error: failed to create library handle!"
                  << std::endl;
        exit(EXIT_FAILURE);
    }

    StatusChecker sc(handle);

    culaSparsePlan plan;
    sc = culaSparseCreatePlan(handle, &plan);

    culaSparseCsrOptions formatOpts;
    culaSparseCsrOptionsInit(handle, &formatOpts);
    //formatOpts.indexing = 1;
    sc = culaSparseSetDcsrData(handle, plan, &formatOpts, m+n, G.nonzero,
                               &G.Mat_val[0], &G.rowIndex[0], &G.columns[0],
                               &G.x[0], &G.b[0]);
    printf("\n Fine till here");
    printf("\n");

    culaSparseConfig config;
    sc = culaSparseConfigInit(handle, &config);
    config.relativeTolerance = 1e-2;
    config.maxIterations = 100000;
    config.divergenceTolerance = 20;

    culaSparseResult result;
    /* sc = culaSparseSetHostPlatform(handle, plan, 0);

    // set cg solver
    sc = culaSparseSetCgSolver(handle, plan, 0);
    printf("\n Fine till here");
    printf("\n");

    // perform solve (cg + no preconditioner on host)
    culaSparseResult result;
    sc = culaSparseExecutePlan(handle, plan, &config, &result);
    sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
    std::cout << buffer << std::endl;

    if (culaSparsePreinitializeCuda(handle) == culaSparseNoError)
    {
        // change to cuda accelerated platform
        sc = culaSparseSetCudaPlatform(handle, plan, 0);

        // perform solve (cg + ilu0 on cuda)
        culaSparseGmresOptions solverOpts;
        culaSparseStatus status = culaSparseGmresOptionsInit(handle, &solverOpts);
        solverOpts.restart = 20;
        sc = culaSparseSetCgSolver(handle, plan, 0);
        //sc = culaSparseSetJacobiPreconditioner(handle, plan, 0);
        //sc = culaSparseSetIlu0Preconditioner(handle, plan, 0);
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        // change preconditioner to fainv
        // this avoids data transfer by using data cached by the plan
        //sc = culaSparseSetIlu0Preconditioner(handle, plan, 0);

        // perform solve (cg + fainv on cuda)
        // these timing results should indicate minimal overhead
        /* sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        // change solver
        // this avoids preconditioner regeneration by using data cached by the plan
        sc = culaSparseSetBicgstabSolver(handle, plan, 0);

        // perform solve (bicgstab + fainv on cuda)
        // the timing results should indicate minimal overhead and preconditioner generation time
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        sc = culaSparseSetGmresSolver(handle, plan, 0);

        // perform solve (gmres + fainv on cuda)
        // the timing results should indicate minimal overhead and preconditioner generation time
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;
    }
    else
    {
        std::cout << "alert: no cuda capable gpu found" << std::endl;
    }

    // cleanup plan
    culaSparseDestroyPlan(plan);

    // cleanup handle
    culaSparseDestroy(handle);

    FILE* myWriteFile;
    myWriteFile = fopen("result.txt", "w");
    for (i = 0; i < n; i++)
    {
        fprintf(myWriteFile, "%1f\n", G.x[i]);
        //printf("\n x [%d] = % f", i, x[i]);
    }
    fprintf(myWriteFile, ".end\n");
    fclose(myWriteFile);
    printf("\n");

    //time_st = dsecnd();
    //solver(G.rowIndex, G.columns, G.Mat_val, G.b, G.x, m+n, G.nonzero);
    //time_end = dsecnd();
    //time_avg = (time_end - time_st);
    //printf("Successfully Solved in : %.6f secs\n", time_avg);

    cerr << endl;
    cerr << "Fillup Graph ";
    //time_st = dsecnd();
    G.fillup_graph();
    //time_end = dsecnd();
    //time_avg = (time_end - time_st);
// cerr<<"Done "<<time_avg<<endl; //G.output_graph_stdout(); cerr<<"Matching KCL "; // time_st=dsecnd(); G.check_kcl(); // time_end=dsecnd(); // time_avg = (time_end-time_st); // cerr<<"Done "<<time_avg<<endl; /*for (int i=0;i<m+n;i++) { cout<<"M"<<i<<endl; for (int j=0;j<m+n;j++) cout<<" "<<j<<"#"<<M[i][j]<<endl; }*/ }
/*
////////////////////////////////////////////////////////////////////////////
   -- testing sparse matrix vector product
*/
int main( int argc, char** argv )
{
    magma_int_t info = 0;
    TESTING_CHECK( magma_init() );
    magma_print_environment();

    magma_queue_t queue = NULL;
    magma_queue_create( 0, &queue );

    magma_s_matrix hA={Magma_CSR}, hA_SELLP={Magma_CSR}, dA={Magma_CSR}, dA_SELLP={Magma_CSR};
    magma_s_matrix hx={Magma_CSR}, hy={Magma_CSR}, dx={Magma_CSR}, dy={Magma_CSR},
                   hrefvec={Magma_CSR}, hcheck={Magma_CSR};
    hA_SELLP.blocksize = 8;
    hA_SELLP.alignment = 8;
    real_Double_t start, end, res;
#ifdef MAGMA_WITH_MKL
    magma_int_t *pntre=NULL;
#endif
    cusparseHandle_t cusparseHandle = NULL;
    cusparseMatDescr_t descr = NULL;

    float c_one  = MAGMA_S_MAKE(1.0, 0.0);
    float c_zero = MAGMA_S_MAKE(0.0, 0.0);

    float accuracy = 1e-10;
#define PRECISION_s
#if defined(PRECISION_c)
    accuracy = 1e-4;
#endif
#if defined(PRECISION_s)
    accuracy = 1e-4;
#endif

    magma_int_t i, j;
    for( i = 1; i < argc; ++i ) {
        if ( strcmp("--blocksize", argv[i]) == 0 ) {
            hA_SELLP.blocksize = atoi( argv[++i] );
        } else if ( strcmp("--alignment", argv[i]) == 0 ) {
            hA_SELLP.alignment = atoi( argv[++i] );
        } else
            break;
    }
    printf("\n# usage: ./run_sspmm"
           " [ --blocksize %lld --alignment %lld (for SELLP) ] matrices\n\n",
           (long long) hA_SELLP.blocksize, (long long) hA_SELLP.alignment );

    while( i < argc ) {
        if ( strcmp("LAPLACE2D", argv[i]) == 0 && i+1 < argc ) {   // Laplace test
            i++;
            magma_int_t laplace_size = atoi( argv[i] );
            TESTING_CHECK( magma_sm_5stencil( laplace_size, &hA, queue ));
        } else {   // file-matrix test
            TESTING_CHECK( magma_s_csr_mtx( &hA, argv[i], queue ));
        }
        printf("%% matrix info: %lld-by-%lld with %lld nonzeros\n",
               (long long) hA.num_rows, (long long) hA.num_cols, (long long) hA.nnz );
        real_Double_t FLOPS = 2.0*hA.nnz/1e9;

        // m - number of rows for the sparse matrix
        // n - number of vectors to be multiplied in the SpMM product
        magma_int_t m, n;
        m = hA.num_rows;
        n = 48;

        // init CPU vectors
        TESTING_CHECK( magma_svinit( &hx, Magma_CPU, m, n, c_one, queue ));
        TESTING_CHECK( magma_svinit( &hy, Magma_CPU, m, n, c_zero, queue ));

        // init DEV vectors
        TESTING_CHECK( magma_svinit( &dx, Magma_DEV, m, n, c_one, queue ));
        TESTING_CHECK( magma_svinit( &dy, Magma_DEV, m, n, c_zero, queue ));

        // calling MKL with CSR
#ifdef MAGMA_WITH_MKL
        TESTING_CHECK( magma_imalloc_cpu( &pntre, m + 1 ) );
        pntre[0] = 0;
        for (j=0; j < m; j++ ) {
            pntre[j] = hA.row[j+1];
        }
        MKL_INT num_rows = hA.num_rows;
        MKL_INT num_cols = hA.num_cols;
        MKL_INT nnz = hA.nnz;
        MKL_INT num_vecs = n;

        MKL_INT *col;
        TESTING_CHECK( magma_malloc_cpu( (void**) &col, nnz * sizeof(MKL_INT) ));
        for( magma_int_t t=0; t < hA.nnz; ++t ) {
            col[ t ] = hA.col[ t ];
        }
        MKL_INT *row;
        TESTING_CHECK( magma_malloc_cpu( (void**) &row, num_rows * sizeof(MKL_INT) ));
        for( magma_int_t t=0; t < hA.num_rows; ++t ) {
            row[ t ] = hA.row[ t ];   // was hA.col[t], which filled the row pointers with column indices
        }

        // === Call MKL with consecutive SpMVs, using mkl_scsrmv ===
        // warm up
        mkl_scsrmv( "N", &num_rows, &num_cols,
                    MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre,
                    MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) );

        start = magma_wtime();
        for (j=0; j < 10; j++ ) {
            mkl_scsrmv( "N", &num_rows, &num_cols,
                        MKL_ADDR(&c_one), "GFNC", MKL_ADDR(hA.val), col, row, pntre,
                        MKL_ADDR(hx.val), MKL_ADDR(&c_zero), MKL_ADDR(hy.val) );
        }
        end = magma_wtime();
        printf( "\n > MKL SpMVs : %.2e seconds %.2e GFLOP/s (CSR).\n",
                (end-start)/10, FLOPS*10/(end-start) );

        // === Call MKL with blocked SpMVs, using mkl_scsrmm ===
        char transa = 'n';
        MKL_INT ldb = n, ldc = n;
        char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'};

        // warm up
        mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols,
                    MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre,
                    MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc );

        start = magma_wtime();
        for (j=0; j < 10; j++ ) {
            mkl_scsrmm( &transa, &num_rows, &num_vecs, &num_cols,
                        MKL_ADDR(&c_one), matdescra, MKL_ADDR(hA.val), col, row, pntre,
                        MKL_ADDR(hx.val), &ldb, MKL_ADDR(&c_zero), MKL_ADDR(hy.val), &ldc );
        }
        end = magma_wtime();
        printf( "\n > MKL SpMM : %.2e seconds %.2e GFLOP/s (CSR).\n",
                (end-start)/10, FLOPS*10.*n/(end-start) );

        magma_free_cpu( row );
        magma_free_cpu( col );
        row = NULL;
        col = NULL;
#endif // MAGMA_WITH_MKL

        // copy matrix to GPU
        TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue ));

        // SpMV on GPU (CSR)
        start = magma_sync_wtime( queue );
        for (j=0; j < 10; j++) {
            TESTING_CHECK( magma_s_spmv( c_one, dA, dx, c_zero, dy, queue ));
        }
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (standard CSR).\n",
                (end-start)/10, FLOPS*10.*n/(end-start) );
        TESTING_CHECK( magma_smtransfer( dy, &hrefvec , Magma_DEV, Magma_CPU, queue ));
        magma_smfree(&dA, queue );

        // convert to SELLP and copy to GPU
        TESTING_CHECK( magma_smconvert( hA, &hA_SELLP, Magma_CSR, Magma_SELLP, queue ));
        TESTING_CHECK( magma_smtransfer( hA_SELLP, &dA_SELLP, Magma_CPU, Magma_DEV, queue ));
        magma_smfree(&hA_SELLP, queue );
        magma_smfree( &dy, queue );
        TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue ));

        // SpMV on GPU (SELLP)
        start = magma_sync_wtime( queue );
        for (j=0; j < 10; j++) {
            TESTING_CHECK( magma_s_spmv( c_one, dA_SELLP, dx, c_zero, dy, queue ));
        }
        end = magma_sync_wtime( queue );
        printf( " > MAGMA: %.2e seconds %.2e GFLOP/s (SELLP).\n",
                (end-start)/10, FLOPS*10.*n/(end-start) );

        TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ));
        res = 0.0;
        // accumulate absolute differences so positive and negative errors cannot cancel
        for(magma_int_t k=0; k < hA.num_rows; k++ ) {
            res = res + fabs( MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]) );
        }
        printf("%% |x-y|_F = %8.2e\n", res);
        if ( res < accuracy )
            printf("%% tester spmm SELL-P: ok\n");
        else
            printf("%% tester spmm SELL-P: failed\n");
        magma_smfree( &hcheck, queue );
        magma_smfree(&dA_SELLP, queue );

        // SpMV on GPU (CUSPARSE - CSR)
        // CUSPARSE context //
        magma_smfree( &dy, queue );
        TESTING_CHECK( magma_svinit( &dy, Magma_DEV, dx.num_rows, dx.num_cols, c_zero, queue ));
        //#ifdef PRECISION_d
        start = magma_sync_wtime( queue );
        TESTING_CHECK( cusparseCreate( &cusparseHandle ));
        TESTING_CHECK( cusparseSetStream( cusparseHandle, magma_queue_get_cuda_stream(queue) ));
        TESTING_CHECK( cusparseCreateMatDescr( &descr ));
        TESTING_CHECK( cusparseSetMatType( descr, CUSPARSE_MATRIX_TYPE_GENERAL ));
        TESTING_CHECK( cusparseSetMatIndexBase( descr, CUSPARSE_INDEX_BASE_ZERO ));
        float alpha = c_one;
        float beta = c_zero;

        // copy matrix to GPU
        TESTING_CHECK( magma_smtransfer( hA, &dA, Magma_CPU, Magma_DEV, queue) );

        for (j=0; j < 10; j++) {
            cusparseScsrmm(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                           dA.num_rows, n, dA.num_cols, dA.nnz, &alpha, descr,
                           dA.dval, dA.drow, dA.dcol, dx.dval, dA.num_cols, &beta,
                           dy.dval, dA.num_cols);
        }
        end = magma_sync_wtime( queue );
        printf( " > CUSPARSE: %.2e seconds %.2e GFLOP/s (CSR).\n",
                (end-start)/10, FLOPS*10*n/(end-start) );

        TESTING_CHECK( magma_smtransfer( dy, &hcheck , Magma_DEV, Magma_CPU, queue ));
        res = 0.0;
        for(magma_int_t k=0; k < hA.num_rows; k++ ) {
            res = res + fabs( MAGMA_S_REAL(hcheck.val[k]) - MAGMA_S_REAL(hrefvec.val[k]) );
        }
        printf("%% |x-y|_F = %8.2e\n", res);
        if ( res
< accuracy ) printf("%% tester spmm cuSPARSE: ok\n"); else printf("%% tester spmm cuSPARSE: failed\n"); magma_smfree( &hcheck, queue ); cusparseDestroyMatDescr( descr ); cusparseDestroy( cusparseHandle ); descr = NULL; cusparseHandle = NULL; //#endif printf("\n\n"); // free CPU memory magma_smfree( &hA, queue ); magma_smfree( &hx, queue ); magma_smfree( &hy, queue ); magma_smfree( &hrefvec, queue ); // free GPU memory magma_smfree( &dx, queue ); magma_smfree( &dy, queue ); magma_smfree( &dA, queue); #ifdef MAGMA_WITH_MKL magma_free_cpu( pntre ); #endif i++; } magma_queue_destroy( queue ); TESTING_CHECK( magma_finalize() ); return info; }
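cusparseScsrmm used in the tester above is likewise a legacy entry point. Below is a hedged sketch of the equivalent blocked product through the generic SpMM API (CUDA 10.1+; older toolkits name the algorithm CUSPARSE_MM_ALG_DEFAULT). The identifiers handle, m, n, nnz, drow, dcol, dval, dx, dy stand in for the tester's handle and device arrays; the dense blocks are assumed column-major with leading dimension m, as in the tester.

// sketch: legacy cusparseScsrmm expressed with the generic SpMM API
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
cusparseCreateCsr(&matA, m, m, nnz, drow, dcol, dval,
                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                  CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
cusparseCreateDnMat(&matB, m, n, m, dx, CUDA_R_32F, CUSPARSE_ORDER_COL);
cusparseCreateDnMat(&matC, m, n, m, dy, CUDA_R_32F, CUSPARSE_ORDER_COL);

size_t bufSize = 0;
void  *dBuf = NULL;
cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                        CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB,
                        &beta, matC, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, &bufSize);
cudaMalloc(&dBuf, bufSize);
cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
             CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB,
             &beta, matC, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, dBuf);
cudaFree(dBuf);
cusparseDestroySpMat(matA);
cusparseDestroyDnMat(matB);
cusparseDestroyDnMat(matC);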
extern "C" magma_int_t magma_cmtransposeconjugate( magma_c_matrix A, magma_c_matrix *B, magma_queue_t queue ) { // for symmetric matrices: convert to csc using cusparse magma_int_t info = 0; cusparseHandle_t handle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrB=NULL; magma_c_matrix ACSR={Magma_CSR}, BCSR={Magma_CSR}; magma_c_matrix A_d={Magma_CSR}, B_d={Magma_CSR}; if( A.storage_type == Magma_CSR && A.memory_location == Magma_DEV ) { // fill in information for B B->storage_type = A.storage_type; B->diagorder_type = A.diagorder_type; B->memory_location = Magma_DEV; B->num_rows = A.num_cols; // transposed B->num_cols = A.num_rows; // transposed B->nnz = A.nnz; B->true_nnz = A.true_nnz; if ( A.fill_mode == MagmaFull ) { B->fill_mode = MagmaFull; } else if ( A.fill_mode == MagmaLower ) { B->fill_mode = MagmaUpper; } else if ( A.fill_mode == MagmaUpper ) { B->fill_mode = MagmaLower; } B->dval = NULL; B->drow = NULL; B->dcol = NULL; // memory allocation CHECK( magma_cmalloc( &B->dval, B->nnz )); CHECK( magma_index_malloc( &B->drow, B->num_rows + 1 )); CHECK( magma_index_malloc( &B->dcol, B->nnz )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &handle )); CHECK_CUSPARSE( cusparseSetStream( handle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrB )); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatType( descrB, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrB, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseCcsr2csc( handle, A.num_rows, A.num_cols, A.nnz, A.dval, A.drow, A.dcol, B->dval, B->dcol, B->drow, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO) ); CHECK( magma_cmconjugate( B, queue )); } else if ( A.memory_location == Magma_CPU ){ CHECK( magma_cmtransfer( A, &A_d, A.memory_location, Magma_DEV, queue )); CHECK( magma_cmtransposeconjugate( A_d, &B_d, queue )); CHECK( magma_cmtransfer( B_d, B, Magma_DEV, A.memory_location, queue )); } else { CHECK( magma_cmconvert( A, &ACSR, A.storage_type, Magma_CSR, queue )); CHECK( magma_cmtransposeconjugate( ACSR, &BCSR, queue )); CHECK( magma_cmconvert( BCSR, B, Magma_CSR, A.storage_type, queue )); } cleanup: cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroy( handle ); magma_cmfree( &A_d, queue ); magma_cmfree( &B_d, queue ); magma_cmfree( &ACSR, queue ); magma_cmfree( &BCSR, queue ); if( info != 0 ){ magma_cmfree( B, queue ); } return info; }
magma_int_t
magma_ccustomicsetup(
    magma_c_matrix A,
    magma_c_matrix b,
    magma_c_preconditioner *precond,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    cusparseHandle_t cusparseHandle=NULL;
    cusparseMatDescr_t descrL=NULL;
    cusparseMatDescr_t descrU=NULL;
    magma_c_matrix hA={Magma_CSR};
    char preconditionermatrix[255];

    snprintf( preconditionermatrix, sizeof(preconditionermatrix),
              "/Users/hanzt0114cl306/work/matrices/ani/ani7_crop_ichol.mtx" );

    CHECK( magma_c_csr_mtx( &hA, preconditionermatrix , queue) );

    // for CUSPARSE
    CHECK( magma_cmtransfer( hA, &precond->M, Magma_CPU, Magma_DEV , queue ));

    // copy the matrix to precond->L and (transposed) to precond->U
    CHECK( magma_cmtransfer(precond->M, &(precond->L), Magma_DEV, Magma_DEV, queue ));
    CHECK( magma_cmtranspose( precond->L, &(precond->U), queue ));
    // extract the diagonal of L into precond->d
    CHECK( magma_cjacobisetup_diagscal( precond->L, &precond->d, queue ));
    CHECK( magma_cvinit( &precond->work1, Magma_DEV, hA.num_rows, 1, MAGMA_C_ZERO, queue ));
    // extract the diagonal of U into precond->d2
    CHECK( magma_cjacobisetup_diagscal( precond->U, &precond->d2, queue ));
    CHECK( magma_cvinit( &precond->work2, Magma_DEV, hA.num_rows, 1, MAGMA_C_ZERO, queue ));

    // CUSPARSE context //
    CHECK_CUSPARSE( cusparseCreate( &cusparseHandle ));
    CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL ));
    CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR ));
    CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT ));
    CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO ));
    CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER ));
    CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL ));
    // precond->M lives on the device, so the analysis must be fed the device
    // arrays dval/drow/dcol (the host val/row/col pointers are NULL here)
    CHECK_CUSPARSE( cusparseCcsrsv_analysis( cusparseHandle,
        CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL,
        precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL ));

    CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU ));
    CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR ));
    CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT ));
    CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO ));
    CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER ));
    CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU ));
    CHECK_CUSPARSE( cusparseCcsrsv_analysis( cusparseHandle,
        CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU,
        precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU ));

cleanup:
    cusparseDestroy( cusparseHandle );
    cusparseDestroyMatDescr( descrL );
    cusparseDestroyMatDescr( descrU );
    cusparseHandle=NULL;
    descrL=NULL;
    descrU=NULL;

    magma_cmfree( &hA, queue );

    return info;
}
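cusparseCcsrsv_analysis and its solve counterpart belong to the legacy csrsv family, removed in CUDA 11. On toolkits where they are gone, the csrsv2 interface performs the same analysis with an explicit workspace. A hedged sketch for the L factor only, reusing descrL and precond->M from above; error checks are omitted.

// sketch: the analysis pass with the csrsv2 interface that superseded
// cusparseCcsrsv_analysis; the buffer is sized by the library
csrsv2Info_t infoL;
int bufSize;
void *dBuf;
cusparseCreateCsrsv2Info(&infoL);
cusparseCcsrsv2_bufferSize(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                           precond->M.num_rows, precond->M.nnz, descrL,
                           precond->M.dval, precond->M.drow, precond->M.dcol,
                           infoL, &bufSize);
cudaMalloc(&dBuf, bufSize);
cusparseCcsrsv2_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                         precond->M.num_rows, precond->M.nnz, descrL,
                         precond->M.dval, precond->M.drow, precond->M.dcol,
                         infoL, CUSPARSE_SOLVE_POLICY_USE_LEVEL, dBuf);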
extern "C" magma_int_t magma_dcumiccsetup( magma_d_matrix A, magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrA=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; #if CUDA_VERSION >= 7000 csric02Info_t info_M=NULL; void *pBuffer = NULL; #endif magma_d_matrix hA={Magma_CSR}, hACSR={Magma_CSR}, U={Magma_CSR}; CHECK( magma_dmtransfer( A, &hA, A.memory_location, Magma_CPU, queue )); U.diagorder_type = Magma_VALUE; CHECK( magma_dmconvert( hA, &hACSR, hA.storage_type, Magma_CSR, queue )); // in case using fill-in if( precond->levels > 0 ){ magma_d_matrix hAL={Magma_CSR}, hAUt={Magma_CSR}; CHECK( magma_dsymbilu( &hACSR, precond->levels, &hAL, &hAUt, queue )); magma_dmfree(&hAL, queue); magma_dmfree(&hAUt, queue); } CHECK( magma_dmconvert( hACSR, &U, Magma_CSR, Magma_CSRL, queue )); magma_dmfree( &hACSR, queue ); CHECK( magma_dmtransfer(U, &(precond->M), Magma_CPU, Magma_DEV, queue )); // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrA )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &(precond->cuinfo) )); // use kernel to manually check for zeros n the diagonal CHECK( magma_ddiagcheck( precond->M, queue ) ); #if CUDA_VERSION >= 7000 // this version has the bug fixed where a zero on the diagonal causes a crash CHECK_CUSPARSE( cusparseCreateCsric02Info(&info_M) ); CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_GENERAL )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); int buffersize; int structural_zero; int numerical_zero; CHECK_CUSPARSE( cusparseDcsric02_bufferSize( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, &buffersize ) ); CHECK( magma_malloc((void**)&pBuffer, buffersize) ); CHECK_CUSPARSE( cusparseDcsric02_analysis( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer )); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &numerical_zero ) ); CHECK_CUSPARSE( cusparseXcsric02_zeroPivot( cusparseHandle, info_M, &structural_zero ) ); CHECK_CUSPARSE( cusparseDcsric02( cusparseHandle, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, info_M, CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer) ); #else // this version contains the bug but is needed for backward compability CHECK_CUSPARSE( cusparseSetMatType( descrA, CUSPARSE_MATRIX_TYPE_SYMMETRIC )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrA, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrA, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrA, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); CHECK_CUSPARSE( cusparseDcsric0( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, descrA, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfo )); #endif CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, 
CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); if( precond->maxiter < 50 ){ //prepare for iterative solves // copy the matrix to precond->L and (transposed) to precond->U CHECK( magma_dmtransfer(precond->M, &(precond->L), Magma_DEV, Magma_DEV, queue )); CHECK( magma_dmtranspose( precond->L, &(precond->U), queue )); // extract the diagonal of L into precond->d CHECK( magma_djacobisetup_diagscal( precond->L, &precond->d, queue )); CHECK( magma_dvinit( &precond->work1, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); // extract the diagonal of U into precond->d2 CHECK( magma_djacobisetup_diagscal( precond->U, &precond->d2, queue )); CHECK( magma_dvinit( &precond->work2, Magma_DEV, hA.num_rows, 1, MAGMA_D_ZERO, queue )); } /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; magma_d_matrix hD, hR, hAt CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: #if CUDA_VERSION >= 7000 magma_free( pBuffer ); cusparseDestroyCsric02Info( info_M ); #endif cusparseDestroySolveAnalysisInfo( precond->cuinfo ); cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroyMatDescr( descrA ); cusparseDestroy( cusparseHandle ); magma_dmfree(&U, queue ); magma_dmfree(&hA, queue ); return info; }
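For reference, applying the incomplete-Cholesky factor computed above as a preconditioner means two triangular solves, z = L^{-T} ( L^{-1} r ). A hedged sketch using the same legacy csrsm interface the setup code targets; d_r, d_tmp, and d_z are hypothetical device vectors of length precond->M.num_rows.

// sketch: IC preconditioner application with the analysis objects set up above
double one = 1.0;
cusparseDcsrsm_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                     precond->M.num_rows, 1, &one, descrL,
                     precond->M.dval, precond->M.drow, precond->M.dcol,
                     precond->cuinfoL, d_r, precond->M.num_rows,
                     d_tmp, precond->M.num_rows);    // d_tmp = L^{-1} r
cusparseDcsrsm_solve(cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE,
                     precond->M.num_rows, 1, &one, descrU,
                     precond->M.dval, precond->M.drow, precond->M.dcol,
                     precond->cuinfoU, d_tmp, precond->M.num_rows,
                     d_z, precond->M.num_rows);      // d_z = L^{-T} d_tmp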
int _tmain(int argc, _TCHAR* argv[])
{
    int M = 0, N = 0, nz = 0, *I = NULL, *J = NULL;
    cuDoubleComplex *val = NULL;
    cuDoubleComplex *x, *y;
    cuDoubleComplex *d_x, *d_y;
    double duration, duration_setup;

    std::clock_t setup_clock;
    setup_clock = std::clock();

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    if (devID < 0) {
        printf("no devices found...\n");
        exit(EXIT_SUCCESS);
    }
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
    int version = (deviceProp.major * 0x10 + deviceProp.minor);
    if (version < 0x11) {
        printf("Requires a minimum CUDA compute 1.1 capability\n");
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    M = N = 8388608;   //2 ^ 23
    //M = N = 4194304; //2 ^ 22
    //M = N = 2097152; //2 ^ 21
    //M = N = 1048576; //2 ^ 20
    //M = N = 524288;  //2 ^ 19
    nz = N * 8;
    I = (int *)malloc(sizeof(int)*(N + 1));
    J = (int *)malloc(sizeof(int)*nz);
    val = (cuDoubleComplex *)malloc(sizeof(cuDoubleComplex)*nz);
    genTridiag(I, J, val, N, nz);

    x = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex)* N);
    y = (cuDoubleComplex*)malloc(sizeof(cuDoubleComplex)* N);
    // create an array for the answer array (Y) and set all of the answers
    // to 0 for the test (could do random)
    for (int i = 0; i < N; i++) {
        y[i] = make_cuDoubleComplex(0.0, 0.0);
    }

    //Get handle to the CUBLAS context
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    //Get handle to the CUSPARSE context
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    //Get handle to a CUSPARSE matrix descriptor
    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);

    //Get handle to a matrix_solve_info object
    cusparseSolveAnalysisInfo_t info = 0;
    cusparseStatus = cusparseCreateSolveAnalysisInfo(&info);
    checkCudaErrors(cusparseStatus);

    // note: the descriptor is left as GENERAL, as in the original; a
    // triangular solve would typically also set fill mode and diag type
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    duration_setup = (std::clock() - setup_clock) / (double)CLOCKS_PER_SEC;
    printf("setup_time: %f\r\n", duration_setup);

    std::clock_t start;
    start = std::clock();

    // device copies of the matrix and vectors; the analysis and solve
    // routines expect device pointers, and the element type is
    // cuDoubleComplex, not float
    int *d_I, *d_J;
    cuDoubleComplex *d_val;
    checkCudaErrors(cudaMalloc((void **)&d_I, (N + 1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_J, nz*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, nz*sizeof(cuDoubleComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_x, N*sizeof(cuDoubleComplex)));
    checkCudaErrors(cudaMalloc((void **)&d_y, N*sizeof(cuDoubleComplex)));
    cudaMemcpy(d_I, I, (N + 1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_J, J, nz*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, val, nz*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x, N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, N*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

    //Analyze the matrix. The info variable is needed to perform additional
    //operations on the matrix. Note the argument order: values, then the
    //row pointer (d_I), then the column indices (d_J).
    cusparseStatus = cusparseZcsrsv_analysis(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                             N, nz, descr, d_val, d_I, d_J, info);

    //Uses info gathered from the analysis to solve the system; alpha must be
    //passed by pointer (the original passed a literal 0, i.e. a null pointer)
    cuDoubleComplex one = make_cuDoubleComplex(1.0, 0.0);
    cusparseStatus = cusparseZcsrsv_solve(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                          N, &one, descr, d_val, d_I, d_J, info, d_x, d_y);

    //Get the result back from the device (the solution is written to d_y)
    cudaMemcpy(y, d_y, N*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);

    duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
    printf("time elapsed: %f", duration);

    //free up memory
    cusparseDestroySolveAnalysisInfo(info);
    cusparseDestroyMatDescr(descr);
    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);
    free(I); free(J); free(val); free(x); free(y);
    cudaFree(d_I); cudaFree(d_J); cudaFree(d_val);
    cudaFree(d_x); cudaFree(d_y);
    cudaDeviceReset();

    //Wait for user input so they can see the results
    getchar();
    exit(0);
}
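A cheap sanity check for the solve above is to multiply the computed y back by the matrix on the device and compare against the original right-hand side. A hedged sketch using the legacy csrmv, placed just before the cleanup in the function above; d_chk is a scratch vector introduced here for illustration.

// sketch: verify the triangular solve by recomputing A*y on the device
cuDoubleComplex zero = make_cuDoubleComplex(0.0, 0.0);
cuDoubleComplex *d_chk;
checkCudaErrors(cudaMalloc((void **)&d_chk, N*sizeof(cuDoubleComplex)));
cusparseZcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
               &one, descr, d_val, d_I, d_J, d_y, &zero, d_chk);
// copy d_chk back and compare |d_chk[i] - x[i]| on the host
cudaFree(d_chk);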
extern "C" magma_int_t magma_dcumicgeneratesolverinfo( magma_d_preconditioner *precond, magma_queue_t queue ) { magma_int_t info = 0; cusparseHandle_t cusparseHandle=NULL; cusparseMatDescr_t descrL=NULL; cusparseMatDescr_t descrU=NULL; // CUSPARSE context // CHECK_CUSPARSE( cusparseCreate( &cusparseHandle )); CHECK_CUSPARSE( cusparseSetStream( cusparseHandle, queue->cuda_stream() )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrL )); CHECK_CUSPARSE( cusparseSetMatType( descrL, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrL, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrL, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrL, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoL )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrL, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoL )); CHECK_CUSPARSE( cusparseCreateMatDescr( &descrU )); CHECK_CUSPARSE( cusparseSetMatType( descrU, CUSPARSE_MATRIX_TYPE_TRIANGULAR )); CHECK_CUSPARSE( cusparseSetMatDiagType( descrU, CUSPARSE_DIAG_TYPE_NON_UNIT )); CHECK_CUSPARSE( cusparseSetMatIndexBase( descrU, CUSPARSE_INDEX_BASE_ZERO )); CHECK_CUSPARSE( cusparseSetMatFillMode( descrU, CUSPARSE_FILL_MODE_LOWER )); CHECK_CUSPARSE( cusparseCreateSolveAnalysisInfo( &precond->cuinfoU )); CHECK_CUSPARSE( cusparseDcsrsm_analysis( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, precond->M.nnz, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU )); /* // to enable also the block-asynchronous iteration for the triangular solves CHECK( magma_dmtransfer( precond->M, &hA, Magma_DEV, Magma_CPU, queue )); hA.storage_type = Magma_CSR; CHECK( magma_dcsrsplit( 256, hA, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->LD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->L, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); CHECK( magma_d_cucsrtranspose( hA, &hAt, queue )); CHECK( magma_dcsrsplit( 256, hAt, &hD, &hR, queue )); CHECK( magma_dmtransfer( hD, &precond->UD, Magma_CPU, Magma_DEV, queue )); CHECK( magma_dmtransfer( hR, &precond->U, Magma_CPU, Magma_DEV, queue )); magma_dmfree(&hD, queue ); magma_dmfree(&hR, queue ); magma_dmfree(&hA, queue ); magma_dmfree(&hAt, queue ); */ cleanup: cusparseDestroyMatDescr( descrL ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); return info; }
int main(int argc, char **argv)
{
    int N = 0, nz = 0, *I = NULL, *J = NULL;
    float *val = NULL;
    const float tol = 1e-5f;
    const int max_iter = 10000;
    float *x;
    float *rhs;
    float a, b, na, r0, r1;
    float dot;
    float *r, *p, *Ax;
    int k;
    float alpha, beta, alpham1;

    printf("Starting [%s]...\n", sSDKname);

    // This will pick the best possible CUDA capable device
    cudaDeviceProp deviceProp;
    int devID = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

#if defined(__APPLE__) || defined(MACOSX)
    fprintf(stderr, "Unified Memory not currently supported on OS X\n");
    cudaDeviceReset();
    exit(EXIT_WAIVED);
#endif

    if (sizeof(void *) != 8) {
        fprintf(stderr, "Unified Memory requires compiling for a 64-bit system.\n");
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    if (((deviceProp.major << 4) + deviceProp.minor) < 0x30) {
        fprintf(stderr, "%s requires Compute Capability of SM 3.0 or higher to run.\nexiting...\n", argv[0]);
        cudaDeviceReset();
        exit(EXIT_WAIVED);
    }

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);

    /* Generate a random tridiagonal symmetric matrix in CSR format */
    N = 1048576;
    nz = (N-2)*3 + 4;
    cudaMallocManaged((void **)&I, sizeof(int)*(N+1));
    cudaMallocManaged((void **)&J, sizeof(int)*nz);
    cudaMallocManaged((void **)&val, sizeof(float)*nz);
    genTridiag(I, J, val, N, nz);

    cudaMallocManaged((void **)&x, sizeof(float)*N);
    cudaMallocManaged((void **)&rhs, sizeof(float)*N);
    for (int i = 0; i < N; i++) {
        rhs[i] = 1.0;
        x[i] = 0.0;
    }

    /* Get handle to the CUBLAS context */
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);
    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);
    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);
    checkCudaErrors(cusparseStatus);
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    // temp memory for CG
    checkCudaErrors(cudaMallocManaged((void **)&r, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&p, N*sizeof(float)));
    checkCudaErrors(cudaMallocManaged((void **)&Ax, N*sizeof(float)));

    cudaDeviceSynchronize();

    for (int i = 0; i < N; i++) {
        r[i] = rhs[i];
    }

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                   N, N, nz, &alpha, descr, val, I, J, x, &beta, Ax);
    cublasSaxpy(cublasHandle, N, &alpham1, Ax, 1, r, 1);
    cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);

    k = 1;
    // r1 holds ||r||^2, hence the comparison against tol*tol
    while (r1 > tol*tol && k <= max_iter) {
        if (k > 1) {
            b = r1 / r0;
            cublasStatus = cublasSscal(cublasHandle, N, &b, p, 1);
            cublasStatus = cublasSaxpy(cublasHandle, N, &alpha, r, 1, p, 1);
        } else {
            cublasStatus = cublasScopy(cublasHandle, N, r, 1, p, 1);
        }
        cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                       N, N, nz, &alpha, descr, val, I, J, p, &beta, Ax);
        cublasStatus = cublasSdot(cublasHandle, N, p, 1, Ax, 1, &dot);
        a = r1 / dot;
        cublasStatus = cublasSaxpy(cublasHandle, N, &a, p, 1, x, 1);
        na = -a;
        cublasStatus = cublasSaxpy(cublasHandle, N, &na, Ax, 1, r, 1);
        r0 = r1;
        cublasStatus = cublasSdot(cublasHandle, N, r, 1, r, 1, &r1);
        cudaDeviceSynchronize();   // cudaThreadSynchronize() is deprecated
        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }

    printf("Final residual: %e\n", sqrt(r1));
    fprintf(stdout, "&&&& uvm_cg test %s\n", (sqrt(r1) < tol) ? "PASSED" : "FAILED");

    float rsum, diff, err = 0.0;
    for (int i = 0; i < N; i++) {
        rsum = 0.0;
        for (int j = I[i]; j < I[i+1]; j++) {
            rsum += val[j]*x[J[j]];
        }
        diff = fabs(rsum - rhs[i]);
        if (diff > err) {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);
    cudaFree(I);
    cudaFree(J);
    cudaFree(val);
    cudaFree(x);
    cudaFree(rhs);
    cudaFree(r);
    cudaFree(p);
    cudaFree(Ax);
    cudaDeviceReset();

    printf("Test Summary: Error amount = %f, result = %s\n",
           err, (k <= max_iter) ? "SUCCESS" : "FAILURE");
    exit((k <= max_iter) ? EXIT_SUCCESS : EXIT_FAILURE);
}
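Because val, I, J, x, rhs, r, p, and Ax are managed allocations touched from both host and device, the first CG iterations pay for on-demand page migration. On Pascal or newer hardware the buffers can be prefetched to the GPU before the loop; a minimal sketch, inserted after the host initialization above (devID is the device picked by findCudaDevice):

// sketch: prefetch the managed CG buffers to the GPU to avoid page faults
cudaMemPrefetchAsync(val, nz*sizeof(float), devID, NULL);
cudaMemPrefetchAsync(I, (N+1)*sizeof(int), devID, NULL);
cudaMemPrefetchAsync(J, nz*sizeof(int), devID, NULL);
cudaMemPrefetchAsync(x, N*sizeof(float), devID, NULL);
cudaMemPrefetchAsync(rhs, N*sizeof(float), devID, NULL);
cudaMemPrefetchAsync(r, N*sizeof(float), devID, NULL);
cudaMemPrefetchAsync(p, N*sizeof(float), devID, NULL);
cudaMemPrefetchAsync(Ax, N*sizeof(float), devID, NULL);
cudaDeviceSynchronize();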
extern "C" magma_int_t magma_zcuspaxpy( magmaDoubleComplex *alpha, magma_z_sparse_matrix A, magmaDoubleComplex *beta, magma_z_sparse_matrix B, magma_z_sparse_matrix *AB, magma_queue_t queue ) { if ( A.memory_location == Magma_DEV && B.memory_location == Magma_DEV && ( A.storage_type == Magma_CSR || A.storage_type == Magma_CSRCOO ) && ( B.storage_type == Magma_CSR || B.storage_type == Magma_CSRCOO ) ) { magma_z_sparse_matrix C; C.num_rows = A.num_rows; C.num_cols = A.num_cols; C.storage_type = A.storage_type; C.memory_location = A.memory_location; magma_int_t stat_dev = 0; C.val = NULL; C.col = NULL; C.row = NULL; C.rowidx = NULL; C.blockinfo = NULL; C.diag = NULL; C.dval = NULL; C.dcol = NULL; C.drow = NULL; C.drowidx = NULL; C.ddiag = NULL; // CUSPARSE context // cusparseHandle_t handle; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&handle); cusparseSetStream( handle, queue ); if (cusparseStatus != 0) printf("error in Handle.\n"); cusparseMatDescr_t descrA; cusparseMatDescr_t descrB; cusparseMatDescr_t descrC; cusparseStatus = cusparseCreateMatDescr(&descrA); cusparseStatus = cusparseCreateMatDescr(&descrB); cusparseStatus = cusparseCreateMatDescr(&descrC); if (cusparseStatus != 0) printf("error in MatrDescr.\n"); cusparseStatus = cusparseSetMatType(descrA,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatType(descrB,CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatType(descrC,CUSPARSE_MATRIX_TYPE_GENERAL); if (cusparseStatus != 0) printf("error in MatrType.\n"); cusparseStatus = cusparseSetMatIndexBase(descrA,CUSPARSE_INDEX_BASE_ZERO); cusparseSetMatIndexBase(descrB,CUSPARSE_INDEX_BASE_ZERO); cusparseSetMatIndexBase(descrC,CUSPARSE_INDEX_BASE_ZERO); if (cusparseStatus != 0) printf("error in IndexBase.\n"); // multiply A and B on the device magma_int_t baseC; // nnzTotalDevHostPtr points to host memory magma_index_t *nnzTotalDevHostPtr = (magma_index_t*) &C.nnz; cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST); stat_dev += magma_index_malloc( &C.drow, (A.num_rows + 1) ); cusparseXcsrgeamNnz(handle,A.num_rows, A.num_cols, descrA, A.nnz, A.drow, A.dcol, descrB, B.nnz, B.drow, B.dcol, descrC, C.row, nnzTotalDevHostPtr); if (NULL != nnzTotalDevHostPtr) { C.nnz = *nnzTotalDevHostPtr; } else { // workaround as nnz and base C are magma_int_t magma_index_t base_t, nnz_t; magma_index_getvector( 1, C.drow+C.num_rows, 1, &nnz_t, 1 ); magma_index_getvector( 1, C.drow, 1, &base_t, 1 ); C.nnz = (magma_int_t) nnz_t; baseC = (magma_int_t) base_t; C.nnz -= baseC; } stat_dev += magma_index_malloc( &C.dcol, C.nnz ); stat_dev += magma_zmalloc( &C.dval, C.nnz ); if( stat_dev != 0 ) { magma_z_mfree( &C, queue ); return MAGMA_ERR_DEVICE_ALLOC; } cusparseZcsrgeam(handle, A.num_rows, A.num_cols, alpha, descrA, A.nnz, A.dval, A.drow, A.dcol, beta, descrB, B.nnz, B.dval, B.drow, B.dcol, descrC, C.dval, C.drow, C.dcol); cusparseDestroyMatDescr( descrA ); cusparseDestroyMatDescr( descrB ); cusparseDestroyMatDescr( descrC ); cusparseDestroy( handle ); // end CUSPARSE context // magma_z_mtransfer( C, AB, Magma_DEV, Magma_DEV, queue ); magma_z_mfree( &C, queue ); return MAGMA_SUCCESS; } else { printf("error: CSRSPAXPY only supported on device and CSR format.\n"); return MAGMA_SUCCESS; } }
extern "C" magma_int_t magma_dapplycumicc_r( magma_d_vector b, magma_d_vector *x, magma_d_preconditioner *precond, magma_queue_t queue ) { double one = MAGMA_D_MAKE( 1.0, 0.0); // CUSPARSE context // cusparseHandle_t cusparseHandle; cusparseStatus_t cusparseStatus; cusparseStatus = cusparseCreate(&cusparseHandle); cusparseSetStream( cusparseHandle, queue ); if (cusparseStatus != 0) printf("error in Handle.\n"); cusparseMatDescr_t descrU; cusparseStatus = cusparseCreateMatDescr(&descrU); if (cusparseStatus != 0) printf("error in MatrDescr.\n"); cusparseStatus = cusparseSetMatType(descrU,CUSPARSE_MATRIX_TYPE_TRIANGULAR); if (cusparseStatus != 0) printf("error in MatrType.\n"); cusparseStatus = cusparseSetMatDiagType (descrU, CUSPARSE_DIAG_TYPE_NON_UNIT); if (cusparseStatus != 0) printf("error in DiagType.\n"); cusparseStatus = cusparseSetMatIndexBase(descrU,CUSPARSE_INDEX_BASE_ZERO); if (cusparseStatus != 0) printf("error in IndexBase.\n"); cusparseStatus = cusparseSetMatFillMode(descrU,CUSPARSE_FILL_MODE_LOWER); if (cusparseStatus != 0) printf("error in fillmode.\n"); // end CUSPARSE context // magma_int_t dofs = precond->M.num_rows; cusparseStatus = cusparseDcsrsm_solve( cusparseHandle, CUSPARSE_OPERATION_TRANSPOSE, precond->M.num_rows, b.num_rows*b.num_cols/precond->M.num_rows, &one, descrU, precond->M.dval, precond->M.drow, precond->M.dcol, precond->cuinfoU, b.dval, precond->M.num_rows, x->dval, precond->M.num_rows); if (cusparseStatus != 0) printf("error in U triangular solve:%d.\n", precond->cuinfoU ); cusparseDestroyMatDescr( descrU ); cusparseDestroy( cusparseHandle ); magma_device_sync(); return MAGMA_SUCCESS; }