magma_int_t
magma_cbaiter( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x,
               magma_c_solver_par *solver_par )
{
    // Block-asynchronous Jacobi iteration: identify the solver and reset status.
    solver_par->solver = Magma_BAITER;
    solver_par->info = 0;

    // Device copy of A plus the block-diagonal / off-diagonal split.
    magma_c_sparse_matrix A_d, D, R, D_d, R_d;
    magma_c_mtransfer( A, &A_d, Magma_CPU, Magma_DEV );

    // Residual of the initial guess x.
    float resid;
    magma_cresidual( A_d, b, *x, &resid );
    solver_par->init_res = resid;
    solver_par->res_vec = NULL;
    solver_par->timing = NULL;

    // Split A into a block-diagonal part D and the remainder R
    // (block size 256), then move both parts to the device.
    magma_ccsrsplit( 256, A, &D, &R );
    magma_c_mtransfer( D, &D_d, Magma_CPU, Magma_DEV );
    magma_c_mtransfer( R, &R_d, Magma_CPU, Magma_DEV );

    magma_int_t local_sweeps = 1;   // local Jacobi updates per global sweep

    magma_device_sync();
    real_Double_t t_start = magma_wtime();

    // Run maxiter global block-asynchronous sweeps.
    for ( int sweep = 0; sweep < solver_par->maxiter; sweep++ ) {
        magma_cbajac_csr( local_sweeps, D_d, R_d, b, x );
    }

    magma_device_sync();
    real_Double_t t_stop = magma_wtime();
    solver_par->runtime = (real_Double_t)( t_stop - t_start );

    // Final residual; report 0 if the residual decreased, -1 otherwise.
    magma_cresidual( A_d, b, *x, &resid );
    solver_par->final_res = resid;
    solver_par->numiter = solver_par->maxiter;
    solver_par->info = ( solver_par->init_res > solver_par->final_res ) ? 0 : -1;

    magma_c_mfree(&D);
    magma_c_mfree(&R);
    magma_c_mfree(&D_d);
    magma_c_mfree(&R_d);
    magma_c_mfree(&A_d);

    return MAGMA_SUCCESS;
}   /* magma_cbaiter */
/**
    Splits a CSR matrix A (block size bsize) into a block-diagonal part D
    (stored CSRD: diagonal entry first in each row) and the off-diagonal
    remainder R. Requires an explicit nonzero on every diagonal position.
    Non-CPU / non-CSR inputs are round-tripped through a CPU CSR copy.
    Returns 0 on success, -1 if a diagonal entry is missing; on error
    D and R are freed.
*/
extern "C" magma_int_t
magma_ccsrsplit(
    magma_int_t bsize,
    magma_c_matrix A,
    magma_c_matrix *D,
    magma_c_matrix *R,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_int_t i, k, j, nnz_diag, nnz_offd;

    // Null all output pointers first so the cleanup path can free
    // D and R safely no matter where we fail.
    D->val = NULL;
    D->col = NULL;
    D->row = NULL;
    D->rowidx = NULL;
    D->blockinfo = NULL;
    D->diag = NULL;
    D->dval = NULL;
    D->dcol = NULL;
    D->drow = NULL;
    D->drowidx = NULL;
    D->ddiag = NULL;
    R->val = NULL;
    R->col = NULL;
    R->row = NULL;
    R->rowidx = NULL;
    R->blockinfo = NULL;
    R->diag = NULL;
    R->dval = NULL;
    R->dcol = NULL;
    R->drow = NULL;
    R->drowidx = NULL;
    R->ddiag = NULL;

    if (  A.memory_location == Magma_CPU &&
            (   A.storage_type == Magma_CSR ||
                A.storage_type == Magma_CSRCOO ) )
    {
        nnz_diag = nnz_offd = 0;
        // Count nonzeros falling into the block-diagonal part (D) versus
        // the remainder (R), and verify every row has a diagonal entry.
        for( i=0; i<A.num_rows; i+=bsize ){
            for( k=i; k<min(A.num_rows,i+bsize); k++ ){
                int check = 0;
                for( j=A.row[k]; j<A.row[k+1]; j++ ){
                    if ( A.col[j] < i )
                        nnz_offd++;
                    else if ( A.col[j] < i+bsize ){
                        if( A.col[j] == k ){
                            check = 1;
                        }
                        nnz_diag++;
                    }
                    else
                        nnz_offd++;
                }
                if( check == 0 ){
                    // BUGFIX: report the offending row k, not the block
                    // start i, and cast explicitly — magma_int_t may be
                    // 64-bit, so "%d" would be undefined behavior.
                    printf("error: matrix contains zero on diagonal at (%lld,%lld).\n",
                           (long long) k, (long long) k );
                    info = -1;
                    goto cleanup;
                }
            }
        }

        // Allocate memory for the new matrices
        D->storage_type = Magma_CSRD;           // CSRD: diagonal stored first
        D->memory_location = A.memory_location;
        D->num_rows = A.num_rows;
        D->num_cols = A.num_cols;
        D->nnz = nnz_diag;

        R->storage_type = Magma_CSR;
        R->memory_location = A.memory_location;
        R->num_rows = A.num_rows;
        R->num_cols = A.num_cols;
        R->nnz = nnz_offd;

        CHECK( magma_cmalloc_cpu( &D->val, nnz_diag ));
        CHECK( magma_index_malloc_cpu( &D->row, A.num_rows+1 ));
        CHECK( magma_index_malloc_cpu( &D->col, nnz_diag ));
        CHECK( magma_cmalloc_cpu( &R->val, nnz_offd ));
        CHECK( magma_index_malloc_cpu( &R->row, A.num_rows+1 ));
        CHECK( magma_index_malloc_cpu( &R->col, nnz_offd ));

        // Fill up the new sparse matrices
        D->row[0] = 0;
        R->row[0] = 0;

        nnz_offd = nnz_diag = 0;
        for( i=0; i<A.num_rows; i+=bsize) {
            for( k=i; k<min(A.num_rows,i+bsize); k++ ) {
                D->row[k+1] = D->row[k];
                R->row[k+1] = R->row[k];
                for( j=A.row[k]; j<A.row[k+1]; j++ ) {
                    if ( A.col[j] < i ) {
                        R->val[nnz_offd] = A.val[j];
                        R->col[nnz_offd] = A.col[j];
                        R->row[k+1]++;
                        nnz_offd++;
                    }
                    else if ( A.col[j] < i+bsize ) {
                        // larger than diagonal remain as before
                        if ( A.col[j]>k ) {
                            D->val[nnz_diag] = A.val[ j ];
                            D->col[nnz_diag] = A.col[ j ];
                            D->row[k+1]++;
                        }
                        // diagonal is written first
                        else if ( A.col[j]==k ) {
                            D->val[D->row[k]] = A.val[ j ];
                            D->col[D->row[k]] = A.col[ j ];
                            D->row[k+1]++;
                        }
                        // smaller than diagonal are shifted one to the right
                        // to have room for the diagonal
                        else {
                            D->val[nnz_diag+1] = A.val[ j ];
                            D->col[nnz_diag+1] = A.col[ j ];
                            D->row[k+1]++;
                        }
                        nnz_diag++;
                    }
                    else {
                        R->val[nnz_offd] = A.val[j];
                        R->col[nnz_offd] = A.col[j];
                        R->row[k+1]++;
                        nnz_offd++;
                    }
                }
            }
        }
    }
    else {
        // Not a CPU-resident CSR matrix: convert to CPU CSR, split,
        // and convert/transfer the result back to the original format.
        magma_c_matrix Ah={Magma_CSR}, ACSR={Magma_CSR}, DCSR={Magma_CSR},
                       RCSR={Magma_CSR}, Dh={Magma_CSR}, Rh={Magma_CSR};
        CHECK( magma_cmtransfer( A, &Ah, A.memory_location, Magma_CPU, queue ));
        CHECK( magma_cmconvert( Ah, &ACSR, A.storage_type, Magma_CSR, queue ));

        CHECK( magma_ccsrsplit( bsize, ACSR, &DCSR, &RCSR, queue ));

        CHECK( magma_cmconvert( DCSR, &Dh, Magma_CSR, A.storage_type, queue ));
        CHECK( magma_cmconvert( RCSR, &Rh, Magma_CSR, A.storage_type, queue ));

        CHECK( magma_cmtransfer( Dh, D, Magma_CPU, A.memory_location, queue ));
        CHECK( magma_cmtransfer( Rh, R, Magma_CPU, A.memory_location, queue ));

        magma_cmfree( &Ah, queue );
        magma_cmfree( &ACSR, queue );
        magma_cmfree( &Dh, queue );
        magma_cmfree( &DCSR, queue );
        magma_cmfree( &Rh, queue );
        magma_cmfree( &RCSR, queue );
    }
cleanup:
    if( info != 0 ){
        magma_cmfree( D, queue );
        magma_cmfree( R, queue );
    }
    return info;
}
extern "C" magma_int_t
magma_cbaiter( magma_c_sparse_matrix A, magma_c_vector b, magma_c_vector *x,
               magma_c_solver_par *solver_par,
               magma_queue_t queue )
{
    // Block-asynchronous Jacobi solver: identify ourselves, assume success.
    solver_par->solver = Magma_BAITER;
    solver_par->info = MAGMA_SUCCESS;

    // Working copies: host CSR version of A and its device image,
    // plus the block-diagonal / off-diagonal split.
    magma_c_sparse_matrix Ah, ACSR, A_d, D, R, D_d, R_d;
    magma_c_mtransfer( A, &Ah, A.memory_location, Magma_CPU, queue );
    magma_c_mconvert( Ah, &ACSR, Ah.storage_type, Magma_CSR, queue );
    magma_c_mtransfer( ACSR, &A_d, Magma_CPU, Magma_DEV, queue );

    // Residual of the initial guess x.
    float resid;
    magma_cresidual( A_d, b, *x, &resid, queue );
    solver_par->init_res = resid;
    solver_par->res_vec = NULL;
    solver_par->timing = NULL;

    // Split into block-diagonal D and remainder R (block size 256),
    // then push both parts to the device.
    magma_ccsrsplit( 256, ACSR, &D, &R, queue );
    magma_c_mtransfer( D, &D_d, Magma_CPU, Magma_DEV, queue );
    magma_c_mtransfer( R, &R_d, Magma_CPU, Magma_DEV, queue );

    magma_int_t local_sweeps = 1;   // local Jacobi updates per global sweep

    real_Double_t t_start = magma_sync_wtime( queue );

    // Run maxiter global block-asynchronous sweeps.
    for ( int sweep = 0; sweep < solver_par->maxiter; sweep++ ) {
        magma_cbajac_csr( local_sweeps, D_d, R_d, b, x, queue );
    }

    real_Double_t t_stop = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t)( t_stop - t_start );

    // Final residual: success iff the residual norm decreased.
    magma_cresidual( A_d, b, *x, &resid, queue );
    solver_par->final_res = resid;
    solver_par->numiter = solver_par->maxiter;
    solver_par->info = ( solver_par->init_res > solver_par->final_res )
                           ? MAGMA_SUCCESS : MAGMA_DIVERGENCE;

    magma_c_mfree(&D, queue );
    magma_c_mfree(&R, queue );
    magma_c_mfree(&D_d, queue );
    magma_c_mfree(&R_d, queue );
    magma_c_mfree(&A_d, queue );
    magma_c_mfree(&ACSR, queue );
    magma_c_mfree(&Ah, queue );

    return MAGMA_SUCCESS;
}   /* magma_cbaiter */
/**
    Splits a CSR matrix A (block size bsize) into a block-diagonal part D
    (stored CSRD: diagonal entry placed first in each row) and the
    off-diagonal remainder R. Non-CPU / non-CSR inputs are round-tripped
    through a CPU CSR copy. Returns MAGMA_SUCCESS, or
    MAGMA_ERR_HOST_ALLOC if a host allocation fails (D and R are freed).
*/
extern "C" magma_int_t
magma_ccsrsplit(
    magma_int_t bsize,
    magma_c_sparse_matrix A,
    magma_c_sparse_matrix *D,
    magma_c_sparse_matrix *R,
    magma_queue_t queue )
{
    if (  A.memory_location == Magma_CPU &&
            (   A.storage_type == Magma_CSR ||
                A.storage_type == Magma_CSRCOO ) )
    {
        magma_int_t i, k, j, nnz_diag, nnz_offd;
        magma_int_t stat_cpu = 0;   // accumulates host-allocation status
        // (removed unused local stat_dev — no device allocation happens here)

        // Null all output pointers so a partial failure can be freed safely.
        D->val = NULL;
        D->col = NULL;
        D->row = NULL;
        D->rowidx = NULL;
        D->blockinfo = NULL;
        D->diag = NULL;
        D->dval = NULL;
        D->dcol = NULL;
        D->drow = NULL;
        D->drowidx = NULL;
        D->ddiag = NULL;
        R->val = NULL;
        R->col = NULL;
        R->row = NULL;
        R->rowidx = NULL;
        R->blockinfo = NULL;
        R->diag = NULL;
        R->dval = NULL;
        R->dcol = NULL;
        R->drow = NULL;
        R->drowidx = NULL;
        R->ddiag = NULL;

        nnz_diag = nnz_offd = 0;
        // Count nonzeros inside the block-diagonal band (D) versus the
        // off-diagonal remainder (R).
        for( i=0; i<A.num_rows; i+=bsize )
            for( k=i; k<min(A.num_rows,i+bsize); k++ )
                for( j=A.row[k]; j<A.row[k+1]; j++ )
                    if ( A.col[j] < i )
                        nnz_offd++;
                    else if ( A.col[j] < i+bsize )
                        nnz_diag++;
                    else
                        nnz_offd++;

        // Allocate memory for the new matrices
        D->storage_type = Magma_CSRD;           // CSRD: diagonal stored first
        D->memory_location = A.memory_location;
        D->num_rows = A.num_rows;
        D->num_cols = A.num_cols;
        D->nnz = nnz_diag;

        R->storage_type = Magma_CSR;
        R->memory_location = A.memory_location;
        R->num_rows = A.num_rows;
        R->num_cols = A.num_cols;
        R->nnz = nnz_offd;

        stat_cpu += magma_cmalloc_cpu( &D->val, nnz_diag );
        stat_cpu += magma_index_malloc_cpu( &D->row, A.num_rows+1 );
        stat_cpu += magma_index_malloc_cpu( &D->col, nnz_diag );
        stat_cpu += magma_cmalloc_cpu( &R->val, nnz_offd );
        stat_cpu += magma_index_malloc_cpu( &R->row, A.num_rows+1 );
        stat_cpu += magma_index_malloc_cpu( &R->col, nnz_offd );
        if( stat_cpu != 0 ){
            magma_c_mfree( D, queue );
            magma_c_mfree( R, queue );
            return MAGMA_ERR_HOST_ALLOC;
        }

        // Fill up the new sparse matrices
        D->row[0] = 0;
        R->row[0] = 0;

        nnz_offd = nnz_diag = 0;
        for( i=0; i<A.num_rows; i+=bsize) {
            for( k=i; k<min(A.num_rows,i+bsize); k++ ) {
                D->row[k+1] = D->row[k];
                R->row[k+1] = R->row[k];
                for( j=A.row[k]; j<A.row[k+1]; j++ ) {
                    if ( A.col[j] < i ) {
                        R->val[nnz_offd] = A.val[j];
                        R->col[nnz_offd] = A.col[j];
                        R->row[k+1]++;
                        nnz_offd++;
                    }
                    else if ( A.col[j] < i+bsize ) {
                        // larger than diagonal remain as before
                        if ( A.col[j]>k ) {
                            D->val[nnz_diag] = A.val[ j ];
                            D->col[nnz_diag] = A.col[ j ];
                            D->row[k+1]++;
                        }
                        // diagonal is written first
                        else if ( A.col[j]==k ) {
                            D->val[D->row[k]] = A.val[ j ];
                            D->col[D->row[k]] = A.col[ j ];
                            D->row[k+1]++;
                        }
                        // smaller than diagonal are shifted one to the right
                        // to have room for the diagonal
                        else {
                            D->val[nnz_diag+1] = A.val[ j ];
                            D->col[nnz_diag+1] = A.col[ j ];
                            D->row[k+1]++;
                        }
                        nnz_diag++;
                    }
                    else {
                        R->val[nnz_offd] = A.val[j];
                        R->col[nnz_offd] = A.col[j];
                        R->row[k+1]++;
                        nnz_offd++;
                    }
                }
            }
        }
        return MAGMA_SUCCESS;
    }
    else {
        // Not a CPU-resident CSR matrix: convert to CPU CSR, split there,
        // then convert/transfer the result back to the original format.
        magma_c_sparse_matrix Ah, ACSR, DCSR, RCSR, Dh, Rh;
        magma_c_mtransfer( A, &Ah, A.memory_location, Magma_CPU, queue );
        magma_c_mconvert( Ah, &ACSR, A.storage_type, Magma_CSR, queue );

        magma_ccsrsplit( bsize, ACSR, &DCSR, &RCSR, queue );

        magma_c_mconvert( DCSR, &Dh, Magma_CSR, A.storage_type, queue );
        magma_c_mconvert( RCSR, &Rh, Magma_CSR, A.storage_type, queue );

        magma_c_mtransfer( Dh, D, Magma_CPU, A.memory_location, queue );
        magma_c_mtransfer( Rh, R, Magma_CPU, A.memory_location, queue );

        magma_c_mfree( &Ah, queue );
        magma_c_mfree( &ACSR, queue );
        magma_c_mfree( &Dh, queue );
        magma_c_mfree( &DCSR, queue );
        magma_c_mfree( &Rh, queue );
        magma_c_mfree( &RCSR, queue );

        return MAGMA_SUCCESS;
    }
}
/**
    Overlapping block-asynchronous Jacobi iteration: builds
    precond_par->levels shifted block-diagonal splits of A and sweeps
    them on the device. Returns MAGMA_SUCCESS if the residual decreased,
    MAGMA_DIVERGENCE otherwise, MAGMA_ERR_NOT_SUPPORTED for an invalid
    overlap ratio, or the first failing CHECK status.
*/
extern "C" magma_int_t
magma_cbaiter_overlap(
    magma_c_matrix A,
    magma_c_matrix b,
    magma_c_matrix *x,
    magma_c_solver_par *solver_par,
    magma_c_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    // prepare solver feedback
    solver_par->solver = Magma_BAITERO;

    // some useful variables
    magmaFloatComplex c_zero = MAGMA_C_ZERO;

    real_Double_t tempo1, tempo2, runtime = 0;
    float residual;
    magma_int_t localiter = precond_par->maxiter;   // local sweeps per update

    magma_c_matrix Ah={Magma_CSR}, ACSR={Magma_CSR}, A_d={Magma_CSR},
                   r={Magma_CSR}, D={Magma_CSR}, R={Magma_CSR};

    // number of overlapping splits, one per shift
    magma_int_t matrices = precond_par->levels;

    // BUGFIX: pre-initialize every slot so the cleanup loop can free the
    // arrays even when we bail out before any transfer happened —
    // freeing uninitialized structs would pass garbage pointers.
    magma_c_matrix mzero = {Magma_CSR};
    magma_c_matrix D_d[ 256 ];
    magma_c_matrix R_d[ 256 ];
    for ( int i=0; i<256; i++ ) {
        D_d[ i ] = mzero;
        R_d[ i ] = mzero;
    }

    magma_int_t overlap;
    magma_int_t blocksize = 256;    // must match the D_d/R_d array size
    if ( matrices==2  || matrices==4  || matrices==8 || matrices==16 ||
         matrices==32 || matrices==64 || matrices==128 ) {
        overlap = blocksize/matrices;
    } else if ( matrices == 1 ) {
        overlap = 0;
    } else {
        printf("error: overlap ratio not supported.\n");
        // BUGFIX: report the actual reason instead of "not converged".
        info = MAGMA_ERR_NOT_SUPPORTED;
        goto cleanup;
    }

    CHECK( magma_cmtransfer( A, &Ah, A.memory_location, Magma_CPU, queue ));
    CHECK( magma_cmconvert( Ah, &ACSR, Ah.storage_type, Magma_CSR, queue ));
    CHECK( magma_cmtransfer( ACSR, &A_d, Magma_CPU, Magma_DEV, queue ));
    CHECK( magma_cvinit( &r, Magma_DEV, A.num_rows, b.num_cols, c_zero, queue ));
    CHECK( magma_cresidualvec( A_d, b, *x, &r, &residual, queue));
    solver_par->init_res = residual;
    if ( solver_par->verbose > 0 ) {
        // NOTE(review): assumes res_vec was allocated by the caller — verify.
        solver_par->res_vec[0] = (real_Double_t) residual;
    }

    // setup: one shifted block-diagonal/off-diagonal split per level
    for( int i=0; i<matrices; i++ ){
        CHECK( magma_ccsrsplit( i*overlap, 256, ACSR, &D, &R, queue ));
        CHECK( magma_cmtransfer( D, &D_d[i], Magma_CPU, Magma_DEV, queue ));
        CHECK( magma_cmtransfer( R, &R_d[i], Magma_CPU, Magma_DEV, queue ));
        magma_cmfree(&D, queue );
        magma_cmfree(&R, queue );
    }

    // residual/timing are logged every 'verbose' iterations (or once)
    magma_int_t iterinc;
    if ( solver_par->verbose == 0 ) {
        iterinc = solver_par->maxiter;
    }
    else {
        iterinc = solver_par->verbose;
    }
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // block-asynchronous iteration iterator
    do
    {
        tempo1 = magma_sync_wtime( queue );
        solver_par->numiter += iterinc;
        for( int z=0; z<iterinc; z++ ) {
            CHECK( magma_cbajac_csr_overlap( localiter, matrices, overlap,
                                             D_d, R_d, b, x, queue ));
        }
        tempo2 = magma_sync_wtime( queue );
        runtime += tempo2 - tempo1;
        if ( solver_par->verbose > 0 ) {
            CHECK( magma_cresidualvec( A_d, b, *x, &r, &residual, queue));
            solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                = (real_Double_t) residual;
            solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                = (real_Double_t) runtime;
        }
    }
    while ( solver_par->numiter+1 <= solver_par->maxiter );

    solver_par->runtime = runtime;
    CHECK( magma_cresidual( A_d, b, *x, &residual, queue));
    solver_par->final_res = residual;
    solver_par->numiter = solver_par->maxiter;

    if ( solver_par->init_res > solver_par->final_res ) {
        info = MAGMA_SUCCESS;
    }
    else {
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    magma_cmfree(&r, queue );
    magma_cmfree(&D, queue );
    magma_cmfree(&R, queue );
    // BUGFIX: clamp to the array size — on the unsupported-ratio error
    // path 'matrices' may exceed 256 and would overrun D_d/R_d.
    for( int i=0; i<matrices && i<256; i++ ){
        magma_cmfree(&D_d[i], queue );
        magma_cmfree(&R_d[i], queue );
    }
    magma_cmfree(&A_d, queue );
    magma_cmfree(&ACSR, queue );
    magma_cmfree(&Ah, queue );

    solver_par->info = info;
    return info;
}   /* magma_cbaiter_overlap */