BASKER_INLINE int Basker<Int,Entry, Exe_Space>::find_btf(BASKER_MATRIX &M) { Int nblks = 0; strong_component(M,nblks,order_btf_array,btf_tabs); btf_flag = BASKER_TRUE; #ifdef BASKER_DEBUG_ORDER_BTF printf("BTF nblks returned: %d \n", nblks); BASKER_ASSERT(nblks>1, "NOT ENOUGH BTF BLOCKS"); #endif #ifdef BASKER_DEBUG_ORDER_BTF if(nblks<2) { printf("BTF did not find enough blks\n"); } #endif #ifdef BASKER_DEBUG_ORDER_BTF /* printf("\nBTF perm: \n"); for(Int i=0; i <M.nrow; i++) { printf("%d, ", order_btf_array(i)); //printf("%d, ", btf_perm(i)); } */ printf("\n\nBTF tabs: \n"); for(Int i=0; i < nblks+1; i++) { printf("%d, ", btf_tabs(i)); } printf("\n"); #endif permute_col(M, order_btf_array); permute_row(M, order_btf_array); break_into_parts(M, nblks, btf_tabs); btf_nblks = nblks; //#ifdef BASKER_DEBUG_ORDER_BTF printf("------------BTF CUT: %d --------------\n", btf_tabs(btf_tabs_offset)); //#endif return 0; }//end find BTF
// Subtracts the contribution of one BTF block's columns of M from y:
//   y(row + M.srow) -= M.val(nz) * x(col + M.scol)
// for every stored entry in the local column range of block `tab` whose
// gperm-mapped row index is <= the row cut-off.
//
// tab    : index into btf_tabs selecting the block; local columns run from
//          btf_tabs(tab) - M.scol up to (not including) btf_tabs(tab+1) - M.scol.
// M      : matrix whose col_ptr / row_idx / val arrays are traversed.
// x      : read at (local column + M.scol).
// y      : updated in place.
// return : always 0.
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::spmv_BTF
(
 Int tab,
 BASKER_MATRIX &M,
 ENTRY_1DARRAY x,
 ENTRY_1DARRAY y
)
{
  const Int bcol = btf_tabs(tab) - M.scol;
  const Int brow = M.srow;
  const Int ecol = btf_tabs(tab+1) - M.scol;

  // Row cut-off: entries whose permuted row exceeds this are skipped.
  const Int erow = (tab > 0) ? btf_tabs(tab) : (brow-1);

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("BTF_UPDATE, TAB: %d [%d %d] [%d %d] \n",
         tab, brow, erow, bcol, ecol);
#endif

  for(Int col = bcol; col < ecol; ++col)
  {
    const Int nz_end = M.col_ptr(col+1);
    for(Int nz = M.col_ptr(col); nz < nz_end; ++nz)
    {
      const Int row = gperm(M.row_idx(nz));

      // Skip (rather than break) on rows past the cut-off; the original
      // notes that breaking fails for a single column.
      if(row <= erow)
      {
#ifdef BASKER_DEBUG_SOLVE_RHS
        printf("BTF_UPDATE-val, j: %d y: %f x: %f, val: %f \n",
               row, y[row], x[col+M.scol], M.val[nz]);
#endif
        y(row+brow) -= M.val(nz)*x(col+M.scol);
      }
    }//over all nnz in column
  }//over all columns of the block

  return 0;
}//end spmv_BTF();
// Serial solve over the BTF structure.
//
// Sequence performed:
//   1. x <- y and y <- 0 for the first gn entries.
//   2. For each block b from (btf_nblks - btf_tabs_offset) - 1 down to 0:
//      lower_tri_solve with LBTF(b), upper_tri_solve with UBTF(b), then
//      spmv_BTF(b + btf_tabs_offset, BTF_C, y, x).
//   3. If btf_tabs_offset != 0, neg_spmv(BTF_B, y, x).
//   4. serial_forward_solve(x, y) followed by serial_backward_solve(y, x).
//   5. x(i) <- y(i) for i in [btf_tabs(btf_tabs_offset), gn).
//
// y      : on entry the right-hand side; used as workspace afterwards.
// x      : receives the result.
// return : always 0.
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::serial_btf_solve
(
 ENTRY_1DARRAY y,
 ENTRY_1DARRAY x
)
{
  // Move the input into x and clear y.
  for(Int idx = 0; idx < gn; ++idx)
  {
    x(idx) = y(idx);
    y(idx) = (Entry) 0.0;
  }

  // Walk the trailing blocks from last to first.
  const Int num_c_blks = btf_nblks - btf_tabs_offset;
  for(Int blk = num_c_blks - 1; blk >= 0; --blk)
  {
#ifdef BASKER_DEBUG_SOLVE_RHS
    printf("\n\n btf b: %d \n", blk);
#endif

    // L\x -> y
    lower_tri_solve(LBTF(blk), x, y);

    // U\x -> y
    upper_tri_solve(UBTF(blk), x, y);

#ifdef BASKER_DEBUG_SOLVE_RHS
    printf("Before spmv\n");
    printf("Inner Vector y print\n");
    printVec(y, gn);
    printf("Inner Vector x print\n");
    printVec(x, gn);
    printf("\n");
#endif

    // Update with this block's columns of BTF_C (x = BTF_C*y). The
    // original had a disabled "b > btf_tabs_offset" guard here; the update
    // runs for every block.
    spmv_BTF(blk + btf_tabs_offset, BTF_C, y, x);

#ifdef BASKER_DEBUG_SOLVE_RHS
    printf("After spmv\n");
    printf("Inner Vector y print\n");
    printVec(y, gn);
    printf("Inner Vector x print\n");
    printVec(x, gn);
#endif
  }

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("Done, BTF-C Solve \n");
  printf("\n x \n");
  printVec(x, gn);
  printf("\n y \n");
  printVec(y, gn);
  printf("\n\n");
#endif

  // Update from BTF_B (BTF_B*y -> x); skipped when the offset is zero.
  if(btf_tabs_offset != 0)
  {
    neg_spmv(BTF_B, y, x);
  }

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("Done, SPMV BTF_B UPDATE \n");
  printf("\n x \n");
  printVec(x, gn);
  printf("\n y \n");
  printVec(y, gn);
  printf("\n\n");
#endif

  // Forward then backward solve: L\x -> y, U\y -> x.
  serial_forward_solve(x, y);
  serial_backward_solve(y, x);

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("copying lower starting: %d \n",
         btf_tabs[btf_tabs_offset]);
#endif

  // Copy the lower part of y down into x.
  // (Original author's note: "Comeback and fix".)
  for(Int idx = btf_tabs(btf_tabs_offset); idx < gn; ++idx)
  {
    x(idx) = y(idx);
  }

  return 0;
}//end serial_btf_solve
// Computes a BTF ordering of M, refines it with a per-block AMD ordering,
// and then partitions and schedules the result.
//
// Steps, in order:
//   1. strong_component() is called with nblks, order_btf_array and btf_tabs;
//      btf_nblks and btf_flag are set immediately afterwards (btf_blk_amd()
//      below reads btf_nblks).
//   2. order_btf_array is applied to the columns and rows of M.
//   3. order_blk_amd_array (M.ncol entries), btf_blk_nnz and btf_blk_work
//      (nblks+1 entries each) are allocated and zero-initialised, then filled
//      by btf_blk_amd().
//   4. order_blk_amd_array is applied to the columns and rows of M and the
//      matrix is passed to sort_matrix().
//   5. break_into_parts2() and find_btf_schedule() are called.
//
// M      : matrix to order; permuted in place.
// return : always 0.
//
// Int is a template parameter, so every integer passed to printf below is
// cast to long long and printed with %lld; "%d" is undefined behaviour when
// the value is wider than int.
BASKER_INLINE
int Basker<Int,Entry, Exe_Space>::find_btf2
(
 BASKER_MATRIX &M
)
{
  Int nblks = 0;

  strong_component(M, nblks, order_btf_array, btf_tabs);

  btf_nblks = nblks;
  btf_flag  = BASKER_TRUE;

  // NOTE(review): the prints of nblks, num_threads and the BTF tabs are
  // unconditional because their BASKER_DEBUG_ORDER_BTF guards were commented
  // out in the original; that behaviour is kept.
  printf("BTF nblks returned: %lld \n", static_cast<long long>(nblks));

#ifdef BASKER_DEBUG_ORDER_BTF
  if(nblks < 2)
  {
    printf("BTF did not find enough blks\n");
  }
#endif

  printf("num_threads: %lld \n", static_cast<long long>(num_threads));
  printf("\n\nBTF tabs: \n");
  for(Int i = 0; i < nblks+1; i++)
  {
    printf("%lld, ", static_cast<long long>(btf_tabs(i)));
  }
  printf("\n");

  // Apply the BTF permutation to columns and rows.
  permute_col(M, order_btf_array);
  permute_row(M, order_btf_array);

  MALLOC_INT_1DARRAY(order_blk_amd_array, M.ncol);
  init_value(order_blk_amd_array, M.ncol, (Int)0);
  MALLOC_INT_1DARRAY(btf_blk_nnz, nblks+1);
  init_value(btf_blk_nnz, nblks+1, (Int) 0);
  MALLOC_INT_1DARRAY(btf_blk_work, nblks+1);
  init_value(btf_blk_work, nblks+1, (Int) 0);

  // Find AMD blk ordering, get nnz, and get work
  btf_blk_amd(M, order_blk_amd_array, btf_blk_nnz, btf_blk_work);

#ifdef BASKER_DEBUG_ORDER_BTF
  printf("blk_perm:\n");
  for(Int i = 0; i < M.ncol; i++)
  {
    printf("(%lld,%lld) ",
           static_cast<long long>(i),
           static_cast<long long>(order_blk_amd_array(i)));
  }
  printf("\n");
  printf("id/blk_size/blk_nnz/work: \n");
  for(Int i = 0; i < nblks; i++)
  {
    printf("(%lld, %lld, %lld, %lld) ",
           static_cast<long long>(i),
           static_cast<long long>(btf_tabs(i+1)-btf_tabs(i)),
           static_cast<long long>(btf_blk_nnz(i)),
           static_cast<long long>(btf_blk_work(i)));
  }
  printf("\n");
#endif

  // Apply the per-block AMD permutation to columns and rows, then sort.
  permute_col(M, order_blk_amd_array);
  permute_row(M, order_blk_amd_array);
  sort_matrix(M);

  break_into_parts2(M, nblks, btf_tabs);

  //find schedule
  find_btf_schedule(M, nblks, btf_tabs);

#ifdef BASKER_DEBUG_ORDER_BTF
  printf("------------BTF CUT: %lld --------------\n",
         static_cast<long long>(btf_tabs(btf_tabs_offset)));
#endif

  return 0;
}//end find BTF(nnz)
// Builds a permutation p by ordering each BTF diagonal block with AMD, and
// records a fill (nnz) and work figure per block.
//
// M        : matrix whose col_ptr / row_idx are read (ncol, nnz, scol used).
// p        : output permutation, written for every block's index range.
// btf_nnz  : output, one entry per block.
// btf_work : output, one entry per block.
//
// Behaviour per case:
//   * Options.incomplete == BASKER_TRUE: p is the identity and every block
//     gets nnz = work = 1. The original authors note that AMD gave poor
//     results (zeros on the diagonal) for the incomplete factorization and
//     that BTF is not supported there.
//   * Blocks with fewer than 3 rows: entries are set to their own index
//     minus M.scol; work = size^3, nnz = .5*size^2 + size.
//   * Otherwise: the block's entries with row >= block start are copied into
//     a temporary compressed-column matrix and passed to
//     BaskerSSWrapper<Int>::amd_order; the l_nnz / lu_work values it is given
//     are stored as the block's nnz / work (presumably estimates produced by
//     AMD -- confirm against the wrapper).
void Basker<Int,Entry,Exe_Space>::btf_blk_amd
(
 BASKER_MATRIX &M,
 INT_1DARRAY p,
 INT_1DARRAY btf_nnz,
 INT_1DARRAY btf_work
)
{
  if(Options.incomplete == BASKER_TRUE)
  {
    // Natural ordering.
    for(Int col = 0; col < M.ncol; ++col)
    {
      p(col) = col;
    }
    // Placeholder cost of 1 per block.
    for(Int blk = 0; blk < btf_nblks; ++blk)
    {
      btf_nnz(blk)  = 1;
      btf_work(blk) = 1;
    }
    return;
  }

  // Scratch compressed-column storage, sized for the whole matrix and reused
  // for every block. (Original authors flag this serial scan as something to
  // parallelise / improve later.)
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int blk = 0; blk < btf_nblks; ++blk)
  {
    const Int blk_start = btf_tabs(blk);
    const Int blk_end   = btf_tabs(blk+1);
    Int blk_size        = blk_end - blk_start;

    if(blk_size < 3)
    {
      // Too small to order: keep the block's own order.
      for(Int off = 0; off < blk_size; ++off)
      {
        p(off+blk_start) = blk_start+off-M.scol;
      }
      btf_work(blk) = blk_size*blk_size*blk_size;
      btf_nnz(blk)  = (.5*(blk_size*blk_size) + blk_size);
      continue;
    }

    INT_1DARRAY tempp;
    MALLOC_INT_1DARRAY(tempp, blk_size+1);

    // Copy the block into temp_col / temp_row with block-local indices,
    // dropping entries whose row lies before the block.
    Int nnz = 0;
    temp_col(0) = 0;
    for(Int k = blk_start; k < blk_end; ++k)
    {
      for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i)
      {
        const Int row = M.row_idx(i);
        if(row >= blk_start)
        {
          temp_row(nnz) = row - blk_start;
          ++nnz;
        }
      }// end over all row_idx
      temp_col(k - blk_start + 1) = nnz;
    }//end over all columns k

#ifdef BASKER_DEBUG_ORDER_AMD
    printf("col_ptr: ");
    for(Int i = 0; i < blk_size+1; i++)
    {
      printf("%d, ", temp_col(i));
    }
    printf("\n");
    printf("row_idx: ");
    for(Int i = 0; i < nnz; i++)
    {
      printf("%d, ", temp_row(i));
    }
    printf("\n");
#endif

    double l_nnz   = 0;
    double lu_work = 0;
    BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                    &(temp_row(0)), &(tempp(0)),
                                    l_nnz, lu_work);

    btf_nnz(blk)  = l_nnz;
    btf_work(blk) = lu_work;

#ifdef BASKER_DEBUG_ORDER_AMD
    printf("blk: %d order: \n", blk);
    for(Int off = 0; off < blk_size; off++)
    {
      printf("%d, ", tempp(off));
    }
#endif

    // Scatter the block-local ordering into the global permutation.
    for(Int off = 0; off < blk_size; ++off)
    {
      p(tempp(off)+blk_start) = off+blk_start;
    }

    FREE_INT_1DARRAY(tempp);
  }//over all blk_tabs

  // NOTE(review): this guard is spelled BASKER_DEBUG_AMD_ORDER while the
  // other debug sections in this function use BASKER_DEBUG_ORDER_AMD --
  // confirm whether that is intentional.
#ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int col = 0; col < M.ncol; col++)
  {
    printf("%d, ", p(col));
  }
  printf("\n");
#endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()
// Builds a permutation p by ordering each BTF diagonal block, starting at
// block index btf_tabs_offset, with AMD.
//
// M : matrix whose col_ptr / row_idx are read (ncol, nnz, scol used).
// p : output permutation; only the index ranges of blocks
//     btf_tabs_offset .. btf_nblks-1 are written.
//
// Blocks with fewer than 3 rows keep their own order (index minus M.scol).
// Larger blocks are copied, with block-local indices and without entries
// whose row precedes the block, into a temporary compressed-column matrix
// that is handed to BaskerSSWrapper<Int>::amd_order.
void Basker<Int,Entry,Exe_Space>::blk_amd(BASKER_MATRIX &M, INT_1DARRAY p)
{
  // Scratch compressed-column storage, sized for the whole matrix and reused
  // for every block. (Original authors flag this serial scan as something to
  // parallelise / improve later.)
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int blk = btf_tabs_offset; blk < btf_nblks; ++blk)
  {
    const Int blk_start = btf_tabs(blk);
    const Int blk_end   = btf_tabs(blk+1);
    Int blk_size        = blk_end - blk_start;

    if(blk_size < 3)
    {
      // Too small to order: keep the block's own order.
      for(Int off = 0; off < blk_size; ++off)
      {
        p(off+blk_start) = blk_start+off-M.scol;
      }
      continue;
    }

    INT_1DARRAY tempp;
    MALLOC_INT_1DARRAY(tempp, blk_size+1);

    // Copy the block into temp_col / temp_row with block-local indices.
    Int nnz = 0;
    temp_col(0) = 0;
    for(Int k = blk_start; k < blk_end; ++k)
    {
      for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i)
      {
        const Int row = M.row_idx(i);
        if(row >= blk_start)
        {
          temp_row(nnz) = row - blk_start;
          ++nnz;
        }
      }// end over all row_idx
      temp_col(k - blk_start + 1) = nnz;
    }//end over all columns k

#ifdef BASKER_DEBUG_ORDER_AMD
    printf("col_ptr: ");
    for(Int i = 0; i < blk_size+1; i++)
    {
      printf("%d, ", temp_col(i));
    }
    printf("\n");
    printf("row_idx: ");
    for(Int i = 0; i < nnz; i++)
    {
      printf("%d, ", temp_row(i));
    }
    printf("\n");
#endif

    BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                    &(temp_row(0)), &(tempp(0)));

#ifdef BASKER_DEBUG_ORDER_AMD
    printf("blk: %d order: \n", blk);
    for(Int off = 0; off < blk_size; off++)
    {
      printf("%d, ", tempp(off));
    }
#endif

    // Scatter the block-local ordering into the global permutation.
    for(Int off = 0; off < blk_size; ++off)
    {
      p(tempp(off)+blk_start) = off+blk_start;
    }

    FREE_INT_1DARRAY(tempp);
  }//over all blk_tabs

  // NOTE(review): this guard is spelled BASKER_DEBUG_AMD_ORDER while the
  // other debug sections in this function use BASKER_DEBUG_ORDER_AMD --
  // confirm whether that is intentional.
#ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int col = 0; col < M.ncol; col++)
  {
    printf("%d, ", p(col));
  }
  printf("\n");
#endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()