// Relabels the stored row indices of M in place: each index r held in
// M.row_idx is replaced by row[r].
//
//   M    matrix whose row_idx array (M.nnz entries) is rewritten
//   row  lookup table indexed by the current row index
//
// Always returns 0.
BASKER_INLINE
int Basker<Int,Entry, Exe_Space>::permute_row
(
 BASKER_MATRIX &M,
 INT_1DARRAY row
)
{
  // A matrix without stored entries has nothing to relabel.
  if(M.nnz == 0)
    {
      return 0;
    }

  Int num_entries = M.nnz;

  // Scratch buffer for the relabelled indices, zero-filled before use.
  INT_1DARRAY mapped_idx;
  MALLOC_INT_1DARRAY(mapped_idx, num_entries);
  init_value(mapped_idx, num_entries, (Int)0);

  // Look every stored row index up in the table.
  for(Int e = 0; e < num_entries; ++e)
    {
      mapped_idx[e] = row[M.row_idx[e]];
    }

  // Write the relabelled indices back into the matrix.
  for(Int e = 0; e < num_entries; ++e)
    {
      M.row_idx[e] = mapped_idx[e];
    }

  FREE_INT_1DARRAY(mapped_idx);

  return 0;
}//end permute_row(matrix,int)
// Releases the storage owned by this matrix.
//
// The CSC arrays (col_ptr, row_idx, val) are released only when v_fill is
// set, and the workspace arrays (iws, ews) only when w_fill is set.  Each
// flag is cleared after its arrays are released, so calling Finalize()
// again does not release them a second time.
BASKER_INLINE
void BaskerMatrix<Int,Entry,Exe_Space>::Finalize()
{
  if(v_fill == BASKER_TRUE)
    {
      FREE_INT_1DARRAY(col_ptr);
      FREE_INT_1DARRAY(row_idx);
      FREE_ENTRY_1DARRAY(val);
      v_fill = BASKER_FALSE;
    }

  // This must be a comparison.  An assignment here would make the branch
  // unconditional and release workspace that was never allocated.
  if(w_fill == BASKER_TRUE)
    {
      FREE_INT_1DARRAY(iws);
      FREE_ENTRY_1DARRAY(ews);
      w_fill = BASKER_FALSE;
    }
}//end finalize()
// Hands the two total permutation vectors back to the caller as plain
// heap arrays.
//
//   lp  out: *lp is set to a new[]-allocated array of gn entries
//   rp  out: *rp is set to a new[]-allocated array of gn entries
//
// The contents come from get_total_perm().  Both arrays are allocated with
// new[] here and are not released by this function.
// Returns BASKER_SUCCESS.
int Basker<Int,Entry,Exe_Space>::GetPerm(Int **lp, Int **rp)
{
  // Temporary arrays that get_total_perm() fills in.
  INT_1DARRAY left_perm;
  MALLOC_INT_1DARRAY(left_perm, gn);
  INT_1DARRAY right_perm;
  MALLOC_INT_1DARRAY(right_perm, gn);

  get_total_perm(left_perm, right_perm);

  // Allocate the caller-visible arrays.
  (*lp) = new Int[gn];
  (*rp) = new Int[gn];

  Int *const lp_out = (*lp);
  Int *const rp_out = (*rp);

  // Copy element by element out of the temporaries.
  for(Int idx = 0; idx < gn; ++idx)
    {
      lp_out[idx] = left_perm(idx);
      rp_out[idx] = right_perm(idx);
    }

  FREE_INT_1DARRAY(left_perm);
  FREE_INT_1DARRAY(right_perm);

  return BASKER_SUCCESS;
}//end GetPerm()
// Releases everything the solver object owns: the input and BTF matrices,
// the 2D block arrays, the BTF bookkeeping arrays, the thread array and
// barrier, the permutation vectors, and the tree/statistics structures.
//
// Each optional ordering array is released only when its flag is set, and
// the flag is cleared afterwards so that a repeated Finalize() does not
// release the same array twice.
BASKER_INLINE
void Basker<Int,Entry,Exe_Space>::Finalize()
{
  //finalize all matrices
  A.Finalize();
  At.Finalize(); //??? is At even used
  BTF_A.Finalize();
  BTF_C.Finalize();
  BTF_B.Finalize();
  BTF_D.Finalize();
  BTF_E.Finalize();

  //finalize arrays of 2d matrices
  FREE_MATRIX_VIEW_2DARRAY(AV, tree.nblks);
  FREE_MATRIX_VIEW_2DARRAY(AL, tree.nblks);
  FREE_MATRIX_2DARRAY(AVM, tree.nblks);
  FREE_MATRIX_2DARRAY(ALM, tree.nblks);

  FREE_MATRIX_2DARRAY(LL, tree.nblks);
  FREE_MATRIX_2DARRAY(LU, tree.nblks);

  FREE_INT_1DARRAY(LL_size);
  FREE_INT_1DARRAY(LU_size);

  //BTF structure
  FREE_INT_1DARRAY(btf_tabs);
  FREE_INT_1DARRAY(btf_blk_work);
  FREE_INT_1DARRAY(btf_blk_nnz);
  FREE_MATRIX_1DARRAY(LBTF);
  FREE_MATRIX_1DARRAY(UBTF);

  //Thread Array
  FREE_THREAD_1DARRAY(thread_array);
  basker_barrier.Finalize();

  //S (Check on this)
  FREE_INT_2DARRAY(S, tree.nblks);

  //Permutations
  FREE_INT_1DARRAY(gperm);
  FREE_INT_1DARRAY(gpermi);
  if(match_flag == BASKER_TRUE)
    {
      FREE_INT_1DARRAY(order_match_array);
      match_flag = BASKER_FALSE;
    }
  if(btf_flag == BASKER_TRUE)
    {
      FREE_INT_1DARRAY(order_btf_array);
      btf_flag = BASKER_FALSE;
    }
  if(nd_flag == BASKER_TRUE)
    {
      FREE_INT_1DARRAY(order_scotch_array);
      // Assignment, not comparison: the flag has to be cleared.
      nd_flag = BASKER_FALSE;
    }
  if(amd_flag == BASKER_TRUE)
    {
      FREE_INT_1DARRAY(order_csym_array);
      // Assignment, not comparison: the flag has to be cleared.
      amd_flag = BASKER_FALSE;
    }

  //Structures
  part_tree.Finalize();
  tree.Finalize();
  stree.Finalize();
  stats.Finalize();
}//end Finalize()
// Runs the numeric factorization in three phases:
//   1. domain blocks   (only when btf_tabs_offset != 0),
//   2. separator blocks, one tree level at a time
//                      (only when btf_tabs_offset != 0),
//   3. diagonal BTF blocks (only when Options.btf is set).
//
// After every parallel launch the matching nfactor_*_error() routine is
// polled.  The phase is relaunched until that routine reports
// BASKER_SUCCESS or BASKER_ERROR, or until the restart counter exceeds
// BASKER_RESTART.
//
// When Options.btf is set, A is temporarily replaced by BTF_A for the
// duration of the call and restored before returning.
//
//   option  not used by this routine
//
// Always returns 0.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option)
{
  gn = A.ncol;
  gm = A.nrow;
  BASKER_MATRIX ATEMP;

  if(Options.btf == BASKER_TRUE)
    {
      //JDB: We can change this for the new interface
      gn = A.ncol;
      gm = A.nrow;
      ATEMP = A;
      A = BTF_A;
    }

  //Split into Domain and Sep
  //----------------------Domain-------------------------//
  #ifdef BASKER_KOKKOS

  //====TIMER==
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  //===TIMER===

  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;

  if(btf_tabs_offset != 0)
    {
      if(Options.verbose == BASKER_TRUE)
        {
          printf("Factoring Dom num_threads: %d \n",
                 num_threads);
        }

      Int domain_restart = 0;
      kokkos_nfactor_domain <Int,Entry,Exe_Space>
        domain_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           domain_nfactor);
      Kokkos::fence();

      //=====Check for error======
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1,
                     (Int) BASKER_MAX_IDX);
          int nt = nfactor_domain_error(thread_start);
          if((nt == BASKER_SUCCESS) ||
             (nt == BASKER_ERROR) ||
             (domain_restart > BASKER_RESTART))
            {
              // Release the per-iteration scratch array before leaving,
              // as the separator loop below does.
              FREE_INT_1DARRAY(thread_start);
              break;
            }

          domain_restart++;
          if(Options.verbose == BASKER_TRUE)
            {
              printf("restart \n");
            }
          kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
            diag_nfactor_remalloc(this, thread_start);
          Kokkos::parallel_for(TeamPolicy(num_threads,1),
                               diag_nfactor_remalloc);
          Kokkos::fence();

          // The launch has completed (fence above), so the scratch array
          // for this iteration is no longer needed.
          FREE_INT_1DARRAY(thread_start);
        }//end while

      //====TIMER===
      #ifdef BASKER_TIME
      printf("Time DOMAIN: %f \n", timer.seconds());
      timer.reset();
      #endif
      //====TIMER====
    }

  #else // else basker_kokkos
  // The closing brace of the domain "if" lives in the Kokkos branch above
  // so that the braces stay balanced whichever branch is compiled.
  #pragma omp parallel
  {

  }//end omp parallel
  #endif //end basker_kokkos
  //-------------------End--Domain--------------------------//

  //---------------------------Sep--------------------------//
  if(btf_tabs_offset != 0)
    {
      for(Int l=1; l <= tree.nlvls; l++)
        {
          // The old barrier variant used lthreads = pow(2,l); the current
          // code always launches teams of a single thread.
          Int lthreads = 1;
          Int lnteams = num_threads/lthreads;

          Int sep_restart = 0;

          if(Options.verbose == BASKER_TRUE)
            {
              printf("Factoring Sep num_threads: %d %d \n",
                     lnteams, lthreads);
            }

          #ifdef BASKER_KOKKOS
          Kokkos::Impl::Timer timer_inner_sep;
          #ifdef BASKER_NO_LAMBDA

          kokkos_nfactor_sep2 <Int, Entry, Exe_Space>
            sep_nfactor(this,l);

          Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                               sep_nfactor);
          Kokkos::fence();

          //======Check for error=====
          while(true)
            {
              INT_1DARRAY thread_start;
              MALLOC_INT_1DARRAY(thread_start, num_threads+1);
              init_value(thread_start, num_threads+1,
                         (Int) BASKER_MAX_IDX);
              int nt = nfactor_sep_error(thread_start);
              if((nt == BASKER_SUCCESS)||
                 (nt == BASKER_ERROR) ||
                 (sep_restart > BASKER_RESTART))
                {
                  FREE_INT_1DARRAY(thread_start);
                  break;
                }

              sep_restart++;
              if (Options.verbose == BASKER_TRUE)
                {
                  printf("restart \n");
                }
              // The separator phase is relaunched with the same functor.
              Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                                   sep_nfactor);
              Kokkos::fence();

              FREE_INT_1DARRAY(thread_start);
            }//end while-true

          #ifdef BASKER_TIME
          printf("Time INNERSEP: %d %f \n",
                 l, timer_inner_sep.seconds());
          #endif
          #else //ELSE BASKER_NO_LAMBDA
          //Note: to be added
          #endif //end BASKER_NO_LAMBDA
          #else
          #pragma omp parallel
          {

          }//end omp parallel
          #endif
        }//end over each level

      #ifdef BASKER_TIME
      printf("Time SEP: %f \n", timer.seconds());
      #endif
    }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  // NOTE(review): this section uses Kokkos types without a BASKER_KOKKOS
  // guard, exactly as before.
  if(Options.btf == BASKER_TRUE)
    {
      Int btf_restart = 0;

      if(Options.verbose == BASKER_TRUE)
        {
          printf("Factoring BLKs num_threads: %d \n",
                 num_threads);
        }

      //=====Timer
      #ifdef BASKER_TIME
      Kokkos::Impl::Timer timer_btf;
      #endif
      //====Timer

      //======Call diag factor====
      kokkos_nfactor_diag <Int, Entry, Exe_Space>
        diag_nfactor(this);
      Kokkos::parallel_for(TeamPolicy(num_threads,1),
                           diag_nfactor);
      Kokkos::fence();

      //=====Check for error======
      while(true)
        {
          INT_1DARRAY thread_start;
          MALLOC_INT_1DARRAY(thread_start, num_threads+1);
          init_value(thread_start, num_threads+1,
                     (Int) BASKER_MAX_IDX);
          int nt = nfactor_diag_error(thread_start);
          if((nt == BASKER_SUCCESS) ||
             (nt == BASKER_ERROR) ||
             (btf_restart > BASKER_RESTART))
            {
              FREE_INT_1DARRAY(thread_start);
              break;
            }

          btf_restart++;
          if (Options.verbose == BASKER_TRUE)
            {
              printf("restart \n");
            }
          kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
            diag_nfactor_remalloc(this, thread_start);
          Kokkos::parallel_for(TeamPolicy(num_threads,1),
                               diag_nfactor_remalloc);
          Kokkos::fence();

          FREE_INT_1DARRAY(thread_start);
        }//end while

      //====TIMER
      #ifdef BASKER_TIME
      printf("Time BTF: %f \n",
             timer_btf.seconds());
      #endif
      //===TIMER
    }//end btf call

  // Put the caller's matrix back in place of BTF_A.
  if(Options.btf == BASKER_TRUE)
    {
      A = ATEMP;
    }

  return 0;
}//end factor_notoken()
// Reorders the columns of the CSC matrix M in place: the contents of
// column j are moved to column col(j).
//
//   M    matrix whose col_ptr, row_idx and val arrays are rewritten
//   col  destination column for each source column (M.ncol entries read)
//
// Always returns 0; a matrix with no columns or no entries is left
// untouched.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::permute_col
(
 BASKER_MATRIX &M,
 INT_1DARRAY col
)
{
  if((M.ncol == 0)||(M.nnz == 0))
    {
      return 0;
    }

  Int num_cols    = M.ncol;
  Int num_entries = M.nnz;

  // Zero-filled scratch copies of the three CSC arrays.
  INT_1DARRAY new_ptr;
  MALLOC_INT_1DARRAY(new_ptr, num_cols+1);
  init_value(new_ptr, num_cols+1, (Int)0);

  INT_1DARRAY new_idx;
  MALLOC_INT_1DARRAY(new_idx, num_entries);
  init_value(new_idx, num_entries, (Int)0);

  ENTRY_1DARRAY new_val;
  MALLOC_ENTRY_1DARRAY(new_val, num_entries);
  init_value(new_val, num_entries, (Entry)0.0);

  // Step 1: store the length of every source column one slot past its
  // destination column.
  for(Int src = 0; src < num_cols; ++src)
    {
      Int dest = col (src);
      new_ptr (dest+1) = M.col_ptr (src+1) - M.col_ptr (src);
    }

  // Step 2: running sum turns the lengths into column pointers.
  new_ptr (0) = 0;
  for(Int c = 0; c < num_cols; ++c)
    {
      new_ptr (c+1) += new_ptr (c);
    }

  // Step 3: copy the entries of each source column to its destination.
  for(Int src = 0; src < num_cols; ++src)
    {
      Int out = new_ptr (col (src));
      for(Int in = M.col_ptr (src); in < M.col_ptr (src+1); ++in)
        {
          new_idx (out) = M.row_idx (in);
          new_val (out) = M.val (in);
          ++out;
        }
    }

  // Step 4: overwrite the matrix with the permuted arrays.
  for(Int c = 0; c < num_cols+1; ++c)
    {
      M.col_ptr (c) = new_ptr (c);
    }
  for(Int e = 0; e < num_entries; ++e)
    {
      M.row_idx (e) = new_idx (e);
      M.val (e)     = new_val (e);
    }

  FREE_INT_1DARRAY(new_ptr);
  FREE_INT_1DARRAY(new_idx);
  FREE_ENTRY_1DARRAY(new_val);

  return 0;
}//end permute_col(int)
// Computes an AMD ordering for every BTF block of M, together with a
// per-block nnz and work value.
//
//   M         matrix holding all blocks
//   p         out: permutation vector, written block by block
//   btf_nnz   out: one value per block (l_nnz returned by amd_order)
//   btf_work  out: one value per block (lu_work returned by amd_order)
//
// When Options.incomplete is set, the natural ordering is returned and
// every block gets nnz = work = 1.
// Blocks with fewer than 3 columns are not passed to AMD.
void Basker<Int,Entry,Exe_Space>::btf_blk_amd
(
 BASKER_MATRIX &M,
 INT_1DARRAY p,
 INT_1DARRAY btf_nnz,
 INT_1DARRAY btf_work
)
{
  if(Options.incomplete == BASKER_TRUE)
    {
      //AMD on incomplete ILUK seems really bad and leads to a zero on
      //the diagonal, so simply return the natural ordering.
      for(Int c = 0; c < M.ncol; c++)
        {
          p(c) = c;
        }
      //Work is made up to be 1, since BTF is not supported in our iluk.
      for(Int blk = 0; blk < btf_nblks; blk++)
        {
          btf_nnz(blk)  = 1;
          btf_work(blk) = 1;
        }
      return;
    }

  //p == length(M)
  //Scan over all blks.
  //Note: this should be made parallel in the future.  KLU handles this
  //more elegantly, but it also forms all the little blocks.

  // Scratch CSC pattern of the block being ordered, sized for all of M.
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int b = 0; b < btf_nblks; b++)
    {
      Int blk_begin = btf_tabs(b);
      Int blk_end   = btf_tabs(b+1);
      Int blk_size  = blk_end - blk_begin;

      if(blk_size < 3)
        {
          // Tiny block: assign the entries directly and use closed-form
          // work and nnz values instead of calling AMD.
          for(Int ii = 0; ii < blk_size; ++ii)
            {
              p(ii+blk_begin) = blk_begin+ii-M.scol;
            }
          btf_work(b) = blk_size*blk_size*blk_size;
          btf_nnz(b)  = (.5*(blk_size*blk_size) + blk_size);
          continue;
        }

      INT_1DARRAY tempp;
      MALLOC_INT_1DARRAY(tempp, blk_size+1);

      //Fill in temp matrix with indices shifted by the block start.
      Int nnz      = 0;
      Int next_col = 1;
      temp_col(0)  = 0;
      for(Int k = blk_begin; k < blk_end; k++)
        {
          for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); i++)
            {
              // Entries in rows before the block start are skipped.
              if(M.row_idx(i) >= blk_begin)
                {
                  temp_row(nnz) = M.row_idx(i) - blk_begin;
                  nnz++;
                }
            }// end over all row_idx
          temp_col(next_col) = nnz;
          next_col++;
        }//end over all columns k

      #ifdef BASKER_DEBUG_ORDER_AMD
      printf("col_ptr: ");
      for(Int i = 0 ; i < blk_size+1; i++)
        {
          printf("%d, ", temp_col(i));
        }
      printf("\n");
      printf("row_idx: ");
      for(Int i = 0; i < nnz; i++)
        {
          printf("%d, ", temp_row(i));
        }
      printf("\n");
      #endif

      double l_nnz   = 0;
      double lu_work = 0;
      BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                      &(temp_row(0)),&(tempp(0)),
                                      l_nnz, lu_work);
      btf_nnz(b)  = l_nnz;
      btf_work(b) = lu_work;

      #ifdef BASKER_DEBUG_ORDER_AMD
      printf("blk: %d order: \n", b);
      for(Int ii = 0; ii < blk_size; ii++)
        {
          printf("%d, ", tempp(ii));
        }
      #endif

      //Add to the bigger perm vector
      for(Int ii = 0; ii < blk_size; ii++)
        {
          p(tempp(ii)+blk_begin) = ii+blk_begin;
        }

      FREE_INT_1DARRAY(tempp);
    }//over all blk_tabs

  #ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int ii = 0; ii < M.ncol; ii++)
    {
      printf("%d, ", p(ii));
    }
  printf("\n");
  #endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()
// Computes an AMD ordering for each BTF block of M, starting at block
// btf_tabs_offset, and stores the result in p.
//
//   M  matrix holding all blocks
//   p  out: permutation vector, written block by block
//
// Blocks with fewer than 3 columns are not passed to AMD.
void Basker<Int,Entry,Exe_Space>::blk_amd(BASKER_MATRIX &M, INT_1DARRAY p)
{
  //p == length(M)
  //Scan over all blks.
  //Note: this should be made parallel in the future.  KLU handles this
  //more elegantly, but it also forms all the little blocks.

  // Scratch CSC pattern of the block being ordered, sized for all of M.
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int b = btf_tabs_offset; b < btf_nblks; b++)
    {
      Int blk_begin = btf_tabs(b);
      Int blk_end   = btf_tabs(b+1);
      Int blk_size  = blk_end - blk_begin;

      if(blk_size < 3)
        {
          // Tiny block: assign the entries directly, no AMD call.
          for(Int ii = 0; ii < blk_size; ++ii)
            {
              p(ii+blk_begin) = blk_begin+ii-M.scol;
            }
          continue;
        }

      INT_1DARRAY tempp;
      MALLOC_INT_1DARRAY(tempp, blk_size+1);

      //Fill in temp matrix with indices shifted by the block start.
      Int nnz      = 0;
      Int next_col = 1;
      temp_col(0)  = 0;
      for(Int k = blk_begin; k < blk_end; k++)
        {
          for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); i++)
            {
              // Entries in rows before the block start are skipped.
              if(M.row_idx(i) >= blk_begin)
                {
                  temp_row(nnz) = M.row_idx(i) - blk_begin;
                  nnz++;
                }
            }// end over all row_idx
          temp_col(next_col) = nnz;
          next_col++;
        }//end over all columns k

      #ifdef BASKER_DEBUG_ORDER_AMD
      printf("col_ptr: ");
      for(Int i = 0 ; i < blk_size+1; i++)
        {
          printf("%d, ", temp_col(i));
        }
      printf("\n");
      printf("row_idx: ");
      for(Int i = 0; i < nnz; i++)
        {
          printf("%d, ", temp_row(i));
        }
      printf("\n");
      #endif

      BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                      &(temp_row(0)),&(tempp(0)));

      #ifdef BASKER_DEBUG_ORDER_AMD
      printf("blk: %d order: \n", b);
      for(Int ii = 0; ii < blk_size; ii++)
        {
          printf("%d, ", tempp(ii));
        }
      #endif

      //Add to the bigger perm vector
      for(Int ii = 0; ii < blk_size; ii++)
        {
          p(tempp(ii)+blk_begin) = ii+blk_begin;
        }

      FREE_INT_1DARRAY(tempp);
    }//over all blk_tabs

  #ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int ii = 0; ii < M.ncol; ii++)
    {
      printf("%d, ", p(ii));
    }
  printf("\n");
  #endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()
// Solves for one right-hand side given as raw arrays.
//
//   _x  out: solution, gn entries are written
//   _y  in:  right-hand side, gn entries are read
//
// The right-hand side is copied into a work array and permute_inv() is
// applied with every enabled ordering (match, BTF + block AMD, ND, AMD)
// and finally with gperm.  The array-based solve_interface(x,y) is then
// called, permute() is applied to the solution with the BTF, ND and AMD
// orderings, and the result is copied into _x.
//
// The ND and AMD orderings are applied through temp_array, which holds
// the ordering in its first BTF_A.ncol entries and the identity in the
// remaining entries up to gn.
//
// Always returns 0.
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::solve_interface
(
 Entry *_x, //Solution (len = gn)
 Entry *_y
)
{
  //Need to modify to use global perm
  INT_1DARRAY temp_array;
  MALLOC_INT_1DARRAY(temp_array, gn);

  //===== Move to view===========
  ENTRY_1DARRAY x;
  ENTRY_1DARRAY y;
  MALLOC_ENTRY_1DARRAY(x, gn);
  MALLOC_ENTRY_1DARRAY(y, gm);

  // NOTE(review): y is allocated with gm entries but filled and permuted
  // over gn entries; this presumably relies on gm == gn -- confirm.
  for(Int i = 0; i < gn; i++)
    {
      x(i) = (Entry) 0;
      y(i) = (Entry) _y[i];
    }

  //===== Permute the right-hand side
  //==== Need to make this into one global perm
  if(match_flag == BASKER_TRUE)
    {
      permute_inv(y,order_match_array, gn);
    }
  if(btf_flag == BASKER_TRUE)
    {
      permute_inv(y,order_btf_array, gn);
      permute_inv(y,order_blk_amd_array, gn);
    }
  if(nd_flag == BASKER_TRUE)
    {
      for(Int i = 0; i < BTF_A.ncol; ++i)
        {
          temp_array(i) = part_tree.permtab(i);
        }
      for(Int i = BTF_A.ncol; i < gn; ++i)
        {
          temp_array(i) = i;
        }
      permute_inv(y, temp_array,gn);
    }
  if(amd_flag == BASKER_TRUE)
    {
      for(Int i = 0; i < BTF_A.ncol; ++i)
        {
          temp_array(i) = order_csym_array(i);
        }
      for(Int i = BTF_A.ncol; i < gn; ++i)
        {
          temp_array(i) = i;
        }
      permute_inv(y,temp_array, gn);
    }

  permute_inv(y,gperm, gn);

  solve_interface(x,y);

  //Inverse perm
  //Note: don't need to inverse a row only perm
  if(btf_flag == BASKER_TRUE)
    {
      permute(x,order_btf_array, gn);
    }
  if(nd_flag == BASKER_TRUE)
    {
      for(Int i = 0; i < BTF_A.ncol; ++i)
        {
          temp_array(i) = part_tree.permtab(i);
        }
      for(Int i = BTF_A.ncol; i < gn; i++)
        {
          temp_array(i) = i;
        }
      permute(x,temp_array, gn);
    }
  if(amd_flag == BASKER_TRUE)
    {
      for(Int i = 0; i < BTF_A.ncol; ++i)
        {
          temp_array(i) = order_csym_array(i);
        }
      // The tail must be the identity, exactly as in the forward AMD
      // permutation above, so that the same vector is used in both
      // directions and order_csym_array is never read past BTF_A.ncol.
      for(Int i = BTF_A.ncol; i < gn; ++i)
        {
          temp_array(i) = i;
        }
      permute(x,temp_array,gn);
    }

  #ifdef BASKER_DEBUG_SOLVE_RHS
  printf("\n\n");
  printf("X: \n");
  for(Int i = 0; i < gn; i++)
    {
      printf("%f, " , x(i));
    }
  printf("\n\n");
  printf("RHS: \n");
  for(Int i =0; i < gm; i++)
    {
      printf("%f, ", y(i));
    }
  printf("\n\n");
  #endif

  // Copy the solution out to the caller's array.
  for(Int i = 0; i < gn; i++)
    {
      _x[i] = x(i);
    }

  FREE_ENTRY_1DARRAY(x);
  FREE_ENTRY_1DARRAY(y);
  FREE_INT_1DARRAY(temp_array);

  return 0;
}