BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::Factor(Int option)
{
  // Numeric factorization entry point.
  //   option : forwarded unchanged to factor_notoken().
  // Always returns 0.
  // NOTE(review): whatever factor_notoken() reports is not inspected here.

  // Optional timing of the numeric phase, accumulated into stats.
  #ifdef BASKER_KOKKOS_TIME
  Kokkos::Impl::Timer timer;
  #endif

  factor_notoken(option);

  #ifdef BASKER_KOKKOS_TIME
  stats.time_nfactor += timer.seconds();
  #endif

  // NDE: buffers sized by the global dimensions gn / gm, allocated
  // before permute_composition_for_solve() is invoked.
  MALLOC_ENTRY_1DARRAY(x_view_ptr_copy, gn);   // used in basker_solve_rhs - move alloc
  MALLOC_ENTRY_1DARRAY(y_view_ptr_copy, gm);
  MALLOC_INT_1DARRAY(perm_inv_comp_array , gm); // y
  MALLOC_INT_1DARRAY(perm_comp_array, gn);      // x

  MALLOC_INT_1DARRAY(perm_comp_iworkspace_array, gn);
  MALLOC_ENTRY_1DARRAY(perm_comp_fworkspace_array, gn);

  permute_composition_for_solve();

  // Record that a factorization has been performed.
  factor_flag = BASKER_TRUE;

  return 0;
}//end Factor()
int Basker<Int,Entry,Exe_Space>::Factor_Inc(Int Options)
{
  // Incomplete (level-based) factorization entry point.
  //   Options : forwarded unchanged to factor_inc_lvl().
  // Always returns 0.
  factor_inc_lvl(Options);

  // NDE: buffers sized by the global dimensions gn / gm, allocated
  // before permute_composition_for_solve() is invoked.
  MALLOC_ENTRY_1DARRAY(x_view_ptr_copy, gn);   // used in basker_solve_rhs - move alloc
  MALLOC_ENTRY_1DARRAY(y_view_ptr_copy, gm);
  MALLOC_INT_1DARRAY(perm_inv_comp_array , gm); // y
  MALLOC_INT_1DARRAY(perm_comp_array, gn);      // x

  MALLOC_INT_1DARRAY(perm_comp_iworkspace_array, gn);
  MALLOC_ENTRY_1DARRAY(perm_comp_fworkspace_array, gn);

  permute_composition_for_solve();

  return 0;
}
BASKER_INLINE
int Basker<Int,Entry, Exe_Space>::permute_row
(
 BASKER_MATRIX &M,
 INT_1DARRAY row
)
{
  // Relabels every stored row index of M through the map `row`:
  //   M.row_idx[k] <- row[ M.row_idx[k] ]   for k in [0, M.nnz).
  // Column pointers and values are left untouched. Always returns 0.
  const Int nnz = M.nnz;
  if(nnz == 0)
  {
    return 0; // nothing stored, nothing to relabel
  }

  // Scratch copy of the relabelled indices.
  INT_1DARRAY mapped;
  MALLOC_INT_1DARRAY(mapped, nnz);
  init_value(mapped, nnz, (Int)0);

  // Look up the new label of every entry.
  for(Int idx = 0; idx < nnz; ++idx)
  {
    mapped[idx] = row[M.row_idx[idx]];
  }

  // Write the new labels back into the matrix.
  for(Int idx = 0; idx < nnz; ++idx)
  {
    M.row_idx[idx] = mapped[idx];
  }

  FREE_INT_1DARRAY(mapped);

  return 0;
}//end permute_row(matrix,int)
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::strong_component
(
 BASKER_MATRIX &M,
 Int &nblks,
 INT_1DARRAY &perm,
 INT_1DARRAY &CC
)
{
  // Wrapper around BaskerSSWrapper<Int>::my_strong_component.
  //   M     : matrix whose col_ptr / row_idx arrays are handed to the wrapper
  //   nblks : passed by reference to the wrapper (output)
  //   perm  : allocated here with M.ncol entries, filled by the wrapper
  //   CC    : allocated here with M.ncol+1 entries, filled by the wrapper
  // Always returns 0.

  INT_1DARRAY perm_in;
  MALLOC_INT_1DARRAY(perm_in, M.ncol);
  MALLOC_INT_1DARRAY(perm, M.ncol);
  //JDB:Note, this needs to be changed just fixed for int/long
  MALLOC_INT_1DARRAY(CC, M.ncol+1);

  // The input permutation handed to the wrapper is the identity.
  for(Int i = 0; i < M.ncol; i++)
  {
    perm_in(i) = i;
  }

  BaskerSSWrapper<Int>::my_strong_component(M.ncol,
                                            &(M.col_ptr(0)),
                                            &(M.row_idx(0)),
                                            nblks,
                                            &(perm(0)),
                                            &(perm_in(0)),
                                            &(CC(0)));

  #ifdef BASKER_DEBUG_ORDER_BTF
  // Dump the permutation; skip quietly if the file cannot be opened.
  FILE *fp = fopen("btf.txt", "w");
  if(fp != NULL)
  {
    for(Int i = 0; i < M.ncol; i++)
    {
      // Cast so the format specifier matches regardless of Int's width.
      fprintf(fp, "%ld \n", (long) perm(i));
    }
    fclose(fp);
  }
  #endif

  // perm_in is only scratch input for the call above; release it the
  // same way the other routines release their temporaries.
  FREE_INT_1DARRAY(perm_in);

  return 0;
}//end strong_component <long int>
BASKER_INLINE
void BaskerMatrix<Int, Entry, Exe_Space>::init_col()
{
  // Allocates col_ptr with ncol+1 entries and marks every entry with the
  // sentinel BASKER_MAX_IDX.
  BASKER_ASSERT(ncol >= 0, "INIT_COL, ncol > 0");

  const Int len = ncol+1;
  MALLOC_INT_1DARRAY(col_ptr, len);

  for(Int c = 0; c < len; ++c)
  {
    col_ptr(c) = (Int) BASKER_MAX_IDX;
  }
}//end init_col()
BASKER_INLINE
void BaskerMatrix<Int,Entry,Exe_Space>::malloc_perm(Int n)
{
  // Allocates lpinv with n entries, then delegates to init_perm().
  BASKER_ASSERT(n > 0, "matrix malloc_perm");

  MALLOC_INT_1DARRAY(lpinv,n);

  //Fix later.
  //NDE determine what the issue is...
  init_perm();
}//end malloc_perm(Int)
BASKER_INLINE
void BaskerMatrix<Int, Entry, Exe_Space>::init_vectors
(
 Int _m,
 Int _n,
 Int _nnz
)
{
  // Records the dimensions and allocates the compressed-column arrays.
  //   _m   : stored as nrow
  //   _n   : stored as ncol; col_ptr gets ncol+1 entries when ncol >= 0
  //   _nnz : stored as both nnz and mnnz; row_idx / val get nnz entries,
  //          or a single zeroed entry when nnz == 0
  // A negative nnz allocates neither row_idx nor val.
  nrow = _m;
  ncol = _n;
  nnz  = _nnz;
  mnnz = _nnz;

  if(ncol >= 0)
  {
    BASKER_ASSERT((ncol+1)>0, "matrix init_vector ncol");
    MALLOC_INT_1DARRAY(col_ptr,ncol+1);
  }

  if(nnz == 0)
  {
    // Empty matrix: keep one placeholder slot so the arrays are valid.
    BASKER_ASSERT((nnz+1)>0, "nnz+1 init_vector ");
    MALLOC_INT_1DARRAY(row_idx, nnz+1);
    row_idx(0) = (Int) 0;
    MALLOC_ENTRY_1DARRAY(val, nnz+1);
    val(0) = (Entry) 0;
    #ifdef BASKER_INC_LVL
    MALLOC_INT_1DARRAY(inc_lvl, nnz+1);
    #endif
  }
  else if(nnz > 0)
  {
    BASKER_ASSERT(nnz > 0, "matrix init_vector nnz");
    MALLOC_INT_1DARRAY(row_idx,nnz);
    MALLOC_ENTRY_1DARRAY(val,nnz);
    #ifdef BASKER_INC_LVL
    MALLOC_INT_1DARRAY(inc_lvl, nnz);
    #endif
  }
}//end init_vectors(Int, Int, Int)
BASKER_INLINE
void BaskerMatrix<Int,Entry,Exe_Space>::init_inc_lvl()
{
  // Allocates inc_lvl with nnz+1 entries and raises inc_lvl_flg.
  // The entries are not initialised here (the zero-fill loop is disabled).
  MALLOC_INT_1DARRAY(inc_lvl, nnz+1);

  //for(Int i = 0; i < nnz+1; i++)
  //  {
  //    inc_lvl(i) = 0;
  //  }

  inc_lvl_flg = BASKER_TRUE;
}
void BaskerMatrix<Int,Entry,Exe_Space>::init_pend()
{
  // Allocates pend with ncol+1 entries and marks every entry with the
  // sentinel BASKER_MAX_IDX. Does nothing when ncol <= 0.
  if(ncol > 0)
  {
    // The assert is now terminated with ';' like every other
    // BASKER_ASSERT in this code, so it does not depend on how the
    // macro happens to expand.
    BASKER_ASSERT((ncol+1)>0, "matrix init_pend");
    MALLOC_INT_1DARRAY(pend,ncol+1);

    for(Int i = 0; i < ncol+1; ++i)
    {
      // Explicit cast, matching the sentinel fill in init_col().
      pend(i) = (Int) BASKER_MAX_IDX;
    }
  }
}//end init_pend()
BASKER_FINLINE
void Basker<Int, Entry,Exe_Space>::csymamd_order
(
 BASKER_MATRIX &M,
 INT_1DARRAY p,
 INT_1DARRAY cmember
)
{
  // Computes an ordering for M through my_amesos_csymamd and stores its
  // inverse in p (p(order(i)) = i). Sets amd_flag.
  //   M       : matrix whose col_ptr / row_idx arrays are handed to the routine
  //   p       : output, at least M.ncol entries are written
  //   cmember : handed through to my_amesos_csymamd unchanged
  // When Options.incomplete is set, p becomes the identity instead.
  amd_flag = BASKER_TRUE;

  //Debug,
  #ifdef BASKER_DEBUG_ORDER_AMD
  printf("cmember: \n");
  for(Int i = 0; i < M.ncol; ++i)
  {
    printf("(%d, %d), ", i, cmember(i));
  }
  printf("\n");
  #endif

  //If doing iluk, we will not want this.
  //See amd blk notes
  if(Options.incomplete == BASKER_TRUE)
  {
    for(Int i = 0; i < M.ncol; i++)
    {
      p(i) = i;
    }
    //printf("Short csym \n");
    return;
  }

  INT_1DARRAY temp_p;
  BASKER_ASSERT(M.ncol > 0, "AMD perm not long enough");
  MALLOC_INT_1DARRAY(temp_p, M.ncol+1);
  init_value(temp_p, M.ncol+1, (Int) 0);

  my_amesos_csymamd(M.ncol, &(M.col_ptr(0)), &(M.row_idx(0)),
                    &(temp_p(0)), &(cmember(0)));

  // Invert the ordering returned in temp_p.
  for(Int i = 0; i < M.ncol; ++i)
  {
    p(temp_p(i)) = i;
  }

  // temp_p is pure scratch; release it like the other routines release
  // their temporaries.
  FREE_INT_1DARRAY(temp_p);
}//end csymamd()
int Basker<Int,Entry,Exe_Space>::GetPerm(Int **lp, Int **rp)
{
  // Hands the caller copies of the two permutations produced by
  // get_total_perm().
  //   lp, rp : receive freshly new[]-allocated arrays of gn entries each
  // Returns BASKER_SUCCESS.
  INT_1DARRAY lp_array;
  INT_1DARRAY rp_array;
  MALLOC_INT_1DARRAY(lp_array, gn);
  MALLOC_INT_1DARRAY(rp_array, gn);

  get_total_perm(lp_array, rp_array);

  // Plain heap arrays for the caller.
  (*lp) = new Int[gn];
  (*rp) = new Int[gn];

  Int *left  = (*lp);
  Int *right = (*rp);
  for(Int k = 0; k < gn; ++k)
  {
    left[k]  = lp_array(k);
    right[k] = rp_array(k);
  }

  FREE_INT_1DARRAY(lp_array);
  FREE_INT_1DARRAY(rp_array);

  return BASKER_SUCCESS;
}//end GetPerm()
BASKER_FINLINE
void Basker<Int, Entry,Exe_Space>::csymamd_order
(
 BASKER_MATRIX &M,
 INT_1DARRAY p,
 INT_1DARRAY cmember
)
{
  // Computes an ordering for M through my_amesos_csymamd and stores its
  // inverse in p (p(order(i)) = i). Sets amd_flag.
  //   M       : matrix whose col_ptr / row_idx arrays are handed to the routine
  //   p       : output, M.ncol entries are written
  //   cmember : handed through to my_amesos_csymamd unchanged
  amd_flag = BASKER_TRUE;

  //Debug,
  #ifdef BASKER_DEBUG_ORDER_AMD
  printf("cmember: \n");
  for(Int i = 0; i < M.ncol; ++i)
  {
    printf("(%d, %d), ", i, cmember(i));
  }
  printf("\n");
  #endif

  INT_1DARRAY temp_p;
  BASKER_ASSERT(M.ncol > 0, "AMD perm not long enough");
  MALLOC_INT_1DARRAY(temp_p, M.ncol+1);
  init_value(temp_p, M.ncol+1, (Int) 0);

  my_amesos_csymamd(M.ncol, &(M.col_ptr(0)), &(M.row_idx(0)),
                    &(temp_p(0)), &(cmember(0)));

  // Invert the ordering returned in temp_p.
  for(Int i = 0; i < M.ncol; ++i)
  {
    p(temp_p(i)) = i;
  }

  // temp_p is pure scratch; release it like the other routines release
  // their temporaries.
  FREE_INT_1DARRAY(temp_p);
}//end csymamd()
BASKER_INLINE int Basker<Int, Entry,Exe_Space>::break_into_parts ( BASKER_MATRIX &M, Int nblks, INT_1DARRAY btf_tabs ) { #ifdef BASKER_DEBUG_ORDER_BTF printf("break_into_parts called \n"); printf("nblks: %d \n", nblks); #endif Options.btf = BASKER_TRUE; //Alg. // A -> [BTF_A BTF_B] // [0 BTF_C] //1. Run backward through the btf_tabs to find size C //2. Form A,B,C based on size in 1. //Short circuit, //If nblks == 1, than only BTF_A exists if(nblks == 1) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Short Circuit part_call \n"); #endif BTF_A = A; //Options.btf = BASKER_FALSE; btf_tabs_offset = 1; return 0; } //Step 1. Int t_size = 0; Int scol = M.ncol; Int blk_idx = nblks; BASKER_BOOL move_fwd = BASKER_TRUE; while(move_fwd==BASKER_TRUE) { Int blk_size = btf_tabs(blk_idx)- btf_tabs(blk_idx-1); #ifdef BASKER_DEBUG_ORDER_BTF printf("move_fwd loop \n"); BASKER_ASSERT(blk_idx>=0, "btf blk idx off"); BASKER_ASSERT(blk_size>0, "btf blk size wrong"); printf("blk_idx: %d blk_size: %d \n", blk_idx, blk_size); std::cout << blk_size << std::endl; #endif if((blk_size < Options.btf_large) && ((((double)t_size+blk_size)/(double)M.ncol) < Options.btf_max_percent)) { #ifdef BASKER_DEBUG_ORDER_BTF printf("first choice \n"); printf("blksize test: %d %d %d \n", blk_size, Options.btf_large, BASKER_BTF_LARGE); printf("blkpercent test: %f %f %f \n", ((double)t_size+blk_size)/(double)M.ncol, Options.btf_max_percent, (double) BASKER_BTF_MAX_PERCENT); #endif t_size = t_size+blk_size; blk_idx = blk_idx-1; scol = btf_tabs[blk_idx]; } else { //printf("second choice \n"); //#ifdef BASKER_DEBUG_ORDER_BTF printf("Cut: blk_size: %d percent: %f \n", blk_size, ((double)t_size+blk_size)/(double)M.ncol); if((((double)t_size+blk_size)/(double)M.ncol) == 1.0) { blk_idx = 0; t_size = t_size + blk_size; scol = btf_tabs[blk_idx]; } //#endif move_fwd = BASKER_FALSE; } }//end while(move_fwd) #ifdef BASKER_DEBUG_ORDER_BTF printf("Done finding BTF cut. 
Cut size: %d scol: %d \n", t_size, scol); //BASKER_ASSERT(t_size > 0, "BTF CUT SIZE NOT BIG ENOUGH\n"); BASKER_ASSERT((scol >= 0) && (scol < M.ncol), "SCOL\n"); #endif //Comeback and change btf_tabs_offset = blk_idx; //2. Move into Blocks if(btf_tabs_offset != 0) { //--Move A into BTF_A; BTF_A.set_shape(0, scol, 0, scol); BTF_A.nnz = M.col_ptr(scol); #ifdef BASKER_DEBUG_ORDER_BTF printf("Init BTF_A. ncol: %d nnz: %d \n", scol, BTF_A.nnz); #endif if(BTF_A.v_fill == BASKER_FALSE) { BASKER_ASSERT(BTF_A.ncol >= 0, "BTF_A, col_ptr"); MALLOC_INT_1DARRAY(BTF_A.col_ptr, BTF_A.ncol+1); BASKER_ASSERT(BTF_A.nnz > 0, "BTF_A, nnz"); MALLOC_INT_1DARRAY(BTF_A.row_idx, BTF_A.nnz); MALLOC_ENTRY_1DARRAY(BTF_A.val, BTF_A.nnz); BTF_A.fill(); } Int annz = 0; for(Int k = 0; k < scol; ++k) { #ifdef BASKER_DEBUG_ORDER_BTF printf("copy column: %d into A_BTF, [%d %d] \n", k, M.col_ptr(k), M.col_ptr(k+1)); #endif for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i) { //printf("annz: %d i: %d \n", annz, i); BTF_A.row_idx(annz) = M.row_idx(i); BTF_A.val(annz) = M.val(i); annz++; } BTF_A.col_ptr(k+1) = annz; } }//no A //Fill in B and C at the same time INT_1DARRAY cws; BASKER_ASSERT((M.ncol-scol+1) > 0, "BTF_SIZE MALLOC"); MALLOC_INT_1DARRAY(cws, M.ncol-scol+1); init_value(cws, M.ncol-scol+1, (Int)M.ncol); BTF_B.set_shape(0 , scol, scol, M.ncol-scol); BTF_C.set_shape(scol, M.ncol-scol, scol, M.ncol-scol); #ifdef BASKER_DEBUG_ORDER_BTF printf("Set Shape BTF_B: %d %d %d %d \n", BTF_B.srow, BTF_B.nrow, BTF_B.scol, BTF_B.ncol); printf("Set Shape BTF_C: %d %d %d %d \n", BTF_C.srow, BTF_C.nrow, BTF_C.scol, BTF_C.nrow); #endif //Scan and find nnz //We can do this much better!!!! 
Int bnnz = 0; Int cnnz = 0; for(Int k = scol; k < M.ncol; ++k) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Scanning nnz, k: %d \n", k); #endif for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i) { if(M.row_idx(i) < scol) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz to Upper, %d %d \n", scol, M.row_idx(i)); #endif bnnz++; } else { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz to Lower, %d %d \n", scol, M.row_idx(i)); #endif cnnz++; } }//over all nnz in k }//over all k #ifdef BASKER_DEBUG_ORDER_BTF printf("BTF_B nnz: %d \n", bnnz); printf("BTF_C nnz: %d \n", cnnz); #endif BTF_B.nnz = bnnz; BTF_C.nnz = cnnz; //Malloc need space if((BTF_B.v_fill == BASKER_FALSE) && (BTF_B.nnz > 0)) //if(BTF_B.v_fill == BASKER_FALSE) { BASKER_ASSERT(BTF_B.ncol >= 0, "BTF_B ncol"); MALLOC_INT_1DARRAY(BTF_B.col_ptr, BTF_B.ncol+1); BASKER_ASSERT(BTF_B.nnz > 0, "BTF_B.nnz"); MALLOC_INT_1DARRAY(BTF_B.row_idx, BTF_B.nnz); MALLOC_ENTRY_1DARRAY(BTF_B.val, BTF_B.nnz); BTF_B.fill(); } if(BTF_C.v_fill == BASKER_FALSE) { BASKER_ASSERT(BTF_C.ncol >= 0, "BTF_C.ncol"); MALLOC_INT_1DARRAY(BTF_C.col_ptr, BTF_C.ncol+1); BASKER_ASSERT(BTF_C.nnz > 0, "BTF_C.nnz"); MALLOC_INT_1DARRAY(BTF_C.row_idx, BTF_C.nnz); MALLOC_ENTRY_1DARRAY(BTF_C.val, BTF_C.nnz); BTF_C.fill(); } //scan again (Very bad!!!) 
bnnz = 0; cnnz = 0; for(Int k = scol; k < M.ncol; ++k) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Scanning nnz, k: %d \n", k); #endif for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i) { if(M.row_idx(i) < scol) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz to Upper, %d %d \n", scol, M.row_idx[i]); #endif BASKER_ASSERT(BTF_B.nnz > 0, "BTF B uninit"); //BTF_B.row_idx[bnnz] = M.row_idx[i]; //Note: do not offset because B srow = 0 BTF_B.row_idx(bnnz) = M.row_idx(i); BTF_B.val(bnnz) = M.val(i); bnnz++; } else { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz Lower,k: %d %d %d %f \n", k, scol, M.row_idx[i], M.val(i)); #endif //BTF_C.row_idx[cnnz] = M.row_idx[i]; BTF_C.row_idx(cnnz) = M.row_idx(i)-scol; BTF_C.val(cnnz) = M.val(i); cnnz++; } }//over all nnz in k if(BTF_B.nnz > 0) { BTF_B.col_ptr(k-scol+1) = bnnz; } BTF_C.col_ptr(k-scol+1) = cnnz; }//over all k #ifdef BASKER_DEBUG_ORDER_BTF printf("After BTF_B nnz: %d \n", bnnz); printf("After BTF_C nnz: %d \n", cnnz); #endif //printf("\n\n"); //printf("DEBUG\n"); //BTF_C.print(); //printf("\n\n"); return 0; }//end break_into_parts
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::permute_col
(
 BASKER_MATRIX &M,
 INT_1DARRAY col
)
{
  // Reorders the columns of M in place: old column j becomes column
  // col(j). Row indices and values travel with their column.
  // Returns 0; does nothing for a matrix without columns or entries.
  if((M.ncol == 0)||(M.nnz == 0))
    return 0;

  const Int n   = M.ncol;
  const Int nnz = M.nnz;
  //printf("Using n: %d nnz: %d \n", n, nnz);

  // Scratch arrays holding the permuted matrix.
  INT_1DARRAY new_ptr;
  MALLOC_INT_1DARRAY(new_ptr, n+1);
  init_value(new_ptr, n+1, (Int)0);

  INT_1DARRAY new_idx;
  MALLOC_INT_1DARRAY(new_idx, nnz);
  init_value(new_idx, nnz, (Int)0);

  ENTRY_1DARRAY new_val;
  MALLOC_ENTRY_1DARRAY(new_val, nnz);
  init_value(new_val, nnz, (Entry)0.0);
  //printf("done with init \n");

  // Length of each destination column, stored one slot to the right.
  for(Int src = 0; src < n; src++)
  {
    const Int dst = col (src);
    new_ptr (dst+1) = M.col_ptr (src+1) - M.col_ptr (src);
  }

  // Running sum turns the lengths into column pointers.
  new_ptr (0) = 0;
  for(Int c = 0; c < n; c++)
  {
    new_ptr (c+1) = new_ptr (c+1) + new_ptr (c);
  }

  // Copy every source column to its destination slot.
  for(Int src = 0; src < n; src++)
  {
    Int out = new_ptr (col (src));
    for(Int k = M.col_ptr (src); k < M.col_ptr (src+1); k++)
    {
      new_idx (out) = M.row_idx (k);
      new_val (out) = M.val (k);
      out++;
    }
  }

  // Write the permuted matrix back into M.
  for(Int c = 0; c < n+1; c++)
  {
    M.col_ptr (c) = new_ptr (c);
  }
  for(Int e = 0; e < nnz; e++)
  {
    M.row_idx (e) = new_idx (e);
    M.val (e)     = new_val (e);
  }

  FREE_INT_1DARRAY(new_ptr);
  FREE_INT_1DARRAY(new_idx);
  FREE_ENTRY_1DARRAY(new_val);

  return 0;
}//end permute_col(int)
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::btf_order()
{
  // Drives the ordering pipeline on the member matrix A:
  //   1. matching ordering on A
  //   2. BTF ordering (find_btf)
  //   3-6. for the BTF_A part (only when btf_tabs_offset != 0):
  //        scotch partition, tree init, constrained symamd, 2D layout
  //   then a block AMD ordering of BTF_C when btf_nblks > 1.
  // Always returns 0.

  //1. Matching ordering on whole matrix
  //currently finds matching and permutes
  //found bottle-neck to work best with circuit problems
  sort_matrix(A);
  //printMTX("A_nonmatch.mtx", A);
  match_ordering(0);
  //printf("DEBUG1: done match\n");
  //for debuging
  sort_matrix(A);
  //printMTX("A_match.mtx", A);

  //2. BTF ordering on whole matrix
  // Gets estimate of work on all blocks
  //currently finds btf-hybrid and permutes
  //A -> [BTF_A, BTF_C; 0 , BTF B]

  printf("outter num_threads:%d \n", num_threads);
  MALLOC_INT_1DARRAY(btf_schedule, num_threads+1);
  // Explicit (Int) cast, matching every other init_value call here.
  init_value(btf_schedule, num_threads+1, (Int) 0);
  find_btf(A);

  if(btf_tabs_offset != 0)
  {
    // printf("A/B block stuff called\n");
    //3. ND on BTF_A
    //currently finds ND and permute BTF_A
    //Would like to change so finds permuation,
    //and move into 2D-Structure
    //printMTX("A_BTF_FROM_A.mtx", BTF_A);
    sort_matrix(BTF_A);
    scotch_partition(BTF_A);

    //need to do a row perm on BTF_B too
    if(btf_nblks > 1)
    {
      permute_row(BTF_B, part_tree.permtab);
    }
    //needed because moving into 2D-Structure,
    //assumes sorted columns
    sort_matrix(BTF_A);
    if(btf_nblks > 1)
    {
      sort_matrix(BTF_B);
      sort_matrix(BTF_C);
    }
    //For debug
    //printMTX("A_BTF_PART_AFTER.mtx", BTF_A);

    //4. Init tree structure
    //This reduces the ND ordering into that fits,
    //thread counts
    init_tree_thread();

    //5. Permute BTF_A
    //Constrained symamd on A
    // cmember(j) holds the index of the tree block that column j is in.
    INT_1DARRAY cmember;
    MALLOC_INT_1DARRAY(cmember, BTF_A.ncol+1);
    init_value(cmember,BTF_A.ncol+1,(Int) 0);
    for(Int i = 0; i < tree.nblks; ++i)
    {
      for(Int j = tree.col_tabs(i); j < tree.col_tabs(i+1); ++j)
      {
        cmember(j) = i;
      }
    }
    //INT_1DARRAY csymamd_perm = order_csym_array;
    MALLOC_INT_1DARRAY(order_csym_array, BTF_A.ncol+1);
    //MALLOC_INT_1DARRAY(csymamd_perm, BTF_A.ncol+1);
    init_value(order_csym_array, BTF_A.ncol+1,(Int) 0);
    //init_value(csymamd_perm, BTF_A.ncol+1,(Int) 0);

    csymamd_order(BTF_A, order_csym_array, cmember);
    //csymamd_order(BTF_A, csymamd_perm, cmember);

    // cmember is not read again after the call above (the later use is
    // commented out), so release it instead of leaving it allocated.
    FREE_INT_1DARRAY(cmember);

    //permute(BTF_A, csymamd_perm, csymamd_perm);
    permute_col(BTF_A, order_csym_array);
    sort_matrix(BTF_A);
    permute_row(BTF_A, order_csym_array);
    sort_matrix(BTF_A);
    //printMTX("A_BTF_AMD.mtx", BTF_A);

    if(btf_nblks > 1)
    {
      permute_row(BTF_B, order_csym_array);
      sort_matrix(BTF_B);
      //printMTX("B_BTF_AMD.mtx", BTF_B);
      sort_matrix(BTF_C);
      //printMTX("C_BTF_AMD.mtx", BTF_C);
    }

    //6. Move to 2D Structure
    //finds the shapes for both view and submatrices,
    //need to be changed over to just submatrices
    matrix_to_views_2D(BTF_A);
    //finds the starting point of A for submatrices
    find_2D_convert(BTF_A);
    //now we can fill submatrices
    #ifdef BASKER_KOKKOS
    kokkos_order_init_2D<Int,Entry,Exe_Space> iO(this);
    Kokkos::parallel_for(TeamPolicy(num_threads,1), iO);
    Kokkos::fence();
    #else
    //Comeback
    #endif
    //printMTX("BTF_A.mtx", BTF_A);
  }//end if(btf_tabs_offset != 0)

  if(btf_nblks > 1)
  {
    sort_matrix(BTF_C);
    //printMTX("C_TEST.mtx", BTF_C);
    //Permute C
    MALLOC_INT_1DARRAY(order_c_csym_array, BTF_C.ncol+1);
    init_value(order_c_csym_array, BTF_C.ncol+1,(Int) 0);

    printf("BEFORE \n");
    //csymamd_order(BTF_C, order_c_csym_array, cmember);
    blk_amd(BTF_C, order_c_csym_array);
    printf("After perm\n");

    permute_col(BTF_C, order_c_csym_array);
    sort_matrix(BTF_C);
    permute_row(BTF_C, order_c_csym_array);
    sort_matrix(BTF_C);

    // BTF_B shares its columns with BTF_C, so it gets the same column
    // permutation.
    if(btf_tabs_offset != 0)
    {
      permute_col(BTF_B, order_c_csym_array);
      sort_matrix(BTF_B);
      //printMTX("BTF_B.mtx", BTF_B);
    }
    //printMTX("BTF_C.mtx", BTF_C);
  }

  //printf("Done with ordering\n");

  return 0;
}//end btf_order
BASKER_INLINE
int Basker<Int, Entry,Exe_Space>::match_ordering(int option)
{
  // Computes a matching permutation of A with mwm(), stores it in
  // order_match_array (A.nrow entries), sets match_flag, and applies it
  // to the rows of A.
  //   option : currently unused
  // Always returns 0.
  //printf("match_order called\n");

  /* ---- Tests --------
  INT_1DARRAY mperm;
  MALLOC_INT_1DARRAY(mperm, A.nrow);
  mc64(2,mperm);
  INT_1DARRAY mperm2;
  MALLOC_INT_1DARRAY(mperm2, A.nrow);
  mwm(A,mperm2);
  return 0;
  */

  // The mc64 path is disabled, so its `job` argument (was 2; 5 is the
  // default for SuperLU_DIST) is no longer declared as an unused local.
  //INT_1DARRAY mperm = order_match_array;
  MALLOC_INT_1DARRAY(order_match_array, A.nrow);
  //MALLOC_INT_1DARRAY(mperm, A.nrow);
  mwm(A,order_match_array);
  //mc64(job,order_match_array);
  //mc64(job,mperm);

  match_flag = BASKER_TRUE;

  #ifdef BASKER_DEBUG_ORDER
  printf("Matching Perm \n");
  for(Int i = 0; i < A.nrow; i++)
  {
    printf("%d, \n", order_match_array(i));
    //printf("%d, \n", mperm[i]);
  }
  printf("\n");
  #endif

  //We want to test what the match ordering does if
  //have explicit zeros
  #ifdef BASKER_DEBUG_ORDER
  // Dump the permutation; skip quietly if the file cannot be opened.
  FILE *fp = fopen("match_order.txt", "w");
  if(fp != NULL)
  {
    for(Int i = 0; i < A.nrow; i++)
    {
      fprintf(fp, "%d \n", order_match_array(i));
    }
    fclose(fp);
  }
  #endif

  permute_row(A,order_match_array);
  //permute_row(A,mperm);
  //May have to call row_idx sort

  return 0;
}//end match_ordering()
BASKER_INLINE
void BaskerMatrix<Int,Entry,Exe_Space>::convert2D
(
 BASKER_MATRIX &M,
 BASKER_BOOL alloc,
 Int kid
)
{
  // Fills this sub-matrix (rows [srow, srow+nrow), columns
  // [scol, scol+ncol)) from the matching region of M.
  //   M     : source matrix
  //   alloc : when BASKER_TRUE, row_idx and val are (re)allocated with
  //           nnz entries first
  //   kid   : only used in the diagnostic message below
  //
  // On entry col_ptr(c) is expected to hold, for local column c, the
  // position in M where this sub-matrix's entries start, or
  // BASKER_MAX_IDX if the column has none (a pre-scan has already
  // happened). On exit col_ptr is a regular column-pointer array.

  // Empty sub-matrix: zero the pointers and keep one placeholder entry.
  // Note that this path allocates regardless of `alloc`.
  if(nnz == 0)
  {
    for(Int i = 0; i < ncol+1; i++)
    {
      col_ptr(i) = 0;
    }

    MALLOC_INT_1DARRAY(row_idx, 1);
    row_idx(0) = (Int) 0;
    MALLOC_ENTRY_1DARRAY(val, 1);
    val(0) = (Entry) 0;
    return;
  }

  //info();
  //We could check some flag ??
  //We assume a pre-scan has already happened

  // nnz != 0 from here on, so the former `else if(nnz == 0)` allocation
  // branches could never run and have been removed.
  if((alloc == BASKER_TRUE) && (nnz > 0))
  {
    //printf("ALLOC\n");
    BASKER_ASSERT(nnz > 0, "matrix row nnz 2");
    MALLOC_INT_1DARRAY(row_idx, nnz);
  }
  //init_value(row_idx, nnz, (Int) 0);
  //printf("clear row: %d \n", nnz);
  for(Int i = 0; i < nnz; ++i)
  {
    //printf("clear row_idx(%d) \n", i);
    row_idx(i) = 0;
  }

  if((alloc == BASKER_TRUE) && (nnz > 0))
  {
    BASKER_ASSERT(nnz > 0, "matrix nnz 4");
    MALLOC_ENTRY_1DARRAY(val, nnz);
  }
  //init_value(val, nnz, (Entry) 0);
  for(Int i = 0; i < nnz; ++i)
  {
    val(i) = (Entry) 0;
  }

  Int temp_count = 0; // entries copied so far

  for(Int k = scol; k < scol+ncol; ++k)
  {
    //note col_ptr[k-scol] contains the starting index
    if(col_ptr(k-scol) == BASKER_MAX_IDX)
    {
      // No entries in this column: record the running count and move on.
      col_ptr(k-scol) = temp_count;
      //printf("continue called, k: %d \n", k);
      continue;
    }

    for(Int i = col_ptr(k-scol); i < M.col_ptr(k+1); i++)
    {
      Int j = M.row_idx(i);
      //printf("i: %d j:%d \n", i,j);

      // Past the last row of this sub-matrix: done with this column.
      if(j >= srow+nrow)
      {
        break;
      }
      //printf("writing row_dix: %d i: %d val: %d nnz: %d srow: %d nrow: %d \n",
      //       temp_count, i, j, nnz,
      //       srow, nrow);
      //BASKER_ASSERT(temp_count < nnz, "2DConvert, too many values");
      //row_idx[temp_count] = j;

      if(j-srow <0)
      {
        std::cout << "kid: " << kid
                  << " j: " << j
                  << " srow: " << srow
                  << " k: " << k
                  << " idx: " << i
                  << std::endl;
        BASKER_ASSERT(0==1, "j-srow NO");
      }

      // Rows are stored relative to the first row of the sub-matrix.
      row_idx(temp_count) = j-srow;
      val(temp_count)     = M.val(i);
      temp_count++;
    }

    // After the column is copied, col_ptr holds its END position ...
    col_ptr(k-scol) = temp_count;
  }

  //col_ptr[0] = 0;
  //NO!!1
  //Slide over the col counts
  // ... so shift everything right by one to get start positions.
  for(Int i = ncol; i > 0; i--)
  {
    col_ptr(i) = col_ptr(i-1);
  }
  col_ptr(0) = (Int) 0;

  //info();
  //print();
}//end convert2d(Matrix)
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::solve_interface
(
 Entry *_x, //Solution (len = gn)
 Entry *_y
)
{
  // Solves with the stored factorization for one right-hand side given
  // as raw arrays.
  //   _x : output, gn entries are written
  //   _y : right-hand side, gn entries are read
  // The right-hand side is copied, pushed through every ordering that
  // is flagged as active, solved by the array overload of
  // solve_interface, and the solution is permuted back before being
  // copied to _x. Always returns 0.
  //
  // NOTE(review): y is allocated with gm entries but filled and
  // permuted over gn entries -- this appears to assume gn == gm; confirm.
  // NOTE(review): the permutations applied to x afterwards are not the
  // exact reverse sequence of those applied to y; left as found.

  //Need to modify to use global perm
  INT_1DARRAY temp_array; // scratch: a permutation padded to length gn
  MALLOC_INT_1DARRAY(temp_array, gn);

  //===== Move to view===========
  ENTRY_1DARRAY x;
  ENTRY_1DARRAY y;
  MALLOC_ENTRY_1DARRAY(x, gn);
  MALLOC_ENTRY_1DARRAY(y, gm);

  for(Int i = 0; i < gn; i++)
  {
    x(i) = (Entry) 0;
    y(i) = (Entry) _y[i];
  }

  //printf("RHS: \n");
  //printVec(y, gn);
  //printf("\n");

  //===== Permute
  //printf("Permute RHS\n");
  //==== Need to make this into one global perm
  if(match_flag == BASKER_TRUE)
  {
    //printf("match order\n");
    //printVec("match.txt", order_match_array, gn);
    permute_inv(y,order_match_array, gn);
  }
  if(btf_flag == BASKER_TRUE)
  {
    //printf("btf order\n");
    //printVec("btf.txt", order_btf_array, gn);
    permute_inv(y,order_btf_array, gn);
    //printVec("btf_amd.txt", order_c_csym_array, gn);
    permute_inv(y,order_blk_amd_array, gn);
  }
  if(nd_flag == BASKER_TRUE)
  {
    //printf("ND order \n");
    //printVec("nd.txt", part_tree.permtab, gn);
    // The ND permutation only covers BTF_A; pad with the identity.
    for(Int i = 0; i < BTF_A.ncol; ++i)
    {
      temp_array(i) = part_tree.permtab(i);
    }
    for(Int i = BTF_A.ncol; i < gn; ++i)
    {
      temp_array(i) = i;
    }
    //permute_inv(y,part_tree.permtab, gn);
    permute_inv(y, temp_array,gn);
  }
  if(amd_flag == BASKER_TRUE)
  {
    //printf("AMD order \n");
    //printVec("amd.txt",order_csym_array, gn);
    // Same padding for the constrained-AMD permutation.
    for(Int i = 0; i < BTF_A.ncol; ++i)
    {
      temp_array(i) = order_csym_array(i);
    }
    for(Int i = BTF_A.ncol; i < gn; ++i)
    {
      temp_array(i) = i;
    }
    //permute_inv(y,order_csym_array, gn);
    permute_inv(y,temp_array, gn);
  }

  //printVec("perm.txt" , gperm, gn);
  permute_inv(y,gperm, gn);

  solve_interface(x,y);

  //Inverse perm
  //Note: don't need to inverse a row only perm
  if(btf_flag == BASKER_TRUE)
  {
    //printf("btf order\n");
    //printVec(order_btf_array, gn);
    permute(x,order_btf_array, gn);
  }
  if(nd_flag == BASKER_TRUE)
  {
    //printf("ND order \n");
    //printVec(part_tree.permtab, gn);
    for(Int i = 0; i < BTF_A.ncol; ++i)
    {
      temp_array(i) = part_tree.permtab(i);
    }
    for(Int i = BTF_A.ncol; i < gn; i++)
    {
      temp_array(i) = i;
    }
    //permute(x,part_tree.permtab, gn);
    permute(x,temp_array, gn);
  }
  if(amd_flag == BASKER_TRUE)
  {
    //printf("AMD order \n");
    //printVec(order_csym_array, gn);
    for(Int i = 0; i < BTF_A.ncol; ++i)
    {
      temp_array(i) = order_csym_array(i);
    }
    // Pad with the identity, exactly like the forward pass above. The
    // tail used to be read from order_csym_array(i), which is past the
    // part of that array covering BTF_A.
    for(Int i = BTF_A.ncol; i < gn; ++i)
    {
      temp_array(i) = i;
    }
    //permute(x,order_csym_array, gn);
    permute(x,temp_array,gn);
  }

  #ifdef BASKER_DEBUG_SOLVE_RHS
  printf("\n\n");
  printf("X: \n");
  for(Int i = 0; i < gn; i++)
  {
    printf("%f, " , x(i));
  }
  printf("\n\n");
  printf("RHS: \n");
  for(Int i = 0; i < gm; i++)
  {
    printf("%f, ", y(i));
  }
  printf("\n\n");
  #endif

  // Hand the solution back to the caller.
  for(Int i = 0; i < gn; i++)
  {
    _x[i] = x(i);
  }

  FREE_ENTRY_1DARRAY(x);
  FREE_ENTRY_1DARRAY(y);
  FREE_INT_1DARRAY(temp_array);

  return 0;
}
void Basker<Int,Entry,Exe_Space>::blk_amd(BASKER_MATRIX &M, INT_1DARRAY p)
{
  // Computes an AMD ordering separately for every BTF block
  // b in [btf_tabs_offset, btf_nblks) and writes it into p.
  //   M : matrix that holds the blocks
  //   p : output permutation (p == length(M))
  // Blocks with fewer than three columns are not reordered.
  //
  //Scan over all blks
  //Note, that this needs to be made parallel in the
  //future (Future Josh will be ok with this, right?)

  //This is a horrible way to do this!!!!!
  //KLU does this very nice, but they also make all the little blks

  // Scratch storage for one extracted diagonal block at a time.
  INT_1DARRAY blk_col_ptr;
  MALLOC_INT_1DARRAY(blk_col_ptr, M.ncol+1);
  INT_1DARRAY blk_row_idx;
  MALLOC_INT_1DARRAY(blk_row_idx, M.nnz);

  for(Int b = btf_tabs_offset; b < btf_nblks; b++)
  {
    const Int first    = btf_tabs(b);
    const Int last     = btf_tabs(b+1);
    const Int blk_size = last - first;

    if(blk_size < 3)
    {
      //printf("debug, blk_size: %d \n", blk_size);
      for(Int ii = 0; ii < blk_size; ++ii)
      {
        //printf("set %d \n", btf_tabs(b)+ii-M.scol);
        p(ii+first) = first+ii-M.scol;
      }
      continue;
    }

    INT_1DARRAY blk_perm;
    MALLOC_INT_1DARRAY(blk_perm, blk_size+1);

    //Fill in temp matrix
    // Keep only entries on or below the block's first row, with rows
    // renumbered relative to the block.
    Int blk_nnz = 0;
    blk_col_ptr(0) = 0;
    for(Int k = first; k < last; k++)
    {
      for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); i++)
      {
        if(M.row_idx(i) >= first)
        {
          blk_row_idx(blk_nnz) = M.row_idx(i) - first;
          blk_nnz++;
        }
      }// end over all row_idx
      blk_col_ptr(k-first+1) = blk_nnz;
    }//end over all columns k

    #ifdef BASKER_DEBUG_ORDER_AMD
    printf("col_ptr: ");
    for(Int i = 0 ; i < blk_size+1; i++)
    {
      printf("%d, ", blk_col_ptr(i));
    }
    printf("\n");
    printf("row_idx: ");
    for(Int i = 0; i < blk_nnz; i++)
    {
      printf("%d, ", blk_row_idx(i));
    }
    printf("\n");
    #endif

    BaskerSSWrapper<Int>::amd_order(blk_size, &(blk_col_ptr(0)),
                                    &(blk_row_idx(0)),&(blk_perm(0)));

    #ifdef BASKER_DEBUG_ORDER_AMD
    printf("blk: %d order: \n", b);
    for(Int ii = 0; ii < blk_size; ii++)
    {
      printf("%d, ", blk_perm(ii));
    }
    #endif

    //Add to the bigger perm vector
    for(Int ii = 0; ii < blk_size; ii++)
    {
      //printf("loc: %d val: %d \n",
      //ii+btf_tabs(b), tempp(ii)+btf_tabs(b));
      p(blk_perm(ii)+first) = ii+first;
    }

    FREE_INT_1DARRAY(blk_perm);
  }//over all blk_tabs

  #ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int ii = 0; ii < M.ncol; ii++)
  {
    printf("%d, ", p(ii));
  }
  printf("\n");
  #endif

  FREE_INT_1DARRAY(blk_col_ptr);
  FREE_INT_1DARRAY(blk_row_idx);
}//end blk_amd()
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_inc_lvl(Int option)
{
  // Level-based incomplete factorization driver.
  //   option : currently unused in this routine
  // Runs the domain kernel, re-running after a remalloc while
  // nfactor_domain_error() reports a problem, then one separator kernel
  // per tree level. The BTF diagonal stage is currently disabled.
  // Always returns 0.
  printf("Factor Inc Level Called \n");

  // Global dimensions are taken from A before A is replaced below (they
  // used to be assigned a second time inside the branch, with the same
  // values).
  gn = A.ncol;
  gm = A.nrow;

  if(Options.btf == BASKER_TRUE)
  {
    //JDB: We can change this for the new inteface
    //call reference copy constructor
    A = BTF_A;
    //printf("\n\n Switching A, newsize: %d \n",
    //       A.ncol);
    //printMTX("A_FACTOR.mtx", A);
  }

  //Spit into Domain and Sep
  //----------------------Domain-------------------------//
  #ifdef BASKER_KOKKOS

  //====TIMER==
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  //===TIMER===

  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;

  if(btf_tabs_offset != 0)
  {
    kokkos_nfactor_domain_inc_lvl <Int,Entry,Exe_Space>
      domain_nfactor(this);
    Kokkos::parallel_for(TeamPolicy(num_threads,1),
                         domain_nfactor);
    Kokkos::fence();

    //=====Check for error======
    while(true)
    {
      INT_1DARRAY thread_start;
      MALLOC_INT_1DARRAY(thread_start, num_threads+1);
      init_value(thread_start, num_threads+1,
                 (Int) BASKER_MAX_IDX);
      int nt = nfactor_domain_error(thread_start);
      if(nt == BASKER_SUCCESS)
      {
        break;
      }
      else
      {
        printf("restart \n");
        kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
          diag_nfactor_remalloc(this, thread_start);
        Kokkos::parallel_for(TeamPolicy(num_threads,1),
                             diag_nfactor_remalloc);
        Kokkos::fence();
      }
    }//end while

    //====TIMER===
    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif
    //====TIMER====
  }

  #else // else basker_kokkos
  // The closing brace of the Kokkos branch used to sit after the #endif,
  // which left this branch with an unmatched '}'.
  #pragma omp parallel
  {
  }//end omp parallel
  #endif //end basker_kokkos
  //-------------------End--Domian--------------------------//

  //---------------------------Sep--------------------------//
  if(btf_tabs_offset != 0)
  {
    //for(Int l=1; l<=1; l++)
    for(Int l=1; l <= tree.nlvls; l++)
    {
      //Come back for syncs

      //#ifdef BASKER_OLD_BARRIER
      // 2^l threads per team, computed exactly with a shift instead of
      // the floating-point pow().
      Int lthreads = ((Int) 1) << l;
      Int lnteams  = num_threads/lthreads;
      //#else
      //Int lthreads = 1;
      //Int lnteams = num_threads/lthreads;
      //#endif

      //printf("\n\n ============ SEP: %d ======\n\n",l);

      #ifdef BASKER_KOKKOS
      Kokkos::Impl::Timer timer_inner_sep;
      #ifdef BASKER_NO_LAMBDA

      kokkos_nfactor_sep2_inc_lvl <Int, Entry, Exe_Space>
        sep_nfactor(this,l);
      Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                           sep_nfactor);
      Kokkos::fence();

      #ifdef BASKER_TIME
      printf("Time INNERSEP: %d %f \n",
             l, timer_inner_sep.seconds());
      #endif
      #else //ELSE BASKER_NO_LAMBDA
      //Note: to be added
      #endif //end BASKER_NO_LAMBDA
      #else
      #pragma omp parallel
      {
      }//end omp parallel
      #endif
    }//end over each level

    // `timer` only exists in Kokkos builds.
    #if defined(BASKER_KOKKOS) && defined(BASKER_TIME)
    printf("Time SEP: %f \n", timer.seconds());
    #endif
  }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  if(Options.btf == BASKER_TRUE)
  {
    //=====Timer
    #if defined(BASKER_KOKKOS) && defined(BASKER_TIME)
    Kokkos::Impl::Timer timer_btf;
    #endif
    //====Timer

    //======Call diag factor====
    /*
    kokkos_nfactor_diag <Int, Entry, Exe_Space>
      diag_nfactor(this);
    Kokkos::parallel_for(TeamPolicy(num_threads,1),
                         diag_nfactor);
    Kokkos::fence();
    */

    //=====Check for error======
    //while(true)
    // {
    //INT_1DARRAY thread_start;
    // MALLOC_INT_1DARRAY(thread_start, num_threads+1);
    //init_value(thread_start, num_threads+1,
    //           (Int) BASKER_MAX_IDX);
    //int nt = nfactor_diag_error(thread_start);
    // if(nt == BASKER_SUCCESS)
    //  {
    //    break;
    //  }
    //else
    // {
    /*
    break;
    printf("restart \n");
    kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
      diag_nfactor_remalloc(this, thread_start);
    Kokkos::parallel_for(TeamPolicy(num_threads,1),
                         diag_nfactor);
    Kokkos::fence();
    */
    //}
    // }//end while

    //====TIMER
    #if defined(BASKER_KOKKOS) && defined(BASKER_TIME)
    printf("Time BTF: %f \n", timer_btf.seconds());
    #endif
    //===TIMER
  }//end btf call

  return 0;
}//end factor_lvl_inc()
BASKER_INLINE
int Basker<Int,Entry, Exe_Space>::find_btf2
(
 BASKER_MATRIX &M
)
{
  // BTF ordering of M:
  //   - strong_component() fills order_btf_array / btf_tabs and yields
  //     the block count, which is stored in btf_nblks
  //   - M is permuted by that ordering (columns, then rows)
  //   - btf_blk_amd() provides a per-block ordering plus per-block
  //     nnz / work values, and M is permuted again and sorted
  //   - break_into_parts2() and find_btf_schedule() finish the job
  // Sets btf_flag. Always returns 0.
  Int nblks = 0;
  strong_component(M,nblks,order_btf_array,btf_tabs);

  btf_nblks = nblks;
  btf_flag  = BASKER_TRUE;

  //#ifdef BASKER_DEBUG_ORDER_BTF
  printf("BTF nblks returned: %d \n", nblks);
  //BASKER_ASSERT(nblks>1, "NOT ENOUGH BTF BLOCKS");
  //#endif

  #ifdef BASKER_DEBUG_ORDER_BTF
  if(nblks<2)
  {
    printf("BTF did not find enough blks\n");
  }
  #endif

  //#ifdef BASKER_DEBUG_ORDER_BTF
  /*
  printf("\nBTF perm: \n");
  for(Int i=0; i <M.nrow; i++)
  {
    printf("%d, ", order_btf_array(i));
    //printf("%d, ", btf_perm(i));
  }
  */
  // These prints are emitted unconditionally.
  printf("num_threads: %d \n", num_threads);
  printf("\n\nBTF tabs: \n");
  for(Int t = 0; t < nblks+1; t++)
  {
    printf("%d, ", btf_tabs(t));
  }
  printf("\n");
  // #endif

  // Apply the BTF ordering symmetrically.
  permute_col(M, order_btf_array);
  permute_row(M, order_btf_array);

  // Zero-initialised outputs for the block AMD step.
  MALLOC_INT_1DARRAY(order_blk_amd_array, M.ncol);
  init_value(order_blk_amd_array, M.ncol, (Int)0);
  MALLOC_INT_1DARRAY(btf_blk_nnz, nblks+1);
  init_value(btf_blk_nnz, nblks+1, (Int) 0);
  MALLOC_INT_1DARRAY(btf_blk_work, nblks+1);
  init_value(btf_blk_work, nblks+1, (Int) 0);

  //Find AMD blk ordering, get nnz, and get work
  btf_blk_amd( M, order_blk_amd_array,
               btf_blk_nnz, btf_blk_work);

  #ifdef BASKER_DEBUG_ORDER_BTF
  printf("blk_perm:\n");
  for(Int c = 0; c < M.ncol; c++)
  {
    printf("(%d,%d) ", c, order_blk_amd_array(c));
  }
  printf("\n");
  printf("id/blk_size/blk_nnz/work: \n");
  for(Int b = 0; b < nblks; b++)
  {
    printf("(%d, %d, %d, %d) ", b,
           btf_tabs(b+1)-btf_tabs(b),
           btf_blk_nnz(b), btf_blk_work(b));
  }
  printf("\n");
  #endif

  //printMTX("A_BEFORE.mtx", M);
  //printVec("AMD.txt", order_blk_amd_array, M.ncol);

  // Apply the per-block ordering symmetrically, then sort the columns.
  permute_col(M, order_blk_amd_array);
  permute_row(M, order_blk_amd_array);
  sort_matrix(M);

  //changed col to row, error.
  //print to see issue
  //printMTX("A_AMD.mtx", M);

  break_into_parts2(M, nblks, btf_tabs);

  //find schedule
  find_btf_schedule(M, nblks, btf_tabs);

  #ifdef BASKER_DEBUG_ORDER_BTF
  printf("------------BTF CUT: %d --------------\n",
         btf_tabs(btf_tabs_offset));
  #endif

  return 0;
}//end find BTF(nnz)
// Builds an ordering for every diagonal block delimited by the member
// btf_tabs (btf_nblks blocks) and records per-block nnz / work numbers.
//
//   M        : matrix whose blocks are examined (only read here)
//   p        : output ordering; one entry is written per row/column that
//              falls inside a block
//   btf_nnz  : output, one value per block
//   btf_work : output, one value per block
//
// Behaviour by case:
//   * Options.incomplete == BASKER_TRUE: p is the identity and every block
//     gets nnz = work = 1; nothing else is computed.
//   * blocks with fewer than 3 rows: entries of p are set to the index
//     shifted by M.scol; work = size^3 and nnz = .5*size^2 + size.
//   * all other blocks: the part of the block with row index >= the block
//     start is copied into a temporary compressed-column matrix and handed
//     to BaskerSSWrapper<Int>::amd_order; its two double results are stored
//     (converted to Int) in btf_nnz / btf_work.
void Basker<Int,Entry,Exe_Space>::btf_blk_amd
(
 BASKER_MATRIX &M,
 INT_1DARRAY p,
 INT_1DARRAY btf_nnz,
 INT_1DARRAY btf_work
)
{
  if(Options.incomplete == BASKER_TRUE)
  {
    //We note that AMD on incomplete ILUK
    //seems really bad and leads to a zero on the diag.
    //Therefore, we simply return the natural ordering
    for(Int i = 0; i < M.ncol; i++)
    {
      p(i) = i;
    }
    //We will make up work to be 1,
    //since BTF is not supported in our iluk
    for(Int b = 0; b < btf_nblks; b++)
    {
      btf_nnz(b)  = 1;
      btf_work(b) = 1;
    }
    return;
  }

  //Scan over all blks.
  //Note: this needs to be made parallel in the future.
  //The scratch matrix is sized for the whole of M so it can be reused
  //for every block.
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int b = 0; b < btf_nblks; b++)
  {
    Int blk_size = btf_tabs(b+1) - btf_tabs(b);

    if(blk_size < 3)
    {
      //Tiny block: no ordering call, closed-form estimates
      for(Int ii = 0; ii < blk_size; ++ii)
      {
        p(ii+btf_tabs(b)) = btf_tabs(b)+ii-M.scol;
      }
      btf_work(b) = blk_size*blk_size*blk_size;
      btf_nnz(b)  = (.5*(blk_size*blk_size) + blk_size);
      continue;
    }

    INT_1DARRAY tempp;
    MALLOC_INT_1DARRAY(tempp, blk_size+1);

    //Fill in temp matrix with block-local row indices
    Int nnz    = 0;
    Int column = 1;
    temp_col(0) = 0;
    for(Int k = btf_tabs(b); k < btf_tabs(b+1); k++)
    {
      for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); i++)
      {
        //Skip entries above the diagonal block
        if(M.row_idx(i) < btf_tabs(b))
          continue;

        temp_row(nnz) = M.row_idx(i) - btf_tabs(b);
        nnz++;
      }// end over all row_idx
      temp_col(column) = nnz;
      column++;
    }//end over all columns k

    #ifdef BASKER_DEBUG_ORDER_AMD
    printf("col_ptr: ");
    for(Int i = 0 ; i < blk_size+1; i++)
    {
      printf("%d, ", temp_col(i));
    }
    printf("\n");
    printf("row_idx: ");
    for(Int i = 0; i < nnz; i++)
    {
      printf("%d, ", temp_row(i));
    }
    printf("\n");
    #endif

    double l_nnz   = 0;
    double lu_work = 0;
    BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                    &(temp_row(0)), &(tempp(0)),
                                    l_nnz, lu_work);
    btf_nnz(b)  = l_nnz;
    btf_work(b) = lu_work;

    #ifdef BASKER_DEBUG_ORDER_AMD
    printf("blk: %d order: \n", b);
    for(Int ii = 0; ii < blk_size; ii++)
    {
      printf("%d, ", tempp(ii));
    }
    #endif

    //Add to the bigger perm vector (stored inverted, shifted to the
    //block's position)
    for(Int ii = 0; ii < blk_size; ii++)
    {
      p(tempp(ii)+btf_tabs(b)) = ii+btf_tabs(b);
    }

    FREE_INT_1DARRAY(tempp);
  }//over all blk_tabs

  //The other AMD dumps in this function are guarded by
  //BASKER_DEBUG_ORDER_AMD; this one used the transposed spelling
  //BASKER_DEBUG_AMD_ORDER.  Accept either so that a single switch enables
  //all of them without breaking builds that define the old name.
  #if defined(BASKER_DEBUG_ORDER_AMD) || defined(BASKER_DEBUG_AMD_ORDER)
  printf("blk amd final order\n");
  for(Int ii = 0; ii < M.ncol; ii++)
  {
    printf("%d, ", p(ii));
  }
  printf("\n");
  #endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()
// Loads a compressed-column matrix into A, copies the factor structure,
// runs the numeric factorization and prepares the solve-time workspaces.
//
//   nrow, ncol, nnz : dimensions / entry count of the input matrix
//   col_ptr, row_idx, val : compressed-column arrays of the input matrix
//
// Returns 0 on success and BASKER_ERROR as soon as the value copy, the
// structure copy (sfactor_copy2) or the numeric factorization
// (factor_notoken / factor_inc_lvl) reports BASKER_ERROR.
//
// Side effects visible here: Options.no_pivot is forced to BASKER_TRUE when
// Options.same_pattern is set; matrix_flag and (on success) factor_flag are
// set to BASKER_TRUE.
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::Factor(Int nrow, Int ncol,
                                        Int nnz, Int *col_ptr,
                                        Int *row_idx, Entry *val)
{
  int err = 0;

  if(Options.verbose == BASKER_TRUE)
  {
    std::cout << "Basker Factor Called" << std::endl;
    std::cout << "Matrix: " << nrow << " " << ncol << " " << nnz
              << std::endl;
  }

  if((Options.same_pattern == BASKER_TRUE) &&
     (Options.no_pivot == BASKER_FALSE))
  {
    printf("Warning: Same Pattern will not allow pivoting\n");
    Options.no_pivot = BASKER_TRUE;
  }

  if(Options.transpose == BASKER_FALSE)
  {
    // Keep the status of the copy: the BASKER_ERROR check below was
    // previously dead because the result was thrown away.
    err = A.copy_values(nrow, ncol, nnz, col_ptr, row_idx, val);
  }
  else
  {
    //Will transpose and put in A using little extra
    matrix_transpose(0, nrow, 0, ncol, nnz,
                     col_ptr, row_idx, val, A);
  }

  sort_matrix(A);
  if(Options.verbose_matrix_out == BASKER_TRUE)
  {
    printMTX("A_Factor.mtx", A);
  }
  matrix_flag = BASKER_TRUE;

  if(err == BASKER_ERROR)
  {
    return BASKER_ERROR;
  }

  err = sfactor_copy2();
  if(Options.verbose == BASKER_TRUE)
  {
    printf("Basker Copy Structure Done \n");
  }
  if(err == BASKER_ERROR)
  {
    return BASKER_ERROR;
  }

  if(Options.incomplete == BASKER_FALSE)
  {
    err = factor_notoken(0);
  }
  else
  {
    err = factor_inc_lvl(0);
  }
  if(err == BASKER_ERROR)
  {
    return BASKER_ERROR;
  }

  if(Options.verbose == BASKER_TRUE)
  {
    printf("Basker Factor Done \n");
  }

  // NDE: workspaces consumed by the solve phase, sized by gn / gm.
  MALLOC_ENTRY_1DARRAY(x_view_ptr_copy, gn); //used in basker_solve_rhs - move alloc
  MALLOC_ENTRY_1DARRAY(y_view_ptr_copy, gm);
  MALLOC_INT_1DARRAY(perm_inv_comp_array , gm); //y
  MALLOC_INT_1DARRAY(perm_comp_array, gn); //x
  MALLOC_INT_1DARRAY(perm_comp_iworkspace_array, gn);
  MALLOC_ENTRY_1DARRAY(perm_comp_fworkspace_array, gn);
  permute_composition_for_solve();

  factor_flag = BASKER_TRUE;

  return 0;
}//end Factor()
// Numeric factorization driver.  Runs up to three phases, each launched as
// a Kokkos parallel_for followed by a fence and an error/restart loop:
//   1. domain blocks        (only when btf_tabs_offset != 0)
//   2. separator levels 1..tree.nlvls (only when btf_tabs_offset != 0;
//      the kernel is only compiled under BASKER_NO_LAMBDA)
//   3. BTF diagonal blocks  (only when Options.btf == BASKER_TRUE)
//
// When Options.btf is set, A is temporarily replaced by BTF_A for the
// duration of the call and restored before returning.
//
// `option` is not used by this function.
//
// Returns 0 normally.  Returns BASKER_ERROR when one of the
// nfactor_*_error checks reports BASKER_ERROR; in that case the remaining
// phases are skipped.  (Previously that status was dropped and 0 was always
// returned, so the caller's BASKER_ERROR test could never trigger.)
// NOTE(review): running out of restarts (counter > BASKER_RESTART) still
// falls through as before - confirm whether that should also be an error.
BASKER_INLINE
int Basker<Int, Entry, Exe_Space>::factor_notoken(Int option)
{
  gn = A.ncol;
  gm = A.nrow;
  BASKER_MATRIX ATEMP;

  if(Options.btf == BASKER_TRUE)
  {
    //JDB: We can change this for the new interface
    ATEMP = A;
    A     = BTF_A;
  }

  int info = 0;

  #ifdef BASKER_KOKKOS
  #ifdef BASKER_TIME
  Kokkos::Impl::Timer timer;
  #endif
  typedef Kokkos::TeamPolicy<Exe_Space> TeamPolicy;
  #endif

  //----------------------Domain-------------------------//
  // The braces of this block are kept outside the preprocessor branches so
  // they balance in both configurations.
  if(btf_tabs_offset != 0)
  {
    #ifdef BASKER_KOKKOS
    if(Options.verbose == BASKER_TRUE)
    {
      printf("Factoring Dom num_threads: %d \n", num_threads);
    }

    Int domain_restart = 0;
    kokkos_nfactor_domain <Int,Entry,Exe_Space>
      domain_nfactor(this);
    Kokkos::parallel_for(TeamPolicy(num_threads,1),
                         domain_nfactor);
    Kokkos::fence();

    //=====Check for error======
    while(true)
    {
      INT_1DARRAY thread_start;
      MALLOC_INT_1DARRAY(thread_start, num_threads+1);
      init_value(thread_start, num_threads+1,
                 (Int) BASKER_MAX_IDX);
      int nt = nfactor_domain_error(thread_start);
      if((nt == BASKER_SUCCESS) ||
         (nt == BASKER_ERROR) ||
         (domain_restart > BASKER_RESTART))
      {
        if(nt == BASKER_ERROR)
        {
          info = BASKER_ERROR;
        }
        FREE_INT_1DARRAY(thread_start);
        break;
      }

      domain_restart++;
      if(Options.verbose == BASKER_TRUE)
      {
        printf("restart \n");
      }
      {
        // Any other status: relaunch with the remalloc kernel, which
        // receives thread_start.
        kokkos_nfactor_domain_remalloc <Int, Entry, Exe_Space>
          diag_nfactor_remalloc(this, thread_start);
        Kokkos::parallel_for(TeamPolicy(num_threads,1),
                             diag_nfactor_remalloc);
        Kokkos::fence();
      }
      // Kernel has completed (fence above); release this iteration's array.
      FREE_INT_1DARRAY(thread_start);
    }//end while

    #ifdef BASKER_TIME
    printf("Time DOMAIN: %f \n", timer.seconds());
    timer.reset();
    #endif

    #else // else basker_kokkos
    #pragma omp parallel
    {
    }//end omp parallel
    #endif //end basker_kokkos
  }
  //-------------------End--Domain--------------------------//

  //---------------------------Sep--------------------------//
  if((btf_tabs_offset != 0) && (info != BASKER_ERROR))
  {
    for(Int l = 1; (l <= tree.nlvls) && (info != BASKER_ERROR); l++)
    {
      Int lthreads = 1;
      Int lnteams  = num_threads/lthreads;

      Int sep_restart = 0;

      if(Options.verbose == BASKER_TRUE)
      {
        printf("Factoring Sep num_threads: %d %d \n",
               lnteams, lthreads);
      }

      #ifdef BASKER_KOKKOS
      Kokkos::Impl::Timer timer_inner_sep;
      #ifdef BASKER_NO_LAMBDA

      kokkos_nfactor_sep2 <Int, Entry, Exe_Space>
        sep_nfactor(this,l);

      Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                           sep_nfactor);
      Kokkos::fence();

      //======Check for error=====
      while(true)
      {
        INT_1DARRAY thread_start;
        MALLOC_INT_1DARRAY(thread_start, num_threads+1);
        init_value(thread_start, num_threads+1,
                   (Int) BASKER_MAX_IDX);
        int nt = nfactor_sep_error(thread_start);
        // The relaunch below does not take thread_start, so it can be
        // released on every path right here.
        FREE_INT_1DARRAY(thread_start);

        if((nt == BASKER_SUCCESS) ||
           (nt == BASKER_ERROR) ||
           (sep_restart > BASKER_RESTART))
        {
          if(nt == BASKER_ERROR)
          {
            info = BASKER_ERROR;
          }
          break;
        }

        sep_restart++;
        if(Options.verbose == BASKER_TRUE)
        {
          printf("restart \n");
        }
        Kokkos::parallel_for(TeamPolicy(lnteams,lthreads),
                             sep_nfactor);
        Kokkos::fence();
      }//end while-true

      #ifdef BASKER_TIME
      printf("Time INNERSEP: %d %f \n",
             l, timer_inner_sep.seconds());
      #endif
      #else //ELSE BASKER_NO_LAMBDA
      //Note: to be added
      #endif //end BASKER_NO_LAMBDA
      #else
      #pragma omp parallel
      {
      }//end omp parallel
      #endif
    }//end over each level

    #ifdef BASKER_TIME
    printf("Time SEP: %f \n", timer.seconds());
    #endif
  }
  //-------------------------End Sep----------------//

  //-------------------IF BTF-----------------------//
  if((Options.btf == BASKER_TRUE) && (info != BASKER_ERROR))
  {
    Int btf_restart = 0;

    if(Options.verbose == BASKER_TRUE)
    {
      printf("Factoring BLKs num_threads: %d \n",
             num_threads);
    }

    #ifdef BASKER_TIME
    Kokkos::Impl::Timer timer_btf;
    #endif

    //======Call diag factor====
    kokkos_nfactor_diag <Int, Entry, Exe_Space>
      diag_nfactor(this);
    Kokkos::parallel_for(TeamPolicy(num_threads,1),
                         diag_nfactor);
    Kokkos::fence();

    //=====Check for error======
    while(true)
    {
      INT_1DARRAY thread_start;
      MALLOC_INT_1DARRAY(thread_start, num_threads+1);
      init_value(thread_start, num_threads+1,
                 (Int) BASKER_MAX_IDX);
      int nt = nfactor_diag_error(thread_start);
      if((nt == BASKER_SUCCESS) ||
         (nt == BASKER_ERROR) ||
         (btf_restart > BASKER_RESTART))
      {
        if(nt == BASKER_ERROR)
        {
          info = BASKER_ERROR;
        }
        FREE_INT_1DARRAY(thread_start);
        break;
      }

      btf_restart++;
      if(Options.verbose == BASKER_TRUE)
      {
        printf("restart \n");
      }
      {
        kokkos_nfactor_diag_remalloc <Int, Entry, Exe_Space>
          diag_nfactor_remalloc(this, thread_start);
        Kokkos::parallel_for(TeamPolicy(num_threads,1),
                             diag_nfactor_remalloc);
        Kokkos::fence();
      }
      FREE_INT_1DARRAY(thread_start);
    }//end while

    #ifdef BASKER_TIME
    printf("Time BTF: %f \n", timer_btf.seconds());
    #endif
  }//end btf call

  // Always undo the A <-> BTF_A swap, including on the error path.
  if(Options.btf == BASKER_TRUE)
  {
    A = ATEMP;
  }

  return info;
}//end factor_notoken()
BASKER_INLINE int Basker<Int, Entry,Exe_Space>::break_into_parts2 ( BASKER_MATRIX &M, Int nblks, INT_1DARRAY btf_tabs ) { #ifdef BASKER_DEBUG_ORDER_BTF printf("break_into_parts2 called \n"); printf("nblks: %d \n", nblks); #endif Options.btf = BASKER_TRUE; //Alg. // A -> [BTF_A BTF_B] // [0 BTF_C] //1. Run backward through the btf_tabs to find size C //2. Form A,B,C based on size in 1. //Short circuit, //If nblks == 1, than only BTF_A exists if(nblks == 1) { //#ifdef BASKER_DEBUG_ORDER_BTF printf("Short Circuit part_call \n"); //#endif BTF_A = A; //Options.btf = BASKER_FALSE; btf_tabs_offset = 1; return 0; } //Step 1. //Find total work estimate Int total_work_estimate = 0; for(Int b = 0; b < nblks; b++) { total_work_estimate += btf_blk_work(b); } //Set a class variable to use later btf_total_work = total_work_estimate; //printf("Total work estimate: %d \n", // total_work_estimate); //printf("num_threads: %d epsilon: %f \n", // num_threads, // ((double)1/num_threads) + // ((double)BASKER_BTF_IMBALANCE)); Int break_size = ceil((double)total_work_estimate*( ((double)1/num_threads) + ((double)BASKER_BTF_IMBALANCE))); printf("Break size: %d \n", break_size); Int t_size = 0; Int scol = M.ncol; Int blk_idx = nblks; BASKER_BOOL move_fwd = BASKER_TRUE; while(move_fwd==BASKER_TRUE) { //printf("------TEST blk_idx: %d \n", // blk_idx); Int blk_work = btf_blk_work(blk_idx-1); Int blk_size = btf_tabs(blk_idx) - btf_tabs(blk_idx-1); #ifdef BASKER_DEBUG_ORDER_BTF printf(" \n move_fwd loop \n"); BASKER_ASSERT(blk_idx>=0, "btf blk idx off"); BASKER_ASSERT(blk_work>=0, "btk_work wrong"); BASKER_ASSERT(blk_size>0, "btf blk size wrong"); printf("blk_idx: %d blk_work: %d break_size: %d \n", blk_idx, blk_work, break_size); #endif //Should be end //if(((blk_work < break_size) || // (blk_size < BASKER_BTF_SMALL)) && // (blk_idx > 1)) //Continue to be in btf if(((blk_work < break_size) && (blk_idx > 1))) { #ifdef BASKER_DEBUG_ORDER_BTF printf("first choice \n"); #endif t_size = 
t_size+blk_size; blk_idx = blk_idx-1; scol = btf_tabs[blk_idx]; } //break due to size else if(blk_work >= break_size) { printf("break due to size\n"); move_fwd = BASKER_FALSE; } //break due to end else if(blk_idx == 1) { printf("break last blk\n"); blk_idx = 0; t_size = t_size + blk_size; scol = btf_tabs[blk_idx]; move_fwd = BASKER_FALSE; } //should not be called else { BASKER_ASSERT(1==0, "btf order break"); move_fwd = BASKER_FALSE; } }//end while(move_fwd) //#ifdef BASKER_DEBUG_ORDER_BTF printf("Done finding BTF2 cut. Cut size: %d scol: %d \n", t_size, scol); printf("Done finding BTF2 cut. blk_idx: %d \n", blk_idx); //BASKER_ASSERT(t_size > 0, "BTF CUT SIZE NOT BIG ENOUGH\n"); BASKER_ASSERT((scol >= 0) && (scol < M.ncol), "SCOL\n"); //#endif //Comeback and change btf_tabs_offset = blk_idx; //2. Move into Blocks if(btf_tabs_offset != 0) { //--Move A into BTF_A; BTF_A.set_shape(0, scol, 0, scol); BTF_A.nnz = M.col_ptr(scol); #ifdef BASKER_DEBUG_ORDER_BTF printf("Init BTF_A. ncol: %d nnz: %d \n", scol, BTF_A.nnz); #endif if(BTF_A.v_fill == BASKER_FALSE) { BASKER_ASSERT(BTF_A.ncol >= 0, "BTF_A, col_ptr"); MALLOC_INT_1DARRAY(BTF_A.col_ptr, BTF_A.ncol+1); BASKER_ASSERT(BTF_A.nnz > 0, "BTF_A, nnz"); MALLOC_INT_1DARRAY(BTF_A.row_idx, BTF_A.nnz); MALLOC_ENTRY_1DARRAY(BTF_A.val, BTF_A.nnz); BTF_A.fill(); } Int annz = 0; for(Int k = 0; k < scol; ++k) { #ifdef BASKER_DEBUG_ORDER_BTF printf("copy column: %d into A_BTF, [%d %d] \n", k, M.col_ptr(k), M.col_ptr(k+1)); #endif for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i) { //printf("annz: %d i: %d \n", annz, i); BTF_A.row_idx(annz) = M.row_idx(i); BTF_A.val(annz) = M.val(i); annz++; } BTF_A.col_ptr(k+1) = annz; } }//no A //Fill in B and C at the same time INT_1DARRAY cws; BASKER_ASSERT((M.ncol-scol+1) > 0, "BTF_SIZE MALLOC"); MALLOC_INT_1DARRAY(cws, M.ncol-scol+1); init_value(cws, M.ncol-scol+1, (Int)M.ncol); BTF_B.set_shape(0 , scol, scol, M.ncol-scol); BTF_C.set_shape(scol, M.ncol-scol, scol, M.ncol-scol); #ifdef 
BASKER_DEBUG_ORDER_BTF printf("Set Shape BTF_B: %d %d %d %d \n", BTF_B.srow, BTF_B.nrow, BTF_B.scol, BTF_B.ncol); printf("Set Shape BTF_C: %d %d %d %d \n", BTF_C.srow, BTF_C.nrow, BTF_C.scol, BTF_C.nrow); #endif //Scan and find nnz //We can do this much better!!!! Int bnnz = 0; Int cnnz = 0; for(Int k = scol; k < M.ncol; ++k) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Scanning nnz, k: %d \n", k); #endif for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i) { if(M.row_idx(i) < scol) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz to Upper, %d %d \n", scol, M.row_idx(i)); #endif bnnz++; } else { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz to Lower, %d %d \n", scol, M.row_idx(i)); #endif cnnz++; } }//over all nnz in k }//over all k #ifdef BASKER_DEBUG_ORDER_BTF printf("BTF_B nnz: %d \n", bnnz); printf("BTF_C nnz: %d \n", cnnz); #endif BTF_B.nnz = bnnz; BTF_C.nnz = cnnz; //Malloc need space if((BTF_B.v_fill == BASKER_FALSE) && (BTF_B.nnz > 0)) //if(BTF_B.v_fill == BASKER_FALSE) { BASKER_ASSERT(BTF_B.ncol >= 0, "BTF_B ncol"); MALLOC_INT_1DARRAY(BTF_B.col_ptr, BTF_B.ncol+1); BASKER_ASSERT(BTF_B.nnz > 0, "BTF_B.nnz"); MALLOC_INT_1DARRAY(BTF_B.row_idx, BTF_B.nnz); MALLOC_ENTRY_1DARRAY(BTF_B.val, BTF_B.nnz); BTF_B.fill(); } if(BTF_C.v_fill == BASKER_FALSE) { BASKER_ASSERT(BTF_C.ncol >= 0, "BTF_C.ncol"); MALLOC_INT_1DARRAY(BTF_C.col_ptr, BTF_C.ncol+1); BASKER_ASSERT(BTF_C.nnz > 0, "BTF_C.nnz"); MALLOC_INT_1DARRAY(BTF_C.row_idx, BTF_C.nnz); MALLOC_ENTRY_1DARRAY(BTF_C.val, BTF_C.nnz); BTF_C.fill(); } //scan again (Very bad!!!) 
bnnz = 0; cnnz = 0; for(Int k = scol; k < M.ncol; ++k) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Scanning nnz, k: %d \n", k); #endif for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); ++i) { if(M.row_idx(i) < scol) { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz to Upper, %d %d \n", scol, M.row_idx[i]); #endif BASKER_ASSERT(BTF_B.nnz > 0, "BTF B uninit"); //BTF_B.row_idx[bnnz] = M.row_idx[i]; //Note: do not offset because B srow = 0 BTF_B.row_idx(bnnz) = M.row_idx(i); BTF_B.val(bnnz) = M.val(i); bnnz++; } else { #ifdef BASKER_DEBUG_ORDER_BTF printf("Adding nnz Lower,k: %d %d %d %f \n", k, scol, M.row_idx[i], M.val(i)); #endif //BTF_C.row_idx[cnnz] = M.row_idx[i]; BTF_C.row_idx(cnnz) = M.row_idx(i)-scol; BTF_C.val(cnnz) = M.val(i); cnnz++; } }//over all nnz in k if(BTF_B.nnz > 0) { BTF_B.col_ptr(k-scol+1) = bnnz; } BTF_C.col_ptr(k-scol+1) = cnnz; }//over all k #ifdef BASKER_DEBUG_ORDER_BTF printf("After BTF_B nnz: %d \n", bnnz); printf("After BTF_C nnz: %d \n", cnnz); #endif //printf("\n\n"); //printf("DEBUG\n"); //BTF_C.print(); //printf("\n\n"); return 0; }//end break_into_parts2 (based on imbalance)