// Self-test of the solve path: builds a right-hand side from a known
// all-ones solution, solves with it, and reports how far the computed
// solution is from the known one.
//
// Steps, as performed below:
//   1. x_known := 1 (length gn); x := 0 (length gn); y := 0 (length gm).
//   2. When Options.btf is BASKER_TRUE, y is built by calling spmv with
//      BTF_A / BTF_B / BTF_C and x_known.
//   3. y is scattered through gpermi (via x as scratch space).
//   4. serial_solve or serial_btf_solve is called with (y, x).
//   5. The mean absolute difference between x_known and x is printed, and
//      "TEST PASSED" is printed when it is below 1e-2.
//
// Returns 0 unconditionally; the outcome is only reported on stdout.
//
// NOTE(review): x_known, x and y are allocated here and never released in
// this function - confirm whether ENTRY_1DARRAY is self-managing in every
// build configuration.
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::test_solve()
{
  ENTRY_1DARRAY x_known;
  ENTRY_1DARRAY x;
  ENTRY_1DARRAY y;

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("test_solve called \n");
  printf("Global pivot permuation\n");
  printVec(gperm, gn);
  printf("\n");
  printf("Global pivot permutation inverse\n");
  printVec(gpermi, gn);
  printf("\n");
#endif

  BASKER_ASSERT(gn > 0, "solve testsolve gn");
  MALLOC_ENTRY_1DARRAY(x_known, gn);
  init_value(x_known, gn, (Entry) 1.0);

  // temp: explicitly (re)set the known solution to all ones.
  // An alternative test vector used previously was x_known(i) = i + 1.
  for(Int i = 0; i < gn; i++)
    {
      x_known(i) = (Entry) 1.0;
    }
  // JDB: used for other test
  // permute(x_known, order_csym_array, gn);

  MALLOC_ENTRY_1DARRAY(x, gn);
  init_value(x, gn, (Entry) 0.0);
  BASKER_ASSERT(gm > 0, "solve testsolve gm");
  MALLOC_ENTRY_1DARRAY(y, gm);
  init_value(y, gm, (Entry) 0.0);

  if(btf_nblks > 0)
    {
      sort_matrix(BTF_C);
    }

  // Build the right-hand side from the known solution.
  // NOTE(review): spmv is called several times on the same y, so it
  // presumably accumulates into y rather than overwriting it - confirm.
  if(Options.btf == BASKER_TRUE)
    {
      if(btf_tabs_offset != 0)
        {
          spmv(BTF_A, x_known, y);
          if(btf_nblks > 1)
            {
              spmv(BTF_B, x_known, y);
            }
        }
      if(btf_nblks > 1)
        {
          spmv(BTF_C, x_known, y);
        }
    }
  else
    {
      // NOTE(review): no spmv is performed on this path (the call
      // spmv(BTF_A, x_known, y) is disabled), so y stays all zero and
      // the comparison below cannot succeed for the non-BTF option.
    }

  // Pivot permutation: scatter y through gpermi using x as scratch,
  // then copy the result back into y and clear x for the solve.
  for(Int i = 0; i < gn; i++)
    {
      x(gpermi(i)) = y(i);
    }
  for(Int i = 0; i < gn; i++)
    {
      y(i) = x(i);
      x(i) = 0;
    }

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("\n\n");
  printf("\n\n");
  printf("RHS: \n");
  for(Int i = 0; i < gm; i++)
    {
      // Int may be wider than int, so print the index as long.
      printf("%ld %f,\n ", (long) i, y(i));
    }
  printf("\n\n");
#endif

  if(Options.btf == BASKER_FALSE)
    {
      if(btf_tabs_offset != 0)
        {
          serial_solve(y, x);
        }
    }
  else
    {
      // A\y -> x
      serial_btf_solve(y, x);
    }

  // Accumulate |x_known(i) - x(i)|.  A signed sum would let errors of
  // opposite sign cancel and report a wrong solution as correct.
  // Because x_known is all ones, sum|x_known - x| / gn is also the
  // relative 1-norm error that the message below advertises.
  Entry diff = 0.0;
  for(Int i = 0; i < gn; i++)
    {
      const Entry err = x_known(i) - x(i);
      diff += (err < (Entry) 0.0) ? -err : err;
    }
  diff = diff/(Entry) gn;

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("\n\n");
  printf("Solve Compare: \n");
  for(Int i = 0; i < gn; i++)
    {
      printf("%ld %f %f \n", (long) i, x_known(i), x(i));
    }
  printf("\n\n");
#endif

  printf("\n Test Points \n");
  printf("i: %d x: %f %f \n", 0, x_known(0), x(0));
  if(gn > 24)
    {
      printf("i: %d x: %f %f \n", 10, x_known(10), x(10));
      printf("i: %d x: %f %f \n", 24, x_known(24), x(24));
    }
  printf("\n");
  printf("TEST_SOLVE: ||x-x||/||x| = %e", diff);
  printf("\n");

  // diff is non-negative now, so only the upper bound needs checking.
  if(diff < 1e-2)
    {
      printf("TEST PASSED \n");
    }

  return 0;
}//end test_solve
// Solve entry point taking caller-provided solution and right-hand-side
// arrays.
//
// Dispatches on Options.btf:
//   - BASKER_FALSE: calls serial_solve(y, x), but only when
//     btf_tabs_offset != 0; otherwise nothing is solved and x is left
//     untouched.
//   - otherwise:    calls serial_btf_solve(y, x).
//
// x  Solution (len = gn)
// y  RHS      (len = gn)
//
// Returns 0 unconditionally.
//
// When BASKER_DEBUG_SOLVE_RHS is defined, x and y are dumped to stdout
// before and after the solve; without it the function prints nothing.
BASKER_INLINE
int Basker<Int,Entry,Exe_Space>::solve_interface
(
 ENTRY_1DARRAY x, //Solution (len = gn)
 ENTRY_1DARRAY y  //RHS      (len = gn)
)
{
#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("\n\n");
  printf("X: \n");
  for(Int i = 0; i < gn; i++)
    {
      printf("%f, " , x(i));
    }
  printf("\n\n");
  printf("RHS: \n");
  for(Int i =0; i < gm; i++)
    {
      printf("%f, ", y(i));
    }
  printf("\n\n");
#endif

  if(Options.btf == BASKER_FALSE)
    {
      if(btf_tabs_offset != 0)
        {
          serial_solve(y,x);
#ifdef BASKER_DEBUG_SOLVE_RHS
          // Trace output belongs to debug builds only; the solve path
          // must stay silent otherwise, as the BTF branch already is.
          printf("After serial solve\n");
#endif
        }
    }
  else
    {
      // A\y -> x
      serial_btf_solve(y,x);
    }

#ifdef BASKER_DEBUG_SOLVE_RHS
  printf("\n\n");
  printf("X: \n");
  for(Int i = 0; i < gn; i++)
    {
      printf("%f, " , x(i));
    }
  printf("\n\n");
  printf("RHS: \n");
  for(Int i =0; i < gm; i++)
    {
      printf("%f, ", y(i));
    }
  printf("\n\n");
#endif

  return 0;
}
/* Solves a tridiagonal matrix in parallel. Each thread solves a subsection of * the problem, all the threads communicate a reduced matrix based on the * LU decomp values at the boundaries between sub problems, solving this reduced * system that only scales with the number of threads produces a correction to * the inital solution of the sub problem. * Based on this paper: http://www.mcs.anl.gov/~zippy/publications/partrid/partrid.html */ void parallel_solve( const double* const a, const double* const b, const double* const c, const double* const r, double* s, int size, int comm_rank, int comm_size ) { //TODO try to figure out a more inplace version of this algorithm. //TODO investigate why example in the paper doesn't work. double* w = calloc(size, sizeof(double)); double* y = calloc(size, sizeof(double)); double* xR = calloc(size, sizeof(double)); double* xLH = calloc(size, sizeof(double)); double* wUH = calloc(size, sizeof(double)); double* xUH = calloc(size, sizeof(double)); w[0] = c[0] / b[0]; for (int i = 1; i < size; ++i) w[i] = c[i] / (b[i] - a[i] * w[i - 1]); y[0] = r[0] / b[0]; for (int i = 1; i < size; ++i) y[i] = (r[i] - a[i] * y[i - 1]) / (b[i] - a[i] * w[i - 1]); xR[size-1] = y[size-1]; for (int i = size - 2; i >= 0; --i) xR[i] = y[i] - w[i] * xR[i+1]; xLH[size-1] = -w[size-1]; for (int i = size - 2; i >= 0; --i) xLH[i] = -w[i] * xLH[i+1]; wUH[size - 1] = a[size-1] / b[size-1]; for (int i = size - 2; i >= 0; --i) wUH[i] = a[i] / (b[i] - c[i] * wUH[i+1]); xUH[0] = -wUH[0]; for (int i = 1; i < size; ++i) xUH[i] = -wUH[i] * xUH[i-1]; //Setup the reduced global system //Should really by 2 * comm_size - 2, but the MPI_send would be more complicated. 
int reduced_size = 2 * comm_size; double* reducedA = calloc(reduced_size, sizeof(double)); double* reducedB = calloc(reduced_size, sizeof(double)); double* reducedC = calloc(reduced_size, sizeof(double)); double* reducedR = calloc(reduced_size, sizeof(double)); double tempA[] = {-1, xUH[size - 1]}; double tempB[] = {xUH[0], xLH[size - 1]}; double tempC[] = {xLH[0], -1}; double tempR[] = {-xR[0], -xR[size - 1]}; //Each thread builds a picture of the reduced system. MPI_Allgather(tempA, 2, MPI_DOUBLE, reducedA, 2, MPI_DOUBLE, MPI_COMM_WORLD); MPI_Allgather(tempB, 2, MPI_DOUBLE, reducedB, 2, MPI_DOUBLE, MPI_COMM_WORLD); MPI_Allgather(tempC, 2, MPI_DOUBLE, reducedC, 2, MPI_DOUBLE, MPI_COMM_WORLD); MPI_Allgather(tempR, 2, MPI_DOUBLE, reducedR, 2, MPI_DOUBLE, MPI_COMM_WORLD); //Solve the reduced system ignoring the boundary values at each end since they're not coupled with further threads. double* reduced_solution = serial_solve(reducedA + 1, reducedB + 1, reducedC + 1, reducedR + 1, 2 * comm_size - 2); //Not the highest rank double coLH = (comm_rank != comm_size - 1) ? reduced_solution[comm_rank * 2] : 0; //Not the lowest rank double coUH = (comm_rank != 0) ? reduced_solution[comm_rank * 2 - 1] : 0; //Correct initial solution with values from the reduced global system. for (int i = 0; i < size; i++) { s[i] = xR[i] + coLH*xLH[i] + coUH*xUH[i]; } }