// Self-check of the solve phase.
//
// Builds a known solution x_known (all ones), forms a right-hand side y by
// multiplying the BTF pieces of the matrix with x_known, applies the global
// pivot permutation to y, solves for x and reports the mean absolute error
// between x and x_known.  Because x_known is all ones, that mean equals
// ||x - x_known||_1 / ||x_known||_1.
//
// Prints "TEST PASSED" when the error is below 1e-2.
// Always returns 0, regardless of the outcome of the comparison.
BASKER_INLINE
  int Basker<Int,Entry,Exe_Space>::test_solve()
  {
    ENTRY_1DARRAY  x_known;
    ENTRY_1DARRAY  x;
    ENTRY_1DARRAY  y;

    #ifdef BASKER_DEBUG_SOLVE_RHS
    printf("test_solve called \n");
    printf("Global pivot permuation\n");
    printVec(gperm, gn);
    printf("\n");
    printf("Global pivot permutation inverse\n");
    printVec(gpermi, gn);
    printf("\n");
    #endif

    //Known solution: every entry is one
    BASKER_ASSERT(gn > 0, "solve testsolve gn");
    MALLOC_ENTRY_1DARRAY(x_known, gn);
    init_value(x_known, gn , (Entry)1.0);

    //Explicit fill kept so a different known solution
    //(e.g. (Entry)(i+1)) can be swapped in for experiments
    for(Int i = 0; i < gn; i++)
      {
        x_known(i) = (Entry) 1.0;
      }
    //JDB: used for other test
    //permute(x_known, order_csym_array, gn);

    //Solution (len = gn) and right-hand side (len = gm), both zeroed
    MALLOC_ENTRY_1DARRAY(x, gn);
    init_value(x, gn, (Entry) 0.0);
    BASKER_ASSERT(gm > 0, "solve testsolve gm");
    MALLOC_ENTRY_1DARRAY(y, gm);
    init_value(y, gm, (Entry) 0.0);

    if(btf_nblks > 0)
      {
        sort_matrix(BTF_C);
      }

    //Form the right-hand side y from x_known using the BTF pieces
    if(Options.btf == BASKER_TRUE)
      {
        if(btf_tabs_offset != 0)
          {
            spmv(BTF_A, x_known, y);
            if(btf_nblks > 1)
              {
                spmv(BTF_B, x_known, y);
              }
          }
        if(btf_nblks > 1)
          {
            spmv(BTF_C, x_known, y);
          }
      }
    else
      {
        //NOTE(review): no product is formed on this path, so y stays
        //zero and the comparison below cannot succeed -- confirm intent
        //spmv(BTF_A, x_known,y);
      }

    //Apply the pivot permutation to y, using x as scratch space,
    //then copy the result back into y and re-zero x.
    //NOTE(review): both loops run to gn although y was allocated with
    //gm entries -- assumes gm == gn, TODO confirm
    for(Int i = 0; i < gn; i++)
      {
        x(gpermi(i)) = y(i);
      }
    for(Int i = 0; i < gn; i++)
      {
        y(i) = x(i);
        x(i) = 0;
      }

    #ifdef BASKER_DEBUG_SOLVE_RHS
    printf("\n\n");
    printf("\n\n");
    printf("RHS: \n");
    for(Int i =0; i < gm; i++)
      {
        //Int may be wider than int, so cast to match the conversion
        printf("%ld %f,\n ", (long) i, y(i));
      }
    printf("\n\n");
    #endif

    //Solve with right-hand side y, solution written to x
    if(Options.btf == BASKER_FALSE)
      {
        if(btf_tabs_offset != 0)
          {
            serial_solve(y,x);
          }
      }
    else
      {
        serial_btf_solve(y,x);
      }

    //Mean absolute error.  Absolute values are accumulated so that
    //errors of opposite sign cannot cancel and hide a wrong solution.
    Entry diff = (Entry) 0.0;
    for(Int i = 0; i < gn; i++)
      {
        const Entry err = x_known(i) - x(i);
        diff += (err < (Entry) 0.0) ? -err : err;
      }
    diff = diff/(Entry) gn;

    #ifdef BASKER_DEBUG_SOLVE_RHS
    printf("\n\n");
    printf("Solve Compare: \n");
    for(Int i = 0; i < gn; i++)
      {
        printf("%ld %f %f \n",
               (long) i, x_known(i), x(i));
      }
    printf("\n\n");
    #endif

    //Spot check a few entries; indices 10 and 24 only exist for gn > 24
    printf("\n Test Points \n");
    printf("i: %d x: %f %f \n", 0, x_known(0), x(0));
    if(gn > 24)
      {
        printf("i: %d x: %f %f \n", 10, x_known(10), x(10));
        printf("i: %d x: %f %f \n", 24, x_known(24), x(24));
      }
    printf("\n");
    printf("TEST_SOLVE: ||x-x||/||x| = %e", diff);
    printf("\n");

    //diff is non-negative by construction; a NaN fails this comparison
    if(diff < 1e-2)
      {
        printf("TEST PASSED \n");
      }

    return 0;
  }//end test_solve
  // Solve entry point: computes x from the right-hand side y.
  //
  // Dispatches on Options.btf:
  //   - BASKER_FALSE: calls serial_solve(y,x), but only when
  //     btf_tabs_offset != 0; otherwise x is left untouched.
  //   - otherwise:    calls serial_btf_solve(y,x).
  //
  // With BASKER_DEBUG_SOLVE_RHS defined, x (gn entries) and y (gm entries)
  // are printed before and after the solve.  Without it the function
  // prints nothing.
  //
  // Always returns 0.
  BASKER_INLINE
  int Basker<Int,Entry,Exe_Space>::solve_interface
  (
   ENTRY_1DARRAY x,//Solution (len = gn)
   ENTRY_1DARRAY y //RHS      (len = gn)
   )
  {
    #ifdef BASKER_DEBUG_SOLVE_RHS
    printf("\n\n");
    printf("X: \n");
    for(Int i = 0; i < gn; i++)
      {
        printf("%f, " , x(i));
      }
    printf("\n\n");
    printf("RHS: \n");
    for(Int i =0; i < gm; i++)
      {
        printf("%f, ", y(i));
      }
    printf("\n\n");
    #endif

    if(Options.btf == BASKER_FALSE)
      {
        if(btf_tabs_offset != 0)
          {
            serial_solve(y,x);

            //Progress message is diagnostic output only; keep it out
            //of regular solves
            #ifdef BASKER_DEBUG_SOLVE_RHS
            printf("After serial solve\n");
            #endif
          }
      }
    else
      {
        serial_btf_solve(y,x);
      }

    #ifdef BASKER_DEBUG_SOLVE_RHS
    printf("\n\n");
    printf("X: \n");
    for(Int i = 0; i < gn; i++)
      {
        printf("%f, " , x(i));
      }
    printf("\n\n");
    printf("RHS: \n");
    for(Int i =0; i < gm; i++)
      {
        printf("%f, ", y(i));
      }
    printf("\n\n");
    #endif

    return 0;
  }
/* Solves a tridiagonal matrix in parallel. Each rank solves a subsection of
 * the problem, all the ranks communicate a reduced matrix based on the
 * LU decomposition values at the boundaries between sub problems, and solving
 * this reduced system (which only scales with the number of ranks) produces a
 * correction to the initial solution of the sub problem.
 * Based on this paper: http://www.mcs.anl.gov/~zippy/publications/partrid/partrid.html
 *
 * a, b, c    sub-, main- and super-diagonal of the local rows (size entries each)
 * r          local right-hand side (size entries)
 * s          output: corrected local solution (size entries)
 * size       number of local rows; must be at least 1
 * comm_rank  rank of the caller in MPI_COMM_WORLD
 * comm_size  number of ranks in MPI_COMM_WORLD
 *
 * Collective: every rank of MPI_COMM_WORLD must call this function.
 * If the workspace cannot be allocated the job is aborted with MPI_Abort,
 * because returning early from a collective would leave the other ranks
 * blocked in MPI_Allgather.
*/
void parallel_solve(
    const double* const a,
    const double* const b,
    const double* const c,
    const double* const r,
    double* s,
    int size,
    int comm_rank,
    int comm_size
) {
    //TODO try to figure out a more inplace version of this algorithm.
    //TODO investigate why example in the paper doesn't work.

    //Setup sizes for the reduced global system
    //Should really be 2 * comm_size - 2, but the MPI_send would be more complicated.
    const int reduced_size = 2 * comm_size;

    const size_t n         = (size_t) size;
    const size_t reduced_n = (size_t) reduced_size;

    //One zero-initialised workspace for all temporaries, so there is a
    //single allocation to check and a single block to release.
    double* work = calloc(6 * n + 4 * reduced_n, sizeof(double));
    if (work == NULL) {
        MPI_Abort(MPI_COMM_WORLD, 1);
        return;
    }

    double* w   = work;
    double* y   = w   + n;
    double* xR  = y   + n;
    double* xLH = xR  + n;
    double* wUH = xLH + n;
    double* xUH = wUH + n;

    double* reducedA = xUH      + n;
    double* reducedB = reducedA + reduced_n;
    double* reducedC = reducedB + reduced_n;
    double* reducedR = reducedC + reduced_n;

    //Forward sweep: w and y share the same pivot, so compute it once.
    w[0] = c[0] / b[0];
    y[0] = r[0] / b[0];
    for (int i = 1; i < size; ++i) {
        const double pivot = b[i] - a[i] * w[i - 1];
        w[i] = c[i] / pivot;
        y[i] = (r[i] - a[i] * y[i - 1]) / pivot;
    }

    //Backward sweep: xR is the local solution for the local right-hand side,
    //xLH is the same back substitution driven only by the last row's w.
    xR[size - 1]  = y[size - 1];
    xLH[size - 1] = -w[size - 1];
    for (int i = size - 2; i >= 0; --i) {
        xR[i]  = y[i] - w[i] * xR[i + 1];
        xLH[i] = -w[i] * xLH[i + 1];
    }

    //Mirror-image elimination from the bottom up, followed by a top-down
    //substitution driven only by the first row's wUH.
    wUH[size - 1] = a[size - 1] / b[size - 1];
    for (int i = size - 2; i >= 0; --i) wUH[i] = a[i] / (b[i] - c[i] * wUH[i + 1]);

    xUH[0] = -wUH[0];
    for (int i = 1; i < size; ++i) xUH[i] = -wUH[i] * xUH[i - 1];

    //This rank contributes two rows (first and last local row) to the reduced system.
    double tempA[] = {-1,     xUH[size - 1]};
    double tempB[] = {xUH[0], xLH[size - 1]};
    double tempC[] = {xLH[0], -1};
    double tempR[] = {-xR[0], -xR[size - 1]};

    //Each rank builds a picture of the reduced system.
    MPI_Allgather(tempA, 2, MPI_DOUBLE, reducedA, 2, MPI_DOUBLE, MPI_COMM_WORLD);
    MPI_Allgather(tempB, 2, MPI_DOUBLE, reducedB, 2, MPI_DOUBLE, MPI_COMM_WORLD);
    MPI_Allgather(tempC, 2, MPI_DOUBLE, reducedC, 2, MPI_DOUBLE, MPI_COMM_WORLD);
    MPI_Allgather(tempR, 2, MPI_DOUBLE, reducedR, 2, MPI_DOUBLE, MPI_COMM_WORLD);

    //Solve the reduced system ignoring the boundary values at each end since they're not coupled with further ranks.
    double* reduced_solution = serial_solve(reducedA + 1, reducedB + 1, reducedC + 1, reducedR + 1, 2 * comm_size - 2);

    //Not the highest rank
    double coLH = (comm_rank != comm_size - 1) ? reduced_solution[comm_rank * 2] : 0;

    //Not the lowest rank
    double coUH = (comm_rank != 0) ? reduced_solution[comm_rank * 2 - 1] : 0;

    //Correct initial solution with values from the reduced global system.
    for (int i = 0; i < size; i++) {
        s[i] = xR[i] + coLH * xLH[i] + coUH * xUH[i];
    }

    //NOTE(review): reduced_solution is presumably heap-allocated by
    //serial_solve; if the caller owns it, it must be freed here too -- confirm.
    free(work);
}