Ejemplo n.º 1
0
int main() {
  /* Time Recording Variables */
  int now_s, now_u;
  struct rusage usage;
  double time_now, time_past;
  
  /* x, y, z and global indices */
  int i, j, k, m, n, nodes;
  /* Connectivity enties */
  int m_above, m_below, m_left, m_right, m_front, m_back;
  double c_above, c_below, c_left, c_right, c_front, c_back, c_self;
  
  /* Creat cholmod object */
  cholmod_common Common, *cc;
  /* Compress column form sparse matrix */
  cholmod_sparse *S, *ST;
  /* forcing function on input; voltages after solving */
  cholmod_dense *b, *v0, *r;  
  size_t *Si, *Sp, *Snz, *bi, *bj;
  double *Sx, *bx, *v0x;
 
  /* store the sparse matrix */
  FILE *store;
  char filename[100];
  
  /* start CHOLMOD */
  cc = &Common;
  cholmod_l_start(cc);
  
  /* initialize timing variables*/ 
  getrusage(0, &usage);
  now_s = usage.ru_utime.tv_sec;
  now_u = usage.ru_utime.tv_usec;
  time_past = now_s + now_u  / 1.e6;
  time_now = time_past;
  
  /* total nodes in the grid */
  nodes = IMAX * JMAX * KMAX;
  
  /* Allocate space for connectivity matrix and forcing vector. */
  S = cholmod_l_allocate_sparse(nodes, nodes, 7 * nodes, 
				0, 0, 0, CHOLMOD_REAL, cc);

  b = cholmod_l_allocate_dense(nodes, 1, nodes, CHOLMOD_REAL, cc);
  bx = b->x;

  v0 = cholmod_l_allocate_dense(nodes, 1, nodes, CHOLMOD_REAL, cc);
  v0x = v0->x;



  /*================================================================*/
  /*===============  make connectivity matrix ======================*/
  /*================================================================*/
  Si = (size_t *) S->i;
  Sp = (size_t *) S->p;
  Sx = (double *) S->x;
  Snz = (size_t *) S->nz;
  n = 0;
  Sp[0] = 0;
  for (k = 0; k < KMAX; k++) {
    for (j = 0; j < JMAX; j++) {
      for (i = 0; i < IMAX; i++) {
	/* Global index in x-fastest order*/
	m = k * IMAX * JMAX + j * IMAX + i;
	Sp[m + 1] = Sp[m];
	v0x[m] = (k + 1.0) / (KMAX + 1.0);

	/* Six coef.s of every node */
	c_below = (mepr(i - 0.5, j - 0.5, k - 0.5) + 
		   mepr(i + 0.5, j - 0.5, k - 0.5) + 
		   mepr(i - 0.5, j + 0.5, k - 0.5) +
		   mepr(i + 0.5, j + 0.5, k - 0.5)) / 4.0;

	c_front = (mepr(i - 0.5, j - 0.5, k - 0.5) + 
		   mepr(i + 0.5, j - 0.5, k - 0.5) + 
		   mepr(i - 0.5, j - 0.5, k + 0.5) +
		   mepr(i + 0.5, j - 0.5, k + 0.5)) / 4.0;

	c_left = (mepr(i - 0.5, j + 0.5, k - 0.5) + 
		  mepr(i - 0.5, j - 0.5, k - 0.5) + 
		  mepr(i - 0.5, j + 0.5, k + 0.5) +
		  mepr(i - 0.5, j - 0.5, k + 0.5)) / 4.0;

	c_right = (mepr(i + 0.5, j + 0.5, k - 0.5) + 
		   mepr(i + 0.5, j - 0.5, k - 0.5) + 
		   mepr(i + 0.5, j + 0.5, k + 0.5) +
		   mepr(i + 0.5, j - 0.5, k + 0.5)) / 4.0;

	c_back = (mepr(i - 0.5, j + 0.5, k - 0.5) + 
		  mepr(i + 0.5, j + 0.5, k - 0.5) + 
		  mepr(i - 0.5, j + 0.5, k + 0.5) +
		  mepr(i + 0.5, j + 0.5, k + 0.5)) / 4.0;

	c_above = (mepr(i - 0.5, j - 0.5, k + 0.5) + 
		   mepr(i + 0.5, j - 0.5, k + 0.5) + 
		   mepr(i - 0.5, j + 0.5, k + 0.5) +
		   mepr(i + 0.5, j + 0.5, k + 0.5))/4.0;

	/* Self term. */
	c_self = -(c_above + c_below + c_left + c_right + c_front + c_back);
	Si[n] = m; Sx[n] = c_self; n++; Sp[m + 1]++;

	/* Node below. Ensure not on bottom face */
	if (k != 0) {
	  m_below = m - IMAX * JMAX;
 	  Si[n] = m_below; Sx[n] = c_below; n++; Sp[m + 1]++;
	} else {
	  bx[m] = -c_below * VBOT;
	}

	/* Node front.  Ensure not on front face. */
	if (j != 0) {
	  m_front = m - IMAX;
	} else {
	  m_front = m + (JMAX - 1) * IMAX;
	}
	Si[n] = m_front; Sx[n] = c_front; n++; Sp[m + 1]++;

	/* Node to left.  Ensure not on left face. */
	if (i != 0) {
	  m_left = m - 1;
	} else {
	  m_left = m + IMAX -1;
	}
	Si[n] = m_left; Sx[n] = c_left; n++; Sp[m + 1]++;	   

	/* Node to right.  Ensure not on right face. */
	if (i != IMAX - 1) {
	  m_right = m + 1;
	} else{
	  m_right = m - IMAX + 1;
	}
	Si[n] = m_right; Sx[n] = c_right; n++; Sp[m + 1]++;

	/* Node back.  Ensure not on back face. */
	if (j != JMAX - 1) {
	  m_back = m + IMAX;
	} else {
	  m_back = m - (JMAX - 1) * IMAX;
	}
	Si[n] = m_back; Sx[n] = c_back; n++; Sp[m + 1]++;

	/* Node top. Ensure not on top face */
	if (k != KMAX - 1) {
	  m_above = m + IMAX * JMAX;
 	  Si[n] = m_above; Sx[n] = c_above; n++; Sp[m + 1]++;
	} else {
	  bx[m] = -c_above * VTOP;
	}

	Snz[m] = Sp[m + 1] - Sp[m];
      }
    }
  }
  /*====================================================================*/
  /*===================Done creating connectivity matrix.===============*/
  /*====================================================================*/

  /* report time*/
  getrusage(0, &usage);
  now_s = usage.ru_utime.tv_sec;
  now_u = usage.ru_utime.tv_usec;
  time_now = now_s + now_u / 1.e6;
  printf("\nFinished creating connectivity matrix\n"
	 "  Incremental time %f\n"
	 "  Running time %f\n", time_now - time_past, time_now);
  time_past = time_now;

  /* Print all three matrixes */
  cholmod_l_print_sparse(S, "S", cc);
  cholmod_l_print_dense(b, "b", cc);
  cholmod_l_print_dense(v0, "v0", cc);

  /* Allocate residual vector */
  r = cholmod_l_zeros(nodes, 1, CHOLMOD_REAL, cc);

  /* The Preconditioned Conjugate Gradient Method */
  /* Calculate the first residual value */
  double one[2], zero[2], minusone[2];
  zero[0] = 0.0;
  zero[1] = 0.0;
  one[0] = 1.0;
  one[1] = 0.0;
  minusone[0] = -1.0;
  minusone[1] = 0.0;

  cholmod_l_copy_dense2(b, r, cc);
  cholmod_l_sdmult(S, 0, minusone, one, v0, r, cc);
  printf("Initial 2-norm = %f\n", cholmod_l_norm_dense(r, 2, cc));
  printf("Initial 1-norm = %f\n", cholmod_l_norm_dense(r, 1, cc));
  printf("Initial 0-norm = %f\n", cholmod_l_norm_dense(r, 0, cc));
  

  /* The iteration */
  double rho1, rho0, beta = 0.0, alpha = 0.0;
  cholmod_dense *p1, *p0, *q;
  double *p1x, *p0x, *qx, *rx;
  /*
  p1 = cholmod_l_allocate_dense(nodes, 1, nodes, CHOLMOD_REAL, cc);
  p0 = cholmod_l_allocate_dense(nodes, 1, nodes, CHOLMOD_REAL, cc);
  q = cholmod_l_allocate_dense(nodes, 1, nodes, CHOLMOD_REAL, cc);  
  */
  p1 = cholmod_l_zeros(nodes, 1, CHOLMOD_REAL, cc);
  p0 = cholmod_l_zeros(nodes, 1, CHOLMOD_REAL, cc);
  q = cholmod_l_zeros(nodes, 1, CHOLMOD_REAL, cc);
  p1x = (double *) p1->x;
  p0x = (double *) p0->x;
  qx = (double *) q->x;
  rx = (double *) r->x;

  int iter;
  for (iter = 0; iter < MAX_ITER; iter++) {
    if (cholmod_l_norm_dense(r, 0, cc) < 1e-10)
      break;
    
    rho1 = cholmod_l_norm_dense(r, 2, cc);
    rho1 = rho1 * rho1;
    
    if (iter == 0) {
      cholmod_l_copy_dense2(r, p1, cc);
    } else {
      beta = rho1 / rho0;
      for (i = 0; i < nodes; i++) {
	p1x[i] = rx[i] + beta * p0x[i];
      }
    }
   
    cholmod_l_sdmult(S, 0, one, zero, p1, q, cc);
    alpha = 0;
    for (i = 0; i < nodes; i++) {
      alpha += p1x[i] * qx[i];
    }
    alpha = rho1 / alpha;

    for (i = 0; i < nodes; i++) {
      v0x[i] += alpha * p1x[i];
      rx[i] -= alpha * qx[i];
    }
    /*
    printf("iter = %d\n: rho1 = %f, rho0 = %f, alpha = %f, beta = %f\n", 
	   iter, rho1, rho0, alpha, beta);
    */
    cholmod_l_copy_dense2(p1, p0, cc);
    rho0 = rho1;
  }
  
  cholmod_l_copy_dense2(b, r, cc);
  cholmod_l_sdmult(S, 0, minusone, one, v0, r, cc);
  printf("After %d iterations:\n", iter);
  printf("Final 2-norm: %f\n", cholmod_l_norm_dense(r, 2, cc));
  printf("Final 1-norm: %f\n", cholmod_l_norm_dense(r, 1, cc));
  printf("Final 0-norm: %f\n", cholmod_l_norm_dense(r, 0, cc));
  /* sort sparse matrix and store it 
  cholmod_l_sort(S, cc);
  S->stype = 0;
  */

  /* Check if S is symmetric
  ST = cholmod_l_transpose(S, 2, cc);
  store = fopen("LpT.dat", "w");
  cholmod_l_write_sparse(store, ST, NULL, NULL, cc);
  fclose(store);
  cholmod_l_free_sparse(&ST, cc);
  */
  
  /* store the sparse matrix   */
  /*
  store = fopen("Laplace.dat", "w");
  cholmod_l_write_sparse(store, S, NULL, NULL, cc);
  fclose(store);
  cholmod_l_free_sparse(&S, cc);
  */

  /* store the force vector  */
  /*
  store = fopen("Force.dat", "w");
  cholmod_l_write_dense(store, b, NULL, cc);
  fclose(store);
  cholmod_l_free_dense(&b,cc);
  */

  /* store the guess vector   */
  store = fopen("Voltage.dat", "w");
  cholmod_l_write_dense(store, v0, NULL, cc);
  fclose(store);

  cholmod_l_free_dense(&v0, cc);

  /* store the residual vector  */
  store = fopen("Residual.dat", "w");
  cholmod_l_write_dense(store, r, NULL, cc);
  fclose(store);

  cholmod_l_free_dense(&r, cc);

  /* report time*/
  getrusage(0, &usage);
  now_s = usage.ru_utime.tv_sec;
  now_u = usage.ru_utime.tv_usec;
  time_now = now_s + now_u / 1.e6;
  printf("\nFinished writing matrix\n"
	 "  Incremental time %f\n"
	 "  Running time %f\n", time_now - time_past, time_now);

  cholmod_l_finish(cc);
  return 0;
}
Ejemplo n.º 2
0
int main(int argc, char* argv[])
{
	const int bufsize = 512;
    	char buffer[bufsize];
	int m,n,S;
	double time_st,time_end,time_avg;
	//omp_set_num_threads(2);
//	printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads());
	if(argc!=2)
	{
		cout<<"Insufficient arguments"<<endl;
		return 1;
	}
	
	graph G;

	cerr<<"Start reading                    ";
//	time_st=dsecnd();
	G.create_graph(argv[1]);
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cout<<"Success              "<<endl;
//	cerr<<"Reading time                     "<<time_avg<<endl;

	cerr<<"Constructing Matrices            ";
//	time_st=dsecnd();
	G.construct_MNA();
	G.construct_NA();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;

//	G.construct_sparse_MNA();
	m=G.node_array.size()-1;
	n=G.voltage_edge_id.size();
	
	cout<<endl;
	cout<<"MATRIX STAT:"<<endl;
	cout<<"Nonzero elements:               "<<G.nonzero<<endl;
	cout<<"Number of Rows:                 "<<m+n<<endl;
	cout<<"Nonzero in G:			"<<G.Gnonzero<<endl;
	cout<<"Number of rows in G:		"<<m<<endl;
	cout<<"Nonzero in P: 			"<<G.Pnonzero<<endl;
	cout<<"Number of rows in P:		"<<m<<endl;


//	printf("\n Nonzero = %d", G.nonzero);
//	printf("\n Rows = %d", m+n);

	cout<<"MAT val:		       "<<endl;
	int i,j;

	G.Mat_val[0] += 100;
	G.Gmat[0] +=100;
/*
	for(i=0;i<G.Gnonzero;i++)
		cout<<" "<<G.Gmat[i];
	cout<<endl;
	for(i=0;i<G.Gnonzero;i++)
		cout<<" "<<G.Gcolumns[i];
	cout<<endl;	
	for(i=0;i<m+1;i++)
		cout<<" "<<G.GrowIndex[i];
	cout<<endl;
	for(i=0;i<m;i++)
		printf(" %.8f", G.b1[i]);
	cout<<endl;
	for(i=0;i<m;i++)
		printf(" %.8f", G.x1[i]);
	cout<<endl;	
	
	
*/	SuiteSparse_long *Gnz = (SuiteSparse_long*)calloc(m,sizeof(SuiteSparse_long));
	for(i=0;i<m;i++)
	{
	//	cout<<endl;
		SuiteSparse_long startindex=G.GrowIndex[i];
		SuiteSparse_long endindex=G.GrowIndex[i+1];
		Gnz[i] = endindex - startindex;

//		for(j=startindex;j<endindex;j++)
//			cout<<" "<<G.Gmat[j];
//		cout<<endl;
		
	}


/*	for(i=0;i<G.Pnonzero;i++)
		cout<<" "<<G.Pmat[i];
	cout<<endl;
	for(i=0;i<G.Pnonzero;i++)
		cout<<" "<<G.Pcolumns[i];
	cout<<endl;	
	for(i=0;i<m+1;i++)
		cout<<" "<<G.ProwIndex[i];
	cout<<endl;
/*	for(i=0;i<m;i++)
		printf(" %.8f", G.b1[i]);
	cout<<endl;
	for(i=0;i<m;i++)
		printf(" %.8f", G.x1[i]);
	cout<<endl;	
	
	
	for(i=0;i<m;i++)
	{
		cout<<endl;
		int startindex=G.ProwIndex[i];
		int endindex=G.ProwIndex[i+1];
		for(j=startindex;j<endindex;j++)
			cout<<" "<<G.Pmat[j];
		cout<<endl;
		
	}
	
/*	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.Mat_val[i];
	cout<<endl;
	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.columns[i];
	cout<<endl;	
	for(i=0;i<m+n+1;i++)
		cout<<" "<<G.rowIndex[i];
	cout<<endl;
	for(i=0;i<m+n;i++)
		printf(" %.8f", G.b[i]);
	cout<<endl;
	for(i=0;i<m+n;i++)
		printf(" %.8f", G.x[i]);
	cout<<endl;	
	
	
	for(i=0;i<m+n;i++)
	{
		cout<<endl;
		int startindex=G.rowIndex[i];
		int endindex=G.rowIndex[i+1];
		for(j=startindex;j<endindex;j++)
			cout<<" "<<G.Mat_val[j];
		cout<<endl;
		
	}
*/
/*	for (i=0;i<m+n+1;i++)
	{
		//cout<<endl;
		if(G.rowIndex[i]==G.rowIndex[i+1])
			break;
		
		for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			if(G.Mat_val[j]>10)
				cout<<G.Mat_val[j]<<"\t";
		}
		//cout<<endl;
		/*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			cout<<G.columns[j]<<"\t";
		}
		//cout<<endl;
	}
	cout<<endl;
*/

//printing the matrix
	printf("\n Fine till here");
	printf("\n");
//	int* rowmIndex=(int*)calloc(m+1,sizeof(int));
	printf("\n Fine till here");
	printf("\n");
	//int rowmIndex[5]={1,2,3,4,5};
/*	for(i=0;i<m+1;i++)
	{
		rowmIndex[i]=G.rowIndex[i];
		printf(" %d", rowmIndex[i]);
	}
*/
	printf("\n Allocating GPU memory\n");
	cudaDeviceReset();
	size_t free, total;
	cudaMemGetInfo(&free, &total);
	printf("\n Free Mem = %lf MB, Total mem = %lf MB\n", (double)(free)/(1024*1024), (double)(total)/(1024*1024));


	double *dev_csrValA, *dev_b, *dev_x;
	int *dev_csrRowIdxA, *dev_csrColA;

	double *dev_GcsrVal, *dev_b1, *dev_x1;
	double *dev_PcsrVal, *dev_b2, *dev_x2;
	
	int *dev_GcsrRowIdx, *dev_PcsrRowIdx, *dev_GcsrCol, *dev_PcsrCol;
	

	
	cudaMalloc((void**)&dev_PcsrVal, G.Pnonzero*sizeof(double));
	cudaMalloc((void**)&dev_PcsrRowIdx, (m+1)*sizeof(int));
	cudaMalloc((void**)&dev_PcsrCol, G.Pnonzero*sizeof(int));


	cudaMalloc((void**)&dev_b1, (m)*sizeof(double));
	cudaMalloc((void**)&dev_b2, n*sizeof(double));
	cudaMalloc((void**)&dev_x1, m*sizeof(double));
	cudaMalloc((void**)&dev_x2, n*sizeof(double));

	cudaMemcpy(dev_b1, G.b1, (m)*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_x1, G.x1, (m)*sizeof(double), cudaMemcpyHostToDevice);

	cudaMemcpy(dev_PcsrVal, G.Pmat, G.Pnonzero*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b2, G.b2, (n)*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_x2, G.x2, (n)*sizeof(double), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_PcsrRowIdx, G.ProwIndex, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_PcsrCol, G.Pcolumns, (G.Pnonzero)*sizeof(int), cudaMemcpyHostToDevice);


	/* Matrix has been created and stored in CSR format.
	However, CHOLMOD requires CSC format. Since our matrix is symmetric positive definite, we can simply swap 
	csrColA with csrRowIdx and vice versa
	*/

	/* Starting the CHOLMOD routine now*/
	printf("\n Initiating CHOLMOD\n");
	cholmod_sparse *A, *P;
	cholmod_dense *x, *b, *r, *midvec;
	cholmod_factor *L;
	cholmod_common *Common, cm;
	Common = &cm;
	cholmod_l_start(Common);
//	&Common->useGPU=1;
	printf("\n m = %d, G.Gnonzero = %d\n", m, G.Gnonzero);	


	
	cholmod_sparse *C = cholmod_l_allocate_sparse((size_t)(m), (size_t)(m), (size_t)(G.Gnonzero), 1, 0, 1, 1, Common); 
//	P = cholmod_l_allocate_sparse((size_t)(m), (size_t)(n), (size_t)(G.Pnonzero), 1, 0, 0, 1, Common); 
//	printf("\n Allocated \n");

	C->itype = CHOLMOD_LONG;	
//	printf("\n Itype \n");
	C->p = &G.GrowIndex[0];
//	printf("\n Columns \n");
	C->nz = &Gnz[0];
//	printf("\n Rows \n");
	C->i = &G.Gcolumns[0];
	C->dtype = 0;
	C->x = &G.Gmat[0];
	 
/*	P->itype = CHOLMOD_LONG;
	P->p = &G.ProwIndex[0];
	P->nz = &Pnz[0];
	P->i = &G.Pcolumns[0];
	P->dtype = 0;
	P->x = &G.Pmat[0]; 
*/	 

	b = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common);
	b->dtype=0;
	b->x = &G.b1[0];
	b->xtype = 1;

	printf("\n CHOLMOD manually set\n");
	cholmod_l_print_sparse(C, "A", Common);
	cholmod_l_print_dense(b, "b", Common);


	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start, 0);

	L = cholmod_l_analyze(C, Common);
	printf("\n Analysis: Flops: %g \t lnz: %g\n", Common->fl, Common->lnz);
	cholmod_l_factorize(C, L, Common);
	x = cholmod_l_solve(CHOLMOD_A, L, b, Common);
	
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	
	

	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, start, stop);
	printf("\n Time : %.6f secs :\n", elapsedTime);
	
	cholmod_l_print_dense(x, "X", Common);
	
	double *x1_mod = (double*)x->x;
	
	cudaMemcpy(dev_x1, x1_mod, m*sizeof(double), cudaMemcpyHostToDevice);
	
	cusparseStatus_t cuSparseStatus;
	cusparseHandle_t cuspHandle;
	cuSparseStatus = cusparseCreate(&cuspHandle);

	cusparseMatDescr_t descrP;
	cusparseCreateMatDescr(&descrP);

	cusparseSetMatType(descrP, CUSPARSE_MATRIX_TYPE_GENERAL);	
	cusparseSetMatIndexBase(descrP, CUSPARSE_INDEX_BASE_ZERO);
	
	
	double *dev_res1, *dev_simple;
	double *res1 = (double*)calloc(n,sizeof(double));
	cudaMalloc((void**)&dev_res1, n*sizeof(double));
	cudaMalloc((void**)&dev_simple, n*sizeof(double));
	
	const double alpha = 1.0, beta=0.0;
	//alpha = 1.0;
	//beta = 0.0;
	
	//solving P^T * G^-1 * b1 Result stored in dev_res1
	
	cuSparseStatus = cusparseDcsrmv(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_x1, &beta, dev_res1);
		
	if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS)
	{
/*		cudaMemcpy(res1, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost);
		for(i=0;i<n;i++)
		{
			printf("\nres1[%d] = %.8f", i, res1[i]);
		}
		printf("\n P^T * G^-1 * b1 done! Vector stored in res1");
*/	}
	else
	{
		printf("\n P^T * G^-1 * b1 failed\n");
		exit(1);
	}
	
	const double alphaneg = -1.0;
		
		//Solving P^T * G^-1 * b1 - b2 ; Result stored in dev_res1
	
		
	cublasStatus_t cuBlasStatus;
	cublasHandle_t cubHandle;
	cuBlasStatus = cublasCreate(&cubHandle);
		
	cuBlasStatus = cublasDaxpy(cubHandle, n, &alphaneg, dev_b2, 1, dev_res1, 1);
	if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
	{
//		cudaMemcpy(res1, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost);
//		for(i=0;i<n;i++)
//		{
//			printf("\nres1[%d] = %.8f", i, res1[i]);
//		}
		printf("\n res1 = res1 - b2 done\n");
		 
	}
	else
	{
		printf("\n res1 = res1 - b2 failed\n");
	}
		
	
	
	
	///NOW COMPUTING G^-1 * P
	
	
	int k = 0;
	int breakloop=0;
	
	double **midMat = (double**)malloc(m*sizeof(double*));
	for(i=0;i<m;i++)
	{
		midMat[i] = (double*)calloc(n,sizeof(double));
	}
	
	cudaEventRecord(start, 0);
	
	for(i=0;i<n;i++)
	{
		breakloop = 0;
		double *vect = (double*)calloc(m,sizeof(double*));

		for(j=0;j<m;j++)
		{
			int startin = G.ProwIndex[j];
			int endin = G.ProwIndex[j+1];
			if(startin == endin)
				continue;
				
			k = startin;

			while(k<endin)
			{

				if(G.Pcolumns[k] == i)
				{	
					vect[j] = G.Pmat[k];
					breakloop=1;
					break;
				
				}
				k++;
			}
			if(breakloop == 1)
			{
				break;
			}
		}
		
		midvec = cholmod_l_allocate_dense((size_t)(m), 1, (size_t)(m), 1, Common);
		midvec->dtype=0;
		midvec->x=&vect[0];
		midvec->xtype = 1;
		
		cholmod_dense *res2;
		
		res2 = cholmod_l_solve(CHOLMOD_A, L, midvec, Common);

		
		double *re = (double*)res2->x;
		
//		printf("\n vector %d is:\n", i);
		int i1, j1, k1;
//		for(j1=0;j1<m;j1++)
//		{
//			midmat2flat[i+j1*n] = re[j1];
//			printf(" %lf", re[j1]);
//		}
//		printf("\n");
		for(i1=0;i1<m;i1++)
		{
			midMat[i1][i] = re[i1];
		}
		
		cholmod_l_free_dense(&midvec, Common);
			
	}
	
/*	printf("\n Midmat = \n");
	for(i=0;i<m;i++)
	{
		for(j=0;j<n;j++)
		{
			printf(" %lf", midMat[i][j]);
		}
		printf("\n");
	} 
*/
	double *midMatflat = (double*)calloc((m*n),sizeof(double));
	double *dev_midMat;
	double *dev_solut;
	int counter = 0;
	for(i=0;i<n;i++)
	{
		for(j=0;j<m;j++)
		{
			midMatflat[counter] = midMat[j][i];
			counter++; 
		}
	}
	
	cudaMalloc((void**)&dev_midMat, m*n*sizeof(double));
	cudaMalloc((void**)&dev_solut, n*n*sizeof(double));
	
	cudaMemcpy(dev_midMat, midMatflat, m*n*sizeof(double), cudaMemcpyHostToDevice);
		
	//Solving P^T * midMat; Result stored in dev_solut
		
	cuSparseStatus = cusparseDcsrmm(cuspHandle, CUSPARSE_OPERATION_TRANSPOSE, m, n, n, G.Pnonzero, &alpha, descrP, dev_PcsrVal, dev_PcsrRowIdx, dev_PcsrCol, dev_midMat, m, &beta, dev_solut, n);
	
	if(cuSparseStatus == CUSPARSE_STATUS_SUCCESS)
	{
		printf("\n Solved P^T * G^-1 * P. Result stored in solut\n");
		
	}  
	else
	{
		printf("\n  Failed to Solve P^T * G^-1 * P \n");
		exit(1);
	}

/*	double *matGflat = (double*)calloc(n*n,sizeof(double));
	cudaMemcpy(matGflat, dev_solut, n*n*sizeof(double), cudaMemcpyDeviceToHost);
	counter = 0;
	printf("\nBefore LU starts\n");
	for(i=0;i<n;i++)
	{
		for(j=0;j<n;j++)
		{
			printf(" %lf ", matGflat[counter]);
			counter++;
		}
		printf("\n");
	}
	printf("\n");
*/	
	cusolverStatus_t cuSolverStatus;
	
	
	cusolverDnHandle_t cudenHandle;
	cuSolverStatus = cusolverDnCreate(&cudenHandle);
	int Lwork = 0;
	cuSolverStatus = cusolverDnDgetrf_bufferSize(cudenHandle, n, n, dev_solut, n, &Lwork);
	
	if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
	{
		printf("\n Buffer works\n Lwork = %d\n", Lwork);
	}
	else
	{
		exit(1);	
	}
	double *dev_Workspace;
	int *dev_Ipiv, *dev_Info;
	
	cudaMalloc((void**)&dev_Workspace, Lwork*sizeof(double));
	cudaMalloc((void**)&dev_Ipiv, n*sizeof(int));
	cudaMalloc((void**)&dev_Info, sizeof(int));
	
	//Calculating LU for dev_solut
//	double *nnmat = (double*)calloc(n*n,sizeof(double));
//	cudaMemcpy(nnmat, dev_solut, n*n*sizeof(double), cudaMemcpyDeviceToHost);
//	cuSolverStatus = cusolverDnDgetrfHost(cudenHandle, n, n, 

	cuSolverStatus = cusolverDnDgetrf(cudenHandle, n, n, dev_solut, n, dev_Workspace, dev_Ipiv, dev_Info);
	
	if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
	{
		printf("\n solut has be defactorized into L and U. dev_Ipiv * solut = L * U\n");
	}
	else
	{
	
		printf("\n Unable to defactorize solut into LU\n");
		exit(1);	
	}
	
	//solving dev_solut * x = dev_res1. Result stored in dev_res1
	
	cuSolverStatus = cusolverDnDgetrs(cudenHandle, CUBLAS_OP_N, n, 1, dev_solut, n, dev_Ipiv, dev_res1, n, dev_Info);
	if(cuSolverStatus == CUSOLVER_STATUS_SUCCESS)
	{
		printf("\n Solution obtained for x2 \n");
	}
	else
	{
		printf("\n LU decomposition obtained by LU solver failed\n");
	}

/*	cudaMemcpy(G.x2, dev_res1, n*sizeof(double), cudaMemcpyDeviceToHost);
	printf("\n x2 = \n");
	for(i=0;i<n;i++)
	{
		printf("\n x2[%d] = %lf", i, G.x2[i]); 
	}
*/	
	
	double *dev_dummy;
	cudaMalloc((void**)&dev_dummy, m*sizeof(double));
	cudaMemset(dev_dummy, 0.0, m*sizeof(double));
	
	printf("\n Starting solving for x1 \n");
	//Solving for x1
	
		//Solving G^-1 * P * x2;  G^-1 * P is stored in midMat
	cuBlasStatus = cublasDgemv(cubHandle, CUBLAS_OP_N, m, n, &alpha, dev_midMat, m, dev_res1, 1, &beta, dev_dummy, 1);
	if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
	{
/*		double *toprint = (double*)calloc(m,sizeof(double));
		cudaMemcpy(toprint, dev_dummy, m*sizeof(double), cudaMemcpyDeviceToHost);
		printf("\n Intermediate vector :\n");
		for(i=0;i<m;i++)
		{
			printf("\ndummy[%d] = %lf", i, toprint[i]);
		}
*/		printf("\n midmat * x2 obtained. Stored in dummy\n");
	}
	else
	{
		printf("\n Failed to obtain midmat * x2\n");
	}
	
	cuBlasStatus = cublasDaxpy(cubHandle, m, &alphaneg, dev_dummy, 1, dev_x1, 1);
	if(cuBlasStatus == CUBLAS_STATUS_SUCCESS)
	{
/*		cudaMemcpy(G.x1, dev_x1, m*sizeof(double), cudaMemcpyDeviceToHost);
		printf("\n x1 = \n");
		for(i=0;i<m;i++)
		{
			printf("\n x1[%d] = %.15f", i, G.x1[i]);
		}
*/		printf("\n x1 obtained");
	}
	else
	{
		printf("\n Failed to obtain x1");
	}

	printf("\n Solver finished its work\n");

/*		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);

		cudaEventElapsedTime(&elapsedTime, start, stop);
		printf("\n Time: %.6f msecs :\n", elapsedTime);
*/








	
	
	
	
	

	cholmod_l_finish(Common);
	return 0;

}