Beispiel #1
0
// Forms w = A*p with the CSR coefficient matrix, then adds the dot product
// p.w, taken over the interior (non-halo) cells only, to *pw.
// vec_kx, vec_ky and vec_kz are accepted but not used here.
void cg_solver_calc_w(
        const int x,
        const int y,
        const int z,
        const int halo_depth,
        double* pw,
        double* vec_p,
        double* vec_w,
        double* vec_kx,
        double* vec_ky,
        double* vec_kz,
        int* a_row_index,
        int* a_col_index,
        double* a_non_zeros)
{
    // The sparse product runs over every cell of the grid, halo included
    int num_cells = x*y*z;
    mkl_cspblas_dcsrgemv(
            "n", &num_cells, a_non_zeros, a_row_index, a_col_index, vec_p, vec_w);

    // An interior row is one contiguous run of this many cells
    const int row_length = x - 2*halo_depth;
    const int plane_end = z - halo_depth;
    const int row_end = y - halo_depth;
    double local_pw = 0.0;

#pragma omp parallel for reduction(+:local_pw)
    for(int plane = halo_depth; plane < plane_end; ++plane)
    {
        for(int row = halo_depth; row < row_end; ++row)
        {
            // First interior cell of this row
            const int start = plane*x*y + row*x + halo_depth;
            const double row_dot =
                cblas_ddot(row_length, vec_w + start, 1, vec_p + start, 1);
            local_pw += row_dot;
        }
    }

    // Accumulate into the caller's running total
    *pw += local_pw;
}
Beispiel #2
0
/*
 * Warm-up for the MKL sparse kernels.
 *
 * Builds a throw-away DIM x DIM CSR matrix with FILL random values per row
 * (column indices 0..FILL-1 in every row), runs it once through mkl_dcsrmv,
 * mkl_dcsrsv and mkl_cspblas_dcsrgemv, and releases everything again.
 * The numerical results are discarded.
 *
 * If any allocation fails the warm-up is skipped silently.
 */
void mkl_warmup(){
	srand48(time(0));

	/* MKL takes alpha and beta by address as double; passing the address
	 * of an int makes it read past the object and use a garbage scalar. */
	double alpha = 1.0; double beta = 1.0;
	int m = DIM;

	hbmat_t *t = malloc(sizeof(hbmat_t));
	int *vptr = malloc((DIM+1) * sizeof(int));
	/* size_t arithmetic so DIM*DIM cannot overflow int */
	int *vpos = malloc((size_t)(DIM) * (DIM) * sizeof(int));
	double *vval = malloc((size_t)(DIM) * (DIM) * sizeof(double));
	double *x = malloc(DIM*sizeof(double));
	/* mkl_dcsrmv computes y := alpha*A*x + beta*y with beta = 1, so y is
	 * read before it is written and must start from defined values. */
	double *y = calloc(DIM, sizeof(double));

	if ( t != NULL && vptr != NULL && vpos != NULL && vval != NULL
			&& x != NULL && y != NULL ) {
		t->m = DIM; t->n = DIM;
		t->vdiag = NULL;
		t->vptr = vptr; t->vpos = vpos; t->vval = vval;

		puts("warm-up");

		/* Row i-1 owns the slots [vptr[i-1], vptr[i]) */
		vptr[0] = 0;
		for ( int i = 1; i <= DIM; ++i ) {
			vptr[i] = vptr[i-1] + FILL;
			for ( int j = vptr[i-1]; j < vptr[i]; ++j ) {
				vpos[j] = j - vptr[i-1];
				vval[j] = drand48();
			}
		}

		for ( int i = 0; i < DIM; ++i )
			x[i] = drand48();

		mkl_dcsrmv("N", &m, &m, &alpha, "GLNC", vval, vpos, vptr, vptr+1, x, &beta, y);
		mkl_dcsrsv("N", &m, &alpha, "TLNC", vval, vpos, vptr, vptr+1, x, y);
		mkl_cspblas_dcsrgemv("N", &m, vval, vptr, vpos, x, y);
	}

	/* free(NULL) is a no-op, so this is safe after a partial allocation */
	free(x); free(y);
	free(vptr); free(vpos); free(vval);
	free(t);
}
/*
 * For each i in [0, A->n): fills a dense work vector from A via
 * array_s2d(A, ., i), multiplies it by A with mkl_cspblas_dcsrgemv, and
 * subtracts the product from the values stored in C's i-th CSR segment,
 * picking the product entry at each stored position index of C.
 * NOTE(review): array_s2d presumably scatters row/column i of A into the
 * dense vector - confirm against its definition.
 */
void dsyrk_sparse_upper(hbmat_t* A, hbmat_t* C){

	/* Build the CSR arrays of either operand if they are still missing */
	if ( A->vptr == NULL )
		hyper_sym_csr_task2(A);

	if ( C->vptr == NULL )
		hyper_sym_csr_task2(C);

	const int count = A->n;
	int dim = A->m;
	int* a_ptr = A->vptr;
	int* a_pos = A->vpos;
	double* a_val = A->vval;
	int* c_ptr = C->vptr;
	int* c_pos = C->vpos;
	double* c_val = C->vval;

	double* dense_in = malloc(dim*sizeof(double));
	double* dense_out = malloc(dim*sizeof(double));
	char* no_trans = "N";

	for ( int seg = 0; seg < count; seg++ ) {
		/* Only the input is cleared; the output vector is handed to the
		 * matrix-vector product as its result buffer. */
		array_clear(dense_in, dim);
		array_s2d(A, dense_in, seg);
		mkl_cspblas_dcsrgemv(no_trans, &dim, a_val, a_ptr, a_pos, dense_in, dense_out);

		const int seg_end = c_ptr[seg+1];
		for ( int slot = c_ptr[seg]; slot < seg_end; slot++ )
			c_val[slot] -= dense_out[c_pos[slot]];
	}

	free(dense_in);
	free(dense_out);
}
/*
 * Sparse matrix-vector product y = A*x through MKL's zero-based CSR routine
 * mkl_cspblas_dcsrgemv. A has m rows and is described by values / rowIndex /
 * columns.
 */
void spmv_mkl_double(MKL_INT m, 
                     double values[],
                     MKL_INT rowIndex[],
                     MKL_INT columns[],
                     double x[],
                     double y[])
{
    /* 'n' selects the non-transposed product */
    char no_transpose = 'n';

    mkl_cspblas_dcsrgemv(
        &no_transpose, &m, values, rowIndex, columns, x, y);
}
Beispiel #5
0
// One Chebyshev iteration: w = A*u over the whole grid, then for every
// interior row r = u0 - w and p = alpha*p + beta*r, and finally
// cheby_calc_u() is called with u and p.
// vec_kx, vec_ky and vec_kz are accepted but not used here.
void cheby_solver_iterate(
        const int x,
        const int y,
        const int z,
        const int halo_depth,
        double alpha,
        double beta,
        double* vec_u,
        double* vec_u0,
        double* vec_p,
        double* vec_r,
        double* vec_w,
        double* vec_kx,
        double* vec_ky,
        double* vec_kz,
        int* a_row_index,
        int* a_col_index,
        double* a_non_zeros)
{
    // Sparse product over every cell, halo included
    int num_cells = x*y*z;
    mkl_cspblas_dcsrgemv(
            "n", &num_cells, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w);

    // Length of one contiguous interior row
    const int row_length = x - 2*halo_depth;

#pragma omp parallel for
    for(int plane = halo_depth; plane < z-halo_depth; ++plane)
    {
        for(int row = halo_depth; row < y-halo_depth; ++row)
        {
            const int start = plane*x*y + row*x + halo_depth;
            double* r_row = vec_r + start;
            double* p_row = vec_p + start;

            // r = u0 - w
            cblas_dcopy(row_length, vec_u0 + start, 1, r_row, 1);
            cblas_daxpy(row_length, -1.0, vec_w + start, 1, r_row, 1);

            // p = alpha*p + beta*r
            cblas_dscal(row_length, alpha, p_row, 1);
            cblas_daxpy(row_length, beta, r_row, 1, p_row, 1);
        }
    }

    cheby_calc_u(x, y, z, halo_depth, vec_u, vec_p);
}
Beispiel #6
0
// Initialises the CG solver.
//
// Steps, in order:
//   1. Calls die() if coefficient is neither CONDUCTIVITY nor
//      RECIP_CONDUCTIVITY.
//   2. Over the full x*y*z grid: zeroes vec_p and vec_r and sets
//      vec_u = energy*density.
//   3. Over all cells except the outermost layer: stores density
//      (CONDUCTIVITY) or 1/density (any other value) in vec_w.
//   4. Derives vec_kx / vec_ky / vec_kz from vec_w, scaled by rx / ry / rz.
//   5. Writes the values (a_non_zeros) and column indices (a_col_index) of
//      the CSR coefficient matrix. a_row_index is only read in this function,
//      so it has to be populated before the call.
//   6. Computes w = A*u, then r = u - w and p = r over the interior rows, and
//      adds the dot product r.p to *rro.
//
// All arrays are indexed as ii*x*y + jj*x + kk (kk fastest).
void cg_solver_init(
        const int x,
        const int y,
        const int z,
        const int halo_depth,
        const int coefficient,
        double rx,
        double ry,
        double rz,
        double* rro,
        double* density,
        double* energy,
        double* vec_u,
        double* vec_p,
        double* vec_r,
        double* vec_w,
        double* vec_kx,
        double* vec_ky,
        double* vec_kz,
        int* a_row_index,
        int* a_col_index,
        double* a_non_zeros)
{
    // NOTE(review): whether die() returns is not visible here; if it does,
    // the code below treats an invalid coefficient like RECIP_CONDUCTIVITY.
    if(coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY)
    {
        die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient);
    }

    // Reset p and r, and seed u, on every cell including the halo
#pragma omp parallel for
    for(int ii = 0; ii < z; ++ii)
    {
        for(int jj = 0; jj < y; ++jj)
        {
            for(int kk = 0; kk < x; ++kk)
            {
                const int index = ii*y*x+jj*x+kk;
                vec_p[index] = 0.0;
                vec_r[index] = 0.0;
                vec_u[index] = energy[index]*density[index];
            }
        }
    }

    // vec_w temporarily holds the per-cell coefficient; the outermost layer
    // of cells (index 0 and size-1 on each axis) is not written.
#pragma omp parallel for
    for(int ii = 1; ii < z-1; ++ii)
    {
        for(int jj = 1; jj < y-1; ++jj)
        {
            for(int kk = 1; kk < x-1; ++kk)
            {
                const int index = ii*y*x+jj*x+kk;
                vec_w[index] = (coefficient == CONDUCTIVITY) 
                    ? density[index] : 1.0/density[index];
            }
        }
    }

    // Each k value combines a cell with its lower neighbour along one axis:
    // r*(a + b) / (2*a*b), with a the neighbour's and b the cell's vec_w.
#pragma omp parallel for
    for(int ii = halo_depth; ii < z-1; ++ii)
    {
        for(int jj = halo_depth; jj < y-1; ++jj)
        {
            for(int kk = halo_depth; kk < x-1; ++kk)
            {
                const int index = ii*x*y + jj*x + kk;
                vec_kx[index] = rx*(vec_w[index-1]+vec_w[index]) /
                    (2.0*vec_w[index-1]*vec_w[index]);
                vec_ky[index] = ry*(vec_w[index-x]+vec_w[index]) /
                    (2.0*vec_w[index-x]*vec_w[index]);
                vec_kz[index] = rz*(vec_w[index-x*y]+vec_w[index]) /
                    (2.0*vec_w[index-x*y]*vec_w[index]);
            }
        }
    }

    // Initialise the CSR sparse coefficient matrix
    // (this loop nest carries no OpenMP pragma, unlike the ones above)
    for(int ii = halo_depth; ii < z-1; ++ii)
    {
        for(int jj = halo_depth; jj < y-1; ++jj)
        {
            for(int kk = halo_depth; kk < x-1; ++kk)
            {
                const int index = ii*x*y + jj*x + kk;
                // First slot of this row, as recorded in a_row_index
                int coef_index = a_row_index[index];

                // NOTE(review): the three ">= halo_depth" guards below are
                // always true because every loop starts at halo_depth.
                if(ii >= halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kz[index];
                    a_col_index[coef_index++] = index-x*y;
                }

                if(jj >= halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_ky[index];
                    a_col_index[coef_index++] = index-x;
                }

                if(kk >= halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kx[index];
                    a_col_index[coef_index++] = index-1;
                }

                // Diagonal entry: 1 plus the six surrounding k values
                a_non_zeros[coef_index] = (1.0 + 
                        vec_kx[index+1] + vec_kx[index] + 
                        vec_ky[index+x] + vec_ky[index] + 
                        vec_kz[index+x*y] + vec_kz[index]);
                a_col_index[coef_index++] = index;

                // NOTE(review): the entries after the diagonal are stored in
                // descending column order (index+x*y, index+x, index+1) -
                // confirm this matches how a_row_index was sized and what the
                // consumer of the matrix expects.
                if(ii < z-halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kz[index+x*y];
                    a_col_index[coef_index++] = index+x*y;
                }

                if(jj < y-halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_ky[index+x];
                    a_col_index[coef_index++] = index+x;
                }

                // Last possible entry of the row, so coef_index is not advanced
                if(kk < x-halo_depth)
                {
                    a_non_zeros[coef_index] = -vec_kx[index+1];
                    a_col_index[coef_index] = index+1;
                }
            }
        }
    }

    double rro_temp = 0.0;

    // w = A*u over all x*y*z rows; vec_w is reused as the output vector and
    // no longer holds the per-cell coefficient after this call.
    int m = x*y*z;
    mkl_cspblas_dcsrgemv(
            "n", &m, a_non_zeros, a_row_index, a_col_index, vec_u, vec_w);

    // Length of one contiguous interior row
    int x_inner = x-2*halo_depth;

#pragma omp parallel for reduction(+:rro_temp)
    for(int ii = halo_depth; ii < z-halo_depth; ++ii)
    {
        for(int jj = halo_depth; jj < y-halo_depth; ++jj)
        {
            const int offset = ii*y*x + jj*x + halo_depth;
            // r = u - w
            cblas_dcopy(x_inner, vec_u + offset, 1, vec_r + offset, 1);
            cblas_daxpy(x_inner, -1.0, vec_w + offset, 1, vec_r + offset, 1);
            // p = r
            cblas_dcopy(x_inner, vec_r + offset, 1, vec_p + offset, 1);
            // accumulate r.p for this row
            rro_temp += cblas_ddot(x_inner, vec_r + offset, 1, vec_p + offset, 1);
        }
    }

    // Sum locally
    *rro += rro_temp;
}
/*
 * Solves A*solution = rhs with MKL's reverse-communication (RCI) conjugate
 * gradient solver, without preconditioning.
 *
 * n         order of the system
 * a, ia, ja matrix in zero-based CSR form (values, row pointers, columns)
 * solution  initial guess on entry, computed solution on exit
 * rhs       right-hand side
 * max_iter  iteration limit handed to the solver
 * r_tol     relative residual tolerance (squared before being passed on)
 * a_tol     absolute residual tolerance (squared before being passed on)
 *
 * Prints the true residual norm ||A*solution - rhs|| and the iteration count
 * on completion, or the solver's error code on failure.
 *
 * Returns 0 when the solver finished within max_iter iterations and the
 * acceptance test at the end of this function passes, 1 otherwise (including
 * solver errors and allocation failure). All work memory is released on
 * every path.
 */
int cg_mkl_double(MKL_INT n, 
                  double a[], 
                  MKL_INT ia[],
                  MKL_INT ja[],
                  double solution[],
                  double rhs[],
                  MKL_INT max_iter,
                  double r_tol,
                  double a_tol)
{
    MKL_INT rci_request, itercount, i;
    MKL_INT incx = 1;

    // parameter arrays for solver
    MKL_INT ipar[128];
    double  dpar[128];

    double euclidean_norm;
    int status = 1;

    // for SpMV
    char tr = 'n';

    // The solver's work array holds four vectors of length n
    double *tmp      = (double *) malloc(4 * n * sizeof(double));
    double *residual = (double *) malloc(n * sizeof(double));

    // malloc(0) may legitimately return NULL, so only n > 0 counts as failure
    if (n > 0 && (tmp == NULL || residual == NULL))
    {
        printf("MKL CG could not allocate its work arrays\n");
        goto cleanup;
    }

    // initialize the solver
    dcg_init(&n, solution, rhs, &rci_request, ipar, dpar, tmp);
    if (rci_request != 0) goto failure;

    ipar[1] = 6;                     // output all warnings and errors
    ipar[4] = max_iter;              // maximum number of iterations
    ipar[7] = 1;                     // stop iteration at maximum iterations
    ipar[8] = 1;                     // residual stopping test
    ipar[9] = 0;                     // request for the user defined stopping test
    dpar[0] = r_tol * r_tol;         // relative residual tolerance
    dpar[1] = a_tol * a_tol;         // absolute residual tolerance

    // Check the correctness and consistency of the newly set parameters
    dcg_check(&n, solution, rhs, &rci_request, ipar, dpar, tmp);
    if (rci_request != 0) goto failure;

    // Reverse communication: as long as the solver answers with request 1 it
    // wants A*tmp[0..n) stored in tmp[n..2n). Request 0 means the solution
    // was found; any other value is an error.
    for (;;)
    {
        dcg(&n, solution, rhs, &rci_request, ipar, dpar, tmp);
        if (rci_request != 1) break;
        mkl_cspblas_dcsrgemv(&tr, &n, a, ia, ja, tmp, &tmp[n]);
    }
    if (rci_request != 0) goto failure;

    // Get the current iteration number into itercount
    dcg_get(&n, solution, rhs, &rci_request, ipar, dpar, tmp, &itercount);

    // True residual norm: ||A*solution - rhs||
    mkl_cspblas_dcsrgemv(&tr, &n, a, ia, ja, solution, residual);
    for (i = 0; i < n; i++) residual[i] -= rhs[i];
    euclidean_norm = dnrm2(&n, residual, &incx);

    // MKL_INT may be wider than int, so convert before printing with %d
    printf("\nMKL CG reached %e residual in %d iterations\n",euclidean_norm, (int)itercount);

    // NOTE(review): MKL's RCI CG documentation appears to describe dpar[4]
    // and dpar[5] as the squared residual norms of the current and previous
    // iteration rather than tolerances - confirm this acceptance test is the
    // intended one. It is kept exactly as it was.
    if (itercount <= max_iter && (euclidean_norm * euclidean_norm) < (dpar[0] * dpar[4] + dpar[5]))
    {
        status = 0;
    }
    goto cleanup;

failure:
    printf("This example FAILED as the solver has returned the ERROR ");
    printf("code %d", (int)rci_request);

cleanup:
    // Release internal MKL buffers as well as our own work arrays; this also
    // covers the failure paths, which previously leaked tmp and residual.
    MKL_FreeBuffers();
    free(tmp);
    free(residual);
    return status;
}
Beispiel #8
0
// Reordering benchmark driver.
//
// Loads the Matrix Market file named by argv[1] as a symmetric CSR matrix,
// times REPEAT sparse matrix-vector products on it (flushing the last-level
// cache before each run), optionally times MKL's mkl_cspblas_dcsrgemv on the
// same data, and then, for each reordering from BFS to RCM, builds the
// permutation, permutes the matrix and times the product again.
//
// Returns -1 when no input file is given, 0 otherwise.
int main(int argc, char **argv)
{
  if (argc < 2) {
    fprintf(stderr, "Usage: reordering_test matrix_in_matrix_market_format\n");
    return -1;
  }

  CSR *A = new CSR(argv[1], 0, true /* force-symmetric */);
  int nnz = A->getNnz();
  // Computed in double so large matrices cannot overflow int arithmetic
  double flops = 2.0*nnz;
  double bytes = (sizeof(double) + sizeof(int))*nnz + sizeof(double)*(A->m + A->n);

  printf("original bandwidth %d\n", A->getBandwidth());

  double *x = MALLOC(double, A->m);
  double *y = MALLOC(double, A->m);

  // allocate a large buffer to flush out cache
  // NOTE(review): this buffer is not released anywhere in this function.
  bufToFlushLlc = (double *)_mm_malloc(LLC_CAPACITY, 64);

  const int REPEAT = 128;
  double times[REPEAT];

  // Baseline: the library's own SpMV on the original ordering
  for (int i = 0; i < REPEAT; ++i) {
    flushLlc();

    double t = omp_get_wtime();
    A->multiplyWithVector(y, x);
    times[i] = omp_get_wtime() - t;
  }
  correctnessCheck(A, y);

  printf("SpMV BW");
  printEfficiency(times, REPEAT, flops, bytes);

#ifdef MKL
  // Same measurement with MKL's zero-based CSR kernel
  for (int i = 0; i < REPEAT; ++i) {
    flushLlc();

    double t = omp_get_wtime();
    mkl_cspblas_dcsrgemv(
      "N", &A->m, A->values, A->rowptr, A->colidx, x, y);
    times[i] = omp_get_wtime() - t;
  }
  correctnessCheck(A, y);

  printf("MKL SpMV BW");
  printEfficiency(times, REPEAT, flops, bytes);
#endif

  int *perm = MALLOC(int, A->m);
  int *inversePerm = MALLOC(int, A->m);

  for (int o = BFS; o <= RCM; ++o) {
    Option option = (Option)o;

    switch (option) {
    case BFS:
      printf("\nBFS reordering\n");
      break;
    case RCM_WO_SOURCE_SELECTION:
      printf("\nRCM reordering w/o source selection heuristic\n");
      break;
    case RCM:
      printf("\nRCM reordering\n");
      break;
    default: assert(false); break;
    }

    // Time the construction of the permutation
    double t = -omp_get_wtime();
    switch (option) {
    case BFS:
      A->getBFSPermutation(perm, inversePerm);
      break;
    case RCM_WO_SOURCE_SELECTION:
      A->getRCMPermutation(perm, inversePerm, false);
      break;
    case RCM:
      A->getRCMPermutation(perm, inversePerm);
      break;
    default: assert(false); break;
    }
    t += omp_get_wtime();

    // 4.0 keeps the byte count in double; nnz*4 could overflow int
    printf(
      "Constructing permutation takes %g (%.2f gbps)\n",
      t, nnz*4.0/t/1e9);

    isPerm(perm, A->m);
    isPerm(inversePerm, A->m);

    // Time the permutation of the matrix itself
    t = -omp_get_wtime();
    CSR *APerm = A->permute(perm, inversePerm);
    t += omp_get_wtime();

    printf("Permute takes %g (%.2f gbps)\n", t, bytes/t/1e9);
    printf("Permuted bandwidth %d\n", APerm->getBandwidth());

    for (int i = 0; i < REPEAT; ++i) {
      flushLlc();

      t = omp_get_wtime();
      APerm->multiplyWithVector(y, x);
      times[i] = omp_get_wtime() - t;
    }
    printf("SpMV BW");
    printEfficiency(times, REPEAT, flops, bytes);

    delete APerm;
  }

  // The permutation arrays come from MALLOC just like x and y, so they are
  // released the same way (they were previously leaked).
  FREE(perm);
  FREE(inversePerm);
  FREE(x);
  FREE(y);

  delete A;

  return 0;
}