예제 #1
0
/*
 * 1-D FDTD benchmark driver: runs T time steps over the global field
 * arrays e[] and h[] (coeff1/coeff2 are the update coefficients
 * declared elsewhere), optionally timing the kernel via IF_TIME.
 */
int main()
{
    int t, i;  /* j, k, l from the original were never used */

    double t_start, t_end;

    init_array();

	IF_TIME(t_start = rtclock());

#pragma scop
    for (t=1; t<=T; t++){
	    for (i=1; i<=N-1; i++)
		    e[i] = e[i] - coeff1*(h[i]-h[i-1]);
	    for (i=0; i<=N-1; i++)
		    h[i] = h[i] - coeff2*(e[i+1]-e[i]);
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stderr, "%0.6lfs\n", t_end - t_start));

    /* Print results only when a ".test" marker file exists. Close the
     * probe handle instead of leaking it (the original called fopen()
     * inside the condition and dropped the FILE*). */
    FILE *test_fp = fopen(".test", "r");
    if (test_fp) {
        fclose(test_fp);
        print_array();
    }

    return 0;
}
예제 #2
0
/*
 * Benchmark driver for the stencil f[i][j] = f[j][i] + f[i][j-1] over
 * the interior of the global array f, timed via IF_TIME/rtclock.
 */
int main()
{
    int i, j;  /* k and t from the original were never used */

    init_array() ;

#ifdef PERFCTR
    PERF_INIT; 
#endif

    IF_TIME(t_start = rtclock());

    /* pluto start (N) */
#pragma scop
    for (i=1; i<=N-2; i++)  {
        for (j=1; j<=N-2; j++)  {
            f[i][j] = f[j][i] + f[i][j-1];
        }
    }
#pragma endscop
    /* pluto end */

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stderr, "%0.6lfs\n", t_end - t_start));

    /* Close the ".test" probe handle instead of leaking it. */
    FILE *test_fp = fopen(".test", "r");
    if (test_fp) {
        fclose(test_fp);
        print_array();
    }

    return 0;
}
예제 #3
0
/*
 * SYRK driver: run the GPU version (result in D) and the timed CPU
 * reference (result in C), then compare the two results.
 */
int main() {
  double t_start, t_end;

  DATA_TYPE* A;
  DATA_TYPE* C;
  DATA_TYPE* D;

  A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
  C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
  D = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));

  /* Fail fast instead of dereferencing NULL inside the kernels
   * (the original never checked the allocations). */
  if (A == NULL || C == NULL || D == NULL) {
    fprintf(stderr, "memory allocation failed\n");
    free(A);
    free(C);
    free(D);
    return 1;
  }

  fprintf(stdout, "<< Symmetric rank-k operations >>\n");

  init_arrays(A, C, D);	
  syrkGPU(A, D);

  t_start = rtclock();
  syrk(A, C);
  t_end = rtclock();
  fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

  compareResults(C, D);

  free(A);
  free(C);
  free(D);
  return 0;
}
예제 #4
0
/*
 * OpenMP target-offload SYRK: D = beta*D + alpha*A*A^T, where A and D
 * are flattened N x M row-major buffers; alpha/beta are globals.
 * Prints the offload wall-clock time to stdout.
 */
void syrkGPU(DATA_TYPE* A, DATA_TYPE* D) {
  int i, j;
  double t_start, t_end;

  t_start = rtclock();

  /* The original emitted two back-to-back "#pragma omp target"
   * directives, which nests one target region inside another
   * (non-conforming in OpenMP). Merge the device clause and the map
   * clauses into a single directive. */
  #pragma omp target device (GPU_DEVICE) map(to: A[:N*M]) map(tofrom: D[:N*M])
  {
    /* Scale D by beta. */
    #pragma omp parallel for
    for (i = 0; i < N; i++) {
      for (j = 0; j < M; j++) {
	D[i * M + j] *= beta;
      }
    }
    
    /* Accumulate alpha * A * A^T. */
    #pragma omp parallel for collapse(2)
    for (i = 0; i < N; i++) {
      for (j = 0; j < M; j++) {
	int k;		
	for(k=0; k< M; k++) {
	  D[i * M + j] += alpha * A[i * M + k] * A[j * M + k];
	}
      }
    }
  }
  
  t_end = rtclock();
  fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);

}
예제 #5
0
/*
 * Enqueue the fused SYRK kernel: set scalar/buffer arguments on
 * clKernel1 and launch it over an N x M global range rounded up to a
 * multiple of the work-group size. The timed span covers argument
 * setup and enqueue only; clFinish is commented out, so t_end is taken
 * without waiting for kernel completion -- the reported "GPU Runtime"
 * is mostly enqueue overhead. NOTE(review): confirm the missing
 * clFinish is intentional for this measurement.
 */
void cl_launch_kernel()
{
	double t_start, t_end;

	int m = M;
	int n = N;

	size_t localWorkSize[2], globalWorkSize[2];
	localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
	localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
	/* Round global sizes up to whole work-groups. */
	globalWorkSize[0] = (size_t)ceil(((float)N) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = (size_t)ceil(((float)M) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;

	t_start = rtclock();
	
	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&c_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 2, sizeof(DATA_TYPE), (void *)&alpha);
	errcode |= clSetKernelArg(clKernel1, 3, sizeof(DATA_TYPE), (void *)&beta);
	errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&m);
	errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&n);

	if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");

	// Execute the OpenCL kernel
	errcode = clEnqueueNDRangeKernel_fusion(clCommandQue, clKernel1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	//errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
//	clFinish(clCommandQue);

	t_end = rtclock();
	fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
	fprintf(stdout, "CAUTION:CPU offset %d %% GPU Runtime: %0.6lf s\n",cpu_offset, t_end - t_start);
}
예제 #6
0
파일: mvt.c 프로젝트: intersense/pluto-gw
/*
 * MVT driver: x1 += a * y_1 and x2 += a^T * y_2 over the global N x N
 * matrix a (all operands are globals declared elsewhere).
 */
int main()
{
    int i, j;  /* k, l, t from the original were never used */

    double t_start, t_end;

    init_array() ;

    IF_TIME(t_start = rtclock());

#pragma scop
    /* x1 = x1 + a * y_1 (row-major traversal of a) */
    for (i=0; i<N; i++) {
        for (j=0; j<N; j++) {
            x1[i] = x1[i] + a[i][j] * y_1[j];
        }
    }

    /* x2 = x2 + a^T * y_2 (column of a per output element) */
    for (i=0; i<N; i++) {
        for (j=0; j<N; j++) {
            x2[i] = x2[i] + a[j][i] * y_2[j];
        }
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(printf("%0.6lfs\n", t_end - t_start));

#ifdef TEST
    print_array();
#endif
    return 0;
}
예제 #7
0
/*
 * Tensor-contraction benchmark:
 *   R[v1][v2][o1][o2] += T[v1][ox][o1][o2] * A2[v2][ox]
 * repeated REPS times; prints the mean per-repetition wall time.
 */
int main()
{
  init_arrays();

  double annot_t_start=0, annot_t_end=0, annot_t_total=0;
  int annot_i;

  int v1,v2,o1,o2,ox;  /* the original's unused tv1..tox were dropped */

  for (annot_i=0; annot_i<REPS; annot_i++)
  {
    annot_t_start = rtclock();

    for (v1=0; v1<=V-1; v1=v1+1) 
      for (v2=0; v2<=V-1; v2=v2+1) 
	for (o1=0; o1<=O-1; o1=o1+1) 
	  for (o2=0; o2<=O-1; o2=o2+1) 
	    for (ox=0; ox<=O-1; ox=ox+1) 
	      R[v1][v2][o1][o2]=R[v1][v2][o1][o2]+T[v1][ox][o1][o2]*A2[v2][ox];
    
    annot_t_end = rtclock();
    annot_t_total += annot_t_end - annot_t_start;
  }
  
  annot_t_total = annot_t_total / REPS;
  printf("%f\n", annot_t_total);
  
  /* NOTE(review): main returns 1 (conventionally "failure"); kept as-is
   * in case the benchmark harness depends on it -- confirm upstream. */
  return 1;
}
예제 #8
0
/*
 * SYRK driver (HMPP offload): run the codelet on the GPU into
 * C_outputFromGpu, then the same routine on the CPU into C, timing
 * each run, and compare the two results. The hmpp pragmas handle
 * device allocation, transfers, and synchronization.
 */
int main(int argc, char** argv)
{
	double t_start, t_end;
	
	/* Array declaration */
	DATA_TYPE A[N][M];
	DATA_TYPE C[N][N];
	DATA_TYPE C_outputFromGpu[N][N];

	/* Initialize array. */
	init_arrays(A, C, C_outputFromGpu);

	/* Allocate device buffers and upload a/c before the timer starts. */
	#pragma hmpp syrk allocate
	#pragma hmpp syrk advancedload, args[a,c]

	t_start = rtclock();
	/* Asynchronous launch; the synchronize pragma below blocks until it
	   completes, so the timed span covers the whole GPU run. */
	#pragma hmpp syrk callsite, args[a,c].advancedload=true, asynchronous
	runSyrk(A, C_outputFromGpu);
	#pragma hmpp syrk synchronize

	t_end = rtclock();
	fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);
	
	/* Download c from the device and release device resources. */
	#pragma hmpp syrk delegatedstore, args[c]
	#pragma hmpp syrk release

	t_start = rtclock();
	runSyrk(A, C);
	t_end = rtclock();
	fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	compareResults(C, C_outputFromGpu);

	return 0;
}
예제 #9
0
/*
 * Two-stage blur benchmark: stage 1 (horizontal blur of in[] into
 * blurx[]) and stage 2 (vertical blur of blurx[] into out[]) are fused
 * in one loop nest; the y >= 2 guard delays stage 2 until enough blurx
 * rows exist. Runs 10 trials and reports the mean per-trial time.
 * Under VERIFY, stage 1 is recomputed into A[] and out[] is checked.
 */
void foo(){
	int y,x,trial;
    IF_TIME(t_start = rtclock());

for (trial=0;trial<10;++trial)
{
#pragma scop
	for (y = 0; y <= M-1; ++y)
   		for(x = 0; x <= N-1; ++x) {
	         	blurx[y][x]=in[x][y]+in[x+1][y]+in[x+2][y];
			if (y >= 2)
		 		out[x][y-2]=blurx[y-2][x]+blurx[y-1][x]+blurx[y][x];
   		}
#pragma endscop
}


    IF_TIME(t_end = rtclock());
    /* After the loop trial == 10, so this is the mean per-trial time. */
    IF_TIME(fprintf(stdout, "%s\t\t(M=%d,N=%d) \t %0.6lfs\n", __FILE__, M, N, (t_end - t_start)/trial));

#ifdef VERIFY
/* Stage 1 reference: horizontal blur into A. */
for(x = 0; x <= N-1; ++x)
	for(y = 0; y <= M-1; ++y)
		A[x][y]=in[x][y]+in[x+1][y]+in[x+2][y];
// Stage 2: vertical blur
for(x = 0; x <= N-1; ++x)
	for(y = 2; y <= M-1; ++y)
	{
		if(out[x][y-2] != A[x][y]+A[x][y-1]+A[x][y-2])
		{
			printf("blur-smo.c: Difference at (%d, %d) : %f versus %f\n", x, y, out[x][y-2], A[x][y]+A[x][y-1]+A[x][y-2]);
		}
	}
#endif
}
예제 #10
0
/*
 * 3MM OpenCL driver: G = (A.B).(C.D) computed by fused GPU kernels
 * (result read into G_outputFromGpu) and by the timed CPU reference
 * (result in G), then compared. Optional argv[1] sets the global
 * cpu_offset work-partitioning knob.
 */
int main(int argc, char* argv[]) 
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* B;
	DATA_TYPE* C;
	DATA_TYPE* D;
	DATA_TYPE* E;
	DATA_TYPE* F;
	DATA_TYPE* G;
	DATA_TYPE* G_outputFromGpu;

	if(argc==2){
		printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
		cpu_offset = atoi(argv[1]);
	}

	A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
	C = (DATA_TYPE*)malloc(NJ*NM*sizeof(DATA_TYPE));
	D = (DATA_TYPE*)malloc(NM*NL*sizeof(DATA_TYPE));
	E = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
	F = (DATA_TYPE*)malloc(NJ*NL*sizeof(DATA_TYPE));
	G = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));
	G_outputFromGpu = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));

	/* Fail fast on OOM (the original never checked the allocations and
	 * declared an unused local i). */
	if (!A || !B || !C || !D || !E || !F || !G || !G_outputFromGpu) {
		fprintf(stderr, "memory allocation failed\n");
		free(A); free(B); free(C); free(D);
		free(E); free(F); free(G); free(G_outputFromGpu);
		return 1;
	}

	init_array(A, B, C, D);
	read_cl_file();
	cl_initialization_fusion();
	cl_mem_init(A, B, C, D, E, F, G);
	cl_load_prog();

	cl_launch_kernel();

	/* Blocking read of the GPU result. */
	errcode = clEnqueueReadBuffer(clCommandQue[0], g_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NL, G_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	t_start = rtclock();
	mm3_cpu(A, B, C, D, E, F, G);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(G, G_outputFromGpu);
	cl_clean_up();

	free(A);
	free(B);
	free(C);
	free(D);
	free(E);
	free(F);
	free(G);
	free(G_outputFromGpu);

	return 0;
}
예제 #11
0
/*
 * 3MM driver (HMPP offload): E = A.B, F = C.D, G = E.F executed as
 * three codelets on the GPU (results in E_gpu/F_gpu/G_outputFromGpu),
 * then the same three loops on the CPU, with both runs timed and the
 * two G results compared.
 */
int main(int argc, char** argv)
{
	double t_start, t_end;

	/* Array declaration */
	DATA_TYPE A[NI][NK];
	DATA_TYPE B[NK][NJ];
	DATA_TYPE C[NJ][NM];
	DATA_TYPE D[NM][NL];
	DATA_TYPE E[NI][NJ];
	DATA_TYPE E_gpu[NI][NJ];	
	DATA_TYPE F[NJ][NL];
	DATA_TYPE F_gpu[NJ][NL];
	DATA_TYPE G[NI][NL];
	DATA_TYPE G_outputFromGpu[NI][NL];

	/* INItialize array. */
	iNIt_array(A, B, C, D);
    
	/* Device allocation and input uploads happen before the timer. */
	#pragma hmpp <group1> allocate

	#pragma hmpp <group1> loopa advancedload, args[a;b;e]
	#pragma hmpp <group1> loopb advancedload, args[f;c;d]
	#pragma hmpp <group1> loopc advancedload, args[g]

	t_start = rtclock();
	/* Each codelet launches asynchronously and is synchronized before
	   the next one consumes its output. */
	#pragma hmpp <group1> loopa callsite, args[a;b;e].advancedload=true, asynchronous
	threeMMloopa(A, B, E_gpu);
	#pragma hmpp <group1> loopa synchronize
	#pragma hmpp <group1> loopb callsite, args[f;c;d].advancedload=true, asynchronous
	threeMMloopb(C, D, F_gpu);
	#pragma hmpp <group1> loopb synchronize
	#pragma hmpp <group1> loopc callsite, args[g;e;f].advancedload=true, asynchronous
	threeMMloopc(E_gpu, F_gpu, G_outputFromGpu);
	#pragma hmpp <group1> loopc synchronize

	t_end = rtclock();
	fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);
    
	/* Download results and release device buffers. */
	#pragma hmpp <group1> loopa delegatedstore, args[a;b]
	#pragma hmpp <group1> loopb delegatedstore, args[c;d]
	#pragma hmpp <group1> loopc delegatedstore, args[g;e;f]

	#pragma hmpp <group1> release
	
	t_start = rtclock();

	/* CPU reference run of the same three matrix products. */
	threeMMloopa(A, B, E);
	threeMMloopb(C, D, F);
	threeMMloopc(E, F, G);

	t_end = rtclock();
	fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	compareResults(G, G_outputFromGpu);

	return 0;
}
예제 #12
0
/*
 * Covariance driver (HMPP offload): three codelets (mean, centering,
 * covariance) run on the GPU into *_Gpu / symmat_outputFromGpu, then
 * the same three routines run on the CPU, and the symmat results are
 * compared. NOTE(review): locals m and n are set but never used in the
 * visible code -- possibly consumed by the hmpp codelets; confirm.
 */
int main(int argc, char** argv)
{
	int m = M;
	int n = N;
	double t_start, t_end;

	/* Array declaration */
	DATA_TYPE float_n = 321414134.01;
	DATA_TYPE data[M + 1][N + 1];
	DATA_TYPE data_Gpu[M + 1][N + 1];
	DATA_TYPE symmat[M + 1][M + 1];
	DATA_TYPE symmat_outputFromGpu[M + 1][M + 1];	
	DATA_TYPE mean[M + 1];
	DATA_TYPE mean_Gpu[M + 1];

	/* Initialize array. */
	init_arrays(data, data_Gpu);
    
	/* Device allocation and uploads happen before the timer starts. */
	#pragma hmpp <group1> allocate
	#pragma hmpp <group1> loopa advancedload, args[pmean;pdata;pfloat_n]
    
	#pragma hmpp <group1> loopc advancedload, args[psymmat]

	t_start = rtclock();
	
	/* Pipeline: mean -> center data -> covariance, each synchronized
	   before its output is consumed by the next codelet. */
	#pragma hmpp <group1> loopa callsite, args[pmean;pdata;pfloat_n].advancedload=true, asynchronous
	covarLoopa(mean_Gpu, data_Gpu, float_n);
	#pragma hmpp <group1> loopa synchronize
	#pragma hmpp <group1> loopb callsite, args[pdata;pmean].advancedload=true, asynchronous
	covarLoopb(data_Gpu, mean_Gpu);
	#pragma hmpp <group1> loopb synchronize
	#pragma hmpp <group1> loopc callsite, args[psymmat;pdata].advancedload=true, asynchronous
	covarLoopc(symmat_outputFromGpu, data_Gpu);
	#pragma hmpp <group1> loopc synchronize

	t_end = rtclock();
	fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);
    
    
	/* Download results and release device buffers. */
	#pragma hmpp <group1> loopb delegatedstore, args[pmean]
    
	#pragma hmpp <group1> loopc delegatedstore, args[psymmat;pdata]
	#pragma hmpp <group1> release
	
	t_start = rtclock();
	
	/* CPU reference run of the same three stages. */
	covarLoopa(mean, data, float_n);
	covarLoopb(data, mean);
	covarLoopc(symmat, data);
	
	t_end = rtclock();
	fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);
	
	compareResults(symmat, symmat_outputFromGpu);

	return 0;
}
예제 #13
0
/*
 * Correlation OpenCL driver: fused GPU kernels compute the correlation
 * matrix into symmat_outputFromGpu; the timed CPU reference fills
 * symmat; the two are compared. Optional argv[1] sets the global
 * cpu_offset work-partitioning knob.
 *
 * NOTE(review): symmat is allocated (M+1)*(N+1) although a correlation
 * matrix is (M+1)*(M+1) -- harmless when N >= M, but confirm.
 */
int main(int argc, char* argv[]) 
{
	double t_start, t_end;
	
	DATA_TYPE* data;
	DATA_TYPE* mean;
	DATA_TYPE* stddev;
	DATA_TYPE* symmat;
	DATA_TYPE* symmat_outputFromGpu;

	if(argc==2){
		printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
		cpu_offset = atoi(argv[1]);
	}

	data = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
	mean = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
	stddev = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
	symmat = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
	symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));

	/* Fail fast on OOM (the original never checked the allocations). */
	if (!data || !mean || !stddev || !symmat || !symmat_outputFromGpu) {
		fprintf(stderr, "memory allocation failed\n");
		free(data); free(mean); free(stddev);
		free(symmat); free(symmat_outputFromGpu);
		return 1;
	}
	
	init_arrays(data);
	read_cl_file();
	cl_initialization_fusion();
	cl_mem_init(data, mean, stddev, symmat);
	cl_load_prog();

	double start = rtclock();
	cl_launch_kernel();
	double end = rtclock(); 
	fprintf(stdout, "CAUTION:CPU offset %d %% GPU Runtime: %0.6lf s\n",cpu_offset, (end - start));

	/* Blocking read of the GPU result. */
	errcode = clEnqueueReadBuffer(clCommandQue[0], symmat_mem_obj, CL_TRUE, 0, (M+1) * (N+1) * sizeof(DATA_TYPE), symmat_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	t_start = rtclock();
	correlation(data, mean, stddev, symmat);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   


	compareResults(symmat, symmat_outputFromGpu);
	cl_clean_up();
	
	free(data);
	free(mean);
	free(stddev);
	free(symmat);
	free(symmat_outputFromGpu);
	
    return 0;
}
예제 #14
0
/* Minimal SYRK driver: run the GPU version, then time the CPU version
 * and report its runtime, and finally compare the two results. */
int main() {
  double cpu_begin, cpu_finish;

  init_arrays();
  syrkGPU();

  cpu_begin = rtclock();
  syrk();
  cpu_finish = rtclock();
  fprintf(stdout, "CPU Runtime: %0.6lfs\n", cpu_finish - cpu_begin);

  compareResults();
  return 0;
}
예제 #15
0
/*
 * Sparse matrix-matrix multiply (CSR x CSR) with AVX-512: for each
 * nonzero a = m1(i, cola), broadcast a and multiply it against 16
 * packed values of row `cola` of m2 (two 8-wide double vectors per
 * step), scattering each product into the per-row FastHash accumulator
 * result_map[i]. Padding entries are marked with column -1 and
 * skipped. `res` sums every product as a checksum / anti-DCE value.
 *
 * NOTE(review): the vector loads dereference raw __m512d/__m512i casts
 * of &vals[k] / &cols[k]; this assumes m2's row storage is padded to a
 * multiple of 16 and 64-byte aligned -- confirm against the Csr
 * allocator (misaligned deref of __m512d is undefined behavior).
 */
void SpMM(Csr<ValueType>* m1, Csr<ValueType>* m2, int num_buckets) {

  // One hash accumulator per output row.
  vector<FastHash<int, ValueType>* > result_map(m1->num_rows);
  for (auto& v : result_map) {
    v = new FastHash<int, ValueType>(num_buckets);
  }

  cout << "Starting SpMM..." << endl;

  float res = 0;
  double before = rtclock();
  for(int i=0;i<m1->num_rows;i++) {
    for(int j=m1->rows[i];j<m1->rows[i+1];j++) {
      int cola = m1->cols[j];
      // Broadcast the scalar a = m1(i, cola) into all 8 lanes.
      __m512d a = _mm512_set1_pd(m1->vals[j]);
      for(int k=m2->rows[cola];k<m2->rows[cola] + m2->row_lens[cola];k+=16) {
        __m512d *pb1 = (__m512d *)(&(m2->vals[k]));
        __m512d *pb2 = (__m512d *)(&(m2->vals[k]) + 8);
        __m512i *pcols = (__m512i *)(&(m2->cols[k]));
        __m512d c1 = _mm512_mul_pd(a, *pb1);
        __m512d c2 = _mm512_mul_pd(a, *pb2);
        // Scatter the first 8 products into the row's accumulator.
        for(int x=0;x<8;x++) {
          int col = ((int *)pcols)[x];
          if (col == -1) {
            continue;
          }
          ValueType val = ((ValueType *)(&c1))[x];
          result_map[i]->Reduce(col, val);
          res += val;
        }

        // Scatter the second 8 products (columns k+8 .. k+15).
        for (int x = 0; x < 8; ++x) {
          int col = ((int *)pcols)[x+8];
          if (col == -1) {
            continue;
          }
          ValueType val = ((ValueType *)(&c2))[x];
          result_map[i]->Reduce(col, val);
          res += val;
        }
      }
    }
  }

  double after = rtclock();
  cout << "res: " << res << endl;

  cout << RED << "[****Result****] ========> *SIMD Naive* time: " << after - before << " secs." << RESET << endl;

  for (auto& v : result_map) {
    delete v;
  }
}
예제 #16
0
/*
 * 3MM OpenMP driver: E = A.B, F = C.D, G = E.F computed once with the
 * OpenMP offload version (result in G_outputFromGpu) and once on the
 * CPU (result in G); both runs are timed and the G results compared.
 */
int main(int argc, char** argv)
{
  double t_start, t_end;

  DATA_TYPE* A;
  DATA_TYPE* B;
  DATA_TYPE* C;
  DATA_TYPE* D;
  DATA_TYPE* E;
  DATA_TYPE* F;
  DATA_TYPE* G;
  DATA_TYPE* G_outputFromGpu;

  A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
  B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
  C = (DATA_TYPE*)malloc(NJ*NM*sizeof(DATA_TYPE));
  D = (DATA_TYPE*)malloc(NM*NL*sizeof(DATA_TYPE));
  E = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
  F = (DATA_TYPE*)malloc(NJ*NL*sizeof(DATA_TYPE));
  G = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));
  G_outputFromGpu = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));

  /* Fail fast on OOM (the original never checked the allocations). */
  if (!A || !B || !C || !D || !E || !F || !G || !G_outputFromGpu) {
    fprintf(stderr, "memory allocation failed\n");
    free(A); free(B); free(C); free(D);
    free(E); free(F); free(G); free(G_outputFromGpu);
    return 1;
  }

  fprintf(stdout, "<< Linear Algebra: 3 Matrix Multiplications (E=A.B; F=C.D; G=E.F) >>\n");

  init_array(A, B, C, D);

  t_start = rtclock();
  mm3_OMP(A, B, C, D, E, F, G_outputFromGpu);
  t_end = rtclock();	

  fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);

  t_start = rtclock();
  mm3_cpu(A, B, C, D, E, F, G);
  t_end = rtclock();

  fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

  compareResults(G, G_outputFromGpu);

  free(A);
  free(B);
  free(C);
  free(D);
  free(E);
  free(F);
  free(G);
  free(G_outputFromGpu);

  return 0;
}
예제 #17
0
파일: floyd.c 프로젝트: intersense/pluto-gw
/*
 * Floyd-Warshall all-pairs shortest paths over the global
 * pathDistanceMatrix (NUM_NODES x NUM_NODES adjacency matrix with
 * edge weights), relaxing every pair through each intermediate node k.
 */
int main()
{
    int k, x, y;  /* the original's unused i, j, distance*, width,
                     and yXwidth locals were dropped */

    init_array();

#ifdef PERFCTR
    PERF_INIT; 
#endif

    IF_TIME(t_start = rtclock());

#pragma scop
    for(k=0; k < NUM_NODES; k++)
    {
        for(y=0; y < NUM_NODES; y++)
        {
            for(x=0; x < NUM_NODES; x++)
            {
                /* Relax path y->x through intermediate node k. */
                pathDistanceMatrix[y][x] = ((pathDistanceMatrix[y][k] + pathDistanceMatrix[k][x]) < pathDistanceMatrix[y][x]) ? (pathDistanceMatrix[y][k] + pathDistanceMatrix[k][x]):pathDistanceMatrix[y][x];
            }
        }
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stdout, "time = %0.6lfs\n", t_end - t_start));

#ifdef PERFCTR
    PERF_EXIT; 
#endif

    /* Close the ".test" probe handle instead of leaking it. */
    FILE *test_fp = fopen(".test", "r");
    if (test_fp) {
        fclose(test_fp);
#ifdef MPI
        if (my_rank == 0) {
            print_array();
        }
#else
        print_array();
#endif
    }

    return 0;
}
예제 #18
0
/*
 * 2MM driver (HMPP offload): C = A.B then E = C.D run as two codelets
 * on the GPU (results in C_gpu / E_outputFromGpu), then the same two
 * loops on the CPU, with both runs timed and the E results compared.
 */
int main(int argc, char** argv)
{
    double t_start, t_end;

    /* Array declaration */
    DATA_TYPE A[NI][NK];
    DATA_TYPE B[NK][NJ];
    DATA_TYPE C[NI][NJ];
    DATA_TYPE C_gpu[NI][NJ];
    DATA_TYPE D[NJ][NL];
    DATA_TYPE E[NI][NL];
    DATA_TYPE E_outputFromGpu[NI][NL];

    /* Initialize array. */
    init_array(A, B, C, C_gpu, D, E, E_outputFromGpu);

/* Device allocation and uploads happen before the timer starts. */
#pragma hmpp <group1> allocate

#pragma hmpp <group1> loopa advancedload, args[a;b;c]
#pragma hmpp <group1> loopb advancedload, args[d;e]

    t_start = rtclock();
/* loopa is synchronized before loopb consumes C_gpu. */
#pragma hmpp <group1> loopa callsite, args[a;b;c].advancedload=true, asynchronous
    twoMMloopa(A, B, C_gpu);

#pragma hmpp <group1> loopa synchronize
#pragma hmpp <group1> loopb callsite, args[c;d;e].advancedload=true, asynchronous
    twoMMloopb(C_gpu, D, E_outputFromGpu);
#pragma hmpp <group1> loopb synchronize

    t_end = rtclock();
    fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);

/* Download results and release device buffers. */
#pragma hmpp <group1> loopa delegatedstore, args[a;b]
#pragma hmpp <group1> loopb delegatedstore, args[c;d;e]

#pragma hmpp <group1> release

    t_start = rtclock();

    /* CPU reference run of the same two matrix products. */
    twoMMloopa(A, B, C);
    twoMMloopb(C, D, E);

    t_end = rtclock();
    fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

    compareResults(E, E_outputFromGpu);

    return 0;
}
예제 #19
0
/*
 * BiCG OpenCL driver: the GPU kernels compute s and q from A, r, p
 * (read back into s_outputFromGpu / q_outputFromGpu), then the timed
 * CPU reference runs bicg_cpu, and the outputs are compared.
 */
int main(void) 
{
	double t_start, t_end;
	
	DATA_TYPE* A;
	DATA_TYPE* r;
	DATA_TYPE* s;
	DATA_TYPE* p;
	DATA_TYPE* q;
	DATA_TYPE* s_outputFromGpu;
	DATA_TYPE* q_outputFromGpu;
 	
	A = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE));
	r = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
	s = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
	p = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
	q = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
	s_outputFromGpu = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
	q_outputFromGpu = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));

	/* Fail fast on OOM (the original never checked the allocations). */
	if (!A || !r || !s || !p || !q || !s_outputFromGpu || !q_outputFromGpu) {
		fprintf(stderr, "memory allocation failed\n");
		free(A); free(r); free(s); free(p); free(q);
		free(s_outputFromGpu); free(q_outputFromGpu);
		return 1;
	}
	
	init_array(A, p, r);	
	read_cl_file();
	cl_initialization();
	cl_mem_init(A, r, s, p, q);
	cl_load_prog();

	cl_launch_kernel();

	/* Accumulate the error status with |=: the original overwrote the
	 * first read's status with the second, silently dropping any
	 * failure of the s-buffer read. */
	errcode = clEnqueueReadBuffer(clCommandQue, s_mem_obj, CL_TRUE, 0, NY*sizeof(DATA_TYPE), s_outputFromGpu, 0, NULL, NULL);
	errcode |= clEnqueueReadBuffer(clCommandQue, q_mem_obj, CL_TRUE, 0, NX*sizeof(DATA_TYPE), q_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");  

	t_start = rtclock();
	bicg_cpu(A, r, s, p, q);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(s, s_outputFromGpu, q, q_outputFromGpu);
	cl_clean_up();
	
	free(A);
	free(r);
	free(s);
	free(p);
	free(q);
	free(s_outputFromGpu);
	free(q_outputFromGpu);
	
    	return 0;
}
예제 #20
0
/*
 * Correlation driver (HMPP offload): run the correlation codelet on
 * the GPU (result in symmat_outputFromGpu) and then on the CPU
 * (result in symmat), timing each run and comparing the results.
 * NOTE(review): locals m and n are set but never used in the visible
 * code -- possibly consumed by the hmpp codelet; confirm.
 */
int main(int argc, char** argv)
{
	int m = M;
	int n = N;
	double t_start, t_end;

	/* Array declaration */
	DATA_TYPE float_n = 321414134.01;
	DATA_TYPE eps = 0.005;
	DATA_TYPE data[M + 1][N + 1];
	DATA_TYPE data_Gpu[M + 1][N + 1];
	DATA_TYPE mean[M + 1];
	DATA_TYPE mean_Gpu[M + 1];
	DATA_TYPE stddev[M + 1];
	DATA_TYPE stddev_Gpu[M + 1];
	DATA_TYPE symmat[M + 1][M + 1];
	DATA_TYPE symmat_outputFromGpu[M + 1][M + 1];

	/* Initialize array. */
	init_arrays(data, data_Gpu);
	
	/* Device allocation and uploads happen before the timer starts. */
	#pragma hmpp corr allocate
    
	#pragma hmpp corr advancedload, args[pdata;psymmat;pstddev;pmean;pfloat_n;peps]

	t_start = rtclock();
	
	/* Asynchronous launch; the synchronize pragma blocks until done. */
	#pragma hmpp corr callsite, args[pdata;psymmat;pstddev;pmean;pfloat_n;peps].advancedload=true, asynchronous
	runCorr(data_Gpu, symmat_outputFromGpu, stddev_Gpu, mean_Gpu, float_n, eps);
    
	#pragma hmpp corr synchronize

	t_end = rtclock();
	fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);
    
	/* Download results and release device buffers. */
	#pragma hmpp corr delegatedstore, args[pdata;psymmat;pstddev;pmean]
	#pragma hmpp corr release
	
	t_start = rtclock();
	
	runCorr(data, symmat, stddev, mean, float_n, eps);
	
	t_end = rtclock();
	fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);
	
	compareResults(symmat, symmat_outputFromGpu);

	return 0;
}
예제 #21
0
/*
 * BiCG OpenMP driver: run the offload version (results in s_GPU/q_GPU)
 * and the CPU reference (results in s/q), timing both, then compare.
 */
int main(int argc, char** argv)
{
  double t_start, t_end;

  DATA_TYPE* A;
  DATA_TYPE* r;
  DATA_TYPE* s;
  DATA_TYPE* p;
  DATA_TYPE* q;
  DATA_TYPE* s_GPU;
  DATA_TYPE* q_GPU;
 	
  A = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE));
  r = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
  s = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
  p = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
  q = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
  s_GPU = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
  q_GPU = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));

  /* Fail fast on OOM (the original never checked the allocations). */
  if (!A || !r || !s || !p || !q || !s_GPU || !q_GPU) {
    fprintf(stderr, "memory allocation failed\n");
    free(A); free(r); free(s); free(p); free(q);
    free(s_GPU); free(q_GPU);
    return 1;
  }

  fprintf(stdout, "<< BiCG Sub Kernel of BiCGStab Linear Solver >>\n");

  init_array(A, p, r);

  t_start = rtclock();
  bicg_OMP(A, r, s_GPU, p, q_GPU);
  t_end = rtclock();

  fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);

  t_start = rtclock();
  bicg_cpu(A, r, s, p, q);
  t_end = rtclock();

  fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

  compareResults(s, s_GPU, q, q_GPU);

  free(A);
  free(r);
  free(s);
  free(p);
  free(q);
  free(s_GPU);
  free(q_GPU);

  return 0;
}
예제 #22
0
/*
 * Launch the two BiCG kernels in sequence: clKernel1 over an NX-sized
 * 1-D range (q = A*p side), then clKernel2 over an NY-sized range
 * (s = A^T*r side). Each launch is followed by clFinish, so the timed
 * span covers complete execution of both kernels.
 */
void cl_launch_kernel()
{
	double t_start, t_end;

	int nx=NX;
	int ny=NY;
	
	size_t localWorkSize[2], globalWorkSize[2];
	localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
	localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
	/* 1-D launch: NX items rounded up to whole work-groups. */
	globalWorkSize[0] = (size_t)ceil(((float)NX) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = 1;

	t_start = rtclock();	
	
	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&p_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&q_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), &nx);
        errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), &ny);
	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");

	// Execute the 1st OpenCL kernel
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clFinish(clCommandQue);
	
	/* Second launch covers NY items. */
	globalWorkSize[0] = (size_t)ceil(((float)NY) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = 1;

	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&r_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&s_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), &nx);
        errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), &ny);
	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");
	
	// Execute the 2nd OpenCL kernel
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clFinish(clCommandQue);

	t_end = rtclock();
	fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
예제 #23
0
int main()
{
	int t, i, j, k, l, m, n;

	init_array() ;

#ifdef PERFCTR
    PERF_INIT;
#endif

	IF_TIME(t_start = rtclock());

#pragma scop
    for(t=0; t<tmax; t++)  {
        for (j=0; j<ny; j++)
            ey[0][j] = t;
        for (i=1; i<nx; i++)
            for (j=0; j<ny; j++)
                ey[i][j] = ey[i][j] - 0.5*(hz[i][j]-hz[i-1][j]);
        for (i=0; i<nx; i++)
            for (j=1; j<ny; j++)
                ex[i][j] = ex[i][j] - 0.5*(hz[i][j]-hz[i][j-1]);
        for (i=0; i<nx; i++)
            for (j=0; j<ny; j++)
                hz[i][j]=hz[i][j]-0.7*(ex[i][j+1]-ex[i][j]+ey[i+1][j]-ey[i][j]);
    }
#pragma endscop

    IF_TIME(t_end = rtclock());
    IF_TIME(fprintf(stdout, "%0.6lfs\n", t_end - t_start));

#ifdef PERFCTR
    PERF_EXIT;
#endif

    if (fopen(".test", "r")) {
#ifdef __MPI
        if (my_rank == 0) {
            print_array();
        }
#else
        print_array();
#endif
    }

    return 0;
}
예제 #24
0
/*
 * LU decomposition (no pivoting) of the global N x N matrix A, timed
 * over REPS repetitions; prints the mean per-repetition time, or the
 * decomposed matrix when TEST is defined. Returns (int)A[0][0] so the
 * computation cannot be optimized away.
 */
int main()
{
  init_arrays();

  double t_begin = 0, t_finish = 0, elapsed = 0;
  int rep;

  for (rep = 0; rep < REPS; rep++) {
    t_begin = rtclock();

    int k, i, j;
    for (k = 0; k <= N-1; k++) {
      /* Scale the pivot row. */
      for (j = k+1; j <= N-1; j++) {
        A[k][j] = A[k][j] / A[k][k];
      }
      /* Eliminate below the pivot. */
      for (i = k+1; i <= N-1; i++) {
        for (j = k+1; j <= N-1; j++) {
          A[i][j] = A[i][j] - A[i][k] * A[k][j];
        }
      }
    }

    t_finish = rtclock();
    elapsed += t_finish - t_begin;
  }

  elapsed = elapsed / REPS;

#ifndef TEST
  printf("%f\n", elapsed);
#else
  {
    int i, j;
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++) {
        if (j % 100 == 0)
          printf("\n");
        printf("%f ", A[i][j]);
      }
      printf("\n");
    }
  }
#endif

  return ((int) A[0][0]);

}
예제 #25
0
파일: mvt.c 프로젝트: lnangong/polybenchGpu
/*
 * MVT driver (HMPP offload): run the mvt codelet on the GPU into
 * x1_outputFromGpu / x2_outputFromGpu, then the same routine on the
 * CPU into x1 / x2, timing each run, and compare the results.
 */
int main()
{
	double t_start, t_end;
	
	DATA_TYPE a[N][N];
	DATA_TYPE x1[N];
	DATA_TYPE x1_outputFromGpu[N];
	DATA_TYPE x2[N];
	DATA_TYPE x2_outputFromGpu[N];
	DATA_TYPE y1[N];
	DATA_TYPE y2[N];


	//initialize the arrays for running on the CPU and GPU
    	init_array(a, x1, x1_outputFromGpu, x2, x2_outputFromGpu, y1, y2);

	/* Device allocation and uploads happen before the timer starts. */
	#pragma hmpp mvt allocate

	#pragma hmpp mvt advancedload, args[a,x1,x2,y1,y2]

	t_start = rtclock();
	
	//run the algorithm on the GPU
	#pragma hmpp mvt callsite, args[x1,x2].advancedload=true, asynchronous
	runMvt(a, x1_outputFromGpu, x2_outputFromGpu, y1, y2); // parameters are initialized in decls.h and are initialized with init_array()

	/* Blocks until the asynchronous launch completes. */
	#pragma hmpp mvt synchronize

	t_end = rtclock();
	fprintf(stderr, "GPU Runtime: %0.6lf\n", t_end - t_start);
	
	/* Download x1/x2 and release device buffers. */
	#pragma hmpp mvt delegatedstore, args[x1,x2]

	#pragma hmpp mvt release

	t_start = rtclock();
	
	//run the algorithm on the CPU
	runMvt(a, x1, x2, y1, y2);
	
	t_end = rtclock();
	fprintf(stderr, "CPU Runtime: %0.6lf\n", t_end - t_start);
	
	compareResults(x1, x1_outputFromGpu, x2, x2_outputFromGpu);

	return 0;
}
예제 #26
0
/*
 * Triangular matrix-product benchmark on global NMAX x NMAX arrays:
 * initializes a, b, c, runs the kernel, and dumps c to stderr when a
 * ".test" marker file exists.
 */
int main()
{
  int i,j, k;
  double t_start, t_end;

  for (i = 0; i < NMAX; i++) {
    for (j = 0; j < NMAX; j++) {
      c[i][j] = 0.0;
      a[i][j] = b[i][j] = i*j*0.5 / NMAX;
    }
  }

  IF_TIME(t_start = rtclock());

#pragma scop
  for (i=0; i<NMAX; i++) {
    for (j=0; j<NMAX; j++) {
      for (k=0; k<j-1; k++) {
        c[i][k] += a[j][k] * b[i][j];
        /* NOTE(review): this diagonal term is accumulated once per k
         * iteration AND once more after the k loop -- it looks like an
         * accidental duplicate, but it is kept because the benchmark's
         * reference output depends on it; confirm upstream. */
        c[i][j] += a[j][j] * b[i][j];
      }
      c[i][j] += a[j][j] * b[i][j];
    }
  }
#pragma endscop

  IF_TIME(t_end = rtclock());
  IF_TIME(fprintf(stdout, "%0.6lfs\n", t_end - t_start));

  /* Close the ".test" probe handle instead of leaking it (the original
   * called fopen() in the condition and dropped the FILE*). */
  FILE *test_fp = fopen(".test", "r");
  if (test_fp)  {
      fclose(test_fp);
#ifdef MPI
      if (my_rank == 0) {
#endif
          for (i = 0; i < NMAX; i++) {
              for (j = 0; j < NMAX; j++) {
                  fprintf(stderr, "%lf ", c[i][j]);
              }
              fprintf(stderr, "\n");
          }
#ifdef MPI
      }
#endif
  }

  return 0;
}
예제 #27
0
/*
 * Doitgen OpenCL driver: kernels 1/2 run per r-slice on the GPU
 * (result read into sum), the CPU reference fills sum_2 from the
 * independent copies A_2/C4_2, and the two results are compared.
 */
int main(void) {
	double t_start, t_end;

	DATA_TYPE* A, *A_2;
	DATA_TYPE* C4, *C4_2;
	DATA_TYPE* sum, *sum_2;

	A = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));
	C4 = (DATA_TYPE*)malloc(NP * NP * sizeof(DATA_TYPE));
	sum = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));

	A_2 = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));
	C4_2 = (DATA_TYPE*)malloc(NP * NP * sizeof(DATA_TYPE));
	sum_2 = (DATA_TYPE*)malloc(NR * NQ * NP * sizeof(DATA_TYPE));

	/* Fail fast on OOM (the original never checked the allocations). */
	if (!A || !C4 || !sum || !A_2 || !C4_2 || !sum_2) {
		fprintf(stderr, "memory allocation failed\n");
		free(A); free(C4); free(sum);
		free(A_2); free(C4_2); free(sum_2);
		return 1;
	}

	init_array(A, C4);
	init_array(A_2, C4_2);

	read_cl_file();
	cl_initialization();
	cl_mem_init(A, C4, sum);
	cl_load_prog();
	t_start = rtclock();

	int r;	

	/* One kernel pair per outer r-slice of the 3-D tensor. */
	for (r = 0; r < NR; r++)
	{
		cl_launch_kernel1(r);
		cl_launch_kernel2(r);
	}

	t_end = rtclock();
	errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NR * NQ * NP * sizeof(DATA_TYPE), sum, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");
	
	fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
	t_start = rtclock();
	doitgen(sum_2, A_2, C4_2);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(sum, sum_2);
	cl_clean_up();

	/* Release host buffers (the original leaked all six). */
	free(A);
	free(C4);
	free(sum);
	free(A_2);
	free(C4_2);
	free(sum_2);
    	return 0;
}
예제 #28
0
/*
 * GESUMMV OpenCL driver: the GPU kernel computes y from A, B, x (read
 * back into y_outputFromGpu), then the timed CPU reference runs
 * gesummv, and the two y vectors are compared.
 */
int main(void) 
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* B;  
	DATA_TYPE* x;  
	DATA_TYPE* y;
	DATA_TYPE* y_outputFromGpu;
	DATA_TYPE* tmp;
	
	A = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE));
	x = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); 
	y = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
	y_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
	tmp = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));

	/* Fail fast on OOM (the original never checked the allocations). */
	if (!A || !B || !x || !y || !y_outputFromGpu || !tmp) {
		fprintf(stderr, "memory allocation failed\n");
		free(A); free(B); free(x);
		free(y); free(y_outputFromGpu); free(tmp);
		return 1;
	}

	init(A, x);
	read_cl_file();
	cl_initialization();
	cl_mem_init(A, B, x, y, tmp);
	cl_load_prog();

	cl_launch_kernel();

	/* Blocking read of the GPU result. */
	errcode = clEnqueueReadBuffer(clCommandQue, y_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), y_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	t_start = rtclock();
	gesummv(A, B, x, y, tmp);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(y, y_outputFromGpu);
	cl_clean_up();
	
	free(A);
	free(B);  
	free(x);  
	free(y);
	free(y_outputFromGpu);
	free(tmp);

	return 0;
}
예제 #29
0
/*
 * GEMVER-style driver (HMPP offload): the GPU run writes its w-result
 * into wi, the CPU run writes into w, and the two are compared.
 * NOTE(review): the callsite pragma lists "w" in its args while the
 * GPU call actually passes wi, and delegatedstore downloads args[w] --
 * confirm the hmpp arg names map onto the codelet's formal parameters
 * (not these locals) so the right buffer is copied back.
 */
int main(int argc, char** argv)
{
	double t_start, t_end;

	/* Array declaration.  */
	DATA_TYPE A[N][N];
	DATA_TYPE x[N];
	DATA_TYPE u1[N];
	DATA_TYPE u2[N];
	DATA_TYPE v2[N];
	DATA_TYPE v1[N];
	DATA_TYPE w[N];
	DATA_TYPE wi[N];
	DATA_TYPE y[N];
	DATA_TYPE z[N];

	/* Initialize array. */
	init(A, x, u1, u2, v2, v1, w, wi, y, z);
    
	/* Device allocation and uploads happen before the timer starts. */
	#pragma hmpp conv allocate
	#pragma hmpp conv advancedload, args[A;x;u1;u2;v2;v1;w;y;z]

	t_start = rtclock();

	/* GPU run: result vector is wi. */
	#pragma hmpp conv callsite, args[A;x;u1;u2;v2;v1;w;y;z].advancedload=true, asynchronous
	loop(A, x, u1, u2, v2, v1, wi, y, z);
	#pragma hmpp conv synchronize

	t_end = rtclock();//);
	fprintf(stderr, "GPU Runtime: %0.6lfs\n", t_end - t_start);
    
	/* Download the result and release device buffers. */
	#pragma hmpp conv delegatedstore, args[w]
	#pragma hmpp conv release
	
	t_start = rtclock();

	/* CPU reference run: result vector is w. */
	loop(A, x, u1, u2, v2, v1, w, y, z);

	t_end = rtclock();
	fprintf(stderr, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	compareResults(w, wi);

	return 0;
}
예제 #30
0
/* Start the PolyBench timer: cycle counter (rdtsc) when
 * POLYBENCH_CYCLE_ACCURATE_TIMER is defined, wall clock otherwise. */
void polybench_timer_start()
{
  polybench_prepare_instruments ();
#ifdef POLYBENCH_CYCLE_ACCURATE_TIMER
  polybench_c_start = rdtsc ();
#else
  polybench_t_start = rtclock ();
#endif
}