예제 #1
0
/*
 * Host driver for the MVT OpenCL benchmark.
 *
 * Obtains the problem size from getNewSizes, allocates and initialises the
 * host arrays, builds the OpenCL program, launches the kernels, reads the
 * x1/x2 device buffers back and passes everything to runMvt before releasing
 * the resources.
 *
 * Returns 0 on success and 1 when the OpenCL program fails to build.
 */
int main(void) {
  DATA_TYPE *a;
  DATA_TYPE *x1;
  DATA_TYPE *x2;
  DATA_TYPE *x1_outputFromGpu;
  DATA_TYPE *x2_outputFromGpu;
  DATA_TYPE *y_1;
  DATA_TYPE *y_2;

  /////////////////////////
  // Let getNewSizes pick the size used for "mvt_kernel1" and store it in N
  // before anything is allocated with it.
  size_t oldSizes[1] = { N };
  size_t newSizes[1];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "mvt_kernel1", 1);
  N = newSizes[0];
  /////////////////////////

  a = (DATA_TYPE *)malloc(N * N * sizeof(DATA_TYPE));
  x1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  x2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  x1_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  x2_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  y_1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  y_2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));

  init_arrays(a, x1, x2, y_1, y_2);

  platform = new Platform(PLATFORM_ID);
  context = platform->getContext();
  Device device = platform->getDevice(DEVICE_ID);
  Queue queue(*context,device,Queue::EnableProfiling);

  cl_mem_init(a, x1, x2, y_1, y_2,queue);

  Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME);
  if(!program.build(device)){
    std::cout << "Error building the program: \n";
    std::cout <<program.getBuildLog(device);

    // A program that failed to build has nothing valid to launch, so stop
    // here instead of creating kernels from it. Only the host buffers are
    // released: cl_clean_up is not called because its preconditions (e.g.
    // whether it expects the kernels to exist) are not visible from here.
    free(a);
    free(x1);
    free(x2);
    free(x1_outputFromGpu);
    free(x2_outputFromGpu);
    free(y_1);
    free(y_2);
    return 1;
  }
  kernel1=program.createKernel(kernel1Name.c_str());
  kernel2=program.createKernel(kernel2Name.c_str());
  cl_launch_kernel(queue);

  // Fetch both result vectors, then wait for the queue to drain.
  queue.readBuffer(*x1_mem_obj,N * sizeof(DATA_TYPE), x1_outputFromGpu);
  queue.readBuffer(*x2_mem_obj,N * sizeof(DATA_TYPE), x2_outputFromGpu);
  queue.finish();

  runMvt(a, x1, x2, y_1, y_2, x1_outputFromGpu,x2_outputFromGpu);
  cl_clean_up();

  free(a);
  free(x1);
  free(x2);
  free(x1_outputFromGpu);
  free(x2_outputFromGpu);
  free(y_1);
  free(y_2);

  return 0;
}
예제 #2
0
/*
 * Host driver for the 3MM benchmark (fusion variant).
 *
 * When exactly one command-line argument is given it is parsed with atoi and
 * stored in cpu_offset. The driver then allocates the matrices, sets up
 * OpenCL, launches the kernels, reads g_mem_obj back into G_outputFromGpu,
 * times mm3_cpu on the host and passes both results to compareResults.
 */
int main(int argc, char* argv[]) 
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* B;
	DATA_TYPE* C;
	DATA_TYPE* D;
	DATA_TYPE* E;
	DATA_TYPE* F;
	DATA_TYPE* G;
	DATA_TYPE* G_outputFromGpu;

	if(argc==2){
		printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
		cpu_offset = atoi(argv[1]);
	}

	A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
	C = (DATA_TYPE*)malloc(NJ*NM*sizeof(DATA_TYPE));
	D = (DATA_TYPE*)malloc(NM*NL*sizeof(DATA_TYPE));
	E = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
	F = (DATA_TYPE*)malloc(NJ*NL*sizeof(DATA_TYPE));
	G = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));
	G_outputFromGpu = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));

	init_array(A, B, C, D);
	read_cl_file();
	cl_initialization_fusion();
	//cl_initialization();
	cl_mem_init(A, B, C, D, E, F, G);
	cl_load_prog();

	cl_launch_kernel();

	/* Blocking read (CL_TRUE) of the device result into the host buffer. */
	errcode = clEnqueueReadBuffer(clCommandQue[0], g_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NL, G_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	/* Time the host implementation and report it. */
	t_start = rtclock();
	mm3_cpu(A, B, C, D, E, F, G);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(G, G_outputFromGpu);
	cl_clean_up();

	free(A);
	free(B);
	free(C);
	free(D);
	free(E);
	free(F);
	free(G);
	free(G_outputFromGpu);

	return 0;
}
예제 #3
0
/*
 * Host driver for the SYR2K benchmark (PolyBench array macros).
 *
 * Declares A and B as NI x NJ and C / C_outputFromGpu as NI x NI, sets up
 * OpenCL, launches the kernel and reads c_mem_obj back. With RUN_ON_CPU the
 * host routine syr2kCpu is timed and compared against the device output;
 * otherwise the device output is passed to print_array through
 * polybench_prevent_dce.
 */
int main(int argc, char *argv[])
{
	/* Retrieve problem size. */
	int ni = NI;
	int nj = NJ;

	/* Variable declaration/allocation. */
	DATA_TYPE alpha;
	DATA_TYPE beta;
	POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj);
	POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj);
	POLYBENCH_2D_ARRAY_DECL(C,DATA_TYPE,NI,NI,ni,ni);
	POLYBENCH_2D_ARRAY_DECL(C_outputFromGpu,DATA_TYPE,NI,NI,ni,ni);

	init_arrays(ni, nj, &alpha, &beta, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(C));
	read_cl_file();
	cl_initialization();
	cl_mem_init(POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(C));
	cl_load_prog();

	cl_launch_kernel(ni, nj, alpha, beta);

	/*
	 * C_outputFromGpu is declared NI x NI above, so exactly that many elements
	 * are read back. The previous NI*NJ size would write past the end of the
	 * host array whenever NJ > NI.
	 * NOTE(review): assumes c_mem_obj holds at least NI*NI elements; confirm
	 * against cl_mem_init.
	 */
	errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NI*NI*sizeof(DATA_TYPE), POLYBENCH_ARRAY(C_outputFromGpu), 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");


	#ifdef RUN_ON_CPU

		/* Start timer. */
		polybench_start_instruments;

		syr2kCpu(ni, nj, alpha, beta, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(C));

		/* Stop and print timer. */
		printf("CPU Time in seconds:\n");
		polybench_stop_instruments;
		polybench_print_instruments;

		compareResults(ni, POLYBENCH_ARRAY(C), POLYBENCH_ARRAY(C_outputFromGpu));

	#else //prevent dead code elimination

		polybench_prevent_dce(print_array(ni, POLYBENCH_ARRAY(C_outputFromGpu)));

	#endif //RUN_ON_CPU


	cl_clean_up();

	POLYBENCH_FREE_ARRAY(A);
	POLYBENCH_FREE_ARRAY(B);
	POLYBENCH_FREE_ARRAY(C);
	POLYBENCH_FREE_ARRAY(C_outputFromGpu);

	return 0;
}
예제 #4
0
int main(int argc, char* argv[]) 
//int main(void) 
{
	double t_start, t_end;
	
	DATA_TYPE* data;
	DATA_TYPE* mean;
	DATA_TYPE* stddev;
	DATA_TYPE* symmat;
	DATA_TYPE* symmat_outputFromGpu;
        if(argc==2){
          printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
          cpu_offset = atoi(argv[1]);
        }


	data = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
	mean = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
	stddev = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
	symmat = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
	symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
	
	init_arrays(data);
	read_cl_file();
	cl_initialization_fusion();
	//cl_initialization();
	cl_mem_init(data, mean, stddev, symmat);
	cl_load_prog();

	double start = rtclock();
	cl_launch_kernel();
	double end = rtclock(); 
	fprintf(stdout, "CAUTION:CPU offset %d %% GPU Runtime: %0.6lf s\n",cpu_offset, (end - start));
	//fprintf(stdout, "CAUTION:CPU offset %d %% GPU Runtime: %0.6lf s\n",cpu_offset, 1000*(end - start));

	errcode = clEnqueueReadBuffer(clCommandQue[0], symmat_mem_obj, CL_TRUE, 0, (M+1) * (N+1) * sizeof(DATA_TYPE), symmat_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	t_start = rtclock();
	correlation(data, mean, stddev, symmat);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   


	compareResults(symmat, symmat_outputFromGpu);
	cl_clean_up();
	
	free(data);
	free(mean);
	free(stddev);
	free(symmat);
	free(symmat_outputFromGpu);
	
    return 0;
}
예제 #5
0
/*
 * Host driver for the FDTD-2D benchmark (PolyBench array macros).
 *
 * Declares the arrays, initialises them, sets up OpenCL, launches the kernel
 * and reads hz_mem_obj back into hz_outputFromGpu. With RUN_ON_CPU defined
 * the host routine runFdtd is run between the PolyBench instrument macros
 * and its result is passed to compareResults; otherwise the device output
 * is passed to print_array through polybench_prevent_dce.
 */
int main(int argc, char *argv[])
{
	/* Problem sizes taken from the compile-time constants. */
	int tmax = TMAX;
	int nx = NX;
	int ny = NY;

	POLYBENCH_1D_ARRAY_DECL(_fict_,DATA_TYPE,TMAX,TMAX);
	POLYBENCH_2D_ARRAY_DECL(ex,DATA_TYPE,NX,NY,nx,ny);
	POLYBENCH_2D_ARRAY_DECL(ey,DATA_TYPE,NX,NY,nx,ny);
	POLYBENCH_2D_ARRAY_DECL(hz,DATA_TYPE,NX,NY,nx,ny);
	POLYBENCH_2D_ARRAY_DECL(hz_outputFromGpu,DATA_TYPE,NX,NY,nx,ny);
	
	init_arrays(tmax, nx, ny, POLYBENCH_ARRAY(_fict_), POLYBENCH_ARRAY(ex), POLYBENCH_ARRAY(ey), POLYBENCH_ARRAY(hz));

	/* OpenCL set-up: kernel source, platform/context, buffers, program. */
	read_cl_file();
	cl_initialization();
	cl_mem_init(POLYBENCH_ARRAY(_fict_), POLYBENCH_ARRAY(ex), POLYBENCH_ARRAY(ey), POLYBENCH_ARRAY(hz));
	cl_load_prog();

	cl_launch_kernel(tmax, nx, ny);

	/* Blocking read (CL_TRUE) of NX*NY elements from the device hz buffer. */
	errcode = clEnqueueReadBuffer(clCommandQue, hz_mem_obj, CL_TRUE, 0, NX * NY * sizeof(DATA_TYPE), POLYBENCH_ARRAY(hz_outputFromGpu), 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");	

	#ifdef RUN_ON_CPU

		/* Start timer. */
	  	polybench_start_instruments;

		runFdtd(tmax, nx, ny, POLYBENCH_ARRAY(_fict_), POLYBENCH_ARRAY(ex), POLYBENCH_ARRAY(ey), POLYBENCH_ARRAY(hz));
	
		/* Stop and print timer. */
		printf("CPU Time in seconds:\n");
	  	polybench_stop_instruments;
	 	polybench_print_instruments;

		compareResults(nx, ny, POLYBENCH_ARRAY(hz), POLYBENCH_ARRAY(hz_outputFromGpu));

	#else //prevent dead code elimination

		polybench_prevent_dce(print_array(nx, ny, POLYBENCH_ARRAY(hz_outputFromGpu)));

	#endif //RUN_ON_CPU

	/* Host arrays are released first, then the OpenCL objects. */
	POLYBENCH_FREE_ARRAY(_fict_);
	POLYBENCH_FREE_ARRAY(ex);
	POLYBENCH_FREE_ARRAY(ey);
	POLYBENCH_FREE_ARRAY(hz);
	POLYBENCH_FREE_ARRAY(hz_outputFromGpu);

	cl_clean_up();
	
    return 0;
}
예제 #6
0
/*
 * Gram-Schmidt benchmark driver.
 *
 * The problem size (M, N) is passed through getNewSizes once per kernel
 * name; M and N are updated after every call. The device run result is read
 * back from a_mem_obj. The host reference run and comparison are disabled
 * in this variant, so R and Q are only allocated and freed.
 */
int main(void)
{
	/////////////////////////
	// Kernel 1: start from the current M, N.
	size_t oldSizes[2] = { M, N };
	size_t newSizes[2];
	getNewSizes(oldSizes, NULL, newSizes, NULL, "gramschmidt_kernel1", 2);
	M = newSizes[0];
	N = newSizes[1];

	// Kernel 2: feed the previous result back in.
	getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel2", 2);
	M = newSizes[0];
	N = newSizes[1];

	// Kernel 3: same again.
	getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel3", 2);
	M = newSizes[0];
	N = newSizes[1];
	/////////////////////////

	// All four host matrices have the same footprint.
	const size_t matrixBytes = M*N*sizeof(DATA_TYPE);
	DATA_TYPE* A = (DATA_TYPE*)malloc(matrixBytes);
	DATA_TYPE* A_outputFromGpu = (DATA_TYPE*)malloc(matrixBytes);
	DATA_TYPE* R = (DATA_TYPE*)malloc(matrixBytes);
	DATA_TYPE* Q = (DATA_TYPE*)malloc(matrixBytes);

	init_array(A);
	read_cl_file();
	cl_initialization(device_id, clGPUContext, clCommandQue);
	cl_mem_init(A);
	cl_load_prog();

	cl_launch_kernel();

	// Blocking read of the device result.
	errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0,
	                              M*N*sizeof(DATA_TYPE), A_outputFromGpu,
	                              0, NULL, NULL);
	if (errcode != CL_SUCCESS)
	{
		printf("Error in reading GPU mem\n");
	}

	// Host reference run and comparison are intentionally disabled here:
	//	gramschmidt(A, R, Q);
	//	compareResults(A, A_outputFromGpu);
	cl_clean_up();

	free(A);
	free(A_outputFromGpu);
	free(R);
	free(Q);

	return 0;
}
예제 #7
0
/*
 * Host driver for the ATAX benchmark (PolyBench array macros).
 *
 * Declares the arrays, initialises x and A, sets up OpenCL, launches the
 * kernel and reads y_mem_obj back into y_outputFromGpu. With RUN_ON_CPU
 * defined the host routine atax_cpu is run between the PolyBench instrument
 * macros and compared against the device output; otherwise the device
 * output is passed to print_array.
 */
int main(void) 
{
	/* Problem sizes taken from the compile-time constants. */
	int nx = NX;
	int ny = NY;

	POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NX,NY,nx,ny);
	POLYBENCH_1D_ARRAY_DECL(x,DATA_TYPE,NY,ny);
	POLYBENCH_1D_ARRAY_DECL(y,DATA_TYPE,NY,ny);
	POLYBENCH_1D_ARRAY_DECL(y_outputFromGpu,DATA_TYPE,NY,ny);
	POLYBENCH_1D_ARRAY_DECL(tmp,DATA_TYPE,NX,nx);

	init_array(nx, ny, POLYBENCH_ARRAY(x), POLYBENCH_ARRAY(A));

	/* OpenCL set-up: kernel source, platform/context, buffers, program. */
	read_cl_file();
	cl_initialization();
	cl_mem_init(POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(x), POLYBENCH_ARRAY(y), POLYBENCH_ARRAY(tmp));
	cl_load_prog();

	cl_launch_kernel(nx, ny);

	/* Blocking read (CL_TRUE) of NY elements from the device y buffer. */
	errcode = clEnqueueReadBuffer(clCommandQue, y_mem_obj, CL_TRUE, 0, NY*sizeof(DATA_TYPE), POLYBENCH_ARRAY(y_outputFromGpu), 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	#ifdef RUN_ON_CPU

		/* Start timer. */
	  	polybench_start_instruments;

		atax_cpu(nx, ny, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(x), POLYBENCH_ARRAY(y), POLYBENCH_ARRAY(tmp));
	
		/* Stop and print timer. */
		printf("CPU Time in seconds:\n");
	  	polybench_stop_instruments;
	 	polybench_print_instruments;

		compareResults(ny, POLYBENCH_ARRAY(y), POLYBENCH_ARRAY(y_outputFromGpu));

	#else

		/* No host reference run: hand the device output to print_array. */
		print_array(ny, POLYBENCH_ARRAY(y_outputFromGpu));

	#endif //RUN_ON_CPU

	cl_clean_up();
	
	POLYBENCH_FREE_ARRAY(A);
	POLYBENCH_FREE_ARRAY(x);
	POLYBENCH_FREE_ARRAY(y);
	POLYBENCH_FREE_ARRAY(y_outputFromGpu);
	POLYBENCH_FREE_ARRAY(tmp);
	
	return 0;
}
예제 #8
0
/*
 * Host driver for the 3D convolution benchmark (PolyBench array macros).
 *
 * Declares A, B and B_outputFromGpu, initialises A, sets up OpenCL, launches
 * the kernel and reads b_mem_obj back into B_outputFromGpu. With RUN_ON_CPU
 * defined the host routine conv3D is run between the PolyBench instrument
 * macros and compared against the device output; otherwise the device
 * output is passed to print_array through polybench_prevent_dce.
 */
int main(int argc, char *argv[])
{	
	/* Problem sizes taken from the compile-time constants. */
	int ni = NI;
	int nj = NJ;
	int nk = NK;

	POLYBENCH_3D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,NK,ni,nj,nk);
	POLYBENCH_3D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,NK,ni,nj,nk);
	POLYBENCH_3D_ARRAY_DECL(B_outputFromGpu,DATA_TYPE,NI,NJ,NK,ni,nj,nk);

	init(ni, nj, nk, POLYBENCH_ARRAY(A));

	/* OpenCL set-up: kernel source, platform/context, buffers, program. */
	read_cl_file();
	cl_initialization();
	cl_mem_init(POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B));
	cl_load_prog();

	cl_launch_kernel(ni, nj, nk);

	/* Blocking read (CL_TRUE) of NI*NJ*NK elements from the device B buffer. */
	errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj, CL_TRUE, 0, NI * NJ * NK * sizeof(DATA_TYPE), POLYBENCH_ARRAY(B_outputFromGpu), 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	#ifdef RUN_ON_CPU

		/* Start timer. */
	  	polybench_start_instruments;

		conv3D(ni, nj, nk, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B));
	
		/* Stop and print timer. */
		printf("CPU Time in seconds:\n");
	  	polybench_stop_instruments;
	 	polybench_print_instruments;

		compareResults(ni, nj, nk, POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(B_outputFromGpu));

	#else //prevent dead code elimination

		polybench_prevent_dce(print_array(ni, nj, nk, POLYBENCH_ARRAY(B_outputFromGpu)));

	#endif //RUN_ON_CPU

	cl_clean_up();

	POLYBENCH_FREE_ARRAY(A);
	POLYBENCH_FREE_ARRAY(B);
	POLYBENCH_FREE_ARRAY(B_outputFromGpu);

	return 0;
}
예제 #9
0
/*
 * 2D convolution benchmark driver (C++ OpenCL wrapper variant).
 *
 * The problem size is obtained from getNewSizes, the input is initialised,
 * the program is built (returning 1 on a build failure), the kernel is
 * launched and the device output is read back and passed to conv2D.
 */
int main(int argc, char *argv[])
{
  /////////////////////////
  // Let getNewSizes pick NI / NJ for "Convolution2D_kernel".
  size_t oldSizes[2] = { NI, NJ };
  size_t newSizes[2];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "Convolution2D_kernel", 2);
  NI = newSizes[0];
  NJ = newSizes[1];
  /////////////////////////

  // Input image and the buffer that receives the device result.
  DATA_TYPE *A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
  DATA_TYPE *B_outputFromGpu = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));

  init(A);

  // Platform, context, device and a profiling-enabled queue.
  platform = new Platform(PLATFORM_ID);
  context = platform->getContext();
  Device device = platform->getDevice(DEVICE_ID);
  Queue queue(*context, device, Queue::EnableProfiling);

  cl_mem_init(A, queue);

  // Build the program from the kernel source file.
  SourceFile kernelFile = KERNEL_DIRECTORY KERNEL_FILE_NAME;
  Program program(context, kernelFile);
  const bool built = program.build(device);
  if (!built)
  {
    std::cout << "Error building the program: " << "\n";
    std::cout << program.getBuildLog(device) << "\n";
    return 1;
  }

  // Create the kernel object and run it.
  kernel = program.createKernel(kernelName.c_str());
  cl_launch_kernel(queue);

  // Read the result back and wait for the queue to drain.
  queue.readBuffer(*b_mem_obj, NI * NJ * sizeof(DATA_TYPE), (void*) B_outputFromGpu);
  queue.finish();

  conv2D(A, B_outputFromGpu);

  free(A);
  free(B_outputFromGpu);

  cl_clean_up();
  return 0;
}
예제 #10
0
/*
 * Host driver for the BiCG benchmark.
 *
 * Allocates the host arrays, sets up OpenCL, launches the kernels, reads the
 * s and q device buffers back, times bicg_cpu on the host and passes both
 * pairs of results to compareResults.
 */
int main(void) 
{
	double t_start, t_end;
	
	DATA_TYPE* A;
	DATA_TYPE* r;
	DATA_TYPE* s;
	DATA_TYPE* p;
	DATA_TYPE* q;
	DATA_TYPE* s_outputFromGpu;
	DATA_TYPE* q_outputFromGpu;
 	
	A = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE));
	r = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
	s = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
	p = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
	q = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
	s_outputFromGpu = (DATA_TYPE*)malloc(NY*sizeof(DATA_TYPE));
	q_outputFromGpu = (DATA_TYPE*)malloc(NX*sizeof(DATA_TYPE));
	
	init_array(A, p, r);	
	read_cl_file();
	cl_initialization();
	cl_mem_init(A, r, s, p, q);
	cl_load_prog();

	cl_launch_kernel();

	/*
	 * Each blocking read is checked on its own. Previously the status of the
	 * first read was overwritten by the second before being tested, so a
	 * failure reading s went unreported.
	 */
	errcode = clEnqueueReadBuffer(clCommandQue, s_mem_obj, CL_TRUE, 0, NY*sizeof(DATA_TYPE), s_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");
	errcode = clEnqueueReadBuffer(clCommandQue, q_mem_obj, CL_TRUE, 0, NX*sizeof(DATA_TYPE), q_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");  

	/* Time the host implementation and report it. */
	t_start = rtclock();
	bicg_cpu(A, r, s, p, q);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(s, s_outputFromGpu, q, q_outputFromGpu);
	cl_clean_up();
	
	free(A);
	free(r);
	free(s);
	free(p);
	free(q);
	free(s_outputFromGpu);
	free(q_outputFromGpu);
	
	return 0;
}
예제 #11
0
/*
 * Host driver for the ATAX benchmark (kernel 2 only, C++ OpenCL wrapper).
 *
 * Obtains NY from getNewSizes, allocates and initialises the host arrays,
 * builds the OpenCL program, launches the kernel, reads y_mem_obj back into
 * y and passes the arrays to atax_cpu before releasing the resources.
 *
 * Returns 0 on success and 1 when the OpenCL program fails to build.
 */
int main(void) {

  DATA_TYPE *A;
  DATA_TYPE *y;
  DATA_TYPE *tmp;

  /////////////////////////
  // Let getNewSizes pick the size used for "atax_kernel2" and store it in NY
  // before anything is allocated with it.
  size_t oldSizes[1] = { NY };
  size_t newSizes[1];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "atax_kernel2", 1);
  NY = newSizes[0];
  /////////////////////////

  A = (DATA_TYPE *)malloc(NX_DEFAULT * NY * sizeof(DATA_TYPE));
  y = (DATA_TYPE *)malloc(NY * sizeof(DATA_TYPE));
  tmp = (DATA_TYPE *)malloc(NX_DEFAULT * sizeof(DATA_TYPE));

  init_array(tmp, A);
  
  platform = new Platform(PLATFORM_ID);
  context = platform->getContext();
  Device device = platform->getDevice(DEVICE_ID);
  Queue queue(*context,device,Queue::EnableProfiling); 
  
  cl_mem_init(A, y, tmp,queue);

  Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME);
  if(!program.build(device)){
    std::cout << "Error building the program: \n";
    std::cout <<program.getBuildLog(device); 

    // A program that failed to build has nothing valid to launch, so stop
    // here instead of creating a kernel from it. Only the host buffers are
    // released: cl_clean_up is not called because its preconditions (e.g.
    // whether it expects the kernel to exist) are not visible from here.
    free(A);
    free(y);
    free(tmp);
    return 1;
  }

  kernel2=program.createKernel(kernel2Name.c_str());
  cl_launch_kernel(queue);

  // Fetch the result vector, then wait for the queue to drain.
  queue.readBuffer(*y_mem_obj,NY * sizeof(DATA_TYPE), y);
  queue.finish();

  atax_cpu(A, tmp, y);
  cl_clean_up();

  free(A);
  free(y);
  free(tmp);

  return 0;
}
예제 #12
0
int main(void) 
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* B;  
	DATA_TYPE* x;  
	DATA_TYPE* y;
	DATA_TYPE* y_outputFromGpu;
	DATA_TYPE* tmp;
	
	A = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE));
	x = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); 
	y = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
	y_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
	tmp = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));

	init(A, x);
	read_cl_file();
	cl_initialization();
	cl_mem_init(A, B, x, y, tmp);
	cl_load_prog();

	cl_launch_kernel();

	errcode = clEnqueueReadBuffer(clCommandQue, y_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), y_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	t_start = rtclock();
	gesummv(A, B, x, y, tmp);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(y, y_outputFromGpu);
	cl_clean_up();
	
	free(A);
	free(B);  
	free(x);  
	free(y);
	free(y_outputFromGpu);
	free(tmp);

	return 0;
}
예제 #13
0
int main(int argc, char* argv[]) 
//int main(void) 
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* C;
	DATA_TYPE* C_outputFromGpu;
        if(argc==2){
          printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
          cpu_offset = atoi(argv[1]);
        }


	A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	C_outputFromGpu = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));

	init_arrays(A, C);
	read_cl_file();
	cl_initialization_fusion();
	//cl_initialization();
	cl_mem_init(A, C);
	cl_load_prog();

	cl_launch_kernel();

	errcode = clEnqueueReadBuffer(clCommandQue[0], c_mem_obj, CL_TRUE, 0, M * N * sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");  

	t_start = rtclock();
	syrk(A, C);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(C, C_outputFromGpu);
	cl_clean_up();
	
	free(A);
	free(C);
	free(C_outputFromGpu);

	return 0;
}
예제 #14
0
/*
 * Host driver for the FDTD-2D benchmark (plain malloc variant).
 *
 * Allocates the host arrays, sets up OpenCL, launches the kernels, reads
 * hz_mem_obj back into hz_outputFromGpu, times runFdtd on the host and
 * passes both hz arrays to compareResults.
 */
int main(void) 
{
	double t_start, t_end;
	
	DATA_TYPE* _fict_;
	DATA_TYPE* ex;
	DATA_TYPE* ey;
	DATA_TYPE* hz;
	DATA_TYPE* hz_outputFromGpu;

	/* ex and ey are allocated with one extra column / row respectively. */
	_fict_ = (DATA_TYPE*)malloc(TMAX*sizeof(DATA_TYPE));
	ex = (DATA_TYPE*)malloc(NX*(NY+1)*sizeof(DATA_TYPE));
	ey = (DATA_TYPE*)malloc((NX+1)*NY*sizeof(DATA_TYPE));
	hz = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE));
	hz_outputFromGpu = (DATA_TYPE*)malloc(NX*NY*sizeof(DATA_TYPE));
	
	init_arrays(_fict_, ex, ey, hz);
	read_cl_file();
	cl_initialization();
	cl_mem_init(_fict_, ex, ey, hz);
	cl_load_prog();

	cl_launch_kernel();

	/* Blocking read (CL_TRUE) of the device result into the host buffer. */
	errcode = clEnqueueReadBuffer(clCommandQue, hz_mem_obj, CL_TRUE, 0, NX * NY * sizeof(DATA_TYPE), hz_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");	

	/* Time the host implementation and report it. */
	t_start = rtclock();
	runFdtd(_fict_, ex, ey, hz);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(hz, hz_outputFromGpu);
	cl_clean_up();
	
	free(_fict_);
	free(ex);
	free(ey);
	free(hz);
	free(hz_outputFromGpu);
	
	return 0;
}
예제 #15
0
/*
 * GEMM benchmark driver (getNewSizes variant).
 *
 * NJ and NI are obtained from getNewSizes for the "gemm" kernel and NK is
 * set equal to NJ. The driver then allocates the matrices, sets up OpenCL,
 * launches the kernel, reads c_mem_obj back and passes all four matrices to
 * the host routine gemm.
 */
int main(void)
{
	/////////////////////////
	// Resolve the problem size before any allocation depends on it.
	size_t oldSizes[2] = { NJ, NI };
	size_t newSizes[2];
	getNewSizes(oldSizes, NULL, newSizes, NULL, "gemm", 2);
	NJ = newSizes[0];
	NI = newSizes[1];
	NK = NJ;
	/////////////////////////

	DATA_TYPE* A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
	DATA_TYPE* B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
	DATA_TYPE* C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
	DATA_TYPE* C_outputFromGpu = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));

	// Host data and OpenCL set-up.
	init(A, B, C);
	read_cl_file();
	cl_initialization(device_id, clGPUContext, clCommandQue);
	cl_mem_init(A, B, C);
	cl_load_prog();

	cl_launch_kernel();

	// Blocking read of the device result.
	errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0,
	                              NI*NJ*sizeof(DATA_TYPE), C_outputFromGpu,
	                              0, NULL, NULL);
	if (errcode != CL_SUCCESS)
	{
		printf("Error in reading GPU mem\n");
	}

	gemm(A, B, C, C_outputFromGpu);
	cl_clean_up();

	free(A);
	free(B);
	free(C);
	free(C_outputFromGpu);

	return 0;
}
예제 #16
0
int main(void) 
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* B;
	DATA_TYPE* C;
	DATA_TYPE* C_outputFromGpu;

	A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	B = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	C_outputFromGpu = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));

	init_arrays(A, B, C);
	read_cl_file();
	cl_initialization();
	cl_mem_init(A, B, C);
	cl_load_prog();

	cl_launch_kernel();

	errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, N*M*sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	t_start = rtclock();
	syr2k(A, B, C);
	t_end = rtclock(); 
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);   
	compareResults(C, C_outputFromGpu);
	cl_clean_up();

	free(A);
	free(B);
	free(C);
	free(C_outputFromGpu);

	return 0;
}
예제 #17
0
/*
 * GEMM benchmark driver with ctuning/OpenME hooks.
 *
 * The repeat count defaults to 1 and is overridden by the CT_REPEAT_MAIN
 * environment variable when it is set. The device section (launch + read
 * back) and the host section (gemm) are each repeated that many times, with
 * optional OpenME callbacks around them. A failed read-back terminates the
 * program with exit(1).
 */
int main(void)
{
  /* ctuning repeat control */
  long pass;
  long passLimit = 1;
  const char* repeatEnv;

  DATA_TYPE* A;
  DATA_TYPE* B;
  DATA_TYPE* C;
  DATA_TYPE* C_outputFromGpu;

#ifdef OPENME
  openme_init(NULL,NULL,NULL,0);
  openme_callback("PROGRAM_START", NULL);
#endif

  /* Optional override of the repeat count from the environment. */
  repeatEnv = getenv("CT_REPEAT_MAIN");
  if (repeatEnv != NULL)
  {
    passLimit = atol(repeatEnv);
  }

  A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
  B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
  C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
  C_outputFromGpu = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));

  /* Seed, fill the inputs, then bring up OpenCL. */
  srand(1);
  init(A, B, C);
  read_cl_file();
  cl_initialization();
  cl_mem_init(A, B, C);
  cl_load_prog();

#ifdef OPENME
  openme_callback("ACC_KERNEL_START", NULL);
#endif
  /* Device section: launch and read back once per pass. */
  pass = 0;
  while (pass < passLimit)
  {
    cl_launch_kernel();

    err_code = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0,
                                   NI*NJ*sizeof(DATA_TYPE), C_outputFromGpu,
                                   0, NULL, NULL);
    if (err_code != CL_SUCCESS)
    {
      printf("Error in reading GPU mem\n");
      exit(1);
    }
    ++pass;
  }
#ifdef OPENME
  openme_callback("ACC_KERNEL_END", NULL);
#endif

  /* Re-seed and re-fill the inputs before the host section. */
  srand(1);
  init(A, B, C);

#ifdef OPENME
  openme_callback("KERNEL_START", NULL);
#endif
  /* Host section: same number of passes. */
  pass = 0;
  while (pass < passLimit)
  {
    gemm(A, B, C);
    ++pass;
  }
#ifdef OPENME
  openme_callback("KERNEL_END", NULL);
#endif

  compareResults(C, C_outputFromGpu);
  cl_clean_up();

  free(A);
  free(B);
  free(C);
  free(C_outputFromGpu);

#ifdef OPENME
  openme_callback("PROGRAM_END", NULL);
#endif

  return 0;
}
예제 #18
0
/*
 * Timed host driver for the MVT benchmark.
 *
 * Built in one of two modes: with ALOCACAO_NORMAL the host arrays are
 * malloc'd here and copied to/from the device with explicit write/read
 * calls; without it those allocations and copies are skipped in this
 * function. Each phase is timed with rtclock, stored in the corresponding
 * tmp_* variable, and the timings are printed at the end.
 */
int main(void) 
{
#ifdef ALOCACAO_NORMAL
    printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n");
#else
    printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n");
#endif
    double t_start, t_end;
    double t_start_init, t_end_init;
    double t_start_init_off, t_end_init_off;
    double total_kernel;

#ifdef ALOCACAO_NORMAL
    a = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE));
    x1_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
    x2_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
    y_1 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
    y_2 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
#endif
    x1 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));
    x2 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE));

    t_start_init = rtclock();
    read_cl_file();
    t_end_init = rtclock();
    tmp_read_cl_file = t_end_init - t_start_init;
    total_kernel = t_end_init - t_start_init;

    t_start_init = rtclock();
#ifndef MALI
    cl_initialization();
#else
    cl_initialization_Mali();
#endif
    t_end_init = rtclock();
    tmp_cl_initialization = t_end_init - t_start_init;
    /* Accumulate: a plain assignment here discarded the read_cl_file time. */
    total_kernel += t_end_init - t_start_init;

    t_start_init = rtclock();
    cl_mem_init();
    t_end_init = rtclock();
    tmp_cl_mem_init= t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

    /* The timing of init() is taken inside the function itself. */
    init();


    /*------------GPU---------------*/
    /* Start of the timed device phase. */

#ifdef ALOCACAO_NORMAL
    {
        /* Tracks a failure in any of the writes, not only the last one. */
        int write_failed = 0;

        t_start_init_off = rtclock();
        errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N * N, a, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, x1_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, x1, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, x2_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, x2, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, y1_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, y_1, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, y2_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, y_2, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        if(write_failed) printf("Error in writing buffers\n");
        t_end_init_off = rtclock();
        tmp_clEnqueueWriteBuffer += t_end_init_off - t_start_init_off;
        total_kernel += t_end_init_off - t_start_init_off;
    }
#endif

    t_start_init = rtclock();
    cl_load_prog();
    t_end_init = rtclock();
    tmp_cl_load_prog= t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

    t_start_init = rtclock();
    cl_launch_kernel();
    t_end_init = rtclock();
    tmp_cl_launch_kernel += t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

#ifdef ALOCACAO_NORMAL
    {
        /* Tracks a failure in either read, not only the last one. */
        int read_failed = 0;

        t_start_init_off = rtclock();
        errcode = clEnqueueReadBuffer(clCommandQue, x1_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), x1_outputFromGpu, 0, NULL, NULL);
        read_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueReadBuffer(clCommandQue, x2_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), x2_outputFromGpu, 0, NULL, NULL);
        read_failed |= (errcode != CL_SUCCESS);
        if(read_failed) printf("Error in reading GPU mem\n");   
        t_end_init_off = rtclock();
        tmp_clEnqueueReadBuffer += t_end_init_off - t_start_init_off; 
        total_kernel += t_end_init_off - t_start_init_off;
    }
#endif

    /*--------------CPU------------------*/

    t_start = rtclock();
    runMvt();
    t_end = rtclock(); 
    tmp_serial = t_end - t_start;  


    compareResults(x1, x1_outputFromGpu, x2, x2_outputFromGpu);

    t_start_init_off = rtclock();
    cl_clean_up();
    t_end_init_off = rtclock();
    tmp_cl_clean_up+=t_end_init_off - t_start_init_off; 
    total_kernel += t_end_init_off - t_start_init_off;


    free(x1);
    free(x2);
#ifdef ALOCACAO_NORMAL
    free(a);
    free(x1_outputFromGpu);
    free(x2_outputFromGpu);
    free(y_1);
    free(y_2);
#endif

    printf("\n-------RESULTS-------\n");
    printf("Sizes N=%d\n\n", N);

    printf("read_cl_file -------------> %lf\n", tmp_read_cl_file);
    printf("cl_initialization --------> %lf\n", tmp_cl_initialization);
    printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init);
    printf("init ---------------------> %lf\n", tmp_init);
    printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog);
    printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel);   
    printf("serialExecution ----------> %lf\n", tmp_serial);
    printf("cl_clean_up --------------> %lf\n", tmp_cl_clean_up);
    printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer);
    printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer);
    printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer);
    printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject);

    return 0;
}
예제 #19
0
/*
 * Correlation benchmark driver with ctuning/OpenME hooks.
 *
 * The repeat count defaults to 1 and is overridden by the CT_REPEAT_MAIN
 * environment variable when it is set. The device section (launch + read
 * back) and the host section (correlation) are each repeated that many
 * times, with optional OpenME callbacks around them. A failed read-back
 * terminates the program with exit(1).
 */
int main(void)
{
  /* ctuning repeat control */
  long pass;
  long passLimit = 1;
  const char* repeatEnv;

  DATA_TYPE* data;
  DATA_TYPE* mean;
  DATA_TYPE* stddev;
  DATA_TYPE* symmat;
  DATA_TYPE* symmat_outputFromGpu;

#ifdef OPENME
  openme_init(NULL,NULL,NULL,0);
  openme_callback("PROGRAM_START", NULL);
#endif

  /* Optional override of the repeat count from the environment. */
  repeatEnv = getenv("CT_REPEAT_MAIN");
  if (repeatEnv != NULL)
  {
    passLimit = atol(repeatEnv);
  }

  data = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
  mean = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
  stddev = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
  symmat = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
  symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));

  /* Seed, fill the input, then bring up OpenCL. */
  srand(1);
  init_arrays(data);
  read_cl_file();
  cl_initialization();
  cl_mem_init(data, mean, stddev, symmat);
  cl_load_prog();

#ifdef OPENME
  openme_callback("ACC_KERNEL_START", NULL);
#endif
  /* Device section: launch and read back once per pass. */
  pass = 0;
  while (pass < passLimit)
  {
    cl_launch_kernel();

    err_code = clEnqueueReadBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, 0,
                                   (M+1) * (N+1) * sizeof(DATA_TYPE),
                                   symmat_outputFromGpu, 0, NULL, NULL);
    if (err_code != CL_SUCCESS)
    {
      printf("Error in reading GPU mem\n");
      exit(1);
    }
    ++pass;
  }
#ifdef OPENME
  openme_callback("ACC_KERNEL_END", NULL);
#endif

  /* Re-seed and re-fill the input before the host section. */
  srand(1);
  init_arrays(data);

#ifdef OPENME
  openme_callback("KERNEL_START", NULL);
#endif
  /* Host section: same number of passes. */
  pass = 0;
  while (pass < passLimit)
  {
    correlation(data, mean, stddev, symmat);
    ++pass;
  }
#ifdef OPENME
  openme_callback("KERNEL_END", NULL);
#endif

  compareResults(symmat, symmat_outputFromGpu);
  cl_clean_up();

  free(data);
  free(mean);
  free(stddev);
  free(symmat);
  free(symmat_outputFromGpu);

#ifdef OPENME
  openme_callback("PROGRAM_END", NULL);
#endif

  return 0;
}
예제 #20
0
/*
 * Timed host driver for the 2MM benchmark.
 *
 * Built in one of two modes: with ALOCACAO_NORMAL the arrays A, B, D and
 * E_outputFromGpu are malloc'd here and data is copied to/from the device
 * with explicit write/read calls; without it those allocations and copies
 * are skipped in this function. C and E are always malloc'd here. Each
 * phase is timed with rtclock, stored in the corresponding tmp_* variable,
 * and the timings are printed at the end.
 */
int main(void) 
{
#ifdef ALOCACAO_NORMAL
    printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n");
#else
    printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n");
#endif
    double t_start, t_end;
    double t_start_init, t_end_init;
    double t_offload_start, t_offload_end, t_start_init_off, t_end_init_off;
    double total_kernel;

#ifdef ALOCACAO_NORMAL
    A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE));
    B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));
    D = (DATA_TYPE*)malloc(NJ*NL*sizeof(DATA_TYPE));
    E_outputFromGpu = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));
#endif

    C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
    E = (DATA_TYPE*)malloc(NI*NL*sizeof(DATA_TYPE));

    t_start_init = rtclock();
    read_cl_file();
    t_end_init = rtclock();
    tmp_read_cl_file = t_end_init - t_start_init;
    total_kernel = t_end_init - t_start_init;

    t_start_init = rtclock();
#ifndef MALI
    cl_initialization();
#else
    cl_initialization_Mali();
#endif
    t_end_init = rtclock();
    tmp_cl_initialization = t_end_init - t_start_init;
    /* Accumulate: a plain assignment here discarded the read_cl_file time. */
    total_kernel += t_end_init - t_start_init;

    t_start_init = rtclock();
    cl_mem_init();
    t_end_init = rtclock();
    tmp_cl_mem_init= t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

    /* The timing of init() is taken inside the function itself. */
    init();


    /*------------GPU---------------*/
    /* Start of the timed device phase. */

#ifdef ALOCACAO_NORMAL
    {
        /* Tracks a failure in any of the writes, not only the last one. */
        int write_failed = 0;

        t_offload_start = rtclock();
        errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NK, A, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, b_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NK * NJ, B, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NJ, C, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, d_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NJ * NL, D, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        errcode = clEnqueueWriteBuffer(clCommandQue, e_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NL, E, 0, NULL, NULL);
        write_failed |= (errcode != CL_SUCCESS);
        if(write_failed)printf("Error in writing buffers\n");
        t_offload_end = rtclock();
        tmp_clEnqueueWriteBuffer += t_offload_end - t_offload_start; 
        total_kernel+=t_offload_end - t_offload_start;
    }
#endif

    t_start_init = rtclock();
    cl_load_prog();
    t_end_init = rtclock();
    tmp_cl_load_prog= t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

    t_start_init = rtclock();
    cl_launch_kernel();
    t_end_init = rtclock();
    tmp_cl_launch_kernel += t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

#ifdef ALOCACAO_NORMAL
    t_offload_start = rtclock();
    errcode = clEnqueueReadBuffer(clCommandQue, e_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NL, E_outputFromGpu, 0, NULL, NULL);
    if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");
    t_offload_end = rtclock();
    tmp_clEnqueueReadBuffer += t_offload_end - t_offload_start; 
    total_kernel+=t_offload_end - t_offload_start;
#endif

    /*-------------CPU---------------*/


    t_start = rtclock();
    mm2_cpu(A, B, C, D, E);
    t_end = rtclock(); 
    tmp_serial = t_end - t_start;  

    compareResults();


    t_start_init_off = rtclock();
    cl_clean_up();
    t_end_init_off = rtclock();
    tmp_cl_clean_up+=t_end_init_off - t_start_init_off; 
    total_kernel += t_end_init_off - t_start_init_off;



#ifdef ALOCACAO_NORMAL
    free(A);
    free(B);
    free(D);
    free(E_outputFromGpu);
#endif

    /*
     * C and E are malloc'd in both build modes above, so both are released
     * unconditionally. C used to be freed only under ALOCACAO_NORMAL and
     * leaked otherwise.
     */
    free(C);
    free(E);

    printf("\n-------RESULTS-------\n");
    printf("Sizes NI=%d, NJ=%d, NK=%d e NL=%d\n\n", NI, NJ, NK, NL);

    printf("read_cl_file -------------> %lf\n", tmp_read_cl_file);
    printf("cl_initialization --------> %lf\n", tmp_cl_initialization);
    printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init);
    printf("init ---------------------> %lf\n", tmp_init);
    printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog);
    printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel);   
    printf("serialExecution ----------> %lf\n", tmp_serial);
    printf("cl_clean_up --------------> %lf\n", tmp_cl_clean_up);
    printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer);
    printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer);
    printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer);
    printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject);

    return 0;
}
예제 #21
0
/*
 * Benchmark driver for the covariance computation.
 *
 * Allocates the host matrices, runs each OpenCL setup/execution phase while
 * timing it with rtclock(), runs the serial covariance() reference, compares
 * the two results with compareResults(), releases resources and prints the
 * per-phase timings.
 *
 * When ALOCACAO_NORMAL is defined, the host <-> device transfers are done
 * explicitly here with clEnqueueWriteBuffer / clEnqueueReadBuffer and the
 * `mean` and `symmat_outputFromGpu` host buffers are allocated (and freed) by
 * this function. Otherwise those steps are skipped here.
 *
 * Returns 0 on completion, 1 if a host allocation fails.
 */
int main(void) 
{
#ifdef ALOCACAO_NORMAL
    printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n");
#else
    printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n");
#endif
    double t_start, t_end;
    double t_start_init, t_end_init;
    double t_start_init_off, t_end_init_off;
    double total_kernel;
    int alloc_failed;


#ifdef ALOCACAO_NORMAL
    mean = (DATA_TYPE*)malloc((M + 1)*sizeof(DATA_TYPE));
    symmat_outputFromGpu = (DATA_TYPE*)malloc((M + 1)*(M + 1)*sizeof(DATA_TYPE));	
#endif		
    data = (DATA_TYPE*)malloc((M + 1)*(N + 1)*sizeof(DATA_TYPE));
    symmat = (DATA_TYPE*)malloc((M + 1)*(M + 1)*sizeof(DATA_TYPE));

    /* Bail out early instead of dereferencing a NULL buffer later on. */
    alloc_failed = (data == NULL) || (symmat == NULL);
#ifdef ALOCACAO_NORMAL
    alloc_failed = alloc_failed || (mean == NULL) || (symmat_outputFromGpu == NULL);
#endif
    if (alloc_failed)
    {
        printf("Error allocating host memory\n");
        /* free(NULL) is a no-op, so releasing every pointer is safe. */
        free(data);
        free(symmat);
#ifdef ALOCACAO_NORMAL
        free(mean);
        free(symmat_outputFromGpu);
#endif
        return 1;
    }

    t_start_init = rtclock();
    read_cl_file();
    t_end_init = rtclock();
    tmp_read_cl_file = t_end_init - t_start_init;
    total_kernel = t_end_init - t_start_init;

    t_start_init = rtclock();
#ifndef MALI
    cl_initialization();
#else
    cl_initialization_Mali();
#endif
    t_end_init = rtclock();
    tmp_cl_initialization = t_end_init - t_start_init;
    /* Accumulate (the original assigned here, discarding the read_cl_file time). */
    total_kernel += t_end_init - t_start_init;

    t_start_init = rtclock();
    cl_mem_init();
    t_end_init = rtclock();
    tmp_cl_mem_init= t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

    /* Per the original note, init() measures its own time internally
       (tmp_init is reported below), so it is not timed here. */
    init();


    //------------GPU---------------
    // GPU timing starts here

#ifdef ALOCACAO_NORMAL
    t_start_init_off = rtclock();
    /* OR the return codes together so a failure in any of the three writes
       is reported, not only a failure of the last one. */
    errcode = clEnqueueWriteBuffer(clCommandQue, data_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * (M+1) * (N+1), data, 0, NULL, NULL);
    /* symmat is allocated above with (M+1)*(M+1) elements; the transfer size
       must match it to avoid reading past the end of the host buffer. */
    errcode |= clEnqueueWriteBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * (M+1) * (M+1), symmat, 0, NULL, NULL);
    errcode |= clEnqueueWriteBuffer(clCommandQue, mean_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * (M+1), mean, 0, NULL, NULL);
    if(errcode != CL_SUCCESS)printf("Error in writing buffers\n");
    t_end_init_off = rtclock();
    tmp_clEnqueueWriteBuffer += t_end_init_off - t_start_init_off; 
    total_kernel += t_end_init_off - t_start_init_off;
#endif

    t_start_init = rtclock();
    cl_load_prog();
    t_end_init = rtclock();
    tmp_cl_load_prog= t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

    t_start_init = rtclock();
    cl_launch_kernel();
    t_end_init = rtclock();
    tmp_cl_launch_kernel += t_end_init - t_start_init;
    total_kernel += t_end_init - t_start_init;

#ifdef ALOCACAO_NORMAL
    t_start_init_off = rtclock();
    /* symmat_outputFromGpu holds (M+1)*(M+1) elements; reading more than
       that would write past the end of the host buffer. */
    errcode = clEnqueueReadBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, 0, (M+1) * (M+1) * sizeof(DATA_TYPE), symmat_outputFromGpu, 0, NULL, NULL);
    if(errcode != CL_SUCCESS) printf("Error in reading GPU mem =%d\n", errcode);
    t_end_init_off = rtclock();
    tmp_clEnqueueReadBuffer += t_end_init_off - t_start_init_off;
    total_kernel += t_end_init_off - t_start_init_off;
#endif

    //--------------CPU------------------
    /* Serial reference run, timed separately from the OpenCL phases. */
    t_start = rtclock();
    covariance();
    t_end = rtclock(); 
    tmp_serial = t_end - t_start; 


    compareResults(symmat, symmat_outputFromGpu);

    t_start_init_off = rtclock();
    cl_clean_up();
    t_end_init_off = rtclock();
    tmp_cl_clean_up+=t_end_init_off - t_start_init_off; 
    total_kernel += t_end_init_off - t_start_init_off;

#ifdef ALOCACAO_NORMAL	
    free(symmat_outputFromGpu);
    free(mean);
#endif
    free(symmat);
    free(data);	

    printf("\n-------RESULTS-------\n");
    printf("Sizes N=%d e M=%d\n\n", N, M);

    printf("read_cl_file -------------> %lf\n", tmp_read_cl_file);
    printf("cl_initialization --------> %lf\n", tmp_cl_initialization);
    printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init);
    printf("init ---------------------> %lf\n", tmp_init);
    printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog);
    printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel);   
    printf("serialExecution ----------> %lf\n", tmp_serial);
    printf("cl_clean_up --------------> %lf\n", tmp_cl_clean_up);
    printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer);
    printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer);
    printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer);
    printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject);

    return 0;
}