示例#1
0
/**
 * Enqueues mvt_kernel1 and mvt_kernel2 on `queue`, then calls queue.finish().
 *
 * Both kernels are launched over a one-dimensional global range of N. Before
 * each launch, getNewSizes() is asked to fill the local work size for that
 * kernel, starting from DIM_LOCAL_WORK_GROUP_X.
 *
 * @param queue Command queue both kernels are submitted to.
 */
void cl_launch_kernel(Queue& queue) {
  // Scalar kernel argument; setArgument() is given its address and size.
  int sizeArg = N;

  size_t globalWorkSize[1];
  globalWorkSize[0] = N;

  // Requested work-group size handed to getNewSizes() for each kernel.
  size_t requestedLocalSize[1];
  requestedLocalSize[0] = DIM_LOCAL_WORK_GROUP_X;

  // Work-group size used for the launches; overwritten before each kernel.
  size_t localWorkSize[1];

  // ---- Kernel 1 ----
  getNewSizes(NULL, requestedLocalSize, NULL, localWorkSize, "mvt_kernel1", 1);

  kernel1->setArgument(0, *a_mem_obj);
  kernel1->setArgument(1, *x1_mem_obj);
  kernel1->setArgument(2, *y1_mem_obj);
  kernel1->setArgument(3, sizeof(int), (void *)&sizeArg);

  queue.run(*kernel1, 1, 0, globalWorkSize, localWorkSize);

  // ---- Kernel 2 ----
  getNewSizes(NULL, requestedLocalSize, NULL, localWorkSize, "mvt_kernel2", 1);

  kernel2->setArgument(0, *a_mem_obj);
  kernel2->setArgument(1, *x2_mem_obj);
  kernel2->setArgument(2, *y2_mem_obj);
  kernel2->setArgument(3, sizeof(int), (void *)&sizeArg);

  queue.run(*kernel2, 1, 0, globalWorkSize, localWorkSize);

  queue.finish();
}
int main(void) 
{
	DATA_TYPE* A;
	DATA_TYPE* A_outputFromGpu;
	DATA_TYPE* R;
	DATA_TYPE* Q;
	
  /////////////////////////
  // Kernel 1.
  size_t oldSizes[2] = { M, N };
  size_t newSizes[2];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "gramschmidt_kernel1", 2);
  M = newSizes[0];
  N = newSizes[1];

  // Kernel 2.
  getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel2", 2);
  M = newSizes[0];
  N = newSizes[1];

  // Kernel 3.
  getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel3", 2);
  M = newSizes[0];
  N = newSizes[1];
  /////////////////////////

	A = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE));
	A_outputFromGpu = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE));
	R = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE));  
	Q = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE));  

	init_array(A);
	read_cl_file();
  cl_initialization(device_id, clGPUContext, clCommandQue);
	cl_mem_init(A);
	cl_load_prog();

	cl_launch_kernel();

	errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, M*N*sizeof(DATA_TYPE), A_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");   

//	gramschmidt(A, R, Q);
//	compareResults(A, A_outputFromGpu);
	cl_clean_up();

	free(A);
	free(A_outputFromGpu);
	free(R);
	free(Q);  

	return 0;
}
示例#3
0
/**
 * Host driver for the MVT OpenCL program.
 *
 * Passes N through getNewSizes(), allocates and initialises the host arrays,
 * creates the platform/context/queue, uploads the buffers, builds the
 * program, launches both kernels, reads x1/x2 back and hands the host data
 * and device results to runMvt().
 *
 * @return 0 on success, 1 if the OpenCL program fails to build.
 */
int main(void) {
  DATA_TYPE *a;
  DATA_TYPE *x1;
  DATA_TYPE *x2;
  DATA_TYPE *x1_outputFromGpu;
  DATA_TYPE *x2_outputFromGpu;
  DATA_TYPE *y_1;
  DATA_TYPE *y_2;

  /////////////////////////
  // Replace N with the size getNewSizes() reports for "mvt_kernel1".
  size_t oldSizes[1] = { N };
  size_t newSizes[1];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "mvt_kernel1", 1);
  N = newSizes[0];
  /////////////////////////

  a = (DATA_TYPE *)malloc(N * N * sizeof(DATA_TYPE));
  x1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  x2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  x1_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  x2_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  y_1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));
  y_2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE));

  init_arrays(a, x1, x2, y_1, y_2);

  platform = new Platform(PLATFORM_ID);
  context = platform->getContext();
  Device device = platform->getDevice(DEVICE_ID);
  Queue queue(*context,device,Queue::EnableProfiling); 
  
  cl_mem_init(a, x1, x2, y_1, y_2,queue);
  
  Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME);
  if(!program.build(device)){
    std::cout << "Error building the program: \n";
    std::cout <<program.getBuildLog(device); 

    // Without a built program no kernel can be created or launched, so
    // stop here instead of falling through to createKernel().
    // NOTE(review): cl_clean_up() is not called on this path because the
    // kernels do not exist yet; confirm it tolerates that before adding it.
    free(a);
    free(x1);
    free(x2);
    free(x1_outputFromGpu);
    free(x2_outputFromGpu);
    free(y_1);
    free(y_2);
    return 1;
  }
  kernel1=program.createKernel(kernel1Name.c_str());
  kernel2=program.createKernel(kernel2Name.c_str());
  cl_launch_kernel(queue);

  // Fetch both result vectors, then wait for the queue to drain.
  queue.readBuffer(*x1_mem_obj,N * sizeof(DATA_TYPE), x1_outputFromGpu);
  queue.readBuffer(*x2_mem_obj,N * sizeof(DATA_TYPE), x2_outputFromGpu);
  queue.finish();

  runMvt(a, x1, x2, y_1, y_2, x1_outputFromGpu,x2_outputFromGpu);
  cl_clean_up();

  free(a);
  free(x1);
  free(x2);
  free(x1_outputFromGpu);
  free(x2_outputFromGpu);
  free(y_1);
  free(y_2);

  return 0;
}
示例#4
0
/**
 * Enqueues the Convolution2D kernel on `queue` and calls queue.finish().
 *
 * The launch is two-dimensional with a global range of NI x NJ. The local
 * work size is filled in by getNewSizes(), starting from
 * DIM_LOCAL_WORK_GROUP_X x DIM_LOCAL_WORK_GROUP_Y.
 *
 * @param queue Command queue the kernel is submitted to.
 */
void cl_launch_kernel(Queue& queue) {
  // Scalar kernel arguments; setArgument() is given their address and size.
  int niArg = NI;
  int njArg = NJ;

  size_t globalWorkSize[2];
  globalWorkSize[0] = NI;
  globalWorkSize[1] = NJ;

  // Requested work-group shape handed to getNewSizes().
  size_t requestedLocalSize[2];
  requestedLocalSize[0] = DIM_LOCAL_WORK_GROUP_X;
  requestedLocalSize[1] = DIM_LOCAL_WORK_GROUP_Y;

  // Work-group shape actually used for the launch.
  size_t localWorkSize[2];
  getNewSizes(NULL, requestedLocalSize, NULL, localWorkSize,
              "Convolution2D_kernel", 2);

  // Bind buffers (0, 1) and the two dimensions (2, 3).
  kernel->setArgument(0, *a_mem_obj);
  kernel->setArgument(1, *b_mem_obj);
  kernel->setArgument(2, sizeof(int), &niArg);
  kernel->setArgument(3, sizeof(int), &njArg);

  queue.run(*kernel, 2, 0, globalWorkSize, localWorkSize);
  queue.finish();
}
示例#5
0
/**
 * Host driver for the 2D convolution OpenCL program.
 *
 * Passes NI/NJ through getNewSizes(), allocates the host input and output
 * buffers, creates the platform/context/queue, uploads the input, builds the
 * program, launches the kernel, reads b_mem_obj back and passes the input
 * and device output to conv2D().
 *
 * @param argc Unused.
 * @param argv Unused.
 * @return 0 on success, 1 if the OpenCL program fails to build.
 */
int main(int argc, char *argv[]) {

  DATA_TYPE *A;
  DATA_TYPE *B_outputFromGpu;

  /////////////////////////
  // Replace NI/NJ with the sizes getNewSizes() reports for the kernel.
  size_t oldSizes[2] = { NI, NJ };
  size_t newSizes[2];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "Convolution2D_kernel", 2);
  NI = newSizes[0];
  NJ = newSizes[1];
  /////////////////////////

  A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));
  B_outputFromGpu = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE));

  init(A);
   
  platform = new Platform(PLATFORM_ID);
  context = platform->getContext();
  Device device = platform->getDevice(DEVICE_ID);
  Queue queue = Queue(*context,device,Queue::EnableProfiling);

  cl_mem_init(A,queue);
  SourceFile kernelFile = KERNEL_DIRECTORY KERNEL_FILE_NAME; 
 
  // Create a program from the kernel source
  Program program(context,kernelFile);
  if(!program.build(device)) {
      std::cout << "Error building the program: " << "\n";
      std::cout << program.getBuildLog(device) << "\n";

      // Release the host buffers before bailing out so this path does not
      // leak them.
      // NOTE(review): cl_clean_up() is not called here because the kernel
      // has not been created yet; confirm it tolerates that before adding it.
      free(A);
      free(B_outputFromGpu);
      return 1;
    }
 
  // Create the OpenCL kernel
  kernel = program.createKernel(kernelName.c_str()); 
  cl_launch_kernel(queue);

  // Fetch the result, then wait for the queue to drain.
  queue.readBuffer(*b_mem_obj,NI * NJ * sizeof(DATA_TYPE),(void*) B_outputFromGpu);
  queue.finish();

  conv2D(A, B_outputFromGpu);

  free(A);
  free(B_outputFromGpu);

  cl_clean_up();
  return 0;
}
示例#6
0
//-----------------------------------------------------------------------------
void setNDRangeSizes() {
  std::vector<size_t> newGlobalWorkSize(globalWorkSize.size(), 0);
  std::vector<size_t> newLocalWorkSize(localWorkSize.size(), 0);
  getNewSizes(globalWorkSize.data(), localWorkSize.data(),
              newGlobalWorkSize.data(), newLocalWorkSize.data(),
              kernelName.c_str(), globalWorkSize.size());

  globalWorkSize.clear();
  localWorkSize.clear();

  std::copy(newGlobalWorkSize.begin(), newGlobalWorkSize.end(),
            std::back_inserter(globalWorkSize));
  std::copy(newLocalWorkSize.begin(), newLocalWorkSize.end(),
            std::back_inserter(localWorkSize));
}
示例#7
0
/**
 * Host driver for the ATAX OpenCL program (second kernel only).
 *
 * Passes NY through getNewSizes(), allocates and initialises the host
 * arrays, creates the platform/context/queue, uploads the buffers, builds
 * the program, launches kernel2, reads y_mem_obj back into `y` and passes
 * the host arrays to atax_cpu().
 *
 * @return 0 on success, 1 if the OpenCL program fails to build.
 */
int main(void) {

  DATA_TYPE *A;
  DATA_TYPE *y;
  DATA_TYPE *tmp;

  /////////////////////////
  // Replace NY with the size getNewSizes() reports for "atax_kernel2".
  size_t oldSizes[1] = { NY };
  size_t newSizes[1];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "atax_kernel2", 1);
  NY = newSizes[0];
  /////////////////////////

  A = (DATA_TYPE *)malloc(NX_DEFAULT * NY * sizeof(DATA_TYPE));
  y = (DATA_TYPE *)malloc(NY * sizeof(DATA_TYPE));
  tmp = (DATA_TYPE *)malloc(NX_DEFAULT * sizeof(DATA_TYPE));

  init_array(tmp, A);
  
  platform = new Platform(PLATFORM_ID);
  context = platform->getContext();
  Device device = platform->getDevice(DEVICE_ID);
  Queue queue(*context,device,Queue::EnableProfiling); 
  
  cl_mem_init(A, y, tmp,queue);

  Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME);
  if(!program.build(device)){
    std::cout << "Error building the program: \n";
    std::cout <<program.getBuildLog(device); 

    // Without a built program no kernel can be created or launched, so
    // stop here instead of falling through to createKernel().
    // NOTE(review): cl_clean_up() is not called on this path because the
    // kernel does not exist yet; confirm it tolerates that before adding it.
    free(A);
    free(y);
    free(tmp);
    return 1;
  }

  kernel2=program.createKernel(kernel2Name.c_str());
  cl_launch_kernel(queue);

  // Fetch the result vector, then wait for the queue to drain.
  queue.readBuffer(*y_mem_obj,NY * sizeof(DATA_TYPE), y);
  queue.finish();

  atax_cpu(A, tmp, y);
  cl_clean_up();

  free(A);
  free(y);
  free(tmp);

  return 0;
}
示例#8
0
int main(void) 
{
	DATA_TYPE* A;
	DATA_TYPE* B;  
	DATA_TYPE* C;  
	DATA_TYPE* C_outputFromGpu; 

  /////////////////////////
  size_t oldSizes[2] = { NJ, NI };
  size_t newSizes[2];
  getNewSizes(oldSizes, NULL, newSizes, NULL, "gemm", 2);
  NJ = newSizes[0];
  NI = newSizes[1];
  NK = NJ;  
  /////////////////////////

	A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE)); 
	B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE));   
	C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); 
	C_outputFromGpu = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); 

	init(A, B, C);
	read_cl_file();
  cl_initialization(device_id, clGPUContext, clCommandQue);
	cl_mem_init(A, B, C);
	cl_load_prog();

	cl_launch_kernel();

	errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	gemm(A, B, C, C_outputFromGpu);
	cl_clean_up();

	free(A);
	free(B);  
	free(C);  
	free(C_outputFromGpu); 

	return 0;
}
示例#9
0
void cl_launch_kernel()
{
	int ni=NI;
	int nj=NJ;
	int nk=NK;

	DATA_TYPE alpha = ALPHA;
	DATA_TYPE beta = BETA;

  size_t oldLocalWorkSize[2], globalWorkSize[2];
  oldLocalWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
  oldLocalWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
  globalWorkSize[0] = NJ;
  globalWorkSize[1] = NI;

  ///////////////////////////////////////////////
  size_t localWorkSize[2];
  getNewSizes(NULL, oldLocalWorkSize, NULL, localWorkSize,
              "gemm", 2);
  ///////////////////////////////////////////////


	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
	errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
	errcode |= clSetKernelArg(clKernel, 3, sizeof(DATA_TYPE), (void *)&alpha);
	errcode |= clSetKernelArg(clKernel, 4, sizeof(DATA_TYPE), (void *)&beta);
	errcode |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ni);
	errcode |= clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nj);
	errcode |= clSetKernelArg(clKernel, 7, sizeof(int), (void *)&nk);
	
	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");

	// Execute the OpenCL kernel
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clFinish(clCommandQue);
}
示例#10
0
/**
 * Enqueues atax_kernel2 on `queue` over a one-dimensional global range of NY
 * and calls queue.finish().
 *
 * The local work size is filled in by getNewSizes(), starting from
 * DIM_LOCAL_WORK_GROUP_X.
 *
 * @param queue Command queue the kernel is submitted to.
 */
void cl_launch_kernel(Queue& queue) {
  // Scalar kernel arguments; setArgument() is given their address and size.
  int nxArg = NX_DEFAULT;
  int nyArg = NY;

  size_t globalWorkSize[1];
  globalWorkSize[0] = NY;

  // Requested work-group size handed to getNewSizes().
  size_t requestedLocalSize[1];
  requestedLocalSize[0] = DIM_LOCAL_WORK_GROUP_X;

  // Work-group size actually used for the launch.
  size_t localWorkSize[1];
  getNewSizes(NULL, requestedLocalSize, NULL, localWorkSize, "atax_kernel2", 1);

  // Bind the three buffers (0-2) and the two dimensions (3, 4).
  kernel2->setArgument(0, *a_mem_obj);
  kernel2->setArgument(1, *y_mem_obj);
  kernel2->setArgument(2, *tmp_mem_obj);
  kernel2->setArgument(3, sizeof(int), (void *)&nxArg);
  kernel2->setArgument(4, sizeof(int), (void *)&nyArg);

  queue.run(*kernel2, 1, 0, globalWorkSize, localWorkSize);
  queue.finish();
}
示例#11
0
void cl_launch_kernel()
{
	int m = M;
	int n = N;

  size_t oldLocalWorkSize[2], globalWorkSizeKernel1[2], localWorkSize[2]; 
  size_t globalWorkSizeKernel2[2], globalWorkSizeKernel3[2];

  oldLocalWorkSize[0] = DIM_THREAD_BLOCK_X;
	oldLocalWorkSize[1] = DIM_THREAD_BLOCK_Y;
	globalWorkSizeKernel1[0] = DIM_THREAD_BLOCK_X;
	globalWorkSizeKernel1[1] = DIM_THREAD_BLOCK_Y;
	globalWorkSizeKernel2[0] = N;
	globalWorkSizeKernel2[1] = 1;
	globalWorkSizeKernel3[0] = N;
	globalWorkSizeKernel3[1] = 1;

  ///////////////////////////////////////////////
  // Kernel 2.
  getNewSizes(NULL, oldLocalWorkSize, NULL, localWorkSize, "gramschmidt_kernel2", 2);
  // Kernel 3.
  getNewSizes(NULL, localWorkSize, NULL, localWorkSize, "gramschmidt_kernel3", 2);

  ///////////////////////////////////////////////

	int k;
	for (k = 0; k < 1; k++)
	{
		// Set the arguments of the kernel
		errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
		errcode =  clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&r_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&q_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), (void *)&k);
		errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&m);
		errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&n);
	
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
	
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 1, NULL, globalWorkSizeKernel1, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
		clEnqueueBarrier(clCommandQue);


		errcode =  clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&a_mem_obj);
		errcode =  clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&r_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&q_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), (void *)&k);
		errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), (void *)&m);
		errcode |= clSetKernelArg(clKernel2, 5, sizeof(int), (void *)&n);
	
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
	
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 1, NULL, globalWorkSizeKernel2, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel2\n");
		clEnqueueBarrier(clCommandQue);


		errcode =  clSetKernelArg(clKernel3, 0, sizeof(cl_mem), (void *)&a_mem_obj);
		errcode =  clSetKernelArg(clKernel3, 1, sizeof(cl_mem), (void *)&r_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 2, sizeof(cl_mem), (void *)&q_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 3, sizeof(int), (void *)&k);
		errcode |= clSetKernelArg(clKernel3, 4, sizeof(int), (void *)&m);
		errcode |= clSetKernelArg(clKernel3, 5, sizeof(int), (void *)&n);
	
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
	
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel3, 1, NULL, globalWorkSizeKernel3, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel3\n");
		clEnqueueBarrier(clCommandQue);

	}
	clFinish(clCommandQue);

}