Beispiel #1
2
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   

  cl_mem in_array;                     // device memory used for the input array
  //cl_mem synaptic_weights;             // device memory used for the input array
  cl_mem out_array;                    // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return -1;
  }

	//float in_array[NO_NODES];
	//float out_array[NO_NODES];
	//float synaptic_weights[NO_NODES*NO_NODES];
	float in_array_tb[NO_NODES];
	float out_array_tb[NO_NODES];
	//float synaptic_weights_tb[NO_NODES*NO_NODES];
	float temp =0;
	int i = 0;
    	int j = 0;
	int index = 0;
	FILE* ifp;
	char* mode = "r";
  //
  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return -1;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return -1;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return -1;
  }
  
  //
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return -1;
  }

  //relu_1(in_array,synaptic_weights,out_array);


  // Fill our data sets with pattern
  //
  //int i = 0;
  //for(i = 0; i < DATA_SIZE; i++) {
  //  a[i] = (int)i;
  //  b[i] = (int)i;
  //  results[i] = 0;
  //}
  //
  
  
  // Create a command commands
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return -1;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return -1;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    printf("err : %d %s\n",err,err);
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return -1;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "relu_1", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return -1;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  in_array = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES, NULL, NULL);
  //synaptic_weights = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * NO_NODES * NO_NODES, NULL, NULL);
  out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL);
  if (!in_array || /*!synaptic_weights ||*/ !out_array)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return -1;
  }    
    
	ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode);

	if(ifp == NULL)
	{
		printf("Input file not found \n");
  		return -1;
	}
	while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) {

		in_array_tb[index++] = temp;
	}
	index = 0;
	temp = 0;

	//ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode);
	//if(ifp == NULL)
	//{
	//	printf("Weight file not found \n");
  	//	return -1;
	//}
	//while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) {
	//	synaptic_weights_tb[index++] = temp;
	//}
   
  //
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return -1;
  }

  // Write our data set into the input array in device memory 
  //
  //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) *  NO_NODES *  NO_NODES, synaptic_weights_tb, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return -1;
  }
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array);
  //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);

  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return -1;
  }

  clWaitForEvents(1, &readevent);
    
  //printf("A\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",a[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("B\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",b[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //printf("res\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  printf("%x ",results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
    
  // Validate our results
  //
  //correct = 0;
  //for(i = 0; i < DATA_SIZE; i++)
  //{
  //  int row = i/MATRIX_RANK;
  //  int col = i%MATRIX_RANK;
  //  int running = 0;
  //  int index;
  //  for (index=0;index<MATRIX_RANK;index++) {
  //    int aIndex = row*MATRIX_RANK + index;
  //    int bIndex = col + index*MATRIX_RANK;
  //    running += a[aIndex] * b[bIndex];
  //  }
  //  sw_results[i] = running;
  //}
  //  
  //for (i = 0;i < DATA_SIZE; i++) 
  //  if(results[i] == sw_results[i])
  //    correct++;
  //printf("Software\n");
  //for (i=0;i<DATA_SIZE;i++) {
  //  //printf("%0.2f ",sw_results[i]);
  //  printf("%d ",sw_results[i]);
  //  if (((i+1) % 16) == 0)
  //    printf("\n");
  //}
  //  
  //  
  //// Print a brief summary detailing the results
  ////
  //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE);
  //  
        
  // Shutdown and cleanup
	int temp_ = 0;


 for (j = 0; j < NO_NODES; j++)
 {
 	if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0)
 	{
 		//printf("out_array[%d] = %f \n", j, out_array[j]);
 		temp_++;
 	}
 }


  clReleaseMemObject(in_array);
  //clReleaseMemObject(synaptic_weights);
  clReleaseMemObject(out_array);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

	if (temp_ == NO_NODES)
	{
		printf("*********************************************************** \n");
		printf("TEST PASSED !!!!!! The output matches the desired output. \n");
		printf("*********************************************************** \n");
		return EXIT_SUCCESS;
	}
	else
	{
		printf("**************************************************************** \n");
		printf("TEST Failed !!!!!! The output does not match the desired output. \n");
		printf("**************************************************************** \n");
		return -1;
	}

  //if(correct == DATA_SIZE){
  //  printf("Test passed!\n");
  //  return EXIT_SUCCESS;
  //}
  //else{
  //  printf("Test failed\n");
  //  return -1;
  //}
}
Beispiel #2
0
int main (int argc, char **argv) {

  int i;
  struct scanCallInfo infoData;
  int totbytes = 3359 * 4679;
  unsigned char *pic;

  (void) load_file_to_memory("./tmp2.pnm", &pic);
//  pic=(unsigned char *)malloc( totbytes+1 );
//  for (i=0;i<totbytes;i++) pic[i]=255;

  infoData.language = (const char*)OCR_LANG_BRITISH;
  infoData.imagedata = (const unsigned char*)pic;
  infoData.bytes_per_pixel = 1;
  infoData.bytes_per_line = 3359;
  infoData.width = 3359;
  infoData.height = 4679;

  runocr(&infoData);
  printf("%s", infoData.ret);
  free(infoData.ret);
  free(pic);

  return 0;
}
Beispiel #3
0
	void Load(char* f)
	{
		vector<standardRecord> emptyVector;

		//Load file in memory
		int size=0;
		char *ptr=0;
		char* cursor=0;
		size=load_file_to_memory(f,&ptr);
		cursor=ptr;

		if(size>0&&ptr!=0)
		{
			cursor=ptr+24+(*(int*)(ptr+4));
			while(cursor<(ptr+size))
				cursor=LoopGRUP(cursor,0,0);
		}

		ParseLoadedData();

		delete ptr;

		//Trick to clear used ram by the vector
		recordPointers.swap(emptyVector);
		recordPointers.clear();
	}
Beispiel #4
0
cl_kernel xcl_import_binary(xcl_world world,
                            const char *krnl_file,
                            const char *krnl_name)
{
	int err;

	char *krnl_bin;
	const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);

	cl_program program = clCreateProgramWithBinary(world.context, 1,
	                                    &world.device_id, &krnl_size,
	                                    (const unsigned char**) &krnl_bin,
	                                    NULL, &err);
	if ((!program) || (err!=CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n",
		       err);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {
		size_t len;
		char buffer[2048];

		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, world.device_id, CL_PROGRAM_BUILD_LOG,
		                      sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	cl_kernel kernel = clCreateKernel(program, krnl_name, &err);
	if (!kernel || err != CL_SUCCESS) {
		printf("Error: Failed to create kernel for %s: %d\n", krnl_name, err);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	}

	/* if program is released, then EnqueueNDRangeKernel fails with
	 * INVALID_KERNEL */
	/* clReleaseProgram(program); */
	free(krnl_bin);

	return kernel;
}
void read_cl_file(char** argv)
#endif
{
        #if OPENCL_DEVICE_SELECTION!=CL_DEVICE_TYPE_ACCELERATOR
	// Load the kernel source code into the array source_str
	fp = fopen("jacobi1D_gpu_ghost.cl", "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.\n");
		exit(1);
	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose( fp );
        #else 
        printf("loading %s\n", argv[1]);
        source_size = load_file_to_memory(argv[1], (char **) &source_str);
        if (source_size < 0) {
          printf("failed to load kernel from xclbin: %s\n", argv[1]);
        }
        #endif
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
	int err = 0;
	int passed = 0;
	// timer structs
    double elapsed = 0;
	srand(time(NULL));
	
	int N = 4;
	
	char dir[100] = "./data";

	if (argc>1)
		N = atoi(argv[1]);

	//if (argc>2)
	//	strcpy(dir, argv[2]);
	
	
	
	// Allocate matrices and vectors
	float *A = (float *) malloc(N*N*sizeof(float));
	float *A0 = (float *) malloc(N*N*sizeof(float));
	float *b = (float *) malloc(N*sizeof(float));
	float *b0 = (float *) malloc(N*sizeof(float)); // ADDED; original b matrix before permutations
	float *L = (float *) malloc(N*N*sizeof(float));
	float *x = (float *) malloc(N*sizeof(float));
	float *y = (float *) malloc(N*sizeof(float));
	float *Acurr = (float *) malloc(N*sizeof(float));
	
	int i, j;
	// Initialize A and b
	for(i = 0; i < N; i++)
	{
		for(j = 0; j < N; j++)
		{
			float r = (float) rand();
			if(r > RAND_MAX/2)
				A[i*N+j] = A0[i*N+j] = -(r-RAND_MAX/2)/(RAND_MAX/2);
			else
				A[i*N+j] = A0[i*N+j] = r/(RAND_MAX/2);
		}
		float r = (float) rand();
		if(r > RAND_MAX/2)
			b[i] = b0[i] = -(r-RAND_MAX/2)/(RAND_MAX/2);
		else
			b[i] = b0[i] = r/(RAND_MAX/2);
	}
	
	// Initialize L matrix, x,y vectors
	// Added to ensure initial values are 0
	for (i = 0; i < N; i++)
	{
		for (j = 0; j < N; j++)
		{
			L[i*N+j] = 0;
		}
		y[i] = 0;
		x[i] = 0;
		Acurr[i] = 0;
	}
	
	// TEST A AND b MANUAL GENERATION
	/*
		for(i = 0; i < N; i++)
		{
			for(j = 0; j < N; j++)
			{
				if (i == j)
					A[i*N+j] = A0[i*N+j] = 1;
				else
					A[i*N+j] = A0[i*N+j] = 0;
			}
			b[i] = b0[i] = (float) i/(10.0);
		}
		
		// END GENERATION
	*/
	//show_matrix(A,0,N);
	
	
	// 1. allocate host memory for matrices A and B
	int width_A, width_A0, width_L, height_A, height_A0, height_L, height_b, height_b0, height_x, height_y, width_Acurr;
	width_A = width_A0 = width_L = height_A = height_A0 = height_L = height_b = height_b0 = height_x = height_y = width_Acurr = N;
	
	unsigned int size_A = width_A * height_A;
	unsigned int size_A0 = width_A0 * height_A0;
	unsigned int size_L = width_L * height_L;
	unsigned int size_b = height_b;
	unsigned int size_b0 = height_b0;
	unsigned int size_x = height_x;
	unsigned int size_y = height_y;
	unsigned int size_Acurr = width_Acurr;
	unsigned int mem_size_A = sizeof(float) * size_A;
	unsigned int mem_size_A0 = sizeof(float) * size_A0;
	unsigned int mem_size_L = sizeof(float) * size_L;
	unsigned int mem_size_b = sizeof(float) * size_b;
	unsigned int mem_size_b0 = sizeof(float) * size_b0;
	unsigned int mem_size_x = sizeof(float) * size_x;
	unsigned int mem_size_y = sizeof(float) * size_y;
	unsigned int mem_size_Acurr = sizeof(float) * size_Acurr;
	
	// Host pointers
	float* h_A = A;
	float* h_L = L;
	float* h_b = b;
	float* h_x = x;
	float* h_y = y;
	float* h_Acurr = Acurr;
	
	
	// 5. Initialize OpenCL
     
	cl_command_queue clCommandQue;
	cl_program program;
	cl_kernel clKernel;

	size_t dataBytes;
	size_t kernelLength;
	cl_int status;
	
	
	/*****************************************/
	/* Initialize OpenCL */
	/*****************************************/

	// Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

	//printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status);
 
    // Allocate enough space for each platform
    cl_platform_id *platforms = NULL;

    platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id));
 
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS)
		printf("clGetPlatformIDs error(%d)\n", status);
	
	// Retrieve the number of devices
    cl_uint numDevices = 0;
#ifndef FPGA_DEVICE
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
#else
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices);
#endif
	printf("Found %d devices support OpenCL.\n", numDevices);

    // Allocate enough space for each device
    cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id));

    // Fill in the devices 
#ifndef FPGA_DEVICE
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
#else
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL);
#endif

	if (status != CL_SUCCESS)
		printf("clGetDeviceIDs error(%d)\n", status);

		
	// GET MAX DEVICE LOCAL MEMORY SIZE	
	//cl_ulong mem_size;
	//clGetDeviceInfo(devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
    //printf("CL_DEVICE_LOCAL_MEM_SIZE: %d KB\n", (unsigned int)(mem_size / 1024));
	
	// GET MAX NUMBER OF WORK ITEMS PER DIMENSION
	//size_t workitem_size[3];
	//cl_int ret = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
	//printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: %d / %d / %d\n", workitem_size[0], workitem_size[1], workitem_size[2]);
	
    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
	if (status != CL_SUCCESS)
		printf("clCreateContext error(%d)\n", status);

	// OpenCL device memory for matrices
	cl_mem d_A;
	cl_mem d_L;
	cl_mem d_b;
	cl_mem d_x;
	cl_mem d_y;
	cl_mem d_Acurr;
	
	//Create a command-queue
	clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status);

	if (status != CL_SUCCESS)
		printf("clCreateCommandQueue error(%d)\n", status);

	// Setup device memory
	d_x = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_x, NULL, &status);
	d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &status);
	d_L = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_L, h_L, &status);
	d_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_b, h_b, &status);
	d_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_y, h_y, &status);
	d_Acurr = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_Acurr, h_Acurr, &status);

#ifndef FPGA_DEVICE
	// WE CAN'T USE THIS UNLESS WE MAKE A HEADER FILE WITH A GIANT STRING OF THE KERNEL PROGRAM
	// Create a program with source code
    program = clCreateProgramWithSource(context, 1, 
        (const char**)&lu259_cl, NULL, &status);
	if (status != 0)
		printf("clCreateProgramWithSource error(%d)\n", status);

    // Build (compile) the program for the device
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
	
#else
	// Load binary from disk
	unsigned char *kernelbinary;
	char *xclbin = argv[2];
	printf("loading %s\n", xclbin);
	int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
	printf("done loading\n");
	if (n_i < 0) {
		printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
		return -1;
	}
	size_t n_bit = n_i;
	printf("creating program with binary\n");
	// Create the compute program from offline
	program = clCreateProgramWithBinary(context, 1, &devices[0], &n_bit,
			(const unsigned char **) &kernelbinary, NULL, &status);

	if ((!program) || (status != CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n", status);
		return -1;
	}
	printf("done creating program with binary\n");
	printf("building program\n");
	// Build the program executable
	status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	printf("done building program\n");
#endif

	if (status != 0) {
		char errmsg[2048];
		size_t sizemsg = 0;

		status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg);

		printf("clBuildProgram error(%d)\n", status);
		printf("Compilation messages: \n %s", errmsg);
	}

	clKernel = clCreateKernel(program, "LUFact", &status);
	if (status != CL_SUCCESS)
		printf("clCreateKernel error(%d)\n", status);

	// 7. Launch OpenCL kernel
	//size_t localWorkSize[2], globalWorkSize[2];
	size_t localWorkSize[1], globalWorkSize[1];
	
	int width_matrix = width_A;
	int height_vector = height_x;
	
	status  = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_x);
	status |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
	status |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_L);
	status |= clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_b);
	status |= clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&d_y);
	status |= clSetKernelArg(clKernel, 5, sizeof(cl_mem), (void *)&d_Acurr);
	status |= clSetKernelArg(clKernel, 6, sizeof(int), (void *)&N);
	//status |= clSetKernelArg(clKernel, 6, sizeof(int), (void *)&height_vector);
	
	if (status != CL_SUCCESS)
		printf("clSetKernelArg error(%d)\n", status);
		
	
	//localWorkSize[0] = BLOCK_SIZE;
	//localWorkSize[1] = BLOCK_SIZE;
	//globalWorkSize[0] = width_A;
	//globalWorkSize[1] = height_A;
	localWorkSize[0] = N;//(N)/BLOCK_SIZE; 
	globalWorkSize[0] = N;//(N*N)/BLOCK_SIZE;

    // start timer
	clock_t start = clock();

    status = clEnqueueWriteBuffer(clCommandQue, d_A, CL_FALSE, 0, mem_size_A, h_A, 0, NULL, NULL);
	status = clEnqueueWriteBuffer(clCommandQue, d_L, CL_FALSE, 0, mem_size_L, h_L, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(clCommandQue, d_b, CL_FALSE, 0, mem_size_b, h_b, 0, NULL, NULL);
	status = clEnqueueWriteBuffer(clCommandQue, d_y, CL_FALSE, 0, mem_size_y, h_y, 0, NULL, NULL);
	status = clEnqueueWriteBuffer(clCommandQue, d_Acurr, CL_FALSE, 0, mem_size_Acurr, h_Acurr, 0, NULL, NULL);
	printf("Enter the dragon\n");
	status = clEnqueueNDRangeKernel(clCommandQue, 
			clKernel, 1, NULL, globalWorkSize, 
			localWorkSize, 0, NULL, NULL);
	if (status != CL_SUCCESS)
		printf("clEnqueueNDRangeKernel error(%d)\n", status);
	printf("Exit the dragon\n");
	// 8. Retrieve result from device
	status = clEnqueueReadBuffer(clCommandQue, d_x, CL_TRUE, 0, mem_size_x, h_x, 0, NULL, NULL);
	printf("HERE1\n");
	//status = clEnqueueReadBuffer(clCommandQue, d_A, CL_TRUE, 0, mem_size_A, h_A, 0, NULL, NULL);
	//status = clEnqueueReadBuffer(clCommandQue, d_L, CL_TRUE, 0, mem_size_L, h_L, 0, NULL, NULL);
	printf("HERE2\n");
	if (status != CL_SUCCESS)
		printf("clEnqueueReadBuffer error(%d)\n", status);
	printf("HERE3\n");
	//show_matrix(A,0,N);
	//show_matrix(L,0,N);
	
	printf("HERE4\n");
	// TEMPORARILY ADDED IN FOR DEBUGGING PURPOSES
	/*for(i = 0; i < N; i++)
		{
			float yi = b[i];
			for(j = 0; j < i; j++)
			{
				yi -= L[i*N+j]*y[j];
			}	
			y[i] = yi;
		}

		// Use back substitution to solve Ux = y
		for(i = N-1; i >= 0; i--)
		{
			float xi = y[i];
			for(j = i+1; j < N; j++)
				xi -= A[i*N+j]*x[j];
			x[i] = xi/A[i*N+i];
		}
	// END TEMPORARILY ADDED IN
	*/
	printf("HERE5\n");
	//show_matrix(b,0,N);
	//show_matrix(b0,0,N);
	//show_matrix(x,0,N);
	printf("So far so good\n");
	// stop timer
	clock_t end = clock();
	elapsed += ((double)(end-start)) / CLOCKS_PER_SEC;
	
	printf("LU decomposition done. Now to check\n");
	// Check result
	float error = 0;
	for(i = 0; i < N; i++)
	{
		float b_res = 0;
		for(j = 0; j < N; j++)
			b_res += A0[i*N+j] * x[j];
		if ((b_res - 0.1) < b0[i] || (b_res + 0.1) > b0[i])
			b_res = b0[i];
		error += b_res > b0[i] ? b_res-b0[i] : b0[i]-b_res;
		//printf("b_res is: %f\n", b_res);
	}
		
	float epsilonPerRow = 0.01;
	if(error < N*epsilonPerRow)
		passed++;
		
	printf("%d of %d tests passed\n", passed, ITER);
	printf("Average time: %.2f seconds\n", elapsed/ITER);

	// 10. clean up memory
	free(A0);
	free(b0);
	
	free(h_A);
	free(h_L);
	free(h_b);
	free(h_x);
	free(h_y);
	free(h_Acurr);

	clReleaseMemObject(d_A);
	clReleaseMemObject(d_L);
	clReleaseMemObject(d_b);
	clReleaseMemObject(d_x);
	clReleaseMemObject(d_y);
	clReleaseMemObject(d_Acurr);

	free(devices);
	clReleaseContext(context);
	clReleaseKernel(clKernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(clCommandQue);
}
Beispiel #7
0
void init_device(int concurrent){
    int  err;
    char cl_platform_vendor[1001];
    char cl_platform_name[1001];

    h_input  = (cl_int *) malloc(sizeof(cl_int)*REC_N);
    h_output = (cl_int *) malloc(sizeof(cl_int)*REC_N);

    err = clGetPlatformIDs(1,&platform_id,NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to find an OpenCL platform!\n");
        printf("Test failed\n");
        exit(1);
    }

    err = clGetPlatformInfo(platform_id, CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
        printf("Test failed\n");
        exit(1);
    }

    err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
        printf("Test failed\n");
        exit(1);
    }

    int fpga = 0;
#if defined (FPGA_DEVICE)
    fpga = 1;
#endif

    err = clGetDeviceIDs(platform_id, 
            fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
            1, 
            &device_id, 
            NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        printf("Test failed\n");
        exit(1);
    }


    // Create a compute context 
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        printf("Test failed\n");
        exit(1);
    }

    // Create a command commands
    //
    if (concurrent) {
        commands = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
    } else {
        commands = clCreateCommandQueue(context, device_id, 0, &err);
    }
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        printf("Error: code %i\n",err);
        printf("Test failed\n");
        exit(1);
    }

    int status;

    unsigned char *kernelbinary;

    char xclbin[] = "pipe.xclbin";
    int n_i= load_file_to_memory(xclbin, (char **) &kernelbinary);
    if (n_i < 0) {
        printf("failed to load kernel from xclbin\n");
        printf("Test failed\n");
        exit(1);
    }

    size_t n = n_i;
    // Create the compute program from offline
    program = clCreateProgramWithBinary(context, 1, &device_id, &n,
            (const unsigned char **) &kernelbinary, &status, &err);
    if ((!program) || (err!=CL_SUCCESS)) {
        printf("Error: Failed to create compute program from binary %d!\n", err);
        printf("Test failed\n");
        exit(1);
    }
    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        printf("Test failed\n");
        exit(1);
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel_in = clCreateKernel(program, "kernel_in", &err);
    if (!kernel_in || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel_in! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }
    kernel_inter = clCreateKernel(program, "kernel_inter", &err);
    if (!kernel_inter || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel_inter! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }
    kernel_out = clCreateKernel(program, "kernel_out", &err);
    if (!kernel_out || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel_out! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }

    // Create the input and output arrays in device memory for our calculation
    //
    d_input    = clCreateBuffer(context, CL_MEM_READ_ONLY,  sizeof(cl_int)*REC_N, NULL, NULL);
    d_output   = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int)*REC_N, NULL, NULL);

    if (!d_input || !d_output) {
        printf("Error: Failed to allocate device memory!\n");
        printf("Test failed\n");
        exit(1);
    }
    // Set the arguments to our compute kernel
    //
    err = 0;
    err  = clSetKernelArg(kernel_in, 0, sizeof(cl_mem), &d_input);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel argument d_input! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }

    err = 0;
    err  = clSetKernelArg(kernel_out, 0, sizeof(cl_mem), &d_output);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel argument d_output! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }
}
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    int* a = NULL; // input pointer
    int* results = NULL; // output pointer
    unsigned int correct;               // number of correct results returned

    size_t global[2];                   // global domain size for our calculation
    size_t local[2];                    // local domain size for our calculation

    cl_platform_id platform_id;         // platform id
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    char cl_platform_vendor[1001];
    char cl_platform_name[1001];

    cl_mem input_a;                     // device memory used for the input array
    //cl_mem input_b;                     // device memory used for the input array
    cl_mem output;                      // device memory used for the output array
    int inc;
    double t_start, t_end;

    if (argc != 2) {
        printf("%s <inputfile>\n", argv[0]);
        return EXIT_FAILURE;
    }

    // Connect to first platform
    //
    err = clGetPlatformIDs(1,&platform_id,NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to find an OpenCL platform!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
    err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    printf("CL_PLATFORM_NAME %s\n",cl_platform_name);

    // Connect to a compute device
    //
    int fpga = 0;
#if defined (FPGA_DEVICE)
    fpga = 1;
#endif
    err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                         1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        printf("Error: code %i\n",err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    int status;

    // Create Program Objects
    //

    // Load binary from disk
    unsigned char *kernelbinary;
    char *xclbin=argv[1];
    printf("loading %s\n", xclbin);
    int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
    if (n_i < 0) {
        printf("failed to load kernel from xclbin: %s\n", xclbin);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    else {
        printf("Succeed to load kernel from xclbin: %s\n", xclbin);
    }
    size_t n = n_i;
    // Create the compute program from offline
    program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                        (const unsigned char **) &kernelbinary, &status, &err);
    if ((!program) || (err!=CL_SUCCESS)) {
        printf("Error: Failed to create compute program from binary %d!\n", err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    else {
        printf("Succeed to create compute program from binary %d!\n", err);
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    else {
        printf("Succeed to build program executable!\n");
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "mmult", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    else {
        printf("Succeed to create compute kernel!\n");
    }

    // Create the input and output arrays in device memory for our calculation
    //
    input_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(int) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * RESULT_SIZE, NULL, NULL);
    if (!input_a || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    else {
        printf("Succeed to allocate device memory!\n");
    }

    // set up socket
    printf("\n************* Welcome to UCLA FPGA agent! **********\n");
    struct sockaddr_in stSockAddr;
    int SocketFD = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);

    if(-1 == SocketFD) {
        perror("can not create socket");
        exit(EXIT_FAILURE);
    }

    memset(&stSockAddr, 0, sizeof(stSockAddr));

    stSockAddr.sin_family = AF_INET;
    stSockAddr.sin_port = htons(7000);
    stSockAddr.sin_addr.s_addr = htonl(INADDR_ANY);

    if(-1 == bind(SocketFD,(struct sockaddr *)&stSockAddr, sizeof(stSockAddr))) {
        perror("error bind failed");
        close(SocketFD);
        exit(EXIT_FAILURE);
    }

    if(-1 == listen(SocketFD, 10)) {
        perror("error listen failed");
        close(SocketFD);
        exit(EXIT_FAILURE);
    }


    int taskNum = -1;

    // polling setting
    timespec deadline;
    deadline.tv_sec = 0;
    deadline.tv_nsec = 100;

    // Get the start time
    timespec timer = tic( );
    timespec socListenTime = diff(timer, timer);
    timespec socSendTime = diff(timer, timer);
    timespec socRecvTime = diff(timer, timer);
    timespec exeTime = diff(timer, timer);

    bool broadcastFlag = false;

    int packet_buf[PACKET_SIZE];
    int time_buf[TIME_BUF_SIZE];

    while (true) {
        //printf("\n************* Got a new task! *************\n");
        timer = tic();

        int ConnectFD = accept(SocketFD, NULL, NULL);
        if (!broadcastFlag) {
            broadcastFlag = true;
            timer = tic();
        }

        // For profiling only
        //struct timeval  tv;
        //gettimeofday(&tv, NULL);
        //double time_in_mill = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000 ; // convert tv_sec & tv_usec to millisecond
        //printf("Receive time (ms): %lf\n", time_in_mill);

        accTime (&socListenTime, &timer);

        if(0 > ConnectFD) {
            perror("error accept failed");
            close(SocketFD);
            exit(EXIT_FAILURE);
        }

        read(ConnectFD, &packet_buf, PACKET_SIZE * sizeof(int));

        // send FPGA stats back to java application
        if(packet_buf[0] == -1) {
            // for profiling use
            collect_timer_stats(ConnectFD, &socListenTime, &socSendTime, &socRecvTime, &exeTime, &timer);
            broadcastFlag = false;
            continue;
        }

        char* shm_addr;
        int shmid = -1;
        int data_size = -1;  // data sent to FPGA (unit: int)
        shmid = packet_buf[0];
        data_size = packet_buf[1];
        printf("Shmid: %d, Data size (# of int): %d\n", shmid, data_size);

        // shared memory
        if((shm_addr = (char *) shmat(shmid, NULL, 0)) == (char *) -1) {
            perror("Server: shmat failed.");
            exit(1);
        }
        //else
        //printf("Server: attach shared memory: %p\n", shm_addr);

        int done = 0;
        while(done == 0) {
            done = (int) *((int*)shm_addr);
            clock_nanosleep(CLOCK_REALTIME, 0, &deadline, NULL);
        }

        //printf("Copy data to the array in the host\n");
        a = (int *)(shm_addr + FLAG_NUM * sizeof(int));
        results = (int *)(shm_addr + FLAG_NUM * sizeof(int));

        accTime (&socSendTime, &timer);

        taskNum = a[2];
        for (int i=0; i<taskNum; i++) {
            int tmp = *(a+8+i*8+7);
            assert(tmp >=0 && tmp < TOTAL_TASK_NUMS);
        }
        printf("Task Num: %d\n", taskNum);

        //printf("\nparameter recieved --- \n");
        //Write our data set into the input array in device memory

        //printf("Write data from host to FPGA\n");
        err = clEnqueueWriteBuffer(commands, input_a, CL_TRUE, 0, sizeof(int) * data_size, a, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to write to source array a!\n");
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

        // Set the arguments to our compute kernel
        //
        err = 0;
        err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_a);
        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
        err |= clSetKernelArg(kernel, 2, sizeof(int), &taskNum);
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to set kernel arguments! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

        // Execute the kernel over the entire range of our 1d input data set
        // using the maximum number of work group items for this device
        //

        //printf("Enqueue Task\n");
        err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
        if (err)
        {
            printf("Error: Failed to execute kernel! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

        // Read back the results from the device to verify the output
        //
        cl_event readevent;
        //printf("Enqueue read buffer\n");
        err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(int) * FPGA_RET_PARAM_NUM * taskNum, results, 0, NULL, &readevent );
        if (err != CL_SUCCESS)
        {
            printf("Error: Failed to read output array! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

        //printf("Wait for FPGA results\n");
        clWaitForEvents(1, &readevent);
        accTime(&exeTime, &timer);

        // Get the execution time
        //toc(&timer);

        // put data back to shared memory
        //printf("Put data back to the shared memory\n");
        *((int*)(shm_addr + sizeof(int))) = DONE;

        //printf("\n************* Task finished! *************\n");

        if (-1 == shutdown(ConnectFD, SHUT_RDWR)) {
            perror("can not shutdown socket");
            close(ConnectFD);
            close(SocketFD);
            exit(EXIT_FAILURE);
        }
        close(ConnectFD);

        //printf("done\n");

        // free the shared memory
        shmdt(shm_addr);
        //shmctl(shmid, IPC_RMID, 0);

        accTime(&socRecvTime, &timer);

        printf("**********timing begin**********\n");
        printTimeSpec(socListenTime);
        printTimeSpec(socSendTime);
        printTimeSpec(socRecvTime);
        printTimeSpec(exeTime);
        printf("**********timing end**********\n\n");
    }

    close(SocketFD);

    // Shutdown and cleanup
    //
    clReleaseMemObject(input_a);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    return EXIT_SUCCESS;

}
Beispiel #9
0
int main(int argc, char** argv)
{
    int err;                            // error code returned from api calls
    int test_fail = 0;
    pgm_t input_img, output_img;

    IMG_DTYPE filter[FILTER_SIZE*FILTER_SIZE] = {-1, -1, -1, -1, 8, -1, -1, -1, -1};
    IMG_DTYPE *h_input;      // input image buffer
    IMG_DTYPE *hw_output;    // host buffer for device output
    IMG_DTYPE *sw_output;    // host buffer for reference output

    size_t global[2];                   // global domain size for our calculation
    size_t local[2];                    // local domain size for our calculation

    cl_platform_id platform_id;         // platform id
    cl_device_id device_id;             // compute device id
    cl_context context;                 // compute context
    cl_command_queue commands;          // compute command queue
    cl_program program;                 // compute program
    cl_kernel kernel;                   // compute kernel

    char cl_platform_vendor[1001];
    char cl_platform_name[1001];

    cl_mem d_in_image;                  // device buffer for input image
    cl_mem d_in_filter;                 // device buffer for filter kernel
    cl_mem d_out_image;                 // device buffer for filtered image

    printf("Application start\n");
    if (argc != 3) {
        printf("Usage: %s conv_2d.xclbin image_path/image_name.pgm\n", argv[0]);
        return EXIT_FAILURE;
    }

    int row, col, pix;
    // read the image and initialize the host buffer with that
    err = readPGM(&input_img, argv[2]);
    if(err < 0) {
        printf("Cound not read the image\n");
        return EXIT_FAILURE;
    }
    printf("Input image resolution = %xx%d\n", input_img.width, input_img.height);
    h_input = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); 
    hw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); 
    sw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); 
    for(pix = 0; pix < input_img.height*input_img.width; pix++) {
        h_input[pix] = input_img.buf[pix];
    }

    // Connect to first platform
    //
    err = clGetPlatformIDs(1,&platform_id,NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to find an OpenCL platform!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
    if (err != CL_SUCCESS) {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    printf("INFO: CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
    err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
    if (err != CL_SUCCESS) {
        printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    printf("INFO: CL_PLATFORM_NAME %s\n",cl_platform_name);

    // Connect to a compute device
    //
    err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR,
                         1, &device_id, NULL);
    if (err != CL_SUCCESS) {
            printf("Error: Failed to create a device group!\n");
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context) {
        printf("Error: Failed to create a compute context!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create a command commands
    //
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands) {
        printf("Error: Failed to create a command commands!\n");
        printf("Error: code %i\n",err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    int status;

    // Create Program Objects
    //

    // Load binary from disk
    unsigned char *kernelbinary;
    char *xclbin = argv[1];

    printf("INFO: loading xclbin %s\n", xclbin);
    int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
    if (n_i < 0) {
        printf("failed to load kernel from xclbin0: %s\n", xclbin);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    size_t n = n_i;

    // Create the compute program from offline
    program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                        (const unsigned char **) &kernelbinary, &status, &err);

    if ((!program) || (err!=CL_SUCCESS)) {
        printf("Error: Failed to create compute program0 from binary %d!\n", err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Build the program executable
    //
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "conv_2d", &err);
    if (!kernel || err != CL_SUCCESS) {
        printf("Error: Failed to create compute kernel!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Create the input and output arrays in device memory for our calculation
    //
    d_in_image = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL);
    d_in_filter = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, NULL, NULL);
    d_out_image = clCreateBuffer(context, CL_MEM_WRITE_ONLY,  sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL);
    if (!d_in_image || !d_in_filter || !d_out_image) {
        printf("Error: Failed to allocate device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Write the image from host buffer to device memory
    //
    err = clEnqueueWriteBuffer(commands, d_in_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.height*input_img.width, h_input, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to write to image to device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }
    // Write filter kernel into device buffer
    //
    err = clEnqueueWriteBuffer(commands, d_in_filter, CL_TRUE, 0, sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, filter, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to write to filter coeff into device memory!\n");
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Set the arguments to our compute kernel
    //
    int filter_size = FILTER_SIZE;
    IMG_DTYPE bias = 1;
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_in_image);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_in_filter);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_out_image);
    //err |= clSetKernelArg(kernel, 3, sizeof(int),    &filter_size);
    err |= clSetKernelArg(kernel, 3, sizeof(IMG_DTYPE),    &bias);
    if (err != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        printf("Test failed\n");
        return EXIT_FAILURE;
    }

    // Launch computation kernel
    global[0] = input_img.width * WORKGROUP_SIZE_0;
    global[1] = input_img.height * WORKGROUP_SIZE_1;
    local[0] = WORKGROUP_SIZE_0;
    local[1] = WORKGROUP_SIZE_1;

    err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL,
                                 (size_t*)&global, (size_t*)&local, 0, NULL, NULL);
    if (err) {
            printf("Error: Failed to execute kernel! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

    // Read back the results from the device to verify the output
    //
    cl_event readevent;
    err = clEnqueueReadBuffer( commands, d_out_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.width*input_img.height, hw_output, 0, NULL, &readevent
);
    if (err != CL_SUCCESS) {
            printf("Error: Failed to read output array! %d\n", err);
            printf("Test failed\n");
            return EXIT_FAILURE;
        }

    clWaitForEvents(1, &readevent);

    // Generate reference output
    int kr, kc;
    IMG_DTYPE sum = 0;
    for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) {
        for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) {
            sum = 0;
            for(kr = 0; kr < FILTER_SIZE; kr++) {
                for(kc = 0; kc < FILTER_SIZE; kc++ ) {
                    sum += (filter[kr*FILTER_SIZE + kc] * h_input[(row+kr)*input_img.width + col + kc]);
                }
            }
            sw_output[row*input_img.width + col] = sum + bias;
        }
    }
    // Check Results
    for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) {
        for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) {
             if(sw_output[row*input_img.width+col] != hw_output[row*input_img.width+col]){
                 printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n",
                     row, col, sw_output[row*input_img.width+col], hw_output[row*input_img.width+col]);
                 test_fail = 1;
             }
        }
    }
    printf("---------Input image-----------\n");
    //print_matrix(h_input, input_img.height, input_img.width);
    printf("---------Reference output------\n");
    //print_matrix(sw_output, input_img.height, input_img.width);
    printf("---------OCL Kernel output-----\n");
    //print_matrix(hw_output, input_img.height, input_img.width);

    // store the output image
    output_img.width = input_img.width;
    output_img.height = input_img.height;
    normalizeF2PGM(&output_img, hw_output);
    writePGM(&output_img, "../../../../fpga_output.pgm");
    //--------------------------------------------------------------------------
    // Shutdown and cleanup
    //--------------------------------------------------------------------------
    clReleaseMemObject(d_in_image);
    clReleaseMemObject(d_in_filter);
    clReleaseMemObject(d_out_image);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);

    destroyPGM(&input_img);
    if (test_fail) {
        printf("INFO: Test failed\n");
        return EXIT_FAILURE;
    } else {
        printf("INFO: Test passed\n");
    }
}
int main(int argc, char* argv[]){

	// Input Files //
	/* Intro Message */
                printf("\n");
                printf("A Program to Repack Protein Side-chains for Protein Docking Refinement Procedures\n");
                printf("Copyright (c) 2014, Structural Bioinformatics Laboratory, Boston University\n");
                printf("Author: Mohammad Moghadasi ([email protected]) \n");	

	if(argc!=10){
		printf("Usage:\n ./main \n   Complex_IN.pdb Complex_IN.psf Complex_IN.mol2 Complex_IN_Ligand.pdb\n   Libmol-param-file charmm-param-file.prm  charmm-rtf-file.rtf rotamer-library-binary-file.txt\n   Complex_OUT.pdb\n \n");
		exit(EXIT_FAILURE);
	}

        char* ifile           = argv[1];       //pdb file of both receptor and ligand
        char* psffile         = argv[2];         //charmm type psf file
        char* mol2file        = argv[3];         //mol2 file
        char* pdbfilelig      = argv[4];         //pdb file of ligand
        char* atom_prm_file   = argv[5];         //libmol parameter file
              prmfile         = argv[6];         //charmm type parameter file
              rtffile         = argv[7];         //charmm type connectivity file
        char* rotamer_library_file  = argv[8]; //rotamer raw library file
        char* ofile           = argv[9];         //output file
	
	// Filling the atom_group struct //

	struct prm *atomprm    = read_prm(atom_prm_file,_MOL_VERSION_);
	struct atomgrp* ag     = read_file_atomgrp(ifile, atomprm, -1);
	read_ff_charmm(psffile, prmfile, rtffile, ag);

	if(!read_hybridization_states_from_mol2(mol2file,ag)){
	    exit (EXIT_FAILURE);             
	}                 
        fix_acceptor_bases(ag,atomprm);

        struct List lig_list;
	read_fix(pdbfilelig,&lig_list.n,&lig_list.K);

	fixed_init(ag);
        fixed_update_unfreeze_all(ag);
        zero_grads(ag);
        fill_ingrp(ag);

	struct agsetup* ags;
        ags     = malloc(sizeof(struct agsetup));
        init_nblst(ag,ags);
        update_nblst(ag,ags);

	// Mark interface residues //
	
        int num_of_res_interface;
        int res_list_interface[ag->nres];
        mark_interface_residues(ag,ags,lig_list, lig_rec_dist ,&num_of_res_interface,res_list_interface);

	// Initialize side chain rotamer library  //
	
	//nrotCoef = 3;
	nrotCoef = 1;
	MAX_ROT = 245;
	MAX_RES = ag->nres;//needed for full_pack
        cutoff = 3;

	// Reslist // 

	struct ifres_list* reslist;
	ifres_list_malloc( &reslist );
	reslist->num_of_ifres = num_of_res_interface;
	for(int r = 0; r < reslist->num_of_ifres ; r++)
		reslist->ifres_num[r] = res_list_interface[r];

	// Library //

	char *rotamer_lib;
	load_file_to_memory(rotamer_library_file, &rotamer_lib);
	struct rot_info *rotinf;
	init_rotinf(ag, num_of_res_interface, res_list_interface, rotamer_lib, &rotinf);

	struct ifres_list* reslist_minor;
	ifres_list_malloc( &reslist_minor ) ;

	// MAIN FUNCITION // 
	//
	clock_t start, finish;
	start = clock();

	full_pack(ag,lig_list,rotinf,num_of_res_interface, reslist_minor);

	finish = clock();
	if(0) printf("Processing Time = %f\n",((double)(finish-start)/CLOCKS_PER_SEC));

	// Writing the atom_group into a PDB //

	write_pdb_traj_nopar(ag,ifile,ofile);
	
	// Free memory //
        Free_ifres_list( &reslist_minor );	
	free(rotamer_lib);

	return 0;
}
Beispiel #11
0
int main(int argc, char** argv)
{
  int err;                            // error code returned from api calls
     
  float a1[DATA_SIZE1];               // original data set given to device
  float b1[FILTER_SIZE1];             // original data set given to device
  float c1[OUTPUT_SIZE1];
  float results1[OUTPUT_SIZE1];       // results returned from device
  float sw_results1[OUTPUT_SIZE1];     // results returned from device

  unsigned int correct;               // number of correct results returned

  size_t global[2];                   // global domain size for our calculation
  size_t local[2];                    // local domain size for our calculation

  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel
   
  char cl_platform_vendor[1001];
  char cl_platform_name[1001];
   
  cl_mem input_a;                     // device memory used for the input array
  cl_mem input_b;                     // device memory used for the input array
  cl_mem output;                      // device memory used for the output array
   
  if (argc != 2){
    printf("%s <inputfile>\n", argv[0]);
    return EXIT_FAILURE;
  }

  // Fill our data sets with pattern
  //
  int i = 0;
  for(i = 0; i < DATA_SIZE1; i++) {
    a1[i] = (float)1;
  }
  for(i = 0; i < OUTPUT_SIZE1; i++) {
    results1[i] = 0;
    sw_results1[i] = FILTER_SIZE1;
  }
  for(i = 0; i < FILTER_SIZE1; i++) {
    b1[i] = (float)1;
  }
  for(i = 0; i < OUTPUT_SIZE1; i++) {
    c1[i] = (float)0;
  }

  // Connect to first platform
  //
  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);
 
  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  
  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create a command commands
  //
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char *xclbin=argv[1];
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "conv3_layer", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the input and output arrays in device memory for our calculation
  //
  input_a = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * DATA_SIZE1, NULL, NULL);
  input_b = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(float) * FILTER_SIZE1, NULL, NULL);
  output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * OUTPUT_SIZE1, NULL, NULL);
  if (!input_a || !input_b || !output)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }    
    
  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, input_a, CL_TRUE, 0, sizeof(float) * DATA_SIZE1, a1, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array a!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Write our data set into the input array in device memory 
  //
  err = clEnqueueWriteBuffer(commands, input_b, CL_TRUE, 0, sizeof(float) * FILTER_SIZE1, b1, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array b!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  err = clEnqueueWriteBuffer(commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, c1, 0, NULL, NULL);
    
  // Set the arguments to our compute kernel
  //
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_a);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_b);
  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  global[0] = MATRIX_RANK;
  global[1] = MATRIX_RANK;
  local[0] = MATRIX_RANK;
  local[1] = MATRIX_RANK;
  err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, 
                               (size_t*)&global, (size_t*)&local, 0, NULL, NULL);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, results1, 0, NULL, &readevent );  
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output array! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  clWaitForEvents(1, &readevent);
    
  printf("A\n");
  for (i=0;i<DATA_SIZE1;i++) {
    printf("%f ",a1[i]);
    if (((i+1) % NUM_DATA_ROWS) == 0)
      printf("\n");
  }
  printf("B\n");
  for (i=0;i< FILTER_SIZE1;i++) {
    printf("%f ",b1[i]);
    if (((i+1) % NUM_MASK_ROWS) == 0)
      printf("\n");
  }
  printf("res\n");
  for (i=0;i< OUTPUT_SIZE1;i++) {
    printf("%f ",results1[i]);
    if (((i+1) % NUM_OUT_ROWS) == 0)
      printf("\n");
  }
    
  // Validate our results
  //
  correct = 0;
  /* for(i = 0; i < OUTPUT_SIZE1; i++)
  {
    int row = i/MATRIX_RANK;
    int col = i%MATRIX_RANK;
    int running = 0;
    int index;
    for (index=0;index<MATRIX_RANK;index++) {
      int aIndex = row*MATRIX_RANK + index;
      int bIndex = col + index*MATRIX_RANK;
      running += a[aIndex] * b[bIndex];
    }
    sw_results[i] = running;
    }*/
    
  for (i = 0;i < OUTPUT_SIZE1; i++) 
    if(results1[i] == sw_results1[i])
      correct++;
  printf("Software\n");
  for (i=0;i<OUTPUT_SIZE1;i++) {
    //printf("%0.2f ",sw_results[i]);
    printf("%f ",sw_results1[i]);
    if (((i+1) % NUM_OUT_ROWS) == 0)
      printf("\n");
  }
    
    
  // Print a brief summary detailing the results
  //
  printf("Computed '%d/%d' correct values!\n", correct, OUTPUT_SIZE1);
    
  // Shutdown and cleanup
  //
  clReleaseMemObject(input_a);
  clReleaseMemObject(input_b);
  clReleaseMemObject(output);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

  if(correct == OUTPUT_SIZE1){
    printf("Test passed!\n");
    return EXIT_SUCCESS;
  }
  else{
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
}
Beispiel #12
0
int setup (char *configFile) {

  struct simpleLinkedList *rSet;
  char *location, *conf, *sql;

  printf("entering setup\n");

  // Defaults
  VERBOSITY = DEBUGM;
  LOG_DIR = o_printf("%s/log/opendias", VAR_DIR);

  // Get 'DB' location
  if (configFile != NULL) {
    conf = o_strdup(configFile);
  }
  else {
    conf = o_printf("%s/opendias/opendias.conf", ETC_DIR);
    if( 0 != access(conf, F_OK) ) {
      o_log(INFORMATION, "Config not in GNU location: %s. Attempting system config dir /etc/opendias/opendias.conf", conf);
      free(conf);
      conf = o_strdup("/etc/opendias/opendias.conf");
    }
  }

  o_log(INFORMATION, "|Using config file: %s", conf);
  if( 0 == load_file_to_memory(conf, &location) ) {
    o_log(ERROR, "|Cannot find main config file: %s", conf);
    free(location);
    free(conf);
    return 1;
  }
  free(conf);

  chop(location);
  BASE_DIR = o_strdup(location);
  o_log(INFORMATION, "|Which says the database is at: %s", BASE_DIR);

  // Open (& maybe update) the database.
  if(connect_db (1)) { // 1 = create if required
    free(BASE_DIR);
    free(location);
    return 1;
  }
  free(location);

  o_log(INFORMATION, "|Current config is: ");
  sql = o_strdup("SELECT config_option, config_value FROM config");

  rSet = runquery_db(sql, NULL);
  if( rSet != NULL ) {
    do {
      char *config_option, *config_value;
      config_option = o_strdup(readData_db(rSet, "config_option"));
      config_value = o_strdup(readData_db(rSet, "config_value"));

      if ( config_option == NULL || config_value == NULL ) {
        printf("either option or value is NULL\n");
      } 
      else {
        //o_log(INFORMATION, "    %s = %s", config_option, config_value);
        //remark: the pipe in the message causes o_log i_o_log to crash
        //	caused by debug.c i_o_log by double use of vprintf
        o_log(INFORMATION, "|    %s = %s", config_option, config_value);
      }

      if( 0 == strcmp(config_option, "log_verbosity") ) {
        VERBOSITY = atoi(config_value);
      }

      free(config_option);
      free(config_value);
    } while ( nextRow( rSet ) );
  }
  free_recordset( rSet );
  free(sql);

  return 0;
}
Beispiel #13
0
struct cl_package initFPGA( const char* xclbin, const char* kernel_name )
{
	/*****************************************/
	/* Initialize OpenCL */
	/*****************************************/

	// Retrieve the number of platforms
    cl_uint numPlatforms = 0;
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);

	//printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status);
 
    // Allocate enough space for each platform
    cl_platform_id *platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id));
 
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
	if (status != CL_SUCCESS)
		printf("clGetPlatformIDs error(%d)\n", status);
	
	// Retrieve the number of devices
    cl_uint numDevices = 0;
#ifndef FPGA_DEVICE
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
#else
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices);
#endif
	printf("Found %d devices support OpenCL.\n", numDevices);

    // Allocate enough space for each device
    cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id));

    // Fill in the devices 
#ifndef FPGA_DEVICE
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
#else
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL);
#endif
	
	if (status != CL_SUCCESS)
		printf("clGetDeviceIDs error(%d)\n", status);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
	if (status != CL_SUCCESS)
		printf("clCreateContext error(%d)\n", status);


	//Create a command-queue
	cl_command_queue clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status);

	if (status != CL_SUCCESS)
		printf("clCreateCommandQueue error(%d)\n", status);

	// 6. Load and build OpenCL kernel
	
#ifndef FPGA_DEVICE
	// Create a program with source code
    cl_program program = clCreateProgramWithSource(context, 1, 
        (const char**)&logistic_cl, NULL, &status);
	if (status != 0)
		printf("clCreateProgramWithSource error(%d)\n", status);

    // Build (compile) the program for the device
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
#else
	// Load binary from disk
	unsigned char *kernelbinary;
	printf("loading %s\n", xclbin);
	int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
	if (n_i < 0) {
		printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
		exit(1);
	}
	size_t n_bit = n_i;

	// Create the compute program from offline
	cl_program program = clCreateProgramWithBinary(context, 1, &devices[0], &n_bit,
			(const unsigned char **) &kernelbinary, NULL, &status);
	if ((!program) || (status != CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n", status);
		exit(1);
	}

	// Build the program executable
	status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
#endif

	if (status != 0) {
		char errmsg[2048];
		size_t sizemsg = 0;

		status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg);

		printf("clBuildProgram error(%d)\n", status);
		printf("Compilation messages: \n %s", errmsg);
	}

	cl_kernel clKernel = clCreateKernel(program, kernel_name, &status);
	if (status != CL_SUCCESS)
		printf("clCreateKernel error(%d)\n", status);

	// TODO: parameterize the size of buffers
	cl_mem d_gradient = clCreateBuffer(context, CL_MEM_READ_WRITE, FEATURE_SIZE*LABEL_SIZE*GROUP_SIZE*sizeof(float), NULL, &status);
	if (status != CL_SUCCESS)
		printf("d_gradient clCreateBuffer error(%d)\n", status);

	cl_mem d_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, FEATURE_SIZE*LABEL_SIZE*sizeof(float), NULL, &status);
	if (status != CL_SUCCESS)
		printf("d_weights clCreateBuffer error(%d)\n", status);

	cl_mem d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, (FEATURE_SIZE+LABEL_SIZE)*CHUNK_SIZE*sizeof(float), NULL, &status);
	if (status != CL_SUCCESS)
		printf("d_data clCreateBuffer error(%d)\n", status);

    struct cl_package result;
    result.context = context;
    result.kernel = clKernel;
    result.commandQueue = clCommandQue;
    result.d_gradient = d_gradient;
    result.d_weights = d_weights;
    result.d_data = d_data;

    return result;
}
void BurstSort::parallelSort(std::ofstream& file){
	char* buffer = NULL;
	char* tmp;
	int* posArray = NULL;
	int entryLength = KEY_LENGTH + sizeof(char*);
	buffer = (char*) malloc(sizeof(char) * size * entryLength);
	posArray = (int*) malloc(sizeof(int) * (NODE_SIZE + 1));
	int pos = 0;
	posArray[0] = 0;
	for(int i = 0; i < NODE_SIZE; i++){
		for(int j = 0; j < nodes[i].used; j++){
			memcpy(buffer + pos * entryLength, nodes[i].entries[j], KEY_LENGTH * sizeof(char));
			memcpy(buffer + pos * entryLength + KEY_LENGTH, &nodes[i].entries[j], sizeof(char*));
			pos += sizeof(char);
		}
		posArray[i+1] = pos;
	}

	// OpenCL
	// Use this to check the output of each API call
    cl_int status;  
	cl_int numDevices = 1;
	
	// Connect to first platform
    cl_platform_id platform;
    status = clGetPlatformIDs(1, &platform, NULL);

	if (status != CL_SUCCESS) {
		printf("Error: Failed to find an OpenCL platform!\n");
		return -1;
	}
 
	char cBuffer[1024];
	clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(cBuffer), cBuffer, NULL);
	printf("CL_PLATFORM_VENDOR %s\n", cBuffer);

	clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
	printf("CL_PLATFORM_NAME %s\n", cBuffer);

    cl_device_id device;
	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL);

	if (status != CL_SUCCESS) {
		printf("Error: Failed to create a device group!\n");
		return -1;
	}

	cl_long maxBufferSize = 0;
	status = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxBufferSize, NULL);
	printf("max buffer size: %lld\n", maxBufferSize);

    // Create a context and associate it with the devices
    cl_context context;
    context = clCreateContext(NULL, numDevices, &device, NULL, NULL, &status);
	

	if (status != CL_SUCCESS) {
		printf("Error in creating context, code %d\n", status);
		return -1;
	}
    // Create a command queue and associate it with the device 
    cl_command_queue cmdQueue;
    cmdQueue = clCreateCommandQueue(context, device, 0, &status);

	if (status != CL_SUCCESS) {
		printf("Error in creating command queue for a device, code %d\n", status);
		return -1;
	}

	// Load binary from disk
	unsigned char *kernelbinary;
	char *xclbin = "sort_xiaohui.xclbin";
	printf("loading %s\n", xclbin);
	int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
	if (n_i < 0) {
		printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin);
		return -1;
	}
	size_t n_bit = n_i;

	// Create the compute program from offline
	cl_program program = clCreateProgramWithBinary(context, 1, &device, &n_bit,
			(const unsigned char **) &kernelbinary, NULL, &status);
	if ((!program) || (status != CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n", status);
		return -1;
	}

	// Build the program executable
	status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

	if (status != CL_SUCCESS) {
		size_t len;
		char buffer[2048];

		printf("Error: Failed to build program executable!\n");
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
		printf("%s\n", buffer);
		return -1;
	}

	// Create the vector addition kernel
    cl_kernel kernel;
    kernel = clCreateKernel(program, "sort", &status);


	cl_mem clPosArray;
	cl_mem clBuffer;
	clBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, 
		sizeof(char) * size * entryLength, NULL, &status);

	clPosArray = clCreateBuffer(context, CL_MEM_READ_ONLY, 
		sizeof(int) * (NODE_SIZE + 1), NULL, &status);
	
	status = clEnqueueWriteBuffer(cmdQueue, clPosArray, CL_FALSE, 
		0, sizeof(int) * (NODE_SIZE + 1),posArray, 0, NULL, NULL);

	status = clEnqueueWriteBuffer(cmdQueue, clBuffer, CL_FALSE, 
		0, sizeof(char) * size * entryLength, buffer, 0, NULL, NULL);


    // Associate the input and output buffers with the kernel 
	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clBuffer);

	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &clPosArray);

	int nodeSize = NODE_SIZE;
	status = clSetKernelArg(kernel, 2, sizeof(int), (void *)&nodeSize);

	status = clSetKernelArg(kernel, 3, sizeof(int), (void *)&entryLength);

	size_t globalWorkSize[1];   

	globalWorkSize[0] = NODE_SIZE;

    gettimeofday(&t1, NULL);
	// Execute the kernel for execution
    status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);

	if (status != CL_SUCCESS) {
		printf("Error in clEnqueue, code %d\n", status);
		return -1;
	}


    // Read the device output buffer to the host output array
	clEnqueueReadBuffer(cmdQueue, clBuffer, CL_TRUE, 0, 
		sizeof(char) * size * entryLength, buffer, 0, NULL, NULL);

    // Free OpenCL resources
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(cmdQueue);
	clReleaseMemObject(clBuffer);
	clReleaseMemObject(clPosArray);
	clReleaseContext(context);

    //print result
	for(int i = 0; i < size; i+= sizeof(char)){
		memcpy(&tmp,buffer + i * entryLength + KEY_LENGTH,sizeof(char*));
		file << tmp;
	}

    // Free host resources
	free(buffer);
	free(posArray);

	free(platforms);
	free(devices);

}
Beispiel #15
0
int deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree,
  unsigned tree_len, unsigned char* output, unsigned* out_len)
{
#define SDACCEL_WRAPPER
#ifdef SDACCEL_WRAPPER
  int err;                            // error code returned from api calls

  cl_platform_id platform_id;         // platform id
  cl_device_id device_id;             // compute device id 
  cl_context context;                 // compute context
  cl_command_queue commands;          // compute command queue
  cl_program program;                 // compute program
  cl_kernel kernel;                   // compute kernel
   
  char cl_platform_vendor[1001];
  char cl_platform_name[1001];

  err = clGetPlatformIDs(1,&platform_id,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to find an OpenCL platform!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor);
  err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  printf("CL_PLATFORM_NAME %s\n",cl_platform_name);

  // Connect to a compute device
  //
  int fpga = 0;
#if defined (FPGA_DEVICE)
  fpga = 1;
#endif
  err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to create a device group!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create a compute context 
  //
  context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context)
  {
    printf("Error: Failed to create a compute context!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create a command commands
  //
  commands = clCreateCommandQueue(context, device_id, 0, &err);
  if (!commands)
  {
    printf("Error: Failed to create a command commands!\n");
    printf("Error: code %i\n",err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  int status;

  // Create Program Objects
  //
  
  // Load binary from disk
  unsigned char *kernelbinary;
  char xclbin[]="deflate1.xclbin";
  printf("loading %s\n", xclbin);
  int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary);
  if (n_i < 0) {
    printf("failed to load kernel from xclbin: %s\n", xclbin);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  size_t n = n_i;
  // Create the compute program from offline
  program = clCreateProgramWithBinary(context, 1, &device_id, &n,
                                      (const unsigned char **) &kernelbinary, &status, &err);
  if ((!program) || (err!=CL_SUCCESS)) {
    printf("Error: Failed to create compute program from binary %d!\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Build the program executable
  //
  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    size_t len;
    char buffer[2048];

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
    printf("%s\n", buffer);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the compute kernel in the program we wish to run
  //
  kernel = clCreateKernel(program, "deflate259", &err);
  if (!kernel || err != CL_SUCCESS)
  {
    printf("Error: Failed to create compute kernel! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Create the input and output arrays in device memory for our calculation
//  void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree,
//    unsigned tree_len, unsigned char* output, unsigned* out_len)
  cl_mem input_arg, in_len_arg, tree_arg, tree_len_arg, output_arg, out_len_arg;
  input_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, CHUNK, NULL, NULL);
  in_len_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL);
  tree_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, 512, NULL, NULL);
  tree_len_arg = clCreateBuffer(context,  CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL);
  output_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, CHUNK*2, NULL, NULL);
  out_len_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(unsigned), NULL, NULL);

  if (!input_arg || !in_len_arg || !tree_arg || !tree_len_arg || !output_arg || !out_len_arg)
  {
    printf("Error: Failed to allocate device memory!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }    

  err = clEnqueueWriteBuffer(commands, input_arg, CL_TRUE, 0, in_len, input, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array input!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  err = clEnqueueWriteBuffer(commands, in_len_arg, CL_TRUE, 0, sizeof(unsigned), &in_len, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array &in_len!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  err = clEnqueueWriteBuffer(commands, tree_arg, CL_TRUE, 0, 512, tree, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array tree!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  err = clEnqueueWriteBuffer(commands, tree_len_arg, CL_TRUE, 0, sizeof(unsigned), &tree_len, 0, NULL, NULL);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to write to source array &tree_len!\n");
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Set the arguments to our compute kernel
//void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree,
//  unsigned tree_len, unsigned char* output, unsigned* out_len)
  err = 0;
  err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_arg);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_len_arg);
  err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &tree_arg);
  err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &tree_len_arg);
  err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_arg);
  err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &out_len_arg);
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Execute the kernel over the entire range of our 1d input data set
  // using the maximum number of work group items for this device
  //

#ifdef C_KERNEL
  err = clEnqueueTask(commands, kernel, 0, NULL, NULL);
#else
  size_t global[1];
  size_t local[1];
  global[0] = 1;
  local[0] = 1;
  err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, 
                               (size_t*)&global, (size_t*)&local, 0, NULL, NULL);
#endif
  if (err)
  {
    printf("Error: Failed to execute kernel! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  // Read back the results from the device to verify the output
  //
  cl_event readevent;
  unsigned out_len_b;
  err = clEnqueueReadBuffer( commands, out_len_arg, CL_TRUE, 0, sizeof(unsigned),
      &out_len_b, 0, NULL, &readevent );
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output length! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }

  clWaitForEvents(1, &readevent);
  *out_len = out_len_b;

  printf("Read final output length: %d\n", out_len_b);

  err = clEnqueueReadBuffer( commands, output_arg, CL_TRUE, 0, out_len_b, output, 0, NULL, &readevent );
  if (err != CL_SUCCESS)
  {
    printf("Error: Failed to read output data! %d\n", err);
    printf("Test failed\n");
    return EXIT_FAILURE;
  }
  clWaitForEvents(1, &readevent);
#endif
}
Beispiel #16
0
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
	int in, hid, out;
	float out_err, hid_err;
  
	in = net->input_n;
	hid = net->hidden_n;
	out = net->output_n;   

        //int use_device = 0;  // use CPU as device
	int use_device = 2;  // use GPU as device
        //int use_device = 2;  // use FPGA as device
	if(initialize(use_device)) return -1;
         
	int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	// read the kernel core source
	char * kernel_bp1  = "bpnn_layerforward_ocl";
	char * kernel_bp2  = "bpnn_adjust_weights_ocl";
	char * tempchar = "./backprop_kernel.cl";
        char * krnl_file = "./binary/backprop_kernel_default.xclbin";
 
        cl_int err = 0;
        cl_program prog;
        // create program from source
        if (use_device < 2 ) {
	    FILE * fp = fopen(tempchar, "rb"); 
	    if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
	    fread(source + strlen(source), sourcesize, 1, fp);
	    fclose(fp);
		
	    // compile kernel
	    err = 0;
	    const char * slist[2] = { source, 0 };
	    prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	    if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
        } 
        // create program from binary
        else {
            char *krnl_bin;
	    const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);

            err = 0;
            prog = clCreateProgramWithBinary(context, 1,
	                                    &device_list[0], &krnl_size,
	                                    (const unsigned char**) &krnl_bin,
	                                    NULL, &err);
            if ((!prog) || (err!=CL_SUCCESS)) {
		printf("Error: Failed to create compute program from binary %d!\n",
		       err);
		printf("Test failed\n");
		exit(EXIT_FAILURE);
	    }
        }
        
	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
	{ // show warnings/errors
		//static char log[65536]; memset(log, 0, sizeof(log));
		//cl_device_id device_id = 0;
		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
	}
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
    	
	cl_kernel kernel1;
	cl_kernel kernel2;
	kernel1 = clCreateKernel(prog, kernel_bp1, &err);  
        if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; }
	kernel2 = clCreateKernel(prog, kernel_bp2, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; }
	/* clReleaseProgram(prog); */
	
	float *input_weights_one_dim;
    float *input_weights_prev_one_dim;
	float * partial_sum;
	float sum;
	float num_blocks = in / BLOCK_SIZE;
	
	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
	
	// set global and local workitems
	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; 
	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
	
	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
	// todo: fix mem allocation
	int m = 0;
	for (int k = 0; k <= in; k++) {	
		for (int j = 0; j <= hid; j++) {
		input_weights_one_dim[m] = net->input_weights[k][j];
		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
	    m++;
		}
	}
	
	cl_mem input_hidden_ocl;
	cl_mem input_ocl;
	cl_mem output_hidden_ocl;
	cl_mem hidden_partial_sum;
	cl_mem hidden_delta_ocl;
	cl_mem input_prev_weights_ocl;
  
	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
		
	printf("Performing GPU computation\n");
	
	//write buffers
	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
 
	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
        if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
	if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }	
  
	for (int j = 1; j <= hid; j++) {
		sum = 0.0;
		for (int k = 0; k < num_blocks; k++) {	
		sum += partial_sum[k * hid + j-1] ;
    }
		sum += net->input_weights[0][j];
		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
	}

	
	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);  
	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);

	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
  
	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
  
	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
  
	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }	
	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }	
  
	clReleaseMemObject(input_ocl);
	clReleaseMemObject(output_hidden_ocl);
	clReleaseMemObject(input_hidden_ocl);
	clReleaseMemObject(hidden_partial_sum);
	clReleaseMemObject(input_prev_weights_ocl);
  
	free(input_weights_prev_one_dim);
	free(partial_sum);
	free(input_weights_one_dim);

}