int main(int argc, char** argv) { int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem in_array; // device memory used for the input array //cl_mem synaptic_weights; // device memory used for the input array cl_mem out_array; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return -1; } //float in_array[NO_NODES]; //float out_array[NO_NODES]; //float synaptic_weights[NO_NODES*NO_NODES]; float in_array_tb[NO_NODES]; float out_array_tb[NO_NODES]; //float synaptic_weights_tb[NO_NODES*NO_NODES]; float temp =0; int i = 0; int j = 0; int index = 0; FILE* ifp; char* mode = "r"; // // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return -1; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? 
CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return -1; } // // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return -1; } //relu_1(in_array,synaptic_weights,out_array); // Fill our data sets with pattern // //int i = 0; //for(i = 0; i < DATA_SIZE; i++) { // a[i] = (int)i; // b[i] = (int)i; // results[i] = 0; //} // // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return -1; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return -1; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); printf("err : %d %s\n",err,err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return -1; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "relu_1", &err); if (!kernel || err != 
CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return -1; } // Create the input and output arrays in device memory for our calculation // in_array = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES, NULL, NULL); //synaptic_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES * NO_NODES, NULL, NULL); out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL); if (!in_array || /*!synaptic_weights ||*/ !out_array) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return -1; } ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode); if(ifp == NULL) { printf("Input file not found \n"); return -1; } while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) { in_array_tb[index++] = temp; } index = 0; temp = 0; //ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode); //if(ifp == NULL) //{ // printf("Weight file not found \n"); // return -1; //} //while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) { // synaptic_weights_tb[index++] = temp; //} // // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return -1; } // Write our data set into the input array in device memory // //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) * NO_NODES * NO_NODES, synaptic_weights_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return -1; } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array); //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), 
&out_array); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return -1; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return -1; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return -1; } clWaitForEvents(1, &readevent); //printf("A\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",a[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("B\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",b[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("res\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // Validate our results // //correct = 0; //for(i = 0; i < DATA_SIZE; i++) //{ // int row = i/MATRIX_RANK; // int col = i%MATRIX_RANK; // int running = 0; // int index; // for (index=0;index<MATRIX_RANK;index++) { // int aIndex = row*MATRIX_RANK + index; // int bIndex = col + index*MATRIX_RANK; // running += a[aIndex] * b[bIndex]; // } // sw_results[i] = running; //} // //for (i = 0;i < DATA_SIZE; i++) // if(results[i] == sw_results[i]) // correct++; //printf("Software\n"); //for (i=0;i<DATA_SIZE;i++) { // //printf("%0.2f ",sw_results[i]); // printf("%d ",sw_results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // // //// Print a brief summary detailing the results //// //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE); // // Shutdown and cleanup int temp_ = 0; for (j = 0; j < NO_NODES; j++) { if (out_array_tb[j] >= 
0) // || out_array_tb[j]== 0) { //printf("out_array[%d] = %f \n", j, out_array[j]); temp_++; } } clReleaseMemObject(in_array); //clReleaseMemObject(synaptic_weights); clReleaseMemObject(out_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if (temp_ == NO_NODES) { printf("*********************************************************** \n"); printf("TEST PASSED !!!!!! The output matches the desired output. \n"); printf("*********************************************************** \n"); return EXIT_SUCCESS; } else { printf("**************************************************************** \n"); printf("TEST Failed !!!!!! The output does not match the desired output. \n"); printf("**************************************************************** \n"); return -1; } //if(correct == DATA_SIZE){ // printf("Test passed!\n"); // return EXIT_SUCCESS; //} //else{ // printf("Test failed\n"); // return -1; //} }
int main (int argc, char **argv) { int i; struct scanCallInfo infoData; int totbytes = 3359 * 4679; unsigned char *pic; (void) load_file_to_memory("./tmp2.pnm", &pic); // pic=(unsigned char *)malloc( totbytes+1 ); // for (i=0;i<totbytes;i++) pic[i]=255; infoData.language = (const char*)OCR_LANG_BRITISH; infoData.imagedata = (const unsigned char*)pic; infoData.bytes_per_pixel = 1; infoData.bytes_per_line = 3359; infoData.width = 3359; infoData.height = 4679; runocr(&infoData); printf("%s", infoData.ret); free(infoData.ret); free(pic); return 0; }
void Load(char* f) { vector<standardRecord> emptyVector; //Load file in memory int size=0; char *ptr=0; char* cursor=0; size=load_file_to_memory(f,&ptr); cursor=ptr; if(size>0&&ptr!=0) { cursor=ptr+24+(*(int*)(ptr+4)); while(cursor<(ptr+size)) cursor=LoopGRUP(cursor,0,0); } ParseLoadedData(); delete ptr; //Trick to clear used ram by the vector recordPointers.swap(emptyVector); recordPointers.clear(); }
/*
 * Loads a precompiled kernel binary from krnl_file, creates and builds an
 * OpenCL program from it on world.context / world.device_id, and returns
 * the kernel named krnl_name.
 *
 * Any failure prints a diagnostic and terminates the process with
 * EXIT_FAILURE, so the function only returns a valid kernel.  The program
 * object is intentionally not released (see the comment near the end).
 */
cl_kernel xcl_import_binary(xcl_world world, const char *krnl_file, const char *krnl_name)
{
    int err;
    char *krnl_bin = NULL;

    /* Keep the result as int first: a negative failure code stored directly
     * in a size_t would turn into a huge bogus binary length. */
    const int krnl_len = load_file_to_memory(krnl_file, &krnl_bin);
    if (krnl_len < 0) {
        printf("failed to load kernel from xclbin: %s\n", krnl_file);
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }
    const size_t krnl_size = (size_t) krnl_len;

    cl_program program = clCreateProgramWithBinary(world.context, 1,
                                                   &world.device_id, &krnl_size,
                                                   (const unsigned char**) &krnl_bin,
                                                   NULL, &err);
    if ((!program) || (err != CL_SUCCESS)) {
        printf("Error: Failed to create compute program from binary %d!\n", err);
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t len;
        char buffer[2048];

        buffer[0] = '\0';   /* keep the printf below defined if the query fails */
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, world.device_id, CL_PROGRAM_BUILD_LOG,
                              sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    cl_kernel kernel = clCreateKernel(program, krnl_name, &err);
    if (!kernel || err != CL_SUCCESS) {
        printf("Error: Failed to create kernel for %s: %d\n", krnl_name, err);
        printf("Test failed\n");
        exit(EXIT_FAILURE);
    }

    /* if program is released, then EnqueueNDRangeKernel fails with
     * INVALID_KERNEL */
    /* clReleaseProgram(program); */

    free(krnl_bin);

    return kernel;
}
void read_cl_file(char** argv) #endif { #if OPENCL_DEVICE_SELECTION!=CL_DEVICE_TYPE_ACCELERATOR // Load the kernel source code into the array source_str fp = fopen("jacobi1D_gpu_ghost.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); #else printf("loading %s\n", argv[1]); source_size = load_file_to_memory(argv[1], (char **) &source_str); if (source_size < 0) { printf("failed to load kernel from xclbin: %s\n", argv[1]); } #endif }
///////////////////////////////////////////////////////// // Program main ///////////////////////////////////////////////////////// int main(int argc, char** argv) { int err = 0; int passed = 0; // timer structs double elapsed = 0; srand(time(NULL)); int N = 4; char dir[100] = "./data"; if (argc>1) N = atoi(argv[1]); //if (argc>2) // strcpy(dir, argv[2]); // Allocate matrices and vectors float *A = (float *) malloc(N*N*sizeof(float)); float *A0 = (float *) malloc(N*N*sizeof(float)); float *b = (float *) malloc(N*sizeof(float)); float *b0 = (float *) malloc(N*sizeof(float)); // ADDED; original b matrix before permutations float *L = (float *) malloc(N*N*sizeof(float)); float *x = (float *) malloc(N*sizeof(float)); float *y = (float *) malloc(N*sizeof(float)); float *Acurr = (float *) malloc(N*sizeof(float)); int i, j; // Initialize A and b for(i = 0; i < N; i++) { for(j = 0; j < N; j++) { float r = (float) rand(); if(r > RAND_MAX/2) A[i*N+j] = A0[i*N+j] = -(r-RAND_MAX/2)/(RAND_MAX/2); else A[i*N+j] = A0[i*N+j] = r/(RAND_MAX/2); } float r = (float) rand(); if(r > RAND_MAX/2) b[i] = b0[i] = -(r-RAND_MAX/2)/(RAND_MAX/2); else b[i] = b0[i] = r/(RAND_MAX/2); } // Initialize L matrix, x,y vectors // Added to ensure initial values are 0 for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { L[i*N+j] = 0; } y[i] = 0; x[i] = 0; Acurr[i] = 0; } // TEST A AND b MANUAL GENERATION /* for(i = 0; i < N; i++) { for(j = 0; j < N; j++) { if (i == j) A[i*N+j] = A0[i*N+j] = 1; else A[i*N+j] = A0[i*N+j] = 0; } b[i] = b0[i] = (float) i/(10.0); } // END GENERATION */ //show_matrix(A,0,N); // 1. 
allocate host memory for matrices A and B int width_A, width_A0, width_L, height_A, height_A0, height_L, height_b, height_b0, height_x, height_y, width_Acurr; width_A = width_A0 = width_L = height_A = height_A0 = height_L = height_b = height_b0 = height_x = height_y = width_Acurr = N; unsigned int size_A = width_A * height_A; unsigned int size_A0 = width_A0 * height_A0; unsigned int size_L = width_L * height_L; unsigned int size_b = height_b; unsigned int size_b0 = height_b0; unsigned int size_x = height_x; unsigned int size_y = height_y; unsigned int size_Acurr = width_Acurr; unsigned int mem_size_A = sizeof(float) * size_A; unsigned int mem_size_A0 = sizeof(float) * size_A0; unsigned int mem_size_L = sizeof(float) * size_L; unsigned int mem_size_b = sizeof(float) * size_b; unsigned int mem_size_b0 = sizeof(float) * size_b0; unsigned int mem_size_x = sizeof(float) * size_x; unsigned int mem_size_y = sizeof(float) * size_y; unsigned int mem_size_Acurr = sizeof(float) * size_Acurr; // Host pointers float* h_A = A; float* h_L = L; float* h_b = b; float* h_x = x; float* h_y = y; float* h_Acurr = Acurr; // 5. 
Initialize OpenCL cl_command_queue clCommandQue; cl_program program; cl_kernel clKernel; size_t dataBytes; size_t kernelLength; cl_int status; /*****************************************/ /* Initialize OpenCL */ /*****************************************/ // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); //printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) printf("clGetPlatformIDs error(%d)\n", status); // Retrieve the number of devices cl_uint numDevices = 0; #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices); #endif printf("Found %d devices support OpenCL.\n", numDevices); // Allocate enough space for each device cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL); #endif if (status != CL_SUCCESS) printf("clGetDeviceIDs error(%d)\n", status); // GET MAX DEVICE LOCAL MEMORY SIZE //cl_ulong mem_size; //clGetDeviceInfo(devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL); //printf("CL_DEVICE_LOCAL_MEM_SIZE: %d KB\n", (unsigned int)(mem_size / 1024)); // GET MAX NUMBER OF WORK ITEMS PER DIMENSION //size_t workitem_size[3]; //cl_int ret = clGetDeviceInfo(devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); //printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: %d / %d / %d\n", workitem_size[0], 
workitem_size[1], workitem_size[2]); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); if (status != CL_SUCCESS) printf("clCreateContext error(%d)\n", status); // OpenCL device memory for matrices cl_mem d_A; cl_mem d_L; cl_mem d_b; cl_mem d_x; cl_mem d_y; cl_mem d_Acurr; //Create a command-queue clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status); if (status != CL_SUCCESS) printf("clCreateCommandQueue error(%d)\n", status); // Setup device memory d_x = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_x, NULL, &status); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &status); d_L = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_L, h_L, &status); d_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_b, h_b, &status); d_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_y, h_y, &status); d_Acurr = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_Acurr, h_Acurr, &status); #ifndef FPGA_DEVICE // WE CAN'T USE THIS UNLESS WE MAKE A HEADER FILE WITH A GIANT STRING OF THE KERNEL PROGRAM // Create a program with source code program = clCreateProgramWithSource(context, 1, (const char**)&lu259_cl, NULL, &status); if (status != 0) printf("clCreateProgramWithSource error(%d)\n", status); // Build (compile) the program for the device status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); #else // Load binary from disk unsigned char *kernelbinary; char *xclbin = argv[2]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); printf("done loading\n"); if (n_i < 0) { printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin); return -1; } size_t n_bit = n_i; printf("creating program with binary\n"); // Create the compute program from offline program = 
clCreateProgramWithBinary(context, 1, &devices[0], &n_bit, (const unsigned char **) &kernelbinary, NULL, &status); if ((!program) || (status != CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", status); return -1; } printf("done creating program with binary\n"); printf("building program\n"); // Build the program executable status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); printf("done building program\n"); #endif if (status != 0) { char errmsg[2048]; size_t sizemsg = 0; status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg); printf("clBuildProgram error(%d)\n", status); printf("Compilation messages: \n %s", errmsg); } clKernel = clCreateKernel(program, "LUFact", &status); if (status != CL_SUCCESS) printf("clCreateKernel error(%d)\n", status); // 7. Launch OpenCL kernel //size_t localWorkSize[2], globalWorkSize[2]; size_t localWorkSize[1], globalWorkSize[1]; int width_matrix = width_A; int height_vector = height_x; status = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_x); status |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); status |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_L); status |= clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_b); status |= clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&d_y); status |= clSetKernelArg(clKernel, 5, sizeof(cl_mem), (void *)&d_Acurr); status |= clSetKernelArg(clKernel, 6, sizeof(int), (void *)&N); //status |= clSetKernelArg(clKernel, 6, sizeof(int), (void *)&height_vector); if (status != CL_SUCCESS) printf("clSetKernelArg error(%d)\n", status); //localWorkSize[0] = BLOCK_SIZE; //localWorkSize[1] = BLOCK_SIZE; //globalWorkSize[0] = width_A; //globalWorkSize[1] = height_A; localWorkSize[0] = N;//(N)/BLOCK_SIZE; globalWorkSize[0] = N;//(N*N)/BLOCK_SIZE; // start timer clock_t start = clock(); status = clEnqueueWriteBuffer(clCommandQue, d_A, CL_FALSE, 0, mem_size_A, h_A, 0, NULL, 
NULL); status = clEnqueueWriteBuffer(clCommandQue, d_L, CL_FALSE, 0, mem_size_L, h_L, 0, NULL, NULL); status = clEnqueueWriteBuffer(clCommandQue, d_b, CL_FALSE, 0, mem_size_b, h_b, 0, NULL, NULL); status = clEnqueueWriteBuffer(clCommandQue, d_y, CL_FALSE, 0, mem_size_y, h_y, 0, NULL, NULL); status = clEnqueueWriteBuffer(clCommandQue, d_Acurr, CL_FALSE, 0, mem_size_Acurr, h_Acurr, 0, NULL, NULL); printf("Enter the dragon\n"); status = clEnqueueNDRangeKernel(clCommandQue, clKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (status != CL_SUCCESS) printf("clEnqueueNDRangeKernel error(%d)\n", status); printf("Exit the dragon\n"); // 8. Retrieve result from device status = clEnqueueReadBuffer(clCommandQue, d_x, CL_TRUE, 0, mem_size_x, h_x, 0, NULL, NULL); printf("HERE1\n"); //status = clEnqueueReadBuffer(clCommandQue, d_A, CL_TRUE, 0, mem_size_A, h_A, 0, NULL, NULL); //status = clEnqueueReadBuffer(clCommandQue, d_L, CL_TRUE, 0, mem_size_L, h_L, 0, NULL, NULL); printf("HERE2\n"); if (status != CL_SUCCESS) printf("clEnqueueReadBuffer error(%d)\n", status); printf("HERE3\n"); //show_matrix(A,0,N); //show_matrix(L,0,N); printf("HERE4\n"); // TEMPORARILY ADDED IN FOR DEBUGGING PURPOSES /*for(i = 0; i < N; i++) { float yi = b[i]; for(j = 0; j < i; j++) { yi -= L[i*N+j]*y[j]; } y[i] = yi; } // Use back substitution to solve Ux = y for(i = N-1; i >= 0; i--) { float xi = y[i]; for(j = i+1; j < N; j++) xi -= A[i*N+j]*x[j]; x[i] = xi/A[i*N+i]; } // END TEMPORARILY ADDED IN */ printf("HERE5\n"); //show_matrix(b,0,N); //show_matrix(b0,0,N); //show_matrix(x,0,N); printf("So far so good\n"); // stop timer clock_t end = clock(); elapsed += ((double)(end-start)) / CLOCKS_PER_SEC; printf("LU decomposition done. Now to check\n"); // Check result float error = 0; for(i = 0; i < N; i++) { float b_res = 0; for(j = 0; j < N; j++) b_res += A0[i*N+j] * x[j]; if ((b_res - 0.1) < b0[i] || (b_res + 0.1) > b0[i]) b_res = b0[i]; error += b_res > b0[i] ? 
b_res-b0[i] : b0[i]-b_res; //printf("b_res is: %f\n", b_res); } float epsilonPerRow = 0.01; if(error < N*epsilonPerRow) passed++; printf("%d of %d tests passed\n", passed, ITER); printf("Average time: %.2f seconds\n", elapsed/ITER); // 10. clean up memory free(A0); free(b0); free(h_A); free(h_L); free(h_b); free(h_x); free(h_y); free(h_Acurr); clReleaseMemObject(d_A); clReleaseMemObject(d_L); clReleaseMemObject(d_b); clReleaseMemObject(d_x); clReleaseMemObject(d_y); clReleaseMemObject(d_Acurr); free(devices); clReleaseContext(context); clReleaseKernel(clKernel); clReleaseProgram(program); clReleaseCommandQueue(clCommandQue); }
void init_device(int concurrent){ int err; char cl_platform_vendor[1001]; char cl_platform_name[1001]; h_input = (cl_int *) malloc(sizeof(cl_int)*REC_N); h_output = (cl_int *) malloc(sizeof(cl_int)*REC_N); err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); exit(1); } err = clGetPlatformInfo(platform_id, CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); exit(1); } err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); exit(1); } int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); exit(1); } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); exit(1); } // Create a command commands // if (concurrent) { commands = clCreateCommandQueue(context, device_id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); } else { commands = clCreateCommandQueue(context, device_id, 0, &err); } if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); exit(1); } int status; unsigned char *kernelbinary; char xclbin[] = "pipe.xclbin"; int n_i= load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin\n"); printf("Test failed\n"); exit(1); } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, 
&device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); exit(1); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); exit(1); } // Create the compute kernel in the program we wish to run // kernel_in = clCreateKernel(program, "kernel_in", &err); if (!kernel_in || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel_in! %d\n", err); printf("Test failed\n"); exit(1); } kernel_inter = clCreateKernel(program, "kernel_inter", &err); if (!kernel_inter || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel_inter! %d\n", err); printf("Test failed\n"); exit(1); } kernel_out = clCreateKernel(program, "kernel_out", &err); if (!kernel_out || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel_out! %d\n", err); printf("Test failed\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int)*REC_N, NULL, NULL); d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_int)*REC_N, NULL, NULL); if (!d_input || !d_output) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel_in, 0, sizeof(cl_mem), &d_input); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel argument d_input! 
%d\n", err); printf("Test failed\n"); exit(1); } err = 0; err = clSetKernelArg(kernel_out, 0, sizeof(cl_mem), &d_output); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel argument d_output! %d\n", err); printf("Test failed\n"); exit(1); } }
int main(int argc, char** argv) { int err; // error code returned from api calls int* a = NULL; // input pointer int* results = NULL; // output pointer unsigned int correct; // number of correct results returned size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem input_a; // device memory used for the input array //cl_mem input_b; // device memory used for the input array cl_mem output; // device memory used for the output array int inc; double t_start, t_end; if (argc != 2) { printf("%s <inputfile>\n", argv[0]); return EXIT_FAILURE; } // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? 
CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } else { printf("Succeed to load kernel from xclbin: %s\n", xclbin); } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } else { printf("Succeed to create compute program from binary %d!\n", err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; } else { printf("Succeed to build program executable!\n"); } // Create the compute kernel in the program we wish to run // kernel = 
clCreateKernel(program, "mmult", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return EXIT_FAILURE; } else { printf("Succeed to create compute kernel!\n"); } // Create the input and output arrays in device memory for our calculation // input_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * DATA_SIZE, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * RESULT_SIZE, NULL, NULL); if (!input_a || !output) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } else { printf("Succeed to allocate device memory!\n"); } // set up socket printf("\n************* Welcome to UCLA FPGA agent! **********\n"); struct sockaddr_in stSockAddr; int SocketFD = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); if(-1 == SocketFD) { perror("can not create socket"); exit(EXIT_FAILURE); } memset(&stSockAddr, 0, sizeof(stSockAddr)); stSockAddr.sin_family = AF_INET; stSockAddr.sin_port = htons(7000); stSockAddr.sin_addr.s_addr = htonl(INADDR_ANY); if(-1 == bind(SocketFD,(struct sockaddr *)&stSockAddr, sizeof(stSockAddr))) { perror("error bind failed"); close(SocketFD); exit(EXIT_FAILURE); } if(-1 == listen(SocketFD, 10)) { perror("error listen failed"); close(SocketFD); exit(EXIT_FAILURE); } int taskNum = -1; // polling setting timespec deadline; deadline.tv_sec = 0; deadline.tv_nsec = 100; // Get the start time timespec timer = tic( ); timespec socListenTime = diff(timer, timer); timespec socSendTime = diff(timer, timer); timespec socRecvTime = diff(timer, timer); timespec exeTime = diff(timer, timer); bool broadcastFlag = false; int packet_buf[PACKET_SIZE]; int time_buf[TIME_BUF_SIZE]; while (true) { //printf("\n************* Got a new task! 
*************\n"); timer = tic(); int ConnectFD = accept(SocketFD, NULL, NULL); if (!broadcastFlag) { broadcastFlag = true; timer = tic(); } // For profiling only //struct timeval tv; //gettimeofday(&tv, NULL); //double time_in_mill = (tv.tv_sec) * 1000 + (tv.tv_usec) / 1000 ; // convert tv_sec & tv_usec to millisecond //printf("Receive time (ms): %lf\n", time_in_mill); accTime (&socListenTime, &timer); if(0 > ConnectFD) { perror("error accept failed"); close(SocketFD); exit(EXIT_FAILURE); } read(ConnectFD, &packet_buf, PACKET_SIZE * sizeof(int)); // send FPGA stats back to java application if(packet_buf[0] == -1) { // for profiling use collect_timer_stats(ConnectFD, &socListenTime, &socSendTime, &socRecvTime, &exeTime, &timer); broadcastFlag = false; continue; } char* shm_addr; int shmid = -1; int data_size = -1; // data sent to FPGA (unit: int) shmid = packet_buf[0]; data_size = packet_buf[1]; printf("Shmid: %d, Data size (# of int): %d\n", shmid, data_size); // shared memory if((shm_addr = (char *) shmat(shmid, NULL, 0)) == (char *) -1) { perror("Server: shmat failed."); exit(1); } //else //printf("Server: attach shared memory: %p\n", shm_addr); int done = 0; while(done == 0) { done = (int) *((int*)shm_addr); clock_nanosleep(CLOCK_REALTIME, 0, &deadline, NULL); } //printf("Copy data to the array in the host\n"); a = (int *)(shm_addr + FLAG_NUM * sizeof(int)); results = (int *)(shm_addr + FLAG_NUM * sizeof(int)); accTime (&socSendTime, &timer); taskNum = a[2]; for (int i=0; i<taskNum; i++) { int tmp = *(a+8+i*8+7); assert(tmp >=0 && tmp < TOTAL_TASK_NUMS); } printf("Task Num: %d\n", taskNum); //printf("\nparameter recieved --- \n"); //Write our data set into the input array in device memory //printf("Write data from host to FPGA\n"); err = clEnqueueWriteBuffer(commands, input_a, CL_TRUE, 0, sizeof(int) * data_size, a, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // 
Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(int), &taskNum); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // //printf("Enqueue Task\n"); err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; //printf("Enqueue read buffer\n"); err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(int) * FPGA_RET_PARAM_NUM * taskNum, results, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } //printf("Wait for FPGA results\n"); clWaitForEvents(1, &readevent); accTime(&exeTime, &timer); // Get the execution time //toc(&timer); // put data back to shared memory //printf("Put data back to the shared memory\n"); *((int*)(shm_addr + sizeof(int))) = DONE; //printf("\n************* Task finished! 
*************\n"); if (-1 == shutdown(ConnectFD, SHUT_RDWR)) { perror("can not shutdown socket"); close(ConnectFD); close(SocketFD); exit(EXIT_FAILURE); } close(ConnectFD); //printf("done\n"); // free the shared memory shmdt(shm_addr); //shmctl(shmid, IPC_RMID, 0); accTime(&socRecvTime, &timer); printf("**********timing begin**********\n"); printTimeSpec(socListenTime); printTimeSpec(socSendTime); printTimeSpec(socRecvTime); printTimeSpec(exeTime); printf("**********timing end**********\n\n"); } close(SocketFD); // Shutdown and cleanup // clReleaseMemObject(input_a); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(int argc, char** argv) { int err; // error code returned from api calls int test_fail = 0; pgm_t input_img, output_img; IMG_DTYPE filter[FILTER_SIZE*FILTER_SIZE] = {-1, -1, -1, -1, 8, -1, -1, -1, -1}; IMG_DTYPE *h_input; // input image buffer IMG_DTYPE *hw_output; // host buffer for device output IMG_DTYPE *sw_output; // host buffer for reference output size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem d_in_image; // device buffer for input image cl_mem d_in_filter; // device buffer for filter kernel cl_mem d_out_image; // device buffer for filtered image printf("Application start\n"); if (argc != 3) { printf("Usage: %s conv_2d.xclbin image_path/image_name.pgm\n", argv[0]); return EXIT_FAILURE; } int row, col, pix; // read the image and initialize the host buffer with that err = readPGM(&input_img, argv[2]); if(err < 0) { printf("Cound not read the image\n"); return EXIT_FAILURE; } printf("Input image resolution = %xx%d\n", input_img.width, input_img.height); h_input = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); hw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); sw_output = (IMG_DTYPE*)malloc(sizeof(IMG_DTYPE)*input_img.height*input_img.width); for(pix = 0; pix < input_img.height*input_img.width; pix++) { h_input[pix] = input_img.buf[pix]; } // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void 
*)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("INFO: CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("INFO: CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin = argv[1]; printf("INFO: loading xclbin %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin0: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program0 from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, 
NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "conv_2d", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation // d_in_image = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL); d_in_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, NULL, NULL); d_out_image = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(IMG_DTYPE) * input_img.height*input_img.width, NULL, NULL); if (!d_in_image || !d_in_filter || !d_out_image) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write the image from host buffer to device memory // err = clEnqueueWriteBuffer(commands, d_in_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.height*input_img.width, h_input, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to image to device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write filter kernel into device buffer // err = clEnqueueWriteBuffer(commands, d_in_filter, CL_TRUE, 0, sizeof(IMG_DTYPE) * FILTER_SIZE * FILTER_SIZE, filter, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to filter coeff into device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Set the arguments to our compute kernel // int filter_size = FILTER_SIZE; IMG_DTYPE bias = 1; err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_in_image); err |= clSetKernelArg(kernel, 1, 
sizeof(cl_mem), &d_in_filter); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_out_image); //err |= clSetKernelArg(kernel, 3, sizeof(int), &filter_size); err |= clSetKernelArg(kernel, 3, sizeof(IMG_DTYPE), &bias); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Launch computation kernel global[0] = input_img.width * WORKGROUP_SIZE_0; global[1] = input_img.height * WORKGROUP_SIZE_1; local[0] = WORKGROUP_SIZE_0; local[1] = WORKGROUP_SIZE_1; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, d_out_image, CL_TRUE, 0, sizeof(IMG_DTYPE) * input_img.width*input_img.height, hw_output, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! 
%d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); // Generate reference output int kr, kc; IMG_DTYPE sum = 0; for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) { for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) { sum = 0; for(kr = 0; kr < FILTER_SIZE; kr++) { for(kc = 0; kc < FILTER_SIZE; kc++ ) { sum += (filter[kr*FILTER_SIZE + kc] * h_input[(row+kr)*input_img.width + col + kc]); } } sw_output[row*input_img.width + col] = sum + bias; } } // Check Results for(row = 0; row < input_img.height-FILTER_SIZE+1; row++) { for(col = 0; col < input_img.width-FILTER_SIZE+1; col++) { if(sw_output[row*input_img.width+col] != hw_output[row*input_img.width+col]){ printf("Mismatch at : row = %d, col = %d, expected = %f, got = %f\n", row, col, sw_output[row*input_img.width+col], hw_output[row*input_img.width+col]); test_fail = 1; } } } printf("---------Input image-----------\n"); //print_matrix(h_input, input_img.height, input_img.width); printf("---------Reference output------\n"); //print_matrix(sw_output, input_img.height, input_img.width); printf("---------OCL Kernel output-----\n"); //print_matrix(hw_output, input_img.height, input_img.width); // store the output image output_img.width = input_img.width; output_img.height = input_img.height; normalizeF2PGM(&output_img, hw_output); writePGM(&output_img, "../../../../fpga_output.pgm"); //-------------------------------------------------------------------------- // Shutdown and cleanup //-------------------------------------------------------------------------- clReleaseMemObject(d_in_image); clReleaseMemObject(d_in_filter); clReleaseMemObject(d_out_image); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); destroyPGM(&input_img); if (test_fail) { printf("INFO: Test failed\n"); return EXIT_FAILURE; } else { printf("INFO: Test passed\n"); } }
int main(int argc, char* argv[]){ // Input Files // /* Intro Message */ printf("\n"); printf("A Program to Repack Protein Side-chains for Protein Docking Refinement Procedures\n"); printf("Copyright (c) 2014, Structural Bioinformatics Laboratory, Boston University\n"); printf("Author: Mohammad Moghadasi ([email protected]) \n"); if(argc!=10){ printf("Usage:\n ./main \n Complex_IN.pdb Complex_IN.psf Complex_IN.mol2 Complex_IN_Ligand.pdb\n Libmol-param-file charmm-param-file.prm charmm-rtf-file.rtf rotamer-library-binary-file.txt\n Complex_OUT.pdb\n \n"); exit(EXIT_FAILURE); } char* ifile = argv[1]; //pdb file of both receptor and ligand char* psffile = argv[2]; //charmm type psf file char* mol2file = argv[3]; //mol2 file char* pdbfilelig = argv[4]; //pdb file of ligand char* atom_prm_file = argv[5]; //libmol parameter file prmfile = argv[6]; //charmm type parameter file rtffile = argv[7]; //charmm type connectivity file char* rotamer_library_file = argv[8]; //rotamer raw library file char* ofile = argv[9]; //output file // Filling the atom_group struct // struct prm *atomprm = read_prm(atom_prm_file,_MOL_VERSION_); struct atomgrp* ag = read_file_atomgrp(ifile, atomprm, -1); read_ff_charmm(psffile, prmfile, rtffile, ag); if(!read_hybridization_states_from_mol2(mol2file,ag)){ exit (EXIT_FAILURE); } fix_acceptor_bases(ag,atomprm); struct List lig_list; read_fix(pdbfilelig,&lig_list.n,&lig_list.K); fixed_init(ag); fixed_update_unfreeze_all(ag); zero_grads(ag); fill_ingrp(ag); struct agsetup* ags; ags = malloc(sizeof(struct agsetup)); init_nblst(ag,ags); update_nblst(ag,ags); // Mark interface residues // int num_of_res_interface; int res_list_interface[ag->nres]; mark_interface_residues(ag,ags,lig_list, lig_rec_dist ,&num_of_res_interface,res_list_interface); // Initialize side chain rotamer library // //nrotCoef = 3; nrotCoef = 1; MAX_ROT = 245; MAX_RES = ag->nres;//needed for full_pack cutoff = 3; // Reslist // struct ifres_list* reslist; ifres_list_malloc( &reslist 
); reslist->num_of_ifres = num_of_res_interface; for(int r = 0; r < reslist->num_of_ifres ; r++) reslist->ifres_num[r] = res_list_interface[r]; // Library // char *rotamer_lib; load_file_to_memory(rotamer_library_file, &rotamer_lib); struct rot_info *rotinf; init_rotinf(ag, num_of_res_interface, res_list_interface, rotamer_lib, &rotinf); struct ifres_list* reslist_minor; ifres_list_malloc( &reslist_minor ) ; // MAIN FUNCITION // // clock_t start, finish; start = clock(); full_pack(ag,lig_list,rotinf,num_of_res_interface, reslist_minor); finish = clock(); if(0) printf("Processing Time = %f\n",((double)(finish-start)/CLOCKS_PER_SEC)); // Writing the atom_group into a PDB // write_pdb_traj_nopar(ag,ifile,ofile); // Free memory // Free_ifres_list( &reslist_minor ); free(rotamer_lib); return 0; }
int main(int argc, char** argv) { int err; // error code returned from api calls float a1[DATA_SIZE1]; // original data set given to device float b1[FILTER_SIZE1]; // original data set given to device float c1[OUTPUT_SIZE1]; float results1[OUTPUT_SIZE1]; // results returned from device float sw_results1[OUTPUT_SIZE1]; // results returned from device unsigned int correct; // number of correct results returned size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem input_a; // device memory used for the input array cl_mem input_b; // device memory used for the input array cl_mem output; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return EXIT_FAILURE; } // Fill our data sets with pattern // int i = 0; for(i = 0; i < DATA_SIZE1; i++) { a1[i] = (float)1; } for(i = 0; i < OUTPUT_SIZE1; i++) { results1[i] = 0; sw_results1[i] = FILTER_SIZE1; } for(i = 0; i < FILTER_SIZE1; i++) { b1[i] = (float)1; } for(i = 0; i < OUTPUT_SIZE1; i++) { c1[i] = (float)0; } // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) 
{ printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; 
} // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "conv3_layer", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation // input_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE1, NULL, NULL); input_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * FILTER_SIZE1, NULL, NULL); output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * OUTPUT_SIZE1, NULL, NULL); if (!input_a || !input_b || !output) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input_a, CL_TRUE, 0, sizeof(float) * DATA_SIZE1, a1, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input_b, CL_TRUE, 0, sizeof(float) * FILTER_SIZE1, b1, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, c1, 0, NULL, NULL); // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! 
%d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else global[0] = MATRIX_RANK; global[1] = MATRIX_RANK; local[0] = MATRIX_RANK; local[1] = MATRIX_RANK; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, NULL); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * OUTPUT_SIZE1, results1, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); printf("A\n"); for (i=0;i<DATA_SIZE1;i++) { printf("%f ",a1[i]); if (((i+1) % NUM_DATA_ROWS) == 0) printf("\n"); } printf("B\n"); for (i=0;i< FILTER_SIZE1;i++) { printf("%f ",b1[i]); if (((i+1) % NUM_MASK_ROWS) == 0) printf("\n"); } printf("res\n"); for (i=0;i< OUTPUT_SIZE1;i++) { printf("%f ",results1[i]); if (((i+1) % NUM_OUT_ROWS) == 0) printf("\n"); } // Validate our results // correct = 0; /* for(i = 0; i < OUTPUT_SIZE1; i++) { int row = i/MATRIX_RANK; int col = i%MATRIX_RANK; int running = 0; int index; for (index=0;index<MATRIX_RANK;index++) { int aIndex = row*MATRIX_RANK + index; int bIndex = col + index*MATRIX_RANK; running += a[aIndex] * b[bIndex]; } sw_results[i] = running; }*/ for (i = 0;i < OUTPUT_SIZE1; i++) if(results1[i] == sw_results1[i]) correct++; printf("Software\n"); for (i=0;i<OUTPUT_SIZE1;i++) { //printf("%0.2f ",sw_results[i]); printf("%f ",sw_results1[i]); if (((i+1) % NUM_OUT_ROWS) == 0) printf("\n"); } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct 
values!\n", correct, OUTPUT_SIZE1); // Shutdown and cleanup // clReleaseMemObject(input_a); clReleaseMemObject(input_b); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if(correct == OUTPUT_SIZE1){ printf("Test passed!\n"); return EXIT_SUCCESS; } else{ printf("Test failed\n"); return EXIT_FAILURE; } }
int setup (char *configFile) { struct simpleLinkedList *rSet; char *location, *conf, *sql; printf("entering setup\n"); // Defaults VERBOSITY = DEBUGM; LOG_DIR = o_printf("%s/log/opendias", VAR_DIR); // Get 'DB' location if (configFile != NULL) { conf = o_strdup(configFile); } else { conf = o_printf("%s/opendias/opendias.conf", ETC_DIR); if( 0 != access(conf, F_OK) ) { o_log(INFORMATION, "Config not in GNU location: %s. Attempting system config dir /etc/opendias/opendias.conf", conf); free(conf); conf = o_strdup("/etc/opendias/opendias.conf"); } } o_log(INFORMATION, "|Using config file: %s", conf); if( 0 == load_file_to_memory(conf, &location) ) { o_log(ERROR, "|Cannot find main config file: %s", conf); free(location); free(conf); return 1; } free(conf); chop(location); BASE_DIR = o_strdup(location); o_log(INFORMATION, "|Which says the database is at: %s", BASE_DIR); // Open (& maybe update) the database. if(connect_db (1)) { // 1 = create if required free(BASE_DIR); free(location); return 1; } free(location); o_log(INFORMATION, "|Current config is: "); sql = o_strdup("SELECT config_option, config_value FROM config"); rSet = runquery_db(sql, NULL); if( rSet != NULL ) { do { char *config_option, *config_value; config_option = o_strdup(readData_db(rSet, "config_option")); config_value = o_strdup(readData_db(rSet, "config_value")); if ( config_option == NULL || config_value == NULL ) { printf("either option or value is NULL\n"); } else { //o_log(INFORMATION, " %s = %s", config_option, config_value); //remark: the pipe in the message causes o_log i_o_log to crash // caused by debug.c i_o_log by double use of vprintf o_log(INFORMATION, "| %s = %s", config_option, config_value); } if( 0 == strcmp(config_option, "log_verbosity") ) { VERBOSITY = atoi(config_value); } free(config_option); free(config_value); } while ( nextRow( rSet ) ); } free_recordset( rSet ); free(sql); return 0; }
struct cl_package initFPGA( const char* xclbin, const char* kernel_name ) { /*****************************************/ /* Initialize OpenCL */ /*****************************************/ // Retrieve the number of platforms cl_uint numPlatforms = 0; cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); //printf("Found %d platforms support OpenCL, return code %d.\n", numPlatforms, status); // Allocate enough space for each platform cl_platform_id *platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) printf("clGetPlatformIDs error(%d)\n", status); // Retrieve the number of devices cl_uint numDevices = 0; #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &numDevices); #endif printf("Found %d devices support OpenCL.\n", numDevices); // Allocate enough space for each device cl_device_id *devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices #ifndef FPGA_DEVICE status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); #else status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ACCELERATOR, numDevices, devices, NULL); #endif if (status != CL_SUCCESS) printf("clGetDeviceIDs error(%d)\n", status); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); if (status != CL_SUCCESS) printf("clCreateContext error(%d)\n", status); //Create a command-queue cl_command_queue clCommandQue = clCreateCommandQueue(context, devices[0], 0, &status); if (status != CL_SUCCESS) printf("clCreateCommandQueue error(%d)\n", status); // 6. 
Load and build OpenCL kernel #ifndef FPGA_DEVICE // Create a program with source code cl_program program = clCreateProgramWithSource(context, 1, (const char**)&logistic_cl, NULL, &status); if (status != 0) printf("clCreateProgramWithSource error(%d)\n", status); // Build (compile) the program for the device status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); #else // Load binary from disk unsigned char *kernelbinary; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin); exit(1); } size_t n_bit = n_i; // Create the compute program from offline cl_program program = clCreateProgramWithBinary(context, 1, &devices[0], &n_bit, (const unsigned char **) &kernelbinary, NULL, &status); if ((!program) || (status != CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", status); exit(1); } // Build the program executable status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); #endif if (status != 0) { char errmsg[2048]; size_t sizemsg = 0; status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048*sizeof(char), errmsg, &sizemsg); printf("clBuildProgram error(%d)\n", status); printf("Compilation messages: \n %s", errmsg); } cl_kernel clKernel = clCreateKernel(program, kernel_name, &status); if (status != CL_SUCCESS) printf("clCreateKernel error(%d)\n", status); // TODO: parameterize the size of buffers cl_mem d_gradient = clCreateBuffer(context, CL_MEM_READ_WRITE, FEATURE_SIZE*LABEL_SIZE*GROUP_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_gradient clCreateBuffer error(%d)\n", status); cl_mem d_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, FEATURE_SIZE*LABEL_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_weights clCreateBuffer error(%d)\n", status); cl_mem d_data = clCreateBuffer(context, CL_MEM_READ_ONLY, 
(FEATURE_SIZE+LABEL_SIZE)*CHUNK_SIZE*sizeof(float), NULL, &status); if (status != CL_SUCCESS) printf("d_data clCreateBuffer error(%d)\n", status); struct cl_package result; result.context = context; result.kernel = clKernel; result.commandQueue = clCommandQue; result.d_gradient = d_gradient; result.d_weights = d_weights; result.d_data = d_data; return result; }
void BurstSort::parallelSort(std::ofstream& file){ char* buffer = NULL; char* tmp; int* posArray = NULL; int entryLength = KEY_LENGTH + sizeof(char*); buffer = (char*) malloc(sizeof(char) * size * entryLength); posArray = (int*) malloc(sizeof(int) * (NODE_SIZE + 1)); int pos = 0; posArray[0] = 0; for(int i = 0; i < NODE_SIZE; i++){ for(int j = 0; j < nodes[i].used; j++){ memcpy(buffer + pos * entryLength, nodes[i].entries[j], KEY_LENGTH * sizeof(char)); memcpy(buffer + pos * entryLength + KEY_LENGTH, &nodes[i].entries[j], sizeof(char*)); pos += sizeof(char); } posArray[i+1] = pos; } // OpenCL // Use this to check the output of each API call cl_int status; cl_int numDevices = 1; // Connect to first platform cl_platform_id platform; status = clGetPlatformIDs(1, &platform, NULL); if (status != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); return -1; } char cBuffer[1024]; clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(cBuffer), cBuffer, NULL); printf("CL_PLATFORM_VENDOR %s\n", cBuffer); clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); printf("CL_PLATFORM_NAME %s\n", cBuffer); cl_device_id device; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL); if (status != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return -1; } cl_long maxBufferSize = 0; status = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxBufferSize, NULL); printf("max buffer size: %lld\n", maxBufferSize); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, &device, NULL, NULL, &status); if (status != CL_SUCCESS) { printf("Error in creating context, code %d\n", status); return -1; } // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, device, 0, &status); if (status != CL_SUCCESS) { printf("Error in creating command queue for a 
device, code %d\n", status); return -1; } // Load binary from disk unsigned char *kernelbinary; char *xclbin = "sort_xiaohui.xclbin"; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin); return -1; } size_t n_bit = n_i; // Create the compute program from offline cl_program program = clCreateProgramWithBinary(context, 1, &device, &n_bit, (const unsigned char **) &kernelbinary, NULL, &status); if ((!program) || (status != CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", status); return -1; } // Build the program executable status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (status != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return -1; } // Create the vector addition kernel cl_kernel kernel; kernel = clCreateKernel(program, "sort", &status); cl_mem clPosArray; cl_mem clBuffer; clBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * size * entryLength, NULL, &status); clPosArray = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * (NODE_SIZE + 1), NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, clPosArray, CL_FALSE, 0, sizeof(int) * (NODE_SIZE + 1),posArray, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, clBuffer, CL_FALSE, 0, sizeof(char) * size * entryLength, buffer, 0, NULL, NULL); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clBuffer); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &clPosArray); int nodeSize = NODE_SIZE; status = clSetKernelArg(kernel, 2, sizeof(int), (void *)&nodeSize); status = clSetKernelArg(kernel, 3, sizeof(int), (void *)&entryLength); size_t globalWorkSize[1]; globalWorkSize[0] = NODE_SIZE; 
gettimeofday(&t1, NULL); // Execute the kernel for execution status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) { printf("Error in clEnqueue, code %d\n", status); return -1; } // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, clBuffer, CL_TRUE, 0, sizeof(char) * size * entryLength, buffer, 0, NULL, NULL); // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(clBuffer); clReleaseMemObject(clPosArray); clReleaseContext(context); //print result for(int i = 0; i < size; i+= sizeof(char)){ memcpy(&tmp,buffer + i * entryLength + KEY_LENGTH,sizeof(char*)); file << tmp; } // Free host resources free(buffer); free(posArray); free(platforms); free(devices); }
int deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree, unsigned tree_len, unsigned char* output, unsigned* out_len) { #define SDACCEL_WRAPPER #ifdef SDACCEL_WRAPPER int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? 
CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char xclbin[]="deflate1.xclbin"; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "deflate259", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel! 
%d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation // void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree, // unsigned tree_len, unsigned char* output, unsigned* out_len) cl_mem input_arg, in_len_arg, tree_arg, tree_len_arg, output_arg, out_len_arg; input_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, CHUNK, NULL, NULL); in_len_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL); tree_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, 512, NULL, NULL); tree_len_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL); output_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, CHUNK*2, NULL, NULL); out_len_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(unsigned), NULL, NULL); if (!input_arg || !in_len_arg || !tree_arg || !tree_len_arg || !output_arg || !out_len_arg) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, input_arg, CL_TRUE, 0, in_len, input, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array input!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, in_len_arg, CL_TRUE, 0, sizeof(unsigned), &in_len, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array &in_len!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, tree_arg, CL_TRUE, 0, 512, tree, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array tree!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, tree_len_arg, CL_TRUE, 0, sizeof(unsigned), &tree_len, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array &tree_len!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Set the arguments to our 
compute kernel //void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree, // unsigned tree_len, unsigned char* output, unsigned* out_len) err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_arg); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_len_arg); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &tree_arg); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &tree_len_arg); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_arg); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &out_len_arg); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else size_t global[1]; size_t local[1]; global[0] = 1; local[0] = 1; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, NULL); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; unsigned out_len_b; err = clEnqueueReadBuffer( commands, out_len_arg, CL_TRUE, 0, sizeof(unsigned), &out_len_b, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output length! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); *out_len = out_len_b; printf("Read final output length: %d\n", out_len_b); err = clEnqueueReadBuffer( commands, output_arg, CL_TRUE, 0, out_len_b, output, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output data! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); #endif }
int bpnn_train_kernel(BPNN *net, float *eo, float *eh) { int in, hid, out; float out_err, hid_err; in = net->input_n; hid = net->hidden_n; out = net->output_n; //int use_device = 0; // use CPU as device int use_device = 2; // use GPU as device //int use_device = 2; // use FPGA as device if(initialize(use_device)) return -1; int sourcesize = 1024*1024; char * source = (char *)calloc(sourcesize, sizeof(char)); if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * kernel_bp1 = "bpnn_layerforward_ocl"; char * kernel_bp2 = "bpnn_adjust_weights_ocl"; char * tempchar = "./backprop_kernel.cl"; char * krnl_file = "./binary/backprop_kernel_default.xclbin"; cl_int err = 0; cl_program prog; // create program from source if (use_device < 2 ) { FILE * fp = fopen(tempchar, "rb"); if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); // compile kernel err = 0; const char * slist[2] = { source, 0 }; prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } } // create program from binary else { char *krnl_bin; const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin); err = 0; prog = clCreateProgramWithBinary(context, 1, &device_list[0], &krnl_size, (const unsigned char**) &krnl_bin, NULL, &err); if ((!prog) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); exit(EXIT_FAILURE); } } err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); { // show warnings/errors //static char log[65536]; memset(log, 0, sizeof(log)); //cl_device_id device_id = 0; //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL); //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); //if(err || strstr(log,"warning:") || 
strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); } if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; } cl_kernel kernel1; cl_kernel kernel2; kernel1 = clCreateKernel(prog, kernel_bp1, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; } kernel2 = clCreateKernel(prog, kernel_bp2, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; } /* clReleaseProgram(prog); */ float *input_weights_one_dim; float *input_weights_prev_one_dim; float * partial_sum; float sum; float num_blocks = in / BLOCK_SIZE; input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float)); input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float)); partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float)); // set global and local workitems size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 }; // this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights // todo: fix mem allocation int m = 0; for (int k = 0; k <= in; k++) { for (int j = 0; j <= hid; j++) { input_weights_one_dim[m] = net->input_weights[k][j]; input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j]; m++; } } cl_mem input_hidden_ocl; cl_mem input_ocl; cl_mem output_hidden_ocl; cl_mem hidden_partial_sum; cl_mem hidden_delta_ocl; cl_mem input_prev_weights_ocl; input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;} input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;} output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, 
&err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;} hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;} hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;} input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;} printf("Performing GPU computation\n"); //write buffers err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; } clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl); clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl); clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl); clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum ); clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL ); clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL ); clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in); clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid); err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0); if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");} if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err 
= clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; } for (int j = 1; j <= hid; j++) { sum = 0.0; for (int k = 0; k < num_blocks; k++) { sum += partial_sum[k * hid + j-1] ; } sum += net->input_weights[0][j]; net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum))); } bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out); bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err); bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err); bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights); err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; } clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl); clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid); clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl); clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in); clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl); clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl ); err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 
0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; } err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; } clReleaseMemObject(input_ocl); clReleaseMemObject(output_hidden_ocl); clReleaseMemObject(input_hidden_ocl); clReleaseMemObject(hidden_partial_sum); clReleaseMemObject(input_prev_weights_ocl); free(input_weights_prev_one_dim); free(partial_sum); free(input_weights_one_dim); }