//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); runTest( argc, argv); ocd_finalize(); return EXIT_SUCCESS; }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); //float* h_areas = new float[nelr]; //int* h_elements_surrounding_elements = new int[nelr*NNB]; //float* h_normals = new float[nelr*NDIM*NNB]; float* h_areas ; int* h_elements_surrounding_elements ; float* h_normals ; h_areas = (float*) memalign(AOCL_ALIGNMENT,nelr*sizeof(float)); h_elements_surrounding_elements = (int*) memalign(AOCL_ALIGNMENT,nelr*NNB*sizeof(int)); h_normals = (float *) memalign(AOCL_ALIGNMENT,nelr*NDIM*NNB*sizeof(float)); //posix_memalign(&h_areas , AOCL_ALIGNMENT, nelr); //posix_memalign(&h_elements_surrounding_elements , AOCL_ALIGNMENT, nelr*NNB); //posix_memalign(&h_normals , AOCL_ALIGNMENT, nelr*NDIM*NNB); // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } char* kernel_files; int num_kernels = 20; kernel_files = (char*) malloc(sizeof(char*)*num_kernels); strcpy(kernel_files,"cfd_kernel"); program = ocdBuildProgramFromFile(context,device_id,kernel_files, NULL); // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
int main(int argc, char ** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); if (argc < 3) { printf("Calculate similarities between two strings.\n"); printf("Maximum length of each string is: %d\n", MAX_LEN); printf("Usage: %s query database\n", argv[0]); printf("or: %s query database [openPenalty extensionPenalty block#]\n", argv[0]); printf("openPenalty (5.0), extensionPenalty (0.5)\n"); return 1; } ///////////////////////////////////// // 00 --> 01 // | | // 10 --> 11 //////////////////////////////////// char queryFilePathName[255], dbDataFilePathName[255], dbLenFilePathName[255]; int querySize, subSequenceNum, subSequenceSize; float openPenalty, extensionPenalty; int coalescedOffset = COALESCED_OFFSET; int nblosumWidth = 23; size_t blockSize = 64; size_t setZeroThreadNum, mfThreadNum; int blockNum = 14; cl_ulong maxLocalSize; int arraySize; struct timeval t1, t2; float tmpTime; FILE *pfile; //record time memset(&strTime, 0, sizeof(STRUCT_TIME)); timerStart(); openPenalty = 5.0f; extensionPenalty = 0.5; if (argc == 6) { openPenalty = atof(argv[3]); extensionPenalty = atof(argv[4]); blockNum = atoi(argv[5]); } //relocated to after MAX_COMPUTE_UNITS check //mfThreadNum = blockNum * blockSize; cl_program hProgram; cl_kernel hMatchStringKernel, hTraceBackKernel, hSetZeroKernel; size_t sourceFileSize; char *cSourceCL = NULL; //err = clGetPlatformIDs(1, &platformID, NULL); //CHKERR(err, "Get platform ID error!"); cl_int err; //check to make sure the device supports this block count //then scale threads appropriately cl_uint devBlockNum = 0; CHKERR(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS,\ sizeof(cl_uint), &devBlockNum, 0), \ "Error while querying CL_DEVICE_MAX_COMPUTE_UNITS."); if (devBlockNum == MIN(blockNum, devBlockNum)) { printf("Scaling blocks from %d to %d to fit on device\n",\ blockNum, devBlockNum); blockNum = devBlockNum; } mfThreadNum = blockNum * blockSize; CHKERR(clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE,\ sizeof(cl_ulong), &maxLocalSize, 0), \ "Error while querying CL_DEVICE_LOCAL_MEM_SIZE."); //load the source file char kernel_file[] = "kernels.cl"; cSourceCL = loadSource(kernel_file, &sourceFileSize); hProgram = clCreateProgramWithSource(context, 1, (const char **)&cSourceCL, &sourceFileSize, &err); CHKERR(err, "Create program with source error"); err = clBuildProgram(hProgram, 0, 0, 0, 0, 0); //debug================================ int logSize = 3000, i; size_t retSize; char logTxt[3000]; err = clGetProgramBuildInfo(hProgram, device_id, CL_PROGRAM_BUILD_LOG, logSize, logTxt, &retSize); for (i = 0; i < retSize; i++) { printf("%c", logTxt[i]); } //=================================== CHKERR(err, "Build program error"); hMatchStringKernel = clCreateKernel(hProgram, "MatchStringGPUSync", &err); CHKERR(err, "Create MatchString kernel error"); hTraceBackKernel = clCreateKernel(hProgram, "trace_back2", &err); CHKERR(err, "Create trace_back2 kernel error"); hSetZeroKernel = clCreateKernel(hProgram, "setZero", &err); CHKERR(err, "Create setZero kernel error"); sprintf(queryFilePathName, "%s", argv[1]); sprintf(dbDataFilePathName, "%s.data", argv[2]); sprintf(dbLenFilePathName, "%s.loc", argv[2]); char *allSequences, *querySequence, *subSequence; char *seq1, *seq2; cl_mem seq1D, seq2D; allSequences = new char[2 * (MAX_LEN)]; if (allSequences == NULL) { printf("Allocate sequence buffer error!\n"); return 1; } querySequence = allSequences; seq1D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_char) * MAX_LEN, 0, &err); CHKERR(err, "Create seq1D memory"); seq2D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_char) * MAX_LEN, 0, &err); CHKERR(err, "Create seq2D memory"); //read query sequence querySize = readQuerySequence(queryFilePathName, querySequence); if (querySize <= 0 || querySize > MAX_LEN) { printf("Query size %d is out of range (0, %d)\n", MAX_LEN, querySize); return 1; } encoding(querySequence, querySize); subSequence = allSequences + querySize; //allocate output sequence buffer char *outSeq1, *outSeq2; outSeq1 = new char[2 * MAX_LEN]; outSeq2 = new char[2 * MAX_LEN]; if (outSeq1 == NULL || outSeq2 == NULL) { printf("Allocate output sequence buffer on host error!\n"); return 1; } cl_mem outSeq1D, outSeq2D; outSeq1D = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * MAX_LEN * 2, 0, &err); CHKERR(err, "Create outSeq1D memory"); outSeq2D = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * MAX_LEN * 2, 0, &err); CHKERR(err, "Create outSeq2D memory"); //allocate thread number per launch and //location difference information int *threadNum, *diffPos; threadNum = new int[2 * MAX_LEN]; diffPos = new int[2 * MAX_LEN]; if (threadNum == NULL || diffPos == NULL) { printf("Allocate location buffer on host error!\n"); return 1; } cl_mem threadNumD, diffPosD; threadNumD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * (2 * MAX_LEN), 0, &err); CHKERR(err, "Create threadNumD memory"); diffPosD = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int) * (2 * MAX_LEN), 0, &err); CHKERR(err, "Create diffPosD memory"); //allocate matrix buffer char *pathFlag, *extFlag; float *nGapDist, *hGapDist, *vGapDist; int maxElemNum = (MAX_LEN + 1) * (MAX_LEN + 1); pathFlag = new char[maxElemNum]; extFlag = new char[maxElemNum]; nGapDist = new float[maxElemNum]; hGapDist = new float[maxElemNum]; vGapDist = new float[maxElemNum]; if (pathFlag == NULL || extFlag == NULL || nGapDist == NULL || hGapDist == NULL || vGapDist == NULL) { printf("Allocate DP matrices on host error!\n"); return 1; } cl_mem pathFlagD, extFlagD, nGapDistD, hGapDistD, vGapDistD; pathFlagD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * maxElemNum, 0, &err); CHKERR(err, "Create pathFlagD memory"); extFlagD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_char) * maxElemNum, 0, &err); CHKERR(err, "Create extFlagD memory"); nGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err); CHKERR(err, "Create nGapDistD memory"); hGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err); CHKERR(err, "Create hGapDistD memory"); vGapDistD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * maxElemNum, 0, &err); CHKERR(err, "Create vGapDistD memory"); //Allocate the MAX INFO structure MAX_INFO *maxInfo; maxInfo = new MAX_INFO[1]; if (maxInfo == NULL) { printf("Alloate maxInfo on host error!\n"); return 1; } cl_mem maxInfoD; maxInfoD = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(MAX_INFO) * mfThreadNum, 0, &err); CHKERR(err, "Create maxInfoD memory"); //allocate the distance table cl_mem blosum62D; int nblosumHeight = 23; blosum62D = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * nblosumWidth * nblosumHeight, 0, &err); err = clEnqueueWriteBuffer(commands, blosum62D, CL_TRUE, 0, nblosumWidth * nblosumHeight * sizeof(cl_float), blosum62[0], 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Scoring Matrix Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "copy blosum62 to device"); cl_mem mutexMem; mutexMem = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), 0, &err); CHKERR(err, "create mutex mem error!"); //copy the scoring matrix to the constant memory //copyScoringMatrixToConstant(); //open the database pDBDataFile = fopen(dbDataFilePathName, "rb"); if (pDBDataFile == NULL) { printf("DB data file %s open error!\n", dbDataFilePathName); return 1; } pDBLenFile = fopen(dbLenFilePathName, "rb"); if (pDBLenFile == NULL) { printf("DB length file %s open error!\n", dbLenFilePathName); return 1; } //record time timerEnd(); strTime.iniTime = elapsedTime(); //read the total number of sequences fread(&subSequenceNum, sizeof(int), 1, pDBLenFile); //get the larger and smaller of the row and colum number int subSequenceNo, launchNum, launchNo; int rowNum, columnNum, matrixIniNum; int DPMatrixSize; int seq1Pos, seq2Pos, nOffset, startPos; for (subSequenceNo = 0; subSequenceNo < subSequenceNum; subSequenceNo++) { //record time timerStart(); //read subject sequence fread(&subSequenceSize, sizeof(int), 1, pDBLenFile); if (subSequenceSize <= 0 || subSequenceSize > MAX_LEN) { printf("Size %d of bubject sequence %d is out of range!\n", subSequenceSize, subSequenceNo); break; } fread(subSequence, sizeof(char), subSequenceSize, pDBDataFile); gettimeofday(&t1, NULL); if (subSequenceSize > querySize) { seq1 = subSequence; seq2 = querySequence; rowNum = subSequenceSize + 1; columnNum = querySize + 1; } else { seq1 = querySequence; seq2 = subSequence; rowNum = querySize + 1; columnNum = subSequenceSize + 1; } launchNum = rowNum + columnNum - 1; //preprocessing for sequences DPMatrixSize = preProcessing(rowNum, columnNum, threadNum, diffPos, matrixIniNum); //record time timerEnd(); strTime.preprocessingTime += elapsedTime(); //record time timerStart(); //use a kernel to initialize the matrix arraySize = DPMatrixSize * sizeof(char); setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&pathFlagD); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT DP Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&extFlagD); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT DP Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize flag matrice"); arraySize = matrixIniNum * sizeof(float); setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&nGapDistD); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&hGapDistD); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&vGapDistD); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Distance Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize dist matrice"); arraySize = sizeof(MAX_INFO) * mfThreadNum; setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&maxInfoD); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Max Info Matrix Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize max info"); arraySize = sizeof(int); setZeroThreadNum = ((arraySize - 1) / blockSize + 1) * blockSize; err = clSetKernelArg(hSetZeroKernel, 0, sizeof(cl_mem), (void *)&mutexMem); err |= clSetKernelArg(hSetZeroKernel, 1, sizeof(int), (void *)&arraySize); err |= clEnqueueNDRangeKernel(commands, hSetZeroKernel, 1, NULL, &setZeroThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Mutex Init", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Initialize mutex variable"); //copy input sequences to device err = clEnqueueWriteBuffer(commands, seq1D, CL_FALSE, 0, (rowNum - 1) * sizeof(cl_char), seq1, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clEnqueueWriteBuffer(commands, seq2D, CL_FALSE, 0, (columnNum - 1) * sizeof(cl_char), seq2, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "copy input sequence"); err = clEnqueueWriteBuffer(commands, diffPosD, CL_FALSE, 0, launchNum * sizeof(cl_int), diffPos, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Mutex Info Copy", ocdTempTimer) END_TIMER(ocdTempTimer) err |= clEnqueueWriteBuffer(commands, threadNumD, CL_FALSE, 0, launchNum * sizeof(cl_int), threadNum, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "SWAT Mutex Info Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "copy diffpos and/or threadNum mutexMem info error!"); //record time timerEnd(); strTime.copyTimeHostToDevice += elapsedTime(); //record time timerStart(); //set arguments err = clSetKernelArg(hMatchStringKernel, 0, sizeof(cl_mem), (void *)&pathFlagD); err |= clSetKernelArg(hMatchStringKernel, 1, sizeof(cl_mem), (void *)&extFlagD); err |= clSetKernelArg(hMatchStringKernel, 2, sizeof(cl_mem), (void *)&nGapDistD); err |= clSetKernelArg(hMatchStringKernel, 3, sizeof(cl_mem), (void *)&hGapDistD); err |= clSetKernelArg(hMatchStringKernel, 4, sizeof(cl_mem), (void *)&vGapDistD); err |= clSetKernelArg(hMatchStringKernel, 5, sizeof(cl_mem), (void *)&diffPosD); err |= clSetKernelArg(hMatchStringKernel, 6, sizeof(cl_mem), (void *)&threadNumD); err |= clSetKernelArg(hMatchStringKernel, 7, sizeof(cl_int), (void *)&rowNum); err |= clSetKernelArg(hMatchStringKernel, 8, sizeof(cl_int), (void *)&columnNum); err |= clSetKernelArg(hMatchStringKernel, 9, sizeof(cl_mem), (void *)&seq1D); err |= clSetKernelArg(hMatchStringKernel, 10, sizeof(cl_mem), (void *)&seq2D); err |= clSetKernelArg(hMatchStringKernel, 11, sizeof(cl_int), (void *)&nblosumWidth); err |= clSetKernelArg(hMatchStringKernel, 12, sizeof(cl_float), (void *)&openPenalty); err |= clSetKernelArg(hMatchStringKernel, 13, sizeof(cl_float), (void *)&extensionPenalty); err |= clSetKernelArg(hMatchStringKernel, 14, sizeof(cl_mem), (void *)&maxInfoD); err |= clSetKernelArg(hMatchStringKernel, 15, sizeof(cl_mem), (void *)&blosum62D); err |= clSetKernelArg(hMatchStringKernel, 16, sizeof(cl_mem), (void *)&mutexMem); //err |= clSetKernelArg(hMatchStringKernel, 17, maxLocalSize, NULL); CHKERR(err, "Set match string argument error!"); err = clEnqueueNDRangeKernel(commands, hMatchStringKernel, 1, NULL, &mfThreadNum, &blockSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Launch kernel match string error"); //record time timerEnd(); strTime.matrixFillingTime += elapsedTime(); //record time timerStart(); err = clSetKernelArg(hTraceBackKernel, 0, sizeof(cl_mem), (void *)&pathFlagD); err |= clSetKernelArg(hTraceBackKernel, 1, sizeof(cl_mem), (void *)&extFlagD); err |= clSetKernelArg(hTraceBackKernel, 2, sizeof(cl_mem), (void *)&diffPosD); err |= clSetKernelArg(hTraceBackKernel, 3, sizeof(cl_mem), (void *)&seq1D); err |= clSetKernelArg(hTraceBackKernel, 4, sizeof(cl_mem), (void *)&seq2D); err |= clSetKernelArg(hTraceBackKernel, 5, sizeof(cl_mem), (void *)&outSeq1D); err |= clSetKernelArg(hTraceBackKernel, 6, sizeof(cl_mem), (void *)&outSeq2D); err |= clSetKernelArg(hTraceBackKernel, 7, sizeof(cl_mem), (void *)&maxInfoD); err |= clSetKernelArg(hTraceBackKernel, 8, sizeof(int), (void *)&mfThreadNum); size_t tbGlobalSize[1] = {1}; size_t tbLocalSize[1] = {1}; err = clEnqueueNDRangeKernel(commands, hTraceBackKernel, 1, NULL, tbGlobalSize, tbLocalSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "SWAT Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Launch kernel trace back error"); clFinish(commands); //record time timerEnd(); strTime.traceBackTime += elapsedTime(); //record time timerStart(); //copy matrix score structure back err = clEnqueueReadBuffer(commands, maxInfoD, CL_FALSE, 0, sizeof(MAX_INFO), maxInfo, 0, 0, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Max Info Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Read maxInfo buffer error!"); int maxOutputLen = rowNum + columnNum - 2; err = clEnqueueReadBuffer(commands, outSeq1D, CL_FALSE, 0, maxOutputLen * sizeof(cl_char), outSeq1, 0, 0, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) err = clEnqueueReadBuffer(commands, outSeq2D, CL_FALSE, 0, maxOutputLen * sizeof(cl_char), outSeq2, 0, 0, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "SWAT Sequence Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Read output sequence error!"); //record time clFinish(commands); gettimeofday(&t2, NULL); timerEnd(); strTime.copyTimeDeviceToHost += elapsedTime(); //call the print function to print the match result printf("============================================================\n"); printf("Sequence pair %d:\n", subSequenceNo); int nlength = maxInfo->noutputlen; PrintAlignment(outSeq1, outSeq2, nlength, CHAR_PER_LINE, openPenalty, extensionPenalty); printf("Max alignment score (on device) is %.1f\n", maxInfo->fmaxscore); //obtain max alignment score on host //err = clEnqueueReadBuffer(commands, nGapDistD, CL_TRUE, 0, sizeof(cl_float) * DPMatrixSize, // nGapDist, 0, 0, 0); //printf("Max alignment score (on host) is %.1f\n", maxScore(nGapDist, DPMatrixSize)); printf("openPenalty = %.1f, extensionPenalty = %.1f\n", openPenalty, extensionPenalty); printf("Input sequence size, querySize: %d, subSequenceSize: %d\n", querySize, subSequenceSize); printf("Max position, seq1 = %d, seq2 = %d\n", maxInfo->nposi, maxInfo->nposj); } tmpTime = 1000.0 * (t2.tv_sec - t1.tv_sec) + (t2.tv_usec - t1.tv_usec) / 1000.0; pfile = fopen("../kernelTime.txt", "at"); fprintf(pfile, "verOpencl4:\t%.3f\n", tmpTime); fclose(pfile); //print time printTime_toStandardOutput(); printTime_toFile(); fclose(pDBLenFile); fclose(pDBDataFile); clReleaseKernel(hMatchStringKernel); clReleaseKernel(hTraceBackKernel); clReleaseKernel(hSetZeroKernel); delete allSequences; clReleaseMemObject(seq1D); clReleaseMemObject(seq2D); delete outSeq1; delete outSeq2; clReleaseMemObject(outSeq1D); clReleaseMemObject(outSeq2D); delete threadNum; clReleaseMemObject(threadNumD); delete diffPos; clReleaseMemObject(diffPosD); delete pathFlag; delete extFlag; delete nGapDist; delete hGapDist; delete vGapDist; clReleaseMemObject(pathFlagD); clReleaseMemObject(extFlagD); clReleaseMemObject(nGapDistD); clReleaseMemObject(hGapDistD); clReleaseMemObject(vGapDistD); delete maxInfo; clReleaseMemObject(maxInfoD); free(cSourceCL); clReleaseMemObject(blosum62D); clReleaseMemObject(mutexMem); clReleaseProgram(hProgram); clReleaseCommandQueue(commands); clReleaseContext(context); ocd_finalize(); return 0; }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); std::cerr << "N-Queen solver for OpenCL\n"; std::cerr << "Ping-Che Chen\n\n"; if(argc < 2) { std::cerr << "Usage: " << argv[0] << " [options] N\n"; std::cerr << "\tN: board size (1 ~ 32)\n"; std::cerr << "\t-cpu: use CPU (multi-threaded on Windows)\n"; std::cerr << "\t-prof: enable profiler\n"; std::cerr << "\t-threads #: set number of threads to #\n"; std::cerr << "\t-blocksize #: set size of thread blocks to #\n"; std::cerr << "\t-local: use local memory for arrays (default: off)\n"; std::cerr << "\t-noatomics: do not use global atomics\n"; std::cerr << "\t-novec: do not use vectorization\n"; std::cerr << "\t-vec4: use 4D vectors instead of 2D (only when vectorized- default: off)\n"; return 0; } // handle options bool force_cpu = false; bool profiling = false; int threads = 0; int block_size = 0; bool local = false;//default OFF (was true) bool noatomics = false; bool novec = false; bool use_vec4 = false; int start = 1; while(start < argc - 1) { if(std::strcmp(argv[start], "-cpu") == 0) { force_cpu = true; } else if(std::strcmp(argv[start], "-threads") == 0 && start < argc - 2) { threads = std::atoi(argv[start + 1]); start++; } else if(std::strcmp(argv[start], "-blocksize") == 0 && start < argc - 2) { block_size = std::atoi(argv[start + 1]); start++; } else if(std::strcmp(argv[start], "-local") == 0) { local = true; } else if(std::strcmp(argv[start], "-noatomics") == 0) { noatomics = true; } else if(std::strcmp(argv[start], "-novec") == 0) { novec = true; } else if(std::strcmp(argv[start], "-vec4") == 0) { use_vec4 = true; } else { std::cerr << "Unknown option " << argv[start] << "\n"; } start ++; } int board_size = std::atoi(argv[start]); if(board_size < 1 || board_size > 32) { std::cerr << "Inalid board size (only 1 ~ 32 allowed)\n"; return 0; } stopwatch sw; long long solutions = 0; long long unique_solutions = 0; if(force_cpu) { stopwatch_start(&sw); solutions = nqueen_cpu(board_size, &unique_solutions); stopwatch_stop(&sw); } else { stopwatch_start(&sw); cl_int err; // show device list size_t num_devices; num_devices=1;//In OpenDwarfs we only work with one device at a time. std::vector<cl_device_id> devices(num_devices / sizeof(cl_device_id)); devices.clear(); devices.resize(1); devices[0] = device_id; try { NQueenSolver nqueen(context, devices, profiling, threads, block_size, local, noatomics, novec, use_vec4); for(int i = 0; i < devices.size(); i++) { size_t name_length; err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, 0, &name_length); if(err == CL_SUCCESS) { std::string name; name.resize(name_length + 1); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, name_length, &name[0], &name_length); name[name_length] = 0; std::cerr << "Device " << i << ": " << name.c_str() << "\n"; std::cerr << "\tUsing " << nqueen.GetThreads(i) << " threads\n"; std::cerr << "\tBlock size = " << nqueen.GetBlockSize(i) << " threads\n"; if(nqueen.AtomicsEnabled(i)) { std::cerr << "\tUsing global atomics\n"; } if(nqueen.VectorizationEnabled(i)) { std::cerr << "\tUsing vectorization\n"; if(use_vec4) { std::cerr << "\tUse 4D vectors\n"; } else { std::cerr << "\tUse 2D vectors\n"; } } } } //start_time = std::clock(); solutions = nqueen.Compute(board_size, &unique_solutions); //end_time = std::clock(); } catch(CLError x) { if(x.GetErrorNo() == 1) { std::cerr << "1 OpenCL kernel execution failed\n"; } if(x.GetErrorNo() == 2) { std::cerr << "2 OpenCL kernel execution failed\n"; } if(x.GetErrorNo() == 3) { std::cerr << "3 OpenCL kernel execution failed\n"; } else { std::cerr << x << "\n"; } } stopwatch_stop(&sw); clReleaseContext(context); } std::cerr << "Solution took " << get_interval_by_sec(&sw) << " seconds to complete\n"; std::cerr << board_size << "-queen has " << solutions << " solutions (" << unique_solutions << " unique)\n"; printf("{ \"status\": %d, \"options\": \"-s %d\", \"time\": %f }\n", 1, board_size, get_interval_by_sec(&sw)); ocd_finalize(); return 0; }
int main ( int argc, char *argv[] ) { int matrix_dim = 32; /* default matrix_dim */ int opt, option_index=0; func_ret_t ret; const char *input_file = NULL; float *m, *mm; stopwatch sw; //cl_device_id device_id; //cl_context context; //cl_command_queue commands; cl_program clProgram; cl_kernel clKernel_diagonal; cl_kernel clKernel_perimeter; cl_kernel clKernel_internal; cl_int dev_type; cl_int errcode; FILE *kernelFile; char *kernelSource; size_t kernelLength; cl_mem d_m; ocd_init(&argc, &argv, NULL); ocd_initCL(); while ((opt = getopt_long(argc, argv, "::vs:i:", long_options, &option_index)) != -1 ) { switch(opt){ case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); fprintf(stderr, "Currently not supported, use -i instead\n"); fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); //break; default: fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); } } if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); } if (input_file) { printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify){ printf("Before LUD\n"); print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } size_t max_worksize[3]; errcode = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL); CHKERR(errcode, "Failed to get device info!"); //Start by 16*16, but if not allowed divide by two until MAX_WORK_ITEM_SIZES is less or equal than what we are going to ask for. while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0]) BLOCK_SIZE = BLOCK_SIZE/2; kernelFile = fopen("lud_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource = (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); clProgram = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, &kernelLength, &errcode); CHKERR(errcode, "Failed to create program with source!"); free(kernelSource); char arg[100]; sprintf(arg,"-D BLOCK_SIZE=%d", (int)BLOCK_SIZE); errcode = clBuildProgram(clProgram, 1, &device_id, arg, NULL, NULL); if (errcode == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLength; errcode = clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength); log = (char *) malloc(sizeof(char)*logLength); errcode = clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL); fprintf(stderr, "Kernel build error! Log:\n%s", log); free(log); return 0; } CHKERR(errcode, "Failed to build program!"); clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode); CHKERR(errcode, "Failed to create kernel!"); clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode); CHKERR(errcode, "Failed to create kernel!"); clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode); CHKERR(errcode, "Failed to create kernel!"); d_m = clCreateBuffer(context, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode); CHKERR(errcode, "Failed to create buffer!"); /* beginning of timing point */ stopwatch_start(&sw); errcode = clEnqueueWriteBuffer(commands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(errcode, "Failed to enqueue write buffer!"); int i=0; size_t localWorkSize[2]; size_t globalWorkSize[2]; //printf("BLOCK_SIZE: %d\n",BLOCK_SIZE); // printf("max Work-item Size: %d\n",(int)max_worksize[0]); #ifdef START_POWER for( int iter = 0; iter < 1000; iter++) #endif for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) { errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i); CHKERR(errcode, "Failed to set kernel arguments!"); localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = BLOCK_SIZE; errcode = clEnqueueNDRangeKernel(commands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(errcode, "Failed to enqueue kernel!"); errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i); CHKERR(errcode, "Failed to set kernel arguments!"); localWorkSize[0] = BLOCK_SIZE*2; globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0]; errcode = clEnqueueNDRangeKernel(commands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer) CHKERR(errcode, "Failed to enqueue kernel!"); END_TIMER(ocdTempTimer) errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i); CHKERR(errcode, "Failed to set kernel arguments!"); localWorkSize[0] = BLOCK_SIZE; localWorkSize[1] = BLOCK_SIZE; globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0]; globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1]; errcode = clEnqueueNDRangeKernel(commands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(errcode, "Failed to enqueue kernel!"); } errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i); CHKERR(errcode, "Failed to set kernel arguments!"); localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = BLOCK_SIZE; errcode = clEnqueueNDRangeKernel(commands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer) CHKERR(errcode, "Failed to enqueue kernel!"); END_TIMER(ocdTempTimer) errcode = clEnqueueReadBuffer(commands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer) END_TIMER(ocdTempTimer) /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw)); clReleaseMemObject(d_m); if (do_verify){ printf("After LUD\n"); print_matrix(m, matrix_dim); printf(">>>Verify<<<<\n"); printf("matrix_dim: %d\n",matrix_dim); lud_verify(mm, m, matrix_dim); free(mm); } clReleaseKernel(clKernel_diagonal); clReleaseKernel(clKernel_perimeter); clReleaseKernel(clKernel_internal); clReleaseProgram(clProgram); clReleaseCommandQueue(commands); clReleaseContext(context); free(m); ocd_finalize(); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */