int main(int argc, const char** argv) { cl_uint platform_count; cl_platform_id platforms[5]; cl_int err = CL_SUCCESS; unsigned int i, p; cl_device_type dev_type = CL_DEVICE_TYPE_ALL; void * ptrs[BLOCKS]; cl_command_queue cqs[BLOCKS]; cl_mem d_A[BLOCKS]; cl_mem d_C[BLOCKS]; cl_mem d_B[BLOCKS]; cl_event GPUDone[BLOCKS]; cl_event GPUExecution[BLOCKS]; struct timeval start, end; int workOffset[BLOCKS]; int workSize[BLOCKS]; unsigned int sizePerGPU = HC / BLOCKS; unsigned int sizeMod = HC % BLOCKS; size_t A_size = WA * HA; size_t A_mem_size = sizeof(TYPE) * A_size; TYPE* A_data; size_t B_size = WB * HB; size_t B_mem_size = sizeof(TYPE) * B_size; TYPE* B_data; size_t C_size = WC * HC; size_t C_mem_size = sizeof(TYPE) * C_size; TYPE* C_data; parse_args(argc, argv); check(clGetPlatformIDs(5, platforms, &platform_count)); if (platform_count == 0) { printf("No platform found\n"); exit(77); } cl_uint device_count; cl_uint devs[platform_count]; cl_device_id * devices[platform_count]; cl_context ctx[platform_count]; cl_command_queue * commandQueue[platform_count]; device_count = 0; for (p=0; p<platform_count; p++) { cl_platform_id platform = platforms[p]; err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]); if (err == CL_DEVICE_NOT_FOUND) { devs[p] = 0; continue; } if (devs[p] == 0) { printf("No OpenCL device found\n"); exit(77); } if (err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err); exit(EXIT_FAILURE); } if (devs[p] == 0) continue; devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]); commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]); check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL)); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err)); for(i = 0; i < devs[p]; ++i) { cl_device_id device = devices[p][i]; char name[2048]; name[0] = '\0'; clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL); printf("Device %d: %s\n", i, name); check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); } device_count += devs[p]; } if (device_count == 0) error("No device found\n"); cl_kernel multiplicationKernel[platform_count]; printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", (unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC); // allocate host memory for matrices A, B and C A_data = (TYPE*)malloc(A_mem_size); if (A_data == NULL) { perror("malloc"); } B_data = (TYPE*)malloc(B_mem_size); if (B_data == NULL) { perror("malloc"); } C_data = (TYPE*) malloc(C_mem_size); if (C_data == NULL) { perror("malloc"); } cl_program program[platform_count]; for (p=0; p<platform_count; p++) { if (devs[p] == 0) continue; check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err)); check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL)); check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err)); } printf("Initializing data...\n"); srand(2008); fillArray(A_data, A_size); fillArray(B_data, B_size); memset(C_data, 0, C_size); printf("Computing...\n"); workOffset[0] = 0; gettimeofday(&start, NULL); size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE}; int c = 0; for (p=0; p<platform_count;p++) { for (i=0; i<devs[p]; i++) { check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err)); c++; } } for(i=0; i < BLOCKS; ++i) { int d = i % device_count; cl_uint platform = 0; // determine device platform int dev = d; for (platform = 0; platform < platform_count; platform++) { if ((cl_int)(dev - devs[platform]) < 0) break; dev -= devs[platform]; } workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU; check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err)); check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err)); check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i])); check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d])); check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i])); size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])}; check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i])); // Non-blocking copy of result from device to host cqs[i] = commandQueue[platform][dev]; check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err)); if(i+1 < BLOCKS) workOffset[i + 1] = workOffset[i] + workSize[i]; } // CPU sync with GPU for (p=0; p<platform_count;p++) { cl_uint dev; for (dev=0; dev<devs[p]; dev++) { clFinish(commandQueue[p][dev]); } } gettimeofday(&end, NULL); double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec)); double dSeconds = timing/1000/1000; double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB; double gflops = 1.0e-9 * dNumOps/dSeconds; printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n", gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]); // compute reference solution if (check) { printf("Comparing results with CPU computation... "); TYPE* reference = (TYPE*)malloc(C_mem_size); computeReference(reference, A_data, B_data, HA, WA, WB); // check result int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f); if (res == 0) { printf("\n\n"); printDiff(reference, C_data, WC, HC, 100, 1.0e-5f); } else printf("PASSED\n\n"); free(reference); } for(i = 0; i < BLOCKS; i++) { clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL); } for(i = 0; i < BLOCKS; i++) { clFinish(cqs[i]); } for (i=0; i<device_count; i++) { clReleaseMemObject(d_B[i]); } for(i = 0; i < BLOCKS; i++) { clReleaseMemObject(d_A[i]); clReleaseMemObject(d_C[i]); clReleaseEvent(GPUExecution[i]); clReleaseEvent(GPUDone[i]); } for (p=0; p<platform_count;p++) { if (devs[p] == 0) continue; check(clReleaseKernel(multiplicationKernel[p])); check(clReleaseProgram(program[p])); check(clReleaseContext(ctx[p])); cl_uint k; for(k = 0; k < devs[p]; ++k) { check(clReleaseCommandQueue(commandQueue[p][k])); } } free(A_data); free(B_data); free(C_data); return 0; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }
static cl_int runSummarization(CLInfo* ci, SeparationCLMem* cm, const IntegralArea* ia, cl_uint which, Kahan* resultOut) { cl_int err = CL_SUCCESS; cl_mem buf; cl_uint offset; size_t global[1]; size_t local[1]; real result[2] = { -1.0, -1.0 }; cl_uint nElements = ia->r_steps * ia->mu_steps; cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] }; if (which == 0) { buf = cm->outBg; offset = 0; } else { buf = cm->outStreams; offset = (which - 1) * nElements; } /* First call reads from an offset into one of the output buffers */ err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]); err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf); err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements); err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error setting summarization kernel arguments"); return err; } local[0] = _summarizationWorkgroupSize; global[0] = mwNextMultiple(local[0], nElements); err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1, NULL, global, local, 0, NULL, NULL); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization kernel"); return err; } /* Why is this necessary? It seems to frequently break on the 7970 and nowhere else without it */ err = clFinish(ci->queue); //err = clFlush(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error finishing summarization kernel"); return err; } /* Later calls swap between summarization buffers without an offset */ nElements = (cl_uint) mwDivRoundup(global[0], local[0]); offset = 0; err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error setting summarization kernel offset argument"); return err; } while (nElements > 1) { /* Swap old summarization buffer to the input and shrink the range */ swapBuffers(sumBufs); global[0] = mwNextMultiple(local[0], nElements); err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]); err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]); err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error setting summarization kernel arguments"); return err; } /* err = clEnqueueBarrier(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization barrier"); return err; } */ err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1, NULL, global, local, 0, NULL, NULL); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization kernel"); return err; } err = clFinish(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error finishing summarization kernel"); return err; } nElements = (cl_uint) mwDivRoundup(global[0], local[0]); } err = clEnqueueBarrier(ci->queue); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error enqueuing summarization barrier"); return err; } err = clEnqueueReadBuffer(ci->queue, sumBufs[0], CL_TRUE, 0, 2 * sizeof(real), result, 0, NULL, NULL); if (err != CL_SUCCESS) { mwPerrorCL(err, "Error reading summarization result buffer"); return err; } resultOut->sum = result[0]; resultOut->correction = result[1]; return CL_SUCCESS; }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); float* h_areas = new float[nelr]; int* h_elements_surrounding_elements = new int[nelr*NNB]; float* h_normals = new float[nelr*NDIM*NNB]; // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } // Get program source. long kernelSize = getKernelSize(); char* kernelSource = new char[kernelSize]; getKernelSource(kernelSource, kernelSize); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); CHKERR(err, "Failed to create a compute program!"); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); log = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log); free(log); exit(1); } CHKERR(err, "Failed to build program!"); delete[] kernelSource; // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
int finish() { return clFinish(commands); }
// host stub function void ops_par_loop_advec_mom_kernel_mass_flux_z(char const *name, ops_block block, int dim, int *range, ops_arg arg0, ops_arg arg1) { // Timing double t1, t2, c1, c2; ops_arg args[2] = {arg0, arg1}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args, 2, range, 134)) return; #endif if (OPS_diags > 1) { ops_timing_realloc(134, "advec_mom_kernel_mass_flux_z"); OPS_kernels[134].count++; ops_timers_core(&c1, &t1); } // compute locally allocated range for the sub-block int start[3]; int end[3]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for (int n = 0; n < 3; n++) { start[n] = sb->decomp_disp[n]; end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; if (start[n] >= range[2 * n]) { start[n] = 0; } else { start[n] = range[2 * n] - start[n]; } if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) start[n] = range[2 * n]; if (end[n] >= range[2 * n + 1]) { end[n] = range[2 * n + 1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n] == MPI_PROC_NULL && (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); } #else for (int n = 0; n < 3; n++) { start[n] = range[2 * n]; end[n] = range[2 * n + 1]; } #endif int x_size = MAX(0, end[0] - start[0]); int y_size = MAX(0, end[1] - start[1]); int z_size = MAX(0, end[2] - start[2]); int xdim0 = args[0].dat->size[0]; int ydim0 = args[0].dat->size[1]; int xdim1 = args[1].dat->size[0]; int ydim1 = args[1].dat->size[1]; // build opencl kernel if not already built buildOpenCLKernels_advec_mom_kernel_mass_flux_z(xdim0, ydim0, xdim1, ydim1); // set up OpenCL thread blocks size_t globalWorkSize[3] = { ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x, ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y, ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z}; size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y, OPS_block_size_z}; // set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); base0 = base0 + args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 * (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); base1 = base1 + args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 * (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); ops_H_D_exchanges_device(args, 2); ops_halo_exchanges(args, 2, range); ops_H_D_exchanges_device(args, 2); if (OPS_diags > 1) { ops_timers_core(&c2, &t2); OPS_kernels[134].mpi_time += t2 - t1; } if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) { clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 0, sizeof(cl_mem), (void *)&arg0.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 1, sizeof(cl_mem), (void *)&arg1.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 2, sizeof(cl_int), (void *)&base0)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 3, sizeof(cl_int), (void *)&base1)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 4, sizeof(cl_int), (void *)&x_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 5, sizeof(cl_int), (void *)&y_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 6, sizeof(cl_int), (void *)&z_size)); // call/enque opencl kernel wrapper function clSafeCall(clEnqueueNDRangeKernel( OPS_opencl_core.command_queue, OPS_opencl_core.kernel[134], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL)); } if (OPS_diags > 1) { clSafeCall(clFinish(OPS_opencl_core.command_queue)); } if (OPS_diags > 1) { ops_timers_core(&c1, &t1); OPS_kernels[134].time += t1 - t2; } ops_set_dirtybit_device(args, 2); ops_set_halo_dirtybit3(&args[0], range); if (OPS_diags > 1) { // Update kernel record ops_timers_core(&c2, &t2); OPS_kernels[134].mpi_time += t2 - t1; OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0); OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1); } }
/* .External */ SEXP ocl_call(SEXP args) { struct arg_chain *float_args = 0; ocl_call_context_t *occ; int on, an = 0, ftype = FT_DOUBLE, ftsize, ftres, async; SEXP ker = CADR(args), olen, arg, res, octx, dimVec; cl_kernel kernel = getKernel(ker); cl_context context; cl_command_queue commands; cl_device_id device_id = getDeviceID(getAttrib(ker, Rf_install("device"))); cl_mem output; size_t wdims[3] = {0, 0, 0}; int wdim = 1; if (clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(context), &context, NULL) != CL_SUCCESS || !context) Rf_error("cannot obtain kernel context via clGetKernelInfo"); args = CDDR(args); res = Rf_getAttrib(ker, install("precision")); if (TYPEOF(res) == STRSXP && LENGTH(res) == 1 && CHAR(STRING_ELT(res, 0))[0] != 'd') ftype = FT_SINGLE; ftsize = (ftype == FT_DOUBLE) ? sizeof(double) : sizeof(float); olen = CAR(args); /* size */ args = CDR(args); on = Rf_asInteger(olen); if (on < 0) Rf_error("invalid output length"); ftres = (Rf_asInteger(CAR(args)) == 1) ? 1 : 0; /* native.result */ if (ftype != FT_SINGLE) ftres = 0; args = CDR(args); async = (Rf_asInteger(CAR(args)) == 1) ? 0 : 1; /* wait */ args = CDR(args); dimVec = coerceVector(CAR(args), INTSXP); /* dim */ wdim = LENGTH(dimVec); if (wdim > 3) Rf_error("OpenCL standard only supports up to three work item dimensions - use index vectors for higher dimensions"); if (wdim) { int i; /* we don't use memcpy in case int and size_t are different */ for (i = 0; i < wdim; i++) wdims[i] = INTEGER(dimVec)[i]; } if (wdim < 1 || wdims[0] < 1 || (wdim > 1 && wdims[1] < 1) || (wdim > 2 && wdims[2] < 1)) Rf_error("invalid dimensions - muse be a numeric vector with positive values"); args = CDR(args); occ = (ocl_call_context_t*) calloc(1, sizeof(ocl_call_context_t)); if (!occ) Rf_error("unable to allocate ocl_call context"); octx = PROTECT(R_MakeExternalPtr(occ, R_NilValue, R_NilValue)); R_RegisterCFinalizerEx(octx, ocl_call_context_fin, TRUE); occ->output = output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ftsize * on, NULL, &last_ocl_error); if (!output) Rf_error("failed to create output buffer of %d elements via clCreateBuffer (%d)", on, last_ocl_error); if (clSetKernelArg(kernel, an++, sizeof(cl_mem), &output) != CL_SUCCESS) Rf_error("failed to set first kernel argument as output in clSetKernelArg"); if (clSetKernelArg(kernel, an++, sizeof(on), &on) != CL_SUCCESS) Rf_error("failed to set second kernel argument as output length in clSetKernelArg"); occ->commands = commands = clCreateCommandQueue(context, device_id, 0, &last_ocl_error); if (!commands) ocl_err("clCreateCommandQueue"); if (ftype == FT_SINGLE) /* need conversions, create floats buffer */ occ->float_args = float_args = arg_alloc(0, 32); while ((arg = CAR(args)) != R_NilValue) { int n, ndiv = 1; void *ptr; size_t al; switch (TYPEOF(arg)) { case REALSXP: if (ftype == FT_SINGLE) { int i; float *f; double *d = REAL(arg); n = LENGTH(arg); f = (float*) malloc(sizeof(float) * n); if (!f) Rf_error("unable to allocate temporary single-precision memory for conversion from a double-precision argument vector of length %d", n); for (i = 0; i < n; i++) f[i] = d[i]; ptr = f; al = sizeof(float); arg_add(float_args, ptr); } else { ptr = REAL(arg); al = sizeof(double); } break; case INTSXP: ptr = INTEGER(arg); al = sizeof(int); break; case LGLSXP: ptr = LOGICAL(arg); al = sizeof(int); break; case RAWSXP: if (inherits(arg, "clFloat")) { ptr = RAW(arg); ndiv = al = sizeof(float); break; } default: Rf_error("only numeric or logical kernel arguments are supported"); /* no-ops but needed to make the compiler happy */ ptr = 0; al = 0; } n = LENGTH(arg); if (ndiv != 1) n /= ndiv; if (n == 1) {/* scalar */ if ((last_ocl_error = clSetKernelArg(kernel, an++, al, ptr)) != CL_SUCCESS) Rf_error("Failed to set scalar kernel argument %d (size=%d, error code %d)", an, al, last_ocl_error); } else { cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, al * n, ptr, &last_ocl_error); if (!input) Rf_error("Unable to create buffer (%d elements, %d bytes each) for vector argument %d (oclError %d)", n, al, an, last_ocl_error); if (!occ->mem_objects) occ->mem_objects = arg_alloc(0, 32); arg_add(occ->mem_objects, input); #if 0 /* we used this before CL_MEM_USE_HOST_PTR */ if ((last_ocl_error = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, al * n, ptr, 0, NULL, NULL)) != CL_SUCCESS) Rf_error("Failed to transfer data (%d elements) for vector argument %d (oclError %d)", n, an, last_ocl_error); #endif if ((last_ocl_error = clSetKernelArg(kernel, an++, sizeof(cl_mem), &input)) != CL_SUCCESS) Rf_error("Failed to set vector kernel argument %d (size=%d, length=%d, error %d)", an, al, n, last_ocl_error); /* clReleaseMemObject(input); */ } args = CDR(args); } if ((last_ocl_error = clEnqueueNDRangeKernel(commands, kernel, wdim, NULL, wdims, NULL, 0, NULL, async ? &occ->event : NULL)) != CL_SUCCESS) ocl_err("Kernel execution"); if (async) { /* asynchronous call -> get out and return the context */ #if USE_OCL_COMPLETE_CALLBACK last_ocl_error = clSetEventCallback(occ->event, CL_COMPLETE, ocl_complete_callback, occ); #endif clFlush(commands); /* the specs don't guarantee execution unless clFlush is called */ occ->ftres = ftres; occ->ftype = ftype; occ->on = on; Rf_setAttrib(octx, R_ClassSymbol, mkString("clCallContext")); UNPROTECT(1); return octx; } clFinish(commands); occ->finished = 1; /* we can release input memory objects now */ if (occ->mem_objects) { arg_free(occ->mem_objects, (afin_t) clReleaseMemObject); occ->mem_objects = 0; } if (float_args) { arg_free(float_args, 0); float_args = occ->float_args = 0; } res = ftres ? Rf_allocVector(RAWSXP, on * sizeof(float)) : Rf_allocVector(REALSXP, on); if (ftype == FT_SINGLE) { if (ftres) { if ((last_ocl_error = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, RAW(res), 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, last_ocl_error); PROTECT(res); Rf_setAttrib(res, R_ClassSymbol, mkString("clFloat")); UNPROTECT(1); } else { /* float - need a temporary buffer */ float *fr = (float*) malloc(sizeof(float) * on); double *r = REAL(res); int i; if (!fr) Rf_error("unable to allocate memory for temporary single-precision output buffer"); occ->float_out = fr; if ((last_ocl_error = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, fr, 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, last_ocl_error); for (i = 0; i < on; i++) r[i] = fr[i]; } } else if ((last_ocl_error = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(double) * on, REAL(res), 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d double elements, oclError %d)", on, last_ocl_error); ocl_call_context_fin(octx); UNPROTECT(1); return res; }
//Do the proper test using different sizes. static cl_ulong gws_test(size_t num, struct fmt_main * self) { cl_event myEvent; cl_int ret_code; cl_uint *tmpbuffer; cl_ulong startTime, endTime, runtime; int i, loops; //Prepare buffers. create_clobj(num, self); tmpbuffer = mem_alloc(sizeof(sha512_hash) * num); if (tmpbuffer == NULL) { fprintf(stderr, "Malloc failure in find_best_gws\n"); exit(EXIT_FAILURE); } queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue"); // Set salt. set_salt(get_salt("$6$saltstring$")); salt->initial = salt->rounds - get_multiple(salt->rounds, HASH_LOOPS); // Set keys for (i = 0; i < num; i++) { set_key("aaabaabaaa", i); } //** Get execution time **// HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_FALSE, 0, sizeof(sha512_salt), salt, 0, NULL, &myEvent), "Failed in clEnqueueWriteBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime = endTime - startTime; //** Get execution time **// HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, sizeof(sha512_password) * num, plaintext, 0, NULL, &myEvent), "Failed in clEnqueueWriteBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; //** Get execution time **// if (gpu(source_in_use) || use_local(source_in_use)) { ret_code = clEnqueueNDRangeKernel(queue_prof, prepare_kernel, 1, NULL, &num, &local_work_size, 0, NULL, &myEvent); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; } loops = gpu(source_in_use) || use_local(source_in_use) ? (salt->rounds / HASH_LOOPS) : 1; //** Get execution time **// for (i = 0; i < loops; i++) { ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &num, &local_work_size, 0, NULL, &myEvent); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; } //** Get execution time **// HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0, sizeof(sha512_hash) * num, tmpbuffer, 0, NULL, &myEvent), "Failed in clEnqueueReadBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; MEM_FREE(tmpbuffer); HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), "Failed in clReleaseCommandQueue"); release_clobj(); if (ret_code != CL_SUCCESS) { if (ret_code != CL_INVALID_WORK_GROUP_SIZE) fprintf(stderr, "Error %d\n", ret_code); return 0; } return runtime; }
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, int64_t __maybe_unused max_nonce) { const int thr_id = thr->id; struct opencl_thread_data *thrdata = thr->cgpu_data; struct cgpu_info *gpu = thr->cgpu; _clState *clState = clStates[thr_id]; const cl_kernel *kernel = &clState->kernel; const int dynamic_us = opt_dynamic_interval * 1000; cl_bool blocking; cl_int status; size_t globalThreads[1]; size_t localThreads[1] = { clState->wsize }; unsigned int threads; int64_t hashes; if (gpu->dynamic) blocking = CL_TRUE; else blocking = CL_FALSE; /* This finish flushes the readbuffer set with CL_FALSE later */ if (!blocking) clFinish(clState->commandQueue); if (gpu->dynamic) { struct timeval diff; suseconds_t gpu_us; gettimeofday(&gpu->tv_gpuend, NULL); timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff); gpu_us = diff.tv_sec * 1000000 + diff.tv_usec; if (likely(gpu_us >= 0)) { gpu->gpu_us_average = (gpu->gpu_us_average + gpu_us * 0.63) / 1.63; /* Try to not let the GPU be out for longer than * opt_dynamic_interval in ms, but increase * intensity when the system is idle in dynamic mode */ if (gpu->gpu_us_average > dynamic_us) { if (gpu->intensity > MIN_INTENSITY) --gpu->intensity; } else if (gpu->gpu_us_average < dynamic_us / 2) { if (gpu->intensity < MAX_INTENSITY) ++gpu->intensity; } } } set_threads_hashes(clState->vwidth, &threads, &hashes, globalThreads, localThreads[0], gpu->intensity); if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); return -1; } /* MAXBUFFERS entry is used as a flag to say nonces exist */ if (thrdata->res[FOUND]) { /* Clear the buffer again */ status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); return -1; } if (unlikely(thrdata->last_work)) { applog(LOG_DEBUG, "GPU %d found something in last work?", gpu->device_id); postcalc_hash_async(thr, thrdata->last_work, thrdata->res); thrdata->last_work = NULL; } else { applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); } memset(thrdata->res, 0, BUFFERSIZE); if (!blocking) clFinish(clState->commandQueue); } gettimeofday(&gpu->tv_gpustart, NULL); if (clState->goffset) { size_t global_work_offset[1]; global_work_offset[0] = work->blk.nonce; status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset, globalThreads, localThreads, 0, NULL, NULL); } else status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); return -1; } status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0, BUFFERSIZE, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status); return -1; } /* The amount of work scanned can fluctuate when intensity changes * and since we do this one cycle behind, we increment the work more * than enough to prevent repeating work */ work->blk.nonce += gpu->max_hashes; return hashes; }
void runKernel(void) { cl_int err; cl_event event; size_t global_item_size_max = 16; size_t global_item_size = cl_width * cl_height; size_t global_item_size2[] = {cl_width, cl_height}; char *output; int i; /* local item size : * the number of work item * in a work group in each diamension * * the only constraint for the global_work_size is * that it must be a multiple of the local_work_size (for each dimension). */ size_t local_item_size = 64; size_t local_item_size2[] = {64, 8}; //this will update our system by calculating new velocity and updating the positions of our particles //Make sure OpenGL is done using our VBOs glFinish(); // map OpenGL buffer object for writing from OpenCL //this passes in the vector of VBO buffer objects (position and color) err = clEnqueueAcquireGLObjects(command_queue, 2, cl_pbos, 0, NULL, NULL); checkError("acquireGLObjects", err); clFinish(command_queue); //execute the kernel //err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, // &global_item_size, // //&local_item_size, // NULL, // 0, NULL, &event); err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_item_size2, local_item_size2, //NULL, 0, NULL, &event); checkError("clEnqueueNDRangeKernel", err); err = clEnqueueNDRangeKernel(command_queue, kernel_max, 1, NULL, &global_item_size_max, //&local_item_size, NULL, 0, NULL, &event); checkError("clEnqueueNDRangeKernel max", err); ///* Transfer result to host */ output = malloc(4 * 4); err = clEnqueueReadBuffer(command_queue, mobj, CL_TRUE, 0, 4 * 4 * sizeof(char), output, 0, NULL, NULL); checkError("clEnqueueReadBuffer", err); free(output); //clFinish(command_queue); //Release the VBOs so OpenGL can play with them clEnqueueReleaseGLObjects(command_queue, 2, cl_pbos, 0, NULL, NULL); checkError("releaseGLObjects", err); clFlush(command_queue); clFinish(command_queue); }
int main(int argc, char *argv[]){ cl_uint numPlatforms; cl_platform_id* clSelectedPlatformID = NULL; int err; // error code returned from api calls int data[DATA_SIZE]; // original data set given to device int results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array if(parseArgs(argc, argv)){ return 0; } // Fill our data set with random int values unsigned int count = DATA_SIZE; //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes the collatz of an input array // const char *KernelSource = fileToString("gpuFunctions.c"); //get Platform clGetPlatformIDs(0, NULL, &numPlatforms); clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms); err = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL); //get Device err = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } //create context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "allToOne", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } timer t = createTimer(); for(int i =0;i<rep;i++){ initData(data); // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } } double timeEnd = getTime(t); // Validate our results // correct = 0; for(int i = 0; i < arraySize; i++) { if(results[i] >= 0){ correct++; if(i==0){ printf("%d",results[i]); }else{ printf(",%d",results[i]); } } } printf("\n"); // Print a brief summary detailing the results printf("Computed '%d/%d' values to 1!\n", correct, arraySize); printf("TIME- %f\n",timeEnd); // Shutdown and cleanup clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
void Extrae_OpenCL_clCreateCommandQueue (cl_command_queue queue, cl_device_id device, cl_command_queue_properties properties) { if (!Extrae_OpenCL_lookForOpenCLQueue (queue, NULL)) { cl_int err; char _threadname[THREAD_INFO_NAME_LEN]; char _hostname[HOST_NAME_MAX]; char *_device_type; int prev_threadid, found, idx; cl_device_type device_type; cl_event event; idx = nCommandQueues; CommandQueues = (RegisteredCommandQueue_t*) realloc ( CommandQueues, sizeof(RegisteredCommandQueue_t)*(nCommandQueues+1)); if (CommandQueues == NULL) { fprintf (stderr, PACKAGE_NAME": Fatal error! Failed to allocate memory for OpenCL Command Queues\n"); exit (-1); } CommandQueues[idx].queue = queue; CommandQueues[idx].isOutOfOrder = (properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0; err = clGetDeviceInfo (device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); if (err == CL_SUCCESS) { if (device_type == CL_DEVICE_TYPE_GPU) _device_type = "GPU"; else if (device_type == CL_DEVICE_TYPE_CPU) _device_type = "CPU"; else _device_type = "Other"; } else _device_type = "Unknown"; /* Was the thread created before (i.e. did we executed a cudadevicereset?) */ if (gethostname(_hostname, HOST_NAME_MAX) == 0) sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx, _hostname); else sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx, "unknown-host"); prev_threadid = Extrae_search_thread_name (_threadname, &found); if (found) { /* If thread name existed, reuse its thread id */ CommandQueues[idx].threadid = prev_threadid; } else { /* For timing purposes we change num of threads here instead of doing Backend_getNumberOfThreads() + CUDAdevices*/ Backend_ChangeNumberOfThreads (Backend_getNumberOfThreads() + 1); CommandQueues[idx].threadid = Backend_getNumberOfThreads()-1; /* Set thread name */ Extrae_set_thread_name (CommandQueues[idx].threadid, _threadname); } CommandQueues[idx].nevents = 0; #ifdef CL_VERSION_1_2 err = clEnqueueBarrierWithWaitList (queue, 0, NULL, &event); #else err = clEnqueueBarrier (queue); if (err == CL_SUCCESS) err = clEnqueueMarker (queue, &event); #endif CommandQueues[idx].host_reference_time = TIME; if (err == CL_SUCCESS) { err = clFinish(queue); if (err != CL_SUCCESS) { fprintf (stderr, PACKAGE_NAME": Error in clFinish (error = %d)! Dying...\n", err); exit (-1); } err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &(CommandQueues[idx].device_reference_time), NULL); if (err != CL_SUCCESS) { fprintf (stderr, PACKAGE_NAME": Error in clGetEventProfilingInfo (error = %d)! Dying...\n", err); exit (-1); } } else { fprintf (stderr, PACKAGE_NAME": Error while looking for clock references in host & accelerator\n"); exit (-1); } nCommandQueues++; } }
int main(int argc, char** argv) { int err; // error code returned from api calls float *cpu_xyz; //calculate the results on the CPU float *gpu_xyz; //calculate the results on the GPU unsigned int correct; // number of correct results returned char* kernel_source; //kernel source code cl_platform_id platform_id; // compute platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel code //stuff were going to query cl_int preferred_workgroup_size; cl_int max_workgroup_size; cl_mem cl_output; // device memory used for the output array int i, j, k; if(!cl_load("main.cl", &kernel_source)) { printf("Your file didn't load."); return 1; } int theta, phi, r; for(i=0; i<V_THETA_MAX; i++) { theta = i*V_THETA_INC; for(j=0; j<V_PHI_MAX; j++) { phi = j*V_PHI_INC; for(k=0; k<V_R_MAX; k++) { r = k*V_R_INC; } } } //get the platform information. This corresponds to vendor implementations of opencl cl_uint platforms; err = clGetPlatformIDs(1, &platform_id, &platforms); if(err != CL_SUCCESS) { printf("Error: Failed to query platform ids!\n"); return EXIT_FAILURE; } // Connect to a compute device. A GPU in this case. // cl_uint num_devices; err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context. A handle to the combination of platform and device. // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue. We will use this to send work to the CPU. // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // create a program, given the loaded source code. // program = clCreateProgramWithSource(context, 1, (const char **) &kernel_source, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Compile the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // create (select) a kernel function from the compiled program // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Get the preferred workgroup size multiple // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Get the preferred workgroup size // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Create the output array in device memory for our calculation // output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // // global = count; global = local; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if((results[i]) == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main() { cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; char string[MEM_SIZE]; FILE *fp; char fileName[] = "./hello.cl"; char *source_str; size_t source_size; /* Load the source code containing the kernel*/ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Get Platform and Device Info */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* Create OpenCL context */ context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); /* Create Command Queue */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* Create Memory Buffer */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE,MEM_SIZE * sizeof(char), NULL, &ret); /* Create Kernel Program from the source */ program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); /* Build Kernel Program */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create OpenCL Kernel */ kernel = clCreateKernel(program, "hello", &ret); /* Set OpenCL Kernel Parameters */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); /* Execute OpenCL Kernel */ ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL); /* Copy results from the memory buffer */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(char),string, 0, NULL, NULL); /* Display Result */ puts(string); /* Finalization */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(source_str); system("Pause"); return 0; }
void btParticlesDynamicsWorld::runIntegrateMotionKernel() { cl_int ciErrNum; if(m_useCpuControls[SIMSTAGE_INTEGRATE_MOTION]->m_active) { // CPU version #if 1 // read from GPU unsigned int memSize = sizeof(btVector3) * m_numParticles; ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); for(int index = 0; index < m_numParticles; index++) { btVector3 pos = m_hPos[index]; btVector3 vel = m_hVel[index]; pos[3] = 1.0f; vel[3] = 0.0f; // apply gravity btVector3 gravity; gravity[0] = m_simParams.m_gravity[0]; gravity[1] = m_simParams.m_gravity[1]; gravity[2] = m_simParams.m_gravity[2]; float particleRad = m_simParams.m_particleRad; float globalDamping = m_simParams.m_globalDamping; float boundaryDamping = m_simParams.m_boundaryDamping; vel += gravity * m_timeStep; vel *= globalDamping; // integrate position pos += vel * m_timeStep; // collide with world boundaries btVector3 worldMin; worldMin[0] = m_simParams.m_worldMin[0]; worldMin[1] = m_simParams.m_worldMin[1]; worldMin[2] = m_simParams.m_worldMin[2]; btVector3 worldMax; worldMax[0] = m_simParams.m_worldMax[0]; worldMax[1] = m_simParams.m_worldMax[1]; worldMax[2] = m_simParams.m_worldMax[2]; for(int j = 0; j < 3; j++) { if(pos[j] < (worldMin[j] + particleRad)) { pos[j] = worldMin[j] + particleRad; vel[j] *= boundaryDamping; } if(pos[j] > (worldMax[j] - particleRad)) { pos[j] = worldMax[j] - particleRad; vel[j] *= boundaryDamping; } } // write back position and velocity m_hPos[index] = pos; m_hVel[index] = vel; } #endif // write back to GPU memSize = sizeof(btVector3) * m_numParticles; ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); } else { // Set work size and execute the kernel ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 4, sizeof(float), &m_timeStep); oclCHECKERROR(ciErrNum, CL_SUCCESS); runKernelWithWorkgroupSize(PARTICLES_KERNEL_INTEGRATE_MOTION, m_numParticles); ciErrNum = clFinish(m_cqCommandQue); oclCHECKERROR(ciErrNum, CL_SUCCESS); } }
void call_kernel(float *data,unsigned int count,char * cl_name,float *results) { FILE* programHandle; size_t programSize, KernelSourceSize; char *programBuffer, *KernelSource; size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array int err; int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); commands = clCreateCommandQueue(context, device_id, 0, &err); //---------------------------------------------------------------------------- // get size of kernel source programHandle = fopen(cl_name, "r"); fseek(programHandle, 0, SEEK_END); programSize = ftell(programHandle); rewind(programHandle); programBuffer = (char*) malloc(programSize + 1); programBuffer[programSize] = '\0'; fread(programBuffer, sizeof(char), programSize, programHandle); fclose(programHandle); // create program from buffer program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL); free(programBuffer); // read kernel source back in from program to check clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize); KernelSource = (char*) malloc(KernelSourceSize); clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL); program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); kernel = clCreateKernel(program, "square", &err); //---------------------------------------------------------------------------- input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); printf("nKernel source:\n\n %s \n", KernelSource); free(KernelSource); }
int main() { char buf[]="Hello, World!"; size_t srcsize, worksize=strlen(buf); cl_int error; cl_platform_id platform; cl_device_id device; cl_uint platforms, devices; // Fetch the Platform and Device IDs; we only want one. error=clGetPlatformIDs(1, &platform, &platforms); error=clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &devices); cl_context_properties properties[]={ CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; // Note that nVidia's OpenCL requires the platform property cl_context context=clCreateContext(properties, 1, &device, NULL, NULL, &error); cl_command_queue cq = clCreateCommandQueue(context, device, 0, &error); rot13(buf); // scramble using the CPU puts(buf); // Just to demonstrate the plaintext is destroyed //char src[8192]; //FILE *fil=fopen("rot13.cl","r"); //srcsize=fread(src, sizeof src, 1, fil); //fclose(fil); const char *src=rot13_cl; srcsize=strlen(rot13_cl); const char *srcptr[]={src}; // Submit the source code of the rot13 kernel to OpenCL cl_program prog=clCreateProgramWithSource(context, 1, srcptr, &srcsize, &error); // and compile it (after this we could extract the compiled version) error=clBuildProgram(prog, 0, NULL, "", NULL, NULL); // Allocate memory for the kernel to work with cl_mem mem1, mem2; mem1=clCreateBuffer(context, CL_MEM_READ_ONLY, worksize, NULL, &error); mem2=clCreateBuffer(context, CL_MEM_WRITE_ONLY, worksize, NULL, &error); // get a handle and map parameters for the kernel cl_kernel k_rot13=clCreateKernel(prog, "rot13", &error); clSetKernelArg(k_rot13, 0, sizeof(mem1), &mem1); clSetKernelArg(k_rot13, 1, sizeof(mem2), &mem2); // Target buffer just so we show we got the data from OpenCL char buf2[sizeof buf]; buf2[0]='?'; buf2[worksize]=0; // Send input data to OpenCL (async, don't alter the buffer!) error=clEnqueueWriteBuffer(cq, mem1, CL_FALSE, 0, worksize, buf, 0, NULL, NULL); // Perform the operation error=clEnqueueNDRangeKernel(cq, k_rot13, 1, NULL, &worksize, &worksize, 0, NULL, NULL); // Read the result back into buf2 error=clEnqueueReadBuffer(cq, mem2, CL_FALSE, 0, worksize, buf2, 0, NULL, NULL); // Await completion of all the above error=clFinish(cq); // Finally, output out happy message. puts(buf2); }
int main(int argc, char** argv) { int rank, size; // MPI rank & size int err; // error code returned from OpenCL calls float h_a[LENGTH]; // a vector float h_b[LENGTH]; // b vector float h_c[LENGTH]; // c vector (a+b) returned from the compute device (local per task) float _h_c[LENGTH]; // c vector (a+b) returned from the compute device (global for master) unsigned int correct; // number of correct results size_t global; // global domain size size_t local; // local domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector int mycount, i; err = MPI_Init (&argc, &argv); if (err != MPI_SUCCESS) { printf ("MPI_Init failed!\n"); exit (-1); } err = MPI_Comm_rank (MPI_COMM_WORLD, &rank); if (err != MPI_SUCCESS) { printf ("MPI_Comm_rank failed!\n"); exit (-1); } err = MPI_Comm_size (MPI_COMM_WORLD, &size); if (err != MPI_SUCCESS) { printf ("MPI_Comm_size failed\n"); exit (-1); } if (LENGTH % size != 0) { printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH); exit (-1); } mycount = LENGTH / size; if (rank == 0) { for (i = 0; i < LENGTH; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_a[i] = i; h_b[i] = i*2; } err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_b\n"); exit (-1); } } else { err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_b\n"); exit (-1); } } // Set up platform cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n"); return EXIT_FAILURE; } // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } else { if (output_device_info (rank, device_id) != CL_SUCCESS) return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); if (!ko_vadd || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input (a, b) and output (c) arrays in device memory d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL); if (!d_a || !d_b || !d_c) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write a and b vectors into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_a to source array!\n"); exit(1); } err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_b to source array!\n"); exit(1); } // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device global = LENGTH; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the commands to complete before reading back results clFinish(commands); // Read back the results from the compute device err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Gather failed receiving h_c\n"); exit (-1); } if (rank == 0) { // Test the results correct = 0; float tmp; for(i = 0; i < LENGTH; i++) { tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp tmp -= _h_c[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]); } // summarize results printf("C = A+B: %d out of %d results were correct.\n", correct, LENGTH); } // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); err = MPI_Finalize (); if (err != MPI_SUCCESS) { printf ("MPI_Finalize failed!\n"); exit (-1); } return 0; }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
// host stub function void ops_par_loop_update_halo_kernel2_xvel_minus_4_a(char const *name, ops_block block, int dim, int* range, ops_arg arg0, ops_arg arg1, ops_arg arg2) { ops_arg args[3] = { arg0, arg1, arg2}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args,3,range,55)) return; #endif ops_timing_realloc(55,"update_halo_kernel2_xvel_minus_4_a"); OPS_kernels[55].count++; //compute locally allocated range for the sub-block int start[2]; int end[2]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for ( int n=0; n<2; n++ ){ start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; if (start[n] >= range[2*n]) { start[n] = 0; } else { start[n] = range[2*n] - start[n]; } if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; if (end[n] >= range[2*n+1]) { end[n] = range[2*n+1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); } #else //OPS_MPI for ( int n=0; n<2; n++ ){ start[n] = range[2*n];end[n] = range[2*n+1]; } #endif //OPS_MPI int x_size = MAX(0,end[0]-start[0]); int y_size = MAX(0,end[1]-start[1]); int xdim0 = args[0].dat->size[0]*args[0].dat->dim; int xdim1 = args[1].dat->size[0]*args[1].dat->dim; //build opencl kernel if not already built buildOpenCLKernels_update_halo_kernel2_xvel_minus_4_a( xdim0,xdim1); //Timing double t1,t2,c1,c2; ops_timers_core(&c2,&t2); //set up OpenCL thread blocks size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,1}; int *arg2h = (int *)arg2.data; int consts_bytes = 0; consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); reallocConstArrays(consts_bytes); consts_bytes = 0; arg2.data = OPS_consts_h + consts_bytes; arg2.data_d = OPS_consts_d + consts_bytes; for (int d=0; d<NUM_FIELDS; d++) ((int *)arg2.data)[d] = arg2h[d]; consts_bytes += ROUND_UP(NUM_FIELDS*sizeof(int)); mvConstArraysToDevice(consts_bytes); int dat0 = args[0].dat->elem_size; int dat1 = args[1].dat->elem_size; //set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif //OPS_MPI int base0 = 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif //OPS_MPI int base1 = 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); ops_H_D_exchanges_device(args, 3); ops_halo_exchanges(args,3,range); ops_H_D_exchanges_device(args, 3); ops_timers_core(&c1,&t1); OPS_kernels[55].mpi_time += t1-t2; clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 0, sizeof(cl_mem), (void*) &arg0.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 1, sizeof(cl_mem), (void*) &arg1.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 2, sizeof(cl_mem), (void*) &arg2.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 3, sizeof(cl_int), (void*) &base0 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 4, sizeof(cl_int), (void*) &base1 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 5, sizeof(cl_int), (void*) &x_size )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[55], 6, sizeof(cl_int), (void*) &y_size )); //call/enque opencl kernel wrapper function clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[55], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); if (OPS_diags>1) { clSafeCall( clFinish(OPS_opencl_core.command_queue) ); } ops_set_dirtybit_device(args, 3); ops_set_halo_dirtybit3(&args[0],range); ops_set_halo_dirtybit3(&args[1],range); //Update kernel record ops_timers_core(&c2,&t2); OPS_kernels[55].time += t2-t1; OPS_kernels[55].transfer += ops_compute_transfer(dim, range, &arg0); OPS_kernels[55].transfer += ops_compute_transfer(dim, range, &arg1); }
SEXP ocl_collect_call(SEXP octx, SEXP wait) { SEXP res = R_NilValue; ocl_call_context_t *occ; int on; if (!Rf_inherits(octx, "clCallContext")) Rf_error("Invalid call context"); occ = (ocl_call_context_t*) R_ExternalPtrAddr(octx); if (!occ || occ->finished) Rf_error("The call results have already been collected, they cannot be retrieved twice"); if (Rf_asInteger(wait) == 0 && occ->event) { cl_int status; if ((last_ocl_error = clGetEventInfo(occ->event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL)) != CL_SUCCESS) ocl_err("querying event object for the supplied context"); if (status < 0) Rf_error("Asynchronous call failed with error code 0x%x", (int) -status); if (status != CL_COMPLETE) return R_NilValue; } clFinish(occ->commands); occ->finished = 1; /* we can release input memory objects now */ if (occ->mem_objects) { arg_free(occ->mem_objects, (afin_t) clReleaseMemObject); occ->mem_objects = 0; } if (occ->float_args) { arg_free(occ->float_args, 0); occ->float_args = 0; } on = occ->on; res = occ->ftres ? Rf_allocVector(RAWSXP, on * sizeof(float)) : Rf_allocVector(REALSXP, on); if (occ->ftype == FT_SINGLE) { if (occ->ftres) { if ((last_ocl_error = clEnqueueReadBuffer( occ->commands, occ->output, CL_TRUE, 0, sizeof(float) * on, RAW(res), 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, last_ocl_error); PROTECT(res); Rf_setAttrib(res, R_ClassSymbol, mkString("clFloat")); UNPROTECT(1); } else { /* float - need a temporary buffer */ float *fr = (float*) malloc(sizeof(float) * on); double *r = REAL(res); int i; if (!fr) Rf_error("unable to allocate memory for temporary single-precision output buffer"); occ->float_out = fr; if ((last_ocl_error = clEnqueueReadBuffer( occ->commands, occ->output, CL_TRUE, 0, sizeof(float) * on, fr, 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, last_ocl_error); for (i = 0; i < on; i++) r[i] = fr[i]; } } else if ((last_ocl_error = clEnqueueReadBuffer( occ->commands, occ->output, CL_TRUE, 0, sizeof(double) * on, REAL(res), 0, NULL, NULL )) != CL_SUCCESS) Rf_error("Unable to transfer result vector (%d double elements, oclError %d)", on, last_ocl_error); ocl_call_context_fin(octx); return res; }
int vadd(void) { // Create the two input vectors int i; const int LIST_SIZE = 1024; int *A = (int*)malloc(sizeof(int)*LIST_SIZE); int *B = (int*)malloc(sizeof(int)*LIST_SIZE); for(i = 0; i < LIST_SIZE; i++) { A[i] = i; B[i] = LIST_SIZE - i; } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; const char *fname = "/home/ckit/program/workspace_java/OpenCLHookSample/jni/vector_add_kernel.cl"; // const char *fname = "vadd.ir"; fp = fopen(fname, "rb"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platform_id ); //#define XXX CL_DEVICE_TYPE_DEFAULT // #define XXX CL_DEVICE_TYPE_ALL // #define XXX CL_DEVICE_TYPE_GPU #define XXX CL_DEVICE_TYPE_CPU cl_uint num_platforms = 2; cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)* num_platforms); if(NULL == platforms){ printf("malloc err!\n"); } ret = clGetPlatformIDs(2, platforms, &ret_num_platforms); printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platforms[1] ); ret = clGetDeviceIDs( platforms[0], XXX, 1, &device_id, &ret_num_devices); printf("clGetDeviceIDs err=%d,num_platforms=%d, device_id=%x\n", ret, ret_num_platforms, (unsigned int)device_id ); char name[64]; ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char)*64, name, NULL); printf("device_name : %s\n", name); // Create an OpenCL context // cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); cl_context_properties cps[3] = { (cl_context_properties)CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0], (cl_context_properties)0 }; cl_context context = clCreateContextFromType( cps, XXX, NULL, NULL, &ret); printf("clCreateContextFromType err=%d,device_type=%x\n", ret, (unsigned int)XXX); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); // Copy the lists A and B to their respective memory buffers ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // cl_int status; // cl_int err; // cl_program program = clCreateProgramWithBinary( // context, 1, &device_id, &source_size, (const unsigned char **)&source_str, &status, &err); // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); // Execute the OpenCL kernel on the list size_t global_item_size = LIST_SIZE; // Process the entire lists size_t local_item_size = 64; // Process in groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C int *C = (int*)malloc(sizeof(int)*LIST_SIZE); ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); // Display the result to the screen for(i = 0; i < /*LIST_SIZE*/10; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(A); free(B); free(C); return 0; }
void copy(cl_command_queue commands, cl_mem dst, cl_mem src, int N) { int err = clEnqueueCopyBuffer(commands, src, dst, 0, 0, N*sizeof(T), 0, NULL, NULL); clFinish(commands); CHKERR(err, "Unable to copy memory"); }
/*virtual*/ bool NuiKinfuOpenCLFrame::BufferToMappableTexture(NuiCLMappableData* pMappableData) { assert(pMappableData); if (!pMappableData) return false; if (!m_dirty) return false; m_dirty = false; if (!m_colorsCL) return false; cl_kernel rgbaKernel = NuiOpenCLKernelManager::instance().acquireKernel(E_COLOR_TO_TEXTURE); assert(rgbaKernel); if (!rgbaKernel) { NUI_ERROR("Get kernel 'E_COLOR_TO_TEXTURE' failed!\n"); return false; } if (m_nWidth != pMappableData->ColorTex().width() || m_nHeight != pMappableData->ColorTex().height()) { NuiTextureMappableAccessor::updateImpl( pMappableData->ColorTex(), m_nWidth, m_nHeight, NULL ); } cl_mem texGL = NuiOpenCLBufferFactory::asTexture2DCL(pMappableData->ColorTex()); cl_int err = CL_SUCCESS; cl_command_queue queue = NuiOpenCLGlobal::instance().clQueue(); // err = clFinish(queue); NUI_CHECK_CL_ERR(err); // Acquire OpenGL objects before use cl_mem glObjs[] = { texGL }; openclutil::enqueueAcquireHWObjects( sizeof(glObjs) / sizeof(cl_mem), glObjs, 0, nullptr, nullptr); // Set kernel arguments cl_uint idx = 0; err = clSetKernelArg(rgbaKernel, idx++, sizeof(cl_mem), &m_colorsCL); NUI_CHECK_CL_ERR(err); err = clSetKernelArg(rgbaKernel, idx++, sizeof(cl_mem), &texGL); NUI_CHECK_CL_ERR(err); // Run kernel to calculate size_t kernelGlobalSize[2] = { m_nWidth, m_nHeight }; err = clEnqueueNDRangeKernel( queue, rgbaKernel, 2, nullptr, kernelGlobalSize, nullptr, 0, NULL, NULL ); NUI_CHECK_CL_ERR(err); err = clFinish(queue); NUI_CHECK_CL_ERR(err); // Release OpenGL objects openclutil::enqueueReleaseHWObjects( sizeof(glObjs) / sizeof(cl_mem), glObjs, 0, nullptr, nullptr); return true; }
// host stub function void ops_par_loop_calc_dt_kernel_print(char const *name, ops_block block, int dim, int* range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6) { ops_arg args[7] = { arg0, arg1, arg2, arg3, arg4, arg5, arg6}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args,7,range,30)) return; #endif ops_timing_realloc(30,"calc_dt_kernel_print"); OPS_kernels[30].count++; //compute locally allocated range for the sub-block int start[2]; int end[2]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for ( int n=0; n<2; n++ ){ start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; if (start[n] >= range[2*n]) { start[n] = 0; } else { start[n] = range[2*n] - start[n]; } if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; if (end[n] >= range[2*n+1]) { end[n] = range[2*n+1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); } #else //OPS_MPI for ( int n=0; n<2; n++ ){ start[n] = range[2*n];end[n] = range[2*n+1]; } #endif //OPS_MPI int x_size = MAX(0,end[0]-start[0]); int y_size = MAX(0,end[1]-start[1]); int xdim0 = args[0].dat->size[0]*args[0].dat->dim; int xdim1 = args[1].dat->size[0]*args[1].dat->dim; int xdim2 = args[2].dat->size[0]*args[2].dat->dim; int xdim3 = args[3].dat->size[0]*args[3].dat->dim; int xdim4 = args[4].dat->size[0]*args[4].dat->dim; int xdim5 = args[5].dat->size[0]*args[5].dat->dim; //build opencl kernel if not already built buildOpenCLKernels_calc_dt_kernel_print( xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); //Timing double t1,t2,c1,c2; ops_timers_core(&c2,&t2); //set up OpenCL thread blocks size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, 1}; size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,1}; #ifdef OPS_MPI double *arg6h = (double *)(((ops_reduction)args[6].data)->data + ((ops_reduction)args[6].data)->size * block->index); #else //OPS_MPI double *arg6h = (double *)(((ops_reduction)args[6].data)->data); #endif //OPS_MPI int nblocks = ((x_size-1)/OPS_block_size_x+ 1)*((y_size-1)/OPS_block_size_y + 1); int maxblocks = nblocks; int reduct_bytes = 0; reduct_bytes += ROUND_UP(maxblocks*12*sizeof(double)); reallocReductArrays(reduct_bytes); reduct_bytes = 0; int r_bytes6 = reduct_bytes/sizeof(double); arg6.data = OPS_reduct_h + reduct_bytes; arg6.data_d = OPS_reduct_d;// + reduct_bytes; for (int b=0; b<maxblocks; b++) for (int d=0; d<12; d++) ((double *)arg6.data)[d+b*12] = ZERO_double; reduct_bytes += ROUND_UP(maxblocks*12*sizeof(double)); mvReductArraysToDevice(reduct_bytes); int dat0 = args[0].dat->elem_size; int dat1 = args[1].dat->elem_size; int dat2 = args[2].dat->elem_size; int dat3 = args[3].dat->elem_size; int dat4 = args[4].dat->elem_size; int dat5 = args[5].dat->elem_size; //set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif //OPS_MPI int base0 = 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif //OPS_MPI int base1 = 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; #endif //OPS_MPI int base2 = 1 * (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); base2 = base2 + args[2].dat->size[0] * (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; #endif //OPS_MPI int base3 = 1 * (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); base3 = base3 + args[3].dat->size[0] * (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; #endif //OPS_MPI int base4 = 1 * (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); base4 = base4 + args[4].dat->size[0] * (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; #endif //OPS_MPI int base5 = 1 * (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); base5 = base5 + args[5].dat->size[0] * (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); ops_H_D_exchanges_device(args, 7); ops_halo_exchanges(args,7,range); ops_H_D_exchanges_device(args, 7); ops_timers_core(&c1,&t1); OPS_kernels[30].mpi_time += t1-t2; int nthread = OPS_block_size_x*OPS_block_size_y; clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 0, sizeof(cl_mem), (void*) &arg0.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 1, sizeof(cl_mem), (void*) &arg1.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 2, sizeof(cl_mem), (void*) &arg2.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 3, sizeof(cl_mem), (void*) &arg3.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 4, sizeof(cl_mem), (void*) &arg4.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 5, sizeof(cl_mem), (void*) &arg5.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 6, sizeof(cl_mem), (void*) &arg6.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 7, nthread*sizeof(double), NULL)); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 8, sizeof(cl_int), (void*) &r_bytes6 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 9, sizeof(cl_int), (void*) &base0 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 10, sizeof(cl_int), (void*) &base1 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 11, sizeof(cl_int), (void*) &base2 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 12, sizeof(cl_int), (void*) &base3 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 13, sizeof(cl_int), (void*) &base4 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 14, sizeof(cl_int), (void*) &base5 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 15, sizeof(cl_int), (void*) &x_size )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[30], 16, sizeof(cl_int), (void*) &y_size )); //call/enque opencl kernel wrapper function clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[30], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); if (OPS_diags>1) { clSafeCall( clFinish(OPS_opencl_core.command_queue) ); } mvReductArraysToHost(reduct_bytes); for ( int b=0; b<maxblocks; b++ ){ for ( int d=0; d<12; d++ ){ arg6h[d] = arg6h[d] + ((double *)arg6.data)[d+b*12]; } } arg6.data = (char *)arg6h; ops_set_dirtybit_device(args, 7); //Update kernel record ops_timers_core(&c2,&t2); OPS_kernels[30].time += t2-t1; OPS_kernels[30].transfer += ops_compute_transfer(dim, range, &arg0); OPS_kernels[30].transfer += ops_compute_transfer(dim, range, &arg1); OPS_kernels[30].transfer += ops_compute_transfer(dim, range, &arg2); OPS_kernels[30].transfer += ops_compute_transfer(dim, range, &arg3); OPS_kernels[30].transfer += ops_compute_transfer(dim, range, &arg4); OPS_kernels[30].transfer += ops_compute_transfer(dim, range, &arg5); }
int main(int argc, char *argv[]) { int error, xsize, ysize, rgb_max; int *r, *b, *g; float *gray, *congray, *congray2, *congray_cl; // identity kernel // float filter[] = { // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,1,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // }; // 45 degree motion blur float filter[] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; // mexican hat kernel // float filter[] = { // 0, 0,-1,-1,-1, 0, 0, // 0,-1,-3,-3,-3,-1, 0, // -1,-3, 0, 7, 0,-3,-1, // -1,-3, 7,24, 7,-3,-1, // -1,-3, 0, 7, 0,-3,-1, // 0,-1,-3,-3,-3,-1, 0, // 0, 0,-1,-1,-1, 0, 0 // }; if(argc != 3) { fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]); abort(); } const char* filename = argv[1]; const int num_loops = atoi(argv[2]); // -------------------------------------------------------------------------- // load image // -------------------------------------------------------------------------- printf("Reading ``%s''\n", filename); ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b); printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize); // -------------------------------------------------------------------------- // allocate CPU buffers // -------------------------------------------------------------------------- posix_memalign((void**)&gray, 32, 4*xsize*ysize*sizeof(float)); if(!gray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray, 32, 4*xsize*ysize*sizeof(float)); if(!congray) { fprintf(stderr, "alloc congray"); abort(); } posix_memalign((void**)&congray2, 32, 4*xsize*ysize*sizeof(float)); if(!congray2) { fprintf(stderr, "alloc congray2"); abort(); } posix_memalign((void**)&congray_cl, 32, 4*xsize*ysize*sizeof(float)); if(!congray_cl) { fprintf(stderr, "alloc congray_cl"); abort(); } // -------------------------------------------------------------------------- // convert image to grayscale // -------------------------------------------------------------------------- for(int n = 0; n < xsize*ysize; ++n) { gray[4*n] = r[n]; gray[4*n+1] = g[n]; gray[4*n+2] = b[n]; gray[4*n+3] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max; } for(int n = 0; n < 4*xsize*ysize; ++n) { congray[n]=gray[n]; } // -------------------------------------------------------------------------- // execute filter on cpu // -------------------------------------------------------------------------- for(int s=0;s<num_loops;s++){ for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { float sumR = 0; float sumG = 0; float sumB = 0; float sum = 0; for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k) { for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l){ sumR += congray[4*((i+k)*xsize + (j+l))] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; sumG += congray[4*((i+k)*xsize + (j+l))+1] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; sumB += congray[4*((i+k)*xsize + (j+l))+2] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; //sum += congray[4*((i+k)*xsize + (j+l))+3] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; } } congray2[4*(i*xsize + j)] = sumR; congray2[4*(i*xsize + j)+1] = sumG; congray2[4*(i*xsize + j)+2] = sumB; congray2[4*(i*xsize + j)+3] = sum; } } for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { congray[4*(i*xsize + j)] = congray2[4*(i*xsize + j)]; congray[4*(i*xsize + j)+1] = congray2[4*(i*xsize + j)+1]; congray[4*(i*xsize + j)+2] = congray2[4*(i*xsize + j)+2]; congray[4*(i*xsize + j)+3] = congray2[4*(i*xsize + j)+3]; } } } // -------------------------------------------------------------------------- // output cpu filtered image // -------------------------------------------------------------------------- printf("Writing cpu filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) { r[n] = (int)(congray[4*n] ); g[n] = (int)(congray[4*n+1] ); b[n] = (int)(congray[4*n+2]); } error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // get an OpenCL context and queue // -------------------------------------------------------------------------- cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("convolution_color.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution_color", NULL); free(knl_text); #ifdef NON_OPTIMIZED int deviceWidth = xsize; #else int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX; #endif int deviceHeight = ysize; size_t deviceDataSize = 4 * deviceHeight*deviceWidth*sizeof(float); // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0, deviceDataSize, gray, 0, NULL, NULL)); #else size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), ysize, 1}; clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, gray, 0, NULL, NULL); #endif CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL)); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- cl_int rows = ysize; cl_int cols = xsize; cl_int filterWidth = FILTER_WIDTH; cl_int paddingPixels = 2*HALF_FILTER_WIDTH; size_t local_size[] = { WGX, WGY }; size_t global_size[] = { ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0], ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1], }; cl_int localWidth = local_size[0] + paddingPixels; cl_int localHeight = local_size[1] + paddingPixels; size_t localMemSize = 4 * localWidth * localHeight * sizeof(float); CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter)); CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows)); CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols)); CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth)); CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL)); CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight)); CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth)); // -------------------------------------------------------------------------- // print kernel info // -------------------------------------------------------------------------- print_kernel_info(queue, knl); CALL_CL_SAFE(clFinish(queue)); timestamp_type tic, toc; get_timestamp(&tic); for(int loop = 0; loop < num_loops; ++loop) { if (loop%2==0){ CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); } else{ CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_gray), &buf_gray)); } CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL, global_size, local_size, 0, NULL, NULL)); } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops; printf("%f s\n", elapsed); printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed); printf("%f GBit/s\n", 4*2*xsize*ysize*sizeof(float)/1e9/elapsed); printf("%f GFlop/s\n",4*(xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH) *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed); // -------------------------------------------------------------------------- // transfer back & check // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED if (num_loops%2==0) CALL_CL_SAFE(clEnqueueReadBuffer(queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,deviceDataSize,congray_cl,0, NULL, NULL)); else CALL_CL_SAFE(clEnqueueReadBuffer(queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,deviceDataSize, congray_cl, 0, NULL, NULL)); #else buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; region[0] = (xsize-paddingPixels)*sizeof(float); region[1] = (ysize-paddingPixels); region[2] = 1; if (num_loops%2==0) clEnqueueReadBufferRect(queue, buf_gray, CL_TRUE,buffer_origin, host_origin, region,deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,congray_cl, 0, NULL, NULL); else clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,buffer_origin, host_origin, region,deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,congray_cl, 0, NULL, NULL); #endif // -------------------------------------------------------------------------- // output OpenCL filtered image // -------------------------------------------------------------------------- printf("Writing OpenCL filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) { //r[n] = (int)(congray_cl[4*n] * rgb_max); //g[n] = (int)(congray_cl[4*n+1] * rgb_max); //b[n] = (int)(congray_cl[4*n+2] * rgb_max); r[n] = (int)(congray_cl[4*n]); g[n] = (int)(congray_cl[4*n+1]); b[n] = (int)(congray_cl[4*n+2]); } error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_congray)); CALL_CL_SAFE(clReleaseMemObject(buf_gray)); CALL_CL_SAFE(clReleaseMemObject(buf_filter)); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(gray); free(congray); free(congray_cl); free(r); free(b); free(g); }
int main(int argc, char *argv[]){ if (MODE == 5){ printf("---OpenCL Test Code---\n\n"); cl_int errNum; cl_uint numPlatforms; cl_platform_id *platforms = NULL; cl_uint numDevices; cl_device_id *devices = NULL; //platform info fields char vendor[1024], name[1024], version[1024]; //device info fields size_t MAX_WORK_GROUP_SIZE; cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE; cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS; size_t MAX_WORK_ITEM_SIZES[3]; char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048]; cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE; //printf("Getting number of OpenCL Platforms...\n"); errNum = clGetPlatformIDs(0, NULL, &numPlatforms); if (errNum != CL_SUCCESS) { printf("Failed to get number of OpenCL platforms.\n"); return 0; } else { //printf("found %d.\n", numPlatforms); } //printf("Allocating space for the platform info...\n"); platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); printf("---Platform Info---\n"); errNum = clGetPlatformIDs(numPlatforms, platforms, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get platform info.\n"); return 0; } else { clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL); //printf("Got platform info.\n"); printf("Vendor: \t%s\n", vendor); printf("Name: \t%s\n", name); printf("Version:\t%s\n", version); } //printf("Getting number of devices...\n"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS) { printf("Failed to get number of devices.\n"); return 0; } else { //printf("Found %d.\n", numDevices); } //printf("Allocating space for device info...\n"); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); printf("\n---Device Info---"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get device info.\n"); return 0; } else { int i, j = 0; for (i = 0; i < numDevices; i++ ) { printf("\nDevice ID: %d\n", i+1); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL); printf("Device Name:\t%s\n", DEVICE_NAME); printf("Device Vendor:\t%s\n", DEVICE_VENDOR); printf("Device Version:\t%s\n", DEVICE_VERSION); printf("Driver Version:\t%s\n", DRIVER_VERSION); printf("EXTENSIONS:\t%s\n", EXTENSIONS); printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS); printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE); printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE); printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE); if(GLOBAL_MEM_CACHE_TYPE == CL_NONE) { printf("GMem $ Type:\tCL_NONE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE) { printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE) { printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n"); } printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE); printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE); printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS); printf("Work Item Size:\t"); for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++) { if (j != (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]); if (j == (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]); } printf("(Max)\n"); } //printf("Got device info.\n"); } } else if (MODE == 4){ cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; //Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } //Create a command-queue on the first device available on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create commandQueue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL program and store the binary for future use. printf("Attempting to create kernel binary from source.\n"); program = CreateProgram(context, device, KERNELPATHIN); if (program == NULL) { printf("Failed to create Program"); Cleanup(context, commandQueue, program, NULL); return 1; } printf("Kernel is saved.\n"); if (SaveProgramBinary(program, device, KERNELPATHOUT) == false) { printf("Failed to write program binary.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //printf("---Done---"); //return 1; } else if (MODE == 3){ //todo free remaining objects not passed to cleanup //profiling int write_bytes = 0; int read_bytes = 0; /*unsigned long long start_cycles, stop_cycles; unsigned long long start_setup, stop_setup; unsigned long long start_write, stop_write; unsigned long long start_read, stop_read; unsigned long long start_finalize, stop_finalize; struct timespec start_time_t, stop_time_t;*/ printf("Stream Mode\n\n"); //clock_gettime(CLOCK_MONOTONIC, &start_time_t); //start_cycles = rdtsc(); int i; time_t t; srand((unsigned) time(&t)); // Create the two input vectors printf("\nHostside malloc(s)\n"); fflush(stdout); int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE)); //profile //bytes += 3 * sizeof(int)*(SIZE*SIZE); printf("\nHostside mat init\n"); fflush(stdout); for(i = 0; i < (SIZE*SIZE); i++) { A[i] = B[i] = rand() % 10 + 1;; } //print matrix printf("Matrix A[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", A[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //print matrix printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", B[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //syscall(STATS_RESET); //Get platform and device information cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_uint err = 0; //char *filepath = NULL; //Create the context printf("\nCreateContext\n"); fflush(stdout); context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } /* printf("\nEnd CreateContext\n"); fflush(stdout);*/ //Create a command-queue on the first device available on the created context printf("\nCreateCommandQueue\n"); fflush(stdout); commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create command queue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //create the program from the binary //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU"); //strcat(KERNELPATHOUT, ".GPU") printf("\nCreateProgramFromBinary\n"); fflush(stdout); program = CreateProgramFromBinary(context, device, KERNEL); if (program == NULL) { printf("Failed to load kernel binary,\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL kernel printf("\nclCreateKernel\n"); fflush(stdout); kernel = clCreateKernel(program, "Matrix", NULL); if (kernel == NULL) { printf("Failed to create kernel.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } cl_mem a_mem_obj = 0; cl_mem b_mem_obj = 0; cl_mem c_mem_obj = 0; //Create memory buffers on the device for each vector printf("\nclCreateBuffer(s)\n"); fflush(stdout); if(LOCALMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the GPU's local memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the system memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 1) { //this creates cached buffers in the system memory. #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if (a_mem_obj == NULL || b_mem_obj == NULL || c_mem_obj == NULL) { printf("Failed to create memory objects.\n"); Cleanup(context, commandQueue, program, kernel); return 1; } //Copy the lists A and B to their respective memory buffers printf("\nclEnqueueWriteBuffer(s)\n"); fflush(stdout); write_bytes += 2 * sizeof(int)*(SIZE*SIZE); // start_write = rdtsc(); clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL); clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL); // stop_write = rdtsc(); // Set the arguments of the kernel int *size = (int *)SIZE; printf("\nclSetKernelArg(s)\n"); fflush(stdout); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj); err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size); if (err != CL_SUCCESS) { printf("Kernel args not set.\n"); return 1; } // Execute the OpenCL kernel on the list size_t GlobalWorkSize[2], LocalWorkSize[2]; //Rember that in OpenCL we need to express the globalWorkSize in //terms of the total number of threads. The underlying OpenCL API //will look at the globalWorkSize and divide by the localWorkSize //to arrive at a 64 by 64 NDRange of 16 by 16 work groups. GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64 LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64 //used null for local, lets OpenCL determine the best local size. //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); printf("\nclEnqueueNDRangeKernel\n"); fflush(stdout); err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("ND range not enqueued. Code: %d\n", err); return 1; } //Read the memory buffer C on the device to the local variable C printf("\nclEnqueueReadBuffer\n"); fflush(stdout); read_bytes += sizeof(int)*(SIZE*SIZE); //start_read = rdtsc(); err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL); // stop_read = rdtsc(); if (err != CL_SUCCESS) { printf("Buffer not returned.\n"); return 1; } //syscall(STATS_STOP); //print matrix printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", C[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } printf("\nHostside clean up\n"); fflush(stdout); err = clFlush(commandQueue); err = clFinish(commandQueue); Cleanup(context, commandQueue, program, kernel); err = clReleaseMemObject(a_mem_obj); err = clReleaseMemObject(b_mem_obj); err = clReleaseMemObject(c_mem_obj); free(A); free(B); free(C); //printf("---Done---"); /*stop_cycles = rdtsc(); clock_gettime(CLOCK_MONOTONIC, &stop_time_t); printf("Total cycles = %llu\n", (stop_cycles - start_cycles)); long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec; printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000)); printf("Bytes written %d\n", write_bytes); printf("transfer cycles = %llu\n", (stop_write - start_write)); printf("start at = %llu\n", (start_write - start_cycles)); printf("Bytes read %d\n", read_bytes); printf("transfer cycles = %llu\n", (stop_read - start_read)); printf("start at = %llu\n", (start_read - start_cycles));*/ } else if (MODE == 2){ printf("Multi Thread Mode\n"); //cal this: //assignToThisCore(0);//assign to core 0,1,2,... unsigned long long a, b; int i = 0; int j = 0; int k = 0; LoadMatrices(); pthread_t tid[SIZE*SIZE]; //printf("waiting\n"); //start our threads a = rdtsc(); syscall(BEGIN_PARALLEL_SECTION); for(i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData)); RCData->RowNum = i; RCData->ColumnNum = j; //printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum); pthread_create(&tid[k], NULL, RowColumnMultiply, RCData); k++; } } //Join threads//////////////////////////// for (i=0;i<NUM_THREADS;i++) { pthread_join(tid[i], NULL); } syscall(END_PARALLEL_SECTION); b = rdtsc(); PrintMatrices(); //printf("\nend clock Cycles: %llu\n", b); printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 1) { printf("Single Thread Mode\n\n"); //unsigned long long a, b; //a = rdtsc(); //time_t t; int i,j,k; //srand((unsigned) time(&t)); LoadMatrices(); //multiply mats///////////////////////// for (i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ for(k=0;k<SIZE;k++){ matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]); } } } PrintMatrices(); //b = rdtsc(); //printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 0) { printf("---Misc Tests---\n\n"); printf("size of long long is %d\n", (int) sizeof(long long)); printf("size of long is %d\n", (int) sizeof(long)); printf("size of int is %d\n", (int) sizeof(int)); printf("size of short is %d\n", (int) sizeof(short)); printf("size of char * %d\n", (int) sizeof(char *)); printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int)); char *string = "test string"; printf("Here is the string 1: \"%s\"\n", string); //Using the struct //set string variable and point to print_me. object.string = strdup(string); object.print_me = (void (*)(void *)) print_me; //use of print_me object.print_me(object.string); //pointer fun struct Object *ptr = &object; printf("this is the value of the pointer to struct object: %p\n", ptr); object.next=&object; printf("this is the value of the pointer to struct object: %p\n", object.next); object_ptr = &object; object_ptr->next = &object; printf("this is the value of the pointer to struct object: %p\n", object_ptr->next); //Macro fun PRINT(ptr, ptr); PRINT(object.next, object.next); PRINT(object_ptr->next, object_ptr->next); int mmu_page_size = 1 << 12; printf("mmu_papge_size = %d\n", mmu_page_size); //setjmp and longjmp fun /*jmp_buf environment; int i; i = setjmp(environment); printf("\n\nsetjmp returned = %d\n", i); printf("Env 1:\n"); int x = 0; for(x = 0; x < 6; x++) { printf(" %x\n", environment[x]); } if (i < 3) { longjmp(environment, 3); } printf("longjmp finished with i = %d\n", i);*/ } else { printf("---Invalid Mode Set---\n\n"); } printf("\n---Done---\n"); return 1; }
void btParticlesDynamicsWorld::runCollideParticlesKernel() { btAlignedObjectArray<int> pairs; // float particleRad = m_simParams.m_particleRad; // float collideDist2 = (particleRad + particleRad)*(particleRad + particleRad); cl_int ciErrNum; if(m_useCpuControls[SIMSTAGE_COLLIDE_PARTICLES]->m_active) { // CPU version int memSize = sizeof(btVector3) * m_numParticles; { BT_PROFILE("Copy from GPU"); ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedPos, CL_TRUE, 0, memSize, &(m_hSortedPos[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dSortedVel, CL_TRUE, 0, memSize, &(m_hSortedVel[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); memSize = sizeof(btInt2) * m_numParticles; ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPosHash, CL_TRUE, 0, memSize, &(m_hPosHash[0]), 0, NULL, NULL); memSize = m_numGridCells * sizeof(int); ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dCellStart, CL_TRUE, 0, memSize, &(m_hCellStart[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); } for(int index = 0; index < m_numParticles; index++) { btVector3 posA = m_hSortedPos[index]; btVector3 velA = m_hSortedVel[index]; btVector3 force = btVector3(0, 0, 0); float particleRad = m_simParams.m_particleRad; float collisionDamping = m_simParams.m_collisionDamping; float spring = m_simParams.m_spring; float shear = m_simParams.m_shear; float attraction = m_simParams.m_attraction; int unsortedIndex = m_hPosHash[index].y; //Get address in grid btInt4 gridPosA = cpu_getGridPos(posA, &m_simParams); //Accumulate surrounding cells btInt4 gridPosB; for(int z = -1; z <= 1; z++) { gridPosB.z = gridPosA.z + z; for(int y = -1; y <= 1; y++) { gridPosB.y = gridPosA.y + y; for(int x = -1; x <= 1; x++) { gridPosB.x = gridPosA.x + x; //Get start particle index for this cell unsigned int hashB = cpu_getPosHash(gridPosB, &m_simParams); int startI = m_hCellStart[hashB]; //Skip empty cell if(startI < 0) { continue; } //Iterate over particles in this cell int endI = startI + 32; if(endI > m_numParticles) endI = m_numParticles; for(int j = startI; j < endI; j++) { unsigned int hashC = m_hPosHash[j].x; if(hashC != hashB) { break; } if(j == index) { continue; } btPair pair; pair.v0[0] = index; pair.v0[1] = j; pairs.push_back(pair.value); // printf("index=%d, j=%d\n",index,j); // printf("(index=%d, j=%d) ",index,j); btVector3 posB = m_hSortedPos[j]; btVector3 velB = m_hSortedVel[j]; //Collide two spheres force += cpu_collideTwoParticles( posA, posB, velA, velB, particleRad, particleRad, spring, collisionDamping, shear, attraction); } } } } //Write new velocity back to original unsorted location m_hVel[unsortedIndex] = velA + force; } //#define BRUTE_FORCE_CHECK 1 #ifdef BRUTE_FORCE_CHECK for(int index = 0; index < m_numParticles; index++) { btVector3 posA = m_hSortedPos[index]; btVector3 velA = m_hSortedVel[index]; btVector3 force = btVector3(0, 0, 0); int unsortedIndex = m_hPosHash[index].y; float collisionDamping = m_simParams.m_collisionDamping; float spring = m_simParams.m_spring; float shear = m_simParams.m_shear; float attraction = m_simParams.m_attraction; for(int j = 0 ; j < m_numParticles; j++) { if (index!=j) { btVector3 posB = m_hSortedPos[j]; btVector3 velB = m_hSortedVel[j]; btVector3 relPos = posB - posA; relPos[3] = 0.f; float dist2 = (relPos[0] * relPos[0] + relPos[1] * relPos[1] + relPos[2] * relPos[2]); if(dist2 < collideDist2) { //Collide two spheres // force += cpu_collideTwoParticles( posA, posB, velA, velB, particleRad, particleRad, // spring, collisionDamping, shear, attraction); btPair pair; pair.v0[0] = index; pair.v0[1] = j; if (pairs.findLinearSearch(pair.value)==pairs.size()) { printf("not found index=%d, j=%d\n",index,j); } } } } //Write new velocity back to original unsorted location //m_hVel[unsortedIndex] = velA + force; } #endif //BRUTE_FORCE_CHECK memSize = sizeof(btVector3) * m_numParticles; ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, m_dVel, CL_TRUE, 0, memSize, &(m_hVel[0]), 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); } else { runKernelWithWorkgroupSize(PARTICLES_KERNEL_COLLIDE_PARTICLES, m_numParticles); cl_int ciErrNum = clFinish(m_cqCommandQue); oclCHECKERROR(ciErrNum, CL_SUCCESS); } }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'rotate_long2long2.cl' */ source_code = read_buffer("rotate_long2long2.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "rotate_long2long2", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_long2 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_long2)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_long2){{2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_long2), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_long2), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_long2 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_long2)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_long2){{2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_long2), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_long2), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_long2 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_long2)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_long2)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_long2), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_long2), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_long2)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main(int argc, char **argv){ printf("Check OpenCL environtment\n"); cl_platform_id platid; cl_device_id devid; cl_int res; size_t param; /* Query OpenCL, get some information about the returned device */ clGetPlatformIDs(1u, &platid, NULL); clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL); cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL); clGetDeviceInfo(devid, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name); clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), ¶m, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param); clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), ¶m, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param); clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param); /* Check if kernel source exists, we compile argv[1] passed kernel */ if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); } char *kernel_source; if(load_program_source(argv[1], &kernel_source)) return 1; printf("Building from OpenCL source: \t%s\n", argv[1]); printf("Compile/query OpenCL_program:\t%s\n", argv[2]); /* Create context and kernel program */ cl_context context = clCreateContext(0, 1, &devid, NULL, NULL, NULL); cl_program pro = clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL); res = clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL); if(res != CL_SUCCESS){ printf("clBuildProgram failed: %d\n", res); char buf[0x10000]; clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL); printf("\n%s\n", buf); return(-1); } cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); check_return(res); /* Get the maximum work-group size for executing the kernel on the device */ size_t global, local; res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL); check_return(res); printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local); res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); check_return(res); printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param); cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL); if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); } local = 4; int n = 2 * local; //num_group * local workgroup size global = n; int num_groups= global / local, allocated_local= sizeof(data) * local + sizeof(debug) * local; data *DP __attribute__ ((aligned(16))); DP = calloc(n, sizeof(data) *1); debug *dbg __attribute__ ((aligned(16))); dbg = calloc(n, sizeof(debug)); printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups); printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256)); printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local); cl_mem cl_DP, cl_EC, cl_INV, DEBUG; cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res); check_return(res); cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(Elliptic_Curve), NULL, &res); check_return(res); //_constant address space cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(u8) * 0x80, NULL, &res); check_return(res); DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res); check_return(res); Elliptic_Curve EC; /* Curve domain parameters, (test vectors) ------------------------------------------------------------------------------------- p: c1c627e1638fdc8e24299bb041e4e23af4bb5427 is prime a: c1c627e1638fdc8e24299bb041e4e23af4bb5424 divisor g = 62980 b: 877a6d84155a1de374b72d9f9d93b36bb563b2ab divisor g = 227169643 Gx: 010aff82b3ac72569ae645af3b527be133442131 divisor g = 32209245 Gy: 46b8ec1e6d71e5ecb549614887d57a287df573cc divisor g = 972 precomputed_per_curve_constants: U: c1c627e1638fdc8e24299bb041e4e23af4bb5425 V: 3e39d81e9c702371dbd6644fbe1b1dc50b44abd9 already prepared mod p to test: a: 07189f858e3f723890a66ec1079388ebd2ed509c b: 6043379beb0dade6eed1e9d6de64f4a0c50639d4 gx: 5ef84aacf4f0ea6752f572d0741f40049f354dca gy: 418c695435af6b3d4d7cbb72967395016ef67239 resulting point: P.x: 01718f862ebe9423bd661a65355aa1c86ba330f8 program MUST got this point !! P.y: 557e8ed53ffbfe2c990a121967b340f62e0e4fe2 taken mod p: P.x: 41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3 P.y: 73ca143c9badedf2d9d3c7573307115ccfe04f13 */ u8 *t; t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427"); memcpy(EC.p, t, 20); t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c"); memcpy(EC.a, t, 20); t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4"); memcpy(EC.b, t, 20); t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca"); memcpy(EC.Gx, t, 20); t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239"); memcpy(EC.Gy, t, 20); t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425"); memcpy(EC.U, t, 20); t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9"); memcpy(EC.V, t, 20); /* we need to map buffer now to load some k into data */ DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710"); for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21); free(t); //d for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1); /* we can alter just a byte into a chosen k to verify that we'll get a different point! */ //DP[2].k[2] = 0x09; //no res = clEnqueueWriteBuffer(cmd_queue, cl_DP, CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_EC, CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL); check_return(res); res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP); /* i/o buffer */ res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC); res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV); res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG); //this used to debug kernel output check_return(res); // printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local); cl_event NDRangeEvent; cl_ulong start, end; /* Execute NDrange */ res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent); check_return(res); // res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent); check_return(res); printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data)); DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res); check_return(res); /* using clEnqueueReadBuffer template */ // res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL); check_return(res); clFlush(cmd_queue); clFinish(cmd_queue); /* get NDRange execution time with internal ocl profiler */ res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); check_return(res); printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000)); //relative to NDRange call printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start))); printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n"); for(int i = 0; i < n; i++) { // if(i %local == 0) { printf("%d \t", i); //printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7)); /* silence this doubled debug info printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3], dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]); */ //printf("%d %d\n", P[i].dig, P[i].c); bn_print("", DP[i].k, 21, 1); bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1); printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3], DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]); // } } /* Release OpenCL stuff, free the rest */ clReleaseMemObject(cl_DP); clReleaseMemObject(cl_EC); clReleaseMemObject(cl_INV); clReleaseMemObject(DEBUG); clReleaseKernel(kernelobj); clReleaseProgram(pro); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(kernel_source); puts("Done!"); return 0; }