void AdvancedMaxPoolingLayer::BackPropagate() { #ifdef BUILD_OPENCL_MAX input_->delta.MoveToGPU(true); output_->delta.MoveToGPU(); maximum_mask_.MoveToGPU(); cl_uint error = 0; error |= clSetKernelArg (CLHelper::k_amaximumBackward, 0, sizeof (cl_mem), &input_->delta.cl_data_ptr_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 1, sizeof (cl_mem), &maximum_mask_.cl_data_ptr_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 2, sizeof (cl_mem), &output_->delta.cl_data_ptr_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 3, sizeof (unsigned int), &input_width_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 4, sizeof (unsigned int), &input_height_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 5, sizeof (unsigned int), &maps_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 6, sizeof (unsigned int), &output_width_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 7, sizeof (unsigned int), &output_height_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 8, sizeof (unsigned int), ®ion_width_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 9, sizeof (unsigned int), ®ion_height_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 10, sizeof (unsigned int), &stride_width_); error |= clSetKernelArg (CLHelper::k_amaximumBackward, 11, sizeof (unsigned int), &stride_height_); if (error != CL_SUCCESS) { FATAL ("Error setting kernel args: " << (signed int) error); } size_t global_work_size[] = { input_width_, input_height_, maps_* input_->data.samples() }; error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_amaximumBackward, 3, NULL, global_work_size, NULL, 0, NULL, NULL); if (error != CL_SUCCESS) { FATAL ("Error enqueueing kernel: " << (signed int) error); } #ifdef BRUTAL_FINISH error = clFinish (CLHelper::queue); if (error != CL_SUCCESS) { FATAL ("Error finishing command queue: " << (signed int) error); } #endif #else #define MP_HELPER_MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) #pragma omp parallel for default(shared) for(std::size_t sample = 0; sample < input_->data.samples(); sample++) { for (unsigned int map = 0; map < maps_; map++) { for (unsigned int ix = 0; ix < input_width_; ix++) { for(unsigned int iy = 0; iy < input_width_; iy++) { const unsigned int mask_index = ix + input_width_ * iy; const unsigned int oxstart = (ix < region_width_) ? 0 : (ix - region_width_) / stride_width_+ 1; const unsigned int oxend = MP_HELPER_MIN(ix / stride_width_ + 1, output_width_); const unsigned int oystart = (iy < region_height_) ? 0 : (iy - region_height_) / stride_height_ + 1; const unsigned int oyend = MP_HELPER_MIN(iy / stride_height_ + 1, output_height_); datum sum = 0.0; for (unsigned int oy = oystart; oy < oyend; oy++) { for (unsigned int ox = oxstart; ox < oxend; ox++) { if(*maximum_mask_.data_ptr_const(ox, oy, map, sample) == mask_index) sum += *output_->delta.data_ptr_const(ox, oy, map, sample); } } *(input_->delta.data_ptr(ix, iy, map, sample)) = sum; } } } } #endif }
int task(cl_context context, cl_device_id device, cl_command_queue queue, void* data_) { const TaskData* data = (const TaskData*) data_; cl_int err; if (data->points % data->points_per_work_item) check_error(CLQMC_INVALID_VALUE, "points must be a multiple of points_per_work_item"); if (data->replications % data->replications_per_work_item) check_error(CLQMC_INVALID_VALUE, "replications must be a multiple of replications_per_work_item"); // Lattice buffer size_t pointset_size; // gen_vec is given in common.c clqmcLatticeRule* pointset = clqmcLatticeRuleCreate(data->points, DIMENSION, gen_vec, &pointset_size, &err); check_error(err, NULL); cl_mem pointset_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, pointset_size, pointset, &err); check_error(err, "cannot create point set buffer"); // Shifts buffer clqmc_fptype* shifts = (clqmc_fptype*) malloc(data->replications * DIMENSION * sizeof(clqmc_fptype)); // populate random shifts using a random stream clrngMrg31k3pStream* stream = clrngMrg31k3pCreateStreams(NULL, 1, NULL, &err); check_error(err, NULL); for (cl_uint i = 0; i < data->replications; i++) for (cl_uint j = 0; j < DIMENSION; j++) shifts[i * DIMENSION + j] = clrngMrg31k3pRandomU01(stream); err = clrngMrg31k3pDestroyStreams(stream); check_error(err, NULL); cl_mem shifts_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, data->replications * DIMENSION * sizeof(clqmc_fptype), shifts, &err); check_error(err, "cannot create shifts buffer"); // Output buffer size_t points_block_count = data->points / data->points_per_work_item; cl_mem output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, data->replications * points_block_count * sizeof(clqmc_fptype), NULL, &err); check_error(err, "cannot create output buffer"); // OpenCL kernel cl_program program = build_program_from_file(context, device, "client/DocsTutorial/example4_kernel.cl", NULL); check_error(err, NULL); cl_kernel kernel = clCreateKernel(program, "simulateWithRQMC", &err); check_error(err, "cannot create kernel"); int iarg = 0; err = clSetKernelArg(kernel, iarg++, sizeof(pointset_buf), &pointset_buf); err |= clSetKernelArg(kernel, iarg++, sizeof(shifts_buf), &shifts_buf); err |= clSetKernelArg(kernel, iarg++, sizeof(data->points_per_work_item), &data->points_per_work_item); err |= clSetKernelArg(kernel, iarg++, sizeof(data->replications), &data->replications); err |= clSetKernelArg(kernel, iarg++, sizeof(output_buf), &output_buf); check_error(err, "cannot set kernel arguments"); // Execution cl_event ev; size_t global_size = (data->replications / data->replications_per_work_item) * points_block_count; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &ev); check_error(err, "cannot enqueue kernel"); err = clWaitForEvents(1, &ev); check_error(err, "error waiting for events"); clqmc_fptype* output = (clqmc_fptype*) malloc(data->replications * points_block_count * sizeof(clqmc_fptype)); err = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0, data->replications * points_block_count * sizeof(clqmc_fptype), output, 0, NULL, NULL); check_error(err, "cannot read output buffer"); printf("\nAdvanced randomized quasi-Monte Carlo integration:\n\n"); err = clqmcLatticeRuleWriteInfo(pointset, stdout); check_error(err, NULL); printf("\n"); rqmcReport(data->replications, data->points, points_block_count, output); // Clean up clReleaseEvent(ev); clReleaseMemObject(output_buf); clReleaseMemObject(pointset_buf); clReleaseKernel(kernel); clReleaseProgram(program); free(output); err = clqmcLatticeRuleDestroy(pointset); check_error(err, NULL); return EXIT_SUCCESS; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'relational_greater_than_or_equal_to_ulong16ulong16.cl' */ source_code = read_buffer("relational_greater_than_or_equal_to_ulong16ulong16.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "relational_greater_than_or_equal_to_ulong16ulong16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_ulong16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_ulong16 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_ulong16)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_ulong16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong16), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_int16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_int16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, int64_t __maybe_unused max_nonce) { const int thr_id = thr->id; struct opencl_thread_data *thrdata = thr->cgpu_data; struct cgpu_info *gpu = thr->cgpu; _clState *clState = clStates[thr_id]; const cl_kernel *kernel = &clState->kernel; const int dynamic_us = opt_dynamic_interval * 1000; cl_int status; size_t globalThreads[1]; size_t localThreads[1] = { clState->wsize }; int64_t hashes; /* Windows' timer resolution is only 15ms so oversample 5x */ if (gpu->dynamic && (++gpu->intervals * dynamic_us) > 70000) { struct timeval tv_gpuend; double gpu_us; gettimeofday(&tv_gpuend, NULL); gpu_us = us_tdiff(&tv_gpuend, &gpu->tv_gpustart) / gpu->intervals; if (gpu_us > dynamic_us) { if (gpu->intensity > MIN_INTENSITY) --gpu->intensity; } else if (gpu_us < dynamic_us / 2) { if (gpu->intensity < MAX_INTENSITY) ++gpu->intensity; } memcpy(&(gpu->tv_gpustart), &tv_gpuend, sizeof(struct timeval)); gpu->intervals = 0; } set_threads_hashes(clState->vwidth, &hashes, globalThreads, localThreads[0], &gpu->intensity); if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); return -1; } if (clState->goffset) { size_t global_work_offset[1]; global_work_offset[0] = work->blk.nonce; status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset, globalThreads, localThreads, 0, NULL, NULL); } else status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); return -1; } status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, BUFFERSIZE, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status); return -1; } /* The amount of work scanned can fluctuate when intensity changes * and since we do this one cycle behind, we increment the work more * than enough to prevent repeating work */ work->blk.nonce += gpu->max_hashes; /* This finish flushes the readbuffer set with CL_FALSE in clEnqueueReadBuffer */ clFinish(clState->commandQueue); /* FOUND entry is used as a counter to say how many nonces exist */ if (thrdata->res[FOUND]) { /* Clear the buffer again */ status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); return -1; } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); memset(thrdata->res, 0, BUFFERSIZE); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); } return hashes; }
void kernel_gpu_opencl_wrapper_2(knode *knodes, long knodes_elem, long knodes_mem, int order, long maxheight, int count, long *currKnode, long *offset, long *lastKnode, long *offset_2, int *start, int *end, int *recstart, int *reclength) { //======================================================================================================================================================150 // CPU VARIABLES //======================================================================================================================================================150 // timer long long time0; long long time1; long long time2; long long time3; long long time4; long long time5; long long time6; time0 = get_time(); //======================================================================================================================================================150 // GPU SETUP //======================================================================================================================================================150 //====================================================================================================100 // INITIAL DRIVER OVERHEAD //====================================================================================================100 // cudaThreadSynchronize(); //====================================================================================================100 // COMMON VARIABLES //====================================================================================================100 // common variables cl_int error; //====================================================================================================100 // GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE //====================================================================================================100 // Get the number of available platforms cl_uint num_platforms; error = clGetPlatformIDs( 0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of available platforms cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs( num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the 1st platform cl_platform_id platform = platforms[0]; // Get the name of the selected platform and print it (if there are multiple platforms, choose the first one) char pbuf[100]; error = clGetPlatformInfo( platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); //====================================================================================================100 // CREATE CONTEXT FOR THE PLATFORM //====================================================================================================100 // Create context properties for selected platform cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; // Create context for selected platform being GPU cl_context context; context = clCreateContextFromType( context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // GET DEVICES AVAILABLE FOR THE CONTEXT, SELECT ONE //====================================================================================================100 // Get the number of devices (previousely selected for the context) size_t devices_size; error = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of devices (previousely selected for the context) cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo( context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Select the first device (previousely selected for the context) (if there are multiple devices, choose the first one) cl_device_id device; device = devices[0]; // Get the name of the selected device (previousely selected for the context) and print it error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); //====================================================================================================100 // CREATE COMMAND QUEUE FOR THE DEVICE //====================================================================================================100 // Create a command queue cl_command_queue command_queue; command_queue = clCreateCommandQueue( context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //====================================================================================================100 // CREATE PROGRAM, COMPILE IT //====================================================================================================100 // Load kernel source code from file const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl"); size_t sourceSize = strlen(source); // Create the program cl_program program = clCreateProgramWithSource( context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions,"-I./../"); #ifdef DEFAULT_ORDER_2 sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2); #endif // Compile the program error = clBuildProgram( program, 1, &device, clOptions, NULL, NULL); // Print warnings and errors from compilation static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); printf("-----OpenCL Compiler Output-----\n"); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); printf("--------------------------------\n"); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Create kernel cl_kernel kernel; kernel = clCreateKernel(program, "findRangeK", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time1 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY MALLOC //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN //====================================================================================================100 //==================================================50 // knodesD //==================================================50 cl_mem knodesD; knodesD = clCreateBuffer( context, CL_MEM_READ_WRITE, knodes_mem, NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 cl_mem currKnodeD; currKnodeD = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 cl_mem offsetD; offsetD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // lastKnodeD //==================================================50 cl_mem lastKnodeD; lastKnodeD = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offset_2D //==================================================50 cl_mem offset_2D; offset_2D = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(long), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // startD //==================================================50 cl_mem startD; startD = clCreateBuffer(context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // endD //==================================================50 cl_mem endD; endD = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansDStart //==================================================50 cl_mem ansDStart; ansDStart = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // ansDLength //==================================================50 cl_mem ansDLength; ansDLength = clCreateBuffer( context, CL_MEM_READ_WRITE, count*sizeof(int), NULL, &error ); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time2 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN //====================================================================================================100 //==================================================50 // knodesD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue knodesD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to knodes_mem, // size to be copied knodes, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // currKnodeD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue currKnodeD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied currKnode, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offsetD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue offsetD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied offset, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // lastKnodeD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue lastKnodeD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied lastKnode, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // offset_2D //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue offset_2D, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(long), // size to be copied offset_2, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // startD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue startD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied start, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // endD //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue endD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied end, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // END //==================================================50 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansDStart //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue endD, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied end, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // ansDLength //==================================================50 error = clEnqueueWriteBuffer( command_queue, // command queue ansDLength, // destination 1, // block the source from access until this copy operation complates (1=yes, 0=no) 0, // offset in destination to write to count*sizeof(int), // size to be copied reclength, // source 0, // # of events in the list of events to wait for NULL, // list of events to wait for NULL); // ID of this operation to be used by waiting operations if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time3 = get_time(); //==================================================50 // END //==================================================50 //======================================================================================================================================================150 // KERNEL //======================================================================================================================================================150 //====================================================================================================100 // Execution Parameters //====================================================================================================100 size_t local_work_size[1]; local_work_size[0] = order < 1024 ? order : 1024; size_t global_work_size[1]; global_work_size[0] = count * local_work_size[0]; printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]); //====================================================================================================100 // Kernel Arguments //====================================================================================================100 clSetKernelArg( kernel, 0, sizeof(long), (void *) &maxheight); clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *) &knodesD); clSetKernelArg( kernel, 2, sizeof(long), (void *) &knodes_elem); clSetKernelArg( kernel, 3, sizeof(cl_mem), (void *) &currKnodeD); clSetKernelArg( kernel, 4, sizeof(cl_mem), (void *) &offsetD); clSetKernelArg( kernel, 5, sizeof(cl_mem), (void *) &lastKnodeD); clSetKernelArg( kernel, 6, sizeof(cl_mem), (void *) &offset_2D); clSetKernelArg( kernel, 7, sizeof(cl_mem), (void *) &startD); clSetKernelArg( kernel, 8, sizeof(cl_mem), (void *) &endD); clSetKernelArg( kernel, 9, sizeof(cl_mem), (void *) &ansDStart); clSetKernelArg( kernel, 10, sizeof(cl_mem), (void *) &ansDLength); //====================================================================================================100 // Kernel //====================================================================================================100 error = clEnqueueNDRangeKernel( command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO error = clFinish(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time4 = get_time(); //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY COPY (CONTD.) //======================================================================================================================================================150 //====================================================================================================100 // DEVICE IN/OUT //====================================================================================================100 //==================================================50 // ansDStart //==================================================50 error = clEnqueueReadBuffer(command_queue, // The command queue. ansDStart, // The image on the device. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?) 0, // Offset. None in this case. count*sizeof(int), // Size to copy. recstart, // The pointer to the image on the host. 0, // Number of events in wait list. Not used. NULL, // Event wait list. Not used. NULL); // Event object for determining status. Not used. if (error != CL_SUCCESS) fatal_CL(error, __LINE__); //==================================================50 // ansDLength //==================================================50 error = clEnqueueReadBuffer(command_queue, // The command queue. ansDLength, // The image on the device. CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?) 0, // Offset. None in this case. count*sizeof(int), // Size to copy. reclength, // The pointer to the image on the host. 0, // Number of events in wait list. Not used. NULL, // Event wait list. Not used. NULL); // Event object for determining status. Not used. if (error != CL_SUCCESS) fatal_CL(error, __LINE__); time5 = get_time(); //==================================================50 // END //==================================================50 //====================================================================================================100 // END //====================================================================================================100 //======================================================================================================================================================150 // GPU MEMORY DEALLOCATION //======================================================================================================================================================150 // Release kernels... clReleaseKernel(kernel); // Now the program... clReleaseProgram(program); // Clean up the device memory... clReleaseMemObject(knodesD); clReleaseMemObject(currKnodeD); clReleaseMemObject(offsetD); clReleaseMemObject(lastKnodeD); clReleaseMemObject(offset_2D); clReleaseMemObject(startD); clReleaseMemObject(endD); clReleaseMemObject(ansDStart); clReleaseMemObject(ansDLength); // Flush the queue error = clFlush(command_queue); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // ...and finally, the queue and context. clReleaseCommandQueue(command_queue); // ??? clReleaseContext(context); time6 = get_time(); //======================================================================================================================================================150 // DISPLAY TIMING //======================================================================================================================================================150 printf("Time spent in different stages of GPU_CUDA KERNEL:\n"); printf("%15.12f s, %15.12f % : GPU: SET DEVICE / DRIVER INIT\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: ALO\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY IN\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU: KERNEL\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: COPY OUT\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time6-time0) * 100); printf("%15.12f s, %15.12f % : GPU MEM: FRE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time6-time0) * 100); printf("Total time:\n"); printf("%.12f s\n", (float) (time6-time0) / 1000000); //======================================================================================================================================================150 // END //======================================================================================================================================================150 }
void OpenCLExecuter::ocl_filter_shared(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer cl_sampler sampler; // OpenCL sampler cl_kernel ckKernel; // OpenCL kernel int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process // set Local work size dimensions // size_t local_threads[3] ={256,256,64}; // set Global work size dimensions // size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]}; // set Global work size dimensions size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err); printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // create a sampler object sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err); printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // grab input data from OpenGL, compute, copy the results back to OpenGL // Runs asynchronous to host, up until blocking clFinish at the end glFinish(); glFlush(); // grab the OpenGL texture object for read/writing from OpenCL err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL); printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err)); // Execute a kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); /* // Blocking read of results from GPU to Host int size = volobj->texwidth*volobj->texheight*volobj->texdepth; unsigned char* result = new unsigned char[size]; err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i]; delete[] result; */ // copy OpenCL buffer to OpenGl texture size_t corigin[3] = {0,0,0}; size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth}; err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL); printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err)); //make sure we block until we are done. //err = clFinish(ocl_wrapper->commandQue); //printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err)); //release opengl objects now err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0); printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(src_buffer)clReleaseMemObject(src_buffer); if(dst_buffer)clReleaseMemObject(dst_buffer); }
void OpenCLExecuter::ocl_parrallelReduction(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem tmp_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group size_t numWorkGroups; cl_kernel ckKernel; // OpenCL kernel int iNumElements = 65536; //65536 // Length of float arrays to process // set Local work size dimensions szLocalWorkSize = 512; // set Global work size dimensions szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize; //szGlobalWorkSize = iNumElements; numWorkGroups = (float)szGlobalWorkSize/(float)szLocalWorkSize; printf("OPENCL: number of elements: %d\n", (int)iNumElements); printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize); printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize); printf("OPENCL: work groups: %d\n", (int)(numWorkGroups)); //temp array int* data = new int[iNumElements]; for(int i=0; i<iNumElements; i++) data[i] = randomFloat(1.0, (float)iNumElements); data[iNumElements/2] = -100.0; //for(int i=0; i<iNumElements; i++) // printf("data: %d\n", data[i]); size_t global_threads[1] ={iNumElements}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(int) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the temp buffer memory object tmp_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(int) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(int) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "min_reduce", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(int)*szLocalWorkSize, NULL); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&iNumElements); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end int numb_iterations = sqrt((float)numWorkGroups); numb_iterations=0; bool cont = true; Timer timer; timer.startTimer(); //for(int i=0; i<numb_iterations; i++) while(cont) { // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); numb_iterations++; if(data[1]==0) cont = false; //printf("min: %d\n", data[0]); for(int i=0; i<numWorkGroups; i++) printf("min: %d\n", data[i]); } timer.endTimer("GPU find min"); timer.startTimer(); int min=iNumElements; for(int i=0; i<iNumElements; i++) if(data[i]<min) min = data[i]; timer.endTimer("CPU find min"); printf("iters: %d\n", numb_iterations); printf("gpu-min: %d\n", data[0]); printf("cpu-min: %d\n", min); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); // printf("min: %d\n", data[0]); delete[] data; }
int main() { // Initiating opencl cl_device_id device_id; cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 1, &device_id, NULL); if (err != CL_SUCCESS) { std::cout<<"Error in device."<<std::endl; return EXIT_FAILURE; } cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { std::cout<<"Error in context."<<std::endl; return EXIT_FAILURE; } cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { std::cout<<"Error in command queue."<<std::endl; return EXIT_FAILURE; } std::ifstream in("transpMatrix.cl"); std::string contents((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>()); const char* kernelSource = contents.c_str(); cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err); if (!program) { std::cout<<"Error in program."<<std::endl; return EXIT_FAILURE; } err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; std::cout<<"Error in compiling the opencl program."<<std::endl; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); std::cout<<buffer<<std::endl; return EXIT_FAILURE; } cl_kernel kernel = clCreateKernel(program, "simplecl", &err); if (!kernel || err != CL_SUCCESS) { std::cout<<"Error in kernel "<<err<<std::endl; return EXIT_FAILURE; } // Data to compute float* data = new float[count*count]; for(int i = 0; i < count; ++i) { for(int j = 0; j < count; ++j) { data[i*count+j] = rand()%10; std::cout<<data[i*count+j]<<" "; } std::cout<<std::endl; } std::cout<<std::endl; // Creating communication buffers cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count*count, NULL, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count*count, NULL, NULL); if (!input || !output) { std::cout<<"Error in allocation."<<std::endl; return EXIT_FAILURE; } // Copy data to input buffer err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { std::cout<<"Error in copy."<<std::endl; return EXIT_FAILURE; } err = 0; err = clSetKernelArg(kernel, 0, sizeof(int), &count); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); if (err != CL_SUCCESS) { std::cout<<"Error in argument."<<std::endl; return EXIT_FAILURE; } size_t local[] = {1,1}; size_t global[] = {10,10}; // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); // if (err != CL_SUCCESS) // { // std::cout<<"Error in getting loal."<<std::endl; // return EXIT_FAILURE; // } err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global, local, 0, NULL, NULL); if (err != CL_SUCCESS) { std::cout<<"Error in pushing to queue "<<err<<std::endl; return EXIT_FAILURE; } clFinish(commands); // Is done now err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL ); if (err != CL_SUCCESS) { std::cout<<"Error in reading back."<<std::endl; return EXIT_FAILURE; } for(int i = 0; i < count; ++i) { for(int j = 0; j < count; ++j) { std::cout<<data[i*count+j]<<" "; } std::cout<<std::endl; } std::cout<<std::endl; return 0; }
int main(int argc, char **argv) { cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err)); const char *program_source[] = { "__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n", "{\n", " int i = get_global_id(0);\n", " dst[i] = src[i] * factor;\n", "}\n" }; cl_program program; program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err)); if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) { char buffer[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "CL Compilation failed:\n%s", buffer); abort(); } CL_CHECK(clUnloadCompiler()); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); int factor = 2; cl_kernel kernel; kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err)); for (int i=0; i<NUM_DATA; i++) { CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { int data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL)); printf(" %d", data); } printf("\n"); CL_CHECK(clReleaseMemObject(input_buffer)); CL_CHECK(clReleaseMemObject(output_buffer)); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int main(void) { // se crea los 2 vectores de entrada int i; const int LIST_SIZE = 1024; int *A = (int*)malloc(sizeof(int)*LIST_SIZE); int *B = (int*)malloc(sizeof(int)*LIST_SIZE); for(i = 0; i < LIST_SIZE; i++) { A[i] = i; B[i] = LIST_SIZE - i; } // cargamos el kernel en source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("vector_add_kernel.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // obtenemos las plataformas y informacion de los devices cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // creamos un contexto OpenCL cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); // creamos la cola de comandos cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // creamos el buffer de memoria en el device para cada vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); // copiamos los vectores A y B a sus respectivas memorias buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); // creamos un programa para el kernel cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // generamos el programa ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // creamos el kernel cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); // establecemos los argumentos del kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); // ejecutamos el kernel de la lista size_t global_item_size = LIST_SIZE; size_t local_item_size = 64; // dividimos los work items en grupos de 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // copiamos la memoria buffer C del device hacia la variable local C int *C = (int*)malloc(sizeof(int)*LIST_SIZE); ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); // muestra el resultado for(i = 0; i < LIST_SIZE; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); free(A); free(B); free(C); return 0; }
int main(int argc, char **argv) { printf("start \n"); int x, y, nsteps, i, j; float *u_h; double *f_h; //pointers to host memory int ArraySizeX = 5122; int ArraySizeY = 5122; double n, ux, uy, uxx, uxy, uyy, usq; FILE *fp; size_t size = ArraySizeX*ArraySizeY*sizeof(float); size_t size1 = ArraySizeX*ArraySizeY*9*sizeof(double); u_h = (float *)calloc(ArraySizeX*ArraySizeY,sizeof(float)); f_h = (double *)calloc(ArraySizeX*ArraySizeY*9,sizeof(double)); printf("initialization \n"); // initialization for( x = 0;x<ArraySizeX;x++){ for( y =0;y<ArraySizeY;y++){ // define the macroscopic properties of the initial condition. n = 1 + Amp2*exp(-(pow(x-ArraySizeX/2,2)+pow(y-ArraySizeY/2,2))/Width); ux = 0; uy = 0; // intialize f to be the local equilibrium values uxx = ux*ux; uyy = uy*uy; uxy = 2*ux*uy; usq = uxx+ uyy; f_h[x*ArraySizeY*9+y*9] = w1*n*(1-1.5*usq); f_h[x*ArraySizeY*9+y*9+1] = w2*n*(1+3*ux+4.5*uxx-1.5*usq); f_h[x*ArraySizeY*9+y*9+2] = w2*n*(1-3*ux+4.5*uxx-1.5*usq); f_h[x*ArraySizeY*9+y*9+3] = w2*n*(1+3*uy+4.5*uyy-1.5*usq); f_h[x*ArraySizeY*9+y*9+4]= w2*n*(1-3*uy+4.5*uyy-1.5*usq); f_h[x*ArraySizeY*9+y*9+5] = w3*n*(1+3*(ux+uy)+4.5*(uxx+uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+6] = w3*n*(1+3*(-ux+uy)+4.5*(uxx-uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+7] = w3*n*(1+3*(-ux-uy)+4.5*(uxx+uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+8] = w3*n*(1+3*(ux-uy)+4.5*(uxx-uxy+uyy)-1.5*usq); } } cl_event event; cl_ulong time_start, time_end, total_time; // use this to check the output of each API call cl_int status; // retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0,NULL,&numPlatforms); chk(status, "clGetPlatformIDs0"); // allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id *) malloc(numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); chk(status, "clGetPlatformIDs1"); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); chk(status, "clGetDeviceIDs0"); // Allocate enough space for each device cl_device_id *devices = NULL; devices = (cl_device_id *) malloc(numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); chk(status, "clGetDeviceIDs1"); // Create a context and associate it with devices cl_context context; context = clCreateContext(NULL,numDevices, devices, NULL, NULL, &status); chk(status,"clCreateContext"); // Create a command queue and associate it with device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0],CL_QUEUE_PROFILING_ENABLE,&status); chk(status,"clCreateCommandQueue"); // Create Buffer objects on devices cl_mem u_d, f_d; u_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); chk(status,"clCreatebuffer"); f_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size1, NULL, &status); chk(status, "clCreatebuffer"); // perform computing on GPU // copy data from host to device status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL); chk(status,"ClEnqueueWriteBuffer"); status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL); chk(status, "clEnqueueWriteBuffer"); // create program with source code cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status); chk(status, "clCreateProgramWithSource"); // Compile program for the device status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL); // chk(status, "ClBuildProgram"); if(status != CL_SUCCESS){ printf("clBuildProgram failed (%d) \n", status); size_t log_size; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char *log = (char *) malloc(log_size); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL); printf("%s\n", log); exit(-1); } printf("successfully built program \n"); // Create lattice-boltzman kernel cl_kernel kernel, kernel1; kernel = clCreateKernel(program, "lbiteration", &status); kernel1 = clCreateKernel(program, "Denrho", &status); chk(status, "clCreateKernel"); printf("successfully create kernel \n"); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d); status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d); status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d); status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX); status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX); status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY); status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY); chk(status, "clSerKernelArg"); // set the work dimensions size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y}; int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2); int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2); size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y}; // loop the kernel for( nsteps = 0; nsteps < 100; nsteps++){ status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event); clWaitForEvents(1 , &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; } printf("Good so far \n"); status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event); chk(status, "clEnqueueNDR"); clWaitForEvents(1 , &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("running time is %0.3f \n",(total_time/1000000000.0)); // retrieve data from device status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL); chk(status, "clEnqueueReadBuffer"); // Output results fp = fopen("SolutionCL.txt", "wt"); for(i= 0;i<ArraySizeX;i++){ for(j=0;j<ArraySizeY;j++) fprintf(fp, " %f", u_h[i*ArraySizeY+j]); fprintf(fp, "\n"); } fclose(fp); //cleanup clReleaseKernel(kernel); clReleaseKernel(kernel1); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(u_d); clReleaseMemObject(f_d); clReleaseContext(context); free(u_h); free(f_h); free(platforms); free(devices); return 0; }
int main(int argc, char** argv) { srand(1000); int i; unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; cl_event mm; size_t dataBytes; size_t kernelLength; cl_int errcode; cl_mem d_A; cl_mem d_B; cl_mem d_C; clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); FILE* fp = fopen("hw2.cl", "r"); fseek (fp , 0 , SEEK_END); const size_t lSize = ftell(fp); rewind(fp); unsigned char* buffer; buffer = (unsigned char*) malloc (lSize); fread(buffer, 1, lSize, fp); fclose(fp); cl_int status; clProgram = clCreateProgramWithBinary(clGPUContext, 1, (const cl_device_id *)clDevices, &lSize, (const unsigned char**)&buffer, &status, &errcode); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); clKernel = clCreateKernel(clProgram, "MM", &errcode); size_t globalWorkSize[2]; int wA = WA; int wC = WC; errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); globalWorkSize[0] = 16; globalWorkSize[1] = 16; cl_ulong time_start, time_end, total_time = 0; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &mm); printf("Average time = %lu\n"); clFinish(clCommandQue); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("Average time = %lu\n", total_time); errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
int main(int argc, char *argv[]) { double Mops, t1, t2; double tsx, tsy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int i, nit; int k_offset, j; logical verified; char size[16]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; setup_opencl(argc, argv); timer_clear(0); timer_start(0); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; DTIMER_START(T_KERNEL_EMBAR); // Launch the kernel int q_size = GROUP_SIZE * NQ * sizeof(cl_double); int sx_size = GROUP_SIZE * sizeof(cl_double); int sy_size = GROUP_SIZE * sizeof(cl_double); err_code = clSetKernelArg(kernel, 0, q_size, NULL); err_code |= clSetKernelArg(kernel, 1, sx_size, NULL); err_code |= clSetKernelArg(kernel, 2, sy_size, NULL); err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq); err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx); err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy); err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset); err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an); clu_CheckError(err_code, "clSetKernelArg()"); size_t localWorkSize[] = { GROUP_SIZE }; size_t globalWorkSize[] = { np }; err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_EMBAR); double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size); double *gsx = (double*)malloc(gsx_size); double *gsy = (double*)malloc(gsy_size); gc = 0.0; tsx = 0.0; tsy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } // 9. Get the result DTIMER_START(T_BUFFER_READ); err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); DTIMER_STOP(T_BUFFER_READ); for (i = 0; i < np/localWorkSize[0]; i++) { for (j = 0; j < NQ; j++ ){ q[j] = q[j] + gq[i][j]; } tsx = tsx + gsx[i]; tsy = tsy + gsy[i]; } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((tsx - sx_verify_value) / sx_verify_value); sy_err = fabs((tsy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", tsx, tsy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } free(gq); free(gsx); free(gsy); release_opencl(); fflush(stdout); return 0; }
void AdvancedMaxPoolingLayer::FeedForward() { #ifdef BUILD_OPENCL_MAX input_->data.MoveToGPU(); output_->data.MoveToGPU(true); maximum_mask_.MoveToGPU(true); cl_uint error = 0; error |= clSetKernelArg (CLHelper::k_amaximumForward, 0, sizeof (cl_mem), &input_->data.cl_data_ptr_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 1, sizeof (cl_mem), &maximum_mask_.cl_data_ptr_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 2, sizeof (cl_mem), &output_->data.cl_data_ptr_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 3, sizeof (unsigned int), &input_width_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 4, sizeof (unsigned int), &input_height_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 5, sizeof (unsigned int), &maps_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 6, sizeof (unsigned int), &output_width_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 7, sizeof (unsigned int), &output_height_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 8, sizeof (unsigned int), ®ion_width_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 9, sizeof (unsigned int), ®ion_height_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 10, sizeof (unsigned int), &stride_width_); error |= clSetKernelArg (CLHelper::k_amaximumForward, 11, sizeof (unsigned int), &stride_height_); if (error != CL_SUCCESS) { FATAL ("Error setting kernel args: " << (signed int) error); } size_t global_work_size[] = { output_width_, output_height_, maps_* input_->data.samples() }; error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_amaximumForward, 3, NULL, global_work_size, NULL, 0, NULL, NULL); if (error != CL_SUCCESS) { FATAL ("Error enqueueing kernel: " << (signed int) error); } #ifdef BRUTAL_FINISH error = clFinish (CLHelper::queue); if (error != CL_SUCCESS) { FATAL ("Error finishing command queue: " << (signed int) error); } #endif #else #pragma omp parallel for default(shared) for (std::size_t sample = 0; sample < input_->data.samples(); sample++) { for (unsigned int map = 0; map < maps_; map++) { for (unsigned int ox = 0; ox < output_width_; ox++) { for (unsigned int oy = 0; oy < output_height_; oy++) { // Find maximum in region datum maximum = std::numeric_limits<datum>::lowest(); unsigned int mix = 0; unsigned int miy = 0; for (unsigned int iy = oy * stride_height_; iy < (oy * stride_height_) + region_height_; iy++) { for (unsigned int ix = ox * stride_width_; ix < (ox * stride_width_) + region_width_; ix++) { const datum ival = *input_->data.data_ptr_const (ix, iy, map, sample); if (ival > maximum) { maximum = ival; mix = ix; miy = iy; } } } // Found maximum, save *maximum_mask_.data_ptr(ox, oy, map, sample) = input_width_ * miy + mix; // Feed forward *output_->data.data_ptr(ox, oy, map, sample) = maximum; } } } } #endif }
void OpenCLExecuter::ocl_filterBoundingBox(int channel, int window_size) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem bbmin_buffer; // OpenCL device source buffer cl_mem bbmax_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel cl_int4 minbb; cl_int4 maxbb; minbb.s[0] = minbb.s[1] = minbb.s[2] = 8192; maxbb.s[0] = maxbb.s[1] = maxbb.s[2] = -8192; int iNumElements = 3*volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object bbmin_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(cl_int4), NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); bbmax_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(cl_int4), NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&bbmin_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&bbmax_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&channel); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_FALSE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmin_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&minbb, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueReadBuffer (ocl_wrapper->commandQue, bbmax_buffer, CL_TRUE, 0, sizeof(cl_int4), (void*)&maxbb, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(src_buffer)clReleaseMemObject(src_buffer); if(bbmin_buffer)clReleaseMemObject(bbmin_buffer); if(bbmax_buffer)clReleaseMemObject(bbmax_buffer); maxbb.s[0] += (float)window_size/2.0; maxbb.s[1] += (float)window_size/2.0; maxbb.s[2] += (float)window_size/2.0; minbb.s[0] -= (float)window_size/2.0; minbb.s[1] -= (float)window_size/2.0; minbb.s[2] -= (float)window_size/2.0; maxbb.s[0] += 2; maxbb.s[1] += 2; maxbb.s[2] += 2; minbb.s[0] -= 2; minbb.s[1] -= 2; minbb.s[2] -= 2; if(maxbb.s[0]>volobj->texwidth-1) maxbb.s[0] =volobj->texwidth-1; if(maxbb.s[1]>volobj->texheight-1) maxbb.s[1] =volobj->texheight-1; if(maxbb.s[2]>volobj->texdepth-1) maxbb.s[2] =volobj->texdepth-1; if(minbb.s[0]<0) minbb.s[0]=0; if(minbb.s[1]<0) minbb.s[1]=0; if(minbb.s[2]<0) minbb.s[2]=0; volobj->boundingboxSize.x = ((maxbb.s[0])-(minbb.s[0]-1)); volobj->boundingboxSize.y = ((maxbb.s[1])-(minbb.s[1]-1)); volobj->boundingboxSize.z = ((maxbb.s[2])-(minbb.s[2]-1)); volobj->boundingboxCentre.x = 0.0; //-(((float)boundingboxSize.x)/2.0); volobj->boundingboxCentre.y = 0.0; //-(((float)boundingboxSize.y)/2.0); volobj->boundingboxCentre.z = 0.0; //-(((float)boundingboxSize.z)/2.0); volobj->boundingboxMin = Vector(minbb.s[0], minbb.s[1], minbb.s[2]); volobj->boundingboxMax = Vector(maxbb.s[0], maxbb.s[1], maxbb.s[2]); printf("min: %f, %f, %f\n", volobj->boundingboxMin.x, volobj->boundingboxMin.y, volobj->boundingboxMin.z); printf("max: %f, %f, %f\n", volobj->boundingboxMax.x, volobj->boundingboxMax.y, volobj->boundingboxMax.z); }
int main(void) { //time meassuring struct timeval tvs; struct timeval tve; float elapsedTime; int Nx; int Ny; int Nz; int N; int plotnum=0; int Tmax=0; int plottime=0; int plotgap=0; float Lx,Ly,Lz; float dt=0.0; float A=0.0; float B=0.0; float Du=0.0; float Dv=0.0; float a[2]={1.0,0.0}; float b[2]={0.5,0.0}; float* x,*y,*z ; float* u[2],*v[2]; //openCL variables cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem cl_u[2] = {NULL,NULL}; cl_mem cl_v[2] = {NULL,NULL}; cl_mem cl_uhat[2] = {NULL,NULL}; cl_mem cl_vhat[2] = {NULL,NULL}; cl_mem cl_x = NULL; cl_mem cl_y = NULL; cl_mem cl_z = NULL; cl_mem cl_kx = NULL; cl_mem cl_ky = NULL; cl_mem cl_kz = NULL; cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL; cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, 0, &ret); size_t source_size; char *source_str; //end opencl int i,n; int status=0; //int start, finish, count_rate, ind, numthreads char nameconfig[100]=""; //Read infutfile char InputFileName[]="./INPUTFILE"; FILE*fp; fp=fopen(InputFileName,"r"); if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);} int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B); if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);} fclose(fp); printf("NX %d\n",Nx); printf("NY %d\n",Ny); printf("NZ %d\n",Nz); printf("Tmax %d\n",Tmax); printf("plotgap %d\n",plotgap); printf("Lx %f\n",Lx); printf("Ly %f\n",Ly); printf("Lz %f\n",Lz); printf("dt %f\n",dt); printf("Du %f\n",Du); printf("Dv %f\n",Dv); printf("F %f\n",A); printf("k %f\n",B); printf("Read inputfile\n"); N=Nx*Ny*Nz; plottime=plotgap; B=A+B; //ALLocate the memory u[0]=(float*) malloc(N*sizeof(float)); v[0]=(float*) malloc(N*sizeof(float)); x=(float*) malloc(Nx*sizeof(float)); y=(float*) malloc(Ny*sizeof(float)); z=(float*) malloc(Nz*sizeof(float)); //allocate gpu mem cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); printf("allocated space\n"); // FFT library realted declarations. clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {Nx, Ny, Nz}; // Setup clFFT. clfftSetupData fftSetup; ret = clfftInitSetupData(&fftSetup); ret = clfftSetup(&fftSetup); // Create a default plan for a complex FFT. ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths); // Set plan parameters. ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE); // Bake the plan. ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL); // Create temporary buffer. cl_mem tmpBufferu = 0; cl_mem tmpBufferv = 0; // Size of temp buffer. size_t tmpBufferSize = 0; status = clfftGetTmpBufSize(planHandle, &tmpBufferSize); if ((status == 0) && (tmpBufferSize > 0)) { tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); if (ret != CL_SUCCESS) printf("Error with tmpBuffer clCreateBuffer\n"); } //kernel grid fp = fopen("./grid.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL); grid = clCreateKernel(p_grid, "grid", &ret); //first x cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx); size_t global_work_size_x[3] = {Nx, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny); size_t global_work_size_y[3] = {Ny, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_z = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz); size_t global_work_size_z[3] = {Nz, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid); //kernel initial data fp = fopen("./initialdata.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL); initialdata = clCreateKernel(p_initialdata, "initialdata", &ret); ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x); ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y); ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z); ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz); size_t global_work_size[3] = {N, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata); ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseMemObject(cl_x); ret = clReleaseMemObject(cl_y); ret = clReleaseMemObject(cl_z); //write to disk fp=fopen("./data/xcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); } for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);} fclose( fp ); fp=fopen("./data/ycoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); } for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);} fclose( fp ); fp=fopen("./data/zcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); } for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);} fclose( fp ); free(x); free(y); free(z); n=0; plotnum=0; //output of initial data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); //frequencies kernel fp = fopen("./frequencies.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL); frequencies = clCreateKernel(p_frequencies, "frequencies", &ret); //get frequencies first x cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); printf("Setup grid, fourier frequencies and initialcondition\n"); //load the rest of the kernels //linearpart kernel fp = fopen("./linearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, NULL); linearpart = clCreateKernel(p_linearpart, "linearpart", &ret); //kernel nonlinear fp = fopen("./nonlinearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL); nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret); printf("Got initial data, starting timestepping\n"); gettimeofday(&tvs, NULL); for(n=0;n<=Tmax;n++){ //linear ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); //nonlinearpart ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt); ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]); ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // linear part ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d\n",n*dt,n,plotnum); plottime=plottime+plotgap; plotnum=plotnum+1; ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); //output of data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); } } gettimeofday(&tve, NULL); printf("Finished time stepping\n"); elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0; // sec to ms elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0; // us to ms printf("%f,",elapsedTime); clReleaseMemObject(cl_u[0]); clReleaseMemObject(cl_u[1]); clReleaseMemObject(cl_v[0]); clReleaseMemObject(cl_v[1]); clReleaseMemObject(cl_uhat[0]); clReleaseMemObject(cl_uhat[1]); clReleaseMemObject(cl_vhat[0]); clReleaseMemObject(cl_vhat[1]); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); clReleaseMemObject(cl_kz); ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart); ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart); free(u[0]); free(v[0]); clReleaseMemObject(tmpBufferu); clReleaseMemObject(tmpBufferv); /* Release the plan. */ ret = clfftDestroyPlan(&planHandle); /* Release clFFT library. */ clfftTeardown(); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); printf("Program execution complete\n"); return 0; }
void OpenCLExecuter::ocl_filterPeronaMalik(float lambda, float dT, unsigned char* src_array, unsigned char* dst_array, int w, int h, int d) { float lambda2 = lambda*lambda; cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel int iNumElements = w*h*d; // Length of float arrays to process size_t global_threads[3] ={w,h,d}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(float), (void*)&lambda2); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(float), (void*)&dT); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texwidth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&volobj->texheight); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&volobj->texdepth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Output information err = output_device_info(device_id); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); if (!kernel_pi || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err)); return EXIT_FAILURE; } // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. %d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nwork_groups * work_group_size; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
void OpenCLExecuter::ocl_filterGaussian(unsigned char* src_array, unsigned char* dst_array, int w, int h, int d) { // printf("gaussian_sum: %f\n", gaussian_sum); printf("gaussian_width: %d\n", filter_width); printf("gaussian_mask size: %d\n", filter_kernel.size()); cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem gauss_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel int iNumElements = w*h*d; // Length of float arrays to process size_t global_threads[3] ={w,h,d}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); gauss_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(float) * filter_kernel.size(), NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); //================================================== // X axis //================================================== // Create the kernel ckKernel = clCreateKernel (cpProgram, "gaussianX", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); /*size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);*/ // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host //err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //================================================== // Y axis //================================================== // Create the kernel ckKernel = clCreateKernel (cpProgram, "gaussianY", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); /* size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); */ // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end // Write data from host to GPU //err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host //err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //================================================== // Z axis //================================================== // Create the kernel ckKernel = clCreateKernel (cpProgram, "gaussianZ", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&gauss_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&filter_width); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&w); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 5, sizeof(int), (void*)&h); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 6, sizeof(int), (void*)&d); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); /*size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); */ // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end //Prepare data to upload //for(int j=0; j<iNumElements; j++) // data[j] = volobj->texture3d[3*j+0]; // Write data from host to GPU //err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, src_array, 0, NULL, NULL); //printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, gauss_buffer, CL_FALSE, 0, sizeof(float) * filter_kernel.size(), &filter_kernel[0], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, dst_array, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); if(gauss_buffer)clReleaseMemObject(gauss_buffer); }
// host stub function void ops_par_loop_update_halo_kernel1_fr2(char const *name, ops_block block, int dim, int *range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7) { // Timing double t1, t2, c1, c2; ops_arg args[8] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args, 8, range, 22)) return; #endif if (OPS_diags > 1) { ops_timing_realloc(22, "update_halo_kernel1_fr2"); OPS_kernels[22].count++; ops_timers_core(&c1, &t1); } // compute locally allocated range for the sub-block int start[3]; int end[3]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for (int n = 0; n < 3; n++) { start[n] = sb->decomp_disp[n]; end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; if (start[n] >= range[2 * n]) { start[n] = 0; } else { start[n] = range[2 * n] - start[n]; } if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) start[n] = range[2 * n]; if (end[n] >= range[2 * n + 1]) { end[n] = range[2 * n + 1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n] == MPI_PROC_NULL && (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); } #else for (int n = 0; n < 3; n++) { start[n] = range[2 * n]; end[n] = range[2 * n + 1]; } #endif int x_size = MAX(0, end[0] - start[0]); int y_size = MAX(0, end[1] - start[1]); int z_size = MAX(0, end[2] - start[2]); int xdim0 = args[0].dat->size[0]; int ydim0 = args[0].dat->size[1]; int xdim1 = args[1].dat->size[0]; int ydim1 = args[1].dat->size[1]; int xdim2 = args[2].dat->size[0]; int ydim2 = args[2].dat->size[1]; int xdim3 = args[3].dat->size[0]; int ydim3 = args[3].dat->size[1]; int xdim4 = args[4].dat->size[0]; int ydim4 = args[4].dat->size[1]; int xdim5 = args[5].dat->size[0]; int ydim5 = args[5].dat->size[1]; int xdim6 = args[6].dat->size[0]; int ydim6 = args[6].dat->size[1]; // build opencl kernel if not already built buildOpenCLKernels_update_halo_kernel1_fr2(xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6); // set up OpenCL thread blocks size_t globalWorkSize[3] = { ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x, ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y, ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z}; size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y, OPS_block_size_z}; int *arg7h = (int *)arg7.data; int consts_bytes = 0; consts_bytes += ROUND_UP(NUM_FIELDS * sizeof(int)); reallocConstArrays(consts_bytes); consts_bytes = 0; arg7.data = OPS_consts_h + consts_bytes; arg7.data_d = OPS_consts_d + consts_bytes; for (int d = 0; d < NUM_FIELDS; d++) ((int *)arg7.data)[d] = arg7h[d]; consts_bytes += ROUND_UP(NUM_FIELDS * sizeof(int)); mvConstArraysToDevice(consts_bytes); // set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); base0 = base0 + args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 * (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); base1 = base1 + args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 * (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; #endif int base2 = 1 * 1 * (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); base2 = base2 + args[2].dat->size[0] * 1 * (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); base2 = base2 + args[2].dat->size[0] * 1 * args[2].dat->size[1] * 1 * (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; #endif int base3 = 1 * 1 * (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); base3 = base3 + args[3].dat->size[0] * 1 * (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); base3 = base3 + args[3].dat->size[0] * 1 * args[3].dat->size[1] * 1 * (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; #endif int base4 = 1 * 1 * (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); base4 = base4 + args[4].dat->size[0] * 1 * (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); base4 = base4 + args[4].dat->size[0] * 1 * args[4].dat->size[1] * 1 * (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; #endif int base5 = 1 * 1 * (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); base5 = base5 + args[5].dat->size[0] * 1 * (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); base5 = base5 + args[5].dat->size[0] * 1 * args[5].dat->size[1] * 1 * (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; #endif int base6 = 1 * 1 * (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); base6 = base6 + args[6].dat->size[0] * 1 * (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); base6 = base6 + args[6].dat->size[0] * 1 * args[6].dat->size[1] * 1 * (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); ops_H_D_exchanges_device(args, 8); ops_halo_exchanges(args, 8, range); ops_H_D_exchanges_device(args, 8); if (OPS_diags > 1) { ops_timers_core(&c2, &t2); OPS_kernels[22].mpi_time += t2 - t1; } if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) { clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 0, sizeof(cl_mem), (void *)&arg0.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 1, sizeof(cl_mem), (void *)&arg1.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 2, sizeof(cl_mem), (void *)&arg2.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 3, sizeof(cl_mem), (void *)&arg3.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 4, sizeof(cl_mem), (void *)&arg4.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 5, sizeof(cl_mem), (void *)&arg5.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 6, sizeof(cl_mem), (void *)&arg6.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 7, sizeof(cl_mem), (void *)&arg7.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 8, sizeof(cl_int), (void *)&base0)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 9, sizeof(cl_int), (void *)&base1)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 10, sizeof(cl_int), (void *)&base2)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 11, sizeof(cl_int), (void *)&base3)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 12, sizeof(cl_int), (void *)&base4)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 13, sizeof(cl_int), (void *)&base5)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 14, sizeof(cl_int), (void *)&base6)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 15, sizeof(cl_int), (void *)&x_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 16, sizeof(cl_int), (void *)&y_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[22], 17, sizeof(cl_int), (void *)&z_size)); // call/enque opencl kernel wrapper function clSafeCall(clEnqueueNDRangeKernel( OPS_opencl_core.command_queue, OPS_opencl_core.kernel[22], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL)); } if (OPS_diags > 1) { clSafeCall(clFinish(OPS_opencl_core.command_queue)); } if (OPS_diags > 1) { ops_timers_core(&c1, &t1); OPS_kernels[22].time += t1 - t2; } ops_set_dirtybit_device(args, 8); ops_set_halo_dirtybit3(&args[0], range); ops_set_halo_dirtybit3(&args[1], range); ops_set_halo_dirtybit3(&args[2], range); ops_set_halo_dirtybit3(&args[3], range); ops_set_halo_dirtybit3(&args[4], range); ops_set_halo_dirtybit3(&args[5], range); ops_set_halo_dirtybit3(&args[6], range); if (OPS_diags > 1) { // Update kernel record ops_timers_core(&c2, &t2); OPS_kernels[22].mpi_time += t2 - t1; OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg0); OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg1); OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg2); OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg3); OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg4); OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg5); OPS_kernels[22].transfer += ops_compute_transfer(dim, start, end, &arg6); } }
int main() { // Create the variables for the time measure int starttime, stoptime; //Get initial time starttime = GetTimeMs(); // This code executes on the OpenCL host // Host data float *A=NULL; // Input array float *B=NULL; // Input array float *C=NULL; // Output array // Elements in each array const int elements=2048; // Compute the size of the data size_t datasize=sizeof(int)*elements; // Allocate space for input/output data A=(float*)malloc(datasize); B=(float*)malloc(datasize); C=(float*)malloc(datasize); // Initialize the input data A[0]=2.2; A[1]=1.3; B[0]=3.7; B[1]=5.4; // Load the kernel source code into the array programSource FILE *fp; char *programSource; size_t programSize; fp = fopen("fplos_kernels.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } programSource = (char*)malloc(MAX_SOURCE_SIZE); fclose( fp ); // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms=0; status=clGetPlatformIDs(0, NULL,&numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms=NULL; platforms=(cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices=0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL,&numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A cl_mem bufA; bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will contain the data // from the host array B cl_mem bufB; bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will hold the output data cl_mem bufC; bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); // Write input array A to the device buffer bufferA status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE, 0, datasize, A, 0, NULL, NULL); // Write input array B to the device buffer bufferB status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE, 0, datasize, B, 0, NULL, NULL); // Create a program with source code cl_program program=clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status); // Build (compile) the program for the device status=clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); // Create the vector addition kernel cl_kernel kernel; kernel=clCreateKernel(program, "floatadd", &status); // Associate the input and output buffers with the kernel status=clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA); status=clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB); status=clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC); // Define an index space (global work size) of work // items for execution. A workgroup size (local work size) // is not required, but can be used. size_t globalWorkSize[1]; // There are 'elements' work-items globalWorkSize[0]=elements; // Execute the kernel for execution status=clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0, datasize, C, 0, NULL, NULL); printf("Output = %.1f\n", C[0]); printf("Output = %.1f\n", C[1]); // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); clReleaseContext(context); // Free host resources free(A); free(B); free(C); free(platforms); free(devices); //Get initial time stoptime = GetTimeMs(); printf("Duration= %d ms\n", stoptime - starttime); return 0; }
int main(int argc, const char * argv[]) { //First we set the variables for measuring performance. struct timeval tim1, tim2; uint64_t time; //Calling the function "gettimeofday" to measure the time before the program executes. gettimeofday(&tim1, NULL); /* * These are the declarations of the OpenCL structures are described below: * cl_platform-id - Stores the types of platforms installed on the host. * cl_device_id - Stores the type of the device (GPU, CPU, etc.) * cl_context - Stores the context in which a command queue can be created. * cl_command_queue - Stores the command queue which governs how the GPU will * will execute the kernel. * cl_program - Stores the kernel code (which can be comprised of several kernels). Is compiled later its * functions get packaged into kernels. * cl_kernel - The OpenCL data structure that represents kernels. */ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; //A cl_int used to store error flags that are returned if OpenCL function does not execute properly. cl_int err; /* * A file object and buffers used to store the input kernel code as well as allocate the memory for the kernel code * and the output log from the compiler during the compilation of the kernel code. */ FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; //The number of work items in each dimension of the data. size_t work_units_per_kernel; //This value determines the size of the nxn (square) array. int n = 1000; //Allocating the memory for the nxn arrays of floats. float **h_xx = (float**)malloc(sizeof(float*)*n); float **h_yy = (float**)malloc(sizeof(float*)*n); float **h_zz = (float**)malloc(sizeof(float*)*n); for(int i = 0; i<n; i++){ h_xx[i] = (float*)malloc(sizeof(float)*n); h_yy[i] = (float*)malloc(sizeof(float)*n); h_zz[i] = (float*)malloc(sizeof(float)*n); //Initializing the arrays. for(int j = 0; j<n; j++){ h_xx[i][j] = i+j; h_yy[i][j] = i+j; } } /* * These three variables of the type cl_mem (memory object) are used as buffers and hold the data which will * be sent to the device and then once calculated sent back to the host. */ cl_mem d_xx; cl_mem d_yy; cl_mem d_zz; // Obtains the Platform information installed on the host and stores into the memory location of the variable "platform" err = clGetPlatformIDs(1, &platform, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Failed to locate Platform." << std::endl; exit(1); } // Obtains the device information (looking for specifically GPU devices) and stores it into the memory location of the variable "device" err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err != CL_SUCCESS){ printf("Error: Failed to locate Device."); exit(1); } // Creates a context on the device and stores it into the "context" variable. context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create context." << std::endl; exit(1); } /* * The following code stores the file "arraySum.cl" into the FILE object "program_handle". It then determines the size * of the file and reads the content into the variable "program_buffer". */ program_handle = fopen("flopstestloop.cl", "r"); if(!program_handle){ std::cout << "Error: Failed to Load Kernel" << std::endl; exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)malloc(program_size + 1); program_buffer[program_size] = '\0'; fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); // Stores the kernel code into a program and stores it into the "program" variable. program = clCreateProgramWithSource(context, 1, (const char **)&program_buffer, (const size_t *)&program_size, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the program" << std::endl; exit(1); } free(program_buffer); //Compiles the program and stores the compiled code into the argument "program" err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not compile the program" << std::endl; /* * The following code first allocates the correct amount of memory in order to store the output of the compilers * build log and then it stores this log into the buffer "program_log". Finally it prints this buffer to the * screen. */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*)malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } //From the compiled code in the program creates a kernel called "arraysum" kernel = clCreateKernel(program, "arraysum", &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the kernel" << std::endl; exit(1); } //Creates a command queue and stores it into the variable "queue". queue = clCreateCommandQueue(context, device, 0, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the queue" << std::endl; exit(1); } //Creating the Device memory buffers. These will be used to transfer data from the host to the device and vice versa. d_xx = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the buffer d_xx" << std::endl; exit(1); } d_yy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the buffer d_yy" << std::endl; exit(1); } d_zz = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*n, NULL, &err); if(err != CL_SUCCESS){ std::cout << "Error: Could not create the buffer d_zz" << std::endl; exit(1); } /* * This for loop loops over the each row in the matrices x and y first writes the row to the device memory where * the kernel arguments are then set and then then passed to the compiled kernel code already located on the device. * Once executed, the results are then stored in the d_zz buffer and are read back to the host. */ for(int i = 0; i<n; i++) { //Writing the data from the host to the device err = clEnqueueWriteBuffer(queue, d_xx, CL_TRUE, 0, sizeof(float)*n, h_xx[i], 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not write to buffer d_xx" << std::endl; exit(1); } err = clEnqueueWriteBuffer(queue, d_yy, CL_TRUE, 0, sizeof(float)*n, h_yy[i], 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not write to buffer d_yy" << std::endl; exit(1); } //Setting the Kernel Arguments err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_xx); if(err != CL_SUCCESS){ std::cout << "Error: Could not set kernel argument h_xx." << std::endl; exit(1); } err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_yy); if(err != CL_SUCCESS){ std::cout << "Error: Could not set kernel argument h_yy." << std::endl; exit(1); } err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_zz); if(err != CL_SUCCESS){ std::cout << "Error: Could not set kernel argument h_zz." << std::endl; } work_units_per_kernel = n; //Executing the Kernel err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units_per_kernel, NULL, 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not execute kernel." << std::endl; exit(1); } //Reading the Data from the Kernel err = clEnqueueReadBuffer(queue, d_zz, CL_TRUE, 0, n*(sizeof(float)), h_zz[i], 0, NULL, NULL); if(err != CL_SUCCESS){ std::cout << "Error: Could not read data from kernel." << std::endl; exit(1); } } //Measuring the time after the OpenCL code has executed and has been copied back to the host. gettimeofday(&tim2, NULL); //Finding the difference between the two measured times. time = tim2.tv_sec - tim1.tv_sec; //Displaying the elapsed time in seconds. std::cout << time + (tim2.tv_usec - tim1.tv_usec)/1000000.00 << std::endl; //The previously allocated memory is freed. clReleaseMemObject(d_xx); clReleaseMemObject(d_yy); clReleaseMemObject(d_zz); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
void spmv_csr_cpu(const csr_matrix* csr,const float* x,const float* y,float* out) { int num_rows = csr->num_rows; int sourcesize = 1024*1024; char * source = (char *)calloc(sourcesize, sizeof(char)); if(!source) { fprintf(stderr, "ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * kernel_csr_src = "csr_ocl"; char * tempchar = "./spmv_kernel.cl"; FILE * fp = fopen(tempchar, "rb"); if(!fp) { fprintf(stderr, "ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); int use_gpu = 1; if(initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = { source, 0 }; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clBuildProgram() => %d\n", err); return -1; } cl_kernel kernel_csr; kernel_csr = clCreateKernel(prog, kernel_csr_src, &err); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateKernel() 0 => %d\n", err); return -1; } clReleaseProgram(prog); cl_mem memAp; cl_mem memAj; cl_mem memAx; cl_mem memx; cl_mem memy; memAp = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*(csr.num_rows+1), NULL, &err); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memAj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*csr.num_nonzeros, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memAx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memx = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} memy = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*csr.num_rows, NULL, &err ); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clCreateBuffer\n"); return -1;} //write buffers err = clEnqueueWriteBuffer(cmd_queue, memAp, CL_FALSE, 0, sizeof(unsigned int)*csr.num_rows+4, csr->Ap, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memAj, CL_FALSE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr->Aj, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memAx, CL_FALSE, 0, sizeof(float)*csr.num_nonzeros, csr->Ax, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memx, CL_FALSE, 0, sizeof(float)*csr.num_cols, x, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, memy, CL_FALSE, 0, sizeof(float)*csr.num_rows, y, 0, NULL, NULL); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: clEnqueueWriteBuffer\n"); return -1; } clSetKernelArg(kernel_csr, 0, sizeof(unsigned int *), (unsigned int *) &csr->num_rows); clSetKernelArg(kernel_csr, 1, sizeof(void *), (void*) &memAp); clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memAj); clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memAx); clSetKernelArg(kernel_csr, 2, sizeof(void *), (void*) &memx); clSetKernelArg(kernel_csr, 3, sizeof(void *), (void*) &memy); err = clEnqueueNDRangeKernel(cmd_queue, kernel_csr, 2, NULL, global_work, local_work, 0, 0, 0); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err = clEnqueueReadBuffer(cmd_queue, memy, 1, 0, sizeof(float)*csr.num_rows, out, 0, 0, 0); if(err != CL_SUCCESS) { fprintf(stderr, "ERROR: 1 clEnqueueReadBuffer: out\n"); return -1; } clReleaseMemObject(memAp); clReleaseMemObject(memAj); clReleaseMemObject(memAx); clReleaseMemObject(memx); clReleaseMemObject(memy); }
void vectorVectorAdditionGMDP (cl_uint numDevices,cl_device_id *devices, cl_program program,cl_context context,double * h_VectA,double *h_VectB, double *h_Output,int vectSize) { cl_event gpuExec[1]; cl_int err; cl_command_queue cmdQueue; //holds command queue object cl_kernel kernel; //holds kernel object cl_mem d_VectA,d_VectB,d_Output; //holds device input output buffer cl_event events; // events size_t globalWorkSize[2]={vectSize,vectSize}; //holds global group size double gflops=0.0; //holds total achieved gflops cl_ulong startTime, endTime,elapsedTime; //holds time float executionTimeInSeconds; //holds total execution time /*create command queue*/ cmdQueue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err); if( err != CL_SUCCESS || cmdQueue == 0) { printf("\n\t Failed to create command queue \n" ); exit (-1); } /*create kernel object*/ kernel = clCreateKernel(program,"VectVectAddDPKernel",&err); OPENCL_CHECK_STATUS("error while creating kernel",err); /*create buffer*/ d_VectA=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectA,&err); OPENCL_CHECK_STATUS("error while creating buffer for input",err); d_VectB=clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(double)*vectSize,h_VectB,&err); OPENCL_CHECK_STATUS("error while creating buffer for input",err); d_Output=clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(double)*vectSize,NULL,&err); OPENCL_CHECK_STATUS("error while creating buffer for d_Output",err); /*set kernel arg*/ err=clSetKernelArg(kernel,0,sizeof(cl_mem),&d_VectA); OPENCL_CHECK_STATUS("error while setting arg 0",err); err=clSetKernelArg(kernel,1,sizeof(cl_mem),&d_VectB); OPENCL_CHECK_STATUS("error while setting arg 1",err); err=clSetKernelArg(kernel,2,sizeof(cl_mem),&d_Output); OPENCL_CHECK_STATUS("error while setting arg 2",err); /*load kernel*/ err = clEnqueueNDRangeKernel(cmdQueue,kernel,2,NULL,globalWorkSize,NULL,0,NULL,&gpuExec[0]); OPENCL_CHECK_STATUS("error while creating ND range",err); //completion of all commands to command queue err = clFinish(cmdQueue); OPENCL_CHECK_STATUS("clFinish",err); /* calculate start time and end time*/ clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(gpuExec[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); /* total elapsed time*/ elapsedTime = endTime-startTime; /*total execution time*/ executionTimeInSeconds = (float)(1.0e-9 * elapsedTime); /* reading buffer object*/ err = clEnqueueReadBuffer(cmdQueue,d_Output,CL_TRUE,0,sizeof(cl_double)*vectSize,h_Output,0,0,&events); OPENCL_CHECK_STATUS("error while reading buffer",err); /* calculate total gflops*/ gflops= (1.0e-9 * (( vectSize) / executionTimeInSeconds)); // Print the gflops on the screen print_on_screen("Vector Vector Addition double precision using global memory",executionTimeInSeconds,vectSize,gflops,1); //check results vectVectAddCheckResultGMDP(h_VectA,h_VectB,h_Output,vectSize); //release opencl objects clReleaseMemObject(d_VectA); clReleaseMemObject(d_VectB); clReleaseMemObject(d_Output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(cmdQueue); clReleaseContext(context); }
// host stub function void ops_par_loop_advec_mom_kernel_post_pre_advec_x(char const *name, ops_block block, int dim, int* range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4) { ops_arg args[5] = { arg0, arg1, arg2, arg3, arg4}; ops_timing_realloc(18,"advec_mom_kernel_post_pre_advec_x"); OPS_kernels[18].count++; //compute locally allocated range for the sub-block int start[3]; int end[3]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for ( int n=0; n<3; n++ ){ start[n] = sb->decomp_disp[n];end[n] = sb->decomp_disp[n]+sb->decomp_size[n]; if (start[n] >= range[2*n]) { start[n] = 0; } else { start[n] = range[2*n] - start[n]; } if (sb->id_m[n]==MPI_PROC_NULL && range[2*n] < 0) start[n] = range[2*n]; if (end[n] >= range[2*n+1]) { end[n] = range[2*n+1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n]==MPI_PROC_NULL && (range[2*n+1] > sb->decomp_disp[n]+sb->decomp_size[n])) end[n] += (range[2*n+1]-sb->decomp_disp[n]-sb->decomp_size[n]); } #else //OPS_MPI for ( int n=0; n<3; n++ ){ start[n] = range[2*n];end[n] = range[2*n+1]; } #endif //OPS_MPI int x_size = MAX(0,end[0]-start[0]); int y_size = MAX(0,end[1]-start[1]); int z_size = MAX(0,end[2]-start[2]); int xdim0 = args[0].dat->size[0]*args[0].dat->dim; int ydim0 = args[0].dat->size[1]; int xdim1 = args[1].dat->size[0]*args[1].dat->dim; int ydim1 = args[1].dat->size[1]; int xdim2 = args[2].dat->size[0]*args[2].dat->dim; int ydim2 = args[2].dat->size[1]; int xdim3 = args[3].dat->size[0]*args[3].dat->dim; int ydim3 = args[3].dat->size[1]; int xdim4 = args[4].dat->size[0]*args[4].dat->dim; int ydim4 = args[4].dat->size[1]; //build opencl kernel if not already built buildOpenCLKernels_advec_mom_kernel_post_pre_advec_x( xdim0,ydim0,xdim1,ydim1,xdim2,ydim2,xdim3,ydim3,xdim4,ydim4); //Timing double t1,t2,c1,c2; ops_timers_core(&c2,&t2); //set up OpenCL thread blocks size_t globalWorkSize[3] = {((x_size-1)/OPS_block_size_x+ 1)*OPS_block_size_x, ((y_size-1)/OPS_block_size_y + 1)*OPS_block_size_y, MAX(1,end[2]-start[2])}; size_t localWorkSize[3] = {OPS_block_size_x,OPS_block_size_y,1}; int dat0 = args[0].dat->elem_size; int dat1 = args[1].dat->elem_size; int dat2 = args[2].dat->elem_size; int dat3 = args[3].dat->elem_size; int dat4 = args[4].dat->elem_size; //set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif //OPS_MPI int base0 = 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); base0 = base0 + args[0].dat->size[0] * args[0].dat->size[1] * (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif //OPS_MPI int base1 = 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); base1 = base1 + args[1].dat->size[0] * args[1].dat->size[1] * (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; #endif //OPS_MPI int base2 = 1 * (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); base2 = base2 + args[2].dat->size[0] * (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); base2 = base2 + args[2].dat->size[0] * args[2].dat->size[1] * (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; #endif //OPS_MPI int base3 = 1 * (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); base3 = base3 + args[3].dat->size[0] * (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); base3 = base3 + args[3].dat->size[0] * args[3].dat->size[1] * (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; #else //OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; #endif //OPS_MPI int base4 = 1 * (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); base4 = base4 + args[4].dat->size[0] * (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); base4 = base4 + args[4].dat->size[0] * args[4].dat->size[1] * (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); ops_H_D_exchanges_device(args, 5); ops_halo_exchanges(args,5,range); ops_H_D_exchanges_device(args, 5); ops_timers_core(&c1,&t1); OPS_kernels[18].mpi_time += t1-t2; clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 0, sizeof(cl_mem), (void*) &arg0.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 1, sizeof(cl_mem), (void*) &arg1.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 2, sizeof(cl_mem), (void*) &arg2.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 3, sizeof(cl_mem), (void*) &arg3.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 4, sizeof(cl_mem), (void*) &arg4.data_d )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 5, sizeof(cl_int), (void*) &base0 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 6, sizeof(cl_int), (void*) &base1 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 7, sizeof(cl_int), (void*) &base2 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 8, sizeof(cl_int), (void*) &base3 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 9, sizeof(cl_int), (void*) &base4 )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 10, sizeof(cl_int), (void*) &x_size )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 11, sizeof(cl_int), (void*) &y_size )); clSafeCall( clSetKernelArg(OPS_opencl_core.kernel[18], 12, sizeof(cl_int), (void*) &z_size )); //call/enque opencl kernel wrapper function clSafeCall( clEnqueueNDRangeKernel(OPS_opencl_core.command_queue, OPS_opencl_core.kernel[18], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL) ); if (OPS_diags>1) { clSafeCall( clFinish(OPS_opencl_core.command_queue) ); } ops_set_dirtybit_device(args, 5); ops_set_halo_dirtybit3(&args[0],range); ops_set_halo_dirtybit3(&args[3],range); //Update kernel record ops_timers_core(&c2,&t2); OPS_kernels[18].time += t2-t1; OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg0); OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg1); OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg2); OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg3); OPS_kernels[18].transfer += ops_compute_transfer(dim, range, &arg4); }
void OpenCLExecuter::ocl_filter_multi(void) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer[MAX_DEVICES]; // OpenCL device source buffer cl_mem dst_buffer[MAX_DEVICES]; // OpenCL device source buffer cl_command_queue queues[MAX_DEVICES]; // OpenCL device queue cl_kernel ckKernel[MAX_DEVICES]; // OpenCL kernel cl_event gpuDone[MAX_DEVICES]; // int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth*3; // Length of float arrays to process int xdim, ydim, zdim; xdim = (float)volobj->texwidth; // (float)ocl_wrapper->numDevices; ydim = (float)volobj->texheight; // (float)ocl_wrapper->numDevices; zdim = (float)volobj->texdepth / (float)ocl_wrapper->numDevices; //Length of array to process int iNumElements = (xdim*ydim*zdim); size_t global_threads[3] = {xdim, ydim, zdim}; //temp array unsigned char** data = new unsigned char*[ocl_wrapper->numDevices]; for(int i=0; i<ocl_wrapper->numDevices; i++) data[i] = new unsigned char[iNumElements]; for(int i=0; i<ocl_wrapper->numDevices; i++) { printf("OPENCL: Computing Device%d\n", i); //create the command queue we will use to execute OpenCL commands queues[i] = clCreateCommandQueue(ocl_wrapper->context, ocl_wrapper->devices[i], 0, &err); printf("OPENCL: clCreateCommandQueue: %s\n", ocl_wrapper->get_error(err)); // allocate the source buffer memory object src_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer[i] = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel[i] = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel[i], 0, sizeof(cl_mem), (void*)&src_buffer[i]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 1, sizeof(cl_mem), (void*)&dst_buffer[i]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 2, sizeof(int), (void*)&global_threads[0]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 3, sizeof(int), (void*)&global_threads[1]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel[i], 4, sizeof(int), (void*)&global_threads[2]); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); //Prepare data to upload int iOffsetElements = (xdim*ydim*zdim*i); for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++) data[i][j-iOffsetElements] = volobj->texture3d[3*j+0]; // Write data from host to GPU err = clEnqueueWriteBuffer (queues[i], src_buffer[i], CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); } for(int i=0; i<ocl_wrapper->numDevices; i++) { // Launch kernel err = clEnqueueNDRangeKernel (queues[i], ckKernel[i], 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); } for(int i=0; i<ocl_wrapper->numDevices; i++) { // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (queues[i], dst_buffer[i], CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data[i], 0, NULL, &gpuDone[i]); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); } // Synchronize with the GPUs printf("OPENCL: Waiting for devices to sync\n"); clWaitForEvents(ocl_wrapper->numDevices, gpuDone); for(int i=0; i<ocl_wrapper->numDevices; i++) { //read data back int iOffsetElements = (xdim*ydim*zdim*i); for(int j=iOffsetElements; j<iNumElements+iOffsetElements; j++) volobj->texture3d[3*j+0] = data[i][j-iOffsetElements]; } for(int i=0; i<ocl_wrapper->numDevices; i++) { // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel[i])clReleaseKernel(ckKernel[i]); //need to release any other OpenCL memory objects here if(dst_buffer[i])clReleaseMemObject(dst_buffer[i]); if(src_buffer[i])clReleaseMemObject(src_buffer[i]); } for(int i=0; i<ocl_wrapper->numDevices; i++) delete[] data[i]; delete[] data; }
void spmv_bcsr_ocl(b4csr_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowPtr; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int data_align = mat->b4csr_aligned_size; int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int blockrownum = mat->b4csr_row_num; int blocknum = mat->b4csr_block_num; int vecsize = mat->matinfo.width; int bwidth = mat->b4csr_bwidth; int bheight = mat->b4csr_bheight; int width4num = bwidth / 4; int padveclen = findPaddedSize(vecsize, 8); float* paddedvec = (float*)malloc(sizeof(float)*padveclen); memset(paddedvec, 0, sizeof(float)*padveclen); memcpy(paddedvec, vec, sizeof(float)*vecsize); ALLOCATE_GPU_READ(devRowPtr, mat->b4csr_row_ptr, sizeof(int)*(blockrownum + 1)); ALLOCATE_GPU_READ(devColid, mat->b4csr_col_id, sizeof(int)*blocknum); ALLOCATE_GPU_READ(devData, mat->b4csr_data, sizeof(float)*data_align*width4num*bheight); ALLOCATE_GPU_READ(devVec, paddedvec, sizeof(float)*padveclen); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_RGBA, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; if (height % 4 != 0) height += (4 - (height % 4)); float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height/4, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height/4, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; int gsize = blockrownum * CSR_VEC_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bcsr_red_00"; kernelname[13] += bheight; kernelname[14] += bwidth; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &data_align4); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBCSR %dx%d block cpu time %lf ms GFLOPS %lf code %d \n\n", bheight, bwidth, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } //Clean up if (image2dVec) free(image2dVec); if (devRowPtr) clReleaseMemObject(devRowPtr); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
void OpenCLExecuter::ocl_filter(int src_chan) { cl_int err; // debugging variables size_t szParmDataBytes; // Byte size of context information cl_mem src_buffer; // OpenCL device source buffer cl_mem dst_buffer; // OpenCL device source buffer size_t szGlobalWorkSize; // 1D var for Total # of work items size_t szLocalWorkSize; // 1D var for # of work items in the work group cl_kernel ckKernel; // OpenCL kernel int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process //temp array unsigned char* data = new unsigned char[iNumElements]; // set Local work size dimensions //szLocalWorkSize = 256; // set Global work size dimensions //szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize; //szGlobalWorkSize = iNumElements; // printf("OPENCL: number of elements: %d\n", (int)iNumElements); // printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize); // printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize); // printf("OPENCL: work groups: %d\n", (int)((float)szGlobalWorkSize/(float)szLocalWorkSize)); size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth}; // allocate the source buffer memory object src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // allocate the destination buffer memory object dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err); printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err)); // Create the kernel ckKernel = clCreateKernel (cpProgram, "myFunc", &err); printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err)); // Set the Argument values err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 3, sizeof(int), (void*)&volobj->texheight); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); err = clSetKernelArg (ckKernel, 4, sizeof(int), (void*)&volobj->texdepth); printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err)); size_t local; err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL); printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err)); printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local); // Copy input data to GPU, compute, copy results back // Runs asynchronous to host, up until blocking read at end //Prepare data to upload for(int j=0; j<iNumElements; j++) data[j] = volobj->texture3d[3*j+src_chan]; // Write data from host to GPU err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Write data from host to GPU // err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL); // printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err)); // Launch kernel err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL); printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host // err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, volobj->texture3d, 0, NULL, NULL); // printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); // Blocking read of results from GPU to Host // Blocking read of results from GPU to Host err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, data, 0, NULL, NULL); printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err)); //read data back for(int i=0; i<iNumElements; i++) { if(volobj->is_greyscale) volobj->texture3d[3*i+0] = volobj->texture3d[3*i+1] = volobj->texture3d[3*i+2] = data[i]; else volobj->texture3d[3*i+src_chan] = data[i]; } // Cleanup allocated objects printf("OPENCL: Releasing kernel memory\n"); if(ckKernel)clReleaseKernel(ckKernel); //need to release any other OpenCL memory objects here if(dst_buffer)clReleaseMemObject(dst_buffer); if(src_buffer)clReleaseMemObject(src_buffer); delete[] data; }
void Convolutioner_FrequencyDomain_OpenCL::process(AudioInOutBuffers<float_type>& audio ) { // unsigned int _2B = audio.channelLength_ * 2; unsigned int _B = audio.channelLength_; unsigned int _C = audio.numOfChannels_; //numOfChannels unsigned int _P = partitionedIR_.get_numOfPartsPerChannel(); //numOfIRPartsPerChannel //. //_ if >>>latency<<< or >>>number of channels<<< changed: // set partitionedIR // recreate buffers // recreate fft plans if ( window_.get_inputBlockSize() != audio.channelLength_ || window_.get_numOfChannels() != audio.numOfChannels_) { //Setting partitionedIR if (window_.get_inputBlockSize() != audio.channelLength_) { partitionedIR_.setNewIRF( irf_, audio.channelLength_ ); _P = partitionedIR_.get_numOfPartsPerChannel(); //Recreate, initialize buffers, and set as kernel arguments: PIR //recreate bufferPIR_R_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P); bufferPIR_I_.recreate(CL_MEM_READ_ONLY, _2B * _C * _P); //. //initialize bufferPIR_R_.set(partitionedIR_.real_ ); bufferPIR_I_.set(partitionedIR_.imaginary_); //. //set as kernel argument bufferPIR_R_.setAsKernelArgument(0, complexMultiplyAdd_kernel_); bufferPIR_I_.setAsKernelArgument(1, complexMultiplyAdd_kernel_); //. //.(Recreate...) } //. //Recreate initialize buffers, and set as kernel arguments: transform, FDL, accumulator //recreate /****/bufferTransform_R_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /****/bufferTransform_I_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /**********/bufferFDL_R_.recreate(CL_MEM_READ_WRITE, _2B * _C * _P ); /**********/bufferFDL_I_.recreate(CL_MEM_READ_WRITE, _2B * _C * _P ); /**/bufferAccumulator_R_.recreate(CL_MEM_READ_WRITE, _2B * _C ); /**/bufferAccumulator_I_.recreate(CL_MEM_READ_WRITE, _2B * _C ); cpu_bufferAccumulator_R_ = new float_type[_2B * _C ]; cpu_bufferAccumulator_I_ = new float_type[_2B * _C ]; //. //initialize FDL with 0 bufferFDL_R_.fillWithZero(); bufferFDL_I_.fillWithZero(); lastInsertedDelayLineIdx = 0; //. //set as kernel argument /**********/bufferFDL_R_.setAsKernelArgument(2, complexMultiplyAdd_kernel_); /**********/bufferFDL_I_.setAsKernelArgument(3, complexMultiplyAdd_kernel_); /**/bufferAccumulator_R_.setAsKernelArgument(4, complexMultiplyAdd_kernel_); /**/bufferAccumulator_I_.setAsKernelArgument(5, complexMultiplyAdd_kernel_); //. //.(Recreate...) //Recreate plans clFFT_Dim3 dim; dim.x = _2B; dim.y = 1; dim.z = 1; fftPlan_ = clFFT_CreatePlan(context_, dim, clFFT_1D, clFFT_SplitComplexFormat, &lastCommandStatus_); //. } //update each time bufferGlobalParameters because of incrementing of lastInsertedDelayLineIdx /*(_2B, _C, _P, pir_C, FDL_LINE)*/ cpuData_bufferGlobalParameters_[0] = _2B; cpuData_bufferGlobalParameters_[1] = _C; cpuData_bufferGlobalParameters_[2] = _P; cpuData_bufferGlobalParameters_[3] = irf_->numOfChannels_; cpuData_bufferGlobalParameters_[4] = lastInsertedDelayLineIdx; bufferGlobalParameters_.set(cpuData_bufferGlobalParameters_); //. //Update channelsWindow window_.update( audio, /*history size*/ _B ); //. //Init >>bufferTransform<< bufferTransform_R_.set(window_.buffer_.data_); for(unsigned int i = 0; i < _2B * _C; ++i) cpu_bufferAccumulator_I_[i]=0; bufferTransform_I_.set(cpu_bufferAccumulator_I_); //. //Make fft of bufferTransform lastCommandStatus_ = clFFT_ExecutePlannar( cmdQueue_, fftPlan_, _C, clFFT_Forward, bufferTransform_R_, bufferTransform_I_, bufferTransform_R_, bufferTransform_I_, 0, NULL, NULL ); //. //Copy bufferTransform into bufferFDL (inserting new delay line) (real and imaginary part) clEnqueueCopyBuffer( cmdQueue_, bufferTransform_R_, bufferFDL_R_, 0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type), 0, NULL, NULL); clEnqueueCopyBuffer( cmdQueue_, bufferTransform_I_, bufferFDL_I_, 0, lastInsertedDelayLineIdx * (_2B * _C ) * sizeof(float_type), (_2B * _C ) * sizeof(float_type), 0, NULL, NULL); //. //Increment host lastInsertedDelayLine lastInsertedDelayLineIdx = (lastInsertedDelayLineIdx + 1 ) % _P; //. //Execute kernel size_t globalWorkSize[1]; globalWorkSize[0] = _2B * _C /* == window_.get_allLength() */; lastCommandStatus_ = clEnqueueNDRangeKernel(cmdQueue_, complexMultiplyAdd_kernel_, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if(lastCommandStatus_ == -4) { std::cout << "Too much amount of memory must be allocated on the GPU due to lenght of impulse response and number of channels."; throw int(); } else if(lastCommandStatus_ != 0) { std::cout << "Error while sending clEnqueueNDRangeKernel."; throw int(); } //. //ifft of bufferAccumulator lastCommandStatus_ = clFFT_ExecutePlannar( cmdQueue_, fftPlan_, _C, clFFT_Inverse, bufferAccumulator_R_, bufferAccumulator_I_, bufferAccumulator_R_, bufferAccumulator_I_, 0, NULL, NULL ); //. //Copy from bufferAccumulator to cpu bufferAccumulator_R_.get(cpu_bufferAccumulator_R_); //. //Flushing and finishing clFlush(cmdQueue_); clFinish(cmdQueue_); //. //Write fftw vector form to audio.outputChannel[number of Channel] for (unsigned int channNum = 0; channNum < _C; ++channNum) for (unsigned sampleNum = 0; sampleNum < _B; ++sampleNum) audio.out_[channNum][sampleNum] = (cpu_bufferAccumulator_R_[channNum*_2B + _B + sampleNum])/_2B; //. }
int crackMD5(char *hash, char *cs, int passlen) { clut_device dev; // device struct cl_event evt; // performance measurement event cl_kernel kernel; // execution kernel cl_int ret; // error code double td; int cs_len, sync_flag; long chunk, disp; unsigned char bin_hash[HASH_SIZE]; cs_len = strlen(cs); sync_flag = 0; strToBin(hash, bin_hash, 2*HASH_SIZE); disp = DISPOSITIONS(cs_len, passlen); chunk = DISP_PER_CORE(disp, AVAILABLE_THREADS); debug("HOST", "Numero di disposizione da calcolare per stream processing unit = %lu\n", chunk); clut_open_device(&dev, PATH_TO_KERNEL); clut_print_device_info(&dev); /* ----------------------------------------- Create execution kernel ----------------------------------------- */ kernel = clCreateKernel(dev.program, KERNEL_NAME, &ret); clut_check_err(ret, "Fallita la creazione del kernel"); /* ----------------------------------- Create memory buffers on the device ----------------------------------- */ cl_mem dchunk = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(long), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del chunk"); cl_mem dhash = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, HASH_SIZE * sizeof(unsigned char), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione dell'hash"); cl_mem charset = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, cs_len * sizeof(char), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del charset"); cl_mem charset_size = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset"); cl_mem dpasslen = clCreateBuffer(dev.context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della taglia del charset"); //cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, AVAILABLE_CORES * sizeof(int), NULL, &ret); cl_mem sync = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione del flag di sync"); cl_mem dcracked = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE, NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro"); cl_mem computed_hash = clCreateBuffer(dev.context, CL_MEM_READ_WRITE, HASH_SIZE * sizeof(unsigned char), NULL, &ret); if (ret) clut_panic(ret, "Fallita l'allocazione della memoria sul device per la memorizzazione della password in chiaro"); /* ----------------------------------- Write memory buffers on the device ------------------------------------ */ ret = clEnqueueWriteBuffer(dev.queue, dchunk, CL_TRUE, 0, sizeof(long), &chunk, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura del chunk sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, dhash, CL_TRUE, 0, HASH_SIZE * sizeof(unsigned char), (int *)bin_hash, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura dell'hash sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, charset, CL_TRUE, 0, cs_len * sizeof(char), cs, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura del charset sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, charset_size, CL_TRUE, 0, sizeof(int), &cs_len, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device"); ret = clEnqueueWriteBuffer(dev.queue, dpasslen, CL_TRUE, 0, sizeof(int), &passlen, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device"); //ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), &sync_flag, 0, NULL, NULL); ret = clEnqueueWriteBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &sync_flag, 0, NULL, NULL); if(ret) clut_panic(ret, "Fallita la scrittura della taglia del charset sul buffer di memoria del device"); /* --------------------------------- Set the arguments to our compute kernel --------------------------------- */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dchunk); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dhash); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &charset); ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &charset_size); ret |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dpasslen); ret |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &sync); ret |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &dcracked); ret |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &computed_hash); clut_check_err(ret, "Fallito il setting degli argomenti del kernel"); /* ---------------------------------------- Execute the OpenCL kernel ---------------------------------------- */ size_t global_dim[] = { AVAILABLE_THREADS }; ret = clEnqueueNDRangeKernel(dev.queue, kernel, 1, NULL, global_dim, NULL, 0, NULL, &evt); if(ret) clut_check_err(ret, "Fallita l'esecuzione del kernel"); /* -------------------------- Read the device memory buffer to the local variable ---------------------------- */ //int found[80]; int found; int digest[HASH_SIZE/sizeof(int)]; char *password = (char *) malloc(passlen * sizeof(char) + 1); memset(password, 0, passlen * sizeof(char) + 1); //memset(found, 0, AVAILABLE_CORES * sizeof(int)); //ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, AVAILABLE_CORES * sizeof(int), found, 0, NULL, NULL); ret = clEnqueueReadBuffer(dev.queue, sync, CL_TRUE, 0, sizeof(int), &found, 0, NULL, NULL); if(ret) clut_check_err(ret, "Fallimento nel leggere se la password e' stata trovata con successo"); debug("HOST", "La password e' stata trovata dal kernel OpenCL? "); /*int i; for(i=0; i<AVAILABLE_CORES; i++){ printf(" %d ", found[i]); } printf("\n");*/ if(found){ ret = clEnqueueReadBuffer(dev.queue, dcracked, CL_TRUE, 0, HASH_SIZE, digest, 0, NULL, NULL); if(ret) clut_check_err(ret, "Fallimento nel leggere la password"); printf("Si. Password: %s\n", (char *)digest); } else printf("No.\n"); /* ------------------------------------- Return kernel execution time ---------------------------------------- */ td = clut_get_duration(evt); debug("HOST","Kernel duration: %f secs\n", td); /* ----------------------------------------------- Clean up -------------------------------------------------- */ ret = clReleaseKernel(kernel); ret |= clReleaseMemObject(dchunk); ret |= clReleaseMemObject(dhash); ret |= clReleaseMemObject(charset); ret |= clReleaseMemObject(charset_size); ret |= clReleaseMemObject(dpasslen); ret |= clReleaseMemObject(sync); ret |= clReleaseMemObject(dcracked); ret |= clReleaseMemObject(computed_hash); clut_check_err(ret, "Rilascio di risorse fallito"); clFinish(dev.queue); clut_close_device(&dev); return 0; }